def check_options(opts):
    """
    Validate the parsed options and normalise some of them in place.

    The working directory is passed to ``mkdir``, ``opts.filter`` is
    converted with ``filters_to_bin``, ``opts.tmpdb`` (when set) is turned
    from a directory into the path of a randomly named copy of ``trace.db``,
    and ``opts.cpus`` is capped to the number of available CPUs.

    :param opts: option namespace; the attributes ``workdir``, ``filter``,
       ``format``, ``juicerjar``, ``cpus`` and, optionally, ``tmpdb`` are read

    :raises IOError: if the working directory does not exist, or if the
       "hic" format is requested without a juicer jar file
    """
    mkdir(opts.workdir)

    # convert the list of read filters into its binary form
    opts.filter = filters_to_bin(opts.filter)

    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # the "hic" export needs the juicer jar
    if opts.format == 'hic' and not opts.juicerjar:
        raise IOError('ERROR: juicer jar file needed for "hic" export.')

    # for LUSTRE file systems: work on a temporary copy of the trace database
    if 'tmpdb' in opts and opts.tmpdb:
        tmp_dir = opts.tmpdb
        # ten random ASCII letters make the temporary file name unique
        suffix = ''.join(ascii_letters[int(random() * 52)] for _ in range(10))
        tmp_name = 'trace_%s' % suffix
        opts.tmpdb = path.join(tmp_dir, tmp_name)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # best effort: a failed copy is deliberately ignored
            pass

    # zero means "use every CPU"; otherwise never ask for more than available
    opts.cpus = cpu_count() if opts.cpus == 0 else min(opts.cpus, cpu_count())
def check_options(opts):
    """
    Validate the parsed options and normalise some of them in place.

    Besides the checks, this sets the plotting flags implied by
    ``only_plot``/``interactive``, converts ``opts.filter`` with
    ``filters_to_bin``, redirects ``opts.tmpdb`` to a randomly named copy of
    ``trace.db``, caps ``opts.cpus`` and stops the program when
    ``already_run`` reports the same job (unless ``opts.force`` is set).

    :param opts: option namespace; reads ``workdir``, ``filter``,
       ``only_plot``, ``interactive``, ``nox``, ``triangular``, ``coord2``,
       ``cpus``, ``force`` and, optionally, ``tmpdb``

    :raises Exception: if an interactive plot is requested together with the
       noX option
    :raises IOError: if the working directory does not exist
    :raises NotImplementedError: if a triangular plot is requested together
       with a second coordinate
    """
    mkdir(opts.workdir)

    # convert the list of read filters into its binary form
    opts.filter = filters_to_bin(opts.filter)

    # plotting flags: "only plot" and "interactive" both imply "plot"
    if opts.only_plot:
        opts.plot = True
    if opts.interactive and opts.nox:
        raise Exception('ERROR: no screen no fun.\n'
                        'Interactive plot incompatible with noX option.')
    if opts.interactive:
        opts.plot = True
        opts.only_plot = True

    # the working directory has to exist at this point
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # triangular representation needs a single region
    if opts.triangular and opts.coord2:
        raise NotImplementedError('ERROR: triangular is only available for '
                                  'symmetric matrices.')

    # for LUSTRE file systems: work on a temporary copy of the trace database
    if 'tmpdb' in opts and opts.tmpdb:
        tmp_dir = opts.tmpdb
        # ten random ASCII letters make the temporary file name unique
        suffix = ''.join(ascii_letters[int(random() * 52)] for _ in range(10))
        tmp_name = 'trace_%s' % suffix
        opts.tmpdb = path.join(tmp_dir, tmp_name)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # best effort: a failed copy is deliberately ignored
            pass

    # zero means "use every CPU"; otherwise never ask for more than available
    opts.cpus = cpu_count() if opts.cpus == 0 else min(opts.cpus, cpu_count())

    # check if the job was already run (md5 digest of the parameters)
    try:
        if already_run(opts):
            if opts.force:
                warn(
                    'WARNING: exact same job already computed, overwriting...')
            else:
                # drop the temporary database copy before leaving
                if 'tmpdb' in opts and opts.tmpdb:
                    remove(path.join(tmp_dir, tmp_name))
                exit(
                    'WARNING: exact same job already computed, see JOBs table above'
                )
    except IOError:
        warn((""
              "\nWARNING:\n new working directory created. It's ok... "
              "but next time use TADbit since the beginning!! :)"))
def check_options(opts):
    """
    Check the consistency of the parsed options, adjusting them in place.

    Steps, in order: call ``mkdir`` on the working directory, convert the
    read filters, resolve the plotting flags, verify the working directory
    and the triangular/coord2 combination, set up the temporary trace
    database, cap the number of CPUs, and finally ask ``already_run``
    whether this exact job was computed before.

    :param opts: option namespace; reads ``workdir``, ``filter``,
       ``only_plot``, ``interactive``, ``nox``, ``triangular``, ``coord2``,
       ``cpus``, ``force`` and, optionally, ``tmpdb``

    :raises Exception: if an interactive plot is requested together with the
       noX option
    :raises IOError: if the working directory does not exist
    :raises NotImplementedError: if a triangular plot is requested together
       with a second coordinate
    """
    mkdir(opts.workdir)

    # read filters: list of identifiers -> binary form
    opts.filter = filters_to_bin(opts.filter)

    # resolve plotting flags
    if opts.only_plot:
        opts.plot = True
    if opts.interactive and opts.nox:
        raise Exception('ERROR: no screen no fun.\n'
                        'Interactive plot incompatible with noX option.')
    if opts.interactive:
        opts.plot = opts.only_plot = True

    workdir_found = path.exists(opts.workdir)
    if not workdir_found:
        raise IOError('ERROR: workdir not found.')

    # a triangular matrix cannot be drawn for two different regions
    if opts.triangular and opts.coord2:
        raise NotImplementedError('ERROR: triangular is only available for '
                                  'symmetric matrices.')

    # for LUSTRE file systems: use a temporary copy of the trace database
    if 'tmpdb' in opts and opts.tmpdb:
        db_folder = opts.tmpdb
        letters = [ascii_letters[int(random() * 52)] for _ in range(10)]
        db_name = 'trace_%s' % (''.join(letters))
        opts.tmpdb = path.join(db_folder, db_name)
        source_db = path.join(opts.workdir, 'trace.db')
        try:
            copyfile(source_db, opts.tmpdb)
        except IOError:
            # a failed copy is deliberately ignored
            pass

    # number of CPUs: 0 means all, otherwise capped to what is available
    if opts.cpus != 0:
        opts.cpus = min(opts.cpus, cpu_count())
    else:
        opts.cpus = cpu_count()

    # was the exact same job (md5 digest of the parameters) already run?
    try:
        if already_run(opts):
            if opts.force:
                warn('WARNING: exact same job already computed, overwriting...')
            else:
                if 'tmpdb' in opts and opts.tmpdb:
                    # remove the temporary database copy before leaving
                    remove(path.join(db_folder, db_name))
                exit('WARNING: exact same job already computed, see JOBs table above')
    except IOError:
        warn((""
              "\nWARNING:\n new working directory created. It's ok... "
              "but next time use TADbit from the beginning!! :)"))
def write_matrix(inbam, resolution, biases, outdir,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 region1=None, start1=None, end1=None, clean=True,
                 region2=None, start2=None, end2=None,
                 tmpdir='.', ncpus=8, verbose=True):
    """
    Write the upper half of an interaction matrix as a four-column TSV file
    (position 1, position 2, raw value, normalized value).

    The file is created inside ``outdir`` and named
    ``<region1>_mat_<resolution in kb>kb.tsv``.

    :param inbam: path to the input BAM file
    :param resolution: bin size, in nucleotides
    :param biases: passed to ``get_biases_region``; when empty/None no
       normalization is applied and the raw value is written in both value
       columns
    :param outdir: directory in which the output file is written
    :param filter_exclude: filters to apply, either already as an integer or
       as an iterable converted with ``filters_to_bin``
    :param region1: name of the first region, used for the file name and for
       the bin offset of the written positions
    :param start1: forwarded to ``read_bam``
    :param end1: forwarded to ``read_bam``
    :param clean: forwarded to ``_iter_matrix_frags``
    :param region2: forwarded to ``read_bam``
    :param start2: forwarded to ``read_bam``
    :param end2: forwarded to ``read_bam``
    :param tmpdir: directory holding the temporary ``_tmp_<hash>`` data
    :param ncpus: forwarded to ``read_bam``
    :param verbose: print progress with ``printime``
    """
    # local import: only needed for the final clean-up
    import subprocess

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    _, rand_hash, bin_coords, chunks = read_bam(
        inbam, filter_exclude, resolution, ncpus=ncpus,
        region1=region1, start1=start1, end1=end1,
        region2=region2, start2=start2, end2=end2,
        tmpdir=tmpdir, verbose=verbose)

    # number of bins per reference; floor division keeps the counts integer
    # whatever the Python version
    bamfile = pysam.AlignmentFile(inbam, 'rb')
    try:
        sections = OrderedDict(
            zip(bamfile.references,
                [x // resolution + 1 for x in bamfile.lengths]))
    finally:
        bamfile.close()

    # first and last genomic bin of each reference
    total = 0
    section_pos = {}
    for crm, nbins in sections.items():
        section_pos[crm] = (total, total + nbins)
        total += nbins

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(biases,
                                                              bin_coords)
    else:
        # no normalization available: raw values are written as they are
        bias1 = bias2 = decay = None
        bads1 = bads2 = {}

    if verbose:
        printime(' - Writing matrices')

    mkdir(outdir)
    # the directory is joined exactly once with the file name
    fnam = os.path.join(
        outdir, '{}_mat_{}kb.tsv'.format(region1, resolution // 1000))

    # pull all sub-matrices and write full matrix
    with open(fnam, 'w') as out:
        for c, j, k, v in _iter_matrix_frags(chunks, tmpdir, rand_hash,
                                             verbose=verbose, clean=clean):
            if k < j:  # we are only going to keep half of the matrix
                continue
            if j in bads1 or k in bads2:
                continue
            if decay is None:
                n = v
            else:
                dist = abs(j - k)
                # cells without an expected (decay) value are skipped
                if dist not in decay[c]:
                    continue
                n = v / bias1[j] / bias2[k] / decay[c][dist]
            # NOTE(review): both positions are shifted by the offset of
            # region1, which only looks right for a single, named region --
            # confirm the intended behavior with region2 or region1=None.
            offset = section_pos[region1][0]
            out.write('{}\t{}\t{}\t{}\n'.format(j + offset, k + offset, v, n))

    # this is the last thing we do in case something goes wrong
    # (argument list instead of a shell string: safe with any tmpdir name)
    subprocess.call(
        ['rm', '-rf', os.path.join(tmpdir, '_tmp_%s' % (rand_hash))])

    if verbose:
        printime('\nDone.')
def check_options(opts):
    """
    Validate the parsed options and normalise some of them in place.

    ``opts.filter`` is converted with ``filters_to_bin``, ``opts.tmpdb``
    (when set) is redirected to a randomly named copy of ``trace.db``,
    ``opts.cpus`` is capped to the available CPUs, and the program exits
    when ``already_run`` reports that the same job was computed before.

    :param opts: option namespace; reads ``workdir``, ``filter``,
       ``normalization``, ``biases_path``, ``cpus`` and, optionally,
       ``tmpdb``

    :raises IOError: if "custom" normalization is requested without an
       existing biases file, or if the working directory does not exist
    """
    mkdir(opts.workdir)

    # convert the list of read filters into its binary form
    opts.filter = filters_to_bin(opts.filter)

    # "custom" normalization requires an existing biases file
    if opts.normalization == 'custom':
        if not opts.biases_path:
            raise IOError(
                'ERROR: biases file required for "custom" normalization.')
        if not path.exists(opts.biases_path):
            raise IOError('ERROR: biases not found at path: %s' %
                          opts.biases_path)

    # the working directory has to exist at this point
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # for lustre file systems: work on a temporary copy of the trace database
    if 'tmpdb' in opts and opts.tmpdb:
        tmp_dir = opts.tmpdb
        # ten random ASCII letters make the temporary file name unique
        suffix = ''.join(ascii_letters[int(random() * 52)] for _ in range(10))
        tmp_name = 'trace_%s' % suffix
        opts.tmpdb = path.join(tmp_dir, tmp_name)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # best effort: a failed copy is deliberately ignored
            pass

    # zero means "use every CPU"; otherwise never ask for more than available
    opts.cpus = cpu_count() if opts.cpus == 0 else min(opts.cpus, cpu_count())

    # check if the job was already run (md5 digest of the parameters)
    try:
        if already_run(opts):
            # drop the temporary database copy before leaving
            if 'tmpdb' in opts and opts.tmpdb:
                remove(path.join(tmp_dir, tmp_name))
            exit(
                'WARNING: exact same job already computed, see JOBs table above'
            )
    except IOError:
        # new working directory
        pass
def check_options(opts):
    """
    Validate the parsed options and normalise some of them in place.

    Sets the plotting flags implied by ``only_plot``/``interactive``,
    converts ``opts.filter`` with ``filters_to_bin``, redirects
    ``opts.tmpdb`` to a randomly named copy of ``trace.db``, caps
    ``opts.cpus`` and, when ``already_run`` reports the same job, either
    warns (``opts.force``) or exits.

    :param opts: option namespace; reads ``workdir``, ``filter``,
       ``only_plot``, ``interactive``, ``cpus``, ``force`` and, optionally,
       ``tmpdb``

    :raises IOError: if the working directory does not exist
    """
    mkdir(opts.workdir)

    # convert the list of read filters into its binary form
    opts.filter = filters_to_bin(opts.filter)

    # plotting flags: "only plot" and "interactive" both imply "plot"
    if opts.only_plot:
        opts.plot = True
    if opts.interactive:
        opts.plot = opts.only_plot = True

    # the working directory has to exist at this point
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # for lustre file systems: work on a temporary copy of the trace database
    if 'tmpdb' in opts and opts.tmpdb:
        tmp_dir = opts.tmpdb
        # ten random ASCII letters make the temporary file name unique
        suffix = ''.join(ascii_letters[int(random() * 52)] for _ in range(10))
        tmp_name = 'trace_%s' % suffix
        opts.tmpdb = path.join(tmp_dir, tmp_name)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # best effort: a failed copy is deliberately ignored
            pass

    # zero means "use every CPU"; otherwise never ask for more than available
    opts.cpus = cpu_count() if opts.cpus == 0 else min(opts.cpus, cpu_count())

    # check if the job was already run (md5 digest of the parameters)
    if already_run(opts):
        if opts.force:
            warn('WARNING: exact same job already computed, overwritting...')
        else:
            # drop the temporary database copy before leaving
            if 'tmpdb' in opts and opts.tmpdb:
                remove(path.join(tmp_dir, tmp_name))
            exit(
                'WARNING: exact same job already computed, see JOBs table above'
            )
def check_options(opts):
    """
    Check the consistency of the parsed options, adjusting them in place.

    Steps, in order: call ``mkdir`` on the working directory, convert the
    read filters, verify the biases file for "custom" normalization, verify
    the working directory, set up the temporary trace database, cap the
    number of CPUs, and exit if ``already_run`` reports the same job.

    :param opts: option namespace; reads ``workdir``, ``filter``,
       ``normalization``, ``biases_path``, ``cpus`` and, optionally,
       ``tmpdb``

    :raises IOError: if "custom" normalization is requested without an
       existing biases file, or if the working directory does not exist
    """
    mkdir(opts.workdir)

    # read filters: list of identifiers -> binary form
    opts.filter = filters_to_bin(opts.filter)

    # custom normalization needs a biases file that exists
    custom_norm = opts.normalization == 'custom'
    if custom_norm and not opts.biases_path:
        raise IOError('ERROR: biases file required for "custom" normalization.')
    if custom_norm and not path.exists(opts.biases_path):
        raise IOError('ERROR: biases not found at path: %s' % opts.biases_path)

    workdir_found = path.exists(opts.workdir)
    if not workdir_found:
        raise IOError('ERROR: workdir not found.')

    # for lustre file systems: use a temporary copy of the trace database
    if 'tmpdb' in opts and opts.tmpdb:
        db_folder = opts.tmpdb
        letters = [ascii_letters[int(random() * 52)] for _ in range(10)]
        db_name = 'trace_%s' % (''.join(letters))
        opts.tmpdb = path.join(db_folder, db_name)
        source_db = path.join(opts.workdir, 'trace.db')
        try:
            copyfile(source_db, opts.tmpdb)
        except IOError:
            # a failed copy is deliberately ignored
            pass

    # number of CPUs: 0 means all, otherwise capped to what is available
    if opts.cpus != 0:
        opts.cpus = min(opts.cpus, cpu_count())
    else:
        opts.cpus = cpu_count()

    # was the exact same job (md5 digest of the parameters) already run?
    try:
        if already_run(opts):
            if 'tmpdb' in opts and opts.tmpdb:
                # remove the temporary database copy before leaving
                remove(path.join(db_folder, db_name))
            exit('WARNING: exact same job already computed, see JOBs table above')
    except IOError:
        # new working directory
        pass
def write_matrix(inbam, resolution, biases, outfile,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 region1=None, start1=None, end1=None, clean=True,
                 region2=None, start2=None, end2=None, nchunks=100,
                 tmpdir='.', ncpus=8, verbose=True, window=None):
    """
    Write the upper half of an interaction matrix to ``outfile``.

    The file starts with header lines (one ``# CHROM`` line per reference,
    then ``# RESOLUTION`` and ``# BADCOLS``) followed by one tab-separated
    line per kept cell: bin 1, bin 2, raw value, normalized value.

    :param inbam: path to the input BAM file
    :param resolution: bin size, written in the header and forwarded to
       ``read_bam``
    :param biases: passed to ``get_biases_region``; when empty/None the raw
       value is written in both value columns
    :param outfile: path of the file to write (its directory is passed to
       ``mkdir``)
    :param filter_exclude: filters to apply, either already as an integer or
       as an iterable converted with ``filters_to_bin``
    :param region1: forwarded to ``read_bam``
    :param start1: forwarded to ``read_bam``
    :param end1: forwarded to ``read_bam``
    :param clean: forwarded to ``_iter_matrix_frags``; when true the
       temporary ``_tmp_<hash>`` data in ``tmpdir`` is also removed at the end
    :param region2: forwarded to ``read_bam``
    :param start2: forwarded to ``read_bam``
    :param end2: forwarded to ``read_bam``
    :param nchunks: forwarded to ``read_bam``
    :param tmpdir: directory holding the temporary data
    :param ncpus: forwarded to ``read_bam``
    :param verbose: print progress with ``printime``
    :param None window: which cells to keep: 'all' (or None) keeps everything,
       'intra'/'inter' select on the first field yielded by
       ``_iter_matrix_frags``, and a ``(min, max)`` pair keeps cells whose
       bin distance ``k - j`` lies within these bounds

    :returns: the number of header lines written
    """
    # local import: only needed for the final clean-up
    import subprocess

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    _, rand_hash, bin_coords, chunks = read_bam(
        inbam, filter_exclude, resolution, ncpus=ncpus,
        region1=region1, start1=start1, end1=end1,
        region2=region2, start2=start2, end2=end2,
        tmpdir=tmpdir, nchunks=nchunks, verbose=verbose)

    # only the reference names and lengths are needed, for the header
    bamfile = AlignmentFile(inbam, 'rb')
    try:
        references = list(bamfile.references)
        lengths = list(bamfile.lengths)
    finally:
        bamfile.close()

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(
            biases, bin_coords)

        def transform(x, c, j, k):
            # full normalization: biases and expected (decay) value
            return x / bias1[j] / bias2[k] / decay[c][abs(k - j)]

        def transform2(x, j, k):
            # normalization by biases only
            return x / bias1[j] / bias2[k]
    else:
        bads1 = bads2 = {}

        def transform(x, c, j, k):
            return x

        def transform2(x, j, k):
            return x

    if bads1 is bads2:
        badcols = bads1
    else:  # should never happen
        badcols = bads1
        badcols.update(bads2)

    if verbose:
        printime(' - Writing matrices')

    mkdir(os.path.split(os.path.abspath(outfile))[0])

    # the default (None) behaves like 'all' instead of failing on unpacking
    if window is None or window == 'all':
        outside = lambda c_, j_, k_: False
    elif window == 'intra':
        # presumably the first field is empty for inter-chromosomal cells
        # -- TODO confirm against _iter_matrix_frags
        outside = lambda c_, j_, k_: c_ == ''
    elif window == 'inter':
        outside = lambda c_, j_, k_: c_ != ''
    else:
        min_, max_ = window
        outside = lambda c_, j_, k_: (k_ - j_) < min_ or (k_ - j_) > max_

    # write the rest of the file to be sorted
    with open(outfile, 'w') as out:
        nheader = 0
        for crm, length in zip(references, lengths):
            out.write('# CHROM\t{}\t{}\n'.format(crm, length))
            nheader += 1
        out.write('# RESOLUTION\t{}\n'.format(resolution))
        nheader += 1
        out.write('# BADCOLS\t{}\n'.format(','.join(map(str, badcols.keys()))))
        nheader += 1

        # pull all sub-matrices and write full matrix
        for c, j, k, v in _iter_matrix_frags(chunks, tmpdir, rand_hash,
                                             verbose=verbose, clean=clean):
            # we keep only half matrix
            if k < j or j in badcols or k in badcols:
                continue
            if outside(c, j, k):
                continue
            try:
                n = transform(v, c, j, k)  # normalize
            except KeyError:
                n = transform2(v, j, k)  # normalize no decay
            out.write('{}\t{}\t{}\t{}\n'.format(j, k, v, n))

    # this is the last thing we do in case something goes wrong
    if clean:
        # argument list instead of a shell string: safe with any tmpdir name
        subprocess.call(
            ['rm', '-rf', os.path.join(tmpdir, '_tmp_%s' % (rand_hash))])
    return nheader
def get_options():
    """
    Parse the command line of the matrix-extraction script.

    After parsing, ``opts.filter`` is converted with ``filters_to_bin`` and
    ``opts.tmpdir`` falls back to ``opts.outdir`` when not given.

    :returns: the ``argparse`` namespace with the parsed options

    :raises Exception: if no biases file is given while the 'norm' or
       'decay' matrices are requested
    """
    parser = ArgumentParser(usage="%(prog)s -i PATH -r INT [options]")

    parser.add_argument('-i', '--infile', dest='inbam', metavar='',
                        required=True, default=False,
                        help='input HiC-BAM file.')
    # NOTE(review): the default for outdir is the boolean True, which is
    # also what tmpdir inherits below when neither option is given.
    parser.add_argument('-o', '--outdir', dest='outdir', metavar='',
                        default=True, help='output directory.')
    parser.add_argument('-t', '--tarfile', dest='tarfile', metavar='',
                        default=False,
                        help='''skip the generation of files, directly
                        append them to a tar file (does not need to be
                        created).''')
    parser.add_argument('--tmp', dest='tmpdir', metavar='', default=False,
                        help='''path where to store temporary files (by
                        default outdir is used).''')
    parser.add_argument('-r', '--resolution', dest='reso', type=int,
                        metavar='', required=True,
                        help='''wanted resolution form the generated
                        matrix''')
    parser.add_argument('-b', '--biases', dest='biases', metavar='',
                        help='''path to pickle file with array of biases''')
    parser.add_argument('-c', '--coord', dest='coord1', metavar='',
                        default=None,
                        help='''Coordinate of the region to retrieve.
                        By default all genome, arguments can be either one
                        chromosome name, or the coordinate in the form:
                        "-c chr3:110000000-120000000"''')
    parser.add_argument('-c2', '--coord2', dest='coord2', metavar='',
                        default=None,
                        help='''Coordinate of a second region to retrieve
                        the matrix in the intersection with the first
                        region.''')
    parser.add_argument('-C', '--cpus', dest='cpus', metavar='', type=int,
                        default=8,
                        help='''[%(default)s] number of cpus to be used for
                        parsing the HiC-BAM file''')
    parser.add_argument('--matrices', dest='matrices', metavar='', type=str,
                        nargs='+', default=['norm', 'raw', 'decay'],
                        help='''[%(default)s] which matrix to generate''')
    parser.add_argument('-f', '--format', dest='format', default='abc',
                        choices=['abc', 'mat'], required=False,
                        help='''[%(default)s] format in which to write the
                        output matrix (choose from %(choices)s)''')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true',
                        help='display no running information')

    # description of each available filter, appended to the --filter help
    filter_names = ', '.join(
        ['%2d: %15s' % (k, MASKED[k]['name']) for k in MASKED])
    parser.add_argument(
        '-F', '--filter', dest='filter', nargs='+', type=int, metavar='INT',
        default=[1, 2, 3, 4, 6, 7, 9, 10], choices=range(1, 11),
        # the example uses the real option name, and the two parts of the
        # message are separated by a space
        help=("""[%(default)s] Use filters to define a set os valid pair of
              reads e.g.: '--filter 1 2 3 4 8 9 10'. Where these numbers """ +
              "correspond to: %s" % (filter_names)))
    parser.add_argument('--nchunks', dest='nchunks', action='store',
                        default=None, type=int,
                        help='''maximum number of chunks into which to cut
                        the BAM''')

    opts = parser.parse_args()

    # convert filters to binary for samtools
    opts.filter = filters_to_bin(opts.filter)

    # normalized and decay matrices cannot be computed without biases
    if not opts.biases and ('norm' in opts.matrices or
                            'decay' in opts.matrices):
        raise Exception('ERROR: should provide path to bias file.')

    if not opts.tmpdir:
        opts.tmpdir = opts.outdir
    return opts
def get_options():
    """
    Build the argument parser of the matrix-extraction script and parse the
    command line.

    Post-processing of the parsed values: ``opts.filter`` is converted with
    ``filters_to_bin``, and ``opts.tmpdir`` takes the value of
    ``opts.outdir`` when it was not provided.

    :returns: the ``argparse`` namespace with the parsed options

    :raises Exception: if the 'norm' or 'decay' matrices are requested
       without a biases file
    """
    parser = ArgumentParser(usage="%(prog)s -i PATH -r INT [options]")

    parser.add_argument('-i', '--infile', dest='inbam', metavar='',
                        required=True, default=False,
                        help='input HiC-BAM file.')
    # NOTE(review): outdir defaults to the boolean True; tmpdir inherits
    # that value below when neither option is given.
    parser.add_argument('-o', '--outdir', dest='outdir', metavar='',
                        default=True, help='output directory.')
    parser.add_argument('-t', '--tarfile', dest='tarfile', metavar='',
                        default=False,
                        help='''skip the generation of files, directly
                        append them to a tar file (does not need to be
                        created).''')
    parser.add_argument('--tmp', dest='tmpdir', metavar='', default=False,
                        help='''path where to store temporary files (by
                        default outdir is used).''')
    parser.add_argument('-r', '--resolution', dest='reso', type=int,
                        metavar='', required=True,
                        help='''wanted resolution form the generated
                        matrix''')
    parser.add_argument('-b', '--biases', dest='biases', metavar='',
                        help='''path to pickle file with array of biases''')
    parser.add_argument('-c', '--coord', dest='coord1', metavar='',
                        default=None,
                        help='''Coordinate of the region to retrieve.
                        By default all genome, arguments can be either one
                        chromosome name, or the coordinate in the form:
                        "-c chr3:110000000-120000000"''')
    parser.add_argument('-c2', '--coord2', dest='coord2', metavar='',
                        default=None,
                        help='''Coordinate of a second region to retrieve
                        the matrix in the intersection with the first
                        region.''')
    parser.add_argument('-C', '--cpus', dest='cpus', metavar='', type=int,
                        default=8,
                        help='''[%(default)s] number of cpus to be used for
                        parsing the HiC-BAM file''')
    parser.add_argument('--matrices', dest='matrices', metavar='', type=str,
                        nargs='+', default=['norm', 'raw', 'decay'],
                        help='''[%(default)s] which matrix to generate''')
    parser.add_argument('-f', '--format', dest='format', default='abc',
                        choices=['abc', 'mat'], required=False,
                        help='''[%(default)s] format in which to write the
                        output matrix (choose from %(choices)s)''')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true',
                        help='display no running information')

    # one "<id>: <name>" entry per known filter, listed in the --filter help
    filter_names = ', '.join(
        ['%2d: %15s' % (k, MASKED[k]['name']) for k in MASKED])
    # the help example names the actual option (--filter) and a space
    # separates "numbers" from "correspond"
    filter_help = ("""[%(default)s] Use filters to define a set os valid
                   pair of reads e.g.: '--filter 1 2 3 4 8 9 10'. Where
                   these numbers """ +
                   "correspond to: %s" % (filter_names))
    parser.add_argument('-F', '--filter', dest='filter', nargs='+', type=int,
                        metavar='INT', default=[1, 2, 3, 4, 6, 7, 9, 10],
                        choices=range(1, 11), help=filter_help)
    parser.add_argument('--nchunks', dest='nchunks', action='store',
                        default=None, type=int,
                        help='''maximum number of chunks into which to cut
                        the BAM''')

    opts = parser.parse_args()

    # convert filters to binary for samtools
    opts.filter = filters_to_bin(opts.filter)

    # normalized and decay matrices cannot be computed without biases
    needs_biases = 'norm' in opts.matrices or 'decay' in opts.matrices
    if not opts.biases and needs_biases:
        raise Exception('ERROR: should provide path to bias file.')

    if not opts.tmpdir:
        opts.tmpdir = opts.outdir
    return opts
def main():
    """
    main function

    Reads the options, then, for each chunk of mapped reads yielded by
    ``get_mapped_chunk``, goes through the TADbit BAM file and writes one
    tab-separated line per read-end found in the chunk to the output file
    (``opts.hicup_bam``).

    Reads matching any of the filters in ``opts.filter`` are skipped, as
    well as read-ends not present in the current chunk.
    """
    opts = get_options()

    filter_exclude = filters_to_bin(opts.filter)
    tadbit_bam = opts.tadbit_bam
    hicup_bam = opts.hicup_bam
    map_folder = opts.map_folder
    # the option is given in millions of reads
    nreads = opts.nreads * 1_000_000

    # flag to write according to the pair of strand tags (s1, s2); only the
    # first element of each value is used below
    tag_dict = {
        (1, 1): (67, 131),
        (0, 0): (115, 179),
        (1, 0): (99, 147),
        (0, 1): (83, 163),
    }

    # context manager: the output is flushed and closed even on error
    with open(hicup_bam, 'w') as out:
        for seqs in get_mapped_chunk(map_folder, nreads):
            # the BAM file is read again from the start for each chunk
            bamfile = AlignmentFile(tadbit_bam, 'rb')
            try:
                refs = bamfile.references
                printime(f' - processing BAM (for {len(seqs) / 1_000_000}M reads)')
                for r in bamfile.fetch(multiple_iterators=False):
                    if r.flag & filter_exclude:
                        continue
                    rid = r.qname
                    ridname = rid.split('#')[0]
                    pos1 = r.reference_start + 1
                    which, len1 = r.cigar[0]
                    tags = dict(r.tags)
                    if which == 6:  # first read-end
                        s1, s2 = tags['S1'], tags['S2']
                    else:
                        s2, s1 = tags['S1'], tags['S2']
                    if s1 == 0:
                        pos1 = pos1 - len1 + 1
                    # sequences are indexed by read name and position
                    try:
                        seq, qal = seqs[ridname, pos1]
                    except KeyError:
                        continue
                    crm1 = r.reference_name
                    crm2 = refs[r.mrnm]
                    pos2 = r.mpos + 1
                    len2 = r.tlen
                    dist = 0 if crm1 != crm2 else abs(pos2 - pos1)
                    if s2 == 0:
                        pos2 = pos2 - len2 + 1
                    flag = tag_dict[s1, s2][0]
                    out.write((f'{r.qname}\t{flag}\t{crm1}\t{pos1}\t{len1}\t'
                               f'{len(seq)}M\t{crm2}\t{pos2}\t{dist}\t{seq}\t'
                               f'{qal}\tMD:Z:{len1}\tPG:Z:MarkDuplicates\tNM:i:0\t'
                               f'AS:i:{len1}\tXS:i:1\n'))
            finally:
                # always release the BAM handle, even if a read fails
                bamfile.close()
            seqs.clear()