def main():
    """
    Normalize a full-genome Hi-C BAM file and pickle the result.

    Reads the options returned by ``get_options``, calls ``read_bam`` to get
    the biases, the decay and the bad columns, and dumps them (together with
    the resolution) into ``biases_<resolution>.pickle`` inside the output
    directory.
    """
    opts = get_options()

    inbam = opts.inbam
    resolution = opts.reso
    filter_exclude = opts.filter
    min_count = opts.min_count
    ncpus = opts.cpus
    factor = 1
    outdir = opts.outdir
    sigma = 2

    mkdir(outdir)

    sys.stdout.write('\nNormalization of full genome\n')

    biases, decay, badcol = read_bam(inbam, filter_exclude, resolution,
                                     min_count=min_count, ncpus=ncpus,
                                     sigma=sigma, factor=factor, outdir=outdir,
                                     check_sum=opts.check_sum)

    printime(' - Saving biases and badcol columns')
    # biases
    bias_path = os.path.join(outdir, 'biases_%s.pickle' % (
        nicer(resolution).replace(' ', '')))
    # pickle data is a byte stream: write it in binary mode, and let the
    # context manager close the handle even if ``dump`` raises
    with open(bias_path, 'wb') as out:
        dump({'biases': biases,
              'decay': decay,
              'badcol': badcol,
              'resolution': resolution}, out)

    printime('\nDone.')
def main():
    """
    Normalize a full-genome Hi-C BAM file and pickle the result.

    Reads the options returned by ``get_options``, calls ``read_bam`` to get
    the biases, the decay and the bad columns, and dumps them (together with
    the resolution) into ``biases_<resolution>.pickle`` inside the output
    directory.
    """
    opts = get_options()

    inbam = opts.inbam
    resolution = opts.reso
    filter_exclude = opts.filter
    min_count = opts.min_count
    ncpus = opts.cpus
    factor = 1
    outdir = opts.outdir
    sigma = 2

    mkdir(outdir)

    sys.stdout.write('\nNormalization of full genome\n')

    biases, decay, badcol = read_bam(inbam, filter_exclude, resolution,
                                     min_count=min_count, ncpus=ncpus,
                                     sigma=sigma, factor=factor, outdir=outdir,
                                     check_sum=opts.check_sum)

    printime(' - Saving biases and badcol columns')
    # biases
    bias_path = os.path.join(outdir, 'biases_%s.pickle' % (
        nicer(resolution).replace(' ', '')))
    # pickle data is a byte stream: write it in binary mode, and let the
    # context manager close the handle even if ``dump`` raises
    with open(bias_path, 'wb') as out:
        dump({'biases': biases,
              'decay': decay,
              'badcol': badcol,
              'resolution': resolution}, out)

    printime('\nDone.')
def read_bam(inbam, filter_exclude, resolution, min_count=2500, sigma=2,
             ncpus=8, factor=1, outdir='.', check_sum=False):
    """
    Compute normalization biases, decay and bad columns from a BAM file.

    The genome is cut into chunks of bins, each chunk is handed to
    ``read_bam_frag`` in a process pool, and the per-chunk pickles found in
    *outdir* (presumably written by ``read_bam_frag`` -- confirm) are then
    used to filter columns, rescale the biases and compute the decay.

    :param inbam: path to the BAM file
    :param filter_exclude: passed unchanged to ``read_bam_frag``
    :param resolution: size of a bin, in nucleotides
    :param 2500 min_count: bins with fewer interactions than this are
       flagged as bad columns; if null, ``filter_by_cis_percentage`` is
       used instead
    :param 2 sigma: passed to ``filter_by_cis_percentage``
    :param 8 ncpus: size of each process pool
    :param 1 factor: divides the target used to rescale the biases
    :param '.' outdir: directory holding the temporary pickles and the
       filtering plot
    :param False check_sum: recompute and print the sum of the normalized
       matrix once the biases are rescaled

    :returns: a dictionary of biases (one per bin), a dictionary of decay
       values (one per genomic distance, in bins) and the dictionary of
       bad columns
    """
    bamfile = AlignmentFile(inbam, 'rb')
    # number of bins of each chromosome
    sections = OrderedDict(zip(bamfile.references,
                               [x // resolution + 1 for x in bamfile.lengths]))
    bamfile.close()  # only the header was needed
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm] + 1
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm + 1)])

    start_bin = 0
    end_bin = len(bins) + 1
    total = end_bin - start_bin + 1

    # split the genome in chunks (a chunk never spans two chromosomes)
    regs = []
    begs = []
    ends = []
    njobs = min(total, 100) + 1
    nbins = total // njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop at the right place
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            # last nt not included (overlap with next window)
            ends.append(end2 * resolution + resolution - 1)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    printime('\n - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    for region, start, end in zip(regs, begs, ends):
        procs.append(pool.apply_async(
            read_bam_frag, args=(inbam, filter_exclude, bins, bins_dict,
                                 resolution, outdir, region, start, end,)))
    pool.close()
    print_progress(procs)
    pool.join()

    ## COLLECT RESULTS
    verbose = True
    cisprc = {}
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if verbose:
            if not countbin % 10 and countbin:
                sys.stdout.write(' ')
            if not countbin % 50 and countbin:
                sys.stdout.write(' %9s\n ' % ('%s/%s' % (countbin, len(regs))))
            sys.stdout.write('.')
            sys.stdout.flush()

        fname = os.path.join(outdir,
                             'tmp_bins_%s:%d-%d.pickle' % (region, start, end))
        # close each temporary pickle as soon as it is loaded
        with open(fname) as handle:
            tmp_cisprc = load(handle)
        cisprc.update(tmp_cisprc)
    if verbose:
        print('%s %9s\n' % (
            ' ' * (54 - (countbin % 50) - (countbin % 50) // 10),
            '%s/%s' % (len(regs), len(regs))))

    # bad columns
    print(' - Removing columns with too few or too much interactions')
    if not min_count:
        badcol = filter_by_cis_percentage(
            cisprc, sigma=sigma, verbose=True,
            # the plot goes *inside* outdir (the path used to be glued to
            # the directory name without separator)
            savefig=os.path.join(outdir, 'filtered_bins_%s.png' % (
                nicer(resolution).replace(' ', ''))))
    else:
        print(' -> too few interactions defined as less than %9d interactions' % (
            min_count))
        # keep only the second element of each entry
        for k in cisprc:
            cisprc[k] = cisprc[k][1]
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, 0) < min_count:
                badcol[c] = cisprc.get(c, 0)
                countL += 1
                if c not in cisprc:
                    countZ += 1
        print(' -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total,
            float(len(badcol)) / total * 100))

    def _map_chunks(func, *extra_args):
        """Run ``func(chunk_pickle, *extra_args)`` on every chunk in a pool."""
        chunk_pool = mu.Pool(ncpus)
        chunk_procs = []
        for region, start, end in zip(regs, begs, ends):
            fname = os.path.join(outdir, 'tmp_%s:%d-%d.pickle' % (
                region, start, end))
            chunk_procs.append(chunk_pool.apply_async(
                func, args=(fname,) + extra_args))
        chunk_pool.close()
        print_progress(chunk_procs)
        chunk_pool.join()
        return chunk_procs

    printime(' - Rescaling biases')
    size = len(bins)
    biases = [cisprc.get(k, 1.) for k in range(size)]
    mean_col = float(sum(biases)) / len(biases)
    biases = dict([(k, b / mean_col * mean_col**0.5)
                   for k, b in enumerate(biases)])

    # collect subset-matrices to correct biases
    sumnrm = sum(p.get() for p in _map_chunks(sum_nrm_matrix, biases))

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    # check the sum
    if check_sum:
        sumnrm = sum(p.get() for p in _map_chunks(sum_nrm_matrix, biases))
        print('SUM: %s' % (sumnrm,))

    printime(' - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equals to 1 in average)
    procs = _map_chunks(sum_dec_matrix, biases, badcol, bins)

    # collect results
    sumdec = {}
    for proc in procs:
        for k, v in proc.get().iteritems():
            try:
                sumdec[k] += v
            except KeyError:
                sumdec[k] = v

    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_big = max(section_pos[c][1] - section_pos[c][0] for c in section_pos)
    # initialize dictionary
    ndiags = dict((k, 0) for k in xrange(len_big))
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant
            # from a border as the distance between the longest diagonal and
            # the current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b <= maxp:
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[0] += chr_size - len(thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    for k in sumdec:
        try:
            sumdec[k] /= ndiags[k]
        except ZeroDivisionError:  # all columns at this distance are "bad"
            pass

    return biases, sumdec, badcol
def read_bam(inbam, filter_exclude, resolution, min_count=2500, sigma=2,
             ncpus=8, factor=1, outdir='.', check_sum=False):
    """
    Compute normalization biases, decay and bad columns from a BAM file.

    The genome is cut into chunks of bins, each chunk is handed to
    ``read_bam_frag`` in a process pool, and the per-chunk pickles found in
    *outdir* (presumably written by ``read_bam_frag`` -- confirm) are then
    used to filter columns, rescale the biases and compute the decay.

    :param inbam: path to the BAM file
    :param filter_exclude: passed unchanged to ``read_bam_frag``
    :param resolution: size of a bin, in nucleotides
    :param 2500 min_count: bins with fewer interactions than this are
       flagged as bad columns; if null, ``filter_by_cis_percentage`` is
       used instead
    :param 2 sigma: passed to ``filter_by_cis_percentage``
    :param 8 ncpus: size of each process pool
    :param 1 factor: divides the target used to rescale the biases
    :param '.' outdir: directory holding the temporary pickles and the
       filtering plot
    :param False check_sum: recompute and print the sum of the normalized
       matrix once the biases are rescaled

    :returns: a dictionary of biases (one per bin), a dictionary of decay
       values (one per genomic distance, in bins) and the dictionary of
       bad columns
    """
    bamfile = AlignmentFile(inbam, 'rb')
    # number of bins of each chromosome
    sections = OrderedDict(zip(bamfile.references,
                               [x // resolution + 1 for x in bamfile.lengths]))
    bamfile.close()  # only the header was needed
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm] + 1
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm + 1)])

    start_bin = 0
    end_bin = len(bins) + 1
    total = end_bin - start_bin + 1

    # split the genome in chunks (a chunk never spans two chromosomes)
    regs = []
    begs = []
    ends = []
    njobs = min(total, 100) + 1
    nbins = total // njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop at the right place
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            # last nt not included (overlap with next window)
            ends.append(end2 * resolution + resolution - 1)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    printime('\n - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    for region, start, end in zip(regs, begs, ends):
        procs.append(pool.apply_async(
            read_bam_frag, args=(inbam, filter_exclude, bins, bins_dict,
                                 resolution, outdir, region, start, end,)))
    pool.close()
    print_progress(procs)
    pool.join()

    ## COLLECT RESULTS
    verbose = True
    cisprc = {}
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if verbose:
            if not countbin % 10 and countbin:
                sys.stdout.write(' ')
            if not countbin % 50 and countbin:
                sys.stdout.write(' %9s\n ' % ('%s/%s' % (countbin, len(regs))))
            sys.stdout.write('.')
            sys.stdout.flush()

        fname = os.path.join(outdir,
                             'tmp_bins_%s:%d-%d.pickle' % (region, start, end))
        # close each temporary pickle as soon as it is loaded
        with open(fname) as handle:
            tmp_cisprc = load(handle)
        cisprc.update(tmp_cisprc)
    if verbose:
        print('%s %9s\n' % (
            ' ' * (54 - (countbin % 50) - (countbin % 50) // 10),
            '%s/%s' % (len(regs), len(regs))))

    # bad columns
    print(' - Removing columns with too few or too much interactions')
    if not min_count:
        badcol = filter_by_cis_percentage(
            cisprc, sigma=sigma, verbose=True,
            # the plot goes *inside* outdir (the path used to be glued to
            # the directory name without separator)
            savefig=os.path.join(outdir, 'filtered_bins_%s.png' % (
                nicer(resolution).replace(' ', ''))))
    else:
        print(' -> too few interactions defined as less than %9d interactions' % (
            min_count))
        # keep only the second element of each entry
        for k in cisprc:
            cisprc[k] = cisprc[k][1]
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, 0) < min_count:
                badcol[c] = cisprc.get(c, 0)
                countL += 1
                if c not in cisprc:
                    countZ += 1
        print(' -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total,
            float(len(badcol)) / total * 100))

    def _map_chunks(func, *extra_args):
        """Run ``func(chunk_pickle, *extra_args)`` on every chunk in a pool."""
        chunk_pool = mu.Pool(ncpus)
        chunk_procs = []
        for region, start, end in zip(regs, begs, ends):
            fname = os.path.join(outdir, 'tmp_%s:%d-%d.pickle' % (
                region, start, end))
            chunk_procs.append(chunk_pool.apply_async(
                func, args=(fname,) + extra_args))
        chunk_pool.close()
        print_progress(chunk_procs)
        chunk_pool.join()
        return chunk_procs

    printime(' - Rescaling biases')
    size = len(bins)
    biases = [cisprc.get(k, 1.) for k in range(size)]
    mean_col = float(sum(biases)) / len(biases)
    biases = dict([(k, b / mean_col * mean_col**0.5)
                   for k, b in enumerate(biases)])

    # collect subset-matrices to correct biases
    sumnrm = sum(p.get() for p in _map_chunks(sum_nrm_matrix, biases))

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    # check the sum
    if check_sum:
        sumnrm = sum(p.get() for p in _map_chunks(sum_nrm_matrix, biases))
        print('SUM: %s' % (sumnrm,))

    printime(' - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equals to 1 in average)
    procs = _map_chunks(sum_dec_matrix, biases, badcol, bins)

    # collect results
    sumdec = {}
    for proc in procs:
        for k, v in proc.get().iteritems():
            try:
                sumdec[k] += v
            except KeyError:
                sumdec[k] = v

    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_big = max(section_pos[c][1] - section_pos[c][0] for c in section_pos)
    # initialize dictionary
    ndiags = dict((k, 0) for k in xrange(len_big))
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant
            # from a border as the distance between the longest diagonal and
            # the current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b <= maxp:
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[0] += chr_size - len(thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    for k in sumdec:
        try:
            sumdec[k] /= ndiags[k]
        except ZeroDivisionError:  # all columns at this distance are "bad"
            pass

    return biases, sumdec, badcol
def run(opts):
    """
    Normalize the reads of a BAM file and register the result.

    Steps, in order: validate the options, locate the BAM file (from
    ``opts.bam`` or from the database), gather the extra tracks needed by
    the 'oneD' normalization (mappability, GC content, number of
    restriction sites per bin), call ``read_bam``, optionally plot the
    interaction decay, pickle the biases and finally call ``save_to_db``.

    :param opts: options object; the attributes read here are ``bam``,
       ``workdir``, ``filter``, ``normalization``, ``fasta``, ``renz``,
       ``mappability``, ``reso``, ``cpus``, ``min_count``, ``min_perc``,
       ``max_perc``, ``normalize_only``, ``max_njobs``, ``badcols`` and
       ``filter_only``

    :raises Exception: if an input required by the 'oneD' normalization is
       missing, or if the chromosomes of the BAM are not all in the FASTA
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime(' - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([(' - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([(' - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                    len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception(
                "ERROR: chromosomes in FASTA different the ones in BAM")

        # get mappability ~2 min
        printime(' - Parsing mappability')
        # the context manager closes the track even if parsing fails
        with open(opts.mappability) as fh:
            mappability = dict((c, []) for c in refs)
            # each line holds: chromosome, begin, end, value
            line = next(fh)
            crmM, begM, endM, val = line.split()
            crm = crmM
            if crmM not in mappability:
                print(' skipping %s' % crmM)
                while crmM not in mappability:
                    line = next(fh)
                    crmM, begM, endM, val = line.split()
                    crm = crmM
            # NOTE(review): this loops until every chromosome in refs got at
            # least one value -- presumably the file covers all of them
            while any(not mappability[c] for c in mappability):
                for begB in xrange(0, len(genome[crmM]), opts.reso):
                    endB = begB + opts.reso
                    tmp = 0
                    try:
                        while True:
                            crmM, begM, endM, val = line.split()
                            if crm != crmM:
                                # chromosome changed: skip the ones that are
                                # not in the BAM, then close the current bin
                                try:
                                    while crmM not in refs:
                                        line = next(fh)
                                        crmM, _ = line.split('\t', 1)
                                except StopIteration:
                                    pass
                                break
                            begM = int(begM)
                            endM = int(endM)
                            if endM > endB:
                                # interval crosses the end of the bin: count
                                # only the part inside the bin
                                weight = endB - begM
                                if weight >= 0:
                                    tmp += weight * float(val)
                                break
                            weight = endM - (begM if begM > begB else begB)
                            if weight < 0:
                                break
                            tmp += weight * float(val)
                            line = next(fh)
                    except StopIteration:
                        pass
                    mappability[crm].append(tmp / opts.reso)
                    crm = crmM
        # flatten to a single list, chromosomes in BAM order
        mappability = [value for c in refs for value in mappability[c]]

        printime(' - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime(' - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        cg_content=gc_content, n_rsites=n_rsites, min_perc=opts.min_perc,
        max_perc=opts.max_perc, normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs, extra_bads=opts.badcols)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' % (
            opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime(' - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)

        print(' -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime(' - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    # binary mode for pickle data; closed even if ``dump`` raises
    with open(bias_file, 'wb') as out:
        dump({'biases': biases,
              'decay': decay,
              'badcol': badcol,
              'resolution': opts.reso}, out)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # deliberately catches everything (interrupts included): the lock
        # file must be released whatever stopped the save
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
def format_yticks(tickstring, _=None):
    """
    Convert a tick position into a readable genomic coordinate label.

    The position is scaled by ``reso`` and shifted by ``pltbeg2`` (both taken
    from the enclosing scope); a null coordinate is displayed as 1.

    :param tickstring: tick position, as given by matplotlib
    :param None _: ignored second argument of the formatter call

    :returns: the label built by ``nicer``
    """
    coordinate = int(tickstring * reso + pltbeg2)
    if not coordinate:
        coordinate = 1
    return nicer(coordinate, comma=',', allowed_decimals=1)
def plot_distance_vs_interactions(data, min_diff=1, max_diff=1000, show=False,
                                  genome_seq=None, resolution=None, axe=None,
                                  savefig=None, normalized=False):
    """
    Plot the interaction count against the genomic distance, and fit three
    consecutive straight lines to it in log-log space.

    :param data: input file name, or HiC_data object or list of lists
    :param 1 min_diff: lower limit (in number of bins)
    :param 1000 max_diff: upper limit (in number of bins) to look for
    :param False show: display the plot with the matplotlib GUI (only used
       when *savefig* is not given)
    :param None genome_seq: dictionary with the number of bins of each
       chromosome (only used when *data* is a list of lists)
    :param None resolution: group reads that are closer than this resolution
       parameter (defaults to 1)
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated
       (the extension of the file name will determine the desired format).
    :param False normalized: with a HiC_data object, divide each count by
       the biases of its two bins

    :returns: for each of the 3 fitted segments, a tuple with the slope, the
       intercept and the third value returned by ``linregress``.
       NOTE(review): when resolution is 1 the slopes and intercepts are
       those of the best split, but the third values are the ones left by
       the last split tried (the first one squared) -- confirm intent.
    """
    resolution = resolution or 1
    dist_intr = dict([(i, 0) for i in xrange(min_diff, max_diff)])
    if isinstance(data, str):
        # the context manager closes the file even if a line is malformed
        with open(data) as fhandler:
            line = next(fhandler)
            while line.startswith('#'):
                line = next(fhandler)
            try:
                while True:
                    _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
                    if cr1 != cr2:
                        line = next(fhandler)
                        continue
                    diff = abs(int(ps1) // resolution - int(ps2) // resolution)
                    if max_diff > diff >= min_diff:
                        dist_intr[diff] += 1
                    line = next(fhandler)
            except StopIteration:
                pass
    elif isinstance(data, HiC_data):
        if normalized:
            get_data = lambda x, y: data[x, y] / data.bias[x] / data.bias[y]
        else:
            get_data = lambda x, y: data[x, y]
        max_diff = min(len(data), max_diff)
        if data.section_pos:
            for crm in data.section_pos:
                for diff in xrange(min_diff, min(
                        (max_diff, 1 + data.chromosomes[crm]))):
                    for i in xrange(data.section_pos[crm][0],
                                    data.section_pos[crm][1] - diff):
                        dist_intr[diff] += get_data(i, i + diff)
        else:
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i, i + diff]):
                        # add the very cell that was just tested (the
                        # column used to be ``diff`` instead of ``i + diff``)
                        dist_intr[diff] += get_data(i, i + diff)
    else:
        if genome_seq:
            max_diff = min(max(genome_seq.values()), max_diff)
            cnt = 0
            for crm in genome_seq:
                for diff in xrange(min_diff, min(
                        (max_diff, genome_seq[crm]))):
                    for i in xrange(cnt, cnt + genome_seq[crm] - diff):
                        if not np.isnan(data[i][i + diff]):
                            dist_intr[diff] += data[i][i + diff]
                cnt += genome_seq[crm]
        else:
            max_diff = min(len(data), max_diff)
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i][i + diff]):
                        dist_intr[diff] += data[i][i + diff]
    if not axe:
        fig = plt.figure()
        axe = fig.add_subplot(111)
    # remove last part of the plot in case no interaction is count...
    # reduce max_dist
    for diff in xrange(max_diff - 1, min_diff, -1):
        try:
            if not dist_intr[diff]:
                del dist_intr[diff]
                max_diff -= 1
                continue
        except KeyError:
            max_diff -= 1
            continue
        break
    # keep only the distances with interactions (log of 0 is undefined)
    x = []
    y = []
    for distance, count in sorted(dist_intr.items()):
        if count:
            x.append(distance)
            y.append(count)
    axe.plot(x, y, 'k.')
    logx = np.log(x)
    logy = np.log(y)
    ntries = 100
    if resolution == 1:
        # no genomic scale: try many pairs of breakpoints and keep the one
        # with the highest sum of squared correlation
        best = (float('-inf'), 0, 0, 0, 0, 0, 0, 0, 0, 0)
        k = 1  # number of points skipped between two segments
        for i in xrange(3, ntries - 2 - k):
            v1 = i * len(x) // ntries
            try:
                a1, b1, r21, _, _ = linregress(logx[:v1], logy[:v1])
            except ValueError:
                a1 = b1 = r21 = 0
            r21 *= r21
            for j in xrange(i + 1 + k, ntries - 2 - k):
                v2 = j * len(x) // ntries
                try:
                    a2, b2, r22, _, _ = linregress(logx[v1 + k:v2],
                                                   logy[v1 + k:v2])
                    a3, b3, r23, _, _ = linregress(logx[v2 + k:],
                                                   logy[v2 + k:])
                except ValueError:
                    a2 = b2 = r22 = 0
                    a3 = b3 = r23 = 0
                r2 = r21 + r22**2 + r23**2
                if r2 > best[0]:
                    best = (r2, v1, v2, a1, a2, a3,
                            b1, b2, b3, k)
        (v1, v2,
         a1, a2, a3,
         b1, b2, b3, gap) = best[1:]
    else:
        # fixed breakpoints:
        # from 0.7 Mb
        v1 = 700000 // resolution
        # to 10 Mb
        v2 = 10000000 // resolution
        gap = 0
        try:
            a1, b1, r21, _, _ = linregress(logx[:v1], logy[:v1])
        except ValueError:
            a1, b1, r21 = 0, 0, 0
        try:
            a2, b2, r22, _, _ = linregress(logx[v1:v2], logy[v1:v2])
        except ValueError:
            a2, b2, r22 = 0, 0, 0
        try:
            a3, b3, r23, _, _ = linregress(logx[v2:], logy[v2:])
        except ValueError:
            a3, b3, r23 = 0, 0, 0
    # plot the line fitted on each of the three segments
    segments = (
        (slice(None, v1), a1, b1, 'yellow', r'0-0.7 \mathrm{ Mb}', '1'),
        (slice(v1 + gap, v2), a2, b2, 'orange', r'0.7-10 \mathrm{ Mb}', '2'),
        (slice(v2 + gap, None), a3, b3, 'red', r'10 \mathrm{ Mb}-\infty', '3'),
    )
    for span, slope, intercept, color, scale_label, rank_label in segments:
        xfit = x[span]
        axe.plot(xfit, np.exp(intercept + slope * np.array(np.log(xfit))),
                 color=color, lw=2,
                 label=r'$\alpha_{%s}=%.2f$' % (
                     scale_label if resolution != 1 else rank_label, slope))
    axe.set_ylabel('Log interaction count')
    axe.set_xlabel('Log genomic distance (resolution: %s)' % nicer(resolution))
    axe.legend(loc='lower left', frameon=False)
    axe.set_xscale('log')
    axe.set_yscale('log')
    axe.set_xlim((min_diff, max_diff))
    try:
        axe.set_ylim((0, max(y)))
    except ValueError:
        pass
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show == True:
        plt.show()
        plt.close('all')
    return (a1, b1, r21), (a2, b2, r22), (a3, b3, r23)
def read_bam(inbam, filter_exclude, resolution, biases, ncpus=8,
             region1=None, start1=None, end1=None,
             region2=None, start2=None, end2=None, outdir='.'):
    """
    Write the interaction matrices of a BAM file (or of a sub-region of it)
    as ``.abc`` text files in *outdir*.

    The requested bins are cut into chunks handed to ``read_bam_frag``; the
    per-chunk pickles found in *outdir* (presumably written by
    ``read_bam_frag`` -- confirm) are then read back, written as
    ``matrix_raw_*.abc`` and, when *biases* is not empty, also as
    ``matrix_nrm_*.abc`` and ``matrix_dec_*.abc``. Each temporary pickle is
    deleted once written.

    :param inbam: path to the BAM file
    :param filter_exclude: passed unchanged to ``read_bam_frag``
    :param resolution: size of a bin, in nucleotides
    :param biases: dictionary read through its keys 'biases', 'decay' and
       'badcol'; if empty only the raw matrix is written
    :param 8 ncpus: size of the process pool (with 1, chunks are parsed in
       the current process)
    :param None region1: chromosome name of region 1
    :param None start1: start genomic coordinate of region 1
    :param None end1: end genomic coordinate of region 1
    :param None region2: chromosome name of region 2
    :param None start2: start genomic coordinate of region 2
    :param None end2: end genomic coordinate of region 2
    :param '.' outdir: directory of the temporary pickles and of the output

    :raises Exception: if start1 or end1 is given without region1
    """
    bamfile = pysam.AlignmentFile(inbam, 'rb')
    # number of bins of each chromosome
    sections = OrderedDict(zip(bamfile.references,
                               [x // resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm] + 1
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm + 1)])

    start_bin = 0
    end_bin = len(bins) + 1
    if region1:
        regions = [region1]
        start_bin = [i for i, b in enumerate(bins) if b[0] == region1][0]
        end_bin = [i for i, b in enumerate(bins[start_bin:], start_bin)
                   if b[0] == region1][-1]
    else:
        regions = bamfile.references
        if start1 or end1:
            bamfile.close()
            raise Exception('ERROR: Cannot use start/end1 without region')
    bamfile.close()  # only the header was needed

    if start1:
        start_bin = section_pos[region1][0] + start1 // resolution
    else:
        start1 = 0
    if end1:
        end_bin = section_pos[region1][0] + end1 // resolution
    elif region1:
        end1 = (section_pos[region1][1] - section_pos[region1][0]) * resolution
    else:
        # whole-genome mode: there is no region to measure (the lookup of
        # ``section_pos[None]`` used to raise KeyError here)
        end1 = 0

    total = end_bin - start_bin + 1
    # split the bins in chunks (a chunk never spans two chromosomes)
    regs = []
    begs = []
    ends = []
    njobs = min(total, 100) + 1
    nbins = total // njobs + 1
    for i in xrange(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop at the right place
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[-1]
        if crm1 != crm2:
            fin1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(fin1 * resolution + resolution)  # last nt included
            # last nt not included (overlap with next window)
            ends.append(fin2 * resolution + resolution - 1)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(fin2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    # reduce dictionaries
    bins = []
    for crm in regions:
        beg_crm = section_pos[crm][0]
        if len(regions) == 1:
            start = start_bin - beg_crm
            end = end_bin - beg_crm
        else:
            start = 0
            end = section_pos[crm][1] - section_pos[crm][0] + 1
        bins.extend([(crm, i) for i in xrange(start, end)])
    bins_dict1 = dict([(j, i) for i, j in enumerate(bins)])
    if region2:
        beg_crm = section_pos[region2][0]
        if start2:
            start_bin2 = section_pos[region2][0] + start2 // resolution
            end_bin2 = section_pos[region2][0] + end2 // resolution
        else:
            start2 = 0
            start_bin2 = 0
            end_bin2 = section_pos[region2][1]
            end2 = sections[region2] * resolution
        start = start_bin2 - beg_crm
        end = end_bin2 - beg_crm
        bins = [(region2, i) for i in xrange(start, end)]
        bins_dict2 = dict([(j, i) for i, j in enumerate(bins)])
    else:
        bins_dict2 = bins_dict1
    pool = mu.Pool(ncpus)
    ## RUN!
    printime('\n - Parsing BAM (%d chunks)' % (len(regs)))
    procs = []
    for i, (region, b, e) in enumerate(zip(regs, begs, ends)):
        if ncpus == 1:
            read_bam_frag(inbam, filter_exclude,
                          bins_dict1, bins_dict2,
                          resolution, outdir, region, b, e,)
        else:
            procs.append(pool.apply_async(
                read_bam_frag, args=(inbam, filter_exclude,
                                     bins_dict1, bins_dict2,
                                     resolution, outdir, region, b, e,)))
    pool.close()
    print_progress(procs)
    pool.join()

    printime(' - Writing matrices')
    # biases and bad columns, re-indexed relative to the first requested bin
    bias1 = dict((k - start_bin, v)
                 for k, v in biases.get('biases', {}).iteritems()
                 if start_bin <= k <= end_bin)
    if region2:
        bias2 = dict((k - start_bin2, v)
                     for k, v in biases.get('biases', {}).iteritems()
                     if start_bin2 <= k <= end_bin2)
    else:
        bias2 = bias1
    decay = biases.get('decay', {})
    bads1 = dict((k - start_bin, v)
                 for k, v in biases.get('badcol', {}).iteritems()
                 if start_bin <= k <= end_bin)
    if region2:
        bads2 = dict((k - start_bin2, v)
                     for k, v in biases.get('badcol', {}).iteritems()
                     if start_bin2 <= k <= end_bin2)
    else:
        bads2 = bads1
    if len(regions) == 1:
        if region2:
            name = '%s:%d-%d_%s:%d-%d' % (
                region1, start1 // resolution, end1 // resolution,
                region2, start2 // resolution, end2 // resolution)
        else:
            name = '%s:%d-%d' % (
                region1, start1 // resolution, end1 // resolution)
    else:
        name = 'full'

    def _open_matrix(kind):
        """Open the output file of one kind of matrix and write its header."""
        out = open(os.path.join(outdir, 'matrix_%s_%s_%s.abc' % (
            kind, name, nicer(resolution).replace(' ', ''))), 'w')
        out.write('# %s resolution:%d\n' % (name, resolution))
        if region2:
            out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
            out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
        else:
            out.write('# BADS %s\n' % (','.join([str(b) for b in bads1])))
        return out

    out_raw = _open_matrix('raw')
    out_nrm = out_dec = None
    try:
        if biases:
            out_nrm = _open_matrix('nrm')
            out_dec = _open_matrix('dec')

        def write2matrix(a, b, c):
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))

        def write2matrices(a, b, c):
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))
            out_nrm.write('%d\t%d\t%f\n' % (a, b, c / bias1[a] / bias2[b]))
            out_dec.write('%d\t%d\t%f\n' % (
                a, b, c / bias1[a] / bias2[b] / decay[abs(a - b)]))

        def write2matrices_2reg(a, b, c):
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))
            out_nrm.write('%d\t%d\t%f\n' % (a, b, c / bias1[a] / bias2[b]))
            out_dec.write('%d\t%d\t%f\n' % (
                a, b, c / bias1[a] / bias2[b] /
                decay[abs((a + start_bin) - (b + start_bin2))]))

        def write2matrices_err(a, b, c):
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))
            out_nrm.write('%d\t%d\t%f\n' % (a, b, c / bias1[a] / bias2[b]))
            try:
                out_dec.write('%d\t%d\t%f\n' % (
                    a, b, c / bias1[a] / bias2[b] / decay[abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('%d\t%d\t%s\n' % (a, b, 'nan'))

        if biases:
            if len(regions) == 1:
                if region2:
                    write = write2matrices_2reg
                else:
                    write = write2matrices
            else:
                write = write2matrices_err
        else:
            write = write2matrix

        sys.stdout.write(' ')
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            if not i % 10 and i:
                sys.stdout.write(' ')
            if not i % 50 and i:
                sys.stdout.write(' %9s\n ' % ('%s/%s' % (i, len(regs))))
            sys.stdout.write('.')
            sys.stdout.flush()

            fname = os.path.join(outdir,
                                 'tmp_%s:%d-%d.pickle' % (region, start, end))
            with open(fname) as handle:
                dico = load(handle)
            for (j, k), v in dico.iteritems():
                if j in bads1 or k in bads2:
                    continue
                write(j, k, v)
            # best-effort removal, like ``rm -f``, but without building a
            # shell command out of a file name
            try:
                os.remove(fname)
            except OSError:
                pass
    finally:
        # close whatever was opened, even if a chunk failed
        for out in (out_raw, out_nrm, out_dec):
            if out is not None:
                out.close()
    print('%s %9s\n' % (' ' * (54 - (i % 50) - (i % 50) // 10),
                        '%s/%s' % (len(regs), len(regs))))
def __repr__(self):
    """
    One-line summary of the experiment: name, resolution, number of TADs
    (None when there are none), number of Hi-C rows and the normalization
    applied ('None' when no normalization is set).
    """
    label = self.name
    reso = nicer(self.resolution)
    n_tads = len(self.tads) or None
    n_rows = self.size
    norm = self._normalization or 'None'
    return 'Experiment %s (resolution: %s, TADs: %s, Hi-C rows: %s, normalized: %s)' % (
        label, reso, n_tads, n_rows, norm)
def read_bam(inbam, filter_exclude, resolution, biases, ncpus=8,
             region1=None, start1=None, end1=None, verbose=False,
             region2=None, start2=None, end2=None, outdir=None,
             tmpdir='/tmp/', normalized=False, by_decay=False,
             get_all_data=False, use_bads=False):
    """
    Extracts a (normalized) submatrix at wanted resolution from pseudo-BAM file

    :param inbam: path to pseudoBAM file
    :param filter_exclude: passed untouched to read_bam_frag
    :param resolution: size of the bins
    :param biases: dictionary with (optional) keys 'biases', 'decay' and
       'badcol' (it is accessed with ``biases.get``)
    :param 8 ncpus: number of worker processes (1 runs the chunks in-process)
    :param None region1: chromosome name of region 1
    :param None start1: start genomic coordinate of region 1
    :param None end1: end genomic coordinate of region 1
    :param None region2: chromosome name of region 2 (if not given use region1)
    :param None start2: start genomic coordinate of region 2 (if not given use
       region1)
    :param None end2: end genomic coordinate of region 2 (if not given use
       region1)
    :param None outdir: if given, matrices are written there as
       'matrix_raw/nrm/dec_*.abc' files instead of being accumulated in memory
    :param '/tmp/' tmpdir: where read_bam_frag leaves its temporary pickles
    :param False normalized: returns the dictionary of Vanilla normalized matrix
    :param False by_decay: returns the dictionary of Decay normalized matrix
       (by_decay option can not be used at the same time as normalized option,
       by_decay wins)
    :param False get_all_data: also return biases and bad columns
    :param False use_bads: when True the bad-column dictionaries are emptied,
       so no cell is skipped

    :returns: dictionary of interactions. If get_all_data is set to True,
       returns a dictionary with all biases used and bad columns (keys of
       the dictionary are: matrix, bias1, bias2, bads1, bads2, decay).
    """
    if outdir:
        mkdir(outdir)
    mkdir(tmpdir)

    bamfile = pysam.AlignmentFile(inbam, 'rb')
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    # cumulative (first, last) bin of each chromosome in genome-wide numbering
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm] + 1
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm + 1)])

    start_bin = 0
    end_bin = len(bins) + 1
    if region1:
        regions = [region1]
        start_bin = [i for i, b in enumerate(bins) if b[0] == region1][0]
        end_bin = [i for i, b in enumerate(bins[start_bin:], start_bin)
                   if b[0] == region1][-1]
    else:
        regions = bamfile.references
        if start1 or end1:
            raise Exception('ERROR: Cannot use start/end1 without region')

    if start1:
        start_bin = section_pos[region1][0] + start1 / resolution
    else:
        start1 = 0
    if end1:
        end_bin = section_pos[region1][0] + end1 / resolution
    elif region1:
        # only meaningful with a region: section_pos has no entry for None,
        # so the full-genome case must not index it
        end1 = (section_pos[region1][1] - section_pos[region1][0]) * resolution

    # split the wanted range of bins in chunks to be parsed in parallel
    total = end_bin - start_bin + 1
    regs = []
    begs = []
    ends = []
    njobs = min(total, 100) + 1
    nbins = total / njobs + 1
    for i in xrange(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop at the right place
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[-1]
        if crm1 != crm2:
            # chunk spans two chromosomes: cut it in two
            fin1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(fin1 * resolution + resolution)  # last nt included
            ends.append(fin2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(fin2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    # reduce dictionaries
    bins = []
    for crm in regions:
        beg_crm = section_pos[crm][0]
        if len(regions) == 1:
            start = start_bin - beg_crm
            end = end_bin - beg_crm
        else:
            start = 0
            end = section_pos[crm][1] - section_pos[crm][0] + 1
        bins.extend([(crm, i) for i in xrange(start, end)])
    bins_dict1 = dict([(j, i) for i, j in enumerate(bins)])
    if region2:
        beg_crm = section_pos[region2][0]
        if start2 is not None:
            start_bin2 = section_pos[region2][0] + start2 / resolution
            end_bin2 = section_pos[region2][0] + end2 / resolution
        else:
            start2 = 0
            start_bin2 = 0
            end_bin2 = section_pos[region2][1]
            end2 = sections[region2] * resolution
        start = start_bin2 - beg_crm
        end = end_bin2 - beg_crm
        bins = [(region2, i) for i in xrange(start, end)]
        bins_dict2 = dict([(j, i) for i, j in enumerate(bins)])
    else:
        bins_dict2 = bins_dict1

    pool = mu.Pool(ncpus)
    ## RUN!
    if verbose:
        printime('\n - Parsing BAM (%d chunks)' % (len(regs)))
    procs = []
    for region, b, e in zip(regs, begs, ends):
        if ncpus == 1:
            read_bam_frag(inbam, filter_exclude, bins_dict1, bins_dict2,
                          resolution, tmpdir, region, b, e,)
        else:
            procs.append(pool.apply_async(
                read_bam_frag, args=(inbam, filter_exclude,
                                     bins_dict1, bins_dict2,
                                     resolution, tmpdir, region, b, e,)))
    pool.close()
    if verbose:
        print_progress(procs)
    pool.join()

    if verbose:
        printime(' - Writing matrices')
    # keep only biases / bad columns inside the wanted window, re-indexed
    # from the start of the window
    bias1 = dict((k - start_bin, v)
                 for k, v in biases.get('biases', {}).iteritems()
                 if start_bin <= k < end_bin)
    if region2:
        bias2 = dict((k - start_bin2, v)
                     for k, v in biases.get('biases', {}).iteritems()
                     if start_bin2 <= k < end_bin2)
    else:
        bias2 = bias1
    decay = biases.get('decay', {})
    bads1 = dict((k - start_bin, v)
                 for k, v in biases.get('badcol', {}).iteritems()
                 if start_bin <= k < end_bin)
    if region2:
        bads2 = dict((k - start_bin2, v)
                     for k, v in biases.get('badcol', {}).iteritems()
                     if start_bin2 <= k < end_bin2)
    else:
        bads2 = bads1
    if use_bads:
        bads2 = bads1 = {}
    # hic_data = HiC_data((), len(bins_dict), sections,
    #                     bins_dict, resolution=resolution)
    if region1:  # region1 set implies len(regions) == 1
        if region2:
            name = '%s:%d-%d_%s:%d-%d' % (region1, start1 / resolution,
                                          end1 / resolution,
                                          region2, start2 / resolution,
                                          end2 / resolution)
        else:
            name = '%s:%d-%d' % (region1, start1 / resolution,
                                 end1 / resolution)
    else:
        name = 'full'

    if outdir:
        out_raw = open(os.path.join(outdir, 'matrix_raw_%s_%s.abc' % (
            name, nicer(resolution).replace(' ', ''))), 'w')
        out_raw.write('# %s resolution:%d\n' % (name, resolution))
        if region2:
            out_raw.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
            out_raw.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
        else:
            out_raw.write('# BADS %s\n' % (','.join([str(b) for b in bads1])))
        if biases:
            out_nrm = open(os.path.join(outdir, 'matrix_nrm_%s_%s.abc' % (
                name, nicer(resolution).replace(' ', ''))), 'w')
            out_nrm.write('# %s resolution:%d\n' % (name, resolution))
            if region2:
                out_nrm.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
                out_nrm.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
            else:
                out_nrm.write('# BADS %s\n' % (','.join([str(b) for b in bads1])))
            out_dec = open(os.path.join(outdir, 'matrix_dec_%s_%s.abc' % (
                name, nicer(resolution).replace(' ', ''))), 'w')
            out_dec.write('# %s resolution:%d\n' % (name, resolution))
            if region2:
                out_dec.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
                out_dec.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
            else:
                out_dec.write('# BADS %s\n' % (','.join([str(b) for b in bads1])))

        def write2matrix(a, b, c):
            # raw counts only
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))

        def write2matrices(a, b, c):
            # single region: decay indexed by distance between the two bins
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))
            out_nrm.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b])))
            out_dec.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b] *
                                                       decay[abs(a-b)])))

        def write2matrices_2reg(a, b, c):
            # two regions: distance computed in genome-wide bin numbering
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))
            out_nrm.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b])))
            out_dec.write('%d\t%d\t%f\n' % (
                a, b, c / (bias1[a] * bias2[b] *
                           decay[abs((a + start_bin) - (b + start_bin2))])))

        def write2matrices_err(a, b, c):
            # several chromosomes: no decay value for some distances
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))
            out_nrm.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b])))
            try:
                out_dec.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b] *
                                                           decay[abs(a-b)])))
            except KeyError:  # different chromsomes
                out_dec.write('%d\t%d\t%s\n' % (a, b, 'nan'))

        if biases:
            if len(regions) == 1:
                if region2:
                    write = write2matrices_2reg
                else:
                    write = write2matrices
            else:
                write = write2matrices_err
        else:
            write = write2matrix

    if verbose:
        sys.stdout.write(' ')
    dico = {}
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if not i % 10 and i:
            if verbose:
                sys.stdout.write(' ')
        if not i % 50 and i:
            if verbose:
                sys.stdout.write(' %9s\n ' % ('%s/%s' % (i, len(regs))))
        if verbose:
            sys.stdout.write('.')
            sys.stdout.flush()

        fname = os.path.join(tmpdir, 'tmp_%s:%d-%d.pickle' % (region, start, end))
        # NOTE(review): pickle is only safe because these files are written
        # by read_bam_frag itself into tmpdir
        with open(fname) as handle:
            chunk = load(handle)
        if outdir:
            dico = chunk
            for (j, k), v in dico.iteritems():
                if j in bads1 or k in bads2:
                    continue
                write(j, k, v)
        else:
            dico.update(chunk)
        try:
            os.remove(fname)
        except OSError:
            pass  # same tolerance as the former `rm -f`
    if outdir:
        out_raw.close()
        if biases:
            out_nrm.close()
            out_dec.close()
    if verbose:
        print('%s %9s\n' % (' ' * (54 - (i % 50) - (i % 50) / 10),
                            '%s/%s' % (len(regs), len(regs))))

    if normalized and by_decay:
        warn('WARNING: choose either normalized or by_decay. Using decay normalization')
    if not outdir:
        if by_decay:
            if region2:
                for i, j in dico:
                    if i in bads1 or j in bads2:
                        continue
                    try:
                        dico[(i, j)] /= (bias1[i] * bias2[j] *
                                         decay[abs((i + start_bin) -
                                                   (j + start_bin2))])
                    except KeyError:
                        dico[(i, j)] = float('nan')  # no value in decay
            else:
                for i, j in dico:
                    dico[(i, j)] /= bias1[i] * bias2[j] * decay[abs(i - j)]
        elif normalized:
            for i, j in dico:
                dico[(i, j)] /= bias1[i] * bias2[j]
    if get_all_data:
        return {'matrix': dico,
                'bias1': bias1,
                'bias2': bias2,
                'bads1': bads1,
                'bads2': bads2,
                'decay': decay}
    return dico
def plot_filtering(nears, ratio, size, cut_count, cut_ratio, outfile,
                   base_position=None, next_position=None, last_position=None,
                   resolution=1, legend=''):
    """
    Scatter plot of the per-bin interaction ratio (x axis, from `ratio`)
    against the per-bin interaction sum (y axis, from `nears`) for bin
    indexes 0..size-1, with the area below `cut_ratio` / `cut_count` shaded
    in red. Both axes are cut at the 95th percentile of their values and
    the figure is saved to `outfile`.

    NOTE(review): base_position, next_position and last_position default to
    None but are multiplied by `resolution` to build the x label, so they
    are effectively required.
    """
    # fewer, fainter points as the number of bins grows
    if size > 50_000:
        marker_size = 1
    elif size > 20_000:
        marker_size = 2
    else:
        marker_size = 3
    if size > 500_000:
        opacity = 0.01
    elif size > 200_000:
        opacity = 0.05
    elif size > 50_000:
        opacity = 0.1
    elif size > 20_000:
        opacity = 0.2
    else:
        opacity = 0.3

    plt.figure(figsize=(8.5, 5.5))
    axe = plt.subplot()
    axe.set_position((0.12, 0.1, 0.55, 0.8))

    x_values = [ratio.get(idx, 0) for idx in range(size)]
    y_values = [nears.get(idx, 0) for idx in range(size)]
    points = plt.plot(x_values, y_values, 'k.', ms=marker_size, alpha=opacity)

    top = np.percentile(list(nears.values()), 95)
    plt.ylim(0, top)
    right = np.percentile(list(ratio.values()), 95)
    plt.xlim(0, right)

    shade_kwargs = dict(color='tab:red', alpha=0.4, lw=0)
    shaded = plt.fill_between([0, cut_ratio], top, **shade_kwargs)
    plt.fill_betweenx([0, cut_count], cut_ratio, right, **shade_kwargs)

    y_label = 'interactions per {} bin'.format(nicer(resolution))
    plt.ylabel(y_label, size=12)
    x_label = 'interaction ratio between {0}-{1} and {1}-{2}'.format(
        nicer(resolution * base_position),
        nicer(resolution * next_position),
        nicer(resolution * last_position))
    plt.xlabel(x_label, size=12)

    plt.text(right, cut_count, 'Minimum sum: {}'.format(cut_count),
             ha='right', va='bottom', size=11)
    plt.text(cut_ratio, top,
             'Minimum cis/trans ratio: {}'.format(cut_ratio),
             ha='left', va='top', size=11, rotation=90)

    heading = 'Distribution of interaction\nsums and cis/trans ratio by {} bin'.format(
        nicer(resolution))
    plt.title(heading, size=13)

    handles = points + [shaded]
    labels = ['{} bin'.format(nicer(resolution)),
              'Filtered space:\n low ratio or count']
    plt.legend(handles, labels, bbox_to_anchor=(1, 0.9), frameon=False,
               fontsize=10, markerscale=4, title=legend, title_fontsize=11)
    plt.savefig(outfile)
def run(opts):
    """
    Entry point of the modeling tool: checks options, loads the Hi-C data,
    prepares the '06_model' output folder, runs the optimization and/or the
    modeling, and records the job in the database through save_to_db.

    When opts.job_list is set a 'job_list_*.q' file is opened and handed to
    optimization / big_run; it is closed here once they return.
    """
    check_options(opts)

    launch_time = time.localtime()

    print(''' %s%s - Region: Chromosome %s from %d to %d at resolution %s (%d particles) ''' % (
        'Preparing ' if opts.job_list else '',
        ('Optimization\n' + '*' * (21 if opts.job_list else 11))
        if opts.optimize else
        ('Modeling\n' + '*' * (18 if opts.job_list else 8)),
        opts.crm, opts.ori_beg, opts.ori_end, nicer(opts.reso),
        opts.end - opts.beg))

    # load data
    if opts.matrix:
        crm = load_hic_data(opts)
    else:
        # FIXME: copied from somewhere else
        # NOTE(review): this branch builds `hic_data` but never defines `crm`,
        # so the `crm.experiments[0]` lookup below raises NameError when
        # opts.matrix is not set.
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        hic_data = load_hic_data_from_reads(mreads, reso)
        hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
        hic_data.bias = dict(
            (int(l.split()[0]), float(l.split()[1])) for l in open(biases))

    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # prepare output folders
    batch_job_hash = digest_parameters(
        opts, get_md5=True,
        extra=['maxdist', 'upfreq', 'lowfreq', 'scale', 'dcutoff',
               'nmodels_run', 'job_list', 'rand', 'nmodels', 'nkeep',
               'optimize', 'optimization_id', 'cpus', 'workdir', 'matrix',
               'ori_beg', 'ori_end'])

    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(
        opts.workdir, '06_model',
        '%s_chr%s_%s-%s' % (batch_job_hash, opts.crm, opts.beg, opts.end))
    mkdir(outdir)

    # in case we are not going to run
    if opts.job_list:
        job_file_handler = open(
            path.join(outdir, 'job_list_%s.q' % (
                'optimization' if opts.optimize else 'modeling')), 'w')
    else:
        job_file_handler = None

    try:
        ###############
        # Optimization
        print(' o Optimizing parameters')
        if opts.optimize:
            optimization(exp, opts, job_file_handler, outdir)
            finish_time = time.localtime()
            print('\n optimization done')

        # correlate all optimization and get best set of parameters
        if not (opts.optimize and opts.job_list):
            optpar, results = correlate_models(opts, outdir, exp)
        else:
            results = []

        ###########
        # Modeling
        if not opts.optimize:
            big_run(exp, opts, job_file_handler, outdir, optpar)
            finish_time = time.localtime()
    finally:
        # make sure the job list reaches the disk even if a step fails
        if job_file_handler is not None:
            job_file_handler.close()

    # save all job information to sqlite DB
    save_to_db(opts, outdir, results, batch_job_hash,
               launch_time, finish_time)
def run(opts):
    """
    Entry point of the import tool: reads a Hi-C matrix in 'matrix', 'text'
    or 'cooler' format, optionally restricted to opts.coord1
    ('chr' or 'chr:start-end'), wraps it in a HiC_data object, writes it as
    a BAM file with create_BAMhic and records the job with save_to_db.

    For cooler input holding more than one distinct weight, the weights are
    also saved as a biases pickle under '04_normalization'.

    :raises Exception: when chromosome sizes are missing from the input,
       the input is not a cooler, or the matrix size does not match the
       requested region.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    coord1 = opts.coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            # no (valid) 'start-end' part: take the whole chromosome
            region1 = coord1
            start1 = None
            end1 = None

    # resolution handed to the cooler helpers (None when opts.reso <= 1)
    cooler_reso = opts.reso if opts.reso > 1 else None

    printime('Importing hic in %s format' % opts.format)
    if opts.format == 'matrix' or opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            masked, chroms_gen, crm, beg, _, _ = read_file_header(f_thing)
        if not chroms_gen or (region1 and region1 not in chroms_gen):
            raise Exception(
                '''ERROR: Chromosome size not included in import file. Please include the chromosome sizes of the data that you want to import in the header of the file. Example: # CRM chr1 249250621''')
    elif opts.format == 'cooler':
        if is_cooler(opts.input, cooler_reso):
            chroms_gen = parse_header(opts.input, cooler_reso)
            if not chroms_gen or (region1 and region1 not in chroms_gen):
                raise Exception(
                    '''ERROR: Chromosome size not included in import file. ''')
        else:
            raise Exception('''ERROR: The input file is not a cooler''')

    # number of bins per chromosome
    chroms = OrderedDict(
        (crm, int(chroms_gen[crm] // opts.reso) + 1) for crm in chroms_gen)
    sections = []
    if not region1:
        size = 0
        for crm in chroms:
            size += chroms[crm]
            sections.extend([(crm, i) for i in range(chroms[crm])])
    elif not start1:
        size = chroms[region1]
        sections.extend([(region1, i) for i in range(size)])
    else:
        #size = (end1 - start1)//opts.reso
        size = chroms[region1]
        sections.extend([
            (region1, i)
            for i in range(start1 // opts.reso, (end1 // opts.reso))
        ])
    dict_sec = dict([(j, i) for i, j in enumerate(sections)])

    bias_file = None
    badcol = {}
    if opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            matrix = abc_reader(f_thing, size,
                                start1 // opts.reso if start1 else None)
        size_mat = size
    elif opts.format == 'matrix':
        with gzopen(opts.input) as in_f:
            matrix, size_mat, _, masked, _ = autoreader(in_f)
        if size != size_mat:
            raise Exception('''ERROR: The size of the specified region is different from the data in the matrix''')
    elif opts.format == 'cooler':
        matrix, weights, size, header = parse_cooler(
            opts.input, cooler_reso, normalized=True, raw_values=True)
        masked = {}
        size_mat = size
        if len(set(weights)) > 1:
            printime('Transforming cooler weights to biases')
            outdir_norm = path.join(opts.workdir, '04_normalization')
            mkdir(outdir_norm)
            bias_file = path.join(
                outdir_norm, 'biases_%s_%s.pickle' %
                (nicer(opts.reso).replace(' ', ''), param_hash))
            # columns with a null weight are flagged as bad
            badcol.update((i, True) for i, m in enumerate(weights) if m == 0)
            # `with` guarantees the handle is closed even if dump fails
            with open(bias_file, 'wb') as out:
                dump(
                    {
                        'biases': dict((k, b if b > 0 else float('nan'))
                                       for k, b in enumerate(weights)),
                        'decay': {},
                        'badcol': badcol,
                        'resolution': opts.reso
                    }, out, HIGHEST_PROTOCOL)

    hic = HiC_data(matrix, size_mat, dict_sec=dict_sec, chromosomes=chroms,
                   masked=masked, resolution=opts.reso)

    #from pytadbit.mapping.analyze import hic_map
    #hic_map(hic, normalized=False, focus='chr1', show=True, cmap='viridis')

    printime('Creating BAM file')
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    total_counts = create_BAMhic(hic, opts.cpus, outbam, chroms_gen,
                                 opts.reso, samtools=opts.samtools)

    finish_time = time.localtime()
    # save all job information to sqlite DB
    save_to_db(opts, total_counts, size_mat, bias_file, len(badcol),
               outbam + '.bam', launch_time, finish_time)
def load_parameters_fromdb(opts):
    """
    Look up, in the job database, the biases file and the input reads file
    of a normalization job.

    The database is opts.tmpdb when set, otherwise 'trace.db' inside
    opts.workdir. The job is opts.jobid when given; otherwise the first
    'Normalize' job (falling back to the first 'Filter' job, then to job 1),
    and when several candidate jobs exist, the one matching opts.reso.

    All values are passed to SQLite as bound parameters, never interpolated
    into the SQL text.

    :returns: tuple (biases path, reads path, biases path id, reads path id)
    :raises Exception: if no normalization is found, or if several
       normalizations match and no jobid was given.
    """
    if 'tmpdb' in opts and opts.tmpdb:
        dbfile = opts.tmpdb
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    try:
        with con:
            cur = con.cursor()
            if not opts.jobid:
                # get the JOBid of the parsing job
                try:
                    cur.execute("""
                    select distinct Id from JOBs
                    where Type = 'Normalize'
                    """)
                    jobids = cur.fetchall()
                    parse_jobid = jobids[0][0]
                except IndexError:
                    cur.execute("""
                    select distinct Id from JOBs
                    where Type = 'Filter'
                    """)
                    jobids = cur.fetchall()
                    try:
                        parse_jobid = jobids[0][0]
                    except IndexError:
                        parse_jobid = 1
                if len(jobids) > 1:
                    # several candidates: pick the one at the wanted resolution
                    cur.execute("""
                    select distinct JOBid from NORMALIZE_OUTPUTs
                    where Resolution = ?
                    """, (opts.reso,))
                    jobs = cur.fetchall()
                    try:
                        parse_jobid = jobs[0][0]
                    except IndexError:
                        raise Exception('ERROR: no normalization found at %s' % (
                            nicer(opts.reso)))
                    if len(jobs) > 1:
                        raise Exception('ERROR: more than one possible input found, use '
                                        '"tadbit describe" and select corresponding '
                                        'jobid with --jobid')
            else:
                parse_jobid = opts.jobid
            # fetch path to BAM files
            biases = mreads = reso = None
            try:
                cur.execute("""
                select distinct Path, PATHs.id from PATHs
                where paths.jobid = ? and paths.Type = 'BIASES'
                """, (parse_jobid,))
                biases, biases_id = cur.fetchall()[0]

                cur.execute("""
                select distinct Path, PATHs.id from PATHs
                inner join NORMALIZE_OUTPUTs on PATHs.Id = NORMALIZE_OUTPUTs.Input
                where NORMALIZE_OUTPUTs.JOBid = ?;
                """, (parse_jobid,))
                mreads, mreads_id = cur.fetchall()[0]

                cur.execute("""
                select distinct Resolution from NORMALIZE_OUTPUTs
                where NORMALIZE_OUTPUTs.JOBid = ?;
                """, (parse_jobid,))
                reso = int(cur.fetchall()[0][0])
                if reso != opts.reso:
                    warn('WARNING: input resolution does not match '
                         'the one of the precomputed normalization')
            except IndexError:
                raise Exception('ERROR: normalization not found')
    finally:
        # `with con` only commits / rolls back, it does not close
        con.close()
    return biases, mreads, biases_id, mreads_id
def run(opts):
    """
    Entry point of the normalization tool: computes biases, decay and bad
    columns from a BAM file with read_bam, optionally plots the interaction
    decay, saves everything in a pickle under '04_normalization' and
    records the job with save_to_db.

    For the 'oneD' normalization, mappability, GC content and number of
    restriction sites per bin are computed first, which requires
    opts.fasta, opts.renz and opts.mappability.

    :raises Exception: on missing 'oneD' inputs, or when the chromosomes of
       the BAM are not all present in the FASTA.
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime(' - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([(' - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([(' - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime(' - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chomosomes
        for c in refs:
            # number of bins of this chromosome: based on its sequence
            # length (the number of chromosomes, len(refs), was used before)
            n_bins = len(genome[c]) // opts.reso + 1
            if c not in mappability:
                mappability[c] = [float('nan')] * n_bins
            if len(mappability[c]) < n_bins:
                mappability[c] += [float('nan')] * (n_bins - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime(' - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # pad mappability at the end if the size is close to gc_content
        if len(mappability) < len(gc_content) and len(mappability) / len(gc_content) > 0.95:
            mappability += [float('nan')] * (len(gc_content) - len(mappability))

        # compute r_sites ~30 sec
        # TODO: read from DB
        printime(' - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in range(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path,
        cis_limit=opts.cis_limit, trans_limit=opts.trans_limit,
        min_ratio=opts.ratio_limit, fast_filter=opts.fast_filter)

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.png_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime(' - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)

        print (' -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime(' - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    # `with` guarantees the handle is closed even if dump fails
    with open(bias_file, 'wb') as out:
        dump({'biases': biases,
              'decay': decay,
              'badcol': badcol,
              'resolution': opts.reso}, out, HIGHEST_PROTOCOL)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except BaseException:
        # release lock anyway (deliberately catches everything, including
        # KeyboardInterrupt, so that the DB lock file is never left behind)
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
def format_yticks(tickstring, _=None):
    """
    Tick formatter: converts a tick position into the value
    `tickstring * opts.reso + pltbeg2` (truncated to int, with 0 shown
    as 1) and formats it with nicer(..., coma=True). The second argument
    is ignored.
    """
    position = int(tickstring * opts.reso + pltbeg2)
    if not position:
        position = 1
    return nicer(position, coma=True)
def parse_mappability_bedGraph(fname, resolution, wanted_chrom=None,
                               save_cache=True, reload_cache=False):
    """
    parse BEDgraph containing mappability.
    GEM mappability file obtained with:

         gem-indexer -i hg38.fa -o hg38
         gem-mappability -I hg38.gem -l 50 -o hg38.50mer -T 8
         gem-2-wig -I hg38.gem -i hg38.50mer.mappability -o hg38.50mer
         wigToBigWig hg38.50mer.wig hg38.50mer.sizes hg38.50mer.bw
         bigWigToBedGraph hg38.50mer.bw  hg38.50mer.bedGraph

    :param fname: path to BED file with mappability
    :param resolution: to bin the resulting dictionary
    :param wanted_chrom: in case only one chromosome is needed
    :param True save_cache: save a cached version of this file for faster
       loadings (depends on the resolution)
    :param False reload_cache: reload cached genome

    :returns: a dictionary with chromosomes as keys, with average mappability
       per bin.
    """
    tadbit_fname = fname + '_mappability_%s.TADbit' % (nicer(resolution, sep=''))
    # NOTE(review): the cache name does not depend on wanted_chrom, so a
    # cache written for a single chromosome is reused for any later call.
    if path.exists(tadbit_fname) and not reload_cache:
        def read_line(line):
            fields = line.split()
            # a chromosome saved with no bin has no second column
            elements = fields[1].split(',') if len(fields) > 1 else []
            return fields[0], [float(e) for e in elements]
        with open(tadbit_fname) as cache:
            return dict(read_line(l) for l in cache if l.strip())

    with open(fname) as fh:
        line = next(fh)
        crmM, begM, endM, val = line.split()
        crm = crmM
        if wanted_chrom:
            if crmM != wanted_chrom:
                print(' skipping %s' % crmM)
                while crmM != wanted_chrom:
                    line = next(fh)
                    crmM, begM, endM, val = line.split()
                    crm = crmM
        mappability = {}
        mappability[crm] = []
        begB = 0
        while True:  # one iteration per bin
            endB = begB + resolution
            tmp = 0
            try:
                while True:  # one iteration per bedGraph interval in the bin
                    crmM, begM, endM, val = line.split()
                    if crm != crmM:
                        # reached next chromosome: restart bins from 0
                        mappability[crmM] = []
                        begB = -resolution
                        if wanted_chrom:
                            raise StopIteration
                        break
                    endM = int(endM)
                    if endM > endB:
                        # interval crosses the end of the bin: count the part
                        # inside, keep the line for the next bin
                        weight = endB - int(begM)
                        if weight >= 0:
                            tmp += weight * float(val)
                        break
                    begM = int(begM)
                    weight = endM - (begM if begM > begB else begB)
                    if weight < 0:
                        break
                    tmp += weight * float(val)
                    line = next(fh)
            except StopIteration:
                # end of file (or end of the wanted chromosome)
                mappability[crm].append(tmp / resolution)
                break
            mappability[crm].append(tmp / resolution)
            crm = crmM
            begB += resolution
    if save_cache:
        print(" saving mappabilty to cache...")
        with open(tadbit_fname, 'w') as out:
            for crm in mappability:
                out.write(crm + '\t' + ','.join(map(str, mappability[crm])) + '\n')
    return mappability
def run(opts):
    # Entry point of the binning tool: extracts Hi-C (sub-)matrices from a BAM
    # file for each requested normalization, optionally writes them as text
    # matrices and/or plots them, then records the job with save_to_db.
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    # color scale limits given as 'min,max'
    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    # an explicit biases file always wins over the one found in the DB
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    # a single coordinate is always treated as the first one
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        # coordinates are either 'chr:start-end' or just 'chr'
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    # describe what is going to be extracted (unless quiet)
    if region1:
        # NOTE(review): the nested test below repeats the outer one
        if region1:
            if not opts.quiet:
                stdout.write('\nExtraction of %s' % (region1))
            if start1:
                if not opts.quiet:
                    stdout.write(':%s-%s' % (start1, end1))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)')
            if region2:
                if not opts.quiet:
                    stdout.write(' intersection with %s' % (region2))
                if start2:
                    if not opts.quiet:
                        stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    if not opts.quiet:
                        stdout.write(' (full chromosome)\n')
            else:
                if not opts.quiet:
                    stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        # chromosome lengths (in nucleotides) and their cumulative positions
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(
            zip(bamfile.references, [x for x in bamfile.lengths]))
        total = 0
        section_pos = dict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else
                           'NRM' if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            # NOTE(review): the biases pickle is re-opened for every
            # normalization and the file handle is never closed explicitly
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm, region1=region1, start1=start1,
                    end1=end1, region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus, return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet, clean=clean)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemeted for '
                         'matrices\n... skipping\n')
                    continue
                raise
            # shift bin coordinates so that both axes start at 0
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                # generator of (chromosome, first nt, last nt) for each bin
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions)
                             for p in range(
                                 starts[r] if r < len(starts) and starts[r] else 0,
                                 ends[r] if r < len(ends) and ends[r] else sections[reg],
                                 opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name, nicer(
                    opts.reso).replace(' ', ''), ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                # header: chromosome sizes, then bad rows/columns
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' %
                              (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' %
                              (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' %
                              (','.join([str(b) for b in bads1])))
                # one text line per j, missing cells written as 0
                if opts.row_names:
                    # NOTE(review): generator.next() only exists in Python 2
                    out.write('\n'.join('%s\t%d\t%d\t' % (row_names.next()) +
                                        '\t'.join(
                                            str(matrix.get((i, j), 0))
                                            for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join('\t'.join(
                        str(matrix.get((i, j), 0)) for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                cmap = plt.get_cmap(opts.cmap)
                if norm != 'raw':
                    cmap.set_bad('grey', 1.)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s.%s' % (norm, name, nicer(opts.reso).replace(
                    ' ', ''), ('_' + param_hash), opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                if opts.interactive:
                    _ = plt.figure(figsize=(8, 7))
                else:
                    _ = plt.figure(figsize=(16, 14))
                # ax1 = plt.subplot(111)
                # ax1 holds the matrix, ax2 the color bar / histogram
                ax1 = plt.axes([0.1, 0.1, 0.7, 0.8])
                ax2 = plt.axes([0.82, 0.1, 0.07, 0.8])
                # from here on `matrix` is a dense array, no longer a dict
                matrix = array([
                    array([matrix.get((i, j), 0) for i in xrange(b1, e1)])
                    for j in xrange(b2, e2)
                ])
                # empty cells get half of the smallest non-zero value so that
                # log2 is defined everywhere
                mini = np_min(matrix[nonzero(matrix)]) / 2.
                matrix[matrix == 0] = mini
                # mask of bad columns / rows
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = log2(ma.masked_array(matrix, m))
                ax1.imshow(matrix, interpolation='None', origin='lower',
                           cmap=cmap, vmin=vmin, vmax=vmax)
                if len(regions) <= 2:
                    # one or two regions: axes labelled with genomic coordinates
                    pltbeg1 = 0 if start1 is None else start1
                    pltend1 = sections[regions[0]] if end1 is None else end1
                    pltbeg2 = pltbeg1 if len(
                        regions) == 1 else 0 if start2 is None else start2
                    pltend2 = pltend1 if len(regions) == 1 else sections[
                        regions[-1]] if end2 is None else end2

                    ax1.set_xlabel('{}:{:,}-{:,}'.format(
                        regions[0], pltbeg1 if pltbeg1 else 1, pltend1))
                    ax1.set_ylabel('{}:{:,}-{:,}'.format(
                        regions[-1], pltbeg2 if pltbeg2 else 1, pltend2))

                    def format_xticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg1)
                        return nicer(tickstring if tickstring else 1, coma=True)

                    def format_yticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg2)
                        return nicer(tickstring if tickstring else 1, coma=True)

                    ax1.xaxis.set_major_formatter(FuncFormatter(format_xticks))
                    ax1.yaxis.set_major_formatter(FuncFormatter(format_yticks))

                    labels = ax1.get_xticklabels()
                    plt.setp(labels, rotation=-25, ha='left')

                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                else:
                    # more regions: major ticks at chromosome borders, names
                    # as minor tick labels in between
                    vals = [0]
                    keys = ['']
                    for crm in regions:
                        vals.append(section_pos[crm][0] / opts.reso)
                        keys.append(crm)
                    vals.append(section_pos[crm][1] / opts.reso)
                    ax1.set_yticks(vals)
                    ax1.set_yticklabels('')
                    ax1.set_yticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in xrange(len(vals) - 1)
                    ], minor=True)
                    ax1.set_yticklabels(keys, minor=True)
                    for t in ax1.yaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False

                    ax1.set_xticks(vals)
                    ax1.set_xticklabels('')
                    ax1.set_xticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in xrange(len(vals) - 1)
                    ], minor=True)
                    ax1.set_xticklabels(keys, minor=True)
                    for t in ax1.xaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xlabel('Chromosomes')
                    ax1.set_ylabel('Chromosomes')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                # side panel: color gradient with the histogram of values
                data = [i for d in matrix for i in d if isfinite(i)]
                mindata = nanmin(data)
                maxdata = nanmax(data)
                gradient = linspace(maxdata, mindata,
                                    max((len(matrix), len(matrix[0]))))
                gradient = dstack((gradient, gradient))[0]
                # NOTE(review): `normed` is an old matplotlib keyword
                # (replaced by `density` in recent versions) - confirm the
                # supported matplotlib version
                h = ax2.hist(data, color='darkgrey', linewidth=2,
                             orientation='horizontal',
                             bins=50, histtype='step', normed=True)
                _ = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                               extent=(0, max(h[0]), mindata, maxdata))
                ax2.yaxis.tick_right()
                ax2.yaxis.set_label_position("right")
                ax2.set_xticks([])
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' %
                              (name, norm, nicer(opts.reso)))
                ax2.set_ylabel('Hi-C Log2 interactions', rotation=-90)
                ax2.set_xlabel('Count')
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    # no matrix file requested and not plot-only: delegate to write_matrix
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(
            write_matrix(mreads, opts.reso,
                         load(open(biases)) if biases else None,
                         outdir, filter_exclude=opts.filter,
                         normalizations=opts.normalizations,
                         region1=region1, start1=start1, end1=end1,
                         region2=region2, start2=start2, end2=end2,
                         tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
                         nchunks=opts.nchunks, verbose=not opts.quiet,
                         extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        # NOTE(review): tmpdir is interpolated in a shell command
        system('rm -rf %s ' % tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
def write_matrix(inbam, resolution, biases, outdir,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 normalizations=('decay', ),
                 region1=None, start1=None, end1=None, clean=True,
                 region2=None, start2=None, end2=None, extra='',
                 half_matrix=True, nchunks=100, tmpdir='.',
                 append_to_tar=None, ncpus=8, cooler=False, verbose=True):
    """
    Writes matrix file from a BAM file containing interacting reads.

    The matrix will be extracted from the genomic BAM, the genomic coordinates
    of this matrix will be at the intersection of two regions defined by the
    parameters region1, start1, end1 and region2, start2, end2. If only one
    region is wanted, the second coordinate can be skipped.

    :param inbam: path to BAM file (generated by TADbit)
    :param resolution: resolution at which we want to write the matrix
    :param biases: biases, forwarded unchanged to get_biases_region. Can be
       empty/None only when normalizations is ('raw', ).
       NOTE(review): the `run` caller in this file passes the unpickled
       content of the biases file rather than a path -- confirm.
    :param outdir: path to a folder where to write output files
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter_exclude: filters to define the
       set of valid pair of reads (converted with filters_to_bin unless it is
       already an integer).
    :param ('decay',) normalizations: tuple with normalizations to use, can be
       'decay', 'norm', 'raw' or/and 'raw&decay'. A single string or a list
       is also accepted.
    :param None region1: chromosome name of the first region from which to
       extract the matrix
    :param None start1: start coordinate of the first region from which to
       extract the matrix
    :param None end1: end coordinate of the first region from which to
       extract the matrix
    :param True clean: remove the temporary folder once matrices are written
       (also forwarded to _iter_matrix_frags)
    :param None region2: chromosome name of the second region from which to
       extract the matrix
    :param None start2: start coordinate of the second region from which to
       extract the matrix
    :param None end2: end coordinate of the second region from which to
       extract the matrix
    :param '' extra: if not empty, appended (after an underscore) to the name
       of the output files
    :param True half_matrix: writes only half of the matrix (and the
       diagonal). Ignored (forced to False) when region2 is given.
    :param 100 nchunks: maximum number of chunks into which to cut the BAM
    :param '.' tmpdir: where to write temporary files
    :param None append_to_tar: path to a TAR file were generated matrices will
       be written directly
    :param 8 ncpus: number of cpus to use to read the BAM file
    :param False cooler: write '.mcool' files through cooler_file instead of
       '.abc' text files ('decay' and 'raw&decay' are refused in this mode)
    :param True verbose: speak

    :returns: dictionary of output file paths, with keys among 'RAW', 'NRM',
       'DEC' and 'RAW&DEC' (empty when append_to_tar is used)
    """
    # local import: only needed for the final clean-up of temporary files
    from shutil import rmtree

    if start1 is not None and end1:
        if end1 - start1 < resolution:
            raise Exception(
                'ERROR: region1 should be at least as big as resolution')
    if start2 is not None and end2:
        if end2 - start2 < resolution:
            raise Exception(
                'ERROR: region2 should be at least as big as resolution')

    if isinstance(normalizations, list):
        normalizations = tuple(normalizations)
    elif isinstance(normalizations, str):
        normalizations = tuple([normalizations])

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    regions, rand_hash, bin_coords, chunks = read_bam(
        inbam, filter_exclude, resolution, ncpus=ncpus,
        region1=region1, start1=start1, end1=end1,
        region2=region2, start2=start2, end2=end2,
        tmpdir=tmpdir, nchunks=nchunks, verbose=verbose)

    if region1:
        regions = [region1]
        if region2:
            regions.append(region2)

    # only the header of the BAM is needed here: close it as soon as read
    bamfile = AlignmentFile(inbam, 'rb')
    try:
        sections = OrderedDict(zip(bamfile.references, bamfile.lengths))
    finally:
        bamfile.close()

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(
            biases, bin_coords)
    elif normalizations != ('raw', ):
        raise Exception(
            'ERROR: should provide path to file with biases (pickle).')
    else:
        bads1 = bads2 = {}

    start_bin1, start_bin2 = bin_coords[::2]
    if verbose:
        printime(' - Writing matrices')
    # define output file name
    name = _generate_name(regions, (start1, start2), (end1, end2), resolution)
    reso_tag = nicer(resolution).replace(' ', '')
    extra_tag = ('_' + extra) if extra else ''

    outfiles = []

    def _open_abc(prefix):
        # create one '.abc' output (file on disk, or in-memory buffer when
        # writing to a TAR), register it in outfiles and write its header
        fnam = '%s_%s_%s%s.abc' % (prefix, name, reso_tag, extra_tag)
        if append_to_tar:
            out = StringIO()
            outfiles.append((out, fnam))
        else:
            out = open(os.path.join(outdir, fnam), 'w')
            outfiles.append((os.path.join(outdir, fnam), fnam))
        for reg in regions:
            out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
        out.write('# %s resolution:%d\n' % (name, resolution))
        if region2:
            out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
            out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
        else:
            out.write('# MASKED %s\n' % (','.join([str(b) for b in bads1])))
        return out

    # prepare output files and their headers
    if cooler:
        if 'h5py' not in modules:
            raise Exception(
                'ERROR: cooler output is not available. Probably '
                'you need to install h5py\n')
        if 'decay' in normalizations or 'raw&decay' in normalizations:
            raise Exception(
                'ERROR: decay and raw&decay matrices cannot be exported '
                'to cooler format. Cooler only accepts weights per column/row')
        fnam = 'raw_%s_%s%s.mcool' % (name, reso_tag, extra_tag)
        if os.path.exists(os.path.join(outdir, fnam)):
            os.remove(os.path.join(outdir, fnam))
        out_raw = cooler_file(os.path.join(outdir, fnam), resolution,
                              sections, regions)
        out_raw.create_bins()
        out_raw.prepare_matrix(start_bin1, start_bin2)
        outfiles.append((os.path.join(outdir, fnam), fnam))
    else:
        if 'raw' in normalizations:
            out_raw = _open_abc('raw')
        if 'norm' in normalizations:
            out_nrm = _open_abc('nrm')
        if 'decay' in normalizations or 'raw&decay' in normalizations:
            out_dec = _open_abc('dec')

    # functions to write lines of pairwise interactions: each factory
    # returns a writer that first calls the previously selected writer
    # (func), if any, so that a single call feeds every requested output
    def write_raw(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_raw.write('{}\t{}\t{}\n'.format(a, b, v))

        def writer(_, a, b, v):
            out_raw.write('{}\t{}\t{}\n'.format(a, b, v))
        return writer2 if func else writer

    def write_bias(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_nrm.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b]))

        def writer(_, a, b, v):
            out_nrm.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b]))
        return writer2 if func else writer

    def write_expc(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_dec.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))

        def writer(c, a, b, v):
            out_dec.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
        return writer2 if func else writer

    def write_expc_2reg(func=None):
        # a and b are relative to their own region: shift them by the first
        # bin of each region to get the distance used to index decay
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_dec.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b] /
                decay[c][abs((a + start_bin1) - (b + start_bin2))]))

        def writer(c, a, b, v):
            out_dec.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b] /
                decay[c][abs((a + start_bin1) - (b + start_bin2))]))
        return writer2 if func else writer

    def write_expc_err(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            try:
                out_dec.write('{}\t{}\t{}\n'.format(
                    a, b, v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\t{}\n'.format(a, b, 'nan'))

        def writer(c, a, b, v):
            try:
                out_dec.write('{}\t{}\t{}\n'.format(
                    a, b, v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\t{}\n'.format(a, b, 'nan'))
        return writer2 if func else writer

    def write_raw_and_expc(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            try:
                out_dec.write('{}\t{}\t{}\t{}\n'.format(
                    a, b, v,
                    v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\t{}\t{}\n'.format(
                    a, b, v, v / bias1[a] / bias2[b]))

        def writer(c, a, b, v):
            try:
                out_dec.write('{}\t{}\t{}\t{}\n'.format(
                    a, b, v,
                    v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\t{}\t{}\n'.format(
                    a, b, v, v / bias1[a] / bias2[b]))
        return writer2 if func else writer

    write = None
    if 'raw' in normalizations:
        write = write_raw(write)
    if 'norm' in normalizations and not cooler:
        write = write_bias(write)
    if 'decay' in normalizations and not cooler:
        if len(regions) == 1:
            if region2:
                write = write_expc_2reg(write)
            else:
                write = write_expc(write)
        else:
            write = write_expc_err(write)
    if 'raw&decay' in normalizations and not cooler:
        write = write_raw_and_expc(write)

    # pull all sub-matrices and write full matrix
    if region2 is not None:  # already half-matrix in this case
        half_matrix = False

    if cooler:
        for ichunk, c, j, k, v in _iter_matrix_frags(
                chunks, tmpdir, rand_hash, verbose=verbose, clean=clean,
                include_chunk_count=True):
            if j > k:
                continue
            if j not in bads1 and k not in bads2:
                out_raw.write_iter(ichunk, j, k, v)
        out_raw.close()
    else:
        if half_matrix:
            for c, j, k, v in _iter_matrix_frags(
                    chunks, tmpdir, rand_hash, verbose=verbose, clean=clean):
                if k > j:
                    continue
                if j not in bads1 and k not in bads2:
                    write(c, j, k, v)
        else:
            for c, j, k, v in _iter_matrix_frags(
                    chunks, tmpdir, rand_hash, verbose=verbose, clean=clean):
                if j not in bads1 and k not in bads2:
                    write(c, j, k, v)

    fnames = {}
    if append_to_tar:
        lock = LockFile(append_to_tar)
        with lock:
            archive = taropen(append_to_tar, "a:")
            try:
                for fobj, fnam in outfiles:
                    fobj.seek(0)
                    info = archive.tarinfo(name=fnam)
                    info.size = len(fobj.buf)
                    archive.addfile(tarinfo=info, fileobj=fobj)
            finally:
                # never leave the archive open, even if a member fails
                archive.close()
    else:
        if cooler:
            fnames['RAW'] = out_raw.name
            if 'norm' in normalizations:
                # the normalized cooler is a copy of the raw one to which
                # per-row/per-column weights are added
                fnam = 'nrm_%s_%s%s.mcool' % (name, reso_tag, extra_tag)
                copyfile(outfiles[0][0], os.path.join(outdir, fnam))
                out_nrm = cooler_file(os.path.join(outdir, fnam), resolution,
                                      sections, regions)
                bias_data_row = [1. / b if b > 0 else 0 for b in bias1]
                bias_data_col = [1. / b if b > 0 else 0 for b in bias2]
                out_nrm.write_weights(bias_data_row, bias_data_col,
                                      *bin_coords)
                outfiles.append((os.path.join(outdir, fnam), fnam))
                fnames['NRM'] = os.path.join(outdir, fnam)
        else:
            if 'raw' in normalizations:
                out_raw.close()
                fnames['RAW'] = out_raw.name
            if 'norm' in normalizations:
                out_nrm.close()
                fnames['NRM'] = out_nrm.name
            if 'decay' in normalizations:
                out_dec.close()
                fnames['DEC'] = out_dec.name
            if 'raw&decay' in normalizations:
                out_dec.close()
                fnames['RAW&DEC'] = out_dec.name

    # this is the last thing we do in case something goes wrong
    if clean:
        # no shell involved: safe with spaces or special characters in paths
        rmtree(os.path.join(tmpdir, '_tmp_%s' % (rand_hash)),
               ignore_errors=True)

    return fnames
def write_matrix(inbam, resolution, biases, outdir,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 normalizations=('decay',),
                 region1=None, start1=None, end1=None, clean=True,
                 region2=None, start2=None, end2=None, extra='',
                 half_matrix=True, nchunks=None, tmpdir='.',
                 append_to_tar=None, ncpus=8, verbose=True):
    """
    Writes matrix file from a BAM file containing interacting reads.

    The matrix will be extracted from the genomic BAM, the genomic coordinates
    of this matrix will be at the intersection of two regions defined by the
    parameters region1, start1, end1 and region2, start2, end2. If only one
    region is wanted, the second coordinate can be skipped.

    :param inbam: path to BAM file (generated by TADbit)
    :param resolution: resolution at which we want to write the matrix
    :param biases: biases, forwarded unchanged to get_biases_region. Can be
       empty/None only when normalizations is ('raw', ).
       NOTE(review): the `run` caller in this file passes the unpickled
       content of the biases file rather than a path -- confirm.
    :param outdir: path to a folder where to write output files
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter_exclude: filters to define the
       set of valid pair of reads (converted with filters_to_bin unless it is
       already an integer).
    :param ('decay',) normalizations: tuple with normalizations to use, can be
       'decay', 'norm', 'raw' or/and 'raw&decay'. A single string or a list
       is also accepted.
    :param None region1: chromosome name of the first region from which to
       extract the matrix
    :param None start1: start coordinate of the first region from which to
       extract the matrix
    :param None end1: end coordinate of the first region from which to
       extract the matrix
    :param True clean: remove the temporary folder once matrices are written
       (also forwarded to _iter_matrix_frags)
    :param None region2: chromosome name of the second region from which to
       extract the matrix
    :param None start2: start coordinate of the second region from which to
       extract the matrix
    :param None end2: end coordinate of the second region from which to
       extract the matrix
    :param '' extra: if not empty, appended (after an underscore) to the name
       of the output files
    :param True half_matrix: writes only half of the matrix (and the
       diagonal). Ignored (forced to False) when region2 is given.
    :param None nchunks: maximum number of chunks into which to cut the BAM
    :param '.' tmpdir: where to write temporary files
    :param None append_to_tar: path to a TAR file were generated matrices will
       be written directly
    :param 8 ncpus: number of cpus to use to read the BAM file
    :param True verbose: speak

    :returns: dictionary of output file paths, with keys among 'RAW', 'NRM',
       'DEC' and 'RAW&DEC' (empty when append_to_tar is used)
    """
    # local import: only needed for the final clean-up of temporary files
    from shutil import rmtree

    if start1 is not None and end1:
        if end1 - start1 < resolution:
            raise Exception(
                'ERROR: region1 should be at least as big as resolution')
    if start2 is not None and end2:
        if end2 - start2 < resolution:
            raise Exception(
                'ERROR: region2 should be at least as big as resolution')

    if isinstance(normalizations, list):
        normalizations = tuple(normalizations)
    elif isinstance(normalizations, str):
        normalizations = tuple([normalizations])

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    regions, rand_hash, bin_coords, chunks = read_bam(
        inbam, filter_exclude, resolution, ncpus=ncpus,
        region1=region1, start1=start1, end1=end1,
        region2=region2, start2=start2, end2=end2,
        tmpdir=tmpdir, nchunks=nchunks, verbose=verbose)

    if region1:
        regions = [region1]
        if region2:
            regions.append(region2)

    # only the header of the BAM is needed here: close it as soon as read
    bamfile = AlignmentFile(inbam, 'rb')
    try:
        sections = OrderedDict(zip(bamfile.references, bamfile.lengths))
    finally:
        bamfile.close()

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(
            biases, bin_coords)
    elif normalizations != ('raw', ):
        raise Exception(
            'ERROR: should provide path to file with biases (pickle).')
    else:
        bads1 = bads2 = {}

    start_bin1, start_bin2 = bin_coords[::2]
    if verbose:
        printime(' - Writing matrices')
    # define output file name
    name = _generate_name(regions, (start1, start2), (end1, end2), resolution)
    reso_tag = nicer(resolution).replace(' ', '')
    extra_tag = ('_' + extra) if extra else ''

    outfiles = []

    def _open_abc(prefix):
        # create one '.abc' output (file on disk, or in-memory buffer when
        # writing to a TAR), register it in outfiles and write its header
        fnam = '%s_%s_%s%s.abc' % (prefix, name, reso_tag, extra_tag)
        if append_to_tar:
            out = StringIO()
            outfiles.append((out, fnam))
        else:
            out = open(os.path.join(outdir, fnam), 'w')
            outfiles.append((os.path.join(outdir, fnam), fnam))
        for reg in regions:
            out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
        out.write('# %s resolution:%d\n' % (name, resolution))
        if region2:
            out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
            out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
        else:
            out.write('# MASKED %s\n' % (','.join([str(b) for b in bads1])))
        return out

    # prepare output files and their headers
    if 'raw' in normalizations:
        out_raw = _open_abc('raw')
    if 'norm' in normalizations:
        out_nrm = _open_abc('nrm')
    if 'decay' in normalizations or 'raw&decay' in normalizations:
        out_dec = _open_abc('dec')

    # functions to write lines of pairwise interactions: each factory
    # returns a writer that first calls the previously selected writer
    # (func), if any, so that a single call feeds every requested output
    def write_raw(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_raw.write('{}\t{}\t{}\n'.format(a, b, v))

        def writer(_, a, b, v):
            out_raw.write('{}\t{}\t{}\n'.format(a, b, v))
        return writer2 if func else writer

    def write_bias(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_nrm.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b]))

        def writer(_, a, b, v):
            out_nrm.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b]))
        return writer2 if func else writer

    def write_expc(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_dec.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))

        def writer(c, a, b, v):
            out_dec.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
        return writer2 if func else writer

    def write_expc_2reg(func=None):
        # a and b are relative to their own region: shift them by the first
        # bin of each region to get the distance used to index decay
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_dec.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b] /
                decay[c][abs((a + start_bin1) - (b + start_bin2))]))

        def writer(c, a, b, v):
            out_dec.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b] /
                decay[c][abs((a + start_bin1) - (b + start_bin2))]))
        return writer2 if func else writer

    def write_expc_err(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            try:
                out_dec.write('{}\t{}\t{}\n'.format(
                    a, b, v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\t{}\n'.format(a, b, 'nan'))

        def writer(c, a, b, v):
            try:
                out_dec.write('{}\t{}\t{}\n'.format(
                    a, b, v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\t{}\n'.format(a, b, 'nan'))
        return writer2 if func else writer

    def write_raw_and_expc(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            try:
                out_dec.write('{}\t{}\t{}\t{}\n'.format(
                    a, b, v,
                    v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\t{}\t{}\n'.format(
                    a, b, v, v / bias1[a] / bias2[b]))

        def writer(c, a, b, v):
            try:
                out_dec.write('{}\t{}\t{}\t{}\n'.format(
                    a, b, v,
                    v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\t{}\t{}\n'.format(
                    a, b, v, v / bias1[a] / bias2[b]))
        return writer2 if func else writer

    write = None
    if 'raw' in normalizations:
        write = write_raw(write)
    if 'norm' in normalizations:
        write = write_bias(write)
    if 'decay' in normalizations:
        if len(regions) == 1:
            if region2:
                write = write_expc_2reg(write)
            else:
                write = write_expc(write)
        else:
            write = write_expc_err(write)
    if 'raw&decay' in normalizations:
        write = write_raw_and_expc(write)

    # pull all sub-matrices and write full matrix
    if region2 is not None:  # already half-matrix in this case
        half_matrix = False

    if half_matrix:
        for c, j, k, v in _iter_matrix_frags(
                chunks, tmpdir, rand_hash, verbose=verbose, clean=clean):
            if k > j:
                continue
            if j not in bads1 and k not in bads2:
                write(c, j, k, v)
    else:
        for c, j, k, v in _iter_matrix_frags(
                chunks, tmpdir, rand_hash, verbose=verbose, clean=clean):
            if j not in bads1 and k not in bads2:
                write(c, j, k, v)

    fnames = {}
    if append_to_tar:
        lock = LockFile(append_to_tar)
        with lock:
            archive = taropen(append_to_tar, "a:")
            try:
                for fobj, fnam in outfiles:
                    fobj.seek(0)
                    info = archive.tarinfo(name=fnam)
                    info.size = len(fobj.buf)
                    archive.addfile(tarinfo=info, fileobj=fobj)
            finally:
                # never leave the archive open, even if a member fails
                archive.close()
    else:
        if 'raw' in normalizations:
            out_raw.close()
            fnames['RAW'] = out_raw.name
        if 'norm' in normalizations:
            out_nrm.close()
            fnames['NRM'] = out_nrm.name
        if 'decay' in normalizations:
            out_dec.close()
            fnames['DEC'] = out_dec.name
        if 'raw&decay' in normalizations:
            out_dec.close()
            fnames['RAW&DEC'] = out_dec.name

    # this is the last thing we do in case something goes wrong
    if clean:
        # no shell involved: safe with spaces or special characters in paths
        rmtree(os.path.join(tmpdir, '_tmp_%s' % (rand_hash)),
               ignore_errors=True)

    return fnames
def run(opts):
    """
    Extracts sub-matrices from a BAM file and writes and/or plots them.

    Depending on the options, matrices are obtained with get_matrix (to be
    written as '.mat' files and/or plotted) or written directly by
    write_matrix. Output goes to the '05_sub-matrices' folder of the working
    directory, and the job is registered with save_to_db unless running in
    interactive mode.

    :param opts: parsed command line options (validated by check_options)
    """
    # local import: only needed for the final clean-up of temporary files
    from shutil import rmtree

    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    # figure size is independent from the color range: vmin/vmax parsed
    # from zrange must be left untouched whether figsize is given or not
    if opts.figsize:
        opts.figsize = map(float, opts.figsize.split(','))

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    def _load_biases():
        # load the biases file, making sure the file handle is released
        with open(biases) as handler:
            return load(handler)

    def _parse_coord(coord):
        # 'chrom:start-end' -> (chrom, start, end); anything else is taken
        # as a full chromosome name -> (coord, None, None)
        try:
            crm, pos = coord.split(':')
            beg, end = pos.split('-')
            return crm, int(beg), int(end)
        except ValueError:
            return coord, None, None

    coord1 = opts.coord1
    coord2 = opts.coord2

    # a single coordinate is always handled as the first one
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    region1 = start1 = end1 = None
    region2 = start2 = end2 = None
    if coord1:
        region1, start1, end1 = _parse_coord(coord1)
        if coord2:
            region2, start2, end2 = _parse_coord(coord2)

    if opts.plot and not opts.force_plot:
        if opts.interactive:
            max_size = 1500**2
        else:
            max_size = 5000**2
    else:
        max_size = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            if start1:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        # only the header of the BAM is needed here: close it once read
        bamfile = AlignmentFile(mreads, 'rb')
        try:
            sections = OrderedDict(zip(bamfile.references, bamfile.lengths))
        finally:
            bamfile.close()
        # cumulative (begin, end) position of each chromosome
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else 'NRM'
                           if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    _load_biases() if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            # shift bin coordinates so that each axis starts at zero
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = (
                    (reg, p + 1, p + opts.reso)
                    for r, reg in enumerate(regions)
                    for p in range(
                        starts[r] if r < len(starts) and starts[r] else 0,
                        ends[r] if r < len(ends) and ends[r]
                        else sections[reg],
                        opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                with open(path.join(outdir, fnam), 'w') as out:
                    for reg in regions:
                        out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                    if region2:
                        out.write('# BADROWS %s\n' % (
                            ','.join([str(b) for b in bads1])))
                        out.write('# BADCOLS %s\n' % (
                            ','.join([str(b) for b in bads2])))
                    else:
                        out.write('# MASKED %s\n' % (
                            ','.join([str(b) for b in bads1])))
                    if opts.row_names:
                        out.write('\n'.join(
                            '%s\t%d\t%d\t' % (next(row_names)) +
                            '\t'.join(str(matrix.get((i, j), 0))
                                      for i in xrange(b1, e1))
                            for j in xrange(b2, e2)) + '\n')
                    else:
                        out.write('\n'.join(
                            '\t'.join(str(matrix.get((i, j), 0))
                                      for i in xrange(b1, e1))
                            for j in xrange(b2, e2)) + '\n')
            if opts.plot:
                # transform matrix
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)])
                # mask filtered columns and rows
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    norm, name, nicer(opts.reso, sep=''),
                    ('_' + param_hash), '_tri' if opts.triangular else '',
                    opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                # keep only the positions of the chromosomes being plotted
                section_pos = OrderedDict((k, section_pos[k])
                                          for k in section_pos
                                          if k in regions)
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular,
                    vmin=vmin, vmax=vmax, cmap=opts.cmap,
                    figsize=opts.figsize,
                    bad_color=opts.bad_color if norm != 'raw' else None)
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' % (
                    name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))

    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            _load_biases() if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        # no shell involved: safe with spaces or special characters in paths
        rmtree(tmpdir, ignore_errors=True)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
def run(opts):
    """
    Computes normalization biases from a BAM file and saves them.

    For the 'oneD' normalization the genome sequence, the mappability, the
    GC content and the number of restriction sites per bin are computed
    first and passed to read_bam. Resulting biases, decay and filtered
    columns are pickled in the '04_normalization' folder of the working
    directory and the job is registered with save_to_db.

    :param opts: parsed command line options (validated by check_options)
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime(' - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([(' - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([(' - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                    len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime(' - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs)==1 else None)
        # resize chromosomes: each one must span as many bins as its own
        # sequence length requires (not the number of chromosomes),
        # otherwise the concatenated vector is shifted for every
        # chromosome that follows a missing or truncated one
        for c in refs:
            nbins = len(genome[c]) // opts.reso + 1
            if c not in mappability:
                mappability[c] = [float('nan')] * nbins
            if len(mappability[c]) < nbins:
                mappability[c] += [float('nan')] * (
                    nbins - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime(' - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime(' - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos-200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' % (
            opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime(' - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)

        print(' -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime(' - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    # HIGHEST_PROTOCOL is a binary format: the file has to be opened in
    # binary mode, and is closed even if dumping fails
    with open(bias_file, 'wb') as out:
        dump({'biases' : biases,
              'decay' : decay,
              'badcol' : badcol,
              'resolution': opts.reso}, out, HIGHEST_PROTOCOL)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases),
                   raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # deliberately catch everything (interruptions included): the
        # database lock has to be released anyway before exiting
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
def _apply_to_chunks(func, fnames, extra_args, ncpus):
    """
    Call *func* once per temporary chunk file, in a pool of worker processes.

    :param func: function called as ``func(fname, *extra_args)``
    :param fnames: list of paths to the temporary chunk files
    :param extra_args: tuple of arguments appended after the file name
    :param ncpus: number of worker processes in the pool

    :returns: the list of asynchronous results, in the same order as *fnames*
       (the pool has been closed and joined when this function returns)
    """
    pool = mu.Pool(ncpus)
    procs = [pool.apply_async(func, args=(fname,) + tuple(extra_args))
             for fname in fnames]
    pool.close()
    print_progress(procs)
    pool.join()
    return procs


def read_bam(inbam, filter_exclude, resolution, min_count=2500,
             normalization='Vanilla', mappability=None, n_rsites=None,
             cg_content=None, sigma=2, ncpus=8, factor=1, outdir='.',
             extra_out='', only_valid=False, normalize_only=False,
             max_njobs=100, min_perc=None, max_perc=None, extra_bads=None):
    """
    Compute per-bin biases, the interaction decay per chromosome and the
    cis-percentage from a BAM file. The genome is split in chunks that are
    processed by a pool of worker processes.

    :param inbam: path to the BAM file (opened with ``AlignmentFile``)
    :param filter_exclude: passed unchanged to the function reading each chunk
    :param resolution: size of the bins, in nucleotides
    :param 2500 min_count: bins with a total number of interactions below
       this value are marked as bad columns. If None, bad columns are selected
       with ``filter_by_cis_percentage`` (needs more than one chromosome)
    :param 'Vanilla' normalization: either 'Vanilla' or 'oneD'
    :param None mappability: one value per bin; bins with a null value are
       marked as bad columns. Also an input of the 'oneD' normalization
    :param None n_rsites: one value per bin, input of the 'oneD' normalization
    :param None cg_content: one value per bin, input of the 'oneD'
       normalization
    :param 2 sigma: passed to ``filter_by_cis_percentage``
    :param 8 ncpus: number of worker processes
    :param 1 factor: the sum of the normalized matrix is divided by
       ``size * size * factor`` (size being the number of bins) before taking
       the square root used to rescale the biases
    :param '.' outdir: directory holding the temporary chunk files, and where
       the plot of filtered bins is saved
    :param '' extra_out: suffix used in the name of temporary and output files
    :param False only_valid: read chunks with ``read_bam_frag_valid`` instead
       of ``read_bam_frag_filter``
    :param False normalize_only: skip the computation of the cis-percentage
       after normalization (0. is returned instead)
    :param 100 max_njobs: used to decide in how many chunks the genome is
       split
    :param None min_perc: passed to ``filter_by_cis_percentage``
    :param None max_perc: passed to ``filter_by_cis_percentage``
    :param None extra_bads: list of regions, formatted as
       'chromosome:begin-end' (in nucleotides), which bins are added to the bad
       columns

    :returns: a tuple with: the biases (dictionary with bin index as key), the
       decay (dictionary by chromosome of dictionaries by distance in bins),
       the bad columns (dictionary with bin index as key), the mean raw
       cis-ratio of good columns and the cis-ratio after normalization

    :raises Exception: if ``min_count`` is None and the BAM file has a single
       chromosome, or if inputs of the oneD normalization differ in size
    :raises NotImplementedError: if ``normalization`` is not known
    """
    # local import, only this function needs it
    from os import remove as remove_file

    bamfile = AlignmentFile(inbam, 'rb')
    # number of bins per chromosome
    sections = OrderedDict(
        zip(bamfile.references,
            [x // resolution + 1 for x in bamfile.lengths]))
    # genomic bin index of the first bin of each chromosome, and of the first
    # bin after its last one (end is exclusive)
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])

    start_bin = 0
    end_bin = len(bins)
    total = len(bins)

    # define the genomic regions (chunks) to be processed independently
    # NOTE(review): a chunk starting and ending in different chromosomes only
    # produces regions for these two chromosomes; if a chunk spans more than
    # two chromosomes the ones in between look skipped -- to be confirmed.
    regs = []
    begs = []
    ends = []
    njobs = min(total, max_njobs) + 1
    nbins = total // njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            try:
                (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
            except IndexError:
                break
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            # last nt not included (overlap with next window)
            ends.append(end2 * resolution + resolution - 1)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included
    chunks = list(zip(regs, begs, ends))

    # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)])
    printime(' - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    read_bam_frag = read_bam_frag_valid if only_valid else read_bam_frag_filter
    for region, start, end in chunks:
        procs.append(
            pool.apply_async(read_bam_frag, args=(
                inbam, filter_exclude, bins, bins_dict, resolution, outdir,
                extra_out, region, start, end,)))
    pool.close()
    print_progress(procs)
    pool.join()

    ## COLLECT RESULTS
    cisprc = {}
    printime(' - Collecting cis and total interactions per bin (%d chunks)' % (
        len(regs)))
    stdout.write(' ')
    for countbin, (region, start, end) in enumerate(chunks):
        if not countbin % 10 and countbin:
            stdout.write(' ')
        if not countbin % 50 and countbin:
            stdout.write(' %9s\n ' % ('%s/%s' % (countbin, len(regs))))
        stdout.write('.')
        stdout.flush()

        fname = path.join(
            outdir,
            'tmp_bins_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        # pickle is only safe here because these temporary files are expected
        # to be generated by this same program (never load untrusted pickles)
        with open(fname, 'rb') as handle:
            tmp_cisprc = load(handle)
        # no shell involved: chromosome names are part of the file name and
        # may contain characters interpreted by the shell
        try:
            remove_file(fname)
        except OSError:  # best effort, as with "rm -f"
            pass
        cisprc.update(tmp_cisprc)
    stdout.write('\n')

    printime(' - Removing columns with too few or too much interactions')
    if len(bamfile.references) == 1 and min_count is None:
        raise Exception("ERROR: only one chromosome can't filter by "
                        "cis-percentage, set min_count instead")
    elif min_count is None and len(bamfile.references) > 1:
        badcol = filter_by_cis_percentage(
            cisprc, sigma=sigma, verbose=True, min_perc=min_perc,
            max_perc=max_perc, size=total,
            savefig=path.join(
                outdir, 'filtered_bins_%s_%s.png' % (
                    nicer(resolution).replace(' ', ''), extra_out)))
    else:
        print((' -> too few interactions defined as less than %9d '
               'interactions') % (min_count))
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, [0, 0])[1] < min_count:
                badcol[c] = cisprc.get(c, [0, 0])[1]
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print(' -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total,
            float(len(badcol)) / total * 100))

    # no mappability will result in NaNs, better to filter out these columns
    if mappability:
        badcol.update((i, True) for i, m in enumerate(mappability) if not m)

    # add manually columns to bad columns
    if extra_bads:
        removed_manually = 0
        for ebc in extra_bads:
            c, ebc = ebc.split(':')
            b, e = map(int, ebc.split('-'))
            b = b // resolution + section_pos[c][0]
            e = e // resolution + section_pos[c][0]
            removed_manually += (e - b)
            badcol.update(dict((p, 'manual') for p in xrange(b, e)))
        printime(' - Removed %d columns manually.' % removed_manually)

    # mean cis-ratio of good columns. Bad columns may not be in cisprc (empty
    # bins), thus the number of good columns has to be counted explicitly
    good_cols = [k for k in cisprc if k not in badcol]
    if good_cols:
        raw_cisprc = sum(float(cisprc[k][0]) / cisprc[k][1]
                         for k in good_cols) / len(good_cols)
    else:  # nothing left after filtering
        raw_cisprc = float('nan')

    printime(' - Rescaling sum of interactions per bins')
    size = len(bins)
    biases = [
        float('nan') if k in badcol else cisprc.get(k, [0, 1.])[1]
        for k in xrange(size)
    ]

    if normalization == 'Vanilla':
        printime(' - Vanilla normalization')
        mean_col = nanmean(biases)
        biases = dict(
            (k, b / mean_col * mean_col**0.5) for k, b in enumerate(biases))
    elif normalization == 'oneD':
        printime(' - oneD normalization')
        if len(set([len(biases), len(mappability),
                    len(n_rsites), len(cg_content)])) > 1:
            print('biases mappability n_rsites cg_content')
            print('%s %s %s %s' % (len(biases), len(mappability),
                                   len(n_rsites), len(cg_content)))
            raise Exception('Error: not all arrays have the same size')
        tmp_oneD = path.join(outdir, 'tmp_oneD_%s' % (extra_out))
        mkdir(tmp_oneD)
        biases = oneD(tmp_dir=tmp_oneD, tot=biases, map=mappability,
                      res=n_rsites, cg=cg_content)
        biases = dict((k, b) for k, b in enumerate(biases))
        rmtree(tmp_oneD)
    else:
        raise NotImplementedError('ERROR: method %s not implemented' %
                                  normalization)

    # temporary files with the interactions of each chunk
    chunk_files = [
        path.join(outdir,
                  'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        for region, start, end in chunks]

    # collect subset-matrices and write genomic one
    # out = open(os.path.join(outdir,
    #                         'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w')
    printime(' - Getting sum of normalized bins')
    procs = _apply_to_chunks(sum_nrm_matrix, chunk_files, (biases,), ncpus)

    # to correct biases
    sumnrm = sum(p.get() for p in procs)

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    if not normalize_only:
        printime(' - Computing Cis percentage')
        # Calculate Cis percentage
        procs = _apply_to_chunks(get_cis_perc, chunk_files,
                                 (biases, badcol, bins), ncpus)
        # collect results
        cis = tot = 0
        for proc in procs:
            c, t = proc.get()
            cis += c
            tot += t
        norm_cisprc = float(cis) / tot
        print(' * Cis-percentage: %.1f%%' % (norm_cisprc * 100))
    else:
        norm_cisprc = 0.

    printime(' - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equals to 1 in average)
    procs = _apply_to_chunks(sum_dec_matrix, chunk_files,
                             (biases, badcol, bins), ncpus)

    # collect results: sum, by chromosome and by distance, of the normalized
    # and of the raw interactions of each chunk
    nrmdec = {}
    rawdec = {}
    for proc in procs:
        tmpnrm, tmpraw = proc.get()
        for c, d in tmpnrm.iteritems():
            nrm_crm = nrmdec.setdefault(c, {})
            raw_crm = rawdec.setdefault(c, {})
            for k, v in d.iteritems():
                nrm_crm[k] = nrm_crm.get(k, 0) + v
                raw_crm[k] = raw_crm.get(k, 0) + tmpraw[c][k]

    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_crms = dict(
        (c, section_pos[c][1] - section_pos[c][0]) for c in section_pos)
    # initialize dictionary
    ndiags = dict(
        (c, dict((k, 0) for k in xrange(len_crms[c]))) for c in sections)
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        # end_chr is the first bin of the next chromosome: not included
        thesebads = [b for b in badcol if beg_chr <= b < end_chr]
        for dist in xrange(1, chr_size):
            ndiags[crm][dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            # 2 bad rows can point to the same bad cell in diagonal
            bad_diag = set()
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b < maxp:  # not inclusive!!
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[crm][dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[crm][0] += chr_size - len(thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    signal_to_noise = 0.05
    min_n = signal_to_noise**-2.  # equals 400 when default
    for crm in sections:
        if not crm in nrmdec:
            nrmdec[crm] = {}
            rawdec[crm] = {}
        tmpdec = 0  # store count by diagonal
        tmpsum = 0  # store count by diagonal
        ndiag = 0
        val = 0
        # store diagonals to be summed in case not reaching the minimum
        previous = []
        # diagonals have to be visited from the closest to the most distant
        for k in sorted(ndiags[crm]):
            tmpdec += nrmdec[crm].get(k, 0.)
            tmpsum += rawdec[crm].get(k, 0.)
            previous.append(k)
            if tmpsum > min_n:
                ndiag = sum(ndiags[crm][p] for p in previous)
                val = tmpdec  # backup of tmpdec kept for last ones outside the loop
                try:
                    ratio = val / ndiag
                    for p in previous:
                        nrmdec[crm][p] = ratio
                except ZeroDivisionError:  # all columns at this distance are "bad"
                    pass
                previous = []
                tmpdec = 0
                tmpsum = 0
        # last ones we average with previous result
        if len(previous) == len(ndiags[crm]):
            nrmdec[crm] = {}
        elif tmpsum < min_n:
            ndiag += sum(ndiags[crm][p] for p in previous)
            val += tmpdec
            try:
                ratio = val / ndiag
                for p in previous:
                    nrmdec[crm][p] = ratio
            except ZeroDivisionError:  # all columns at this distance are "bad"
                pass
    return biases, nrmdec, badcol, raw_cisprc, norm_cisprc
def plot_distance_vs_interactions(data, min_diff=1, max_diff=1000, show=False,
                                  genome_seq=None, resolution=None, axe=None,
                                  savefig=None, normalized=False):
    """
    Plot the number of interactions as a function of the genomic distance
    (log-log scale) and fit three regression lines on consecutive ranges of
    distances.

    :param data: input file name, or HiC_data object or list of lists
    :param 1 min_diff: lower limit (in number of bins)
    :param 1000 max_diff: upper limit (in number of bins) to look for
    :param False show: display the plot using matplotlib GUI (only if
       *savefig* is not set)
    :param None genome_seq: dictionary with the size, in bins, of each
       chromosome. Only used if *data* is a list of lists
    :param None resolution: size of the bins (1 if not set). Positions read
       from an input file are divided by this value. If different from 1, the
       three regressions are computed on the ranges 0-0.7 Mb, 0.7-10 Mb and
       more than 10 Mb; otherwise the limits between ranges are searched in
       order to maximize the sum of the squared correlation coefficients
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated
       (the extension of the file name will determine the desired format).
    :param False normalized: if *data* is a HiC_data object, interactions are
       divided by the biases of both bins

    :returns: three tuples, one per range of distances, each with the slope,
       the intercept and the correlation coefficient of the regression (zeros
       if the regression could not be computed)
    """
    resolution = resolution or 1
    dist_intr = dict((i, 0) for i in xrange(min_diff, max_diff))
    if isinstance(data, str):
        with open(data) as fhandler:
            in_header = True
            for line in fhandler:
                # only the comment lines at the top of the file are skipped
                if in_header:
                    if line.startswith('#'):
                        continue
                    in_header = False
                _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
                if cr1 != cr2:
                    continue
                diff = abs(int(ps1) // resolution - int(ps2) // resolution)
                if max_diff > diff >= min_diff:
                    dist_intr[diff] += 1
    elif isinstance(data, HiC_data):
        if normalized:
            get_data = lambda x, y: data[x, y] / data.bias[x] / data.bias[y]
        else:
            get_data = lambda x, y: data[x, y]
        max_diff = min(len(data), max_diff)
        if data.section_pos:
            for crm in data.section_pos:
                for diff in xrange(min_diff, min(
                        (max_diff, 1 + data.chromosomes[crm]))):
                    for i in xrange(data.section_pos[crm][0],
                                    data.section_pos[crm][1] - diff):
                        dist_intr[diff] += get_data(i, i + diff)
        else:
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i, i + diff]):
                        # the cell at distance diff of i is (i, i + diff)
                        dist_intr[diff] += get_data(i, i + diff)
    else:
        if genome_seq:
            max_diff = min(max(genome_seq.values()), max_diff)
            cnt = 0  # index of the first bin of the current chromosome
            for crm in genome_seq:
                for diff in xrange(min_diff, min(
                        (max_diff, genome_seq[crm]))):
                    for i in xrange(cnt, cnt + genome_seq[crm] - diff):
                        if not np.isnan(data[i][i + diff]):
                            dist_intr[diff] += data[i][i + diff]
                cnt += genome_seq[crm]
        else:
            max_diff = min(len(data), max_diff)
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i][i + diff]):
                        dist_intr[diff] += data[i][i + diff]
    if not axe:
        fig = plt.figure()
        axe = fig.add_subplot(111)
    # remove last part of the plot in case no interaction is count... reduce max_dist
    for diff in xrange(max_diff - 1, min_diff, -1):
        if dist_intr.get(diff):
            break
        dist_intr.pop(diff, None)
        max_diff -= 1
    # keep only distances with interactions (log scale)
    x = []
    y = []
    for dist in sorted(dist_intr):
        if dist_intr[dist]:
            x.append(dist)
            y.append(dist_intr[dist])
    axe.plot(x, y, 'k.')
    logx = np.log(x)
    logy = np.log(y)

    def regress(beg, end):
        # slope, intercept and correlation coefficient of the log-log
        # regression on the [beg:end] slice of distances
        slope, intercept, rvalue, _, _ = linregress(logx[beg:end],
                                                    logy[beg:end])
        return slope, intercept, rvalue

    no_fit = (0, 0, 0)
    if resolution == 1:
        ntries = 100
        # set k for better fit
        # for k in xrange(1, ntries/5, ntries/5/5):
        k = 1
        # score, v1, v2, and the three regressions, gap between ranges
        best = (float('-inf'), 0, 0, no_fit, no_fit, no_fit, 0)
        for i in xrange(3, ntries - 2 - k):
            v1 = i * len(x) // ntries
            try:
                fit1 = regress(None, v1)
            except ValueError:
                fit1 = no_fit
            for j in xrange(i + 1 + k, ntries - 2 - k):
                v2 = j * len(x) // ntries
                try:
                    fit2 = regress(v1 + k, v2)
                    fit3 = regress(v2 + k, None)
                except ValueError:
                    fit2 = fit3 = no_fit
                score = fit1[2]**2 + fit2[2]**2 + fit3[2]**2
                if score > best[0]:
                    best = (score, v1, v2, fit1, fit2, fit3, k)
        _, v1, v2, fit1, fit2, fit3, k = best
    else:
        k = 0  # no gap between ranges
        # from 0.7 Mb
        v1 = 700000 // resolution
        # to 10 Mb
        v2 = 10000000 // resolution
        try:
            fit1 = regress(None, v1)
        except ValueError:
            fit1 = no_fit
        try:
            fit2 = regress(v1, v2)
        except ValueError:
            fit2 = no_fit
        try:
            fit3 = regress(v2, None)
        except ValueError:
            fit3 = no_fit

    # plot line of best fit for each range of distances
    ranges = (
        (slice(None, v1), fit1, 'yellow', r'0-0.7 \mathrm{ Mb}', '1'),
        (slice(v1 + k, v2), fit2, 'orange', r'0.7-10 \mathrm{ Mb}', '2'),
        (slice(v2 + k, None), fit3, 'red', r'10 \mathrm{ Mb}-\infty', '3'),
    )
    for span, (slope, intercept, _), color, range_name, range_num in ranges:
        xfit = x[span]
        yfit = np.exp(intercept + slope * np.array(np.log(xfit)))
        axe.plot(xfit, yfit, color=color, lw=2,
                 label=r'$\alpha_{%s}=%.2f$' % (
                     range_name if resolution != 1 else range_num, slope))
    axe.set_ylabel('Log interaction count')
    axe.set_xlabel('Log genomic distance (resolution: %s)' % nicer(resolution))
    axe.legend(loc='lower left', frameon=False)
    axe.set_xscale('log')
    axe.set_yscale('log')
    axe.set_xlim((min_diff, max_diff))
    try:
        axe.set_ylim((0, max(y)))
    except ValueError:  # no interaction found
        pass
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show == True:
        plt.show()
        plt.close('all')
    return fit1, fit2, fit3