def correlate_matrices(hic_data1, hic_data2, max_dist=10, intra=False,
                       savefig=None, show=False, savedata=None):
    """
    Compare the interactions of two Hi-C matrices at a given distance,
    with Spearman rank correlation.

    :param hic_data1: Hi-C-data object (indexed as ``hic_data[i, j]``)
    :param hic_data2: Hi-C-data object (indexed as ``hic_data[i, j]``)
    :param 10 max_dist: maximum distance from diagonal, in bins (e.g. 10 means
       we will not look further than 10 times the resolution)
    :param False intra: only takes into account intra-chromosomal contacts.
       Requires both objects to carry identical, non-empty ``sections``;
       otherwise a warning is emitted and the whole matrix is used.
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a tab-separated text file where to write
       one "distance, correlation" pair per line

    :returns: list of correlations and list of genomic distances
    """
    use_sections = bool(intra and hic_data1.sections and hic_data2.sections
                        and hic_data1.sections == hic_data2.sections)
    if intra and not use_sections:
        warn('WARNING: hic_dta does not contain chromosome coordinates, ' +
             'intra set to False')
    # (first bin, end bin) of each stretch in which diagonals are followed:
    # one stretch per chromosome when intra, the whole matrix otherwise.
    if use_sections:
        spans = [(hic_data1.section_pos[crm][0], hic_data1.section_pos[crm][1])
                 for crm in hic_data1.section_pos]
    else:
        spans = [(0, len(hic_data1))]
    corr = []
    dist = []
    for i in range(1, max_dist + 1):
        diag1 = []
        diag2 = []
        for beg, end in spans:
            for j in range(beg, end - i):
                diag1.append(hic_data1[j, i + j])
                diag2.append(hic_data2[j, i + j])
        corr.append(spearmanr(diag1, diag2)[0])
        dist.append(i)
    if show or savefig:
        plt.plot(dist, corr, color='orange', linewidth=3, alpha=.8)
        plt.xlabel('Genomic distance in bins')
        plt.ylabel('Spearman rank correlation')
        if dist:  # nothing to scale when max_dist < 1
            plt.xlim((0, dist[-1]))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        # release the figure so that later plots start from a clean state
        plt.close('all')
    if savedata:
        with open(savedata, 'w') as out:
            out.write('# genomic distance\tSpearman rank correlation\n')
            for one_dist, one_corr in zip(dist, corr):
                out.write('%s\t%s\n' % (one_dist, one_corr))
    return corr, dist
def do_3d_plot(nam, outfile, size, count, minmax, sigma=0, log=False):
    """
    Draw a 3D surface (plus a projected contour) of a matrix stored in a
    text file, and save the figure.

    :param nam: path to the input file; one whitespace-separated matrix row
       per line, lines starting with '#' are ignored. Also used in the title.
    :param outfile: path passed to tadbit_savefig to save the figure
    :param size: span of the X and Y axes, which run from -size/2 to size/2
       in steps of 1
    :param count: number reported as "N" in the plot title
    :param minmax: half-width of the colour/z range; if false, the largest
       absolute deviation found in the matrix is used
    :param 0 sigma: if non-zero, the matrix is smoothed with a Gaussian
       filter of this standard deviation
    :param False log: plot the natural logarithm of the matrix; the z range is
       then centred on 0 instead of 1
    """
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(1, 1, 1, projection='3d')

    beg = -size / 2
    end = size / 2
    X = np.arange(beg, end, 1)
    Y = np.arange(beg, end, 1)
    X, Y = np.meshgrid(X, Y)
    with open(nam) as handler:
        Z = np.array([np.array([float(i) for i in l.split()])
                      for l in handler if not l.startswith('#')])

    # statistics in the title describe the raw matrix (before smoothing/log)
    plt.title(nam +
              '\nMean: %.3f, median: %.3f, standard-deviation: %.3f (N=%d)' % (
                  np.mean(Z), np.median(Z), np.std(Z), count))

    if sigma:
        Z = ndimage.gaussian_filter(Z, sigma=sigma, order=0)

    if log:
        Z = np.log(Z)
        zspan = minmax if minmax else np.max(np.abs(Z))
        zmax = zspan
        zmin = -zspan
    else:
        zspan = minmax if minmax else np.max(np.abs(Z - 1))
        zmin = -zspan + 1
        zmax = zspan + 1

    cmap = 'coolwarm'
    # contour projected on the floor of the plot
    _ = ax.contourf(X, Y, Z, zdir='z', offset=zmin, cmap=cmap,
                    vmin=zmin, vmax=zmax)
    surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cmap,
                           linewidth=0, antialiased=True, alpha=1,
                           vmin=zmin, vmax=zmax, shade=True)
    ax.set_zlim3d(zmin, zmax)
    ax.view_init(elev=15, azim=25)
    cb = fig.colorbar(surf, shrink=0.5, aspect=20)
    # only mention smoothing when it was actually applied
    smoothed = ('\nSmoothed with $\\sigma=%s$' % sigma) if sigma else ''
    cb.set_label('%sverage normalized interactions%s' % (
        'Log a' if log else 'A', smoothed))
    tadbit_savefig(outfile)
def objective_function(self, log=False, smooth=True, axe=None, savefig=None):
    """
    This function plots the objective function value per each Monte-Carlo
    step.

    Values are read from ``self['log_objfun']`` (first element skipped) and
    the title uses ``self['rand_init']``.

    :param False log: log plot (logarithmic y axis)
    :param True smooth: curve smoothing (cubic spline through the values)
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance; when given, the figure is not displayed by this function
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    """
    show = False
    if not axe:
        fig = plt.figure(figsize=(7, 7))
        axe = fig.add_subplot(111)
        show = True
        axe.patch.set_facecolor('lightgrey')
        axe.patch.set_alpha(0.4)
        axe.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        axe.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        axe.set_axisbelow(True)
        axe.minorticks_on()  # always on, not only for log
        # remove tick marks
        axe.tick_params(axis='both', direction='out', top=False, right=False,
                        left=False, bottom=False)
        axe.tick_params(axis='both', direction='out', top=False, right=False,
                        left=False, bottom=False, which='minor')
    # text: set on the target axes, so that a user-supplied `axe` is labelled
    # even when it is not matplotlib's current axes
    axe.set_xlabel('Iteration number')
    axe.set_ylabel('IMP Objective Function Value')
    axe.set_title('Model ' + str(self['rand_init']))
    # smooth
    nrjz = self['log_objfun'][1:]
    if smooth:
        xnew = linspace(0, len(nrjz), 10000)
        nrjz_smooth = spline(range(len(nrjz)), nrjz, xnew, order=3)
        axe.plot(xnew, nrjz_smooth, color='darkred')
    else:
        axe.plot(nrjz, color='darkred')
    # plot the raw values as dots on top of the curve
    axe.plot(nrjz, color='darkred', marker='o', alpha=.5, ms=4, ls='None')
    # log
    if log:
        axe.set_yscale('log')
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
def plot_iterative_mapping(fnam1, fnam2, total_reads=None, axe=None,
                           savefig=None):
    """
    Plot, for each of two mapped-read files, the cumulative number (or
    proportion) of reads mapped up to each read length.

    :param fnam1: input file name for the first read-end. Tab-separated, the
       read length is the fourth field counting from the end; leading lines
       starting with '#' are skipped.
    :param fnam2: input file name for the second read-end (same format)
    :param None total_reads: total number of reads in the initial FASTQ file;
       if given, proportions are plotted instead of counts
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).

    :returns: a dictionary, keyed by read-end index (0 and 1), of
       dictionaries giving for each mapped length the number of reads mapped
       with that length or shorter
    """
    count_by_len = {}
    total_reads = total_reads or 1
    if not axe:
        fig = plt.figure()
        ax = fig.add_subplot(111)
    else:
        ax = axe
    colors = ['olive', 'darkcyan']
    for i, fnam in enumerate([fnam1, fnam2]):
        counts = {}
        with open(fnam) as fhandler:
            in_header = True
            for line in fhandler:
                # only the leading block of '#' lines is a header
                if in_header and line.startswith('#'):
                    continue
                in_header = False
                _, length, _, _ = line.rsplit('\t', 3)
                length = int(length)
                counts[length] = counts.get(length, 0) + 1
        # turn per-length counts into cumulative counts (length <= k)
        lengths = sorted(counts)
        running = 0
        for k in lengths:
            running += counts[k]
            counts[k] = running
        count_by_len[i] = counts
        ax.plot(lengths,
                [float(counts[l]) / total_reads for l in lengths],
                label='read' + str(i + 1), linewidth=2, color=colors[i])
    ax.set_xlabel('read length (bp)')
    if total_reads != 1:
        ax.set_ylabel('Proportion of mapped reads')
    else:
        ax.set_ylabel('Number of mapped reads')
    ax.legend(loc=4)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    return count_by_len
def plot_genomic_distribution(fnam, first_read=True, resolution=10000,
                              genome_seq=None, axe=None, ylim=None,
                              savefig=None):
    """
    Plot, one panel per chromosome, the number of reads falling in each
    genomic bin.

    :param fnam: input file name; chromosome and position are taken from
       whitespace-separated fields 1-2 (first read) or 7-8 (second read),
       counting from 0
    :param True first_read: map first read.
    :param 10000 resolution: group reads that are closer than this resolution
       parameter
    :param None genome_seq: dictionary with chromosome names as keys, whose
       values' lengths give the chromosome sizes. If given, it defines the
       chromosomes to plot and a vertical line marks the end of each one.
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance; if given, no new figure is created and the plot is not
       displayed by this function
    :param None ylim: (min, max) limits of the y axis; by default from 0 to
       the highest count found
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    """
    distr = {}
    idx1, idx2 = (1, 3) if first_read else (7, 9)
    with open(fnam) as handler:
        for line in handler:
            crm, pos = line.split()[idx1:idx2]
            pos = int(pos) // resolution
            crm_distr = distr.setdefault(crm, {})
            crm_distr[pos] = crm_distr.get(pos, 0) + 1
    if not axe:
        plt.figure(figsize=(15, 3 * len(distr)))
    max_y = max(max(distr[c].values()) for c in distr)
    # extent in bins of the longest covered chromosome (highest bin index + 1)
    max_x = max(max(distr[c]) for c in distr) + 1
    for i, crm in enumerate(genome_seq if genome_seq else distr):
        plt.subplot(len(distr), 1, i + 1)
        nbins = max(distr[crm]) + 1  # include the last populated bin
        plt.plot(range(nbins),
                 [distr[crm].get(j, 0) for j in range(nbins)],
                 color='red', lw=1.5, alpha=0.7)
        if genome_seq:
            # mark the end of the chromosome
            crm_end = len(genome_seq[crm]) // resolution
            if ylim:
                plt.vlines(crm_end, ylim[0], ylim[1])
            else:
                plt.vlines(crm_end, 0, max_y)
        plt.xlim((0, max_x))
        plt.ylim(ylim or (0, max_y))
        plt.title(crm)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
def plot_distance_vs_interactions(fnam, min_diff=100, max_diff=1000000,
                                  resolution=100, axe=None, savefig=None):
    """
    Plot, in log-log scale, the number of intra-chromosomal read pairs found
    at each genomic distance.

    :param fnam: input file name; tab-separated, chromosomes and positions of
       both read-ends are read from the 9th/8th and 3rd/2nd fields counting
       from the end of each line
    :param 100 min_diff: lower limit in genomic distance (usually equal to
       read length); distances must be strictly greater
    :param 1000000 max_diff: upper limit in genomic distance to look for;
       distances must be strictly smaller
    :param 100 resolution: group reads that are closer than this resolution
       parameter
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance; if given, no new figure is created and the plot is not
       displayed by this function
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    """
    dist_intr = {}
    with open(fnam) as handler:
        for line in handler:
            _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.rsplit('\t', 9)
            if cr1 != cr2:
                continue
            # distance rounded down to a multiple of the resolution
            diff = resolution * (abs(int(ps1) - int(ps2)) // resolution)
            if max_diff > diff > min_diff:
                dist_intr[diff] = dist_intr.get(diff, 0) + 1
    # distances supported by one or two pairs only are discarded
    dist_intr = dict((k, v) for k, v in dist_intr.items() if v > 2)
    if not axe:
        fig = plt.figure()
        fig.add_subplot(111)
    if dist_intr:  # zip(*[]) cannot be unpacked: skip the dots if no data
        x, y = zip(*sorted(dist_intr.items()))
        plt.plot(x, y, 'k.')
    plt.yscale('log')
    plt.xscale('log')
    plt.xlabel('Log genomic distance (binned by %d bp)' % resolution)
    plt.ylabel('Log interaction count')
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
def hic_map(data, genome_seq, biases=None, masked=None, resolution=100000,
            savefig=None, show=False, savedata=None, focus=None):
    """
    Build a square interaction matrix, and optionally plot it (log2 scale)
    and/or write it to a text file.

    :param data: either the path to a file of read pairs (13
       whitespace-separated fields per line: read name first, chromosome and
       position of each end in fields 1-2 and 7-8, counting from 0), or a
       matrix object with a length N and flat indexing ``data[N * i + j]``
    :param genome_seq: dictionary with chromosome names as keys, whose values'
       lengths give chromosome sizes; only used when `data` is a path
    :param None biases: per-bin values; if given (matrix input only), each
       cell is divided by the product of the biases of its row and column
    :param None masked: collection of read names to skip (path input only)
    :param 100000 resolution: bin size (path input only)
    :param None savefig: path to a file where to save the image generated
    :param False show: display the image using matplotlib GUI (only if
       `savefig` is not set)
    :param None savedata: path to a file where to write the matrix, one
       tab-separated row per line
    :param None focus: (begin, end) pair of bins delimiting the square
       sub-matrix to extract (matrix input only); begin is decremented by one
       before slicing
    """
    if isinstance(data, str):
        fnam = data
        # offset of the first bin of each chromosome in the genome-wide matrix
        cumcs = {}
        total = 0
        for crm in genome_seq:
            cumcs[crm] = total
            total += len(genome_seq[crm]) // resolution + 1
        # bin the data
        data = [[0] * (total + 1) for _ in range(total + 1)]
        masked = masked or set()
        with open(fnam) as handler:
            for line in handler:
                (read, cr1, ps1, _, _, _, _,
                 cr2, ps2, _, _, _, _) = line.split()
                if read in masked:
                    continue
                ps1 = int(ps1) // resolution
                ps2 = int(ps2) // resolution
                try:
                    data[cumcs[cr1] + ps1][cumcs[cr2] + ps2] += 1
                except (KeyError, IndexError):
                    # NOTE(review): reading stops at the first pair lying on
                    # an unknown chromosome or outside of the matrix, the
                    # remaining pairs are ignored -- confirm this is intended
                    break
    else:
        hic_data = data
        beg, end = focus if focus else (0, len(hic_data))
        beg -= 1 if focus else 0
        size = len(hic_data)
        if biases:
            data = [[hic_data[size * i + j] / (biases[i] * biases[j])
                     for j in range(beg, end)]
                    for i in range(beg, end)]
        else:
            data = [[hic_data[size * i + j] for j in range(beg, end)]
                    for i in range(beg, end)]
    # do the plot
    if show or savefig:
        import numpy as np
        plt.figure(figsize=(16, 12))
        plt.imshow(np.log2(data), origin='lower', cmap='gist_earth',
                   interpolation='nearest')
        plt.colorbar()
        if savefig:
            tadbit_savefig(savefig)
        elif show:
            plt.show()
    if savedata:
        with open(savedata, 'w') as out:
            for row in data:
                out.write('\t'.join([str(cell) for cell in row]) + '\n')
def correlate_matrices(hic_data1, hic_data2, max_dist=10, savefig=None,
                       show=False, savedata=None):
    """
    Compare the interactions of two Hi-C matrices at a given distance,
    with Spearman rank correlation.

    :param hic_data1: Hi-C-data object (indexed as ``hic_data[i, j]``)
    :param hic_data2: Hi-C-data object (indexed as ``hic_data[i, j]``)
    :param 10 max_dist: maximum distance from diagonal, in bins (e.g. 10 means
       we will not look further than 10 times the resolution)
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a tab-separated text file where to write
       one "distance, correlation" pair per line

    :returns: list of correlations and list of genomic distances
    """
    size = len(hic_data1)
    corr = []
    dist = []
    for i in range(1, max_dist + 1):
        # cells of the i-th diagonal, in both matrices
        cells = [(j, i + j) for j in range(size - i)]
        diag1 = [hic_data1[row, col] for row, col in cells]
        diag2 = [hic_data2[row, col] for row, col in cells]
        corr.append(spearmanr(diag1, diag2)[0])
        dist.append(i)
    if show or savefig:
        plt.plot(dist, corr, color='orange', linewidth=3, alpha=.8)
        plt.xlabel('Genomic distance in bins')
        plt.ylabel('Spearman rank correlation')
        if dist:  # nothing to scale when max_dist < 1
            plt.xlim((0, dist[-1]))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        # release the figure so that later plots start from a clean state
        plt.close('all')
    if savedata:
        with open(savedata, 'w') as out:
            out.write('# genomic distance\tSpearman rank correlation\n')
            for one_dist, one_corr in zip(dist, corr):
                out.write('%s\t%s\n' % (one_dist, one_corr))
    return corr, dist
def quality_plot(fnam, r_enz=None, nreads=float('inf'), axe=None, savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction
    enzyme (RE) name is provided, can also represent the distribution of
    digested and undigested RE sites and estimate an expected proportion of
    dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file ('.gz' files are opened with gopen,
       '.dsrc' files are decompressed through the external dsrc program)
    :param None r_enz: name of the restriction enzyme, or list of names
       (matched case-insensitively against RESTRICTION_ENZYMES)
    :param inf nreads: max number of reads to read, not necessary to read all
    :param None axe: a matplotlib.axes.Axes object where to draw the quality
       plot; if given, the figure is not displayed by this function
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the
       percentage of reads with at least a ligation site, as two dictionaries
       keyed respectively by enzyme and by pair of enzymes (two empty
       dictionaries if no enzyme is given).
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
        'abcdefghijklmnopqrstuvwxyz{|}~')])
    # always work on a private list of enzyme names ([None] means no enzyme)
    if isinstance(r_enz, (list, tuple)):
        r_enzs = list(r_enz) or [None]
    else:
        r_enzs = [r_enz]
    # use the spelling of RESTRICTION_ENZYMES whatever the case typed
    canonical = dict((k.lower(), k) for k in RESTRICTION_ENZYMES)
    r_enzs = [canonical.get(str(name).lower(), name) for name in r_enzs]
    no_enzyme = len(r_enzs) == 1 and r_enzs[0] is None

    quals = []
    henes = []
    sites = {}
    fixes = {}
    liges = {}
    ligep = {}
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    try:
        if no_enzyme:
            while True:
                try:
                    next(fhandler)  # header line
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)  # separator line
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if nreads and len(quals) > nreads:
                    break
        else:
            r_sites = {}
            d_sites = {}
            for r_enz in r_enzs:
                r_sites[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
                d_sites[r_enz] = repaired(r_enz)
                sites[r_enz] = []  # initialize dico to store undigested sites
                fixes[r_enz] = []  # initialize dico to store digested sites
            l_sites = religateds(r_enzs)
            site = {}
            fixe = {}
            for r_enz in r_enzs:
                site[r_enz] = re.compile(r_sites[r_enz])
                fixe[r_enz] = re.compile(d_sites[r_enz])
            # ligation sites should appear in lower case in the sequence
            lige = {}
            for k in l_sites:
                liges[k] = []  # initialize dico to store sites
                ligep[k] = 0   # initialize dico to store sites
                l_sites[k] = l_sites[k].lower()
                lige[k] = re.compile(l_sites[k])
            while len(quals) <= nreads:
                try:
                    next(fhandler)  # header line
                except StopIteration:
                    break
                seq = next(fhandler)
                # ligation sites replaced by lower case to ease the search
                for lig in l_sites.values():
                    seq = seq.replace(lig.upper(), lig)
                for r_enz in r_enzs:
                    sites[r_enz].extend(
                        [m.start() for m in site[r_enz].finditer(seq)])
                    # TODO: you cannot have a repaired/fixed site in the
                    # middle of the sequence, this could be only checked at
                    # the beginning
                    fixes[r_enz].extend(
                        [m.start() for m in fixe[r_enz].finditer(seq)])
                for k in lige:  # for each paired of cut-site
                    liges[k].extend([m.start() for m in lige[k].finditer(seq)])
                    ligep[k] += l_sites[k] in seq
                # store the number of Ns found in the sequences
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)  # separator line
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    finally:
        fhandler.close()
    # number of reads used in titles and percentages: never more than what was
    # actually read (the default, infinity, cannot be formatted with %d)
    if not nreads or len(quals) < nreads:
        nreads = len(quals)
    # per-position statistics; shorter reads are padded with NaN
    quals = izip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = zip(*[(nanmean(q), nanstd(q)) for q in quals])
    max_seq_len = len(meanquals)

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if no_enzyme:  # only do the quality plot
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
        else:  # do both plots
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')
    ax.errorbar(range(max_seq_len), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # number of "N" per position, on a secondary y axis
    axb.plot([henes.count(i) for i in range(max_seq_len)], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if not no_enzyme:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            ', '.join(map(str, r_enzs)), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        # seq_len is the length of the line to plot. we don't want to plot
        # if there is no room for the cut-site, or ligation site.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                        max([len(l_sites[k]) for k in l_sites]),
                        max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len

        # transform dictionaries of positions into dictionaries of counts
        for r_enz in sites:
            sites[r_enz] = [sites[r_enz].count(k)
                            for k in range(seq_len)]  # Undigested
            fixes[r_enz] = [fixes[r_enz].count(k)
                            for k in range(seq_len)]  # DE
        for r1, r2 in liges:
            liges[(r1, r2)] = [liges[(r1, r2)].count(k)
                               for k in range(seq_len)]  # OK

        # in case the pattern of the repaired cut-site contains the target
        # cut-site pattern. These sites were counted twice, once in the
        # undigested, and once in the repaired. We remove them from the
        # repaired:
        for r_enz in r_enzs:
            if d_sites[r_enz] in r_sites[r_enz]:
                pos = r_sites[r_enz].find(d_sites[r_enz])
                fixes[r_enz] = (fixes[r_enz][:pos] +
                                [fixes[r_enz][k] - sites[r_enz][k - pos]
                                 for k in range(pos, seq_len)])
        # same for ligated sites
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                if d_sites[r_enz1] not in l_sites[(r_enz1, r_enz2)]:
                    continue
                pos = l_sites[(r_enz1, r_enz2)].find(d_sites[r_enz1])
                fixes[r_enz1] = (fixes[r_enz1][:pos] +
                                 [fixes[r_enz1][k] -
                                  liges[(r_enz1, r_enz2)][k - pos]
                                  for k in range(pos, seq_len)])

        # remove anything that could be in between the two read ends
        if paired:
            half = max_seq_len // 2
            for k in sites:
                sites[k][half - site_len:half] = [float('nan')] * site_len
                fixes[k][half - site_len:half] = [float('nan')] * site_len
            for k in liges:
                liges[k][half - site_len:half] = [float('nan')] * site_len

        # plot undigested cut-sites; one shade per enzyme, starting at 0.3 so
        # that the lightest line remains visible
        color = iter(plt.cm.Reds(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in sites:
            ax2.plot(
                sites[r_enz], linewidth=2, color=next(color), alpha=0.9,
                label='Undigested RE site (%s: %s)' % (r_enz, r_sites[r_enz])
                if any([f > 0 for f in fixes[r_enz]])
                else 'Undigested & Dangling-Ends (%s: %s)' % (
                    r_enz, r_sites[r_enz]))
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)

        lines, labels = ax2.get_legend_handles_labels()

        # ligation sites, on their own y axis
        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            ax3.plot(liges[(r1, r2)], linewidth=2, color=next(color),
                     alpha=0.9,
                     label='Ligated (%s-%s: %s)' % (
                         r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')

        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)

        # dangling-ends, on a third y axis shifted to the right
        color = iter(plt.cm.Greens(linspace(0.3, 0.95, len(r_enzs))))
        for i, r_enz in enumerate(r_enzs):
            if any([f > 0 for f in fixes[r_enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                ax4.plot(fixes[r_enz], linewidth=2, color=next(color),
                         alpha=0.9,
                         label='Dangling-ends (%s: %s)' % (
                             r_enz, d_sites[r_enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))

        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] -
                          liges[k][max_seq_len // 2])

        # Count undigested sites
        sit_cnt = {}
        for r_enz in r_enzs:
            sit_cnt[r_enz] = (nansum(sites[r_enz]) - sites[r_enz][0] -
                              sites[r_enz][max_seq_len // 2])

        # Count Dangling-Ends: sites found at the very beginning of a read
        # (and of the second end, in the middle of the line, if paired)
        des = {}
        for r_enz in r_enzs:
            counts = (fixes[r_enz] if any([f > 0 for f in fixes[r_enz]])
                      else sites[r_enz])
            des[r_enz] = (100. * (counts[0] + (counts[max_seq_len // 2]
                                               if paired else 0))) / nreads

        # Decorate plot
        title = ''
        for r_enz in r_enzs:
            lcnt = float(sum([lig_cnt[(r_enz1, r_enz2)] *
                              (2 if r_enz1 == r_enz2 else 1)
                              for r_enz1 in r_enzs for r_enz2 in r_enzs
                              if r_enz1 == r_enz or r_enz2 == r_enz]))
            title += (
                'Percentage of digested sites (not considering Dangling-Ends) '
                '%s: %.1f%%\n' % (r_enz,
                                  100. * float(lcnt) /
                                  (lcnt + sit_cnt[r_enz])))
        for r_enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (
                r_enz, des[r_enz])

        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                title += (
                    'Percentage of reads with ligation site (%s-%s): %.1f%% \n'
                    % (r_enz1, r_enz2,
                       (ligep[(r_enz1, r_enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines, labels, bbox_to_anchor=(0.75, 1.0),
                   loc=3, borderaxespad=0., frameon=False, fontsize=9)
    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    # from counts of reads to percentages
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    if no_enzyme:
        return {}, {}
    return des, ligep
def _parse_coord(coord):
    """
    Split a "chromosome:start-end" coordinate into (region, start, end).

    Returns (coord, None, None) when the string does not follow this format
    (e.g. a bare chromosome name), and (None, None, None) for an empty value.
    """
    if not coord:
        return None, None, None
    try:
        crm, pos = coord.split(':')
        beg, end = pos.split('-')
        return crm, int(beg), int(end)
    except ValueError:
        return coord, None, None


def _load_biases(biases_path):
    """Load the biases stored in a file, closing the file afterwards."""
    # NOTE(review): `load` is presumably pickle's; if so the biases file must
    # come from a trusted source -- confirm against the file's imports
    with open(biases_path) as handler:
        return load(handler)


def run(opts):
    """
    Extract (sub-)matrices from a BAM file and write and/or plot them.

    Output goes to the '05_sub-matrices' folder of opts.workdir. One matrix
    is produced per normalization listed in opts.normalizations; unless in
    interactive mode, the resulting paths are finally saved to the database.

    :param opts: parsed command line options (validated by check_options)
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin, vmax = [float(v) for v in opts.zrange.split(',')[:2]]
    else:
        vmin = vmax = None

    # color range (above) and figure size are independent options
    if opts.figsize:
        opts.figsize = list(map(float, opts.figsize.split(',')))

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2
    # a single coordinate is always treated as the first one
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1
    region1, start1, end1 = _parse_coord(coord1)
    region2, start2, end2 = _parse_coord(coord2)

    # limit on matrix size, lifted by force_plot
    if opts.plot and not opts.force_plot:
        max_size = 1500**2 if opts.interactive else 5000**2
    else:
        max_size = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            if start1 is not None:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2 is not None:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references, list(bamfile.lengths)))
        # cumulative (begin, end) position of each reference sequence
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else 'NRM'
                           if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    _load_biases(biases) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            # bin coordinates made relative to the extracted matrix
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                # (region, first position, last position) of each bin
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions)
                             for p in range(
                                 starts[r] if r < len(starts) and starts[r]
                                 else 0,
                                 ends[r] if r < len(ends) and ends[r]
                                 else sections[reg],
                                 opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                with open(path.join(outdir, fnam), 'w') as out:
                    for reg in regions:
                        out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                    if region2:
                        out.write('# BADROWS %s\n' % (
                            ','.join([str(b) for b in bads1])))
                        out.write('# BADCOLS %s\n' % (
                            ','.join([str(b) for b in bads2])))
                    else:
                        out.write('# MASKED %s\n' % (
                            ','.join([str(b) for b in bads1])))
                    rows = ('\t'.join(str(matrix.get((i, j), 0))
                                      for i in range(b1, e1))
                            for j in range(b2, e2))
                    if opts.row_names:
                        # prefix each row with its region and coordinates
                        rows = ('%s\t%d\t%d\t' % next(row_names) + row
                                for row in rows)
                    out.write('\n'.join(rows) + '\n')
            if opts.plot:
                # transform matrix: from dictionary to masked array
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in range(b1, e1)])
                                for j in range(b2, e2)])
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    norm, name, nicer(opts.reso, sep=''),
                    ('_' + param_hash), '_tri' if opts.triangular else '',
                    opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                # only keep the positions of the regions being plotted
                section_pos = OrderedDict((k, section_pos[k])
                                          for k in section_pos
                                          if k in regions)
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular,
                    vmin=vmin, vmax=vmax, cmap=opts.cmap,
                    figsize=opts.figsize,
                    bad_color=opts.bad_color if norm != 'raw' else None)
                ax1.set_title('Region: %s, normalization: %s, resolution: %s'
                              % (name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            _load_biases(biases) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        # no shell involved: safe with spaces or special characters in paths
        import shutil
        shutil.rmtree(tmpdir, ignore_errors=True)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
def _iter_fastq_records(fhandler, nreads=None):
    """
    Yield (sequence line, quality line) for each 4-line FASTQ record.

    :param fhandler: iterator over the lines of a FASTQ file
    :param None nreads: stop after this number of records; if None (or 0)
       the whole file is read

    :raises ValueError: if the input ends in the middle of a record
    """
    count = 0
    while not nreads or count < nreads:
        if next(fhandler, None) is None:  # no header line: regular end of file
            return
        seq = next(fhandler, None)
        next(fhandler, None)  # '+' separator line, content not used
        qual = next(fhandler, None)
        if qual is None:
            raise ValueError('ERROR: truncated FASTQ record')
        yield seq, qual
        count += 1


def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction
    enzyme (RE) name is provided, can also represent the distribution of
    digested and undigested RE sites and estimate an expected proportion of
    dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file (plain text, '.gz' or '.dsrc')
    :param None r_enz: name of the restriction enzyme used
    :param None nreads: max number of reads to read, not necessary to read all
    :param None axe: matplotlib axes on which to draw the quality plot
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the
       percentage of reads with at least a ligation site. Without `r_enz`
       these cannot be estimated: the first value is None and the second 0.

    :raises ValueError: if no read is found or the last record is truncated
    """
    from collections import Counter

    phred = dict((c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        '[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~'))
    tkw = dict(size=4, width=1.5)

    if r_enz:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site * 2 == l_site:
            # in case the religated site equals 2 restriction sites
            # (like DnpII)
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)

    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)

    quals = []  # per read list of PHRED scores
    henes = []  # positions of every 'N' found
    sites = []  # start positions of undigested RE sites
    fixes = []  # start positions of repaired (dangling-end) sites
    liges = []  # start positions of ligation sites
    ligep = 0   # number of reads with at least one ligation site
    qual_line = None
    try:
        for seq, qual_line in _iter_fastq_records(fhandler, nreads):
            if r_enz:
                sites.extend(m.start() for m in site.finditer(seq))
                fixes.extend(m.start() for m in fixe.finditer(seq))
                liges.extend(m.start() for m in lige.finditer(seq))
                ligep += l_site in seq
            if 'N' in seq:
                henes.extend(i for i, s in enumerate(seq) if s == 'N')
            quals.append([phred[c] for c in qual_line.strip()])
    finally:
        fhandler.close()
    if not quals:
        raise ValueError('ERROR: no reads found in %s' % fnam)

    # use the number of reads actually parsed, it is the denominator of all
    # the percentages below
    nreads = len(quals)
    # length of the last quality line read, trailing newline included
    read_len = len(qual_line)

    # materialized because it is traversed twice
    columns = list(zip(*quals))
    meanquals = [np.mean(q) for q in columns]
    errorquals = [np.std(q) for q in columns]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')

    ax.errorbar(range(len(meanquals)), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')
    ax.set_xlim((0, read_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)

    axb = ax.twinx()
    n_counts = Counter(henes)
    axb.plot([n_counts[i] for i in range(read_len)], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, read_len))

    des = None
    if r_enz:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        site_len = max((len(r_site), len(l_site), len(d_site)))
        seq_len = read_len - site_len
        # turn lists of positions into per-position counts
        site_counts = Counter(sites)
        lige_counts = Counter(liges)
        fixe_counts = Counter(fixes)
        sites = [site_counts[k] for k in range(seq_len)]  # Undigested
        liges = [lige_counts[k] for k in range(seq_len)]  # OK
        fixes = [fixe_counts[k] for k in range(seq_len)]  # DE
        # the repaired site may be contained in the other sites: remove
        # the matches that are explained by them
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k - pos] for k in range(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k - pos] for k in range(pos, seq_len)])
        half = read_len // 2
        if paired:
            # sites overlapping the junction between both ends are not real
            sites[half - site_len: half] = [float('nan')] * site_len
            liges[half - site_len: half] = [float('nan')] * site_len
            fixes[half - site_len: half] = [float('nan')] * site_len

        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)

        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)

        has_fixes = any(f > 0 for f in fixes)
        if has_fixes:
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends (%s)' % r_site)
        ax2.set_xlim((0, read_len))

        lig_cnt = (np.nansum(liges) - liges[0] - liges[half])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[half])
        # when no repaired site is distinguishable, dangling-ends are counted
        # from the RE sites found at the beginning of the reads
        counts = fixes if has_fixes else sites
        des = (100. * (counts[0] + (counts[half] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') % (
                       (100. * lig_cnt) / (lig_cnt + sit_cnt),
                       des,
                       (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)

    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads
def _iter_fastq_quality_lines(fhandler, nreads=None):
    """
    Yield the quality line of each 4-line FASTQ record.

    :param fhandler: iterator over the lines of a FASTQ file
    :param None nreads: stop after this number of records; if None (or 0)
       the whole file is read

    :raises ValueError: if the input ends in the middle of a record
    """
    count = 0
    while not nreads or count < nreads:
        if next(fhandler, None) is None:  # no header line: regular end of file
            return
        next(fhandler, None)  # sequence line, not used here
        next(fhandler, None)  # '+' separator line
        qual = next(fhandler, None)
        if qual is None:
            raise ValueError('ERROR: truncated FASTQ record')
        yield qual
        count += 1


def quality_plot(fnam, nreads=None, axe=None, savefig=None):
    """
    Plot the mean PHRED quality (with standard deviation) per read position.

    :param fnam: path to FASTQ file (plain text or '.gz')
    :param None nreads: max number of reads to read, not necessary to read all
    :param None axe: matplotlib axes on which to draw; if None a new figure
       is created
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).

    :raises ValueError: if no read is found or the last record is truncated
    """
    # dictionary lookup instead of a linear search for each character
    phred = dict((c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        '[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~'))
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    else:
        fhandler = open(fnam)

    quals = []
    qual_line = None
    try:
        for qual_line in _iter_fastq_quality_lines(fhandler, nreads):
            quals.append([phred[c] for c in qual_line.strip()])
    finally:
        fhandler.close()
    if not quals:
        raise ValueError('ERROR: no reads found in %s' % fnam)

    # materialized because it is traversed twice
    columns = list(zip(*quals))
    meanquals = [np.mean(q) for q in columns]
    errorquals = [np.std(q) for q in columns]

    if axe:
        ax = axe
    else:
        fig = plt.figure(figsize=(15, 7))
        ax = fig.add_subplot(111)
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')

    # draw on the prepared axes (a supplied `axe` is neither cleared nor
    # replaced by a new figure)
    ax.errorbar(range(len(meanquals)), meanquals,
                yerr=errorquals, ecolor='orange')
    # length of the last quality line read, trailing newline included
    ax.set_xlim((0, len(qual_line)))
    ax.set_xlabel('Sequence')
    ax.set_ylabel('PHRED score')
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
def mmp_score(matrix, nrand=10, verbose=False, savefig=None):
    """
    Compute the MMP score of a contact matrix by comparing the spectrum of
    its eigenvalues with the spectra of randomized versions of the matrix,
    combined with the skewness and kurtosis of its log2 values.

    :param matrix: list of lists
    :param 10 nrand: number of randomizations
    :param False verbose: writes progress and results to standard output
    :param None savefig: path where to save figure

    :returns: 1- MMP score which ranges from 0 (bad) to 1 (good), and 2- the
       expected correlation of the contact matrices of the modeled chromatin
       with the original Hi-C data (plus the 3- lower and 4- upper values
       expected in 95% of the cases)

    :raises ValueError: if `nrand` is lower than 1
    """
    nrand_int = int(nrand)
    if nrand_int < 1:
        raise ValueError('ERROR: at least one randomization is needed')
    data = np.array([np.array([v for v in l]) for l in matrix])
    size = len(data)

    if verbose:
        sys.stdout.write(' - getting EigenVectors\n')
    egval, _ = np.linalg.eigh(data)
    # sort eigenvalues in decreasing order
    egval = egval[(-egval).argsort()]

    # eigenvalues of the randomized matrices
    regvals = []
    if verbose:
        sys.stdout.write(' - randomization\n')
    for i in range(nrand_int):
        if verbose:
            sys.stdout.write('\r ' + str(i + 1) + ' / ' + str(nrand))
            sys.stdout.flush()
        regval, _ = np.linalg.eigh(randomize_matrix(data))
        regvals.append(sorted((abs(j) for j in regval), reverse=True))
    if verbose:
        sys.stdout.write('\n')
    # one tuple per eigenvalue rank; a list because it is traversed twice
    regvals = list(zip(*regvals))
    rvmean = [np.mean(rv) for rv in regvals]
    total = sum(rvmean) / 100
    # expressed as percentage of the total
    rvmean = [i / total for i in rvmean]
    # error is two standard deviations of the randomized eigenvalues
    err = [2 * np.std(np.array(rv) / total) for rv in regvals]

    # distribution of the log2 of the non-zero cells (upper triangle)
    zdata = sorted(np.log2([data[i][j] for i in range(size)
                            for j in range(i, size) if data[i][j]]))
    skewness = skew(zdata)
    kurtness = kurtosis(zdata)

    xvals = range(1, 1 + len(rvmean))
    if savefig:
        _ = plt.figure(figsize=(14, 8))
        gs = gridspec.GridSpec(7, 5, wspace=0.5, hspace=1.5)
        ax1 = plt.subplot(gs[:, 0:3])
        ax2 = plt.subplot(gs[1:5, 3:])
        ax3 = plt.subplot(gs[5:7, 3:])
        img = ax2.imshow(np.log2(data), interpolation='none')
        plt.colorbar(img, ax=ax2)
        ax2.set_title('Original matrix', size=12)
        ax2.tick_params(axis='both', which='major', labelsize=10)
        ax2.set_xlabel('Bin')
        ax2.set_ylabel('Bin')

        normfit = sc_norm.pdf(zdata, np.mean(zdata), np.std(zdata))
        # 'ms' and 'markersize' are aliases, only one can be given
        _ = ax3.plot(zdata, normfit, ':o', color='grey', ms=3, alpha=.4)
        ax3.tick_params(axis='both', which='major', labelsize=10)
        ax3.hist(zdata, bins=20, density=True, alpha=0.7, color='r')
        ax3.set_xlabel('Z-score')
        ax3.set_ylabel('Frequency')

        # NOTE: these are global matplotlib settings, they stay modified
        # after this function returns
        rcParams['xtick.direction'] = 'out'
        rcParams['ytick.direction'] = 'out'
        rcParams['axes.axisbelow'] = True
        rcParams['axes.grid'] = True
        rcParams['grid.color'] = 'w'
        rcParams['grid.linestyle'] = '-'
        rcParams['grid.linewidth'] = 2

        ax1.minorticks_on()
        # passing line properties is enough to switch the grid on
        ax1.grid(ls='-', color='w', alpha=.3, lw=2, which='major')
        ax1.grid(ls='-', color='w', alpha=.3, lw=1, which='minor')
        for side in ('top', 'right', 'bottom', 'left'):
            ax1.spines[side].set_color('none')
        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.set_xscale('log')
        try:  # name of the method depends on matplotlib version
            ax1.set_axis_bgcolor((.9, .9, .9))
        except AttributeError:
            ax1.set_facecolor((.9, .9, .9))
        ax1.errorbar(xvals, rvmean, yerr=err, ecolor='red',
                     color='orange', lw=2,
                     label='%s randomizations' % (nrand))

    # observed eigenvalues as percentage of the total
    total = sum(abs(egval)) / 100
    egval = np.array(sorted([e / total for e in abs(egval)], reverse=True))

    # index of the first eigenvalue not above the randomized ones (the last
    # index if all of them are above)
    signifidx = len(rvmean) - 1
    for i in range(len(rvmean)):
        if rvmean[i] + err[i] > egval[i]:
            signifidx = i
            break
    # sum of the differences between significant real and random eigenvalues
    sev = sum(egval[:signifidx] - rvmean[:signifidx])

    if savefig:
        upper = np.array(rvmean) + np.array(err)
        ax1.plot(xvals, egval, color='green', lw=2, label='Observed data')
        ax1.fill_between(xvals, rvmean, egval, where=upper < egval,
                         facecolor='green', interpolate=True, alpha=0.2)
        ax1.fill_between(xvals, rvmean, egval, where=upper > egval,
                         facecolor='red', interpolate=True, alpha=0.2)
        ax1.set_xlim((0, len(rvmean)))
        ax1.set_ylim((0, max(max(rvmean), max(egval))))
        ax1.legend(frameon=False, loc='upper right', prop={'size': 10})
        ax1.set_xlabel('Log indexes of Eigenvalues')
        ax1.set_ylabel('Eigenvalues (percentage of total)')

    mmp = (-0.0002 * size + 0.0335 * skewness - 0.0229 * kurtness +
           0.0069 * sev + 0.8126)

    # coefficients (slope, intercept) of the linear relations used to
    # predict dSCC from the MMP score: expected value and bounds of the
    # 68%, 75% and 95% confidence intervals
    ex_a1, ex_b1 = [0.6975926, 0.2548171]
    supa1, supb1 = [0.69300732000423904, 0.29858572176099613]
    lowa1, lowb1 = [0.70217788900976075, 0.211048473299004]
    supa75, supb75 = [0.69230778430383244, 0.30526310790548261]
    lowa75, lowb75 = [0.70287742471016734, 0.20437108715451746]
    supa2, supb2 = [0.68855373600821357, 0.34109720480765293]
    lowa2, lowb2 = [0.70663147300578644, 0.16853699025234709]
    scc = (mmp - ex_b1) / ex_a1
    scc_up1 = (mmp - supb1) / supa1
    scc_lw1 = (mmp - lowb1) / lowa1
    scc_up75 = (mmp - supb75) / supa75
    scc_lw75 = (mmp - lowb75) / lowa75
    scc_up2 = (mmp - supb2) / supa2
    scc_lw2 = (mmp - lowb2) / lowa2

    if verbose:
        sys.stdout.write('\n')
        sys.stdout.write('\n Results\n')
        sys.stdout.write(' -------\n\n')
        sys.stdout.write(' MMP score: %.4f\n\n' % mmp)
        sys.stdout.write((' predicted dSCC is %.3f (%.3f-%.3f '
                          '68%% confidence)\n') % (scc, scc_up1, scc_lw1))
        sys.stdout.write((' (%.3f-%.3f '
                          '75%% confidence)\n') % (scc_up75, scc_lw75))
        sys.stdout.write((' (%.3f-%.3f '
                          '95%% confidence)\n') % (scc_up2, scc_lw2))

    if savefig:
        # write the log
        log = ''
        log += ' 1- Matrix size (number of eigenvalues): %s\n' % (len(egval))
        log += " 2- Skewness of the distribution: %0.3f\n" % (skewness)
        log += " 3- Kurtosis of the distribution: %0.3f\n" % (kurtness)
        log += " 4- Sum of differences signif EV real-rand: %0.3f\n\n" % (sev)
        plt.figtext(0.62, 0.77, log, size='small')
        log = "MMP score: %.3f\n" % (mmp)
        log += "Predicted dSCC: %.3f (%.3f-%.3f at 95%% conf)\n" % (
            scc, scc_up2, scc_lw2)
        plt.figtext(0.61, 0.87, log, size=12)
        tadbit_savefig(savefig)
        plt.close('all')
    return mmp, scc, scc_up2, scc_lw2
def _parse_coord(coord):
    """
    Split a coordinate 'chromosome:start-end' into (region, start, end); a
    coordinate that does not follow this format is taken as a region name
    (start and end are None), and an empty one gives (None, None, None).
    """
    if not coord:
        return None, None, None
    try:
        crm, pos = coord.split(':')
        start, end = pos.split('-')
        return crm, int(start), int(end)
    except ValueError:
        return coord, None, None


def run(opts):
    """
    Extract Hi-C (sub-)matrices from a BAM file, and write and/or plot them
    for each of the requested normalizations.

    :param opts: parsed command line options; the attributes used here are
       zrange, bam, biases, normalizations, workdir, coord1, coord2, quiet,
       matrix, plot, only_plot, row_names, reso, cpus, nchunks, cmap,
       format, interactive and filter.
    """
    from shutil import rmtree

    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        zrange = opts.zrange.split(',')
        vmin = float(zrange[0])
        vmax = float(zrange[1])
    else:
        vmin = vmax = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    def _load_biases():
        # NOTE(review): `load` is presumably pickle's loader, hence the
        # binary mode -- confirm against the file imports
        with open(biases, 'rb') as handle:
            return load(handle)

    coord1 = opts.coord1
    coord2 = opts.coord2
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1
    region1, start1, end1 = _parse_coord(coord1)
    # a second coordinate is only meaningful together with a first one
    region2, start2, end2 = _parse_coord(coord2 if coord1 else None)

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            if start1:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references,
                                   [x for x in bamfile.lengths]))
        # cumulative begin and end position of each chromosome
        total = 0
        section_pos = dict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else 'NRM'
                           if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    _load_biases() if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemeted for '
                         'matrices\n... skipping\n')
                    continue
                raise
            # bin coordinates relative to the beginning of the sub-matrix
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions)
                             for p in range(
                                 starts[r] if r < len(starts) and starts[r]
                                 else 0,
                                 ends[r] if r < len(ends) and ends[r]
                                 else sections[reg],
                                 opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso).replace(' ', ''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                with open(path.join(outdir, fnam), 'w') as out:
                    for reg in regions:
                        out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                    if region2:
                        out.write('# BADROWS %s\n' % (
                            ','.join([str(b) for b in bads1])))
                        out.write('# BADCOLS %s\n' % (
                            ','.join([str(b) for b in bads2])))
                    else:
                        out.write('# MASKED %s\n' % (
                            ','.join([str(b) for b in bads1])))
                    if opts.row_names:
                        # each row is prefixed by region, begin and end
                        out.write('\n'.join(
                            '%s\t%d\t%d\t' % (next(row_names)) +
                            '\t'.join(str(matrix.get((i, j), 0))
                                      for i in range(b1, e1))
                            for j in range(b2, e2)) + '\n')
                    else:
                        out.write('\n'.join(
                            '\t'.join(str(matrix.get((i, j), 0))
                                      for i in range(b1, e1))
                            for j in range(b2, e2)) + '\n')
            if opts.plot:
                cmap = plt.get_cmap(opts.cmap)
                if norm != 'raw':
                    cmap.set_bad('grey', 1.)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s.%s' % (norm, name,
                                          nicer(opts.reso).replace(' ', ''),
                                          ('_' + param_hash), opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                if opts.interactive:
                    _ = plt.figure(figsize=(8, 7))
                else:
                    _ = plt.figure(figsize=(16, 14))
                ax1 = plt.axes([0.1, 0.1, 0.7, 0.8])
                ax2 = plt.axes([0.82, 0.1, 0.07, 0.8])
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in range(b1, e1)])
                                for j in range(b2, e2)])
                # empty cells get half of the minimum to allow the log
                mini = np_min(matrix[nonzero(matrix)]) / 2.
                matrix[matrix == 0] = mini
                # mask of the filtered columns and rows
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = log2(ma.masked_array(matrix, m))
                ax1.imshow(matrix, interpolation='None', origin='lower',
                           cmap=cmap, vmin=vmin, vmax=vmax)

                if len(regions) <= 2:
                    pltbeg1 = 0 if start1 is None else start1
                    pltend1 = (sections[regions[0]] if end1 is None
                               else end1)
                    pltbeg2 = (pltbeg1 if len(regions) == 1 else
                               0 if start2 is None else start2)
                    pltend2 = (pltend1 if len(regions) == 1 else
                               sections[regions[-1]] if end2 is None
                               else end2)

                    ax1.set_xlabel('{}:{:,}-{:,}'.format(
                        regions[0], pltbeg1 if pltbeg1 else 1, pltend1))
                    ax1.set_ylabel('{}:{:,}-{:,}'.format(
                        regions[-1], pltbeg2 if pltbeg2 else 1, pltend2))

                    # default arguments bind the current offsets
                    def format_xticks(tickstring, _=None, beg=pltbeg1):
                        tickstring = int(tickstring * opts.reso + beg)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    def format_yticks(tickstring, _=None, beg=pltbeg2):
                        tickstring = int(tickstring * opts.reso + beg)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    ax1.xaxis.set_major_formatter(FuncFormatter(format_xticks))
                    ax1.yaxis.set_major_formatter(FuncFormatter(format_yticks))

                    labels = ax1.get_xticklabels()
                    plt.setp(labels, rotation=-25, ha='left')

                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                else:
                    # major ticks at chromosome limits, names on minor
                    # ticks placed in the middle of each chromosome
                    vals = [0]
                    keys = ['']
                    for crm in regions:
                        vals.append(section_pos[crm][0] / opts.reso)
                        keys.append(crm)
                    vals.append(section_pos[crm][1] / opts.reso)
                    middles = [float(vals[i] + vals[i + 1]) / 2
                               for i in range(len(vals) - 1)]
                    ax1.set_yticks(vals)
                    ax1.set_yticklabels('')
                    ax1.set_yticks(middles, minor=True)
                    ax1.set_yticklabels(keys, minor=True)
                    for t in ax1.yaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False

                    ax1.set_xticks(vals)
                    ax1.set_xticklabels('')
                    ax1.set_xticks(middles, minor=True)
                    ax1.set_xticklabels(keys, minor=True)
                    for t in ax1.xaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xlabel('Chromosomes')
                    ax1.set_ylabel('Chromosomes')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)

                # side panel: histogram of the values over the color scale
                data = [i for d in matrix for i in d if isfinite(i)]
                mindata = nanmin(data)
                maxdata = nanmax(data)
                gradient = linspace(maxdata, mindata,
                                    max((len(matrix), len(matrix[0]))))
                gradient = dstack((gradient, gradient))[0]
                h = ax2.hist(data, color='darkgrey', linewidth=2,
                             orientation='horizontal',
                             bins=50, histtype='step', density=True)
                _ = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                               extent=(0, max(h[0]), mindata, maxdata))
                ax2.yaxis.tick_right()
                ax2.yaxis.set_label_position("right")
                ax2.set_xticks([])
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' % (
                    name, norm, nicer(opts.reso)))
                ax2.set_ylabel('Hi-C Log2 interactions', rotation=-90)
                ax2.set_xlabel('Count')
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            _load_biases() if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        # no shell involved: safe with any character in the path
        rmtree(tmpdir, ignore_errors=True)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6,
                           savefig=None, show=False, savedata=None, **kwargs):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors (those with highest eigenvalues), with the absolute value
    of the Pearson correlation

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where to write the correlations
    :param kwargs: any argument to pass to matplotlib imshow function

    :returns: matrix of correlations
    """
    # get the log
    data1 = nozero_log(hic_data1.get_matrix(), np.log2)
    data2 = nozero_log(hic_data2.get_matrix(), np.log2)
    # get the eigenvectors
    ev1, evect1 = eigh(data1)
    ev2, evect2 = eigh(data2)
    # sort by decreasing eigenvalue; eigenvectors are the COLUMNS of the
    # matrices returned by eigh
    order = ev1.argsort()[::-1]
    ev1, evect1 = ev1[order], evect1[:, order]
    order = ev2.argsort()[::-1]
    ev2, evect2 = ev2[order], evect2[:, order]
    # calculate Pearson correlation
    corr = [[abs(pearsonr(evect1[:, i], evect2[:, j])[0])
             for j in range(nvect)] for i in range(nvect)]
    # plot
    if show or savefig:
        axe = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        im = axe.imshow(corr, interpolation="nearest", origin='lower',
                        **kwargs)
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        # one label per tick
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        axe.set_xticklabels(range(1, nvect + 1))
        axe.set_yticklabels(range(1, nvect + 1))
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)

        cbar = plt.colorbar(im, cax=cbaxes)
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        # secondary axes show the eigenvalue of each eigenvector
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[:nvect]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)

        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[:nvect]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)

        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        with open(savedata, 'w') as out:
            out.write('# ' + '\t'.join(['Eigen Vector %s'% i
                                        for i in range(nvect)]) + '\n')
            for i in range(nvect):
                out.write('\t'.join([str(corr[i][j])
                                     for j in range(nvect)]) + '\n')
    return corr
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6,
                           savefig=None, show=False, savedata=None):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors (those with highest eigenvalues), with the absolute value
    of the Pearson correlation

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where to write the correlations

    :returns: matrix of correlations
    """
    size1 = len(hic_data1)
    size2 = len(hic_data2)
    ev1, evect1 = eigh(np.array([[hic_data1[i, j] for j in range(size1)]
                                 for i in range(size1)]))
    ev2, evect2 = eigh(np.array([[hic_data2[i, j] for j in range(size2)]
                                 for i in range(size2)]))
    # sort by decreasing eigenvalue; eigenvectors are the COLUMNS of the
    # matrices returned by eigh, so columns (not rows) are reordered
    order = ev1.argsort()[::-1]
    ev1, evect1 = ev1[order], evect1[:, order]
    order = ev2.argsort()[::-1]
    ev2, evect2 = ev2[order], evect2[:, order]
    corr = [[abs(pearsonr(evect1[:, i], evect2[:, j])[0])
             for j in range(nvect)] for i in range(nvect)]

    if show or savefig:
        axe = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        im = axe.imshow(corr, interpolation="nearest", origin='lower')
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        # one label per tick
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        axe.set_xticklabels(range(1, nvect + 1))
        axe.set_yticklabels(range(1, nvect + 1))
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)

        cbar = plt.colorbar(im, cax=cbaxes)
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        # secondary axes show the eigenvalue of each eigenvector
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[:nvect]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)

        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[:nvect]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)

        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        with open(savedata, 'w') as out:
            out.write('# ' + '\t'.join(['Eigen Vector %s'% i
                                        for i in range(nvect)]) + '\n')
            for i in range(nvect):
                out.write('\t'.join([str(corr[i][j])
                                     for j in range(nvect)]) + '\n')
    return corr
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 xlog=False):
    """
    Plots the distribution of dangling-ends lengths

    :param fnam: input file name (tab-separated, header lines start
       with '#')
    :param None savefig: path where to store the output images.
    :param None nreads: limit on the number of reads used; reading stops once
       half this number of dangling-ends is collected
    :param 99.9 max_size: top percentage of distances to consider, within the
       top 0.01% are usually found very long outliers.
    :param None axe: matplotlib axes, passed to setup_plot
    :param False xlog: represent x axis in logarithmic scale

    :raises ValueError: if no dangling-end is found in the input file
    """
    if nreads:
        # integer division: the value is compared to a number of items
        nreads //= 2
    des = []
    with open(fnam) as fhandler:
        for line in fhandler:
            if line.startswith('#'):  # header
                continue
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            # both ends in the same restriction fragment, facing strands
            if re1 == re2 and crm1 == crm2 and dir1 != dir2:
                pos1, pos2 = int(pos1), int(pos2)
                if (pos2 > pos1) == int(dir1):
                    des.append(abs(pos2 - pos1))
            if len(des) == nreads:
                break
    if not des:
        raise ValueError('ERROR: no dangling-ends found in %s' % fnam)

    ax = setup_plot(axe, figsize=(10, 5.5))

    max_perc = np.percentile(des, max_size)
    perc99 = np.percentile(des, 99)
    perc01 = np.percentile(des, 1)
    perc95 = np.percentile(des, 95)
    perc05 = np.percentile(des, 5)
    # the 1-99% range is drawn as the two bands surrounding the 5-95% one,
    # only one of them carries the legend label
    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc),
            alpha=.7, color='darkred', label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lenghts\n(top %.1f%%, up to %0.f nts)' % (max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')
def draw_map(
    data,
    genome_seq,
    cumcs,
    savefig,
    show,
    one=False,
    clim=None,
    cmap="jet",
    decay=False,
    perc=10,
    name=None,
    cistrans=None,
    decay_resolution=10000,
    normalized=False,
    max_diff=None,
):
    """
    Draw an interaction matrix as a log2 heat-map with summary side panels.

    The figure contains the heat-map, a histogram of the log2 values with a
    normal density drawn on top, optionally the decay of interactions with
    genomic distance, and the last three columns of the eigenvector matrix
    returned by `eigh`.

    :param data: square matrix of interactions, indexable as data[i][j]
    :param genome_seq: iterable of chromosome names (may be empty/None)
    :param cumcs: mapping chromosome name -> (first bin, last bin), used to
       draw chromosome borders and tick labels
    :param savefig: path handed to `tadbit_savefig`; if empty nothing is saved
    :param show: display the figure when it is not saved
    :param False one: if True chromosome names are not written on the y axis
    :param None clim: (min, max) limits of the color scale, each may be None
    :param 'jet' cmap: matplotlib colormap name, or 'tadbit' to build a
       percentile-based colormap
    :param False decay: also plot interactions versus genomic distance
    :param 10 perc: number of cuts used to build the 'tadbit' colormap
    :param None name: text written above the statistics
    :param None cistrans: proportion of cis interactions; skipped when None
       or NaN
    :param 10000 decay_resolution: passed to `plot_distance_vs_interactions`
    :param False normalized: passed to `plot_distance_vs_interactions`
    :param None max_diff: passed to `plot_distance_vs_interactions`,
       defaults to the size of the matrix
    """
    _ = plt.figure(figsize=(15.0, 12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(
            data, genome_seq=genome_seq, axe=ax3, resolution=decay_resolution,
            max_diff=max_diff, normalized=normalized
        )
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)

    # statistics of the raw (non-log) matrix
    try:
        minoridata = np.nanmin(data)
        maxoridata = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata = np.min(vals)
        maxoridata = np.max(vals)
    totaloridata = np.nansum([data[i][j] for i in xrange(len(data))
                              for j in xrange(i, len(data))])

    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]
    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata

    posI = 0.01
    posF = 1.0
    if clim:
        if clim[0] is not None:
            posI = float(clim[0]) / diff
        if clim[1] is not None:
            posF = float(clim[1]) / diff

    if cmap == "tadbit":
        cuts = perc
        cdict = {"red": [(0.0, 0.0, 0.0)],
                 "green": [(0.0, 0.0, 0.0)],
                 "blue": [(0.0, 0.5, 0.5)]}
        prev_pos = 0
        median = (np.median(vals) - mindata) / diff
        # np.linspace needs an integer number of samples
        for prc in np.linspace(posI, median, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.0) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1.0 / cuts
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict["red"].append([pos, prc, prc])
            cdict["green"].append([pos, prc, prc])
            cdict["blue"].append([pos, 1, 1])
            prev_pos = pos
        for prc in np.linspace(median + 1.0 / cuts, posF, cuts // 2,
                               endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.0) - mindata) / diff
                prc = (prc - median) / (posF - median)
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict["red"].append([pos, 1.0, 1.0])
            cdict["green"].append([pos, 1 - prc, 1 - prc])
            cdict["blue"].append([pos, 1 - prc, 1 - prc])
            prev_pos = pos
        pos = (np.percentile(vals, 97.0) - mindata) / diff
        cdict["red"].append([pos, 0.1, 0.1])
        cdict["green"].append([pos, 0, 0])
        cdict["blue"].append([pos, 0, 0])

        cdict["red"].append([1.0, 1, 1])
        cdict["green"].append([1.0, 1, 1])
        cdict["blue"].append([1.0, 0, 0])
        cmap = LinearSegmentedColormap(cmap, cdict)
        clim = None
    else:
        cmap = plt.get_cmap(cmap)
    cmap.set_bad("darkgrey", 1)

    ax1.imshow(data, interpolation="none", cmap=cmap,
               vmin=clim[0] if clim else None,
               vmax=clim[1] if clim else None)

    # NaN cells are zeroed (symmetrically) before the eigen-decomposition
    size = len(data)
    for i in xrange(size):
        for j in xrange(i, size):
            if np.isnan(data[i][j]):
                data[i][j] = 0
                data[j][i] = 0
    evals, evect = eigh(data)
    sort_perm = evals.argsort()
    evect = evect[sort_perm]

    # from here on `data` is the flat list of log2 values
    data = [i for d in data for i in d if not np.isnan(i)]
    gradient = np.linspace(np.nanmin(data), np.nanmax(data), size)
    gradient = np.vstack((gradient, gradient))
    h = ax2.hist(data, color="darkgrey", linewidth=2, bins=20,
                 histtype="step", normed=True)
    _ = ax2.imshow(gradient, aspect="auto", cmap=cmap,
                   extent=(np.nanmin(data), np.nanmax(data), 0, max(h[0])))

    if genome_seq:
        for crm in genome_seq:
            beg = cumcs[crm][0] - 0.5
            end = cumcs[crm][1] - 0.5
            # white solid line below a black dashed line
            ax1.vlines([beg, end], beg, end, color="w", linestyle="-",
                       linewidth=1, alpha=1)
            ax1.hlines([end, beg], beg, end, color="w", linestyle="-",
                       linewidth=1, alpha=1)
            ax1.vlines([beg, end], beg, end, color="k", linestyle="--")
            ax1.hlines([end, beg], beg, end, color="k", linestyle="--")
        if not one:
            vals = [0]
            keys = [""]
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])
            ax1.set_yticks(vals)
            ax1.set_yticklabels("")
            ax1.set_yticks([float(vals[i] + vals[i + 1]) / 2
                            for i in xrange(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False

    # cistrans defaults to None, which np.isnan cannot handle
    if cistrans is None or np.isnan(cistrans):
        cis_text = ""
    else:
        cis_text = "Percentage of cis interactions: %.0f%%\n" % (cistrans * 100)
    plt.figtext(
        0.05, 0.25,
        "".join([
            (name + "\n") if name else "",
            "Number of interactions: %s\n" % str(totaloridata),
            cis_text,
            "Min interactions: %s\n" % (minoridata),
            "Max interactions: %s\n" % (maxoridata),
        ]),
    )
    ax2.set_xlim((np.nanmin(data), np.nanmax(data)))
    ax2.set_ylim((0, max(h[0])))
    ax1.set_xlim((-0.5, size - 0.5))
    ax1.set_ylim((-0.5, size - 0.5))
    ax2.set_xlabel("log interaction count")

    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d * 100)) / 100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, "w.", markersize=2.5, alpha=0.4)
    ax2.plot(subdata, normfit, "k.", markersize=1.5, alpha=1)
    ax2.set_title("skew: %.3f, kurtosis: %.3f" % (skew(data), kurtosis(data)))

    ax4.vlines(range(size), 0, evect[:, -1], color="k")
    ax4.hlines(0, 0, size, color="red")
    ax4.set_ylabel("E1")
    ax4.set_yticklabels([])
    try:
        ax5.vlines(range(size), 0, evect[:, -2], color="k")
    except IndexError:
        pass
    ax5.hlines(0, 0, size, color="red")
    ax5.set_ylabel("E2")
    ax5.set_yticklabels([])
    try:
        ax6.vlines(range(size), 0, evect[:, -3], color="k")
    except IndexError:
        pass
    ax6.hlines(0, 0, size, color="red")
    ax6.set_ylabel("E3")
    ax6.set_yticklabels([])
    xticklabels = (ax4.get_xticklabels() + ax5.get_xticklabels() +
                   ax6.get_xticklabels())
    plt.setp(xticklabels, visible=False)

    if savefig:
        tadbit_savefig(savefig)
    elif show:
        # display the figure before it is discarded
        plt.show()
        plt.close("all")
def quality_plot(fnam, r_enz=None, nreads=float('inf'), axe=None,
                 savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction
    enzyme (RE) name is provided, can also represent the distribution of
    digested and undigested RE sites and estimate an expected proportion of
    dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file ('.gz' files are opened with `gopen`,
       '.dsrc' files are decompressed through the `dsrc` command)
    :param None r_enz: name of a restriction enzyme, or list of names. Names
       are matched case-insensitively against `RESTRICTION_ENZYMES`; the
       list passed by the caller is not modified
    :param inf nreads: max number of reads to read, not necessary to read
       all. A false value means no limit. If the file holds fewer reads, the
       number of reads actually parsed is used in titles and percentages
    :param None axe: axes on which to draw the quality plot
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the
       percentage of reads with at least a ligation site (both as
       dictionaries, empty when no enzyme is given).
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
        'abcdefghijklmnopqrstuvwxyz{|}~')])

    # always end up with a (possibly empty) list of enzyme names
    if not r_enz:
        r_enzs = []
    elif isinstance(r_enz, (list, tuple)):
        r_enzs = list(r_enz)
    else:
        r_enzs = [r_enz]
    # replace each name by the spelling used as key in RESTRICTION_ENZYMES
    canonical = dict((k.lower(), k) for k in RESTRICTION_ENZYMES)
    r_enzs = [canonical.get(enz.lower(), enz) for enz in r_enzs]

    quals = []
    henes = []
    sites = {}
    fixes = {}
    liges = {}
    ligep = {}
    des = {}
    tkw = dict(size=4, width=1.5)

    if r_enzs:
        r_sites = {}
        d_sites = {}
        site = {}
        fixe = {}
        for enz in r_enzs:
            r_sites[enz] = RESTRICTION_ENZYMES[enz].replace('|', '')
            d_sites[enz] = repaired(enz)
            sites[enz] = []  # positions of undigested sites
            fixes[enz] = []  # positions of digested (repaired) sites
            site[enz] = re.compile(r_sites[enz])
            fixe[enz] = re.compile(d_sites[enz])
        l_sites = religateds(r_enzs)
        # ligation sites should appear in lower case in the sequence
        lige = {}
        for k in l_sites:
            liges[k] = []  # positions of ligation sites
            ligep[k] = 0   # number of reads holding the ligation site
            l_sites[k] = l_sites[k].lower()
            lige[k] = re.compile(l_sites[k])

    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)

    max_reads = nreads if nreads else float('inf')
    try:
        while len(quals) <= max_reads:
            # a FASTQ record is four lines; stop at end of file even if the
            # last record is truncated
            try:
                next(fhandler)
                seq = next(fhandler)
                next(fhandler)
                line = next(fhandler)
            except (StopIteration, EOFError):
                break
            if r_enzs:
                # ligation sites replaced by lower case to ease the search
                for lig in l_sites.values():
                    seq = seq.replace(lig.upper(), lig)
                for enz in r_enzs:
                    sites[enz].extend([m.start()
                                       for m in site[enz].finditer(seq)])
                    # TODO: you cannot have a repaired/fixed site in the
                    # middle of the sequence, this could be only checked at
                    # the beginning
                    fixes[enz].extend([m.start()
                                       for m in fixe[enz].finditer(seq)])
                for k in lige:  # for each pair of cut-sites
                    liges[k].extend([m.start()
                                     for m in lige[k].finditer(seq)])
                    ligep[k] += l_sites[k] in seq
            # store the number of Ns found in the sequences
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            quals.append([phred[c] for c in line.strip()])
    finally:
        fhandler.close()

    # the default limit is infinite, and the file may hold fewer reads than
    # requested: normalise by what was actually read in those cases
    if not nreads or nreads > len(quals):
        nreads = len(quals)

    quals = izip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = zip(*[(nanmean(q), nanstd(q)) for q in quals])
    max_seq_len = len(meanquals)
    half_len = max_seq_len // 2  # used as list index below

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if r_enzs:  # do both plots
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        else:  # only do the quality_plot plot
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')

    ax.errorbar(range(max_seq_len), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # quality_plot plot
    axb.plot([henes.count(i) for i in xrange(max_seq_len)], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if r_enzs:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            ', '.join(r_enzs), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        # seq_len is the length of the line to plot. we don't want to plot
        # if there is no room for the cut-site, or ligation site.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                        max([len(l_sites[k]) for k in l_sites]),
                        max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len

        # transform dictionaries of positions into dictionaries of counts
        for enz in sites:
            sites[enz] = [sites[enz].count(k)
                          for k in xrange(seq_len)]  # Undigested
            fixes[enz] = [fixes[enz].count(k)
                          for k in xrange(seq_len)]  # DE
        for key in liges:
            liges[key] = [liges[key].count(k)
                          for k in xrange(seq_len)]  # OK

        # in case the pattern of the repaired cut-site contains the target
        # cut-site pattern. These sites were counted twice, once in the
        # undigested, and once in the repaired. We remove them from the
        # repaired:
        for enz in r_enzs:
            if d_sites[enz] in r_sites[enz]:
                pos = r_sites[enz].find(d_sites[enz])
                fixes[enz] = (fixes[enz][:pos] +
                              [fixes[enz][k] - sites[enz][k - pos]
                               for k in xrange(pos, seq_len)])
        # same for ligated sites
        for enz1 in r_enzs:
            for enz2 in r_enzs:
                if d_sites[enz1] not in l_sites[(enz1, enz2)]:
                    continue
                pos = l_sites[(enz1, enz2)].find(d_sites[enz1])
                fixes[enz1] = (fixes[enz1][:pos] +
                               [fixes[enz1][k] - liges[(enz1, enz2)][k - pos]
                                for k in xrange(pos, seq_len)])

        # remove anything that could be in between the two read ends
        if paired:
            gap = slice(half_len - site_len, half_len)
            for k in sites:
                sites[k][gap] = [float('nan')] * site_len
                fixes[k][gap] = [float('nan')] * site_len
            for k in liges:
                liges[k][gap] = [float('nan')] * site_len

        # NOTE(review): the three colour expressions below were unreadable in
        # the reviewed source; they are rebuilt as one shade per curve taken
        # from plt.cm.Reds / Blues / Greens over [0.3, 0.95] to match the
        # axis label colours -- confirm against the upstream code.
        # plot undigested cut-sites
        color = iter(plt.cm.Reds(np.linspace(0.3, 0.95, len(r_enzs))))
        for enz in sites:
            ax2.plot(sites[enz], linewidth=2, color=next(color), alpha=0.9,
                     label='Undigested RE site (%s: %s)' % (enz, r_sites[enz])
                     if any([f > 0 for f in fixes[enz]])
                     else 'Undigested & Dangling-Ends (%s: %s)' % (
                         enz, r_sites[enz]))
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)

        lines, labels = ax2.get_legend_handles_labels()

        # plot ligation sites
        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(np.linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            ax3.plot(liges[(r1, r2)], linewidth=2, color=next(color),
                     alpha=0.9,
                     label='Ligated (%s-%s: %s)' % (
                         r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')

        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)

        # plot dangling-ends, on a third y axis shifted to the right
        color = iter(plt.cm.Greens(np.linspace(0.3, 0.95, len(r_enzs))))
        for enz in r_enzs:
            if any([f > 0 for f in fixes[enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                ax4.plot(fixes[enz], linewidth=2, color=next(color),
                         alpha=0.9,
                         label='Dangling-ends (%s: %s)' % (enz, d_sites[enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))

        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] -
                          liges[k][half_len])

        # Count undigested sites
        sit_cnt = {}
        for enz in r_enzs:
            sit_cnt[enz] = (nansum(sites[enz]) - sites[enz][0] -
                            sites[enz][half_len])

        # Count Dangling-Ends
        for enz in r_enzs:
            if any([f > 0 for f in fixes[enz]]):
                des[enz] = ((100. * (fixes[enz][0] +
                                     (fixes[enz][half_len]
                                      if paired else 0))) / nreads)
            else:
                des[enz] = (100. * (sites[enz][0] +
                                    (sites[enz][half_len]
                                     if paired else 0))) / nreads

        # Decorate plot
        title = ''
        for enz in r_enzs:
            lcnt = float(sum([lig_cnt[(enz1, enz2)] *
                              (2 if enz1 == enz2 else 1)
                              for enz1 in r_enzs for enz2 in r_enzs
                              if enz1 == enz or enz2 == enz]))
            title += ('Percentage of digested sites (not considering Dangling-Ends) '
                      '%s: %.1f%%\n' % (
                          enz, 100. * float(lcnt) / (lcnt + sit_cnt[enz])))
        for enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (
                enz, des[enz])
        for enz1 in r_enzs:
            for enz2 in r_enzs:
                title += ('Percentage of reads with ligation site (%s-%s): %.1f%% \n' % (
                    enz1, enz2, (ligep[(enz1, enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines, labels, bbox_to_anchor=(0.75, 1.0),
                   loc=3, borderaxespad=0., frameon=False, fontsize=9)

    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    # turn the per-pair read counts into percentages of reads
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    return des, ligep
def plot_genomic_distribution(fnam, first_read=True, resolution=10000,
                              axe=None, ylim=None, savefig=None,
                              chr_names=None, nreads=None):
    """
    Plot, for each chromosome, the number of reads falling in each bin.

    :param fnam: input file name: tab-separated reads, optionally preceded
       by header lines starting with '#'. Headers of the form
       '# CRM name<TAB>length' give the chromosome lengths.
    :param True first_read: uses first read (columns 1-2 of each line),
       otherwise columns 7-8.
    :param 10000 resolution: group reads that are closer than this resolution
       parameter
    :param None axe: when given, no new figure is created (subplots are still
       placed with `plt.subplot`)
    :param None ylim: (min, max) limits of the y axis, defaults to zero and
       the highest count found
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param None chr_names: can pass a list of chromosome names in case only
       some of them need to be plotted (this option may last even more than
       default). Chromosomes are plotted in the order given.
    :param None nreads: maximum number of read lines to examine

    :raises ValueError: if no read was retained
    """
    idx1, idx2 = (1, 3) if first_read else (7, 9)
    # keep the caller's order for plotting, a set for fast membership tests
    chr_order = list(chr_names) if chr_names else []
    wanted = set(chr_order)

    distr = {}
    genome_seq = OrderedDict()
    count = 0
    with open(fnam) as fhandler:
        for line in fhandler:
            if line.startswith('#'):
                if line.startswith('# CRM '):
                    crm, clen = line[6:].split('\t')
                    genome_seq[crm] = int(clen)
                continue
            if nreads and count >= nreads:
                break
            count += 1
            crm, pos = line.strip().split('\t')[idx1:idx2]
            if wanted and crm not in wanted:
                continue
            # floor division: bins are used as integer dictionary keys
            pos = int(pos) // resolution
            bins = distr.setdefault(crm, {})
            bins[pos] = bins.get(pos, 0) + 1

    if not distr:
        raise ValueError('no reads to plot found in %s' % fnam)

    if chr_order:
        crms = chr_order
    elif genome_seq:
        crms = list(genome_seq)
    else:
        crms = list(distr)

    if not axe:
        _ = plt.figure(figsize=(15, 3 + 3 * len(distr)))
    max_y = max([max(distr[c].values()) for c in distr])
    # the x range has to reach the last occupied bin of the longest profile
    max_x = max([max(distr[c]) for c in distr]) + 1
    ncrms = len(crms)
    for i, crm in enumerate(crms):
        plt.subplot(ncrms, 1, i + 1)
        if crm in distr:
            nbins = max(distr[crm]) + 1  # include the last bin
            plt.plot(range(nbins),
                     [distr[crm].get(j, 0) for j in xrange(nbins)],
                     color='red', lw=1.5, alpha=0.7)
        # the chromosome end can only be marked when the header gave it
        if crm in genome_seq:
            if ylim:
                plt.vlines(genome_seq[crm] // resolution, ylim[0], ylim[1])
            else:
                plt.vlines(genome_seq[crm] // resolution, 0, max_y)
        plt.xlim((0, max_x))
        plt.ylim(ylim or (0, max_y))
        plt.title(crm)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
def draw_map(data, genome_seq, cumcs, savefig, show, one=False, clim=None,
             cmap='jet', decay=False, perc=10, name=None, cistrans=None,
             decay_resolution=10000, normalized=False, max_diff=None):
    """
    Draw an interaction matrix as a log2 heat-map with summary side panels.

    The figure contains the heat-map, a histogram of the log2 values with a
    normal density drawn on top, optionally the decay of interactions with
    genomic distance, and, when `eigh` succeeds, the last three columns of
    the eigenvector matrix it returns. The matrix may not be square.

    :param data: matrix of interactions, indexable as data[i][j]
    :param genome_seq: iterable of chromosome names (may be empty/None)
    :param cumcs: mapping chromosome name -> (first bin, last bin), used to
       draw chromosome borders and tick labels
    :param savefig: path handed to `tadbit_savefig`; if empty nothing is saved
    :param show: display the figure when it is not saved
    :param False one: if True chromosome names are not written on the y axis
    :param None clim: (min, max) limits of the color scale, each may be None
    :param 'jet' cmap: matplotlib colormap name, or 'tadbit' to build a
       percentile-based colormap
    :param False decay: also plot interactions versus genomic distance
    :param 10 perc: number of cuts used to build the 'tadbit' colormap
    :param None name: text written above the statistics
    :param None cistrans: proportion of cis interactions; skipped when None
       or NaN
    :param 10000 decay_resolution: passed to `plot_distance_vs_interactions`
    :param False normalized: passed to `plot_distance_vs_interactions`
    :param None max_diff: passed to `plot_distance_vs_interactions`,
       defaults to the number of rows of the matrix
    """
    _ = plt.figure(figsize=(15., 12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(data, genome_seq=genome_seq, axe=ax3,
                                      resolution=decay_resolution,
                                      max_diff=max_diff,
                                      normalized=normalized)
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)

    # statistics of the raw (non-log) matrix
    try:
        minoridata = np.nanmin(data)
        maxoridata = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata = np.min(vals)
        maxoridata = np.max(vals)
    totaloridata = np.nansum([data[i][j] for i in xrange(len(data))
                              for j in xrange(i, len(data[i]))])  # may not be square

    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]
    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata

    posI = 0.01
    posF = 1.0
    if clim:
        if clim[0] is not None:
            posI = float(clim[0]) / diff
        if clim[1] is not None:
            posF = float(clim[1]) / diff

    if cmap == 'tadbit':
        cuts = perc
        cdict = {'red'  : [(0.0, 0.0, 0.0)],
                 'green': [(0.0, 0.0, 0.0)],
                 'blue' : [(0.0, 0.5, 0.5)]}
        prev_pos = 0
        median = (np.median(vals) - mindata) / diff
        # np.linspace needs an integer number of samples
        for prc in np.linspace(posI, median, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1. / cuts
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, prc, prc])
            cdict['green'].append([pos, prc, prc])
            cdict['blue' ].append([pos, 1, 1])
            prev_pos = pos
        for prc in np.linspace(median + 1. / cuts, posF, cuts // 2,
                               endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - median) / (posF - median))
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, 1.0, 1.0])
            cdict['green'].append([pos, 1 - prc, 1 - prc])
            cdict['blue' ].append([pos, 1 - prc, 1 - prc])
            prev_pos = pos
        pos = (np.percentile(vals, 97.) - mindata) / diff
        cdict['red'  ].append([pos, 0.1, 0.1])
        cdict['green'].append([pos, 0, 0])
        cdict['blue' ].append([pos, 0, 0])

        cdict['red'  ].append([1.0, 1, 1])
        cdict['green'].append([1.0, 1, 1])
        cdict['blue' ].append([1.0, 0, 0])
        cmap = LinearSegmentedColormap(cmap, cdict)
        clim = None
    else:
        cmap = plt.get_cmap(cmap)
    cmap.set_bad('darkgrey', 1)

    ax1.imshow(data, interpolation='none', cmap=cmap,
               vmin=clim[0] if clim else None,
               vmax=clim[1] if clim else None)

    # NaN cells are zeroed before the eigen-decomposition
    size1 = len(data)      # rows
    size2 = len(data[0])   # columns
    if size1 == size2:
        for i in xrange(size1):
            for j in xrange(i, size2):
                if np.isnan(data[i][j]):
                    data[i][j] = 0
                    data[j][i] = 0
    else:
        for i in xrange(size1):
            for j in xrange(size2):
                if np.isnan(data[i][j]):
                    data[i][j] = 0
    # best effort: a matrix that cannot be decomposed (e.g. not square) just
    # gets no eigenvector tracks
    try:
        evals, evect = eigh(data)
        sort_perm = evals.argsort()
        evect = evect[sort_perm]
    except (ValueError, np.linalg.LinAlgError):
        evals, evect = None, None

    # from here on `data` is the flat list of log2 values
    data = [i for d in data for i in d if not np.isnan(i)]
    gradient = np.linspace(np.nanmin(data), np.nanmax(data),
                           max(size1, size2))
    gradient = np.vstack((gradient, gradient))
    h = ax2.hist(data, color='darkgrey', linewidth=2,
                 bins=20, histtype='step', normed=True)
    _ = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                   extent=(np.nanmin(data), np.nanmax(data), 0, max(h[0])))

    if genome_seq:
        for crm in genome_seq:
            beg = cumcs[crm][0] - .5
            end = cumcs[crm][1] - .5
            # white solid line below a black dashed line
            ax1.vlines([beg, end], beg, end, color='w', linestyle='-',
                       linewidth=1, alpha=1)
            ax1.hlines([end, beg], beg, end, color='w', linestyle='-',
                       linewidth=1, alpha=1)
            ax1.vlines([beg, end], beg, end, color='k', linestyle='--')
            ax1.hlines([end, beg], beg, end, color='k', linestyle='--')
        if not one:
            vals = [0]
            keys = ['']
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])
            ax1.set_yticks(vals)
            ax1.set_yticklabels('')
            ax1.set_yticks([float(vals[i] + vals[i + 1]) / 2
                            for i in xrange(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False

    # cistrans defaults to None, which np.isnan cannot handle
    if cistrans is None or np.isnan(cistrans):
        cis_text = ''
    else:
        cis_text = 'Percentage of cis interactions: %.0f%%\n' % (cistrans * 100)
    plt.figtext(0.05, 0.25, ''.join([
        (name + '\n') if name else '',
        'Number of interactions: %s\n' % str(totaloridata),
        cis_text,
        'Min interactions: %s\n' % (minoridata),
        'Max interactions: %s\n' % (maxoridata)]))
    ax2.set_xlim((np.nanmin(data), np.nanmax(data)))
    ax2.set_ylim((0, max(h[0])))
    # imshow puts columns on the x axis and rows on the y axis
    ax1.set_xlim((-0.5, size2 - .5))
    ax1.set_ylim((-0.5, size1 - .5))
    ax2.set_xlabel('log interaction count')

    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d * 100)) / 100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, 'w.', markersize=2.5, alpha=.4)
    ax2.plot(subdata, normfit, 'k.', markersize=1.5, alpha=1)
    ax2.set_title('skew: %.3f, kurtosis: %.3f' % (skew(data),
                                                   kurtosis(data)))

    # eigenvector tracks; TypeError covers the case where evect is None
    try:
        ax4.vlines(range(size1), 0, evect[:, -1], color='k')
    except (TypeError, IndexError):
        pass
    ax4.hlines(0, 0, size2, color='red')
    ax4.set_ylabel('E1')
    ax4.set_yticklabels([])
    try:
        ax5.vlines(range(size1), 0, evect[:, -2], color='k')
    except (TypeError, IndexError):
        pass
    ax5.hlines(0, 0, size2, color='red')
    ax5.set_ylabel('E2')
    ax5.set_yticklabels([])
    try:
        ax6.vlines(range(size1), 0, evect[:, -3], color='k')
    except (TypeError, IndexError):
        pass
    ax6.hlines(0, 0, size2, color='red')
    ax6.set_ylabel('E3')
    ax6.set_yticklabels([])
    xticklabels = (ax4.get_xticklabels() + ax5.get_xticklabels() +
                   ax6.get_xticklabels())
    plt.setp(xticklabels, visible=False)

    if savefig:
        tadbit_savefig(savefig)
    elif show:
        # display the figure before it is discarded
        plt.show()
        plt.close('all')
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 show=False, xlog=False, stats=('median', 'perc_max')):
    """
    Plots the distribution of dangling-ends lengths

    :param fnam: input file name: tab-separated read pairs, optionally
       preceded by header lines starting with '#'
    :param None savefig: path where to store the output images.
    :param None nreads: if given, reading stops as soon as half this number
       of dangling-ends have been collected
    :param 99.9 max_size: top percentage of distances to consider, within the
       top 0.01% are usually found very long outliers.
    :param None axe: handed to `setup_plot`
    :param False show: display the plot (only when no `savefig` and no `axe`
       are given)
    :param False xlog: represent x axis in logarithmic scale
    :param ('median', 'perc_max') stats: returns this set of values calculated
       from the distribution of insert/fragment sizes. Possible values are:
        - 'median' median of the distribution
        - 'perc_max' percentile defined by the other parameter 'max_size'
        - 'first_decay' starting from the median of the distribution to the
            first window where 10 consecutive insert sizes are counted less
            than a given value (this given value is equal to the sum of all
            sizes divided by 100 000)
        - 'MAD' value returned by `mad` for the list of sizes

    :returns: the list of values requested in `stats`, in the same order.

    :raises Exception: if 'first_decay' is requested but no such window
       exists
    """
    from collections import Counter

    # floor division: the limit is compared against a list length
    max_des = nreads // 2 if nreads else None
    des = []
    # the context manager closes the file even if a line fails to parse
    with open(fnam) as fhandler:
        for line in fhandler:
            if line.startswith('#'):
                continue
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1 != re2 or crm1 != crm2 or dir1 == dir2:
                continue
            pos1, pos2 = int(pos1), int(pos2)
            if (pos2 > pos1) == int(dir1):
                des.append(abs(pos2 - pos1))
            if max_des and len(des) >= max_des:
                break

    max_perc = np.percentile(des, max_size)
    perc99 = np.percentile(des, 99)
    perc01 = np.percentile(des, 1)
    perc50 = np.percentile(des, 50)
    perc95 = np.percentile(des, 95)
    perc05 = np.percentile(des, 5)

    to_return = {'median': perc50}
    # sizes are tallied once instead of scanning the whole list per size
    size_counts = Counter(des)
    cutoff = len(des) / 100000.
    streak = 0  # consecutive sizes seen fewer than `cutoff` times
    for v in xrange(int(perc50), int(max(des))):
        if size_counts[v] < cutoff:
            streak += 1
        else:
            streak = 0
        if streak >= 10:
            to_return['first_decay'] = v - 10
            break
    else:
        # only an error when the caller asked for this statistic
        if 'first_decay' in stats:
            raise Exception('ERROR: not found')
    to_return['perc_max'] = max_perc
    to_return['MAD'] = mad(des)

    if not savefig and not axe and not show:
        return [to_return[k] for k in stats]

    ax = setup_plot(axe, figsize=(10, 5.5))
    # the 1-99% band is drawn as two spans, only one carries the legend label
    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc), alpha=.7, color='darkred',
            label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lenghts\n(median: %s, top %.1f%%, up to %0.f nts)' % (
                     perc50, max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show and not axe:
        # display the figure before discarding it
        plt.show()
        plt.close('all')
    return [to_return[k] for k in stats]
def mmp_score(matrix, nrand=10, verbose=False, savefig=None):
    """
    Compute the MMP score of a matrix from its size, the skewness and
    kurtosis of its log2 values, and the difference between its eigenvalues
    and those of randomized versions of the matrix.

    :param matrix: list of lists
    :param 10 nrand: number of randomizations
    :param False verbose: write progress and results to standard output
    :param None savefig: path where to save figure

    :returns: 1- MMP score which ranges from 0 (bad) to 1 (good), and 2- the
       expected correlation of the contact matrices of the modeled chromatin
       with the original Hi-C data (plus the 3- lower and 4- upper values
       expected in 95% of the cases)
    """
    data = np.array([np.array([v for v in l]) for l in matrix])

    if verbose:
        sys.stdout.write(' - getting EigenVectors\n')
    egval, _ = np.linalg.eigh(data)
    # sort eigenvalues/vectors
    idx = (-egval).argsort()
    egval = egval[idx]

    regvals = []
    if verbose:
        sys.stdout.write(' - randomization\n')
    for i in xrange(int(nrand)):
        if verbose:
            sys.stdout.write('\r ' + str(i + 1) + ' / ' + str(nrand))
            sys.stdout.flush()
        regval, _ = np.linalg.eigh(randomize_matrix(data))
        regval = [abs(j) for j in regval]
        regval.sort(reverse=True)
        regvals.append(regval)
    if verbose:
        sys.stdout.write('\n')
    # materialised because it is traversed twice below
    regvals = list(zip(*regvals))
    rvmean = [np.mean(rv) for rv in regvals]
    total = sum(rvmean) / 100
    rvmean = [i / total for i in rvmean]
    # error bars: two standard deviations of the scaled random eigenvalues
    err = [2 * np.std(rv / total) for rv in regvals]

    zdata = sorted(np.log2([data[i][j] for i in xrange(len(data))
                            for j in xrange(i, len(data)) if data[i][j]]))
    skewness = skew(zdata)
    kurtness = kurtosis(zdata)

    if savefig:
        _ = plt.figure(figsize=(14, 8))
        gs = gridspec.GridSpec(7, 5, wspace=0.5, hspace=1.5)
        ax1 = plt.subplot(gs[:, 0:3])
        ax2 = plt.subplot(gs[1:5, 3:])
        ax3 = plt.subplot(gs[5:7, 3:])
        img = ax2.imshow(np.log2(data), interpolation='none')
        plt.colorbar(img, ax=ax2)

        ax2.set_title('Original matrix', size=12)
        ax2.tick_params(axis='both', which='major', labelsize=10)
        ax2.set_xlabel('Bin')
        ax2.set_ylabel('Bin')

        normfit = sc_norm.pdf(zdata, np.mean(zdata), np.std(zdata))
        # NOTE(review): the call used to pass both ms=3 and markersize=.5,
        # two names for the same property; only ms=3 is kept -- confirm the
        # intended marker size.
        _ = ax3.plot(zdata, normfit, ':o', color='grey', ms=3, alpha=.4)
        ax3.tick_params(axis='both', which='major', labelsize=10)
        ax3.hist(zdata, bins=20, normed=True, alpha=0.7, color='r')
        ax3.set_xlabel('Z-score')
        ax3.set_ylabel('Frequency')

        # these settings modify the global matplotlib configuration
        rcParams['xtick.direction'] = 'out'
        rcParams['ytick.direction'] = 'out'
        rcParams['axes.axisbelow'] = True
        rcParams['axes.grid'] = True
        rcParams['grid.color'] = 'w'
        rcParams['grid.linestyle'] = '-'
        rcParams['grid.linewidth'] = 2

        ax1.minorticks_on()
        ax1.grid(ls='-', color='w', alpha=.3, lw=2, which='major')
        ax1.grid(True, ls='-', color='w', alpha=.3, lw=1, which='minor')
        ax1.spines['top'].set_color('none')
        ax1.spines['right'].set_color('none')
        ax1.spines['bottom'].set_color('none')
        ax1.spines['left'].set_color('none')
        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.set_xscale('log')
        ax1.patch.set_facecolor((.9, .9, .9))
        ax1.errorbar(range(1, 1 + len(rvmean)), rvmean, yerr=err,
                     ecolor='red', color='orange', lw=2,
                     label='%s randomizations' % (nrand))

    # observed eigenvalues as percentage of their total, largest first
    total = sum(abs(egval)) / 100
    egval = np.array(sorted([e / total for e in abs(egval)], reverse=True))

    # index of the first eigenvalue not above the randomized ones (stays at
    # the last index when none is found, and at 0 without randomizations)
    signifidx = 0
    for signifidx in xrange(len(rvmean)):
        if rvmean[signifidx] + err[signifidx] > egval[signifidx]:
            break
    size = len(data)
    sev = sum(egval[:signifidx] - rvmean[:signifidx])

    if savefig:
        ax1.plot(range(1, 1 + len(rvmean)), egval,
                 color='green', lw=2, label='Observed data')
        ax1.fill_between(range(1, 1 + len(rvmean)), rvmean, egval,
                         where=(np.array(rvmean) + np.array(err)) < egval,
                         facecolor='green', interpolate=True, alpha=0.2)
        ax1.fill_between(range(1, 1 + len(rvmean)), rvmean, egval,
                         where=(np.array(rvmean) + np.array(err)) > egval,
                         facecolor='red', interpolate=True, alpha=0.2)
        ax1.set_xlim((0, len(rvmean)))
        ax1.set_ylim((0, max(max(rvmean), max(egval))))
        ax1.legend(frameon=False, loc='upper right', prop={'size': 10})
        ax1.set_xlabel('Log indexes of Eigenvalues')
        ax1.set_ylabel('Eigenvalues (percentage of total)')

    mmp = (-0.0002 * size + 0.0335 * skewness - 0.0229 * kurtness +
           0.0069 * sev + 0.8126)

    if verbose:
        sys.stdout.write('\n')
        sys.stdout.write('\n Results\n')
        sys.stdout.write(' -------\n\n')
        sys.stdout.write(' MMP score: %.4f\n\n' % mmp)

    # slope/intercept pairs used to turn the MMP score into an expected
    # correlation and its confidence bounds
    ex_a1, ex_b1 = [0.6975926, 0.2548171]
    supa1, supb1 = [0.69300732000423904, 0.29858572176099613]
    lowa1, lowb1 = [0.70217788900976075, 0.211048473299004]
    scc = (mmp - ex_b1) / ex_a1
    scc_up1 = (mmp - supb1) / supa1
    scc_lw1 = (mmp - lowb1) / lowa1
    if verbose:
        sys.stdout.write((' predicted dSCC is %.3f (%.3f-%.3f ' +
                          '68%% confidence)\n') % (scc, scc_up1, scc_lw1))

    supa75, supb75 = [0.69230778430383244, 0.30526310790548261]
    lowa75, lowb75 = [0.70287742471016734, 0.20437108715451746]
    scc_up75 = (mmp - supb75) / supa75
    scc_lw75 = (mmp - lowb75) / lowa75
    if verbose:
        sys.stdout.write((' (%.3f-%.3f ' +
                          '75%% confidence)\n') % (scc_up75, scc_lw75))

    supa2, supb2 = [0.68855373600821357, 0.34109720480765293]
    lowa2, lowb2 = [0.70663147300578644, 0.16853699025234709]
    scc_up2 = (mmp - supb2) / supa2
    scc_lw2 = (mmp - lowb2) / lowa2
    if verbose:
        sys.stdout.write((' (%.3f-%.3f ' +
                          '95%% confidence)\n') % (scc_up2, scc_lw2))

    if savefig:
        # write the log
        log = ''
        log += ' 1- Matrix size (number of eigenvalues): %s\n' % (
            len(egval))
        log += " 2- Skewness of the distribution: %0.3f\n" % (skewness)
        log += " 3- Kurtosis of the distribution: %0.3f\n" % (kurtness)
        log += " 4- Sum of differences signif EV real-rand: %0.3f\n\n" % (
            sev)
        plt.figtext(0.62, 0.77, log, size='small')
        log = "MMP score: %.3f\n" % (mmp)
        log += "Predicted dSCC: %.3f (%.3f-%.3f at 95%% conf)\n" % (
            scc, scc_up2, scc_lw2)
        plt.figtext(0.61, 0.87, log, size=12)
        tadbit_savefig(savefig)
        plt.close('all')
    return mmp, scc, scc_up2, scc_lw2
def correlate_matrices(hic_data1, hic_data2, max_dist=10, intra=False, axe=None,
                       savefig=None, show=False, savedata=None,
                       normalized=False, remove_bad_columns=True, **kwargs):
    """
    Compare the interactions of two Hi-C matrices at a given distance, with
    Spearman rank correlation.

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 10 max_dist: maximum distance from diagonal, in bins (e.g. 10 means
       that diagonals 1 to 10 are compared)
    :param False intra: only takes into account intra-chromosomal contacts.
       Falls back to False (with a warning) when the two objects do not carry
       identical, non-empty ``sections``
    :param None axe: a matplotlib axes object to draw on; when given, the
       figure is left open for the caller
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a tab-separated file where to write the
       distances and correlations
    :param False normalized: use normalized data (raw count divided by the
       bias of both bins)
    :param True remove_bad_columns: computes the union of bad columns between
       samples and exclude them from the comparison
    :param kwargs: only ``get_bads`` is read; when true the set of excluded
       columns is returned as third element

    :returns: list of correlations and list of genomic distances (plus the
       excluded columns if ``get_bads`` is passed)
    """
    if normalized:
        def get_cell(hic, i, j):
            return hic[j, i] / hic.bias[i] / hic.bias[j]
    else:
        def get_cell(hic, i, j):
            return hic[j, i]

    # ``bads`` has to exist whatever the value of remove_bad_columns: it is
    # tested for every cell and may be returned to the caller.
    bads = {}
    if remove_bad_columns:
        # union of bad columns
        bads = hic_data1.bads.copy()
        bads.update(hic_data2.bads)

    # list of (first bin, end bin) ranges inside which diagonals are walked
    if (intra and hic_data1.sections and hic_data2.sections and
            hic_data1.sections == hic_data2.sections):
        segments = [(hic_data1.section_pos[crm][0],
                     hic_data1.section_pos[crm][1])
                    for crm in hic_data1.section_pos]
    else:
        if intra:
            warn('WARNING: hic_dta does not contain chromosome coordinates, ' +
                 'intra set to False')
        segments = [(0, len(hic_data1))]

    corrs = []
    dists = []
    for dist in range(1, max_dist + 1):
        diag1 = []
        diag2 = []
        for beg, end in segments:
            for j in range(beg, end - dist):
                i = j + dist
                if j in bads or i in bads:
                    continue
                diag1.append(get_cell(hic_data1, i, j))
                diag2.append(get_cell(hic_data2, i, j))
        corrs.append(spearmanr(diag1, diag2)[0])
        dists.append(dist)

    if show or savefig or axe:
        given_axe = bool(axe)
        if not given_axe:
            fig = plt.figure()
            axe = fig.add_subplot(111)
        axe.plot(dists, corrs, color='orange', linewidth=3, alpha=.8)
        axe.set_xlabel('Genomic distance in bins')
        axe.set_ylabel('Spearman rank correlation')
        if dists:
            axe.set_xlim((0, dists[-1]))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        # only release figures created here, the caller owns a passed axe
        if not given_axe:
            plt.close('all')

    if savedata:
        with open(savedata, 'w') as out:
            out.write('# genomic distance\tSpearman rank correlation\n')
            for dist, corr in zip(dists, corrs):
                out.write('%s\t%s\n' % (dist, corr))

    if kwargs.get('get_bads', False):
        return corrs, dists, bads
    return corrs, dists
def filter_by_zero_count(matrx, draw_hist=False, savefig=None):
    """
    Fits the distribution of the number of non-zero cells per column of the
    matrix to a polynomial, then takes a root of the derivative of the best
    fitting polynomial as cutoff to flag columns.

    :param matrx: list of columns, each of them being a list of counts
    :param False draw_hist: draws the histogram, the fitted polynomial and the
       selected cutoff
    :param None savefig: path where to save the plot (only used with
       ``draw_hist``); if None the plot is displayed

    :returns: a dictionary with the index of the flagged columns as keys (all
       values are None)

    :raises ValueError: if none of the tested polynomials has a root between
       zero and the median
    """
    nbins = 100

    def _bin_columns(values):
        # histogram of the values in ``nbins`` bins spanning their range
        edges = np.linspace(min(values), max(values), nbins)
        binned = np.digitize(values, edges)
        return edges, [sum(binned == i) for i in range(1, nbins + 1)]

    # number of non-zero cells per column, columns being ordered by their sum
    cols = np.array([len(c) - c.count(0) for c in sorted(matrx, key=sum)])
    if draw_hist:
        plt.figure(figsize=(9, 9))
    median = np.median(cols)
    # bin the columns
    y, x = _bin_columns(cols)
    if draw_hist:
        plt.hist(cols, bins=100, alpha=.3, color='grey')
    xp = range(0, cols[-1])
    # check if the binning is correct: while more than two thirds of the bins
    # are empty, drop the last column and bin again
    while list(x).count(0) > 2 * len(x) / 3:
        cols = cols[:-1]
        y, x = _bin_columns(cols)
        if draw_hist:
            plt.clf()
            plt.hist(cols, bins=100, alpha=.3, color='grey')
        xp = range(0, cols[-1])
    # find best polynomial fit in a given range
    best = None
    for order in range(7, 14):
        z = np.polyfit(y, x, order)
        zpp = np.polyder(z, m=1)
        roots = np.roots(np.polyder(z))
        # slope of the fit half way between the two last roots decides which
        # of these two roots is kept
        pente = np.polyval(zpp, abs(roots[-2] - roots[-1]) / 2 + roots[-1])
        if pente > 0:
            root = roots[-1]
        else:
            root = roots[-2]
        # root must be higher than zero
        if root <= 0:
            continue
        # and lower than the median
        if root >= median:
            continue
        p = np.poly1d(z)
        R2 = get_r2(p, x, y)
        # explicit None test: ordering None against a float only works in
        # Python 2
        if best is None or best[0] < R2:
            best = (R2, order, p, z, root)
    if best is None:
        raise ValueError('ERROR: no polynomial fit with a root between zero '
                         'and the median was found')
    p, z, root = best[2:]
    if draw_hist:
        a = plt.plot(xp, p(xp), "--", color='k')
        b = plt.vlines(root, 0, plt.ylim()[1], colors='r', linestyles='dashed')
        labels = ['polyfit \n%s' % (
            ''.join([sub('e([-+][0-9]+)', 'e^{\\1}',
                         '$%s%.1fx^%s$' % ('+' if j>0 else '', j,
                                           '{' + str(i) + '}'))
                     for i, j in enumerate(list(p)[::-1])])),
                  'first solution of polynomial derivation']
        try:
            plt.legend(a + [b], labels, fontsize='x-small')
        except TypeError:
            # legend of this matplotlib does not accept ``fontsize``
            plt.legend(a + [b], labels)
        plt.ylim(0, plt.ylim()[1])
        if savefig:
            tadbit_savefig(savefig)
        else:
            plt.show()
    # label as bad the columns with sums lower than the root
    # NOTE(review): the cutoff is fitted on non-zero counts but compared here
    # to column sums -- confirm this is intended
    bads = {}
    for i, col in enumerate(matrx):
        if sum(col) < root:
            bads[i] = None
    # now stored in Experiment._zeros, used for getting more accurate z-scores
    return bads
def visualize(self, names=None, tad=None, focus=None, paint_tads=False,
              axe=None, show=True, logarithm=True, normalized=False,
              relative=True, decorate=True, savefig=None, clim=None,
              scale=(8, 6), cmap='jet'):
    """
    Visualize the matrix of Hi-C interactions of a given experiment

    :param None names: name of the experiment to visualize, or list of
       experiment names. If None, all experiments will be shown. An element
       of the list can itself be a pair of names, in which case both
       experiments are drawn in the same axes (first one 'up', second one
       'down')
    :param None tad: a given TAD in the form:
       ::

         {'start': start,
          'end'  : end,
          'brk'  : end,
          'score': score}

       **Alternatively** a list of the TADs can be passed (all the TADs
       between the first and last one passed will be showed. Thus, passing
       more than two TADs might be superfluous)
    :param None focus: a tuple with the start and end positions of the
       region to visualize
    :param False paint_tads: draw a box around the TADs defined for this
       experiment
    :param None axe: an axe object from matplotlib can be passed in order
       to customize the picture
    :param True show: either to pop-up matplotlib image or not
    :param True logarithm: show the logarithm values
    :param False normalized: show the normalized data (weights might have
       been calculated previously). *Note: white rows/columns may appear in
       the matrix displayed; these rows correspond to filtered rows (see*
       :func:`pytadbit.utils.hic_filtering.hic_filtering_for_modelling` *)*
    :param True relative: color scale is relative to the whole matrix of
       data, not only to the region displayed
    :param True decorate: draws color bar, title and axes labels
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param None clim: tuple with minimum and maximum value range for color
       scale. I.e. clim=(-4, 10)
    :param (8, 6) scale: width and height of each panel of the figure
    :param 'jet' cmap: color map from matplotlib. Can also be a
       preconfigured cmap object.
    """
    if names is None:
        names = [xpr.name for xpr in self.experiments]
    if not isinstance(names, (list, tuple)):
        names = [names]
        cols = 1
        rows = 1
    else:
        # grid as square as possible
        sqrtxpr = sqrt(len(names))
        cols = int(round(sqrtxpr + (0.0 if int(sqrtxpr)==sqrtxpr else .5)))
        rows = int(sqrtxpr+.5)
    notaxe = axe is None
    if not scale:
        scale = (8, 6)
    figsize = (scale[0] * cols, scale[1] * rows)
    if notaxe and len(names) != 1:
        fig = plt.figure(figsize=figsize)
    for i in range(rows):
        for j in range(cols):
            idx = i * cols + j
            if idx >= len(names):
                break
            name = names[idx]
            if notaxe and len(names) != 1:
                axe = fig.add_subplot(rows, cols, idx + 1)
            if isinstance(name, (tuple, list)):
                # pair of experiments sharing the same axes
                if not axe:
                    fig = plt.figure(figsize=figsize)
                    axe = fig.add_subplot(rows, cols, idx + 1)
                xpr1 = self.get_experiment(name[0])
                xpr2 = self.get_experiment(name[1])
                img = xpr1.view(tad=tad, focus=focus, paint_tads=paint_tads,
                                axe=axe, show=False, logarithm=logarithm,
                                normalized=normalized, relative=relative,
                                decorate=decorate, savefig=False,
                                where='up', clim=clim, cmap=cmap)
                # second experiment uses the color range of the first one
                # unless clim is given
                img = xpr2.view(tad=tad, focus=focus, paint_tads=paint_tads,
                                axe=axe, show=False, logarithm=logarithm,
                                normalized=normalized, relative=relative,
                                decorate=False, savefig=False, where='down',
                                clim=clim or img.get_clim(), cmap=cmap)
                if decorate:
                    # NOTE(review): the two names used to fill this label
                    # were reconstructed -- confirm against the original
                    plt.text(1.01, .5,
                             'Chromosome %s experiment %s' % (
                                 self.name, xpr2.name),
                             rotation=-90, va='center', size='large',
                             ha='left', transform=axe.transAxes)
            else:
                xper = self.get_experiment(name)
                # nothing to draw for experiments without data
                if not xper.hic_data and not xper.norm:
                    continue
                xper.view(tad=tad, focus=focus, paint_tads=paint_tads,
                          axe=axe, show=False, logarithm=logarithm,
                          normalized=normalized, relative=relative,
                          decorate=decorate, savefig=False, clim=clim,
                          cmap=cmap)
    if savefig:
        tadbit_savefig(savefig)
    if show:
        plt.show()
def plot_distance_vs_interactions(data, min_diff=1, max_diff=1000, show=False,
                                  genome_seq=None, resolution=None, axe=None,
                                  savefig=None, normalized=False):
    """
    Plot the number of interactions as a function of the genomic distance
    (both in log scale) and fit three consecutive power laws to it.

    :param data: input file name, or HiC_data object or list of lists
    :param 1 min_diff: lower limit (in number of bins)
    :param 1000 max_diff: upper limit (in number of bins) to look for
    :param False show: displays the plot (only if ``savefig`` is not set)
    :param None genome_seq: dictionary with the size, in bins, of each
       chromosome; only used when ``data`` is a list of lists
    :param None resolution: group reads that are closer than this resolution
       parameter. With a resolution of 1 (or None) the limits between the
       three fitted regions are searched; otherwise they are fixed at 0.7 Mb
       and 10 Mb
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False normalized: use normalized data (only applies to HiC_data
       objects)

    :returns: slope, intercept and R of each of the 3 correlations
       (NOTE(review): when resolution is 1 the first R is squared, the two
       others are not -- kept as such for backward compatibility)
    """
    resolution = resolution or 1
    dist_intr = dict([(i, 0) for i in range(min_diff, max_diff)])
    if isinstance(data, str):
        with open(data) as fhandler:
            in_header = True
            for line in fhandler:
                # only the leading block of '#' lines is a header
                if in_header and line.startswith('#'):
                    continue
                in_header = False
                _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
                if cr1 != cr2:
                    continue
                diff = abs(int(ps1) // resolution - int(ps2) // resolution)
                if max_diff > diff >= min_diff:
                    dist_intr[diff] += 1
    elif isinstance(data, HiC_data):
        if normalized:
            get_data = lambda x, y: data[x, y] / data.bias[x] / data.bias[y]
        else:
            get_data = lambda x, y: data[x, y]
        max_diff = min(len(data), max_diff)
        if data.section_pos:
            for crm in data.section_pos:
                for diff in range(min_diff, min(
                        (max_diff, 1 + data.chromosomes[crm]))):
                    for i in range(data.section_pos[crm][0],
                                   data.section_pos[crm][1] - diff):
                        dist_intr[diff] += get_data(i, i + diff)
        else:
            for diff in range(min_diff, max_diff):
                for i in range(len(data) - diff):
                    if not np.isnan(data[i, i + diff]):
                        # cell at distance diff from bin i is (i, i + diff)
                        dist_intr[diff] += get_data(i, i + diff)
    else:
        if genome_seq:
            max_diff = min(max(genome_seq.values()), max_diff)
            cnt = 0
            for crm in genome_seq:
                for diff in range(min_diff, min(
                        (max_diff, genome_seq[crm]))):
                    for i in range(cnt, cnt + genome_seq[crm] - diff):
                        if not np.isnan(data[i][i + diff]):
                            dist_intr[diff] += data[i][i + diff]
                cnt += genome_seq[crm]
        else:
            max_diff = min(len(data), max_diff)
            for diff in range(min_diff, max_diff):
                for i in range(len(data) - diff):
                    if not np.isnan(data[i][i + diff]):
                        dist_intr[diff] += data[i][i + diff]
    if not axe:
        fig=plt.figure()
        axe = fig.add_subplot(111)
    # remove last part of the plot in case no interaction is count... reduce
    # max_dist
    for diff in range(max_diff - 1, min_diff, -1):
        try:
            if not dist_intr[diff]:
                del(dist_intr[diff])
                max_diff -=1
                continue
        except KeyError:
            max_diff -=1
            continue
        break
    xp, yp = zip(*sorted(dist_intr.items(), key=lambda x:x[0]))
    # keep distances with at least one interaction (log scale)
    x = []
    y = []
    for k in range(len(xp)):
        if yp[k]:
            x.append(xp[k])
            y.append(yp[k])
    axe.plot(x, y, 'k.')
    logx = np.log(x)
    logy = np.log(y)
    if resolution == 1:
        # search the two limits maximizing the sum of the squared R of the
        # three regressions; k is the gap, in points, left between regions
        ntries = 100
        k = 1
        best = (float('-inf'), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
        for i in range(3, ntries-2-k):
            v1 = i * len(x) // ntries
            try:
                a1, b1, r21, _, _ = linregress(logx[ :v1], logy[ :v1])
            except ValueError:
                a1 = b1 = r21 = 0
            r21 *= r21
            for j in range(i + 1 + k, ntries - 2 - k):
                v2 = j * len(x) // ntries
                try:
                    a2, b2, r22, _, _ = linregress(logx[v1+k:v2],
                                                   logy[v1+k:v2])
                    a3, b3, r23, _, _ = linregress(logx[v2+k:  ],
                                                   logy[v2+k:  ])
                except ValueError:
                    a2 = b2 = r22 = 0
                    a3 = b3 = r23 = 0
                r2 = r21 + r22**2 + r23**2
                if r2 > best[0]:
                    # R values are stored along with the fit so that the
                    # returned ones describe the plotted lines
                    best = (r2, v1, v2, a1, a2, a3, b1, b2, b3, k,
                            r21, r22, r23)
        (v1, v2, a1, a2, a3, b1, b2, b3, k, r21, r22, r23) = best[1:]
        names = ('1', '2', '3')
    else:
        k = 0
        # from 0.7 Mb
        v1 = int(700000   // resolution)
        # to 10 Mb
        v2 = int(10000000 // resolution)
        try:
            a1, b1, r21, _, _ = linregress(logx[  :v1], logy[  :v1])
        except ValueError:
            a1, b1, r21 = 0, 0, 0
        try:
            a2, b2, r22, _, _ = linregress(logx[v1:v2], logy[v1:v2])
        except ValueError:
            a2, b2, r22 = 0, 0, 0
        try:
            a3, b3, r23, _, _ = linregress(logx[v2:  ], logy[v2:  ])
        except ValueError:
            a3, b3, r23 = 0, 0, 0
        names = (r'0-0.7 \mathrm{ Mb}', r'0.7-10 \mathrm{ Mb}',
                 r'10 \mathrm{ Mb}-\infty')
    # plot lines of best fit
    regions = ((slice(None  , v1  ), a1, b1, 'yellow'),
               (slice(v1 + k, v2  ), a2, b2, 'orange'),
               (slice(v2 + k, None), a3, b3, 'red'   ))
    for (region, slope, intercept, color), name in zip(regions, names):
        xfit = x[region]
        axe.plot(xfit, np.exp(intercept + slope * np.array(np.log(xfit))),
                 color=color, lw=2,
                 label=r'$\alpha_{%s}=%.2f$' % (name, slope))
    axe.set_ylabel('Log interaction count')
    axe.set_xlabel('Log genomic distance (resolution: %s)' % nicer(resolution))
    axe.legend(loc='lower left', frameon=False)
    axe.set_xscale('log')
    axe.set_yscale('log')
    axe.set_xlim((min_diff, max_diff))
    try:
        axe.set_ylim((0, max(y)))
    except ValueError:
        pass
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show==True:
        plt.show()
        plt.close('all')
    return (a1, b1, r21), (a2, b2, r22), (a3, b3, r23)
def filter_by_cis_percentage(cisprc, beg=0.3, end=0.8, sigma=2, verbose=False,
                             size=None, min_perc=None, max_perc=None,
                             savefig=None):
    """
    Define artifactual columns with either too low or too high counts of
    interactions by comparing their percentage of cis interactions
    (intra-chromosomal).

    Bins are sorted by their total number of interactions; a sliding window
    over this ordering gives, for each position, the median cis ratio plus
    and minus ``sigma`` standard deviations. Bins are kept from the first to
    the last position where both values stay inside their confidence bands.

    :param cisprc: dictionary with counts of cis-percentage by bin number.
       Values of the dictionary are tuple with, as first element the number
       of cis interactions and as second element the total number of
       interactions.
    :param 0.3 beg: proportion of bins to be considered as possibly having
       low counts
    :param 0.8 end: proportion of bins to be considered as possibly having
       high counts
    :param 2 sigma: number of standard deviations used to define lower and
       upper ranges in the variation of the percentage of cis interactions
    :param False verbose: prints the cutoffs applied
    :param None size: size of the genome, in number of bins (otherwise
       inferred from cisprc dictionary)
    :param None min_perc: percentage of the sorted bins to use as lower
       cutoff, instead of the one inferred
    :param None max_perc: percentage of the sorted bins to use as upper
       cutoff, instead of the one inferred
    :param None savefig: path to save image of the distribution of cis
       percentages and total counts by bin.

    :returns: dictionary of bins to be filtered out (with either too low or
       too high counts of interactions), with their total count as value.

    :raises Exception: if a cutoff cannot be inferred and the corresponding
       ``min_perc`` / ``max_perc`` is not given
    """
    sorted_sum, indices = list(zip(*sorted((cisprc[i][1], i) for i in cisprc)))
    sorted_prc = [float(cisprc[i][0]) / cisprc[i][1] for i in indices]
    size = (max(indices) + 1) if not size else size
    win_size = _best_window_size(sorted_prc, size, beg, end, verbose=verbose)
    # define confidence bands, compute median plus/minus sigma standard
    # deviations in each window
    errors_pos = []
    errors_neg = []
    for k in range(0, size, 1):
        vals = sorted_prc[k:k + win_size]
        std = np.std(vals)
        med = np.median(vals)
        errors_pos.append(med + std * sigma)
        errors_neg.append(med - std * sigma)
    # calculate median and variation of both series for the windows between
    # proportions ``beg`` and ``end`` of the sorted bins
    core = slice(int(size * beg), int(size * end))
    # - for median plus standard deviation
    std_err_pos = np.std(errors_pos[core])
    med_err_pos = np.median(errors_pos[core])
    # - for median minus standard deviation
    std_err_neg = np.std(errors_neg[core])
    med_err_neg = np.median(errors_neg[core])
    # define cutoffs: each series should stay between its own median +/-
    # sigma times its own standard deviation
    beg_pos = med_err_pos - std_err_pos * sigma
    end_pos = med_err_pos + std_err_pos * sigma
    beg_neg = med_err_neg - std_err_neg * sigma
    end_neg = med_err_neg + std_err_neg * sigma

    # left: first position followed by ``consecutive`` windows inside bands
    cutoffL = None
    passed = 0
    consecutive = 10
    for cutoffL, (p, n) in enumerate(zip(errors_pos, errors_neg)):
        if (beg_pos < p < end_pos) and (beg_neg < n < end_neg):
            if passed >= consecutive:
                break
            passed += 1
        else:
            passed = 0
    else:
        if min_perc is None:
            raise Exception('ERROR: left cutoff not found!!!\n'
                            ' define it by hand with min_perc')
        else:
            cutoffL = min_perc / 100. * size + consecutive
    cutoffL -= consecutive  # rescale, we asked for XX consecutive
    # right
    cutoffR = None
    passed = 0
    for cutoffR, (p, n) in enumerate(list(zip(errors_pos, errors_neg))[::-1]):
        cutoffR = size - cutoffR
        if (beg_pos < p < end_pos) and (beg_neg < n < end_neg):
            if passed >= consecutive:
                break
            passed += 1
        else:
            passed = 0
    else:
        if max_perc is None:
            raise Exception('ERROR: right cutoff not found!!!\n'
                            ' define it by hand with max_perc')
        else:
            cutoffR = max_perc / 100. * size - consecutive
    cutoffR += consecutive  # rescale, we asked for XX consecutive
    # cutoffs given by hand always win
    if min_perc:
        cutoffL = min_perc / 100. * size
    if max_perc:
        cutoffR = max_perc / 100. * size
    min_count = sorted_sum[int(cutoffL)]
    try:
        max_count = sorted_sum[int(cutoffR)]
    except IndexError:  # all good
        max_count = sorted_sum[-1] + 1
    if verbose:
        print(' * Lower cutoff applied until bin number: %d' % (cutoffL))
        print(
            ' * too few interactions defined as less than %9d interactions'
            % (min_count))
        print(' * Upper cutoff applied until bin number: %d' % (cutoffR))
        print(
            ' * too much interactions defined as more than %9d interactions'
            % (max_count))
    # plot
    if savefig:
        if verbose:
            print(' -> Making plot...')
        # one mark every percent of the bins, at least every bin (a step of
        # zero is not a valid slice/range step)
        step = max(1, size // 100)
        fig = plt.figure(figsize=(20, 11))
        ax1 = fig.add_subplot(111)
        plt.subplots_adjust(left=0.25, bottom=0.2)
        line1 = ax1.plot([
            float(cisprc.get(i, [0, 0])[0]) / cisprc.get(i, [1, 1])[1]
            for i in indices
        ], '.', color='grey', alpha=0.2,
                         label='cis interactions ratio by bin', zorder=1)
        line2 = ax1.plot(list(range(0, len(indices), 20)), [
            sum(
                float(cisprc.get(j, [0, 0])[0]) / cisprc.get(j, [1, 1])[1]
                for j in indices[k:k + win_size]) / win_size
            for k in range(0, len(indices), 20)
        ], '.', color='k', alpha=0.3,
                         label='cis interactions ratio by %d bin' % win_size,
                         zorder=1)
        for k, (p, n) in enumerate(
                zip(errors_pos[::step], errors_neg[::step])):
            # true division: p and n are ratios, flooring their mean would
            # always anchor the bars at zero
            middle = (p + n) / 2.
            ax1.vlines(k * step, middle, p, color='red', alpha=0.6)
            ax1.vlines(k * step, n, middle, color='blue', alpha=0.6)
        ax1.plot(list(range(0, size, step)), errors_neg[::step], 'b^',
                 mec='blue', alpha=0.5)
        ax1.plot(list(range(0, size, step)), errors_pos[::step], 'rv',
                 mec='red', alpha=0.5)
        ax1.fill_between([0, size], beg_pos, end_pos, color='red', alpha=0.3,
                         zorder=2)
        ax1.text(-size / 15., (end_pos + beg_pos) / 2,
                 'Confidance band for\nupper stddev of median', color='red',
                 ha='right', va='center')
        ax1.fill_between([0, size], beg_neg, end_neg, color='blue', alpha=0.3,
                         zorder=2)
        ax1.text(-size / 15., (end_neg + beg_neg) / 2,
                 'Confidance band for\nlower stddev of median', color='blue',
                 ha='right', va='center')
        ax1.set_ylim((0, 1.1))
        ax1.set_ylabel('Ratio of cis interactions ratio')
        ax1.fill_betweenx([0, 1.1], cutoffL, cutoffR, color='green', alpha=0.2)
        ax1.text(
            (cutoffR + cutoffL) / 2, -0.1,
            ('Kept bins, top and bottom deviations from median cis-ratio\n' +
             'should be inside their respective confidance bands'),
            ha='center', color='green')
        # total counts, on a second axes sharing the x axis
        ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)
        line3 = ax2.plot(sorted_sum, 'rx', alpha=0.4,
                         label='Log sum of interactions by bin')
        ax2.set_yscale('log')
        ax2.yaxis.tick_right()
        ax2.yaxis.set_label_position("right")
        ax2.set_ylabel('Log interaction counts')
        lns = line1 + line2 + line3
        labs = [l.get_label() for l in lns]
        ax2.legend(lns, labs, loc=0, bbox_to_anchor=(0, 0), frameon=False)
        # percentage grid on a third axes
        ax3 = fig.add_subplot(111, frameon=False)
        ax3.xaxis.tick_top()
        ax3.set_xticks(list(range(100)), minor=True)
        ax3.set_xticks(list(range(0, 100, 5)), minor=False)
        ax3.set_yticks([])
        ax3.set_xticklabels([])
        for p in range(5, 100, 5):
            ax3.text(p, 99, '%d%%' % p, va='top', ha='left', size=9)
        ax3.tick_params(direction='in', axis='x', which='both')
        ax3.set_xlim(0, 100)
        ax3.set_ylim(0, 100)
        ax3.grid(which='major')
        ax3.grid(which='minor', alpha=0.5)
        limits = (100 * float(cutoffL) / len(indices),
                  100 * float(cutoffR) / len(indices))
        if min_perc:
            plt.title('Setting from %.2f%% to %.2f%%' % limits)
        else:
            plt.title('Keeping from %.2f%% to %.2f%%' % limits)
        ax1.set_xlim((0, len(indices)))
        tadbit_savefig(savefig)
        plt.close('all')
    badcol = {}
    countL = 0
    countZ = 0
    countU = 0
    for c in range(size):
        # bins absent from the dictionary count as having no interaction
        total = cisprc.get(c, [0, 0])[1]
        if total < min_count:
            badcol[c] = total
            countL += 1
            if not c in cisprc:
                countZ += 1
        elif total > max_count:
            badcol[c] = total
            countU += 1
    print(
        ' => %d BAD bins (%d/%d/%d null/low/high counts) of %d (%.1f%%)'
        % (len(badcol), countZ, countL, countU, size,
           float(len(badcol)) / size * 100))
    return badcol
def plot_genomic_distribution(fnam, first_read=True, resolution=10000,
                              axe=None, ylim=None, savefig=None, show=False,
                              savedata=None, chr_names=None, nreads=None):
    """
    Count the reads of a tab-separated file by genomic bin, and plot and/or
    save these counts, one panel per chromosome.

    :param fnam: input file name
    :param True first_read: uses first read (columns 2 and 3 of the file),
       otherwise the second one (columns 8 and 9).
    :param 10000 resolution: group reads that are closer than this resolution
       parameter
    :param None axe: if set, no new figure is created
       (NOTE(review): the object itself is never drawn on)
    :param None ylim: tuple with the limits of the y axis of each panel
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False show: displays the plot (only if ``savefig`` is not set)
    :param None savedata: path where to store the output read counts per bin.
    :param None chr_names: can pass a list of chromosome names in case only
       some them the need to be plotted (this option may last even more than
       default)
    :param None nreads: stop reading the file when this number of lines is
       reached (the line reaching it is not counted)
    """
    distr = {}
    idx1, idx2 = (1, 3) if first_read else (7, 9)
    genome_seq = OrderedDict()
    if chr_names:
        chr_names = set(chr_names)
        cond1 = lambda x: x not in chr_names
    else:
        cond1 = lambda x: False
    if nreads:
        cond2 = lambda x: x >= nreads
    else:
        cond2 = lambda x: False
    count = 0
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            # leading '#' lines are a header, chromosome sizes are read from
            # the '# CRM ' ones
            if in_header and line.startswith('#'):
                if line.startswith('# CRM '):
                    crm, clen = line[6:].split('\t')
                    genome_seq[crm] = int(clen)
                continue
            in_header = False
            crm, pos = line.strip().split('\t')[idx1:idx2]
            count += 1
            # both filters are independent: reaching nreads stops the
            # parsing, an unwanted chromosome only skips the line
            if cond2(count):
                break
            if cond1(crm):
                continue
            pos = int(pos) // resolution
            crm_distr = distr.setdefault(crm, {})
            crm_distr[pos] = crm_distr.get(pos, 0) + 1
    plotting = bool(savefig or show)
    if plotting:
        if not axe:
            _ = plt.figure(figsize=(15, 1 + 3 * len(
                chr_names if chr_names else distr.keys())))
        max_y = max([max(distr[c].values()) for c in distr])
        max_x = max([len(distr[c].values()) for c in distr])
    crms = chr_names if chr_names else genome_seq if genome_seq else distr
    ncrms = len(crms)
    data = {}
    for i, crm in enumerate(crms):
        if crm in distr:
            # from bin zero up to the last bin with reads, included
            nbins = max(distr[crm]) + 1
            data[crm] = [distr[crm].get(j, 0) for j in range(nbins)]
        if not plotting:
            continue
        if crm in data:
            plt.subplot(ncrms, 1, i + 1)
            plt.plot(range(len(data[crm])), data[crm],
                     color='red', lw=1.5, alpha=0.7)
        # end of chromosome, only known if declared in the header
        if crm in genome_seq:
            if ylim:
                plt.vlines(genome_seq[crm] // resolution, ylim[0], ylim[1])
            else:
                plt.vlines(genome_seq[crm] // resolution, 0, max_y)
        plt.xlim((0, max_x))
        plt.ylim(ylim or (0, max_y))
        plt.title(crm)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show:
        plt.show()
    if savedata:
        with open(savedata, 'w') as out:
            out.write('# CRM\tstart-end\tcount\n')
            out.write('\n'.join('%s\t%d-%d\t%d' % (c, (i * resolution) + 1,
                                                   ((i + 1) * resolution), v)
                                for c in data
                                for i, v in enumerate(data[c])))
            out.write('\n')
def filter_by_mean(matrx, draw_hist=False, silent=False, bads=None,
                   savefig=None):
    """
    Fits the distribution of Hi-C interaction count by column in the matrix
    to a polynomial, then takes a root of the derivative of the best fitting
    polynomial as cutoff: columns with a sum of interactions below it are
    flagged.

    :param matrx: object giving the size of the matrix with ``len`` and the
       value of cell (i, j) with ``matrx.get(i + j * size, 0)``
    :param False draw_hist: draws the histogram of column sums, the fitted
       polynomial and the selected cutoff
    :param False silent: does not write warnings to standard error
    :param None bads: dictionary of columns already flagged; they are not
       used for the fit, and newly flagged columns are added to it
    :param None savefig: path where to save the plot (only used with
       ``draw_hist``); if None the plot is displayed

    :returns: dictionary with the index of the flagged columns as keys, and
       their sum of interactions as value (for the ones flagged here)
    """
    nbins = 100
    if not bads:
        bads = {}
    size = len(matrx)

    def _bin_columns(values):
        # histogram of the values in ``nbins`` bins spanning their range
        edges = np.linspace(min(values), max(values), nbins)
        binned = np.digitize(values, edges)
        return edges, [sum(binned == i) for i in range(1, nbins + 1)]

    def _show_or_save():
        # axes labels and output of the histogram
        plt.xlabel('Sum of interactions')
        plt.ylabel('Number of columns with a given value')
        if savefig:
            tadbit_savefig(savefig)
        else:
            plt.show()

    # get sum of columns, sorted, leaving out rows and columns already bad
    cols = np.array(sorted(
        sum(matrx.get(i + j * size, 0) for j in range(size) if not j in bads)
        for i in range(size) if not i in bads))
    if draw_hist:
        plt.figure(figsize=(9, 9))
    if not len(cols):
        warn('WARNING: no columns to filter out')
        return bads
    try:
        percentile = np.percentile(cols, 5)
    except IndexError:
        warn('WARNING: no columns to filter out')
        return bads
    best = (None, None, None, None)
    # bin the sum of columns
    y, x = _bin_columns(cols)
    if draw_hist:
        plt.hist(cols, bins=100, alpha=.3, color='grey')
    xp = range(0, int(cols[-1]))
    try:
        # check if the binning is correct: while more than half of the bins
        # are empty, drop the column with the highest sum and bin again
        cnt = 0
        while list(x).count(0) > len(x) / 2:
            cnt += 1
            cols = cols[:-1]
            y, x = _bin_columns(cols)
            if draw_hist:
                plt.clf()
                plt.hist(cols, bins=100, alpha=.3, color='grey')
            xp = range(0, int(cols[-1]))
            if cnt > 10000:
                raise ValueError
        # find best polynomial fit in a given range
        for order in range(6, 18):
            z = np.polyfit(y, x, order)
            zp = np.polyder(z, m=1)
            roots = np.roots(np.polyder(z))
            # slope of the fit half way between the two last roots decides
            # which of these two roots is kept
            pente = np.polyval(zp, abs(roots[-2] - roots[-1]) / 2 + roots[-1])
            if pente > 0:
                root = roots[-1]
            else:
                root = roots[-2]
            # root must be higher than zero
            if root <= 0:
                continue
            # and lower than the 5th percentile
            if root >= percentile:
                continue
            p = np.poly1d(z)
            R2 = get_r2(p, x, y)
            # try to avoid very large orders by weighting negatively their fit
            if order > 13:
                R2 -= float(order) / 30
            # explicit None test: ordering None against a float only works
            # in Python 2
            if best[0] is None or best[0] < R2:
                best = (R2, order, p, z, root)
        try:
            # fails (ValueError) when no polynomial was accepted
            p, z, root = best[2:]
            if draw_hist:
                xlims = plt.xlim()
                ylims = plt.ylim()
                a = plt.plot(xp, p(xp), "--", color='k')
                b = plt.vlines(root, 0, plt.ylim()[1], colors='r',
                               linestyles='dashed')
                labels = [
                    'polyfit \n%s' % (''.join([
                        sub('e([-+][0-9]+)', 'e^{\\1}',
                            '$%s%.1fx^%s$' % ('+' if j > 0 else '', j,
                                              '{' + str(i) + '}'))
                        for i, j in enumerate(list(p)[::-1])
                    ])),
                    'first solution of polynomial derivation'
                ]
                try:
                    plt.legend(a + [b], labels, fontsize='x-small')
                except TypeError:
                    # legend of this matplotlib does not accept ``fontsize``
                    plt.legend(a + [b], labels)
                plt.ylim([0, ylims[1]])
                plt.xlim(xlims)
                _show_or_save()
            # label as bad the columns with sums lower than the root
            for i in range(size):
                total = sum(matrx.get(i + j * size, 0) for j in range(size))
                if total < root:
                    bads[i] = total
            # now stored in Experiment._zeros, used for getting more accurate
            # z-scores
            if bads and not silent:
                stderr.write(
                    ('\nWARNING: removing columns having less than %s ' +
                     'counts:\n %s\n') % (round(root, 3), ' '.join([
                         '%5s' % str(i + 1) + ('' if (j + 1) % 20 else '\n')
                         for j, i in enumerate(sorted(bads.keys()))
                     ])))
        except Exception:
            # best effort: the filter is skipped, not the whole analysis
            if not silent:
                stderr.write('WARNING: Too many zeroes to filter columns.' +
                             ' SKIPPING...\n')
            if draw_hist:
                _show_or_save()
    except ValueError:
        if not silent:
            stderr.write('WARNING: Too few data to filter columns based on ' +
                         'mean value.\n')
    if draw_hist:
        plt.close('all')
    return bads
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6, normalized=False,
                           savefig=None, show=False, savedata=None,
                           remove_bad_columns=True, **kwargs):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors (the ones with highest eigenvalues), with Pearson
    correlation

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param False normalized: use normalized data
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a tab-separated file where to write the
       matrix of correlations
    :param True remove_bad_columns: computes the union of bad columns between
       samples and exclude them from the comparison
    :param kwargs: any argument to pass to matplotlib imshow function, plus
       ``get_bads``: when true the excluded columns are also returned

    :returns: matrix of correlations (absolute values), and the excluded
       columns if ``get_bads`` is passed
    """
    # not an imshow argument, has to be taken out of kwargs
    get_bads = kwargs.pop('get_bads', False)
    data1 = hic_data1.get_matrix(normalized=normalized)
    data2 = hic_data2.get_matrix(normalized=normalized)
    ## reduce matrices to remove bad columns
    bads = {}
    if remove_bad_columns:
        # union of bad columns
        bads = hic_data1.bads.copy()
        bads.update(hic_data2.bads)
        # remove them from both matrices, from the last one in order to keep
        # the remaining indexes valid
        for bad in sorted(bads, reverse=True):
            del(data1[bad])
            del(data2[bad])
            for i in range(len(data1)):
                _ = data1[i].pop(bad)
                _ = data2[i].pop(bad)
    # get the log
    data1 = nozero_log(data1, np.log2)
    data2 = nozero_log(data2, np.log2)
    # get the eigenvectors
    ev1, evect1 = eigh(data1)
    ev2, evect2 = eigh(data2)
    # sort eigenvectors according to their eigenvalues => first is last!!
    # eigenvectors are the columns of evect1 and evect2
    sort_perm = ev1.argsort()
    ev1.sort()
    evect1 = evect1[:, sort_perm]
    sort_perm = ev2.argsort()
    ev2.sort()
    evect2 = evect2[:, sort_perm]
    # calculate Pearson correlation
    corr = [[abs(pearsonr(evect1[:,-i-1], evect2[:,-j-1])[0])
             for j in range(nvect)] for i in range(nvect)]
    # plot
    if show or savefig:
        axe = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        im = axe.imshow(corr, interpolation="nearest",origin='lower', **kwargs)
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        # one label per tick
        axe.set_xticklabels(range(1, nvect + 1))
        axe.set_yticklabels(range(1, nvect + 1))
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)
        cbar = plt.colorbar(im, cax = cbaxes )
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        # eigenvalues of the second experiment on the right
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)
        # eigenvalues of the first experiment on top
        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)
        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        with open(savedata, 'w') as out:
            out.write('# ' + '\t'.join(['Eigen Vector %s'% i
                                        for i in range(nvect)]) + '\n')
            for i in range(nvect):
                out.write('\t'.join([str(corr[i][j])
                                     for j in range(nvect)]) + '\n')
    if get_bads:
        return corr, bads
    return corr
def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction
    enzyme (RE) name is provided, can also represent the distribution of
    digested and undigested RE sites and estimate an expected proportion of
    dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file (gzipped if it ends with '.gz')
    :param None r_enz: name of the restriction enzyme used
    :param None nreads: max number of reads to read, not necessary to read all
    :param None axe: a matplotlib axes object where to draw the quality plot
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the
       percentage of reads with at least a ligation site. Without restriction
       enzyme the first value is None and the second one is 0.

    :raises ValueError: if no complete read is found in the input file
    """
    # PHRED+33 encoding: printable ASCII from '!' (33) to '~' (126)
    phred = dict((chr(33 + i), i) for i in range(94))
    quals = []
    henes = []
    sites = []
    fixes = []
    liges = []
    ligep = 0
    des = None
    tkw = dict(size=4, width=1.5)
    # prepare the search patterns before opening the file
    if r_enz:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site*2 == l_site:
            # in case the religated site equals 2 restriction sites
            # (like DnpII)
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    else:
        fhandler = open(fnam)
    line = None
    try:
        while True:
            # a FASTQ record: header, sequence, separator, qualities
            next(fhandler)
            seq = next(fhandler)
            next(fhandler)
            line = next(fhandler)
            if r_enz:
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            quals.append([phred[i] for i in line.strip()])
            if nreads and len(quals) >= nreads:
                break
    except StopIteration:
        # end of file (an incomplete last record is ignored)
        pass
    finally:
        fhandler.close()
    if not quals:
        raise ValueError('ERROR: no read found in %s' % fnam)
    # use the number of reads really parsed for titles and percentages
    nreads = len(quals)
    # list() as the per-position qualities are iterated twice
    quals = list(zip(*quals))
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2,1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1,1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')
    ax.errorbar(range(len(meanquals)), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, len(line)))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    axb.plot([henes.count(i) for i in xrange(len(line))], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try: # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, len(line)))

    if r_enz:
        # position where the second read-end starts in paired files
        mid = len(line) // 2
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        site_len = max((len(r_site), len(l_site), len(d_site)))
        seq_len = len(line) - site_len
        sites = [sites.count(k) for k in xrange(seq_len)] # Undigested
        liges = [liges.count(k) for k in xrange(seq_len)] # OK
        fixes = [fixes.count(k) for k in xrange(seq_len)] # DE
        # dangling-end sites contained in RE or ligation sites are not
        # real dangling-ends, discount them
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k-pos] for k in xrange(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k-pos] for k in xrange(pos, seq_len)])
        if paired:
            # sites can not be found across the junction of the two ends
            sites[mid - site_len: mid] = [float('nan')] * site_len
            liges[mid - site_len: mid] = [float('nan')] * site_len
            fixes[mid - site_len: mid] = [float('nan')] * site_len
        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)
        has_fixes = any([f > 0 for f in fixes])
        if has_fixes:
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends  (%s)' % r_site)
        ax2.set_xlim((0, len(line)))
        lig_cnt = (np.nansum(liges) - liges[0] - liges[mid])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[mid])
        # dangling-ends are counted at the beginning of the read-ends; if
        # they can not be told apart from RE sites, RE sites are used
        ends = fixes if has_fixes else sites
        des = (100. * (ends[0] + (ends[mid] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') %(
                       (100. * lig_cnt) / (lig_cnt + sit_cnt),
                       des,
                       (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads
def draw(self, focus=None, extras=None, ymax=None, ali_colors=('grey',),
         normalized=True, savefig=None, shape='ellipse'):
    """
    Draw alignments as a plot.

    :param None focus: can pass a tuple (bin_start, bin_stop) to display the
       alignment between these genomic bins
    :param None extras: list of coordinates (genomic bin) where to draw a
       red cross
    :param None ymax: limit the y axis up to a given value
    :param ('grey', ) ali_colors: successive colors for alignment
    :param True normalized: normalized Hi-C count are plotted instead of raw
       data.
    :param 'ellipse' shape: which kind of shape to use as schematic
       representation of TADs. Implemented: 'ellipse', 'rectangle',
       'triangle'
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    """
    from matplotlib.cm import jet
    from matplotlib import pyplot as plt
    experiments = self.__experiments

    maxres = max([e.resolution for e in experiments])
    facts = [maxres / e.resolution for e in experiments]

    siz = experiments[0].size
    if focus:
        figsiz = 4 + (focus[1] - focus[0]) / 30
    else:
        figsiz = 4 + siz / 30
    # squeeze=False: always get an array of axes, even with one experiment
    fig, axes = plt.subplots(nrows=len(experiments),
                             sharex=True, sharey=True, squeeze=False,
                             figsize=(figsiz, 1 + len(experiments) * 1.8))
    axes = axes[:, 0]
    fig.subplots_adjust(hspace=0)

    maxys = []
    for iex, xpr in enumerate(experiments):
        if not xpr.name in self:
            continue
        _tad_density_plot(xpr, maxys=maxys, normalized=normalized,
                          fact_res=facts[iex], axe=axes[iex],
                          extras=extras, shape=shape, focus=focus)
    # draw alignment columns
    # NOTE(review): the end of the horizontal line is taken from the last
    # experiment of the loop above, as in the original code
    end = focus[1] if focus else xpr.tads[max(xpr.tads)]['end']
    maxy = (ymax or max(maxys)) + 0.4
    maxxs = []
    for iex, xpr in enumerate(experiments):
        starting = focus[0] if focus else 1
        # last TAD is the one with the highest key (not relying on the
        # order of the dictionary)
        ending = focus[1] if focus else xpr.tads[max(xpr.tads)]['end']
        axes[iex].hlines(1, 1, end, 'k', lw=1.5)
        axes[iex].set_ylim((0, maxy))
        maxxs.append(ending / facts[iex])
        axes[iex].text(starting + 1, float(maxy) / 20,
                       xpr.name, {'ha': 'left', 'va': 'bottom'})
        axes[iex].set_yticks([float(i) / 2
                              for i in range(1, int(maxy + .5) * 2)])
        if ymax:
            axes[iex].set_ylim((0, ymax))
        axes[iex].set_xlim(starting, max(maxxs))

    pos = {'ha': 'center', 'va': 'bottom'}
    for i, col in enumerate(self.itercolumns()):
        ends = sorted([(t['end'], j) for j, t in enumerate(col) if t['end']])
        beg = (ends[0][0] + 0.9) / facts[ends[0][1]]
        end = (ends[-1][0] + 1.1) / facts[ends[-1][1]]
        if focus:
            if beg < focus[0] or end > focus[1]:
                continue
        axes[0].text(beg + float(end - beg) / 2, maxy + float(maxy) / 20,
                     str(i + 1), pos, rotation=90, size='small')
        for iex, tad in enumerate(col):
            if not tad['end']:
                continue
            axes[iex].axvspan(beg-.2, end+.2, alpha=0.2,
                              color=ali_colors[i%(len(ali_colors))])
    # x label only below the last experiment
    axes[-1].set_xlabel('Genomic bin')
    fig.suptitle("TAD borders' alignment", size='x-large')
    tit2 = axes[0].set_title("Alignment column number")
    tit2.set_y(1.3)
    plt.subplots_adjust(top=0.76)
    fig.set_facecolor('white')
    # fake points drawn only to build the legend of border scores
    plots = []
    for scr in xrange(1, 11):
        plots += plt.plot((100,),(100,), marker=6, ms=9,
                          color=jet(float(scr) / 10), mec='none')
    labels = [str(scr) for scr in xrange(1, 11)]
    try:
        axes[-1].legend(plots, labels, numpoints=1, title='Border scores',
                        fontsize='small', loc='lower left',
                        bbox_to_anchor=(1, 0.5))
    except TypeError:
        # matplotlib versions whose legend does not accept 'fontsize'
        axes[-1].legend(plots, labels, numpoints=1, title='Border scores',
                        loc='lower left', bbox_to_anchor=(1, 0.5))
    if savefig:
        tadbit_savefig(savefig)
    else:
        plt.show()
def draw_map(data, genome_seq, cumcs, savefig, show, one=False, clim=None,
             cmap='jet', decay=False, perc=10, name=None, cistrans=None,
             decay_resolution=10000, normalized=False, max_diff=None):
    """
    Draw a Hi-C matrix (log2 of the data), together with the histogram of
    interaction counts, the three first eigenvectors and, optionally, the
    decay of interactions with genomic distance.

    :param data: matrix of interactions (list of lists)
    :param genome_seq: dictionary with chromosome names as keys (iterated to
       draw chromosome limits); nothing is drawn if empty
    :param cumcs: dictionary with, for each chromosome, its first and last
       bin in the matrix
    :param savefig: path to a file where to save the image generated
    :param show: displays the plot (only if savefig is not set)
    :param False one: if True chromosome names are not written in Y axis
    :param None clim: tuple with minimum and maximum value range for color
       scale
    :param 'jet' cmap: name of a matplotlib color map, or 'tadbit' for a
       color map built from the percentiles of the data
    :param False decay: also plot the decay of interactions
    :param 10 perc: number of cuts used to build the 'tadbit' color map
    :param None name: text written on top of the statistics
    :param None cistrans: proportion of cis interactions, not written if
       None or NaN
    :param 10000 decay_resolution: resolution passed to the decay plot
    :param False normalized: passed to the decay plot
    :param None max_diff: passed to the decay plot (default: matrix size)
    """
    from copy import copy
    _ = plt.figure(figsize=(15.,12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(data, genome_seq=genome_seq, axe=ax3,
                                      resolution=decay_resolution,
                                      max_diff=max_diff, normalized=normalized)
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)
    try:
        minoridata   = np.nanmin(data)
        maxoridata   = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata   = np.min(vals)
        maxoridata   = np.max(vals)
    totaloridata = np.nansum([data[i][j] for i in xrange(len(data))
                              for j in xrange(i, len(data))])
    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]

    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata
    posI = 0.01 if not clim else (float(clim[0]) / diff) if clim[0] is not None else 0.01
    posF = 1.0  if not clim else (float(clim[1]) / diff) if clim[1] is not None else 1.0
    if cmap == 'tadbit':
        cuts = perc
        cdict = {'red'  : [(0.0,  0.0, 0.0)],
                 'green': [(0.0,  0.0, 0.0)],
                 'blue' : [(0.0,  0.5, 0.5)]}
        prev_pos  = 0
        median = (np.median(vals) - mindata) / diff
        # lower half of the values: from blue to white
        for prc in np.linspace(posI, median, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1. / cuts
            except ValueError:
                pos = prc = 0
            # positions of the color map have to be strictly increasing
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, prc, prc])
            cdict['green'].append([pos, prc, prc])
            cdict['blue' ].append([pos, 1, 1])
            prev_pos  = pos
        # upper half of the values: from white to red
        for prc in np.linspace(median + 1. / cuts, posF, cuts // 2,
                               endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - median) / (posF - median))
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, 1.0, 1.0])
            cdict['green'].append([pos, 1 - prc, 1 - prc])
            cdict['blue' ].append([pos, 1 - prc, 1 - prc])
            prev_pos  = pos
        pos = (np.percentile(vals ,97.) - mindata) / diff
        cdict['red'  ].append([pos, 0.1, 0.1])
        cdict['green'].append([pos, 0, 0])
        cdict['blue' ].append([pos, 0, 0])

        cdict['red'  ].append([1.0, 1, 1])
        cdict['green'].append([1.0, 1, 1])
        cdict['blue' ].append([1.0, 0, 0])
        cmap  = LinearSegmentedColormap(cmap, cdict)
        clim = None
    else:
        # work on a copy: set_bad would otherwise modify the color map
        # registered in matplotlib, shared with any other plot
        cmap = copy(plt.get_cmap(cmap))
    cmap.set_bad('darkgrey', 1)

    ax1.imshow(data, interpolation='none',
               cmap=cmap, vmin=clim[0] if clim else None,
               vmax=clim[1] if clim else None)
    size = len(data)
    # eigh does not accept NaNs, replace them by zeros (symmetrically)
    for i in xrange(size):
        for j in xrange(i, size):
            if np.isnan(data[i][j]):
                data[i][j] = 0
                data[j][i] = 0
    evals, evect = eigh(data)
    # eigenvectors are stored as columns: sort columns by eigenvalue
    sort_perm = evals.argsort()
    evect = evect[:, sort_perm]
    data = [i for d in data for i in d if not np.isnan(i)]
    gradient = np.linspace(np.nanmin(data),
                           np.nanmax(data), size)
    gradient = np.vstack((gradient, gradient))
    h  = ax2.hist(data, color='darkgrey', linewidth=2,
                  bins=20, histtype='step', normed=True)
    _  = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                    extent=(np.nanmin(data), np.nanmax(data) , 0, max(h[0])))
    if genome_seq:
        for crm in genome_seq:
            # white solid line below a black dashed line at chromosome
            # limits
            ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5],
                       cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='w', linestyle='-', linewidth=1, alpha=1)
            ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5],
                       cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='w', linestyle='-', linewidth=1, alpha=1)
            ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5],
                       cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='k', linestyle='--')
            ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5],
                       cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='k', linestyle='--')
        if not one:
            vals = [0]
            keys = ['']
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])
            ax1.set_yticks(vals)
            ax1.set_yticklabels('')
            # chromosome names are written as minor ticks, in the middle
            # of each chromosome
            ax1.set_yticks([float(vals[i]+vals[i+1])/2
                            for i in xrange(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False
    # cistrans is optional: np.isnan can not be called on None
    no_cistrans = cistrans is None or np.isnan(cistrans)
    plt.figtext(0.05,0.25, ''.join([
        (name + '\n') if name else '',
        'Number of interactions: %s\n' % str(totaloridata),
        ('' if no_cistrans else
         ('Percentage of cis interactions: %.0f%%\n' % (cistrans*100))),
        'Min interactions: %s\n' % (minoridata),
        'Max interactions: %s\n' % (maxoridata)]))
    ax2.set_xlim((np.nanmin(data), np.nanmax(data)))
    ax2.set_ylim((0, max(h[0])))
    ax1.set_xlim ((-0.5, size - .5))
    ax1.set_ylim ((-0.5, size - .5))
    ax2.set_xlabel('log interaction count')
    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d*100))/100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, 'w.', markersize=2.5, alpha=.4)
    ax2.plot(subdata, normfit, 'k.', markersize=1.5, alpha=1)
    ax2.set_title('skew: %.3f, kurtosis: %.3f' % (skew(data),
                                                   kurtosis(data)))
    # three first eigenvectors (the ones with highest eigenvalues)
    for rank, axn in enumerate((ax4, ax5, ax6), 1):
        try:
            axn.vlines(range(size), 0, evect[:,-rank], color='k')
        except IndexError:
            # matrix smaller than the number of eigenvectors displayed
            pass
        axn.hlines(0, 0, size, color='red')
        axn.set_ylabel('E%d' % rank)
        axn.set_yticklabels([])
    xticklabels = ax4.get_xticklabels() + ax5.get_xticklabels() + ax6.get_xticklabels()
    plt.setp(xticklabels, visible=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
    plt.close('all')
def visualize(self, names=None, tad=None, focus=None, paint_tads=False,
              axe=None, show=True, logarithm=True, normalized=False,
              relative=True, decorate=True, savefig=None, clim=None,
              scale=(8, 6), cmap='jet'):
    """
    Visualize the matrix of Hi-C interactions of a given experiment

    :param None names: name of the experiment to visualize, or list of
       experiment names. If None, all experiments will be shown. An element
       of the list can itself be a pair of names: both experiments are then
       drawn in the same plot (first one 'up', second one 'down')
    :param None tad: a given TAD in the form:
       ::

         {'start': start,
          'end'  : end,
          'brk'  : end,
          'score': score}

       **Alternatively** a list of the TADs can be passed (all the TADs
       between the first and last one passed will be showed. Thus, passing
       more than two TADs might be superfluous)
    :param None focus: a tuple with the start and end positions of the
       region to visualize
    :param False paint_tads: draw a box around the TADs defined for this
       experiment
    :param None axe: an axe object from matplotlib can be passed in order
       to customize the picture
    :param True show: either to pop-up matplotlib image or not
    :param True logarithm: show the logarithm values
    :param False normalized: show the normalized data (weights might have
       been calculated previously). *Note: white rows/columns may appear in
       the matrix displayed; these rows correspond to filtered rows (see*
       :func:`pytadbit.utils.hic_filtering.hic_filtering_for_modelling` *)*
    :param True relative: color scale is relative to the whole matrix of
       data, not only to the region displayed
    :param True decorate: draws color bar, title and axes labels
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param None clim: tuple with minimum and maximum value range for color
       scale. I.e. clim=(-4, 10)
    :param (8, 6) scale: size of each panel of the figure
    :param 'jet' cmap: color map passed to the view function of experiments
    """
    if names is None:
        names = [xpr.name for xpr in self.experiments]
    if not isinstance(names, (list, tuple)):
        names = [names]
        cols = 1
        rows = 1
    else:
        # panels are placed in a grid as square as possible
        sqrtxpr = sqrt(len(names))
        cols = int(round(sqrtxpr + (0.0 if int(sqrtxpr)==sqrtxpr else .5)))
        rows = int(sqrtxpr+.5)
    notaxe = axe is None
    if not scale:
        scale = (8, 6)
    if notaxe and len(names) != 1:
        fig = plt.figure(figsize=(scale[0] * cols, scale[1] * rows))
    for i in xrange(rows):
        for j in xrange(cols):
            num = i * cols + j
            if num >= len(names):
                break
            if notaxe and len(names) != 1:
                axe = fig.add_subplot(rows, cols, num + 1)
            if isinstance(names[num], (tuple, list)):
                # two experiments sharing the same plot
                if not axe:
                    fig = plt.figure(figsize=(scale[0] * cols,
                                              scale[1] * rows))
                    axe = fig.add_subplot(rows, cols, num + 1)
                xpr1 = self.get_experiment(names[num][0])
                xpr2 = self.get_experiment(names[num][1])
                img = xpr1.view(tad=tad, focus=focus, paint_tads=paint_tads,
                                axe=axe, show=False, logarithm=logarithm,
                                normalized=normalized, relative=relative,
                                decorate=decorate, savefig=False,
                                where='up', clim=clim, cmap=cmap)
                # second experiment uses the color scale of the first one
                img = xpr2.view(tad=tad, focus=focus, paint_tads=paint_tads,
                                axe=axe, show=False, logarithm=logarithm,
                                normalized=normalized, relative=relative,
                                decorate=False, savefig=False, where='down',
                                clim=clim or img.get_clim(), cmap=cmap)
                if decorate:
                    plt.text(1.01, .5,
                             'Chromosome %s experiment %s' % (
                                 self.name, xpr2.name),
                             rotation=-90, va='center', size='large',
                             ha='left', transform=axe.transAxes)
            else:
                xper = self.get_experiment(names[num])
                # nothing to draw for experiments without data
                if not xper.hic_data and not xper.norm:
                    continue
                xper.view(tad=tad, focus=focus, paint_tads=paint_tads,
                          axe=axe, show=False, logarithm=logarithm,
                          normalized=normalized, relative=relative,
                          decorate=decorate, savefig=False, clim=clim,
                          cmap=cmap)
    if savefig:
        tadbit_savefig(savefig)
    if show:
        plt.show()
def plot_distance_vs_interactions(data, min_diff=1, max_diff=1000, show=False,
                                  genome_seq=None, resolution=None, axe=None,
                                  savefig=None, normalized=False):
    """
    Plot the number of interactions observed at each genomic distance
    (log-log scale), and fit three lines to the decay.

    :param data: input file name, or HiC_data object or list of lists
    :param 1 min_diff: lower limit (in number of bins)
    :param 1000 max_diff: upper limit (in number of bins) to look for
    :param False show: displays the plot (only if savefig is not set)
    :param None genome_seq: dictionary with the size (in bins) of each
       chromosome, used when data is a list of lists
    :param None resolution: group reads that are closer than this resolution
       parameter. If not set (or 1) the limits of the three fitted regions
       are searched automatically, otherwise they are fixed at 0.7 Mb and
       10 Mb
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False normalized: use normalized data (only for HiC_data objects)
    """
    resolution = resolution or 1
    dist_intr = dict([(i, 0) for i in xrange(min_diff, max_diff)])
    if isinstance(data, str):
        with open(data) as fhandler:
            for line in fhandler:
                if line.startswith('#'):
                    continue
                _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
                # only intra-chromosomal interactions
                if cr1 != cr2:
                    continue
                diff = abs(int(ps1) // resolution - int(ps2) // resolution)
                if max_diff > diff >= min_diff:
                    dist_intr[diff] += 1
    elif isinstance(data, HiC_data):
        if normalized:
            get_data = lambda x, y: data[x, y] / data.bias[x] / data.bias[y]
        else:
            get_data = lambda x, y: data[x, y]
        max_diff = min(len(data), max_diff)
        if data.section_pos:
            for crm in data.section_pos:
                for diff in xrange(min_diff, min(
                    (max_diff, 1 + data.chromosomes[crm]))):
                    for i in xrange(data.section_pos[crm][0],
                                    data.section_pos[crm][1] - diff):
                        dist_intr[diff] += get_data(i, i + diff)
        else:
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i, i + diff]):
                        # cell at distance 'diff' from the diagonal
                        dist_intr[diff] += get_data(i, i + diff)
    else:
        if genome_seq:
            max_diff = min(max(genome_seq.values()), max_diff)
            cnt = 0
            for crm in genome_seq:
                for diff in xrange(min_diff, min(
                    (max_diff, genome_seq[crm]))):
                    for i in xrange(cnt, cnt + genome_seq[crm] - diff):
                        if not np.isnan(data[i][i + diff]):
                            dist_intr[diff] += data[i][i + diff]
                cnt += genome_seq[crm]
        else:
            max_diff = min(len(data), max_diff)
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i][i + diff]):
                        dist_intr[diff] += data[i][i + diff]
    if not axe:
        fig=plt.figure()
        axe = fig.add_subplot(111)
    # remove last part of the plot in case no interaction is count...
    # reduce max_diff
    for diff in xrange(max_diff - 1, min_diff, -1):
        if dist_intr.get(diff):
            break
        dist_intr.pop(diff, None)
        max_diff -= 1
    # keep only distances with interactions (log of zero is not defined)
    x = []
    y = []
    for dist, count in sorted(dist_intr.items(), key=lambda item: item[0]):
        if count:
            x.append(dist)
            y.append(count)
    axe.plot(x, y, 'k.')
    logx = np.log(x)
    logy = np.log(y)
    if resolution == 1:
        # search for the limits of the three regions maximizing the sum of
        # the r squared of the three fits
        best = (float('-inf'), 0, 0, 0, 0, 0, 0, 0, 0, 0)
        ntries = 100
        # number of points skipped between two consecutive regions
        k = 1
        for i in xrange(3, ntries - 2 - k):
            v1 = i * len(x) // ntries
            try:
                a1, b1, r21, _, _ = linregress(logx[ :v1], logy[ :v1])
            except ValueError:
                a1 = b1 = r21 = 0
            r21 *= r21
            for j in xrange(i + 1 + k, ntries - 2 - k):
                v2 = j * len(x) // ntries
                try:
                    a2, b2, r22, _, _ = linregress(logx[v1+k:v2],
                                                   logy[v1+k:v2])
                    a3, b3, r23, _, _ = linregress(logx[v2+k:  ],
                                                   logy[v2+k:  ])
                except ValueError:
                    a2 = b2 = r22 = 0
                    a3 = b3 = r23 = 0
                r2 = r21 + r22**2 + r23**2
                if r2 > best[0]:
                    best = (r2, v1, v2, a1, a2, a3, b1, b2, b3, k)
        (v1, v2, a1, a2, a3, b1, b2, b3, k) = best[1:]
    else:
        k = 0
        # from 0.7 Mb
        v1 = int(700000   // resolution)
        # to 10 Mb
        v2 = int(10000000 // resolution)
        try:
            a1, b1, r21, _, _ = linregress(logx[  :v1], logy[  :v1])
        except ValueError:
            a1, b1, r21 = 0, 0, 0
        try:
            a2, b2, r22, _, _ = linregress(logx[v1:v2], logy[v1:v2])
        except ValueError:
            a2, b2, r22 = 0, 0, 0
        try:
            a3, b3, r23, _, _ = linregress(logx[v2:  ], logy[v2:  ])
        except ValueError:
            a3, b3, r23 = 0, 0, 0
    # plot line of best fit for each of the three regions
    regions = ((slice(None, v1)  , a1, b1, 'yellow',
                r'0-0.7 \mathrm{ Mb}'    , '1'),
               (slice(v1 + k, v2), a2, b2, 'orange',
                r'0.7-10 \mathrm{ Mb}'   , '2'),
               (slice(v2 + k, None), a3, b3, 'red' ,
                r'10 \mathrm{ Mb}-\infty', '3'))
    for region, slope, intercept, color, span, num in regions:
        xfit = x[region]
        yfit = np.exp(intercept + slope * np.array(np.log(xfit)))
        axe.plot(xfit, yfit, color=color, lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     span if resolution != 1 else num, slope))
    axe.set_ylabel('Log interaction count')
    axe.set_xlabel('Log genomic distance (resolution: %s)' % nicer(resolution))
    axe.legend(loc='lower left', frameon=False)
    axe.set_xscale('log')
    axe.set_yscale('log')
    axe.set_xlim((min_diff, max_diff))
    try:
        axe.set_ylim((0, max(y)))
    except ValueError:
        pass
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show==True:
        plt.show()
        plt.close('all')
def filter_by_mean(matrx, draw_hist=False, silent=False, bads=None,
                   savefig=None):
    """
    Fits the distribution of Hi-C interaction count by column in the matrix
    to a polynomial. Then searches for a root of the derivative of this
    polynomial that is higher than zero and lower than the 5th percentile of
    the sums of columns. Columns with a sum of interactions lower than this
    root are labelled as bad columns.

    :param matrx: Hi-C matrix with dictionary-like access: value of cell
       (i, j) is retrieved with matrx.get(i + j * size, 0), size being
       len(matrx)
    :param False draw_hist: plot the histogram of sums of columns, the
       polynomial fit and the cutoff
    :param False silent: do not write warnings to standard error
    :param None bads: dictionary of columns already known to be bad, these
       are not used in the fit. If not empty, it is updated in place
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI

    :returns: dictionary of bad columns, with the index of the column as key
       and the sum of the column as value (for columns found here)
    """
    nbins = 100
    if not bads:
        bads = {}
    size = len(matrx)
    # get sum of columns, sorted
    cols = np.array(sorted(
        [sum([matrx.get(i+j*size, 0) for j in xrange(size) if not j in bads])
         for i in xrange(size) if not i in bads]))
    if not len(cols):
        warn('WARNING: no columns to filter out')
        return bads
    if draw_hist:
        plt.figure(figsize=(9, 9))
    try:
        percentile = np.percentile(cols, 5)
    except IndexError:
        warn('WARNING: no columns to filter out')
        return bads

    def _bin_columns(values):
        # limits of the bins, and number of columns falling in each bin
        limits = np.linspace(min(values), max(values), nbins)
        which = np.digitize(values, limits)
        return limits, [sum(which == i) for i in range(1, nbins + 1)]

    # R2, order, polynomial, coefficients, root
    best = (float('-inf'), None, None, None, None)
    # bin the sum of columns
    y, x = _bin_columns(cols)
    if draw_hist:
        plt.hist(cols, bins=100, alpha=.3, color='grey')
    xp = range(0, int(cols[-1]))
    # check if the binning is correct
    # we want at least half of the bins with some data
    try:
        cnt = 0
        while x.count(0) > len(x)/2:
            cnt += 1
            # remove the column with highest sum and bin again
            cols = cols[:-1]
            y, x = _bin_columns(cols)
            if draw_hist:
                plt.clf()
                plt.hist(cols, bins=100, alpha=.3, color='grey')
            xp = range(0, int(cols[-1]))
            if cnt > 10000:
                raise ValueError
        # find best polynomial fit in a given range
        for order in range(6, 18):
            z = np.polyfit(y, x, order)
            zp = np.polyder(z, m=1)
            roots = np.roots(np.polyder(z))
            # check that we are concave down, otherwise take next root
            pente = np.polyval(zp, abs(roots[-2] - roots[-1]) / 2 + roots[-1])
            if pente > 0:
                root = roots[-1]
            else:
                root = roots[-2]
            # root must be higher than zero
            if root <= 0:
                continue
            # and lower than the 5th percentile
            if root >= percentile:
                continue
            p  = np.poly1d(z)
            R2 = get_r2(p, x, y)
            # try to avoid very large orders by weighting negatively their
            # fit
            if order > 13:
                R2 -= float(order)/30
            if best[0] < R2:
                best = (R2, order, p, z, root)
        try:
            p, root = best[2], best[4]
            if p is None:
                raise ValueError('no suitable polynomial fit found')
            if draw_hist:
                xlims = plt.xlim()
                ylims = plt.ylim()
                a = plt.plot(xp, p(xp), "--", color='k')
                b = plt.vlines(root, 0, plt.ylim()[1], colors='r',
                               linestyles='dashed')
                labels = ['polyfit \n%s' % (
                    ''.join([sub('e([-+][0-9]+)', 'e^{\\1}',
                                 '$%s%.1fx^%s$' % ('+' if j>0 else '', j,
                                                   '{' + str(i) + '}'))
                             for i, j in enumerate(list(p)[::-1])])),
                          'first solution of polynomial derivation']
                try:
                    plt.legend(a+[b], labels, fontsize='x-small')
                except TypeError:
                    # matplotlib versions whose legend does not accept
                    # 'fontsize'
                    plt.legend(a+[b], labels)
                plt.ylim([0, ylims[1]])
                plt.xlim(xlims)
                plt.xlabel('Sum of interactions')
                plt.ylabel('Number of columns with a given value')
                if savefig:
                    tadbit_savefig(savefig)
                else:
                    plt.show()
            # label as bad the columns with sums lower than the root
            for i in xrange(size):
                total = sum([matrx.get(i+j*size, 0) for j in xrange(size)])
                if total < root:
                    bads[i] = total
            # now stored in Experiment._zeros, used for getting more accurate
            # z-scores
            if bads and not silent:
                stderr.write(('\nWARNING: removing columns having less ' +
                              'than %s counts:\n %s\n') % (
                                  round(root, 3), ' '.join(
                                      ['%5s'%str(i + 1) +
                                       (''if (j + 1) % 20 else '\n')
                                       for j, i in enumerate(
                                           sorted(bads.keys()))])))
        except Exception:
            # deliberate best effort: whatever fails here, columns are
            # simply not filtered
            if not silent:
                stderr.write('WARNING: Too many zeroes to filter columns.' +
                             ' SKIPPING...\n')
            if draw_hist:
                plt.xlabel('Sum of interactions')
                plt.ylabel('Number of columns with a given value')
                if savefig:
                    tadbit_savefig(savefig)
                else:
                    plt.show()
    except ValueError:
        if not silent:
            stderr.write('WARNING: Too few data to filter columns based on ' +
                         'mean value.\n')
    if draw_hist:
        plt.close('all')
    return bads
def plot_iterative_mapping(fnam1, fnam2, total_reads=None, axe=None,
                           savefig=None):
    """
    Plots the cumulative number (or proportion) of mapped reads, for each
    read-end, according either to the numbers found in the '# MAPPED' lines
    of the header of the input files or, if the header has none, to the
    length of the mapped reads.

    :param fnam1: input file name of the first read-end
    :param fnam2: input file name of the second read-end
    :param None total_reads: total number of reads in the initial FASTQ file,
       if given proportions are plotted instead of counts
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).

    :returns: a dictionary with, for each input file (keys 0 and 1), the
       cumulative number of reads per mapped length (or per iteration)
    """
    count_by_len = {}
    total_reads = total_reads or 1
    if not axe:
        fig=plt.figure()
        _ = fig.add_subplot(111)
    colors = ['olive', 'darkcyan']
    # True if counts are computed from the length of the reads
    iteration = False
    for i, fnam in enumerate([fnam1, fnam2]):
        counts = count_by_len[i] = {}
        with open(fnam) as fhandler:
            # header
            line = None
            for line in fhandler:
                if not line.startswith('#'):
                    break
                if line.startswith('# MAPPED '):
                    itr, num = line.split()[2:]
                    counts[int(itr)] = int(num)
                line = None
            # no count in the header: get them from the mapped reads
            if not counts:
                iteration = True
                while line is not None:
                    _, length, _, _ = line.rsplit('\t', 3)
                    length = int(length)
                    counts[length] = counts.get(length, 0) + 1
                    line = next(fhandler, None)
        # cumulative counts
        lengths = sorted(counts.keys())
        total = 0
        for k in lengths:
            total += counts[k]
            counts[k] = total
        plt.plot(lengths, [float(counts[l]) / total_reads for l in lengths],
                 label='read' + str(i + 1), linewidth=2, color=colors[i])
    if iteration:
        plt.xlabel('read length (bp)')
    else:
        plt.xlabel('Iteration number')
    if total_reads != 1:
        plt.ylabel('Proportion of mapped reads')
    else:
        plt.ylabel('Number of mapped reads')
    plt.legend(loc=4)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')
    return count_by_len
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None, xlog=False):
    """
    Plots the distribution of dangling-ends lengths

    A pair of reads is counted as a dangling-end when both ends carry the same
    value in their restriction-enzyme column, lie on the same chromosome, have
    different direction flags, and their relative position agrees with the
    direction flag of the first end. Its length is the absolute difference
    between the two positions. Lines starting with '#' are skipped.

    :param fnam: input file name
    :param None savefig: path where to store the output images.
    :param None nreads: if set, parsing stops once half this number of
       dangling-ends has been collected
    :param 99.9 max_size: percentile of the lengths used as upper limit of
       the histogram (with the default, the top 0.1% longest distances,
       usually very long outliers, are left out of the plot).
    :param None axe: a matplotlib.axes.Axes object, passed to setup_plot
    :param False xlog: represent x axis in logarithmic scale

    :returns: the median value and the percentile inputed as max_size.
    """
    # Floor division keeps the limit an integer, so the equality test below
    # can be met whatever the parity of nreads.
    # NOTE(review): presumably halved because each pair holds two reads.
    max_des = nreads // 2 if nreads else None
    des = []
    with open(fnam) as fhandler:
        for line in fhandler:
            if line.startswith('#'):
                continue
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1 != re2 or crm1 != crm2 or dir1 == dir2:
                continue
            pos1, pos2 = int(pos1), int(pos2)
            # NOTE(review): assumes dir1 is '0' or '1' -- confirm
            if (pos2 > pos1) == int(dir1):
                des.append(abs(pos2 - pos1))
                if len(des) == max_des:
                    break

    ax = setup_plot(axe, figsize=(10, 5.5))
    max_perc = np.percentile(des, max_size)
    perc01, perc05, perc50, perc95, perc99 = np.percentile(
        des, [1, 5, 50, 95, 99])

    # the two outer bands share a colour; only the first one carries the label
    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc),
            alpha=.7, color='darkred', label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lenghts\n(median: %s, top %.1f%%, up to %0.f nts)' % (
                     perc50, max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    # NOTE(review): indentation was lost in this file; confirm whether this
    # limit is meant to apply only when xlog is set.
    ax.set_xlim((50, max_perc))
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        # NOTE(review): the figure is closed without being displayed; confirm
        # whether a plt.show() call is expected before this line.
        plt.close('all')
    return perc50, max_perc