def correlate_matrices(hic_data1, hic_data2, max_dist=10, intra=False,
                       savefig=None, show=False, savedata=None):
    """
    Compare the interactions of two Hi-C matrices at a given distance,
    with Spearman rank correlation.

    For each distance ``i`` from 1 to ``max_dist`` the cells ``[j, j + i]``
    of both matrices are collected and correlated with ``spearmanr``.

    :param hic_data1: Hi-C-data object; must support ``len()`` and indexing
       with a ``(row, column)`` tuple
    :param hic_data2: Hi-C-data object, indexed like ``hic_data1``
    :param 10 max_dist: maximum distance from diagonal, in bins (e.g. 10
       means we will not look further than 10 times the resolution)
    :param False intra: only takes into account intra-chromosomal contacts.
       Requires both objects to carry identical, non-empty ``sections``
       (and ``hic_data1.section_pos``); otherwise a warning is issued and
       the whole diagonals are used
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where to write the distances and
       correlations, tab-separated

    :returns: list of correlations and list of genomic distances
    """
    use_intra = bool(intra and hic_data1.sections and hic_data2.sections and
                     hic_data1.sections == hic_data2.sections)
    if intra and not use_intra:
        warn('WARNING: hic_dta does not contain chromosome coordinates, ' +
             'intra set to False')

    corr = []
    dist = []
    for i in range(1, max_dist + 1):
        if use_intra:
            # stay inside each chromosome: never pair a bin with one that
            # belongs to the next chromosome
            bins = [j for crm in hic_data1.section_pos
                    for j in range(hic_data1.section_pos[crm][0],
                                   hic_data1.section_pos[crm][1] - i)]
        else:
            bins = list(range(len(hic_data1) - i))
        diag1 = [hic_data1[j, i + j] for j in bins]
        diag2 = [hic_data2[j, i + j] for j in bins]
        corr.append(spearmanr(diag1, diag2)[0])
        dist.append(i)

    if show or savefig:
        plt.plot(dist, corr, color='orange', linewidth=3, alpha=.8)
        plt.xlabel('Genomic distance in bins')
        plt.ylabel('Spearman rank correlation')
        if dist:  # nothing to scale on when max_dist < 1
            plt.xlim((0, dist[-1]))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        with open(savedata, 'w') as out:
            out.write('# genomic distance\tSpearman rank correlation\n')
            for one_dist, one_corr in zip(dist, corr):
                out.write('%s\t%s\n' % (one_dist, one_corr))

    return corr, dist
def do_3d_plot(nam, outfile, size, count, minmax, sigma=0, log=False):
    """
    Draw a 3D surface (plus a filled contour projected on the floor) of a
    matrix stored in a text file and save it with ``tadbit_savefig``.

    :param nam: path to a whitespace-separated matrix file; lines starting
       with '#' are skipped. The file name is also used in the plot title
    :param outfile: path passed to ``tadbit_savefig``
    :param size: side of the X/Y grid; the axes span from -size/2 to size/2
       (the matrix read from ``nam`` is expected to match this grid)
    :param count: number reported as ``N`` in the title
    :param minmax: half-span of the Z axis / colour scale; if falsy it is
       computed from the data
    :param 0 sigma: if not zero, the matrix is smoothed with a Gaussian
       filter of this sigma before plotting
    :param False log: plot the natural logarithm of the values; the colour
       scale is then centred on 0 instead of 1
    """
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(1, 1, 1, projection='3d')

    beg = -size / 2
    end = size / 2
    X = np.arange(beg, end, 1)
    Y = np.arange(beg, end, 1)
    X, Y = np.meshgrid(X, Y)

    with open(nam) as handler:
        Z = np.array([np.array([float(i) for i in l.split()])
                      for l in handler if not l.startswith('#')])

    # statistics in the title are computed before smoothing / log transform
    plt.title(nam + '\nMean: %.3f, median: %.3f, standard-deviation: %.3f (N=%d)' % (
        np.mean(Z), np.median(Z), np.std(Z), count))

    if sigma:
        Z = ndimage.gaussian_filter(Z, sigma=sigma, order=0)

    if log:
        Z = np.log(Z)
        zspan = minmax if minmax else np.max(np.abs(Z))
        zmax = zspan
        zmin = -zspan
    else:
        # un-logged values are centred on 1
        zspan = minmax if minmax else np.max(np.abs(Z - 1))
        zmin = -zspan + 1
        zmax = zspan + 1

    cmap = 'coolwarm'
    _ = ax.contourf(X, Y, Z, zdir='z', offset=zmin, cmap=cmap,
                    vmin=zmin, vmax=zmax)
    surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cmap,
                           linewidth=0, antialiased=True, alpha=1,
                           vmin=zmin, vmax=zmax, shade=True)
    ax.set_zlim3d(zmin, zmax)
    ax.view_init(elev=15, azim=25)

    cb = fig.colorbar(surf, shrink=0.5, aspect=20)
    # only mention the smoothing when one was actually applied
    smoothed = ('\nSmoothed with $\\sigma=%s$' % sigma) if sigma else ''
    cb.set_label('%sverage normalized interactions%s' % (
        'Log a' if log else 'A', smoothed))

    tadbit_savefig(outfile)
def objective_function(self, log=False, smooth=True, axe=None, savefig=None):
    """
    This function plots the objective function value per each Monte-Carlo
    step.

    The values are read from ``self['log_objfun']`` (the first one is
    skipped) and the title uses ``self['rand_init']``.

    :param False log: log plot
    :param True smooth: curve smoothing
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance; when given, everything (curve, labels and title) is
       drawn on it and nothing is shown
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    """
    show = False
    if not axe:
        fig = plt.figure(figsize=(7, 7))
        axe = fig.add_subplot(111)
        show = True
        axe.patch.set_facecolor('lightgrey')
        axe.patch.set_alpha(0.4)
        axe.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        axe.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        axe.set_axisbelow(True)
        axe.minorticks_on()  # always on, not only for log
        # remove tick marks
        axe.tick_params(axis='both', direction='out', top=False,
                        right=False, left=False, bottom=False)
        axe.tick_params(axis='both', direction='out', top=False,
                        right=False, left=False, bottom=False,
                        which='minor')

    # text: set on the target axes, not on whichever axes pyplot considers
    # current (they differ when the caller supplies `axe`)
    axe.set_xlabel('Iteration number')
    axe.set_ylabel('IMP Objective Function Value')
    axe.set_title('Model ' + str(self['rand_init']))

    # smooth
    nrjz = self['log_objfun'][1:]
    if smooth:
        xnew = linspace(0, len(nrjz), 10000)
        nrjz_smooth = spline(range(len(nrjz)), nrjz, xnew, order=3)
        axe.plot(xnew, nrjz_smooth, color='darkred')
    else:
        axe.plot(nrjz, color='darkred')

    # plot the raw values as dots on top of the curve
    axe.plot(nrjz, color='darkred', marker='o', alpha=.5, ms=4, ls='None')

    # log
    if log:
        axe.set_yscale('log')

    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
def plot_iterative_mapping(fnam1, fnam2, total_reads=None, axe=None,
                           savefig=None):
    """
    Plot, for each of the two read-ends, the cumulative number (or
    proportion) of mapped reads as a function of the mapped length.

    Each input line is split from the right on tabs and the second of the
    last four fields is taken as the mapped length. Leading lines starting
    with '#' are skipped.

    :param fnam1: input file name for the first read-end
    :param fnam2: input file name for the second read-end
    :param None total_reads: total number of reads in the initial FASTQ
       file; when given, proportions are plotted instead of counts
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance; when given, the curves are drawn on it and nothing is
       shown
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).

    :returns: a dictionary, keyed by read-end index (0 and 1), of
       dictionaries with the cumulative number of reads per mapped length
    """
    count_by_len = {}
    total_reads = total_reads or 1
    show = False
    if not axe:
        fig = plt.figure()
        axe = fig.add_subplot(111)
        show = True
    colors = ['olive', 'darkcyan']
    for i, fnam in enumerate([fnam1, fnam2]):
        counts = count_by_len[i] = {}
        with open(fnam) as fhandler:
            in_header = True
            for line in fhandler:
                # only the leading block of '#' lines is a header
                if in_header and line.startswith('#'):
                    continue
                in_header = False
                _, length, _, _ = line.rsplit('\t', 3)
                length = int(length)
                counts[length] = counts.get(length, 0) + 1
        lengths = sorted(counts)
        # turn counts into cumulative counts (reads mapped with at most
        # this length)
        running = 0
        for length in lengths:
            running += counts[length]
            counts[length] = running
        axe.plot(lengths,
                 [float(counts[l]) / total_reads for l in lengths],
                 label='read' + str(i + 1), linewidth=2, color=colors[i])
    axe.set_xlabel('read length (bp)')
    if total_reads != 1:
        axe.set_ylabel('Proportion of mapped reads')
    else:
        axe.set_ylabel('Number of mapped reads')
    axe.legend(loc=4)
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
    return count_by_len
def plot_genomic_distribution(fnam, first_read=True, resolution=10000,
                              genome_seq=None, axe=None, ylim=None,
                              savefig=None):
    """
    Plot, one subplot per chromosome, the number of read-ends falling in
    each bin of ``resolution`` base pairs.

    :param fnam: input file name; each line is split on whitespace and the
       chromosome and position are read from columns 2-3 (first read-end)
       or 8-9 (second read-end)
    :param True first_read: map first read.
    :param 10000 resolution: group reads that are closer than this
       resolution parameter
    :param None genome_seq: dictionary of chromosome sequences; when given
       it defines which chromosomes are plotted and a vertical line marks
       the end of each chromosome
    :param None axe: when given, no new figure is created and nothing is
       shown (drawing still goes through pyplot subplots)
    :param None ylim: tuple with the limits of the y axis; by default from
       0 to the highest count found
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).

    :raises ValueError: if no read is found in ``fnam``
    """
    distr = {}
    idx1, idx2 = (1, 3) if first_read else (7, 9)
    with open(fnam) as handler:
        for line in handler:
            crm, pos = line.split()[idx1:idx2]
            pos = int(pos) // resolution
            bins = distr.setdefault(crm, {})
            bins[pos] = bins.get(pos, 0) + 1
    if not distr:
        raise ValueError('ERROR: no reads found in %s' % fnam)

    chroms = list(genome_seq) if genome_seq else list(distr)
    if not axe:
        plt.figure(figsize=(15, 3 * len(chroms)))
    max_y = max(max(distr[c].values()) for c in distr)
    # x axis has to cover the last populated bin (and the chromosome end
    # marker when sequences are available), not the number of populated bins
    max_x = max(max(distr[c]) + 1 for c in distr)
    if genome_seq:
        max_x = max([max_x] +
                    [len(genome_seq[c]) // resolution for c in chroms])

    for i, crm in enumerate(chroms):
        counts = distr.get(crm, {})  # chromosome may have no read at all
        nbins = max(counts) + 1 if counts else 0
        plt.subplot(len(chroms), 1, i + 1)
        plt.plot(range(nbins), [counts.get(j, 0) for j in range(nbins)],
                 color='red', lw=1.5, alpha=0.7)
        if genome_seq:
            crm_end = len(genome_seq[crm]) // resolution
            if ylim:
                plt.vlines(crm_end, ylim[0], ylim[1])
            else:
                plt.vlines(crm_end, 0, max_y)
        plt.xlim((0, max_x))
        plt.ylim(ylim or (0, max_y))
        plt.title(crm)

    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
def plot_distance_vs_interactions(fnam, min_diff=100, max_diff=1000000,
                                  resolution=100, axe=None, savefig=None):
    """
    Plot (log-log) the number of intra-chromosomal read pairs as a function
    of the genomic distance separating both read-ends.

    Distances are binned by ``resolution``; bins holding two pairs or fewer
    are discarded.

    :param fnam: input file name; tab-separated, chromosome and position of
       each read-end are read from columns 2-3 and 8-9 of the last ten
       fields
    :param 100 min_diff: lower limit in genomic distance (usually equal to
       read length)
    :param 1000000 max_diff: upper limit in genomic distance to look for
    :param 100 resolution: group reads that are closer than this resolution
       parameter
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance; when given, the points are drawn on it and nothing is
       shown
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    """
    dist_intr = {}
    with open(fnam) as handler:
        for line in handler:
            _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.rsplit('\t', 9)
            if cr1 != cr2:
                continue
            diff = resolution * (abs(int(ps1) - int(ps2)) // resolution)
            if max_diff > diff > min_diff:
                dist_intr[diff] = dist_intr.get(diff, 0) + 1
    # drop poorly populated bins
    dist_intr = dict((k, v) for k, v in dist_intr.items() if v > 2)

    show = False
    if not axe:
        fig = plt.figure()
        axe = fig.add_subplot(111)
        show = True
    if dist_intr:
        x, y = zip(*sorted(dist_intr.items()))
    else:  # nothing passed the filters: draw an empty plot
        x, y = (), ()
    axe.plot(x, y, 'k.')
    axe.set_yscale('log')
    axe.set_xscale('log')
    axe.set_xlabel('Log genomic distance (binned by %d bp)' % resolution)
    axe.set_ylabel('Log interaction count')
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
def hic_map(data, genome_seq, biases=None, masked=None, resolution=100000,
            savefig=None, show=False, savedata=None, focus=None):
    """
    Build a genome-wide interaction matrix and optionally plot it (log2
    scale) and/or write it to a text file.

    :param data: either the path to a file of read pairs (whitespace
       separated, 13 columns: read name in the first, chromosome/position
       of each read-end in columns 2-3 and 8-9), or an already binned
       Hi-C data object indexable with ``nrows * i + j``
    :param genome_seq: dictionary of chromosome sequences; their lengths
       define the bins when ``data`` is a file
    :param None biases: per-bin biases; when given (and ``data`` is not a
       file) each cell is divided by the product of the biases of its row
       and column
    :param None masked: set of read names to skip (file input only)
    :param 100000 resolution: bin size, in base pairs (file input only)
    :param None savefig: path to a file where to save the image generated
    :param False show: display the plot
    :param None savedata: path to a file where to write the matrix,
       tab-separated, one row per line
    :param None focus: tuple (begin, end) of bins to extract when ``data``
       is a Hi-C data object; ``begin`` is shifted by one (1-based input)
    """
    if isinstance(data, str):
        fnam = data
        # first bin of each chromosome in the genome-wide matrix
        cumcs = {}
        total = 0
        for crm in genome_seq:
            cumcs[crm] = total
            total += len(genome_seq[crm]) // resolution + 1
        # bin the data
        data = [[0 for _ in range(total + 1)] for _ in range(total + 1)]
        masked = masked or set()
        with open(fnam) as handler:
            for line in handler:
                (read, cr1, ps1, _, _, _, _,
                 cr2, ps2, _, _, _, _) = line.split()
                if read in masked:
                    continue
                ps1 = int(ps1) // resolution
                ps2 = int(ps2) // resolution
                try:
                    data[cumcs[cr1] + ps1][cumcs[cr2] + ps2] += 1
                except (KeyError, IndexError):
                    # NOTE(review): reading stops at the first pair on an
                    # unknown chromosome or outside the matrix, as before;
                    # confirm this is intended rather than skipping the pair
                    break
    else:
        hic_data = data
        beg, end = focus if focus else (0, len(hic_data))
        beg -= 1 if focus else 0
        size = len(hic_data)
        if biases:
            data = [[hic_data[size * i + j] / (biases[i] * biases[j])
                     for j in range(beg, end)]
                    for i in range(beg, end)]
        else:
            data = [[hic_data[size * i + j]
                     for j in range(beg, end)]
                    for i in range(beg, end)]
    # do the plot
    if show or savefig:
        import numpy as np
        plt.figure(figsize=(16, 12))
        plt.imshow(np.log2(data), origin='lower', cmap='gist_earth',
                   interpolation='nearest')
        plt.colorbar()
        if savefig:
            tadbit_savefig(savefig)
        elif show:
            plt.show()
    if savedata:
        with open(savedata, 'w') as out:
            for row in data:
                out.write('\t'.join([str(cell) for cell in row]) + '\n')
def correlate_matrices(hic_data1, hic_data2, max_dist=10, savefig=None,
                       show=False, savedata=None):
    """
    Compare the interactions of two Hi-C matrices at a given distance,
    with Spearman rank correlation.

    For each distance ``i`` from 1 to ``max_dist`` the cells ``[j, j + i]``
    of both matrices are collected and correlated with ``spearmanr``.

    :param hic_data1: Hi-C-data object; must support ``len()`` and indexing
       with a ``(row, column)`` tuple
    :param hic_data2: Hi-C-data object, indexed like ``hic_data1``
    :param 10 max_dist: maximum distance from diagonal, in bins (e.g. 10
       means we will not look further than 10 times the resolution)
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where to write the distances and
       correlations, tab-separated

    :returns: list of correlations and list of genomic distances
    """
    corr = []
    dist = []
    for i in range(1, max_dist + 1):
        bins = list(range(len(hic_data1) - i))
        diag1 = [hic_data1[j, i + j] for j in bins]
        diag2 = [hic_data2[j, i + j] for j in bins]
        corr.append(spearmanr(diag1, diag2)[0])
        dist.append(i)

    if show or savefig:
        plt.plot(dist, corr, color='orange', linewidth=3, alpha=.8)
        plt.xlabel('Genomic distance in bins')
        plt.ylabel('Spearman rank correlation')
        if dist:  # nothing to scale on when max_dist < 1
            plt.xlim((0, dist[-1]))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        with open(savedata, 'w') as out:
            out.write('# genomic distance\tSpearman rank correlation\n')
            for one_dist, one_corr in zip(dist, corr):
                out.write('%s\t%s\n' % (one_dist, one_corr))

    return corr, dist
def quality_plot(fnam, r_enz=None, nreads=float('inf'), axe=None, savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction
    enzyme (RE) name is provided, can also represent the distribution of
    digested and undigested RE sites and estimate an expected proportion of
    dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file (plain text, '.gz' or '.dsrc')
    :param None r_enz: name of the restriction enzyme, or list of names
       (matched case-insensitively against RESTRICTION_ENZYMES). The list
       passed by the caller is not modified
    :param inf nreads: max number of reads to read, not necessary to read
       all. None, 0 or infinite mean the whole file. Reported percentages
       are relative to this number, or to the number of reads actually
       read when the file holds fewer
    :param None axe: a matplotlib.axes.Axes object on which to draw the
       quality plot; a second subplot is added to its figure
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the
       percentage of reads with at least a ligation site; two empty
       dictionaries if no enzyme was given.

    :raises ValueError: if no read could be parsed from ``fnam``
    """
    from collections import Counter

    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
        'abcdefghijklmnopqrstuvwxyz{|}~')])
    # always work on a list of enzyme names; [None] means "no enzyme"
    if isinstance(r_enz, (list, tuple)):
        r_enzs = list(r_enz) or [None]
    else:
        r_enzs = [r_enz]
    # use the spelling of the enzyme names found in RESTRICTION_ENZYMES
    for i, name in enumerate(r_enzs):
        for k in RESTRICTION_ENZYMES:
            if k.lower() == str(name).lower():
                r_enzs[i] = k
    no_enzyme = len(r_enzs) == 1 and r_enzs[0] is None
    # falsy nreads means "read everything"
    limit = nreads if nreads else float('inf')

    quals = []
    henes = []
    sites = {}
    fixes = {}
    liges = {}
    ligep = {}
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    try:
        if no_enzyme:
            while len(quals) <= limit:
                try:
                    next(fhandler)  # header line
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)  # separator line
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
        else:
            r_sites = {}
            d_sites = {}
            for r_enz in r_enzs:
                r_sites[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
                d_sites[r_enz] = repaired(r_enz)
                sites[r_enz] = []  # initialize dico to store undigested sites
                fixes[r_enz] = []  # initialize dico to store digested sites
            l_sites = religateds(r_enzs)
            site = {}
            fixe = {}
            for r_enz in r_enzs:
                site[r_enz] = re.compile(r_sites[r_enz])
                fixe[r_enz] = re.compile(d_sites[r_enz])
            # ligation sites should appear in lower case in the sequence
            lige = {}
            for k in l_sites:
                liges[k] = []  # initialize dico to store sites
                ligep[k] = 0   # initialize dico to store sites
                l_sites[k] = l_sites[k].lower()
                lige[k] = re.compile(l_sites[k])
            while len(quals) <= limit:
                try:
                    next(fhandler)  # header line
                except StopIteration:
                    break
                seq = next(fhandler)
                # ligation sites replaced by lower case to ease the search
                for lig in l_sites.values():
                    seq = seq.replace(lig.upper(), lig)
                for r_enz in r_enzs:
                    sites[r_enz].extend(
                        [m.start() for m in site[r_enz].finditer(seq)])
                    # TODO: you cannot have a repaired/fixed site in the
                    # middle of the sequence, this could be only checked at
                    # the beginning
                    fixes[r_enz].extend(
                        [m.start() for m in fixe[r_enz].finditer(seq)])
                for k in lige:  # for each paired of cut-site
                    liges[k].extend(
                        [m.start() for m in lige[k].finditer(seq)])
                    ligep[k] += l_sites[k] in seq
                # store the number of Ns found in the sequences
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)  # separator line
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    finally:
        fhandler.close()
    if not quals:
        raise ValueError('ERROR: no reads found in %s' % fnam)
    # percentages must be relative to a finite number of reads
    if not nreads or nreads > len(quals):
        nreads = len(quals)
    quals = izip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = zip(*[(nanmean(q), nanstd(q)) for q in quals])
    max_seq_len = len(meanquals)

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if no_enzyme:  # only do the quality plot
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
        else:  # do both plots
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')

    ax.errorbar(range(max_seq_len), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # number of Ns per position, on a secondary axis
    n_per_pos = Counter(henes)
    axb.plot([n_per_pos[i] for i in range(max_seq_len)], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if not no_enzyme:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            ', '.join(map(str, r_enzs)), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        # seq_len is the length of the line to plot. we don't want to plot
        # if there is no room for the cut-site, or ligation site.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                        max([len(l_sites[k]) for k in l_sites]),
                        max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len
        half = max_seq_len // 2

        # transform dictionaries of positions into dictionaries of counts
        for r_enz in sites:
            found = Counter(sites[r_enz])
            sites[r_enz] = [found[k] for k in range(seq_len)]  # Undigested
            found = Counter(fixes[r_enz])
            fixes[r_enz] = [found[k] for k in range(seq_len)]  # DE
        for key in liges:
            found = Counter(liges[key])
            liges[key] = [found[k] for k in range(seq_len)]  # OK

        # in case the pattern of the repaired cut-site contains the target
        # cut-site pattern. These sites were counted twice, once in the
        # undigested, and once in the repaired. We remove them from the
        # repaired:
        for r_enz in r_enzs:
            if d_sites[r_enz] in r_sites[r_enz]:
                pos = r_sites[r_enz].find(d_sites[r_enz])
                fixes[r_enz] = (fixes[r_enz][:pos] +
                                [fixes[r_enz][k] - sites[r_enz][k - pos]
                                 for k in range(pos, seq_len)])
        # same for ligated sites
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                if d_sites[r_enz1] not in l_sites[(r_enz1, r_enz2)]:
                    continue
                pos = l_sites[(r_enz1, r_enz2)].find(d_sites[r_enz1])
                fixes[r_enz1] = (
                    fixes[r_enz1][:pos] +
                    [fixes[r_enz1][k] - liges[(r_enz1, r_enz2)][k - pos]
                     for k in range(pos, seq_len)])

        # remove anything that could be in between the two read ends
        if paired:
            for k in sites:
                sites[k][half - site_len:half] = [float('nan')] * site_len
                fixes[k][half - site_len:half] = [float('nan')] * site_len
            for k in liges:
                liges[k][half - site_len:half] = [float('nan')] * site_len

        # plot undigested cut-sites
        color = iter(plt.cm.Reds(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in sites:
            if any([f > 0 for f in fixes[r_enz]]):
                label = 'Undigested RE site (%s: %s)' % (r_enz,
                                                         r_sites[r_enz])
            else:
                label = 'Undigested & Dangling-Ends (%s: %s)' % (
                    r_enz, r_sites[r_enz])
            ax2.plot(sites[r_enz], linewidth=2, color=next(color),
                     alpha=0.9, label=label)
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)

        lines, labels = ax2.get_legend_handles_labels()

        # plot ligation sites
        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            ax3.plot(liges[(r1, r2)], linewidth=2, color=next(color),
                     alpha=0.9,
                     label='Ligated (%s-%s: %s)' % (
                         r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')

        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)

        # plot dangling-ends
        color = iter(plt.cm.Greens(linspace(0.3, 0.95, len(r_enzs))))
        for i, r_enz in enumerate(r_enzs):
            if any([f > 0 for f in fixes[r_enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                ax4.plot(fixes[r_enz], linewidth=2, color=next(color),
                         alpha=0.9,
                         label='Dangling-ends (%s: %s)' % (r_enz,
                                                           d_sites[r_enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))

        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] - liges[k][half])

        # Count undigested sites
        sit_cnt = {}
        for r_enz in r_enzs:
            sit_cnt[r_enz] = (nansum(sites[r_enz]) - sites[r_enz][0] -
                              sites[r_enz][half])

        # Count Dangling-Ends: sites found at the very beginning of a read
        # (and of the second read-end if paired). True division, as for the
        # ligation percentages below: a floor division would truncate the
        # percentage to an integer.
        des = {}
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                counts = fixes[r_enz]
            else:
                counts = sites[r_enz]
            des[r_enz] = (100. * (counts[0] +
                                  (counts[half] if paired else 0))) / nreads

        # Decorate plot
        title = ''
        for r_enz in r_enzs:
            lcnt = float(sum([
                lig_cnt[(r_enz1, r_enz2)] * (2 if r_enz1 == r_enz2 else 1)
                for r_enz1 in r_enzs for r_enz2 in r_enzs
                if r_enz1 == r_enz or r_enz2 == r_enz]))
            title += (
                'Percentage of digested sites (not considering Dangling-Ends) '
                '%s: %.1f%%\n' % (r_enz, 100. * float(lcnt) /
                                  (lcnt + sit_cnt[r_enz])))
        for r_enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (
                r_enz, des[r_enz])
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                title += (
                    'Percentage of reads with ligation site (%s-%s): %.1f%% \n'
                    % (r_enz1, r_enz2,
                       (ligep[(r_enz1, r_enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines, labels, bbox_to_anchor=(0.75, 1.0),
                   loc=3, borderaxespad=0., frameon=False, fontsize=9)
    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    if no_enzyme:
        return {}, {}
    return des, ligep
def run(opts):
    """
    Extract sub-matrices from a BAM file, at a given resolution, and write
    and/or plot them under ``<workdir>/05_sub-matrices``.

    Reads from ``opts`` (as used below): zrange, figsize, bam, biases,
    normalizations, workdir, coord1, coord2, plot, force_plot, interactive,
    quiet, matrix, only_plot, reso, cpus, nchunks, row_names, triangular,
    format, cmap, bad_color, xtick_rotation and filter.

    :param opts: parsed command-line options

    :raises Exception: when an external BAM is given without a biases file
       and none of the requested normalizations is 'raw'
    """
    from shutil import rmtree

    def _load_biases():
        # presumably a pickle (``load`` comes from the module imports):
        # only use with biases files from a trusted source -- TODO confirm
        with open(biases, 'rb') as handler:
            return load(handler)

    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    # NOTE: no else branch here; resetting vmin/vmax when figsize is not
    # given would silently discard the requested zrange
    if opts.figsize:
        opts.figsize = [float(v) for v in opts.figsize.split(',')]

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    # a single coordinate is always handled as the first one
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    region1 = start1 = end1 = None
    region2 = start2 = end2 = None
    if coord1:
        # either "chromosome:start-end" or just a chromosome name
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None

    if opts.plot and not opts.force_plot:
        if opts.interactive:
            max_size = 1500**2
        else:
            max_size = 5000**2
    else:
        max_size = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            msg = '\nExtraction of %s' % (region1)
            if start1:
                msg += ':%s-%s' % (start1, end1)
            else:
                msg += ' (full chromosome)'
            if region2:
                msg += ' intersection with %s' % (region2)
                if start2:
                    msg += ':%s-%s\n' % (start2, end2)
                else:
                    msg += ' (full chromosome)\n'
            else:
                msg += '\n'
        else:
            msg = '\nExtraction of full genome\n'
        stdout.write(msg)

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        # the BAM header is only needed for chromosome names and lengths
        bamfile = AlignmentFile(mreads, 'rb')
        try:
            sections = OrderedDict(zip(bamfile.references,
                                       [x for x in bamfile.lengths]))
        finally:
            bamfile.close()
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else 'NRM'
                           if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    _load_biases() if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            # shift bin coordinates so that the extracted matrix starts at 0
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                # (region, begin, end) of each row, generated lazily
                row_names = (
                    (reg, p + 1, p + opts.reso)
                    for r, reg in enumerate(regions)
                    for p in range(
                        starts[r] if r < len(starts) and starts[r] else 0,
                        ends[r] if r < len(ends) and ends[r] else sections[reg],
                        opts.reso))

            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                with open(path.join(outdir, fnam), 'w') as out:
                    for reg in regions:
                        out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                    if region2:
                        out.write('# BADROWS %s\n' % (
                            ','.join([str(b) for b in bads1])))
                        out.write('# BADCOLS %s\n' % (
                            ','.join([str(b) for b in bads2])))
                    else:
                        out.write('# MASKED %s\n' % (
                            ','.join([str(b) for b in bads1])))
                    if opts.row_names:
                        out.write('\n'.join(
                            '%s\t%d\t%d\t' % (next(row_names)) +
                            '\t'.join(str(matrix.get((i, j), 0))
                                      for i in range(b1, e1))
                            for j in range(b2, e2)) + '\n')
                    else:
                        out.write('\n'.join(
                            '\t'.join(str(matrix.get((i, j), 0))
                                      for i in range(b1, e1))
                            for j in range(b2, e2)) + '\n')

            if opts.plot:
                # transform matrix
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in range(b1, e1)])
                                for j in range(b2, e2)])
                # mask filtered columns/rows
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    norm, name, nicer(opts.reso, sep=''),
                    ('_' + param_hash), '_tri' if opts.triangular else '',
                    opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                section_pos = OrderedDict((k, section_pos[k])
                                          for k in section_pos
                                          if k in regions)
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular,
                    vmin=vmin, vmax=vmax, cmap=opts.cmap,
                    figsize=opts.figsize,
                    bad_color=opts.bad_color if norm != 'raw' else None)
                ax1.set_title(
                    'Region: %s, normalization: %s, resolution: %s' % (
                        name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))

    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            _load_biases() if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        # no shell involved: safe with spaces/special characters in workdir
        rmtree(tmpdir, ignore_errors=True)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction
    enzyme (RE) name is provided, can also represent the distribution of
    digested and undigested RE sites and estimate an expected proportion of
    dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file (plain text, or compressed with a '.gz'
       or '.dsrc' extension)
    :param None r_enz: name of the restriction enzyme, used as key of
       RESTRICTION_ENZYMES
    :param None nreads: max number of reads to read, not necessary to read all
    :param None axe: an existing matplotlib axe where to draw the quality
       plot; a second subplot is added to its figure
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: whether the input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the
       percentage of reads with at least a ligation site. Without *r_enz*
       the first value is None and the second is 0.

    :raises ValueError: if no read could be parsed from the input file
    """
    phred = dict((c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
        'abcdefghijklmnopqrstuvwxyz{|}~'))
    quals = []
    henes = []
    sites = []
    fixes = []
    liges = []
    ligep = 0
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if r_enz:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site * 2 == l_site:
            # in case the religated site equals 2 restriction sites (like
            # DnpII): only count isolated occurrences
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)
    try:
        while True:
            # a FASTQ record is 4 lines: header, sequence, separator, quality
            try:
                next(fhandler)
            except StopIteration:
                # iterating past the end of a file raises StopIteration (not
                # EOFError): this is the normal way out of the loop
                break
            seq = next(fhandler)
            if r_enz:
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            next(fhandler)
            line = next(fhandler)
            quals.append([phred[i] for i in line.strip()])
            if nreads and len(quals) >= nreads:
                break
    finally:
        fhandler.close()
    if not quals:
        raise ValueError('ERROR: no read found in %s' % fnam)
    # use the number of reads really parsed as denominator: the file may
    # contain fewer reads than requested
    nreads = len(quals)
    # materialized because it is iterated twice below
    quals = list(zip(*quals))
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')

    ax.errorbar(range(len(meanquals)), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, len(line)))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    axb.plot([henes.count(i) for i in range(len(line))], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, len(line)))

    des = None
    if r_enz:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        site_len = max((len(r_site), len(l_site), len(d_site)))
        seq_len = len(line) - site_len
        # integer division: used as a list index below
        half = len(line) // 2
        # turn lists of match positions into per-position counts
        sites = [sites.count(k) for k in range(seq_len)]  # Undigested
        liges = [liges.count(k) for k in range(seq_len)]  # OK
        fixes = [fixes.count(k) for k in range(seq_len)]  # DE
        # a dangling-end site contained in the RE (or ligation) site is also
        # matched by it: remove those hits from the dangling-end counts
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k - pos] for k in range(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k - pos] for k in range(pos, seq_len)])
        if paired:
            # positions just before the middle of the line are blanked out
            sites[half - site_len: half] = [float('nan')] * site_len
            liges[half - site_len: half] = [float('nan')] * site_len
            fixes[half - site_len: half] = [float('nan')] * site_len
        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)
        has_des = any([f > 0 for f in fixes])
        if has_des:
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends  (%s)' % r_site)
        ax2.set_xlim((0, len(line)))
        lig_cnt = (np.nansum(liges) - liges[0] - liges[half])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[half])
        # dangling-ends are counted at the start of the read (and at the
        # middle of the line when paired); RE-site counts are used instead
        # when no dangling-end site was found
        counts = fixes if has_des else sites
        des = (100. * (counts[0] + (counts[half] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') % (
                       (100. * lig_cnt) / (lig_cnt + sit_cnt),
                       des,
                       (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads
def quality_plot(fnam, nreads=None, axe=None, savefig=None):
    """
    Plot the mean PHRED quality (with standard deviation) per position of
    the reads of a FASTQ file.

    :param fnam: path to FASTQ file (plain text, or gzipped if the name ends
       with '.gz')
    :param None nreads: max number of reads to read, not necessary to read all
    :param None axe: an existing matplotlib axe where to draw the plot; if
       None a new figure is created
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).

    :raises ValueError: if no read could be parsed from the input file
    """
    # character -> score lookup table (constant-time, unlike str.index)
    phred = dict((c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
        'abcdefghijklmnopqrstuvwxyz{|}~'))
    quals = []
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    else:
        fhandler = open(fnam)
    try:
        while True:
            # a FASTQ record is exactly 4 lines: header, sequence,
            # separator, quality
            try:
                next(fhandler)
            except StopIteration:
                # iterating past the end of a file raises StopIteration
                break
            next(fhandler)
            next(fhandler)
            line = next(fhandler)
            quals.append([phred[i] for i in line.strip()])
            if nreads and len(quals) >= nreads:
                break
    finally:
        fhandler.close()
    if not quals:
        raise ValueError('ERROR: no read found in %s' % fnam)
    # materialized because it is iterated twice below
    quals = list(zip(*quals))
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
    else:
        fig = plt.figure(figsize=(15, 7))
        ax = fig.add_subplot(111)
    ax.patch.set_facecolor('lightgrey')
    ax.patch.set_alpha(0.4)
    ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
    ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
    ax.set_axisbelow(True)
    # remove tick marks
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False)
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False, which='minor')

    # draw on the prepared axes (and not on a new, unstyled figure)
    ax.errorbar(range(len(meanquals)), meanquals,
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, len(line)))
    ax.set_xlabel('Sequence')
    ax.set_ylabel('PHRED score')
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
def mmp_score(matrix, nrand=10, verbose=False, savefig=None):
    """
    Compute the MMP score of a contact matrix by comparing its eigenvalues
    with the eigenvalues of randomized versions of the same matrix, and
    combining the result with the size of the matrix and the skewness and
    kurtosis of the distribution of its log2 non-zero values.

    :param matrix: list of lists (square, as required by the eigenvalue
       decomposition)
    :param 10 nrand: number of randomizations
    :param False verbose: write progress and results to standard output
    :param None savefig: path where to save figure

    :returns: 1- MMP score which ranges from 0 (bad) to 1 (good), and 2- the
       expected correlation of the contact matrices of the modeled chromatin
       with the original Hi-C data (plus the 3- lower and 4- upper values
       expected in 95% of the cases)
    """
    data = np.array([np.array([v for v in l]) for l in matrix])
    if verbose:
        sys.stdout.write(' - getting EigenVectors\n')
    egval, _ = np.linalg.eigh(data)
    # sort eigenvalues/vectors
    idx = (-egval).argsort()
    egval = egval[idx]

    regvals = []
    if verbose:
        sys.stdout.write(' - randomization\n')
    for i in range(int(nrand)):
        if verbose:
            sys.stdout.write('\r ' + str(i + 1) + ' / ' + str(nrand))
            sys.stdout.flush()
        regval, _ = np.linalg.eigh(randomize_matrix(data))
        regvals.append(sorted([abs(j) for j in regval], reverse=True))
    if verbose:
        sys.stdout.write('\n')
    # one tuple per eigenvalue rank, with its value in each randomization;
    # materialized because it is iterated twice
    regvals = list(zip(*regvals))
    rvmean = [np.mean(rv) for rv in regvals]
    # express eigenvalues as percentage of their total
    total = sum(rvmean) / 100
    rvmean = [i / total for i in rvmean]
    # error is twice the standard deviation among randomizations
    err = [2 * np.std(np.asarray(rv) / total) for rv in regvals]

    # log2 of the non-zero values of the upper triangle (diagonal included)
    zdata = sorted(np.log2([data[i][j] for i in range(len(data))
                            for j in range(i, len(data)) if data[i][j]]))
    skewness = skew(zdata)
    kurtness = kurtosis(zdata)

    if savefig:
        _ = plt.figure(figsize=(14, 8))
        gs = gridspec.GridSpec(7, 5, wspace=0.5, hspace=1.5)
        ax1 = plt.subplot(gs[:, 0:3])
        ax2 = plt.subplot(gs[1:5, 3:])
        ax3 = plt.subplot(gs[5:7, 3:])
        img = ax2.imshow(np.log2(data), interpolation='none')
        plt.colorbar(img, ax=ax2)

        ax2.set_title('Original matrix', size=12)
        ax2.tick_params(axis='both', which='major', labelsize=10)
        ax2.set_xlabel('Bin')
        ax2.set_ylabel('Bin')

        normfit = sc_norm.pdf(zdata, np.mean(zdata), np.std(zdata))
        # 'ms' and 'markersize' are aliases and cannot both be given: only
        # the full name is kept
        _ = ax3.plot(zdata, normfit, ':o', color='grey', alpha=.4,
                     markersize=.5)
        ax3.tick_params(axis='both', which='major', labelsize=10)

        ax3.hist(zdata, bins=20, density=True, alpha=0.7, color='r')
        ax3.set_xlabel('Z-score')
        ax3.set_ylabel('Frequency')

        # NOTE: these settings are global and stay changed after the call
        rcParams['xtick.direction'] = 'out'
        rcParams['ytick.direction'] = 'out'
        rcParams['axes.axisbelow'] = True
        rcParams['axes.grid'] = True
        rcParams['grid.color'] = 'w'
        rcParams['grid.linestyle'] = '-'
        rcParams['grid.linewidth'] = 2
        # rcParams['grid.alpha'] = .3
        ax1.minorticks_on()
        ax1.grid(ls='-', color='w', alpha=.3, lw=2, which='major')
        # visibility passed positionally: its keyword name changed between
        # matplotlib versions
        ax1.grid(True, ls='-', color='w', alpha=.3, lw=1, which='minor')
        ax1.spines['top'].set_color('none')
        ax1.spines['right'].set_color('none')
        ax1.spines['bottom'].set_color('none')
        ax1.spines['left'].set_color('none')
        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.set_xscale('log')
        try:
            ax1.set_axis_bgcolor((.9, .9, .9))
        except AttributeError:
            ax1.set_facecolor((.9, .9, .9))

        ax1.errorbar(range(1, 1 + len(rvmean)), rvmean, yerr=err,
                     ecolor='red', color='orange', lw=2,
                     label='%s randomizations' % (nrand))

    total = sum(abs(egval)) / 100
    egval = np.array(sorted([e / total for e in abs(egval)], reverse=True))

    # index of the first eigenvalue not above the randomized expectation
    # (last index if all of them are above)
    for i in range(len(rvmean)):
        if rvmean[i] + err[i] > egval[i]:
            break
    signifidx = i

    size = len(data)

    # sum of differences between observed and randomized eigenvalues
    sev = sum(egval[:signifidx] - rvmean[:signifidx])

    if savefig:
        ax1.plot(range(1, 1 + len(rvmean)), egval,
                 color='green', lw=2, label='Observed data')

        ax1.fill_between(range(1, 1 + len(rvmean)), rvmean, egval,
                         where=(np.array(rvmean) + np.array(err)) < egval,
                         facecolor='green', interpolate=True, alpha=0.2)
        ax1.fill_between(range(1, 1 + len(rvmean)), rvmean, egval,
                         where=(np.array(rvmean) + np.array(err)) > egval,
                         facecolor='red', interpolate=True, alpha=0.2)
        ax1.set_xlim((0, len(rvmean)))
        ax1.set_ylim((0, max(max(rvmean), max(egval))))
        ax1.legend(frameon=False, loc='upper right', prop={'size': 10})
        ax1.set_xlabel('Log indexes of Eigenvalues')
        ax1.set_ylabel('Eigenvalues (percentage of total)')
        #plt.subplots_adjust(right=0.6)

        #img = Image.open(opts.outdir + '/matrix_small.png')
        #fig.figimage(img, 640, -160)

    mmp = (-0.0002 * size + 0.0335 * skewness - 0.0229 * kurtness +
           0.0069 * sev + 0.8126)

    if verbose:
        sys.stdout.write('\n')
        sys.stdout.write('\n Results\n')
        sys.stdout.write(' -------\n\n')
        sys.stdout.write(' MMP score: %.4f\n\n' % mmp)

    # linear relations (slope, intercept) between MMP score and dSCC: the
    # expected one and the bounds of each confidence interval
    ex_a1, ex_b1 = [0.6975926, 0.2548171]
    supa1, supb1 = [0.69300732000423904, 0.29858572176099613]
    lowa1, lowb1 = [0.70217788900976075, 0.211048473299004]
    scc = (mmp - ex_b1) / ex_a1
    scc_up1 = (mmp - supb1) / supa1
    scc_lw1 = (mmp - lowb1) / lowa1
    if verbose:
        sys.stdout.write((' predicted dSCC is %.3f (%.3f-%.3f ' +
                          '68%% confidence)\n') % (scc, scc_up1, scc_lw1))

    supa75, supb75 = [0.69230778430383244, 0.30526310790548261]
    lowa75, lowb75 = [0.70287742471016734, 0.20437108715451746]
    scc_up75 = (mmp - supb75) / supa75
    scc_lw75 = (mmp - lowb75) / lowa75
    if verbose:
        sys.stdout.write((' (%.3f-%.3f ' +
                          '75%% confidence)\n') % (scc_up75, scc_lw75))

    supa2, supb2 = [0.68855373600821357, 0.34109720480765293]
    lowa2, lowb2 = [0.70663147300578644, 0.16853699025234709]
    scc_up2 = (mmp - supb2) / supa2
    scc_lw2 = (mmp - lowb2) / lowa2
    if verbose:
        sys.stdout.write((' (%.3f-%.3f ' +
                          '95%% confidence)\n') % (scc_up2, scc_lw2))

    if savefig:
        # write the log
        log = ''
        log += ' 1- Matrix size (number of eigenvalues): %s\n' % (len(egval))
        log += " 2- Skewness of the distribution: %0.3f\n" % (skewness)
        log += " 3- Kurtosis of the distribution: %0.3f\n" % (kurtness)
        log += " 4- Sum of differences signif EV real-rand: %0.3f\n\n" % (sev)
        plt.figtext(0.62, 0.77, log, size='small')
        log = "MMP score: %.3f\n" % (mmp)
        log += "Predicted dSCC: %.3f (%.3f-%.3f at 95%% conf)\n" % (
            scc, scc_up2, scc_lw2)
        plt.figtext(0.61, 0.87, log, size=12)
        tadbit_savefig(savefig)
        plt.close('all')

    return mmp, scc, scc_up2, scc_lw2
def _parse_coord(coord):
    """
    Split a coordinate string of the form 'region:start-end'.

    :param coord: coordinate string (e.g. 'chr3:110000000-120000000', or
       just 'chr3')

    :returns: a tuple (region, start, end). When the string does not follow
       the 'region:start-end' pattern with integer positions, the whole
       string is used as region name and start and end are None.
    """
    try:
        region, pos = coord.split(':')
        start, end = pos.split('-')
        return region, int(start), int(end)
    except ValueError:
        return coord, None, None


def run(opts):
    """
    Extract (sub-)matrices from a BAM file, optionally write them as text
    matrices and/or plot them, then register the outputs in the database.

    :param opts: parsed command line options. This function reads: zrange,
       bam, biases, normalizations, workdir, coord1, coord2, quiet, matrix,
       plot, only_plot, row_names, reso, cpus, nchunks, cmap, format,
       interactive and filter.

    :raises Exception: if an external BAM file is given without biases while
       every requested normalization needs them
    """
    # local import: this name is only needed for the final clean-up
    from shutil import rmtree

    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    # an explicitly given biases file always wins
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    # a single coordinate is always handled as the first one
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    region1 = start1 = end1 = None
    region2 = start2 = end2 = None
    if coord1:
        region1, start1, end1 = _parse_coord(coord1)
        if coord2:
            region2, start2, end2 = _parse_coord(coord2)

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            # 'is not None': a start at position 0 is not a full chromosome
            if start1 is not None:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2 is not None:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}
    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references,
                                   [x for x in bamfile.lengths]))
        # cumulative (begin, end) position of each chromosome
        total = 0
        section_pos = dict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else 'NRM'
                           if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                # NOTE(review): 'load' looks like an unpickler reading a
                # user-supplied file -- confirm that the biases file is
                # trusted
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemeted for '
                         'matrices\n... skipping\n')
                    continue
                raise
            # shift bin coordinates so that they start at 0
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                # generator of (region, begin, end) for each bin
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions)
                             for p in range(
                                 starts[r] if r < len(starts) and starts[r]
                                 else 0,
                                 ends[r] if r < len(ends) and ends[r]
                                 else sections[reg],
                                 opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso).replace(' ', ''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                with open(path.join(outdir, fnam), 'w') as out:
                    for reg in regions:
                        out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                    if region2:
                        out.write('# BADROWS %s\n' % (
                            ','.join([str(b) for b in bads1])))
                        out.write('# BADCOLS %s\n' % (
                            ','.join([str(b) for b in bads2])))
                    else:
                        out.write('# MASKED %s\n' % (
                            ','.join([str(b) for b in bads1])))
                    if opts.row_names:
                        out.write('\n'.join(
                            '%s\t%d\t%d\t' % (next(row_names)) +
                            '\t'.join(str(matrix.get((i, j), 0))
                                      for i in xrange(b1, e1))
                            for j in xrange(b2, e2)) + '\n')
                    else:
                        out.write('\n'.join(
                            '\t'.join(str(matrix.get((i, j), 0))
                                      for i in xrange(b1, e1))
                            for j in xrange(b2, e2)) + '\n')
            if opts.plot:
                cmap = plt.get_cmap(opts.cmap)
                if norm != 'raw':
                    cmap.set_bad('grey', 1.)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s.%s' % (norm, name,
                                          nicer(opts.reso).replace(' ', ''),
                                          ('_' + param_hash), opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                if opts.interactive:
                    _ = plt.figure(figsize=(8, 7))
                else:
                    _ = plt.figure(figsize=(16, 14))
                # ax1 = plt.subplot(111)
                ax1 = plt.axes([0.1, 0.1, 0.7, 0.8])
                ax2 = plt.axes([0.82, 0.1, 0.07, 0.8])
                # float dtype: with an integer matrix the half-minimum set
                # below would be truncated (down to 0 for a minimum of 1)
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)], dtype=float)
                # replace zeroes by half of the minimum to allow the log
                mini = np_min(matrix[nonzero(matrix)]) / 2.
                matrix[matrix == 0] = mini
                # mask of the filtered columns and rows
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = log2(ma.masked_array(matrix, m))
                ax1.imshow(matrix, interpolation='None', origin='lower',
                           cmap=cmap, vmin=vmin, vmax=vmax)

                if len(regions) <= 2:
                    pltbeg1 = 0 if start1 is None else start1
                    pltend1 = sections[regions[0]] if end1 is None else end1
                    pltbeg2 = (pltbeg1 if len(regions) == 1 else
                               0 if start2 is None else start2)
                    pltend2 = (pltend1 if len(regions) == 1 else
                               sections[regions[-1]] if end2 is None else end2)

                    ax1.set_xlabel('{}:{:,}-{:,}'.format(
                        regions[0], pltbeg1 if pltbeg1 else 1, pltend1))
                    ax1.set_ylabel('{}:{:,}-{:,}'.format(
                        regions[-1], pltbeg2 if pltbeg2 else 1, pltend2))

                    # tick labels: from bin index to genomic position
                    def format_xticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg1)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    def format_yticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg2)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    ax1.xaxis.set_major_formatter(FuncFormatter(format_xticks))
                    ax1.yaxis.set_major_formatter(FuncFormatter(format_yticks))

                    labels = ax1.get_xticklabels()
                    plt.setp(labels, rotation=-25, ha='left')

                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                else:
                    # major ticks at chromosome boundaries, chromosome names
                    # as minor tick labels in between
                    vals = [0]
                    keys = ['']
                    for crm in regions:
                        vals.append(section_pos[crm][0] / opts.reso)
                        keys.append(crm)
                    vals.append(section_pos[crm][1] / opts.reso)
                    ax1.set_yticks(vals)
                    ax1.set_yticklabels('')
                    ax1.set_yticks([float(vals[i] + vals[i + 1]) / 2
                                    for i in xrange(len(vals) - 1)],
                                   minor=True)
                    ax1.set_yticklabels(keys, minor=True)
                    for t in ax1.yaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False

                    ax1.set_xticks(vals)
                    ax1.set_xticklabels('')
                    ax1.set_xticks([float(vals[i] + vals[i + 1]) / 2
                                    for i in xrange(len(vals) - 1)],
                                   minor=True)
                    ax1.set_xticklabels(keys, minor=True)
                    for t in ax1.xaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xlabel('Chromosomes')
                    ax1.set_ylabel('Chromosomes')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                # side panel: histogram of values over the color gradient
                data = [i for d in matrix for i in d if isfinite(i)]
                mindata = nanmin(data)
                maxdata = nanmax(data)
                gradient = linspace(maxdata, mindata,
                                    max((len(matrix), len(matrix[0]))))
                gradient = dstack((gradient, gradient))[0]
                # NOTE(review): 'normed' was replaced by 'density' in recent
                # matplotlib versions -- confirm the supported version
                h = ax2.hist(data, color='darkgrey', linewidth=2,
                             orientation='horizontal',
                             bins=50, histtype='step', normed=True)
                _ = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                               extent=(0, max(h[0]), mindata, maxdata))
                ax2.yaxis.tick_right()
                ax2.yaxis.set_label_position("right")
                ax2.set_xticks([])
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' % (
                    name, norm, nicer(opts.reso)))
                ax2.set_ylabel('Hi-C Log2 interactions', rotation=-90)
                ax2.set_xlabel('Count')
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            load(open(biases)) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        # no shell involved: safe with any character in the path
        rmtree(tmpdir, ignore_errors=True)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6,
                           savefig=None, show=False, savedata=None, **kwargs):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors (the ones with largest eigenvalues), with the absolute
    value of the Pearson correlation. Matrices are log2 transformed first.

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where to write the matrix of
       correlations (tab separated)
    :param kwargs: any argument to pass to matplotlib imshow function

    :returns: matrix of correlations (list of lists); element [i][j] compares
       the i-th eigenvector of the first experiment with the j-th
       eigenvector of the second
    """
    data1 = hic_data1.get_matrix()
    data2 = hic_data2.get_matrix()
    # get the log
    data1 = nozero_log(data1, np.log2)
    data2 = nozero_log(data2, np.log2)
    # get the eigenvectors
    ev1, evect1 = eigh(data1)
    ev2, evect2 = eigh(data2)
    # sort eigenvectors according to their eigenvalues => first is last!!
    # eigenvectors are the COLUMNS of the matrices returned by eigh
    sort_perm = ev1.argsort()
    ev1 = ev1[sort_perm]
    evect1 = evect1[:, sort_perm]
    sort_perm = ev2.argsort()
    ev2 = ev2[sort_perm]
    evect2 = evect2[:, sort_perm]
    # calculate Pearson correlation
    corr = [[abs(pearsonr(evect1[:, -i - 1], evect2[:, -j - 1])[0])
             for j in range(nvect)] for i in range(nvect)]
    # plot (axes are only created when needed, to not leave a figure open)
    if show or savefig:
        axe = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        im = axe.imshow(corr, interpolation="nearest", origin='lower',
                        **kwargs)
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        # one label per tick
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        axe.set_xticklabels([str(k) for k in range(1, nvect + 1)])
        axe.set_yticklabels([str(k) for k in range(1, nvect + 1)])
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)

        cbar = plt.colorbar(im, cax=cbaxes)
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        # secondary axes showing the corresponding eigenvalues
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)

        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)

        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        with open(savedata, 'w') as out:
            out.write('# ' + '\t'.join(['Eigen Vector %s' % i
                                        for i in range(nvect)]) + '\n')
            for i in range(nvect):
                out.write('\t'.join([str(corr[i][j])
                                     for j in range(nvect)]) + '\n')
    return corr
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6,
                           savefig=None, show=False, savedata=None):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors (the ones with largest eigenvalues), with the absolute
    value of the Pearson correlation.

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where to write the matrix of
       correlations (tab separated)

    :returns: matrix of correlations (list of lists); element [i][j] compares
       the i-th eigenvector of the first experiment with the j-th
       eigenvector of the second
    """
    bins1 = range(len(hic_data1))
    bins2 = range(len(hic_data2))
    ev1, evect1 = eigh(np.array([[hic_data1[i, j] for j in bins1]
                                 for i in bins1]))
    ev2, evect2 = eigh(np.array([[hic_data2[i, j] for j in bins2]
                                 for i in bins2]))
    # eigenvectors are the COLUMNS of the matrices returned by eigh: reorder
    # columns by decreasing eigenvalue, so that column 0 is the first
    # eigenvector. Eigenvalues are kept in increasing order (labels below
    # read them from the end).
    evect1 = evect1[:, ev1.argsort()[::-1]]
    ev1.sort()
    evect2 = evect2[:, ev2.argsort()[::-1]]
    ev2.sort()
    corr = [[abs(pearsonr(evect1[:, i], evect2[:, j])[0])
             for j in range(nvect)] for i in range(nvect)]
    # axes are only created when needed, to not leave a figure open
    if show or savefig:
        axe = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        im = axe.imshow(corr, interpolation="nearest", origin='lower')
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        # tick positions and labels are set separately (the second
        # positional argument of set_xticklabels is not the list of labels)
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        axe.set_xticklabels([str(k) for k in range(1, nvect + 1)])
        axe.set_yticklabels([str(k) for k in range(1, nvect + 1)])
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)

        cbar = plt.colorbar(im, cax=cbaxes)
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        # secondary axes showing the corresponding eigenvalues
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)

        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)

        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        with open(savedata, 'w') as out:
            out.write('# ' + '\t'.join(['Eigen Vector %s' % i
                                        for i in range(nvect)]) + '\n')
            for i in range(nvect):
                out.write('\t'.join([str(corr[i][j])
                                     for j in range(nvect)]) + '\n')
    return corr
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 xlog=False):
    """
    Plots the distribution of dangling-ends lengths

    :param fnam: input file name (tab separated, header lines start with
       '#')
    :param None savefig: path where to store the output images.
    :param None nreads: limit on the amount of data parsed: reading stops
       once half this number of dangling-ends were collected
    :param 99.9 max_size: top percentage of distances to consider, within the
       top 0.1% are usually found very long outliers.
    :param None axe: an existing matplotlib axe where to draw the plot
       (passed to setup_plot)
    :param False xlog: represent x axis in logarithmic scale
    """
    # integer division: the limit is compared to a number of items
    limit = nreads // 2 if nreads else None
    des = []
    with open(fnam) as fhandler:
        # skip header; an empty string marks the end of the file
        line = next(fhandler, '')
        while line.startswith('#'):
            line = next(fhandler, '')
        while line:
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            # candidates: both ends in the same fragment of the same
            # chromosome, with opposite strands
            if re1 == re2 and crm1 == crm2 and dir1 != dir2:
                pos1, pos2 = int(pos1), int(pos2)
                # kept when the relative position of the second end agrees
                # with the strand flag of the first one
                if (pos2 > pos1) == int(dir1):
                    des.append(abs(pos2 - pos1))
                if limit and len(des) >= limit:
                    break
            line = next(fhandler, '')

    ax = setup_plot(axe, figsize=(10, 5.5))

    max_perc = np.percentile(des, max_size)
    perc99 = np.percentile(des, 99)
    perc01 = np.percentile(des, 1)
    perc95 = np.percentile(des, 95)
    perc05 = np.percentile(des, 5)
    # shaded areas: 1-5% and 95-99% (dark), 5-95% (light)
    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc),
            alpha=.7, color='darkred', label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lenghts\n(top %.1f%%, up to %0.f nts)' % (max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')
def draw_map(data, genome_seq, cumcs, savefig, show, one=False, clim=None,
             cmap="jet", decay=False, perc=10, name=None, cistrans=None,
             decay_resolution=10000, normalized=False, max_diff=None):
    """
    Draws a square interaction matrix (log2 scale) together with the
    histogram of its values, its three last eigenvectors and, optionally, the
    decay of interactions with distance.

    NOTE: another function called ``draw_map`` is defined further down in
    this file; as it comes later, it replaces this one when the module is
    loaded.

    :param data: square matrix of interaction counts (indexed as
       ``data[i][j]``)
    :param genome_seq: iterable of chromosome names (may be empty/None)
    :param cumcs: for each chromosome name, a pair of positions (in bins)
       used to draw the limits of the chromosome
    :param savefig: path where to save the figure
    :param show: displays the figure if ``savefig`` is not set
    :param False one: if True, chromosome names are not written on the y axis
    :param None clim: pair (min, max) of values for the color scale
    :param 'jet' cmap: name of a matplotlib colormap, or 'tadbit'
    :param False decay: also plots interactions versus genomic distance
    :param 10 perc: number of cuts used to build the 'tadbit' colormap
    :param None name: text written at the top of the summary
    :param None cistrans: proportion of cis interactions, written in the
       summary when given (None or NaN: line omitted)
    :param 10000 decay_resolution: passed to plot_distance_vs_interactions
    :param False normalized: passed to plot_distance_vs_interactions
    :param None max_diff: passed to plot_distance_vs_interactions (default:
       number of rows of the matrix)
    """
    _ = plt.figure(figsize=(15.0, 12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(data, genome_seq=genome_seq, axe=ax3,
                                      resolution=decay_resolution,
                                      max_diff=max_diff,
                                      normalized=normalized)
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)

    # statistics on the raw matrix, before log transformation
    try:
        minoridata = np.nanmin(data)
        maxoridata = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata = np.min(vals)
        maxoridata = np.max(vals)
    totaloridata = np.nansum([data[i][j] for i in xrange(len(data))
                              for j in xrange(i, len(data))])
    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]

    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata
    if clim and clim[0] is not None:
        posI = float(clim[0]) / diff
    else:
        posI = 0.01
    if clim and clim[1] is not None:
        posF = float(clim[1]) / diff
    else:
        posF = 1.0
    if cmap == "tadbit":
        cuts = perc
        cdict = {"red"  : [(0.0, 0.0, 0.0)],
                 "green": [(0.0, 0.0, 0.0)],
                 "blue" : [(0.0, 0.5, 0.5)]}
        prev_pos = 0
        median = (np.median(vals) - mindata) / diff
        # first half of the cuts: from blue to white, up to the median
        for prc in np.linspace(posI, median, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.0) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1.0 / cuts
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict["red"  ].append([pos, prc, prc])
            cdict["green"].append([pos, prc, prc])
            cdict["blue" ].append([pos, 1, 1])
            prev_pos = pos
        # second half of the cuts: from white to red, above the median
        for prc in np.linspace(median + 1.0 / cuts, posF, cuts // 2,
                               endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.0) - mindata) / diff
                prc = (prc - median) / (posF - median)
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict["red"  ].append([pos, 1.0, 1.0])
            cdict["green"].append([pos, 1 - prc, 1 - prc])
            cdict["blue" ].append([pos, 1 - prc, 1 - prc])
            prev_pos = pos
        pos = (np.percentile(vals, 97.0) - mindata) / diff
        cdict["red"  ].append([pos, 0.1, 0.1])
        cdict["green"].append([pos, 0, 0])
        cdict["blue" ].append([pos, 0, 0])

        cdict["red"  ].append([1.0, 1, 1])
        cdict["green"].append([1.0, 1, 1])
        cdict["blue" ].append([1.0, 0, 0])
        cmap = LinearSegmentedColormap(cmap, cdict)
        # the custom colormap already encodes the limits
        clim = None
    else:
        cmap = plt.get_cmap(cmap)
    cmap.set_bad("darkgrey", 1)

    ax1.imshow(data, interpolation="none", cmap=cmap,
               vmin=clim[0] if clim else None,
               vmax=clim[1] if clim else None)
    size = len(data)
    # eigen-decomposition needs finite values: NaNs are replaced by zeros
    for i in xrange(size):
        for j in xrange(i, size):
            if np.isnan(data[i][j]):
                data[i][j] = 0
                data[j][i] = 0
    evals, evect = eigh(data)
    # eigenvectors are stored as columns: columns are what has to follow the
    # order of the eigenvalues (reordering rows would shuffle the components
    # of every eigenvector)
    sort_perm = evals.argsort()
    evect = evect[:, sort_perm]

    data = [i for d in data for i in d if not np.isnan(i)]
    lowest = np.nanmin(data)
    highest = np.nanmax(data)
    gradient = np.linspace(lowest, highest, size)
    gradient = np.vstack((gradient, gradient))
    h = ax2.hist(data, color="darkgrey", linewidth=2, bins=20,
                 histtype="step", normed=True)
    _ = ax2.imshow(gradient, aspect="auto", cmap=cmap,
                   extent=(lowest, highest, 0, max(h[0])))
    if genome_seq:
        for crm in genome_seq:
            beg = cumcs[crm][0] - 0.5
            end = cumcs[crm][1] - 0.5
            # white solid lines underneath, black dashed lines on top
            ax1.vlines([beg, end], beg, end, color="w", linestyle="-",
                       linewidth=1, alpha=1)
            ax1.hlines([end, beg], beg, end, color="w", linestyle="-",
                       linewidth=1, alpha=1)
            ax1.vlines([beg, end], beg, end, color="k", linestyle="--")
            ax1.hlines([end, beg], beg, end, color="k", linestyle="--")
        if not one:
            # major ticks at chromosome limits (no label), chromosome names
            # on minor ticks placed half way between two limits
            vals = [0]
            keys = [""]
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])  # end of the last chromosome
            ax1.set_yticks(vals)
            ax1.set_yticklabels("")
            ax1.set_yticks([float(vals[i] + vals[i + 1]) / 2
                            for i in xrange(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False
    # np.isnan does not accept None, which is the default value of cistrans
    if cistrans is None or np.isnan(cistrans):
        cistrans_txt = ""
    else:
        cistrans_txt = ("Percentage of cis interactions: %.0f%%\n"
                        % (cistrans * 100))
    plt.figtext(0.05, 0.25, "".join([
        (name + "\n") if name else "",
        "Number of interactions: %s\n" % str(totaloridata),
        cistrans_txt,
        "Min interactions: %s\n" % (minoridata),
        "Max interactions: %s\n" % (maxoridata)]))
    ax2.set_xlim((lowest, highest))
    ax2.set_ylim((0, max(h[0])))
    ax1.set_xlim((-0.5, size - 0.5))
    ax1.set_ylim((-0.5, size - 0.5))
    ax2.set_xlabel("log interaction count")
    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d * 100)) / 100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, "w.", markersize=2.5, alpha=0.4)
    ax2.plot(subdata, normfit, "k.", markersize=1.5, alpha=1)
    ax2.set_title("skew: %.3f, kurtosis: %.3f" % (skew(data),
                                                   kurtosis(data)))
    # eigenvectors of the three largest eigenvalues
    ax4.vlines(range(size), 0, evect[:, -1], color="k")
    ax4.hlines(0, 0, size, color="red")
    ax4.set_ylabel("E1")
    ax4.set_yticklabels([])
    try:
        ax5.vlines(range(size), 0, evect[:, -2], color="k")
    except IndexError:
        pass
    ax5.hlines(0, 0, size, color="red")
    ax5.set_ylabel("E2")
    ax5.set_yticklabels([])
    try:
        ax6.vlines(range(size), 0, evect[:, -3], color="k")
    except IndexError:
        pass
    ax6.hlines(0, 0, size, color="red")
    ax6.set_ylabel("E3")
    ax6.set_yticklabels([])
    xticklabels = (ax4.get_xticklabels() + ax5.get_xticklabels() +
                   ax6.get_xticklabels())
    plt.setp(xticklabels, visible=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
    plt.close("all")
def quality_plot(fnam, r_enz=None, nreads=float('inf'), axe=None, savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction
    enzyme (RE) name is provided, can also represent the distribution of
    digested and undigested RE sites and estimate an expected proportion of
    dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file (plain, '.gz' or '.dsrc')
    :param None r_enz: name of a restriction enzyme, or list of names (case
       insensitive). The list passed is not modified.
    :param inf nreads: max number of reads to read, not necessary to read all
       (None, 0 or infinity: read the whole file)
    :param None axe: a matplotlib axes object where to draw the quality plot
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :raises ValueError: if no read can be parsed from the input file

    :returns: the percentage of dangling-ends (sensu stricto) and the
       percentage of reads with at least a ligation site, as two
       dictionaries, the first one with restriction enzyme names as keys, the
       second with pairs of restriction enzyme names (both are empty when no
       restriction enzyme is given).
    """
    def count_positions(positions, length):
        """
        Number of occurrences of each position from 0 to length - 1 (single
        pass, instead of one list.count call per position)
        """
        counts = [0] * length
        for pos in positions:
            if pos < length:
                counts[pos] += 1
        return counts

    # FASTQ quality characters, from '!' (33) to '~' (126), are PHRED 0 to 93
    phred = dict((chr(score + 33), score) for score in range(94))
    if isinstance(r_enz, (list, tuple)):
        r_enzs = list(r_enz)
    elif isinstance(r_enz, str):
        r_enzs = [r_enz]
    else:
        r_enzs = []
    # use the official spelling of enzyme names (search is case insensitive),
    # unknown names are kept as they are
    known = dict((enz.lower(), enz) for enz in RESTRICTION_ENZYMES)
    r_enzs = [known.get(enz.lower(), enz) for enz in r_enzs]

    quals = []
    henes = []
    sites = {}
    fixes = {}
    liges = {}
    ligep = {}
    des = {}
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)

    if r_enzs:
        r_sites = {}
        d_sites = {}
        for r_enz in r_enzs:
            r_sites[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
            d_sites[r_enz] = repaired(r_enz)
            sites[r_enz] = []  # initialize dico to store undigested sites
            fixes[r_enz] = []  # initialize dico to store digested sites
        l_sites = religateds(r_enzs)
        site = {}
        fixe = {}
        for r_enz in r_enzs:
            site[r_enz] = re.compile(r_sites[r_enz])
            fixe[r_enz] = re.compile(d_sites[r_enz])
        # ligation sites should appear in lower case in the sequence
        lige = {}
        for k in l_sites:
            liges[k] = []  # initialize dico to store sites
            ligep[k] = 0   # initialize dico to store sites
            l_sites[k] = l_sites[k].lower()
            lige[k] = re.compile(l_sites[k])

    max_reads = nreads if nreads else float('inf')
    try:
        while len(quals) < max_reads:
            # a FASTQ record is made of 4 lines; next() raises StopIteration
            # at the end of the file, an incomplete last record is skipped
            next(fhandler)
            seq = next(fhandler)
            next(fhandler)
            line = next(fhandler)
            if r_enzs:
                # ligation sites replaced by lower case to ease the search
                for lig in l_sites.values():
                    seq = seq.replace(lig.upper(), lig)
                for r_enz in r_enzs:
                    sites[r_enz].extend([m.start() for m
                                         in site[r_enz].finditer(seq)])
                    # TODO: you cannot have a repaired/fixed site in the
                    # middle of the sequence, this could be only checked at
                    # the beginning
                    fixes[r_enz].extend([m.start() for m
                                         in fixe[r_enz].finditer(seq)])
                for k in lige:  # for each paired of cut-site
                    liges[k].extend([m.start() for m
                                     in lige[k].finditer(seq)])
                    ligep[k] += l_sites[k] in seq
            # store the number of Ns found in the sequences
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            quals.append([phred[i] for i in line.strip()])
    except StopIteration:
        pass
    finally:
        fhandler.close()
    if not quals:
        raise ValueError('ERROR: no reads found in %s' % fnam)
    # percentages are relative to the number of reads actually parsed
    nreads = len(quals)

    quals = izip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = zip(*[(nanmean(q), nanstd(q)) for q in quals])
    max_seq_len = len(meanquals)
    half_len = max_seq_len // 2

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if r_enzs:  # do both plots
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        else:  # only do the quality_plot plot
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')

    ax.errorbar(range(max_seq_len), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # quality_plot plot
    axb.plot(count_positions(henes, max_seq_len), linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if r_enzs:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            ', '.join(r_enzs), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        # seq_len is the length of the line to plot. we don't want to plot
        # if there is no room for the cut-site, or ligation site.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                        max([len(l_sites[k]) for k in l_sites]),
                        max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len

        # transform dictionaries of positions into dictionaries of counts
        for r_enz in sites:
            sites[r_enz] = count_positions(sites[r_enz], seq_len)  # Undigested
            fixes[r_enz] = count_positions(fixes[r_enz], seq_len)  # DE
        for r1, r2 in liges:
            liges[(r1, r2)] = count_positions(liges[(r1, r2)], seq_len)  # OK

        # in case the pattern of the repaired cut-site contains the target
        # cut-site pattern. These sites were counted twice, once in the
        # undigested, and once in the repaired. We remove them from the
        # repaired:
        for r_enz in r_enzs:
            if d_sites[r_enz] in r_sites[r_enz]:
                pos = r_sites[r_enz].find(d_sites[r_enz])
                fixes[r_enz] = (fixes[r_enz][:pos] +
                                [fixes[r_enz][k] - sites[r_enz][k - pos]
                                 for k in xrange(pos, seq_len)])
        # same for ligated sites
        # NOTE(review): values of l_sites were converted to lower case when
        # reading, confirm that d_sites can match them here
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                if d_sites[r_enz1] not in l_sites[(r_enz1, r_enz2)]:
                    continue
                pos = l_sites[(r_enz1, r_enz2)].find(d_sites[r_enz1])
                fixes[r_enz1] = (fixes[r_enz1][:pos] +
                                 [fixes[r_enz1][k] -
                                  liges[(r_enz1, r_enz2)][k - pos]
                                  for k in xrange(pos, seq_len)])

        # remove anything that could be in between the two read ends
        if paired:
            gap = [float('nan')] * site_len
            for k in sites:
                sites[k][half_len - site_len: half_len] = list(gap)
                fixes[k][half_len - site_len: half_len] = list(gap)
            for k in liges:
                liges[k][half_len - site_len: half_len] = list(gap)

        # plot undigested cut-sites
        color = iter(plt.cm.Reds(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in sites:
            if any([f > 0 for f in fixes[r_enz]]):
                label = 'Undigested RE site (%s: %s)' % (r_enz,
                                                         r_sites[r_enz])
            else:
                label = 'Undigested & Dangling-Ends (%s: %s)' % (
                    r_enz, r_sites[r_enz])
            ax2.plot(sites[r_enz], linewidth=2, color=next(color),
                     alpha=0.9, label=label)
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)

        lines, labels = ax2.get_legend_handles_labels()

        # plot ligation sites on a second y axis
        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            ax3.plot(liges[(r1, r2)], linewidth=2, color=next(color),
                     alpha=0.9,
                     label='Ligated (%s-%s: %s)' % (
                         r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')

        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)

        # plot dangling-ends on a third y axis, shifted to the right
        color = iter(plt.cm.Greens(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                ax4.plot(fixes[r_enz], linewidth=2, color=next(color),
                         alpha=0.9,
                         label='Dangling-ends (%s: %s)' % (r_enz,
                                                           d_sites[r_enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))

        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] -
                          liges[k][half_len])

        # Count undigested sites
        sit_cnt = {}
        for r_enz in r_enzs:
            sit_cnt[r_enz] = (nansum(sites[r_enz]) - sites[r_enz][0] -
                              sites[r_enz][half_len])

        # Count Dangling-Ends
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                counts = fixes[r_enz]
            else:
                counts = sites[r_enz]
            des[r_enz] = (100. * (counts[0] +
                                  (counts[half_len] if paired else 0))
                          ) / nreads

        # Decorate plot
        title = ''
        for r_enz in r_enzs:
            lcnt = float(sum([lig_cnt[(r_enz1, r_enz2)] *
                              (2 if r_enz1 == r_enz2 else 1)
                              for r_enz1 in r_enzs for r_enz2 in r_enzs
                              if r_enz1 == r_enz or r_enz2 == r_enz]))
            title += ('Percentage of digested sites (not considering '
                      'Dangling-Ends) '
                      '%s: %.1f%%\n' % (r_enz, 100. * float(lcnt) /
                                        (lcnt + sit_cnt[r_enz])))
        for r_enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (
                r_enz, des[r_enz])

        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                title += ('Percentage of reads with ligation site (%s-%s): '
                          '%.1f%% \n' % (
                              r_enz1, r_enz2,
                              (ligep[(r_enz1, r_enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines, labels, bbox_to_anchor=(0.75, 1.0),
                   loc=3, borderaxespad=0., frameon=False, fontsize=9)
    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    return des, ligep
def plot_genomic_distribution(fnam, first_read=True, resolution=10000,
                              axe=None, ylim=None, savefig=None,
                              chr_names=None, nreads=None):
    """
    Plots, for each chromosome, the number of reads found in each genomic bin
    of a given size.

    :param fnam: input file name
    :param True first_read: uses first read.
    :param 10000 resolution: group reads that are closer than this resolution
       parameter
    :param None axe: if given, no new figure is created and the result is not
       displayed
    :param None ylim: pair of values (min, max) for the y axis of each plot
       (default is from zero to the highest count found)
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param None chr_names: can pass a list of chromosome names in case only
       some them the need to be plotted (this option may last even more than
       default). Chromosomes are plotted in the order given.
    :param None nreads: maximum number of reads (lines of the input file) to
       take into account

    :raises ValueError: if no read is found for the wanted chromosomes
    """
    distr = {}
    idx1, idx2 = (1, 3) if first_read else (7, 9)
    genome_seq = OrderedDict()
    # list to keep the order asked by the user (without repetitions), and set
    # for fast search
    wanted = []
    for crm in chr_names or []:
        if crm not in wanted:
            wanted.append(crm)
    wanted_set = set(wanted)

    count = 0
    # the context manager closes the file even when a line fails to parse
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                # only the leading block of '#' lines is treated as header
                if line.startswith('#'):
                    if line.startswith('# CRM '):
                        crm, clen = line[6:].split('\t')
                        genome_seq[crm] = int(clen)
                    continue
                in_header = False
            if nreads and count >= nreads:
                break
            count += 1
            crm, pos = line.strip().split('\t')[idx1:idx2]
            if wanted_set and crm not in wanted_set:
                continue
            pos = int(pos) // resolution
            bins = distr.setdefault(crm, {})
            bins[pos] = bins.get(pos, 0) + 1
    if not distr:
        raise ValueError('ERROR: no reads found in %s' % fnam)

    to_plot = wanted or list(genome_seq) or list(distr)
    ncrms = len(to_plot)
    if not axe:
        _ = plt.figure(figsize=(15, 3 + 3 * ncrms))

    max_y = max([max(distr[c].values()) for c in distr])
    # NOTE(review): this is the largest number of populated bins, not the
    # largest bin index -- confirm it is the intended limit for the x axis
    max_x = max([len(distr[c].values()) for c in distr])
    for i, crm in enumerate(to_plot):
        plt.subplot(ncrms, 1, i + 1)
        bins = distr.get(crm)
        if bins:
            # + 1 in order to include the last populated bin
            nbins = max(bins) + 1
            plt.plot(range(nbins), [bins.get(j, 0) for j in xrange(nbins)],
                     color='red', lw=1.5, alpha=0.7)
        # end of the chromosome, only known if declared in the file header
        if crm in genome_seq:
            if ylim:
                plt.vlines(genome_seq[crm] // resolution, ylim[0], ylim[1])
            else:
                plt.vlines(genome_seq[crm] // resolution, 0, max_y)
        plt.xlim((0, max_x))
        plt.ylim(ylim or (0, max_y))
        plt.title(crm)

    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')
def draw_map(data, genome_seq, cumcs, savefig, show, one=False, clim=None,
             cmap='jet', decay=False, perc=10, name=None, cistrans=None,
             decay_resolution=10000, normalized=False, max_diff=None):
    """
    Draws an interaction matrix (log2 scale) together with the histogram of
    its values, its three last eigenvectors (when they can be computed) and,
    optionally, the decay of interactions with distance.

    :param data: matrix of interaction counts (indexed as ``data[i][j]``),
       may not be square
    :param genome_seq: iterable of chromosome names (may be empty/None)
    :param cumcs: for each chromosome name, a pair of positions (in bins)
       used to draw the limits of the chromosome
    :param savefig: path where to save the figure
    :param show: displays the figure if ``savefig`` is not set
    :param False one: if True, chromosome names are not written on the y axis
    :param None clim: pair (min, max) of values for the color scale
    :param 'jet' cmap: name of a matplotlib colormap, or 'tadbit'
    :param False decay: also plots interactions versus genomic distance
    :param 10 perc: number of cuts used to build the 'tadbit' colormap
    :param None name: text written at the top of the summary
    :param None cistrans: proportion of cis interactions, written in the
       summary when given (None or NaN: line omitted)
    :param 10000 decay_resolution: passed to plot_distance_vs_interactions
    :param False normalized: passed to plot_distance_vs_interactions
    :param None max_diff: passed to plot_distance_vs_interactions (default:
       number of rows of the matrix)
    """
    _ = plt.figure(figsize=(15., 12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(data, genome_seq=genome_seq, axe=ax3,
                                      resolution=decay_resolution,
                                      max_diff=max_diff,
                                      normalized=normalized)
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)

    # statistics on the raw matrix, before log transformation
    try:
        minoridata = np.nanmin(data)
        maxoridata = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata = np.min(vals)
        maxoridata = np.max(vals)
    totaloridata = np.nansum([data[i][j] for i in xrange(len(data))
                              for j in xrange(i, len(data[i]))])  # may not be square
    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]

    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata
    if clim and clim[0] is not None:
        posI = float(clim[0]) / diff
    else:
        posI = 0.01
    if clim and clim[1] is not None:
        posF = float(clim[1]) / diff
    else:
        posF = 1.0
    if cmap == 'tadbit':
        cuts = perc
        cdict = {'red'  : [(0.0, 0.0, 0.0)],
                 'green': [(0.0, 0.0, 0.0)],
                 'blue' : [(0.0, 0.5, 0.5)]}
        prev_pos = 0
        median = (np.median(vals) - mindata) / diff
        # first half of the cuts: from blue to white, up to the median
        for prc in np.linspace(posI, median, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1. / cuts
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, prc, prc])
            cdict['green'].append([pos, prc, prc])
            cdict['blue' ].append([pos, 1, 1])
            prev_pos = pos
        # second half of the cuts: from white to red, above the median
        for prc in np.linspace(median + 1. / cuts, posF, cuts // 2,
                               endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - median) / (posF - median))
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, 1.0, 1.0])
            cdict['green'].append([pos, 1 - prc, 1 - prc])
            cdict['blue' ].append([pos, 1 - prc, 1 - prc])
            prev_pos = pos
        pos = (np.percentile(vals, 97.) - mindata) / diff
        cdict['red'  ].append([pos, 0.1, 0.1])
        cdict['green'].append([pos, 0, 0])
        cdict['blue' ].append([pos, 0, 0])

        cdict['red'  ].append([1.0, 1, 1])
        cdict['green'].append([1.0, 1, 1])
        cdict['blue' ].append([1.0, 0, 0])
        cmap = LinearSegmentedColormap(cmap, cdict)
        # the custom colormap already encodes the limits
        clim = None
    else:
        cmap = plt.get_cmap(cmap)
    cmap.set_bad('darkgrey', 1)

    ax1.imshow(data, interpolation='none', cmap=cmap,
               vmin=clim[0] if clim else None,
               vmax=clim[1] if clim else None)
    size1 = len(data)     # number of rows
    size2 = len(data[0])  # number of columns
    # eigen-decomposition needs finite values: NaNs are replaced by zeros
    if size1 == size2:
        for i in xrange(size1):
            for j in xrange(i, size2):
                if np.isnan(data[i][j]):
                    data[i][j] = 0
                    data[j][i] = 0
    else:
        for i in xrange(size1):
            for j in xrange(size2):
                if np.isnan(data[i][j]):
                    data[i][j] = 0
    try:
        evals, evect = eigh(data)
        # eigenvectors are stored as columns: columns are what has to follow
        # the order of the eigenvalues (reordering rows would shuffle the
        # components of every eigenvector)
        sort_perm = evals.argsort()
        evect = evect[:, sort_perm]
    except Exception:
        # best effort (e.g. matrix is not square): eigenvectors are just not
        # plotted. Contrary to a bare except, this does not hide
        # KeyboardInterrupt or SystemExit
        evals, evect = None, None

    data = [i for d in data for i in d if not np.isnan(i)]
    lowest = np.nanmin(data)
    highest = np.nanmax(data)
    gradient = np.linspace(lowest, highest, max(size1, size2))
    gradient = np.vstack((gradient, gradient))
    h = ax2.hist(data, color='darkgrey', linewidth=2, bins=20,
                 histtype='step', normed=True)
    _ = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                   extent=(lowest, highest, 0, max(h[0])))
    if genome_seq:
        for crm in genome_seq:
            beg = cumcs[crm][0] - .5
            end = cumcs[crm][1] - .5
            # white solid lines underneath, black dashed lines on top
            ax1.vlines([beg, end], beg, end, color='w', linestyle='-',
                       linewidth=1, alpha=1)
            ax1.hlines([end, beg], beg, end, color='w', linestyle='-',
                       linewidth=1, alpha=1)
            ax1.vlines([beg, end], beg, end, color='k', linestyle='--')
            ax1.hlines([end, beg], beg, end, color='k', linestyle='--')
        if not one:
            # major ticks at chromosome limits (no label), chromosome names
            # on minor ticks placed half way between two limits
            vals = [0]
            keys = ['']
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])  # end of the last chromosome
            ax1.set_yticks(vals)
            ax1.set_yticklabels('')
            ax1.set_yticks([float(vals[i] + vals[i + 1]) / 2
                            for i in xrange(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False
    # np.isnan does not accept None, which is the default value of cistrans
    if cistrans is None or np.isnan(cistrans):
        cistrans_txt = ''
    else:
        cistrans_txt = ('Percentage of cis interactions: %.0f%%\n'
                        % (cistrans * 100))
    plt.figtext(0.05, 0.25, ''.join([
        (name + '\n') if name else '',
        'Number of interactions: %s\n' % str(totaloridata),
        cistrans_txt,
        'Min interactions: %s\n' % (minoridata),
        'Max interactions: %s\n' % (maxoridata)]))
    ax2.set_xlim((lowest, highest))
    ax2.set_ylim((0, max(h[0])))
    # imshow draws columns along the x axis and rows along the y axis
    ax1.set_xlim((-0.5, size2 - .5))
    ax1.set_ylim((-0.5, size1 - .5))
    ax2.set_xlabel('log interaction count')
    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d * 100)) / 100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, 'w.', markersize=2.5, alpha=.4)
    ax2.plot(subdata, normfit, 'k.', markersize=1.5, alpha=1)
    ax2.set_title('skew: %.3f, kurtosis: %.3f' % (skew(data),
                                                   kurtosis(data)))
    # eigenvectors of the three largest eigenvalues; TypeError is raised
    # when eigenvectors could not be computed (evect is None)
    try:
        ax4.vlines(range(size1), 0, evect[:, -1], color='k')
    except (TypeError, IndexError):
        pass
    ax4.hlines(0, 0, size2, color='red')
    ax4.set_ylabel('E1')
    ax4.set_yticklabels([])
    try:
        ax5.vlines(range(size1), 0, evect[:, -2], color='k')
    except (TypeError, IndexError):
        pass
    ax5.hlines(0, 0, size2, color='red')
    ax5.set_ylabel('E2')
    ax5.set_yticklabels([])
    try:
        ax6.vlines(range(size1), 0, evect[:, -3], color='k')
    except (TypeError, IndexError):
        pass
    ax6.hlines(0, 0, size2, color='red')
    ax6.set_ylabel('E3')
    ax6.set_yticklabels([])
    xticklabels = (ax4.get_xticklabels() + ax5.get_xticklabels() +
                   ax6.get_xticklabels())
    plt.setp(xticklabels, visible=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
    plt.close('all')
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 show=False, xlog=False, stats=('median', 'perc_max')):
    """
    Plots the distribution of dangling-ends lengths

    :param fnam: input file name
    :param None savefig: path where to store the output images.
    :param None nreads: if given, reading stops once ``nreads / 2``
       dangling-ends have been collected
    :param 99.9 max_size: top percentage of distances to consider, within the
       top 0.01% are usually found very long outliers.
    :param None axe: handed to ``setup_plot``
    :param False show: displays the plot (only if no ``axe`` is given). If
       none of ``savefig``, ``axe`` and ``show`` is set, nothing is plotted.
    :param False xlog: represent x axis in logarithmic scale
    :param ('median', 'perc_max') stats: returns this set of values calculated
       from the distribution of insert/fragment sizes. Possible values are:

        - 'median' median of the distribution
        - 'perc_max' percentil defined by the other parameter 'max_size'
        - 'first_decay' starting from the median of the distribution to the
          first window where 10 consecutive insert sizes are counted less
          than a given value (this given value is equal to the sum of all
          sizes divided by 100 000)
        - 'MAD' Double Median Adjusted Deviation

    :raises ValueError: if no dangling-end is found in the input file
    :raises Exception: if 'first_decay' is asked for in ``stats`` but no
       such window is found

    :returns: the list of values asked for in ``stats``, in the same order.
    """
    # local import: Counter may not be imported at the top of this file
    from collections import Counter

    if nreads:
        # explicit floor division: the limit is compared to a list length
        nreads //= 2
    des = []
    # the context manager closes the file even when a line fails to parse
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                # only the leading block of '#' lines is treated as header
                if line.startswith('#'):
                    continue
                in_header = False
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1 == re2 and crm1 == crm2 and dir1 != dir2:
                pos1, pos2 = int(pos1), int(pos2)
                if (pos2 > pos1) == int(dir1):
                    des.append(abs(pos2 - pos1))
                if len(des) == nreads:
                    break
    if not des:
        raise ValueError('ERROR: no dangling-ends found in %s' % fnam)

    perc01, perc05, perc50, perc95, perc99, max_perc = np.percentile(
        des, [1, 5, 50, 95, 99, max_size])

    to_return = {'median': perc50}
    # search, from the median, for the first 10 consecutive sizes that are
    # each found less than 'cutoff' times (sizes are counted once, not once
    # per candidate size)
    cutoff = len(des) / 100000.
    size_counts = Counter(des)
    low_run = 0
    for size in xrange(int(perc50), int(max(des))):
        if size_counts[size] < cutoff:
            low_run += 1
        else:
            low_run = 0
        if low_run >= 10:
            to_return['first_decay'] = size - 10
            break
    else:
        # only a problem if this value is really wanted
        if 'first_decay' in stats:
            raise Exception('ERROR: not found')
    to_return['perc_max'] = max_perc
    to_return['MAD'] = mad(des)
    if not savefig and not axe and not show:
        return [to_return[k] for k in stats]

    ax = setup_plot(axe, figsize=(10, 5.5))
    # shaded areas: 1-5% and 95-99% in dark green, 5-95% in light green
    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc), alpha=.7, color='darkred',
            label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lenghts\n(median: %s, top %.1f%%, up to %0.f nts)' % (
                     perc50, max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    # leave room on the right for the legend, drawn outside of the axes
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show and not axe:
        plt.show()
    plt.close('all')
    return [to_return[k] for k in stats]
def mmp_score(matrix, nrand=10, verbose=False, savefig=None):
    """
    Computes the MMP score of a matrix, from its size, the skewness and
    kurtosis of the distribution of its (log2) values, and the contribution
    of its significant eigenvalues, as compared to randomized matrices.

    :param matrix: list of lists
    :param 10 nrand: number of randomizations
    :param False verbose: writes progress and results to standard output
    :param None savefig: path where to save figure

    :returns: 1- MMP score which ranges from 0 (bad) to 1 (good), and 2- the
       expected correlation of the contact matrices of the modeled chromatin
       with the original Hi-C data (plus the 3- lower and 4- upper values
       expected in 95% of the cases)
    """
    data = np.array([np.array([v for v in l]) for l in matrix])
    size = len(data)

    if verbose:
        sys.stdout.write(' - getting EigenVectors\n')
    egval, _ = np.linalg.eigh(data)
    # sort eigenvalues/vectors
    idx = (-egval).argsort()
    egval = egval[idx]

    # eigenvalues of randomized versions of the matrix
    regvals = []
    if verbose:
        sys.stdout.write(' - randomization\n')
    for i in xrange(int(nrand)):
        if verbose:
            sys.stdout.write('\r    ' + str(i + 1) + ' / ' + str(nrand))
            sys.stdout.flush()
        regval, _ = np.linalg.eigh(randomize_matrix(data))
        regval = [abs(j) for j in regval]
        regval.sort(reverse=True)
        regvals.append(regval)
    if verbose:
        sys.stdout.write('\n')
    # one tuple per eigenvalue rank (a list, as it is read twice)
    regvals = list(zip(*regvals))
    rvmean = [np.mean(rv) for rv in regvals]
    # eigenvalues are expressed as percentage of their sum
    total = sum(rvmean) / 100
    rvmean = [i / total for i in rvmean]
    # error is two standard deviations
    err = [2 * np.std(np.asarray(rv) / total) for rv in regvals]

    zdata = sorted(np.log2([data[i][j] for i in xrange(size)
                            for j in xrange(i, size) if data[i][j]]))
    skewness = skew(zdata)
    kurtness = kurtosis(zdata)

    if savefig:
        _ = plt.figure(figsize=(14, 8))
        gs = gridspec.GridSpec(7, 5, wspace=0.5, hspace=1.5)
        ax1 = plt.subplot(gs[:  , 0:3])
        ax2 = plt.subplot(gs[1:5, 3: ])
        ax3 = plt.subplot(gs[5:7, 3: ])
        img = ax2.imshow(np.log2(data), interpolation='none')
        plt.colorbar(img, ax=ax2)

        ax2.set_title('Original matrix', size=12)
        ax2.tick_params(axis='both', which='major', labelsize=10)
        ax2.set_xlabel('Bin')
        ax2.set_ylabel('Bin')

        normfit = sc_norm.pdf(zdata, np.mean(zdata), np.std(zdata))
        # NOTE(review): 'ms' and 'markersize' are aliases of the same
        # property, which one is applied may depend on matplotlib version
        _ = ax3.plot(zdata, normfit, ':o', color='grey', ms=3, alpha=.4,
                     markersize=.5)
        ax3.tick_params(axis='both', which='major', labelsize=10)

        ax3.hist(zdata, bins=20, normed=True, alpha=0.7, color='r')
        ax3.set_xlabel('Z-score')
        ax3.set_ylabel('Frequency')

        # these are global matplotlib settings
        rcParams['xtick.direction'] = 'out'
        rcParams['ytick.direction'] = 'out'
        rcParams['axes.axisbelow']  = True
        rcParams['axes.grid']       = True
        rcParams['grid.color']      = 'w'
        rcParams['grid.linestyle']  = '-'
        rcParams['grid.linewidth']  = 2
        # rcParams['grid.alpha']      = .3
        ax1.minorticks_on()
        ax1.grid(ls='-', color='w', alpha=.3, lw=2, which='major')
        ax1.grid(ls='-', b=True, color='w', alpha=.3, lw=1, which='minor')
        ax1.spines['top'].set_color('none')
        ax1.spines['right'].set_color('none')
        ax1.spines['bottom'].set_color('none')
        ax1.spines['left'].set_color('none')
        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.set_xscale('log')
        ax1.set_axis_bgcolor((.9, .9, .9))

        ax1.errorbar(range(1, 1 + len(rvmean)), rvmean, yerr=err,
                     ecolor='red', color='orange', lw=2,
                     label='%s randomizations' % (nrand))

    # observed eigenvalues, also as percentage of their sum
    total = sum(abs(egval)) / 100
    egval = np.array(sorted([e / total for e in abs(egval)], reverse=True))

    # index of the first eigenvalue not above what is expected by chance
    for i in xrange(len(rvmean)):
        if rvmean[i] + err[i] > egval[i]:
            break
    signifidx = i

    sev = sum(egval[:signifidx] - rvmean[:signifidx])

    if savefig:
        ax1.plot(range(1, 1 + len(rvmean)), egval,
                 color='green', lw=2, label='Observed data')

        ax1.fill_between(range(1, 1 + len(rvmean)), rvmean, egval,
                         where=(np.array(rvmean) + np.array(err)) < egval,
                         facecolor='green', interpolate=True, alpha=0.2)
        ax1.fill_between(range(1, 1 + len(rvmean)), rvmean, egval,
                         where=(np.array(rvmean) + np.array(err)) > egval,
                         facecolor='red', interpolate=True, alpha=0.2)
        ax1.set_xlim((0, len(rvmean)))
        ax1.set_ylim((0, max(max(rvmean), max(egval))))
        ax1.legend(frameon=False, loc='upper right', prop={'size': 10})
        ax1.set_xlabel('Log indexes of Eigenvalues')
        ax1.set_ylabel('Eigenvalues (percentage of total)')

    mmp = (-0.0002 * size + 0.0335 * skewness - 0.0229 * kurtness +
           0.0069 * sev + 0.8126)

    if verbose:
        sys.stdout.write('\n')
        sys.stdout.write('\n Results\n')
        sys.stdout.write(' -------\n\n')
        sys.stdout.write(' MMP score: %.4f\n\n' % mmp)

    # predicted correlation and its limits are obtained from the MMP score
    # with linear functions (slope, intercept) for each level of confidence
    ex_a1, ex_b1 = [0.6975926, 0.2548171]
    supa1, supb1 = [0.69300732000423904, 0.29858572176099613]
    lowa1, lowb1 = [0.70217788900976075, 0.211048473299004]
    scc     = (mmp - ex_b1) / ex_a1
    scc_up1 = (mmp - supb1) / supa1
    scc_lw1 = (mmp - lowb1) / lowa1
    if verbose:
        sys.stdout.write((' predicted dSCC is %.3f (%.3f-%.3f '
                          '68%% confidence)\n') % (scc, scc_up1, scc_lw1))

    supa75, supb75 = [0.69230778430383244, 0.30526310790548261]
    lowa75, lowb75 = [0.70287742471016734, 0.20437108715451746]
    scc_up75 = (mmp - supb75) / supa75
    scc_lw75 = (mmp - lowb75) / lowa75
    if verbose:
        sys.stdout.write((' (%.3f-%.3f '
                          '75%% confidence)\n') % (scc_up75, scc_lw75))

    supa2, supb2 = [0.68855373600821357, 0.34109720480765293]
    lowa2, lowb2 = [0.70663147300578644, 0.16853699025234709]
    scc_up2 = (mmp - supb2) / supa2
    scc_lw2 = (mmp - lowb2) / lowa2
    if verbose:
        sys.stdout.write((' (%.3f-%.3f '
                          '95%% confidence)\n') % (scc_up2, scc_lw2))

    if savefig:
        # write the log
        log = ''
        log += ' 1- Matrix size (number of eigenvalues): %s\n' % (
            len(egval))
        log += " 2- Skewness of the distribution: %0.3f\n" % (skewness)
        log += " 3- Kurtosis of the distribution: %0.3f\n" % (kurtness)
        log += " 4- Sum of differences signif EV real-rand: %0.3f\n\n" % (
            sev)
        plt.figtext(0.62, 0.77, log, size='small')
        log = "MMP score: %.3f\n" % (mmp)
        log += "Predicted dSCC: %.3f (%.3f-%.3f at 95%% conf)\n" % (
            scc, scc_up2, scc_lw2)
        plt.figtext(0.61, 0.87, log, size=12)
        tadbit_savefig(savefig)
        plt.close('all')

    return mmp, scc, scc_up2, scc_lw2
def correlate_matrices(hic_data1, hic_data2, max_dist=10, intra=False, axe=None,
                       savefig=None, show=False, savedata=None,
                       normalized=False, remove_bad_columns=True, **kwargs):
    """
    Compare the interactions of two Hi-C matrices at a given distance,
    with Spearman rank correlation.

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 10 max_dist: maximum distance from diagonal, in bins (e.g. 10 means
       that diagonals further than 10 bins away are not compared)
    :param False intra: only takes into account intra-chromosomal contacts
       (requires both objects to hold the same, non-empty, `sections`)
    :param None axe: a matplotlib axes object in which to draw the plot; when
       given, the figure is left open for the caller
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a tab-separated file where distances and
       correlations are written
    :param False normalized: use normalized data (raw value divided by the
       bias of both bins)
    :param True remove_bad_columns: computes the union of bad columns between
       samples and exclude them from the comparison
    :param kwargs: only `get_bads` is read; when true the collection of bad
       columns used is returned as well

    :returns: list of correlations and list of genomic distances (followed by
       the bad columns if `get_bads` is set)
    """
    if normalized:
        def get_cell(hic_data, i, j):
            return hic_data[j, i] / hic_data.bias[i] / hic_data.bias[j]
    else:
        def get_cell(hic_data, i, j):
            return hic_data[j, i]

    if remove_bad_columns:
        # union of bad columns (copied, inputs are left untouched)
        bads = hic_data1.bads.copy()
        bads.update(hic_data2.bads)
    else:
        # nothing is excluded; must still be defined for the loops below
        bads = {}

    if (intra and hic_data1.sections and hic_data2.sections and
            hic_data1.sections == hic_data2.sections):
        # one (begin, end) range of bins per chromosome
        ranges = [(hic_data1.section_pos[crm][0], hic_data1.section_pos[crm][1])
                  for crm in hic_data1.section_pos]
    else:
        if intra:
            warn('WARNING: hic_dta does not contain chromosome coordinates, ' +
                 'intra set to False')
        # a single range covering the whole matrix
        ranges = [(0, len(hic_data1))]

    corrs = []
    dists = []
    for dist in xrange(1, max_dist + 1):
        diag1 = []
        diag2 = []
        for beg, end in ranges:
            for j in xrange(beg, end - dist):
                i = j + dist
                if j in bads or i in bads:
                    continue
                diag1.append(get_cell(hic_data1, i, j))
                diag2.append(get_cell(hic_data2, i, j))
        corrs.append(spearmanr(diag1, diag2)[0])
        dists.append(dist)

    if show or savefig or axe:
        given_axe = bool(axe)
        if not given_axe:
            fig = plt.figure()
            axe = fig.add_subplot(111)
        axe.plot(dists, corrs, color='orange', linewidth=3, alpha=.8)
        axe.set_xlabel('Genomic distance in bins')
        axe.set_ylabel('Spearman rank correlation')
        axe.set_xlim((0, dists[-1]))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        if not given_axe:
            # only close figures created here
            plt.close('all')

    if savedata:
        with open(savedata, 'w') as out:
            out.write('# genomic distance\tSpearman rank correlation\n')
            for dist, corr in zip(dists, corrs):
                out.write('%s\t%s\n' % (dist, corr))

    if kwargs.get('get_bads', False):
        return corrs, dists, bads
    return corrs, dists
def filter_by_zero_count(matrx, draw_hist=False, savefig=None):
    """
    Fits the distribution of the number of non-zero cells per column of the
    matrix to a polynomial, and uses a root of the derivative of the best
    fitting polynomial as cutoff to label columns as bad.

    :param matrx: list of columns (lists) of the Hi-C matrix
    :param False draw_hist: draw the histogram, the fitted polynomial and the
       cutoff
    :param None savefig: path to save the plot; if None and `draw_hist` is
       set, the plot is shown instead

    :returns: dictionary whose keys are the indexes of the columns whose sum
       is lower than the cutoff (values are None)

    :raises ValueError: if no polynomial (orders 7 to 13) yields a cutoff
       higher than zero and lower than the median
    """
    nbins = 100

    def _binned(values):
        # histogram of `values` on `nbins` evenly spaced edges
        edges = np.linspace(min(values), max(values), nbins)
        membership = np.digitize(values, edges)
        return edges, [sum(membership == i) for i in range(1, nbins + 1)]

    # count of non-zero cells per column, columns ordered by their sum
    cols = np.array([len(c) - c.count(0) for c in sorted(matrx, key=sum)])
    if draw_hist:
        plt.figure(figsize=(9, 9))
    median = np.median(cols)
    y, x = _binned(cols)
    if draw_hist:
        plt.hist(cols, bins=100, alpha=.3, color='grey')
    xp = range(0, cols[-1])
    # check if the binning is correct: drop the last column until at least
    # a third of the bins hold some data
    while list(x).count(0) > 2 * len(x) / 3:
        cols = cols[:-1]
        y, x = _binned(cols)
        if draw_hist:
            plt.clf()
            plt.hist(cols, bins=100, alpha=.3, color='grey')
        xp = range(0, cols[-1])
    # find best polynomial fit in a given range of orders
    best = None
    for order in range(7, 14):
        z = np.polyfit(y, x, order)
        zpp = np.polyder(z, m=1)
        roots = np.roots(zpp)
        # check that we are concave down, otherwise take next root
        pente = np.polyval(zpp, abs(roots[-2] - roots[-1]) / 2 + roots[-1])
        root = roots[-1] if pente > 0 else roots[-2]
        # root must be higher than zero and lower than the median
        if root <= 0 or root >= median:
            continue
        p = np.poly1d(z)
        R2 = get_r2(p, x, y)
        # explicit None check: comparing None with a number fails on Python 3
        if best is None or best[0] < R2:
            best = (R2, order, p, z, root)
    if best is None:
        # same exception type as the failed tuple unpacking it replaces
        raise ValueError('no polynomial fit with a cutoff between zero and '
                         'the median was found')
    p, root = best[2], best[4]
    if draw_hist:
        a = plt.plot(xp, p(xp), "--", color='k')
        b = plt.vlines(root, 0, plt.ylim()[1], colors='r', linestyles='dashed')
        labels = ['polyfit \n%s' % (
            ''.join([sub('e([-+][0-9]+)', 'e^{\\1}',
                         '$%s%.1fx^%s$' % ('+' if j>0 else '', j,
                                           '{' + str(i) + '}'))
                     for i, j in enumerate(list(p)[::-1])])),
                  'first solution of polynomial derivation']
        try:
            plt.legend(a + [b], labels, fontsize='x-small')
        except TypeError:
            # fallback when the `fontsize` keyword is not accepted
            plt.legend(a + [b], labels)
        plt.ylim(0, plt.ylim()[1])
        if savefig:
            tadbit_savefig(savefig)
        else:
            plt.show()
    # label as bad the columns with sums lower than the root
    # NOTE(review): the cutoff is fitted on non-zero counts but compared here
    # to column sums -- confirm this is intended
    bads = {}
    for i, col in enumerate(matrx):
        if sum(col) < root:
            bads[i] = None
    # now stored in Experiment._zeros, used for getting more accurate z-scores
    return bads
def visualize(self, names=None, tad=None, focus=None, paint_tads=False,
              axe=None, show=True, logarithm=True, normalized=False,
              relative=True, decorate=True, savefig=None, clim=None,
              scale=(8, 6), cmap='jet'):
    """
    Visualize the matrix of Hi-C interactions of a given experiment

    :param None names: name of the experiment to visualize, or list of
       experiment names. If None, all experiments will be shown. An element
       of the list can itself be a pair of names, in which case the first
       experiment is drawn in the upper half of the matrix and the second
       in the lower half
    :param None tad: a given TAD in the form:
       ::

         {'start': start,
          'end'  : end,
          'brk'  : end,
          'score': score}

       **Alternatively** a list of the TADs can be passed (all the TADs
       between the first and last one passed will be showed. Thus, passing
       more than two TADs might be superfluous)
    :param None focus: a tuple with the start and end positions of the
       region to visualize
    :param False paint_tads: draw a box around the TADs defined for this
       experiment
    :param None axe: an axe object from matplotlib can be passed in order
       to customize the picture
    :param True show: either to pop-up matplotlib image or not
    :param True logarithm: show the logarithm values
    :param False normalized: show the normalized data (weights might have
       been calculated previously). *Note: white rows/columns may appear in
       the matrix displayed; these rows correspond to filtered rows (see*
       :func:`pytadbit.utils.hic_filtering.hic_filtering_for_modelling` *)*
    :param True relative: color scale is relative to the whole matrix of
       data, not only to the region displayed
    :param True decorate: draws color bar, title and axes labels
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param None clim: tuple with minimum and maximum value range for color
       scale. I.e. clim=(-4, 10)
    :param (8, 6) scale: width and height of each panel, multiplied by the
       number of columns and rows of the grid of experiments
    :param 'jet' cmap: color map from matplotlib. Can also be a
       preconfigured cmap object.
    """
    if names is None:
        names = [xpr.name for xpr in self.experiments]
    if not isinstance(names, (list, tuple)):
        names = [names]
        cols = 1
        rows = 1
    else:
        # arrange the experiments in a grid as square as possible
        sqrtxpr = sqrt(len(names))
        cols = int(round(sqrtxpr + (0.0 if int(sqrtxpr) == sqrtxpr else .5)))
        rows = int(sqrtxpr + .5)
    notaxe = axe is None
    if not scale:
        scale = (8, 6)
    figsize = (scale[0] * cols, scale[1] * rows)
    multi = notaxe and len(names) != 1
    if multi:
        fig = plt.figure(figsize=figsize)
    # arguments shared by every call to Experiment.view
    common = dict(tad=tad, focus=focus, paint_tads=paint_tads, show=False,
                  logarithm=logarithm, normalized=normalized,
                  relative=relative, savefig=False, cmap=cmap)
    for i in range(rows):
        for j in range(cols):
            idx = i * cols + j
            if idx >= len(names):
                break
            name = names[idx]
            if multi:
                axe = fig.add_subplot(rows, cols, idx + 1)
            if isinstance(name, (tuple, list)):
                # two experiments sharing the same axes
                if not axe:
                    fig = plt.figure(figsize=figsize)
                    axe = fig.add_subplot(rows, cols, idx + 1)
                xpr1 = self.get_experiment(name[0])
                xpr2 = self.get_experiment(name[1])
                img = xpr1.view(axe=axe, decorate=decorate, where='up',
                                clim=clim, **common)
                # second half reuses the color limits of the first one
                # unless explicit limits were given
                img = xpr2.view(axe=axe, decorate=False, where='down',
                                clim=clim or img.get_clim(), **common)
                if decorate:
                    plt.text(1.01, .5,
                             'Chromosome %s experiment %s' % (
                                 self.name, xpr2.name),
                             rotation=-90, va='center', size='large',
                             ha='left', transform=axe.transAxes)
            else:
                xper = self.get_experiment(name)
                # nothing to draw for experiments without data
                if not xper.hic_data and not xper.norm:
                    continue
                xper.view(axe=axe, decorate=decorate, clim=clim, **common)
    if savefig:
        tadbit_savefig(savefig)
    if show:
        plt.show()
def plot_distance_vs_interactions(data, min_diff=1, max_diff=1000, show=False,
                                  genome_seq=None, resolution=None, axe=None,
                                  savefig=None, normalized=False):
    """
    Plot the number of interactions as a function of the genomic distance
    (both in log scale) and fit three linear regressions to it.

    :param data: input file name, or HiC_data object or list of lists
    :param 1 min_diff: lower limit (in number of bins)
    :param 1000 max_diff: upper limit (in number of bins) to look for
    :param False show: display the plot (only if `savefig` is not set)
    :param None genome_seq: dictionary with the number of bins of each
       chromosome, only used when `data` is a list of lists
    :param None resolution: group reads that are closer than this resolution
       parameter (None is the same as 1)
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False normalized: use normalized data (only for HiC_data input)

    :returns: a (slope, intercept, r) tuple for each of the 3 regressions
       that are plotted. With a resolution of 1 the breakpoints are searched
       for and the first `r` is squared; otherwise the breakpoints are fixed
       at 0.7 Mb and 10 Mb and the raw `r` of linregress is returned
    """
    resolution = resolution or 1
    dist_intr = dict.fromkeys(xrange(min_diff, max_diff), 0)
    if isinstance(data, str):
        with open(data) as fhandler:
            in_header = True
            for line in fhandler:
                # only the leading block of comment lines is skipped
                if in_header and line.startswith('#'):
                    continue
                in_header = False
                _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
                if cr1 != cr2:
                    continue
                diff = abs(int(ps1) // resolution - int(ps2) // resolution)
                if max_diff > diff >= min_diff:
                    dist_intr[diff] += 1
    elif isinstance(data, HiC_data):
        if normalized:
            get_data = lambda x, y: data[x, y] / data.bias[x] / data.bias[y]
        else:
            get_data = lambda x, y: data[x, y]
        max_diff = min(len(data), max_diff)
        if data.section_pos:
            for crm in data.section_pos:
                for diff in xrange(min_diff, min(
                        (max_diff, 1 + data.chromosomes[crm]))):
                    for i in xrange(data.section_pos[crm][0],
                                    data.section_pos[crm][1] - diff):
                        dist_intr[diff] += get_data(i, i + diff)
        else:
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i, i + diff]):
                        # same cell as the one tested for NaN just above
                        dist_intr[diff] += get_data(i, i + diff)
    else:
        if genome_seq:
            max_diff = min(max(genome_seq.values()), max_diff)
            cnt = 0  # index of the first bin of the current chromosome
            for crm in genome_seq:
                for diff in xrange(min_diff, min(
                        (max_diff, genome_seq[crm]))):
                    for i in xrange(cnt, cnt + genome_seq[crm] - diff):
                        if not np.isnan(data[i][i + diff]):
                            dist_intr[diff] += data[i][i + diff]
                cnt += genome_seq[crm]
        else:
            max_diff = min(len(data), max_diff)
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i][i + diff]):
                        dist_intr[diff] += data[i][i + diff]
    if not axe:
        fig = plt.figure()
        axe = fig.add_subplot(111)
    # remove last part of the plot in case no interaction is count...
    # reduce max_diff accordingly
    for diff in xrange(max_diff - 1, min_diff, -1):
        try:
            if not dist_intr[diff]:
                del dist_intr[diff]
                max_diff -= 1
                continue
        except KeyError:
            max_diff -= 1
            continue
        break
    # keep only distances with at least one interaction (log scale)
    x = []
    y = []
    for dist, val in sorted(dist_intr.items(), key=lambda item: item[0]):
        if val:
            x.append(dist)
            y.append(val)
    axe.plot(x, y, 'k.')
    logx = np.log(x)
    logy = np.log(y)
    ntries = 100
    if resolution == 1:
        # search for the pair of breakpoints maximizing the sum of r squares
        k = 1
        best = (float('-inf'), 0, 0, 0, 0, 0, 0, 0, 0, 0)
        best_r = (0, 0, 0)
        for i in xrange(3, ntries - 2 - k):
            v1 = i * len(x) // ntries
            try:
                a1, b1, r21, _, _ = linregress(logx[:v1], logy[:v1])
            except ValueError:
                a1 = b1 = r21 = 0
            r21 *= r21
            for j in xrange(i + 1 + k, ntries - 2 - k):
                v2 = j * len(x) // ntries
                try:
                    a2, b2, r22, _, _ = linregress(logx[v1 + k:v2],
                                                   logy[v1 + k:v2])
                    a3, b3, r23, _, _ = linregress(logx[v2 + k:],
                                                   logy[v2 + k:])
                except ValueError:
                    a2 = b2 = r22 = 0
                    a3 = b3 = r23 = 0
                r2 = r21 + r22**2 + r23**2
                if r2 > best[0]:
                    best = (r2, v1, v2, a1, a2, a3, b1, b2, b3, k)
                    # kept so that the r values returned are those of the
                    # best fit, not of the last combination tried
                    best_r = (r21, r22, r23)
        (v1, v2, a1, a2, a3, b1, b2, b3, k) = best[1:]
        r21, r22, r23 = best_r
        gap = k
        tags = ('1', '2', '3')
    else:
        # from 0.7 Mb
        v1 = 700000 // resolution
        # to 10 Mb
        v2 = 10000000 // resolution
        try:
            a1, b1, r21, _, _ = linregress(logx[:v1], logy[:v1])
        except ValueError:
            a1, b1, r21 = 0, 0, 0
        try:
            a2, b2, r22, _, _ = linregress(logx[v1:v2], logy[v1:v2])
        except ValueError:
            a2, b2, r22 = 0, 0, 0
        try:
            a3, b3, r23, _, _ = linregress(logx[v2:], logy[v2:])
        except ValueError:
            a3, b3, r23 = 0, 0, 0
        gap = 0
        tags = (r'0-0.7 \mathrm{ Mb}', r'0.7-10 \mathrm{ Mb}',
                r'10 \mathrm{ Mb}-\infty')
    # plot the three lines of best fit
    segments = ((x[:v1], a1, b1, 'yellow', tags[0]),
                (x[v1 + gap:v2], a2, b2, 'orange', tags[1]),
                (x[v2 + gap:], a3, b3, 'red', tags[2]))
    for xs, slope, intercept, color, tag in segments:
        axe.plot(xs, np.exp(intercept + slope * np.array(np.log(xs))),
                 color=color, lw=2,
                 label=r'$\alpha_{%s}=%.2f$' % (tag, slope))
    axe.set_ylabel('Log interaction count')
    axe.set_xlabel('Log genomic distance (resolution: %s)' % nicer(resolution))
    axe.legend(loc='lower left', frameon=False)
    axe.set_xscale('log')
    axe.set_yscale('log')
    axe.set_xlim((min_diff, max_diff))
    try:
        axe.set_ylim((0, max(y)))
    except ValueError:
        pass
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show:
        plt.show()
        plt.close('all')
    return (a1, b1, r21), (a2, b2, r22), (a3, b3, r23)
def filter_by_cis_percentage(cisprc, beg=0.3, end=0.8, sigma=2, verbose=False,
                             size=None, min_perc=None, max_perc=None,
                             savefig=None):
    """
    Define artifactual columns with either too low or too high counts of
    interactions by comparing their percentage of cis interactions
    (intra-chromosomal).

    :param cisprc: dictionary with counts of cis-percentage by bin number.
       Values of the dictionary are tuple with, as first element the number
       of cis interactions and as second element the total number of
       interactions.
    :param 0.3 beg: proportion of bins to be considered as possibly having
       low counts
    :param 0.8 end: proportion of bins to be considered as possibly having
       high counts
    :param 2 sigma: number of standard deviations used to define lower and
       upper ranges in the variation of the percentage of cis interactions
    :param False verbose: print the cutoffs found
    :param None size: size of the genome, in number of bins (otherwise
       inferred from cisprc dictionary)
    :param None min_perc: lower cutoff, as a percentage of the bins sorted by
       total count; overrides the cutoff searched automatically
    :param None max_perc: upper cutoff, as a percentage of the bins sorted by
       total count; overrides the cutoff searched automatically
    :param None savefig: path to save image of the distribution of cis
       percentages and total counts by bin.

    :returns: dictionary of bins to be filtered out (with either too low or
       too high counts of interactions).

    :raises Exception: if a cutoff cannot be found and the corresponding
       `min_perc` / `max_perc` is not given
    """
    # bins sorted by their total number of interactions
    sorted_sum, indices = zip(*sorted((cisprc[i][1], i) for i in cisprc))
    sorted_prc = [float(cisprc[i][0]) / cisprc[i][1] for i in indices]
    size = size if size else max(indices) + 1
    win_size = _best_window_size(sorted_prc, size, beg, end, verbose=verbose)

    # define confidence bands: median plus/minus sigma standard deviations of
    # the cis ratio inside a window starting at each position
    errors_pos = []
    errors_neg = []
    for k in range(size):
        vals = sorted_prc[k:k + win_size]
        std = np.std(vals)
        med = np.median(vals)
        errors_pos.append(med + std * sigma)
        errors_neg.append(med - std * sigma)

    # median and variation of both bands, for values between the `beg` and
    # `end` proportions of the sorted bins
    core = slice(int(size * beg), int(size * end))
    std_err_pos = np.std(errors_pos[core])
    med_err_pos = np.median(errors_pos[core])
    std_err_neg = np.std(errors_neg[core])
    med_err_neg = np.median(errors_neg[core])

    # a bin is accepted when both of its bands fall inside the general median
    # +/- sigma stddev of the corresponding band
    beg_pos = med_err_pos - std_err_pos * sigma
    end_pos = med_err_pos + std_err_pos * sigma
    beg_neg = med_err_neg - std_err_neg * sigma
    end_neg = med_err_neg + std_err_neg * sigma

    consecutive = 10

    # left cutoff: first run of accepted bins, scanning upwards
    cutoffL = None
    passed = 0
    for cutoffL, (p, n) in enumerate(zip(errors_pos, errors_neg)):
        if (beg_pos < p < end_pos) and (beg_neg < n < end_neg):
            if passed >= consecutive:
                break
            passed += 1
        else:
            passed = 0
    else:
        if min_perc is None:
            raise Exception('ERROR: left cutoff not found!!!\n'
                            ' define it by hand with min_perc')
        cutoffL = min_perc / 100. * size + consecutive
    cutoffL -= consecutive  # rescale, we asked for XX consecutive

    # right cutoff: same scan, downwards from the last bin
    cutoffR = None
    passed = 0
    for cutoffR, (p, n) in enumerate(list(zip(errors_pos, errors_neg))[::-1]):
        cutoffR = size - cutoffR
        if (beg_pos < p < end_pos) and (beg_neg < n < end_neg):
            if passed >= consecutive:
                break
            passed += 1
        else:
            passed = 0
    else:
        if max_perc is None:
            raise Exception('ERROR: right cutoff not found!!!\n'
                            ' define it by hand with max_perc')
        cutoffR = max_perc / 100. * size - consecutive
    cutoffR += consecutive  # rescale, we asked for XX consecutive

    # cutoffs defined by hand take precedence
    if min_perc:
        cutoffL = min_perc / 100. * size
    if max_perc:
        cutoffR = max_perc / 100. * size

    min_count = sorted_sum[int(cutoffL)]
    try:
        max_count = sorted_sum[int(cutoffR)]
    except IndexError:  # all good
        max_count = sorted_sum[-1] + 1

    if verbose:
        print(' * Lower cutoff applied until bin number: %d' % (cutoffL))
        print(' * too few interactions defined as less than %9d interactions'
              % (min_count))
        print(' * Upper cutoff applied until bin number: %d' % (cutoffR))
        print(' * too much interactions defined as more than %9d interactions'
              % (max_count))

    # plot
    if savefig:
        if verbose:
            print(' -> Making plot...')
        # about 100 band markers along the plot; never a zero step, which
        # slicing and range() reject for genomes smaller than 100 bins
        step = max(1, size // 100)
        fig = plt.figure(figsize=(20, 11))
        ax1 = fig.add_subplot(111)
        plt.subplots_adjust(left=0.25, bottom=0.2)
        line1 = ax1.plot(
            [float(cisprc.get(i, [0, 0])[0]) / cisprc.get(i, [1, 1])[1]
             for i in indices],
            '.', color='grey', alpha=0.2,
            label='cis interactions ratio by bin', zorder=1)
        line2 = ax1.plot(
            list(range(0, len(indices), 20)),
            [sum(float(cisprc.get(j, [0, 0])[0]) / cisprc.get(j, [1, 1])[1]
                 for j in indices[k:k + win_size]) / win_size
             for k in range(0, len(indices), 20)],
            '.', color='k', alpha=0.3,
            label='cis interactions ratio by %d bin' % win_size, zorder=1)
        for k, (p, n) in enumerate(zip(errors_pos[::step],
                                       errors_neg[::step])):
            # true division: p and n are ratios, a floor division would
            # collapse the midpoint of the band to zero
            middle = (p + n) / 2.
            ax1.vlines(k * step, middle, p, color='red', alpha=0.6)
            ax1.vlines(k * step, n, middle, color='blue', alpha=0.6)
        ax1.plot(list(range(0, size, step)), errors_neg[::step], 'b^',
                 mec='blue', alpha=0.5)
        ax1.plot(list(range(0, size, step)), errors_pos[::step], 'rv',
                 mec='red', alpha=0.5)
        ax1.fill_between([0, size], beg_pos, end_pos, color='red', alpha=0.3,
                         zorder=2)
        ax1.text(-size / 15., (end_pos + beg_pos) / 2,
                 'Confidance band for\nupper stddev of median',
                 color='red', ha='right', va='center')
        ax1.fill_between([0, size], beg_neg, end_neg, color='blue', alpha=0.3,
                         zorder=2)
        ax1.text(-size / 15., (end_neg + beg_neg) / 2,
                 'Confidance band for\nlower stddev of median',
                 color='blue', ha='right', va='center')
        ax1.set_ylim((0, 1.1))
        ax1.set_ylabel('Ratio of cis interactions ratio')
        ax1.fill_betweenx([0, 1.1], cutoffL, cutoffR, color='green', alpha=0.2)
        ax1.text(
            (cutoffR + cutoffL) / 2, -0.1,
            ('Kept bins, top and bottom deviations from median cis-ratio\n' +
             'should be inside their respective confidance bands'),
            ha='center', color='green')
        # second axes, same x, for the total counts in log scale
        ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)
        line3 = ax2.plot(sorted_sum, 'rx', alpha=0.4,
                         label='Log sum of interactions by bin')
        ax2.set_yscale('log')
        ax2.yaxis.tick_right()
        ax2.yaxis.set_label_position("right")
        ax2.set_ylabel('Log interaction counts')
        lns = line1 + line2 + line3
        labs = [l.get_label() for l in lns]
        ax2.legend(lns, labs, loc=0, bbox_to_anchor=(0, 0), frameon=False)
        # third axes, only used to draw a percentage grid on top
        ax3 = fig.add_subplot(111, frameon=False)
        ax3.xaxis.tick_top()
        ax3.set_xticks(list(range(100)), minor=True)
        ax3.set_xticks(list(range(0, 100, 5)), minor=False)
        ax3.set_yticks([])
        ax3.set_xticklabels([])
        for p in range(5, 100, 5):
            ax3.text(p, 99, '%d%%' % p, va='top', ha='left', size=9)
        ax3.tick_params(direction='in', axis='x', which='both')
        ax3.set_xlim(0, 100)
        ax3.set_ylim(0, 100)
        ax3.grid(which='major')
        ax3.grid(which='minor', alpha=0.5)
        title = ('Setting from %.2f%% to %.2f%%' if min_perc else
                 'Keeping from %.2f%% to %.2f%%')
        plt.title(title % (100 * float(cutoffL) / len(indices),
                           100 * float(cutoffR) / len(indices)))
        ax1.set_xlim((0, len(indices)))
        tadbit_savefig(savefig)
        plt.close('all')

    badcol = {}
    countL = 0
    countZ = 0
    countU = 0
    for c in range(size):
        total = cisprc.get(c, [0, 0])[1]
        if total < min_count:
            badcol[c] = total
            countL += 1
            if c not in cisprc:
                countZ += 1
        elif total > max_count:
            badcol[c] = total
            countU += 1
    print(' => %d BAD bins (%d/%d/%d null/low/high counts) of %d (%.1f%%)' %
          (len(badcol), countZ, countL, countU, size,
           float(len(badcol)) / size * 100))
    return badcol
def plot_genomic_distribution(fnam, first_read=True, resolution=10000,
                              axe=None, ylim=None, savefig=None, show=False,
                              savedata=None, chr_names=None, nreads=None):
    """
    Count the reads of a tab-separated file by bin of the genome, and plot
    and/or save these counts for each chromosome.

    :param fnam: input file name
    :param True first_read: uses first read (columns 2 and 3 of the file),
       otherwise the second one (columns 8 and 9).
    :param 10000 resolution: group reads that are closer than this resolution
       parameter
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None ylim: tuple with the limits of the y axis
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False show: display the plot (only if `savefig` is not set)
    :param None savedata: path where to store the output read counts per bin.
    :param None chr_names: can pass a list of chromosome names in case only
       some them the need to be plotted (this option may last even more than
       default)
    :param None nreads: stop reading the file when this number of reads is
       reached
    """
    distr = {}
    idx1, idx2 = (1, 3) if first_read else (7, 9)
    genome_seq = OrderedDict()
    if chr_names:
        # order given by the user for the plot, set for fast membership tests
        chr_order = list(OrderedDict.fromkeys(chr_names))
        chr_names = set(chr_names)
    else:
        chr_order = []
    count = 0
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                if line.startswith('#'):
                    # '# CRM name<TAB>length' lines define the chromosomes
                    if line.startswith('# CRM '):
                        crm, clen = line[6:].split('\t')
                        genome_seq[crm] = int(clen)
                    continue
                in_header = False
            crm, pos = line.strip().split('\t')[idx1:idx2]
            count += 1
            if nreads and count >= nreads:
                break
            if chr_names and crm not in chr_names:
                continue
            pos = int(pos) // resolution
            bins = distr.setdefault(crm, {})
            bins[pos] = bins.get(pos, 0) + 1
    if not axe:
        _ = plt.figure(figsize=(15, 1 + 3 * len(
            chr_names if chr_names else distr.keys())))
    max_y = max([max(distr[c].values()) for c in distr]) if distr else 0
    max_x = max([len(distr[c].values()) for c in distr]) if distr else 0
    shown = chr_order if chr_names else genome_seq if genome_seq else distr
    ncrms = len(shown)
    data = {}
    for i, crm in enumerate(shown):
        if crm in distr:
            # + 1 in order to include the last bin holding reads
            nbins = max(distr[crm]) + 1
            data[crm] = [distr[crm].get(j, 0) for j in xrange(nbins)]
            if savefig:
                plt.subplot(ncrms, 1, i + 1)
                plt.plot(range(nbins), data[crm],
                         color='red', lw=1.5, alpha=0.7)
        if savefig:
            if crm in genome_seq:
                # mark the end of the chromosome when its length is known
                crm_end = genome_seq[crm] // resolution
                if ylim:
                    plt.vlines(crm_end, ylim[0], ylim[1])
                else:
                    plt.vlines(crm_end, 0, max_y)
            plt.xlim((0, max_x))
            plt.ylim(ylim or (0, max_y))
            plt.title(crm)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show:
        plt.show()
    if savedata:
        with open(savedata, 'w') as out:
            out.write('# CRM\tstart-end\tcount\n')
            out.write('\n'.join('%s\t%d-%d\t%d' % (c, (i * resolution) + 1,
                                                   ((i + 1) * resolution), v)
                                for c in data for i, v in enumerate(data[c])))
            out.write('\n')
def filter_by_mean(matrx, draw_hist=False, silent=False, bads=None,
                   savefig=None):
    """
    Fits the distribution of Hi-C interaction count by column in the matrix
    to a polynomial, and uses a root of the derivative of the best fitting
    polynomial as cutoff: columns with a lower sum are labelled as bad.

    :param matrx: matrix of interactions; it is read through
       `matrx.get(i + j * size, 0)`, with `size` being `len(matrx)`
       (presumably an HiC_data object -- confirm against callers)
    :param False draw_hist: draw the histogram, the fitted polynomial and the
       cutoff
    :param False silent: do not write warnings to the standard error
    :param None bads: dictionary of columns already known to be bad; they are
       not used to define the cutoff, and, if not empty, this dictionary is
       updated in place
    :param None savefig: path to save the plot; if None and `draw_hist` is
       set, the plot is shown instead

    :returns: dictionary of bad columns, with the sum of each of the columns
       labelled here as value
    """
    nbins = 100
    if not bads:
        bads = {}

    def _binned(values):
        # histogram of `values` on `nbins` evenly spaced edges
        edges = np.linspace(min(values), max(values), nbins)
        membership = np.digitize(values, edges)
        return edges, [sum(membership == i) for i in range(1, nbins + 1)]

    def _label_axes():
        plt.xlabel('Sum of interactions')
        plt.ylabel('Number of columns with a given value')

    # get sum of columns (skipping bad ones), sorted
    size = len(matrx)
    cols = np.array([sum(c) for c in sorted(
        [[matrx.get(i + j * size, 0) for j in xrange(size) if j not in bads]
         for i in xrange(size) if i not in bads], key=sum)])
    if draw_hist:
        plt.figure(figsize=(9, 9))
    try:
        percentile = np.percentile(cols, 5)
    except IndexError:
        warn('WARNING: no columns to filter out')
        return bads
    best = None
    # bin the sum of columns
    y, x = _binned(cols)
    if draw_hist:
        plt.hist(cols, bins=100, alpha=.3, color='grey')
    xp = range(0, int(cols[-1]))
    try:
        # check if the binning is correct: drop the highest column until at
        # least half of the bins hold some data
        cnt = 0
        while list(x).count(0) > len(x) / 2:
            cnt += 1
            cols = cols[:-1]
            y, x = _binned(cols)
            if draw_hist:
                plt.clf()
                plt.hist(cols, bins=100, alpha=.3, color='grey')
            xp = range(0, int(cols[-1]))
            if cnt > 10000:
                raise ValueError
        # find best polynomial fit in a given range of orders
        for order in range(6, 18):
            z = np.polyfit(y, x, order)
            zp = np.polyder(z, m=1)
            roots = np.roots(zp)
            # check that we are concave down, otherwise take next root
            pente = np.polyval(zp, abs(roots[-2] - roots[-1]) / 2 + roots[-1])
            root = roots[-1] if pente > 0 else roots[-2]
            # root must be higher than zero and lower than the 5th percentile
            if root <= 0 or root >= percentile:
                continue
            p = np.poly1d(z)
            R2 = get_r2(p, x, y)
            # try to avoid very large orders by weighting negatively their fit
            if order > 13:
                R2 -= float(order) / 30
            # explicit None check: None < number fails on Python 3
            if best is None or best[0] < R2:
                best = (R2, order, p, z, root)
        try:
            if best is None:
                # handled just below as the other failures of this step
                raise ValueError('no suitable polynomial fit found')
            p, root = best[2], best[4]
            if draw_hist:
                xlims = plt.xlim()
                ylims = plt.ylim()
                a = plt.plot(xp, p(xp), "--", color='k')
                b = plt.vlines(root, 0, plt.ylim()[1], colors='r',
                               linestyles='dashed')
                labels = [
                    'polyfit \n%s' % (''.join([
                        sub('e([-+][0-9]+)', 'e^{\\1}',
                            '$%s%.1fx^%s$' % ('+' if j > 0 else '', j,
                                              '{' + str(i) + '}'))
                        for i, j in enumerate(list(p)[::-1])
                    ])), 'first solution of polynomial derivation'
                ]
                try:
                    plt.legend(a + [b], labels, fontsize='x-small')
                except TypeError:
                    # fallback when the `fontsize` keyword is not accepted
                    plt.legend(a + [b], labels)
                plt.ylim([0, ylims[1]])
                plt.xlim(xlims)
                _label_axes()
                if savefig:
                    tadbit_savefig(savefig)
                else:
                    plt.show()
            # label as bad the columns with sums lower than the root
            for i in xrange(size):
                total = sum(matrx.get(i + j * size, 0) for j in xrange(size))
                if total < root:
                    bads[i] = total
            # now stored in Experiment._zeros, used for getting more accurate
            # z-scores
            if bads and not silent:
                stderr.write(
                    ('\nWARNING: removing columns having less than %s ' +
                     'counts:\n %s\n') % (round(root, 3), ' '.join([
                         '%5s' % str(i + 1) + ('' if (j + 1) % 20 else '\n')
                         for j, i in enumerate(sorted(bads.keys()))
                     ])))
        except Exception:
            # best effort: any failure here only skips the filtering
            if not silent:
                stderr.write('WARNING: Too many zeroes to filter columns.' +
                             ' SKIPPING...\n')
            if draw_hist:
                _label_axes()
                if savefig:
                    tadbit_savefig(savefig)
                else:
                    plt.show()
    except ValueError:
        if not silent:
            stderr.write('WARNING: Too few data to filter columns based on ' +
                         'mean value.\n')
    if draw_hist:
        plt.close('all')
    return bads
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6, normalized=False,
                           savefig=None, show=False, savedata=None,
                           remove_bad_columns=True, **kwargs):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors, with Pearson correlation.

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param False normalized: use normalized data
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where to write the matrix of
       correlations (tab-separated)
    :param True remove_bad_columns: computes the union of bad columns between
       samples and excludes them from the comparison
    :param kwargs: any argument to pass to matplotlib imshow function. The
       special key ``get_bads`` is consumed here (not forwarded to imshow):
       if true, the bad columns are returned together with the correlations

    :returns: matrix of correlations (absolute values), or a tuple
       (matrix of correlations, bad columns) if ``get_bads`` is set
    """
    # 'get_bads' is an option of this function, imshow would reject it
    get_bads = kwargs.pop('get_bads', False)
    data1 = hic_data1.get_matrix(normalized=normalized)
    data2 = hic_data2.get_matrix(normalized=normalized)
    # always defined, so that get_bads also works without column removal
    bads = {}
    if remove_bad_columns:
        # union of bad columns
        bads = hic_data1.bads.copy()
        bads.update(hic_data2.bads)
        # remove them from both matrices, highest index first so that the
        # remaining indexes stay valid
        for bad in sorted(bads, reverse=True):
            del data1[bad]
            del data2[bad]
            for row in data1:
                row.pop(bad)
            for row in data2:
                row.pop(bad)
    # get the log
    data1 = nozero_log(data1, np.log2)
    data2 = nozero_log(data2, np.log2)
    # get the eigenvectors
    ev1, evect1 = eigh(data1)
    ev2, evect2 = eigh(data2)
    # sort eigenvectors according to their eigenvalues => first is last!!
    # eigenvectors are stored as columns, so columns are what gets reordered
    order = ev1.argsort()
    ev1 = ev1[order]
    evect1 = evect1[:, order]
    order = ev2.argsort()
    ev2 = ev2[order]
    evect2 = evect2[:, order]
    # calculate Pearson correlation (rows: experiment 1, columns: experiment 2)
    corr = [[abs(pearsonr(evect1[:, -i - 1], evect2[:, -j - 1])[0])
             for j in range(nvect)]
            for i in range(nvect)]
    # plot (the figure is only created when it is going to be used)
    if show or savefig:
        ticks = list(range(nvect))
        axe = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        # NOTE(review): imshow draws corr[i][j] at x=j, y=i, so experiment 1
        # runs along the y axis while the labels below name it on the x
        # axis -- confirm the intended orientation.
        im = axe.imshow(corr, interpolation="nearest", origin='lower',
                        **kwargs)
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        axe.set_xticks(ticks)
        axe.set_yticks(ticks)
        # one label per tick
        axe.set_xticklabels(list(range(1, nvect + 1)))
        axe.set_yticklabels(list(range(1, nvect + 1)))
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)
        cbar = plt.colorbar(im, cax=cbaxes)
        cbar.ax.set_ylabel('Pearson correlation', rotation=90 * 3,
                           verticalalignment='bottom')
        axe2 = axe.twinx()
        axe2.set_yticks(ticks)
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90 * 3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)
        axe3 = axe.twiny()
        axe3.set_xticks(ticks)
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)
        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')
    if savedata:
        with open(savedata, 'w') as out:
            out.write('# ' + '\t'.join(['Eigen Vector %s' % i
                                        for i in range(nvect)]) + '\n')
            for row in corr:
                out.write('\t'.join([str(val) for val in row]) + '\n')
    if get_bads:
        return corr, bads
    return corr
def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction enzyme
    (RE) name is provided, can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided by
    the number of reads).

    :param fnam: path to FASTQ file (read through ``gopen`` if the name ends
       with '.gz')
    :param None r_enz: name of the restriction enzyme, as found in
       RESTRICTION_ENZYMES
    :param None nreads: max number of reads to read, not necessary to read all
    :param None axe: a matplotlib axes object where to draw the quality plot;
       a second subplot is added to its figure
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage
       of reads with at least a ligation site. Without restriction enzyme the
       first value is None and the second is 0, as no site is searched for.

    :raises ValueError: if no read is found in the input file
    """
    from collections import Counter

    # PHRED+33: printable ASCII characters, from '!' (score 0) to '~' (93)
    phred = dict((chr(score + 33), score) for score in range(94))
    quals = []
    henes = []
    sites = []
    fixes = []
    liges = []
    ligep = 0
    des = None
    tkw = dict(size=4, width=1.5)
    # enzyme look-up is done before opening the file
    if r_enz:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site * 2 == l_site:
            # in case the religated site equals 2 restriction sites (like DnpII)
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    else:
        fhandler = open(fnam)
    line = ''
    try:
        # a FASTQ record is 4 lines: header, sequence, separator, qualities
        while not nreads or len(quals) < nreads:
            try:
                next(fhandler)
            except StopIteration:  # regular end of file
                break
            seq = next(fhandler)
            if r_enz:
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            next(fhandler)
            line = next(fhandler)
            quals.append([phred[i] for i in line.strip()])
    finally:
        fhandler.close()
    if not quals:
        raise ValueError('no reads found in %s' % fnam)
    # number of reads actually parsed (the file may hold less than requested)
    nreads = len(quals)
    # length of the last quality line read (newline included), used below as
    # the extent of the x axis
    read_len = len(line)
    half = read_len // 2
    # materialized: the per-position columns are traversed twice
    quals = list(zip(*quals))
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')

    ax.errorbar(list(range(len(meanquals))), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, read_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    n_count = Counter(henes)
    axb.plot([n_count[i] for i in range(read_len)], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, read_len))

    if r_enz:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        site_len = max((len(r_site), len(l_site), len(d_site)))
        seq_len = read_len - site_len
        # number of sites starting at each position of the reads
        site_count = Counter(sites)
        lige_count = Counter(liges)
        fixe_count = Counter(fixes)
        sites = [site_count[k] for k in range(seq_len)]  # Undigested
        liges = [lige_count[k] for k in range(seq_len)]  # OK
        fixes = [fixe_count[k] for k in range(seq_len)]  # DE
        # a dangling-end site contained in an other site is also counted
        # each time this other site is found
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k - pos] for k in range(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k - pos] for k in range(pos, seq_len)])
        if paired:
            # mask the positions just before the middle of the line
            masked = [float('nan')] * site_len
            sites[half - site_len: half] = masked
            liges[half - site_len: half] = masked
            fixes[half - site_len: half] = masked
        has_dangling = any([f > 0 for f in fixes])
        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)
        if has_dangling:
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends  (%s)' % r_site)
        ax2.set_xlim((0, read_len))
        lig_cnt = (np.nansum(liges) - liges[0] - liges[half])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[half])
        # sites found at the very beginning of the reads (and of the second
        # end if paired) are the ones counted as dangling-ends
        if has_dangling:
            des = (100. * (fixes[0] + (fixes[half] if paired else 0))) / nreads
        else:
            des = (100. * (sites[0] + (sites[half] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') % (
                       (100. * lig_cnt) / (lig_cnt + sit_cnt),
                       des,
                       (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads
def draw(self, focus=None, extras=None, ymax=None, ali_colors=('grey',),
         normalized=True, savefig=None, shape='ellipse'):
    """
    Draw alignments as a plot.

    :param None focus: can pass a tuple (bin_start, bin_stop) to display the
       alignment between these genomic bins
    :param None extras: list of coordinates (genomic bin) where to draw a
       red cross
    :param None ymax: limit the y axis up to a given value
    :param ('grey', ) ali_colors: successive colors for alignment
    :param True normalized: normalized Hi-C count are plotted instead of raw
       data.
    :param 'ellipse' shape: which kind of shape to use as schematic
       representation of TADs. Implemented: 'ellipse', 'rectangle',
       'triangle'
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    """
    from matplotlib.cm import jet
    from matplotlib import pyplot as plt

    def last_tad_end(xpr):
        # end of the TAD with the highest key (no dependence on dict order)
        return xpr.tads[max(xpr.tads)]['end']

    experiments = self.__experiments

    maxres = max([e.resolution for e in experiments])
    facts = [maxres / e.resolution for e in experiments]

    if focus:
        figsiz = 4 + (focus[1] - focus[0]) / 30
    else:
        figsiz = 4 + experiments[0].size / 30

    # squeeze=False: an array of axes is returned even for a single
    # experiment, so that axes can always be indexed
    fig, axes = plt.subplots(nrows=len(experiments),
                             sharex=True, sharey=True, squeeze=False,
                             figsize=(figsiz, 1 + len(experiments) * 1.8))
    axes = axes[:, 0]
    fig.subplots_adjust(hspace=0)

    maxys = []
    for iex, xpr in enumerate(experiments):
        if xpr.name not in self:
            continue
        _tad_density_plot(xpr, maxys=maxys, normalized=normalized,
                          fact_res=facts[iex], axe=axes[iex],
                          extras=extras, shape=shape, focus=focus)
    # draw alignment columns
    if focus:
        starting = focus[0]
        end = focus[1]
    else:
        starting = 1
        end = last_tad_end(experiments[-1])
    maxy = (ymax or max(maxys)) + 0.4
    maxxs = []
    for iex, xpr in enumerate(experiments):
        ending = focus[1] if focus else last_tad_end(xpr)
        axes[iex].hlines(1, 1, end, 'k', lw=1.5)
        axes[iex].set_ylim((0, maxy))
        maxxs.append(ending / facts[iex])
        axes[iex].text(starting + 1, float(maxy) / 20,
                       xpr.name, {'ha': 'left', 'va': 'bottom'})
        axes[iex].set_yticks([float(i) / 2
                              for i in range(1, int(maxy + .5) * 2)])
        if ymax:
            axes[iex].set_ylim((0, ymax))
        # limits given by position: the name of the keywords changed
        # between matplotlib versions
        axes[iex].set_xlim(starting, max(maxxs))

    pos = {'ha': 'center', 'va': 'bottom'}
    for i, col in enumerate(self.itercolumns()):
        ends = sorted([(t['end'], j) for j, t in enumerate(col) if t['end']])
        beg = (ends[0][0] + 0.9) / facts[ends[0][1]]
        end = (ends[-1][0] + 1.1) / facts[ends[-1][1]]
        if focus:
            if beg < focus[0] or end > focus[1]:
                continue
        axes[0].text(beg + float(end - beg) / 2, maxy + float(maxy) / 20,
                     str(i + 1), pos,
                     rotation=90, size='small')
        for iex, tad in enumerate(col):
            if not tad['end']:
                continue
            axes[iex].axvspan(beg - .2, end + .2, alpha=0.2,
                              color=ali_colors[i % (len(ali_colors))])
    # x axis is shared, it is labelled only once, on the last plot
    axes[-1].set_xlabel('Genomic bin')
    fig.suptitle("TAD borders' alignment", size='x-large')
    tit2 = axes[0].set_title("Alignment column number")
    tit2.set_y(1.3)
    plt.subplots_adjust(top=0.76)
    fig.set_facecolor('white')
    # one marker per border score, only used as handles for the legend
    plots = []
    for scr in range(1, 11):
        plots += plt.plot((100,), (100,), marker=6, ms=9,
                          color=jet(float(scr) / 10), mec='none')
    score_labels = [str(scr) for scr in range(1, 11)]
    try:
        axes[-1].legend(plots, score_labels,
                        numpoints=1, title='Border scores',
                        fontsize='small', loc='lower left',
                        bbox_to_anchor=(1, 0.5))
    except TypeError:
        # legend call that does not accept the fontsize argument
        axes[-1].legend(plots, score_labels,
                        numpoints=1, title='Border scores',
                        loc='lower left', bbox_to_anchor=(1, 0.5))
    if savefig:
        tadbit_savefig(savefig)
    else:
        plt.show()
def draw_map(data, genome_seq, cumcs, savefig, show, one=False, clim=None,
             cmap='jet', decay=False, perc=10, name=None, cistrans=None,
             decay_resolution=10000, normalized=False, max_diff=None):
    """
    Draw a matrix of interactions (in log2) together with the histogram of
    its values, its three last eigenvectors and, optionally, the decay of
    interactions with genomic distance.

    :param data: square matrix of interactions, as a list of lists
    :param genome_seq: chromosome names (iterated to draw the limits of the
       chromosomes, and passed to plot_distance_vs_interactions); nothing is
       drawn if empty or None
    :param cumcs: dictionary with, for each chromosome name, the positions
       (in bins) of its beginning and end in the matrix
    :param savefig: path to a file where to save the image generated
    :param show: displays the plot (only used if savefig is not set)
    :param False one: do not label the y axis with chromosome names
    :param None clim: tuple with minimum and maximum value range for color
       scale (any of the two can be None)
    :param 'jet' cmap: name of a matplotlib color map, or 'tadbit' to build a
       color map based on the percentiles of the data
    :param False decay: also draws the plot of interactions versus genomic
       distance
    :param 10 perc: number of cuts used to build the 'tadbit' color map
    :param None name: text written on top of the summary of the data
    :param None cistrans: proportion of cis interactions (between 0 and 1);
       not reported if None or NaN
    :param 10000 decay_resolution: passed as resolution to
       plot_distance_vs_interactions
    :param False normalized: passed to plot_distance_vs_interactions
    :param None max_diff: passed to plot_distance_vs_interactions, defaults
       to the size of the matrix
    """
    _ = plt.figure(figsize=(15., 12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(data, genome_seq=genome_seq, axe=ax3,
                                      resolution=decay_resolution,
                                      max_diff=max_diff,
                                      normalized=normalized)
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)
    try:
        minoridata = np.nanmin(data)
        maxoridata = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata = np.min(vals)
        maxoridata = np.max(vals)
    # upper triangle only (diagonal included)
    totaloridata = np.nansum([data[i][j] for i in range(len(data))
                              for j in range(i, len(data))])
    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]

    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata
    # relative positions of the color limits in the range of the data
    posI = 0.01
    posF = 1.0
    if clim:
        if clim[0] is not None:
            posI = float(clim[0]) / diff
        if clim[1] is not None:
            posF = float(clim[1]) / diff
    if cmap == 'tadbit':
        cuts = perc
        cdict = {'red'  : [(0.0, 0.0, 0.0)],
                 'green': [(0.0, 0.0, 0.0)],
                 'blue' : [(0.0, 0.5, 0.5)]}
        prev_pos = 0
        median = (np.median(vals) - mindata) / diff
        # integer division: the number of steps of linspace has to be an int
        for prc in np.linspace(posI, median, cuts // 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1. / cuts
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, prc, prc])
            cdict['green'].append([pos, prc, prc])
            cdict['blue' ].append([pos, 1, 1])
            prev_pos = pos
        for prc in np.linspace(median + 1. / cuts, posF, cuts // 2,
                               endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - median) / (posF - median))
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, 1.0, 1.0])
            cdict['green'].append([pos, 1 - prc, 1 - prc])
            cdict['blue' ].append([pos, 1 - prc, 1 - prc])
            prev_pos = pos
        pos = (np.percentile(vals, 97.) - mindata) / diff
        cdict['red'  ].append([pos, 0.1, 0.1])
        cdict['green'].append([pos, 0, 0])
        cdict['blue' ].append([pos, 0, 0])

        cdict['red'  ].append([1.0, 1, 1])
        cdict['green'].append([1.0, 1, 1])
        cdict['blue' ].append([1.0, 0, 0])
        cmap = LinearSegmentedColormap(cmap, cdict)
        # color limits are already included in the color map
        clim = None
    else:
        cmap = plt.get_cmap(cmap)
    cmap.set_bad('darkgrey', 1)

    ax1.imshow(data, interpolation='none',
               cmap=cmap, vmin=clim[0] if clim else None,
               vmax=clim[1] if clim else None)
    size = len(data)
    # NaNs are replaced by zeros (symmetrically) before the decomposition
    for i in range(size):
        for j in range(i, size):
            if np.isnan(data[i][j]):
                data[i][j] = 0
                data[j][i] = 0
    evals, evect = eigh(data)
    # eigenvectors are stored as columns: columns are sorted according to
    # their eigenvalues
    evect = evect[:, evals.argsort()]
    data = [i for d in data for i in d if not np.isnan(i)]
    datamin = np.nanmin(data)
    datamax = np.nanmax(data)
    gradient = np.linspace(datamin, datamax, size)
    gradient = np.vstack((gradient, gradient))
    # NOTE(review): recent matplotlib releases replaced 'normed' by
    # 'density' -- confirm the matplotlib versions to be supported.
    h = ax2.hist(data, color='darkgrey', linewidth=2,
                 bins=20, histtype='step', normed=True)
    _ = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                   extent=(datamin, datamax, 0, max(h[0])))
    if genome_seq:
        for crm in genome_seq:
            beg = cumcs[crm][0] - .5
            end = cumcs[crm][1] - .5
            # white line under a dashed black one, around each chromosome
            ax1.vlines([beg, end], beg, end, color='w',
                       linestyle='-', linewidth=1, alpha=1)
            ax1.hlines([end, beg], beg, end, color='w',
                       linestyle='-', linewidth=1, alpha=1)
            ax1.vlines([beg, end], beg, end, color='k', linestyle='--')
            ax1.hlines([end, beg], beg, end, color='k', linestyle='--')
        if not one:
            vals = [0]
            keys = ['']
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])
            ax1.set_yticks(vals)
            ax1.set_yticklabels('')
            # names are written in between the limits of the chromosomes
            ax1.set_yticks([float(vals[i] + vals[i + 1]) / 2
                            for i in range(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False
    # cistrans is optional: nothing is written if not available
    if cistrans is None or np.isnan(cistrans):
        cis_text = ''
    else:
        cis_text = ('Percentage of cis interactions: %.0f%%\n'
                    % (cistrans * 100))
    plt.figtext(0.05, 0.25, ''.join([
        (name + '\n') if name else '',
        'Number of interactions: %s\n' % str(totaloridata),
        cis_text,
        'Min interactions: %s\n' % (minoridata),
        'Max interactions: %s\n' % (maxoridata)]))
    ax2.set_xlim((datamin, datamax))
    ax2.set_ylim((0, max(h[0])))
    ax1.set_xlim((-0.5, size - .5))
    ax1.set_ylim((-0.5, size - .5))
    ax2.set_xlabel('log interaction count')
    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d * 100)) / 100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, 'w.', markersize=2.5, alpha=.4)
    ax2.plot(subdata, normfit, 'k.', markersize=1.5, alpha=1)
    ax2.set_title('skew: %.3f, kurtosis: %.3f' % (skew(data),
                                                   kurtosis(data)))
    ax4.vlines(range(size), 0, evect[:, -1], color='k')
    ax4.hlines(0, 0, size, color='red')
    ax4.set_ylabel('E1')
    ax4.set_yticklabels([])
    try:
        ax5.vlines(range(size), 0, evect[:, -2], color='k')
    except IndexError:
        pass
    ax5.hlines(0, 0, size, color='red')
    ax5.set_ylabel('E2')
    ax5.set_yticklabels([])
    try:
        ax6.vlines(range(size), 0, evect[:, -3], color='k')
    except IndexError:
        pass
    ax6.hlines(0, 0, size, color='red')
    ax6.set_ylabel('E3')
    ax6.set_yticklabels([])
    xticklabels = (ax4.get_xticklabels() + ax5.get_xticklabels() +
                   ax6.get_xticklabels())
    plt.setp(xticklabels, visible=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
    plt.close('all')
def visualize(self, names=None, tad=None, focus=None, paint_tads=False,
              axe=None, show=True, logarithm=True, normalized=False,
              relative=True, decorate=True, savefig=None, clim=None,
              scale=(8, 6), cmap='jet'):
    """
    Visualize the matrix of Hi-C interactions of a given experiment

    :param None names: name of the experiment to visualize, or list of
       experiment names. If None, all experiments will be shown. An element
       of the list can itself be a pair of names, in which case both
       experiments are drawn in the same plot (first one 'up', second one
       'down')
    :param None tad: a given TAD in the form:
       ::

         {'start': start,
          'end'  : end,
          'brk'  : end,
          'score': score}

       **Alternatively** a list of the TADs can be passed (all the TADs
       between the first and last one passed will be showed. Thus, passing
       more than two TADs might be superfluous)
    :param None focus: a tuple with the start and end positions of the
       region to visualize
    :param False paint_tads: draw a box around the TADs defined for this
       experiment
    :param None axe: an axe object from matplotlib can be passed in order
       to customize the picture
    :param True show: either to pop-up matplotlib image or not
    :param True logarithm: show the logarithm values
    :param False normalized: show the normalized data (weights might have
       been calculated previously). *Note: white rows/columns may appear in
       the matrix displayed; these rows correspond to filtered rows (see*
       :func:`pytadbit.utils.hic_filtering.hic_filtering_for_modelling` *)*
    :param True relative: color scale is relative to the whole matrix of
       data, not only to the region displayed
    :param True decorate: draws color bar, title and axes labels
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param None clim: tuple with minimum and maximum value range for color
       scale. I.e. clim=(-4, 10)
    :param (8, 6) scale: width and height of each of the plots
    :param 'jet' cmap: color map passed to the view function of the
       experiments
    """
    if names is None:
        names = [xpr.name for xpr in self.experiments]
    if not isinstance(names, (list, tuple)):
        names = [names]
        cols = 1
        rows = 1
    else:
        # grid as square as possible, with enough cells for all the names
        sqrtxpr = sqrt(len(names))
        cols = int(round(sqrtxpr + (0.0 if int(sqrtxpr) == sqrtxpr else .5)))
        rows = int(sqrtxpr + .5)
    notaxe = axe is None
    if not scale:
        scale = (8, 6)
    figsize = (scale[0] * cols, scale[1] * rows)
    # a new subplot per experiment is only needed if no axe is given and
    # there is more than one thing to draw
    multiple = notaxe and len(names) != 1
    if multiple:
        fig = plt.figure(figsize=figsize)
    # the grid is filled row by row: the index of a name is also the
    # position of its plot
    for idx, name in enumerate(names):
        if multiple:
            axe = fig.add_subplot(rows, cols, idx + 1)
        if isinstance(name, (tuple, list)):
            # pair of experiments sharing the same plot
            if axe is None:
                fig = plt.figure(figsize=figsize)
                axe = fig.add_subplot(rows, cols, idx + 1)
            xpr1 = self.get_experiment(name[0])
            xpr2 = self.get_experiment(name[1])
            img = xpr1.view(tad=tad, focus=focus, paint_tads=paint_tads,
                            axe=axe, show=False, logarithm=logarithm,
                            normalized=normalized, relative=relative,
                            decorate=decorate, savefig=False,
                            where='up', clim=clim, cmap=cmap)
            # second half uses the color range of the first one if none
            # is imposed
            img = xpr2.view(tad=tad, focus=focus, paint_tads=paint_tads,
                            axe=axe, show=False, logarithm=logarithm,
                            normalized=normalized, relative=relative,
                            decorate=False, savefig=False, where='down',
                            clim=clim or img.get_clim(), cmap=cmap)
            if decorate:
                plt.text(1.01, .5,
                         'Chromosome %s experiment %s' % (
                             self.name, xpr2.name),
                         rotation=-90, va='center', size='large',
                         ha='left', transform=axe.transAxes)
        else:
            xper = self.get_experiment(name)
            # nothing to draw for this experiment
            if not xper.hic_data and not xper.norm:
                continue
            xper.view(tad=tad, focus=focus, paint_tads=paint_tads,
                      axe=axe, show=False, logarithm=logarithm,
                      normalized=normalized, relative=relative,
                      decorate=decorate, savefig=False, clim=clim,
                      cmap=cmap)
    if savefig:
        tadbit_savefig(savefig)
    if show:
        plt.show()
def plot_distance_vs_interactions(data, min_diff=1, max_diff=1000, show=False,
                                  genome_seq=None, resolution=None, axe=None,
                                  savefig=None, normalized=False):
    """
    Plot the number of interactions observed at each genomic distance (in
    log-log scale), and fit three straight lines to three consecutive
    ranges of distances.

    :param data: input file name, or HiC_data object or list of lists
    :param 1 min_diff: lower limit (in number of bins)
    :param 1000 max_diff: upper limit (in number of bins) to look for
    :param False show: displays the plot (only used if savefig is not set)
    :param None genome_seq: dictionary with the length, in bins, of each
       chromosome (only used when data is a list of lists)
    :param None resolution: group reads that are closer than this resolution
       parameter (1 if not set). With a resolution of 1 the limits of the
       three ranges are searched for the best fit, otherwise they are set
       at 0.7 Mb and 10 Mb
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False normalized: for HiC_data input, divide each count by the
       bias of both bins involved
    """
    resolution = resolution or 1
    dist_intr = dict((i, 0) for i in range(min_diff, max_diff))
    if isinstance(data, str):
        with open(data) as fhandler:
            for line in fhandler:
                if line.startswith('#'):
                    continue
                _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
                if cr1 != cr2:
                    continue
                # integer division: distances are counted in bins
                diff = abs(int(ps1) // resolution - int(ps2) // resolution)
                if max_diff > diff >= min_diff:
                    dist_intr[diff] += 1
    elif isinstance(data, HiC_data):
        if normalized:
            get_data = lambda x, y: data[x, y] / data.bias[x] / data.bias[y]
        else:
            get_data = lambda x, y: data[x, y]
        max_diff = min(len(data), max_diff)
        if data.section_pos:
            for crm in data.section_pos:
                for diff in range(min_diff, min(
                        (max_diff, 1 + data.chromosomes[crm]))):
                    for i in range(data.section_pos[crm][0],
                                   data.section_pos[crm][1] - diff):
                        dist_intr[diff] += get_data(i, i + diff)
        else:
            for diff in range(min_diff, max_diff):
                for i in range(len(data) - diff):
                    if not np.isnan(data[i, i + diff]):
                        # cell (i, i + diff): the one tested just above
                        dist_intr[diff] += get_data(i, i + diff)
    else:
        if genome_seq:
            max_diff = min(max(genome_seq.values()), max_diff)
            # position in the matrix of the first bin of the chromosome
            cnt = 0
            for crm in genome_seq:
                for diff in range(min_diff, min(
                        (max_diff, genome_seq[crm]))):
                    for i in range(cnt, cnt + genome_seq[crm] - diff):
                        if not np.isnan(data[i][i + diff]):
                            dist_intr[diff] += data[i][i + diff]
                cnt += genome_seq[crm]
        else:
            max_diff = min(len(data), max_diff)
            for diff in range(min_diff, max_diff):
                for i in range(len(data) - diff):
                    if not np.isnan(data[i][i + diff]):
                        dist_intr[diff] += data[i][i + diff]
    if not axe:
        fig = plt.figure()
        axe = fig.add_subplot(111)
    # remove last part of the plot in case no interaction is count... reduce
    # max_diff
    for diff in range(max_diff - 1, min_diff, -1):
        if dist_intr.get(diff):
            break
        dist_intr.pop(diff, None)
        max_diff -= 1
    # keep only distances at which something was observed (log scale)
    x = []
    y = []
    for dist in sorted(dist_intr):
        if dist_intr[dist]:
            x.append(dist)
            y.append(dist_intr[dist])
    axe.plot(x, y, 'k.')
    logx = np.log(x)
    logy = np.log(y)
    if resolution == 1:
        labels = ('1', '2', '3')
        # search for the limits of the three ranges (in hundredths of the
        # number of points) that maximize the sum of the r-squared of the
        # fits
        ntries = 100
        # number of points left out in between two ranges
        k = 1
        best = (float('-inf'), 0, 0, 0, 0, 0, 0, 0, 0, 0)
        for i in range(3, ntries - 2 - k):
            v1 = i * len(x) // ntries
            try:
                a1, b1, r21, _, _ = linregress(logx[:v1], logy[:v1])
            except ValueError:
                a1 = b1 = r21 = 0
            r21 *= r21
            for j in range(i + 1 + k, ntries - 2 - k):
                v2 = j * len(x) // ntries
                try:
                    a2, b2, r22, _, _ = linregress(logx[v1 + k:v2],
                                                   logy[v1 + k:v2])
                    a3, b3, r23, _, _ = linregress(logx[v2 + k:],
                                                   logy[v2 + k:])
                except ValueError:
                    a2 = b2 = r22 = 0
                    a3 = b3 = r23 = 0
                r2 = r21 + r22**2 + r23**2
                if r2 > best[0]:
                    best = (r2, v1, v2, a1, a2, a3, b1, b2, b3, k)
        (v1, v2, a1, a2, a3, b1, b2, b3, k) = best[1:]
    else:
        labels = (r'0-0.7 \mathrm{ Mb}', r'0.7-10 \mathrm{ Mb}',
                  r'10 \mathrm{ Mb}-\infty')
        # ranges are contiguous
        k = 0
        # from 0.7 Mb
        v1 = 700000 // resolution
        # to 10 Mb
        v2 = 10000000 // resolution
        try:
            a1, b1, r21, _, _ = linregress(logx[:v1], logy[:v1])
        except ValueError:
            a1, b1, r21 = 0, 0, 0
        try:
            a2, b2, r22, _, _ = linregress(logx[v1:v2], logy[v1:v2])
        except ValueError:
            a2, b2, r22 = 0, 0, 0
        try:
            a3, b3, r23, _, _ = linregress(logx[v2:], logy[v2:])
        except ValueError:
            a3, b3, r23 = 0, 0, 0

    def fitted(slope, intercept, xvals):
        # value of the line fitted in log-log space, back in linear space
        return np.exp(intercept + slope * np.array(np.log(xvals)))

    # plot line of best fit
    segments = ((x[:v1], a1, b1, 'yellow'),
                (x[v1 + k:v2], a2, b2, 'orange'),
                (x[v2 + k:], a3, b3, 'red'))
    for (xvals, slope, intercept, color), label in zip(segments, labels):
        axe.plot(xvals, fitted(slope, intercept, xvals), color=color, lw=2,
                 label=r'$\alpha_{%s}=%.2f$' % (label, slope))
    axe.set_ylabel('Log interaction count')
    axe.set_xlabel('Log genomic distance (resolution: %s)' % nicer(resolution))
    axe.legend(loc='lower left', frameon=False)
    axe.set_xscale('log')
    axe.set_yscale('log')
    axe.set_xlim((min_diff, max_diff))
    try:
        axe.set_ylim((0, max(y)))
    except ValueError:
        pass
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show:
        plt.show()
        plt.close('all')
def _bin_column_sums(cols, nbins):
    """
    Bins column sums over evenly spaced edges.

    :param cols: numpy array with the sum of each column; an empty array
       makes ``min`` raise a ValueError
    :param nbins: number of bin edges to generate between the minimum and
       the maximum of `cols`

    :returns: the bin edges, and a list with the number of columns that
       numpy.digitize assigned to each edge index (1 to `nbins`)
    """
    edges = np.linspace(min(cols), max(cols), nbins)
    which = np.digitize(cols, edges)
    counts = [sum(which == i) for i in range(1, nbins + 1)]
    return edges, counts


def _finish_column_hist(savefig):
    """
    Labels the axes of the current histogram, then saves it to `savefig`
    or, if no path is given, shows it.
    """
    plt.xlabel('Sum of interactions')
    plt.ylabel('Number of columns with a given value')
    if savefig:
        tadbit_savefig(savefig)
    else:
        plt.show()


def filter_by_mean(matrx, draw_hist=False, silent=False, bads=None,
                   savefig=None):
    """
    Fits the distribution of Hi-C interaction counts by column in the matrix
    to a polynomial. Then searches for a root of the derivative of this
    polynomial that is positive and lower than the 5th percentile of the
    column sums. Columns whose sum is lower than this root are labelled
    as bad.

    :param matrx: Hi-C matrix; ``len(matrx)`` is used as the number of
       columns, and cells are read with ``matrx.get(i + j * size, 0)``
    :param False draw_hist: draws the histogram of column sums with the
       fitted polynomial and the selected cutoff
    :param False silent: does not write warning messages to stderr
    :param None bads: dictionary of columns already known to be bad (keys
       are column indexes), they are skipped when computing the distribution
    :param None savefig: path to save the histogram; if None (and
       `draw_hist` is set) the plot is shown instead

    :returns: the dictionary of bad columns (column index as key, sum of the
       column as value), including the ones passed as input
    """
    nbins = 100
    if not bads:
        bads = {}
    size = len(matrx)
    # sorted sums of the columns, not counting rows/columns already
    # flagged as bad
    good = [i for i in range(size) if i not in bads]
    cols = np.array(sorted(sum(matrx.get(i + j * size, 0) for j in good)
                           for i in good))
    if not len(cols):
        warn('WARNING: no columns to filter out')
        return bads
    if draw_hist:
        plt.figure(figsize=(9, 9))
    percentile = np.percentile(cols, 5)
    # bin the sum of columns
    y, x = _bin_column_sums(cols, nbins)
    if draw_hist:
        plt.hist(cols, bins=nbins, alpha=.3, color='grey')
    xp = list(range(int(cols[-1])))
    try:
        # check if the binning is correct: we want at least half of the bins
        # with some data, otherwise drop the largest column sum and re-bin
        cnt = 0
        while list(x).count(0) > len(x) / 2:
            cnt += 1
            cols = cols[:-1]
            # raises ValueError once no column is left
            y, x = _bin_column_sums(cols, nbins)
            if draw_hist:
                plt.clf()
                plt.hist(cols, bins=nbins, alpha=.3, color='grey')
            xp = list(range(int(cols[-1])))
            if cnt > 10000:
                raise ValueError
        # find best polynomial fit in a given range of orders
        best = None
        for order in range(6, 18):
            z = np.polyfit(y, x, order)
            zp = np.polyder(z, m=1)
            roots = np.roots(zp)
            # check that we are concave down, otherwise take next root
            pente = np.polyval(zp, abs(roots[-2] - roots[-1]) / 2 + roots[-1])
            root = roots[-1] if pente > 0 else roots[-2]
            # root must be higher than zero and lower than the 5th percentile
            if root <= 0 or root >= percentile:
                continue
            p = np.poly1d(z)
            R2 = get_r2(p, x, y)
            # try to avoid very large orders by weighting negatively their fit
            if order > 13:
                R2 -= float(order) / 30
            if best is None or best[0] < R2:
                best = (R2, order, p, z, root)
        # best effort: any failure from here on only results in a warning
        try:
            if best is None:
                raise ValueError('no suitable polynomial fit found')
            _, _, p, _, root = best
            if draw_hist:
                xlims = plt.xlim()
                ylims = plt.ylim()
                a = plt.plot(xp, p(xp), "--", color='k')
                b = plt.vlines(root, 0, plt.ylim()[1], colors='r',
                               linestyles='dashed')
                labels = [
                    'polyfit \n%s' % (
                        ''.join([sub('e([-+][0-9]+)', 'e^{\\1}',
                                     '$%s%.1fx^%s$' % ('+' if j > 0 else '', j,
                                                       '{' + str(i) + '}'))
                                 for i, j in enumerate(list(p)[::-1])])),
                    'first solution of polynomial derivation']
                try:
                    plt.legend(a + [b], labels, fontsize='x-small')
                except TypeError:
                    # 'fontsize' keyword not accepted by plt.legend
                    plt.legend(a + [b], labels)
                plt.ylim([0, ylims[1]])
                plt.xlim(xlims)
                _finish_column_hist(savefig)
            # label as bad the columns with sums lower than the root
            for i in range(size):
                total = sum(matrx.get(i + j * size, 0) for j in range(size))
                if total < root:
                    bads[i] = total
            # now stored in Experiment._zeros, used for getting more accurate
            # z-scores
            if bads and not silent:
                stderr.write(('\nWARNING: removing columns having less than %s ' +
                              'counts:\n %s\n') % (
                                  round(root, 3), ' '.join(
                                      ['%5s' % str(i + 1) +
                                       ('' if (j + 1) % 20 else '\n')
                                       for j, i in enumerate(sorted(bads.keys()))])))
        except Exception:
            if not silent:
                stderr.write('WARNING: Too many zeroes to filter columns.' +
                             ' SKIPPING...\n')
            if draw_hist:
                _finish_column_hist(savefig)
    except ValueError:
        if not silent:
            stderr.write('WARNING: Too few data to filter columns based on ' +
                         'mean value.\n')
    if draw_hist:
        plt.close('all')
    return bads
def _count_mapped_reads(fnam):
    """
    Counts the mapped reads described in one file.

    Header lines (starting with '#') of the form '# MAPPED <key> <count>'
    are used when present. Otherwise each remaining line is counted under
    the integer found in its third tab-separated field from the end.

    :param fnam: input file name

    :returns: a dictionary of counts, and True if the counts were obtained
       from the individual lines instead of from the '# MAPPED' header lines
    """
    counts = {}
    with open(fnam) as fhandler:
        line = ''
        for line in fhandler:
            if not line.startswith('#'):
                break
            if line.startswith('# MAPPED '):
                itr, num = line.split()[2:]
                counts[int(itr)] = int(num)
        else:
            # empty file, or file with header lines only
            line = ''
        if counts:
            return counts, False
        while line:
            _, length, _, _ = line.rsplit('\t', 3)
            length = int(length)
            counts[length] = counts.get(length, 0) + 1
            line = next(fhandler, '')
    return counts, True


def plot_iterative_mapping(fnam1, fnam2, total_reads=None, axe=None,
                           savefig=None):
    """
    Plots the cumulative number (or proportion) of mapped reads of two files,
    by iteration number or by read length.

    :param fnam1: input file name for the first read
    :param fnam2: input file name for the second read
    :param total_reads: total number of reads in the initial FASTQ file, if
       given the counts plotted are divided by this number
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance; if given the curves are drawn on it, otherwise a new
       figure is created
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).

    :returns: a dictionary with, for each input file (keys 0 and 1), a
       dictionary with the cumulative number of reads per mapped length (or
       per iteration)
    """
    total_reads = total_reads or 1
    if axe:
        ax = axe
    else:
        ax = plt.figure().add_subplot(111)
    colors = ['olive', 'darkcyan']
    count_by_len = {}
    by_length = False
    for i, fnam in enumerate([fnam1, fnam2]):
        counts, from_lines = _count_mapped_reads(fnam)
        by_length = by_length or from_lines
        # convert counts into cumulative counts, in increasing key order
        lengths = sorted(counts)
        running = 0
        for k in lengths:
            running += counts[k]
            counts[k] = running
        count_by_len[i] = counts
        ax.plot(lengths, [float(counts[k]) / total_reads for k in lengths],
                label='read' + str(i + 1), linewidth=2, color=colors[i])
    if by_length:
        ax.set_xlabel('read length (bp)')
    else:
        ax.set_xlabel('Iteration number')
    if total_reads != 1:
        ax.set_ylabel('Proportion of mapped reads')
    else:
        ax.set_ylabel('Number of mapped reads')
    ax.legend(loc=4)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')
    return count_by_len
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 xlog=False):
    """
    Plots the distribution of dangling-ends lengths

    :param fnam: input file name; lines starting with '#' at the beginning
       of the file are skipped, remaining lines are tab-separated
    :param None savefig: path where to store the output images.
    :param None nreads: if given, parsing stops as soon as half this number
       of dangling-ends has been collected
    :param 99.9 max_size: top percentage of distances to consider, within the
       top 0.1% are usually found very long outliers.
    :param None axe: passed to setup_plot to obtain the axes to draw on; if
       None (and no `savefig`) the plot is shown
    :param False xlog: represent x axis in logarithmic scale

    :returns: the median value and the percentile inputed as max_size.
    """
    if nreads:
        # floor division, so that the comparison with len(des) stays
        # integer-based
        nreads //= 2
    des = []
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                if line.startswith('#'):
                    continue
                in_header = False
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1 != re2 or crm1 != crm2 or dir1 == dir2:
                continue
            pos1, pos2 = int(pos1), int(pos2)
            if (pos2 > pos1) == int(dir1):
                des.append(abs(pos2 - pos1))
                if len(des) == nreads:
                    break

    ax = setup_plot(axe, figsize=(10, 5.5))

    (perc01, perc05, perc50,
     perc95, perc99, max_perc) = np.percentile(des,
                                               [1, 5, 50, 95, 99, max_size])

    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc),
            alpha=.7, color='darkred', label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lengths\n(median: %s, top %.1f%%, up to %0.f nts)' % (
                     perc50, max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    # leave room on the right for the legend placed outside of the axes
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')
    return perc50, max_perc