def plot_contig_length_vs_GC(self, alpha=0.5):
    """Scatter the ``GC`` column against the ``length`` column of ``self.df``.

    :param alpha: transparency forwarded to the marker plot.
    """
    contig_lengths = self.df["length"]
    gc_values = self.df['GC']
    pylab.plot(contig_lengths, gc_values, "o", alpha=alpha)
    pylab.xlabel("contig length (bp)")
    pylab.ylabel("GC (%)")
    pylab.grid(True)
    pylab.ylim([0, 100])
    # keep a 10-unit margin to the right of the longest contig
    right_edge = max(self.df['length']) + 10
    pylab.xlim(0, right_edge)
def plot_specific_alignment(self, query_name, motif, clf=True, windows=[10, 50, 100, 200, 500, 1000]):
    """Plot sliding-window motif counts for one alignment of ``self.bamfile``.

    The BAM file is scanned entirely; when several alignments carry
    *query_name*, the last one encountered is used. For each window size a
    curve of motif counts is drawn and ``window, score / 3.`` is printed,
    where *score* is the number of positions whose count exceeds
    ``window / 6``.

    :param query_name: name of the alignment to look for.
    :param motif: motif counted in each window.
    :param clf: clear the current figure before plotting.
    :param windows: window sizes to scan (only read, never modified).
    """
    match = None
    for record in BAM(self.bamfile):
        if record.query_name == query_name:
            # no break here: a later record with the same name replaces this one
            match = record

    if not match:
        print("Not found")
        return

    sequence = match.query_sequence
    if clf:
        pylab.clf()

    for window in windows:
        counts = []
        for start in range(len(sequence)):
            counts.append(sequence[start:start + window].count(motif))
        pylab.plot(counts, label=window)
        cutoff = window / 6
        score = sum(1 for value in counts if value > cutoff)
        print(window, score / 3.)

    pylab.legend()
    pylab.ylabel("# {} in a given sliding window".format(motif))
    pylab.title(query_name)
def plot_bar(self, spikes_filename=None, ratio=100):
    """Bar plot of the spikes found, overlaid with scaled SIRV lengths.

    :param spikes_filename: forwarded to ``self.spikes_found``.
    :param ratio: divisor applied to the lengths before overlaying them.
    :return: the object returned by ``self.spikes_found``.
    """
    found = self.spikes_found(spikes_filename)
    # lengths are looked up before plotting so an unknown name fails early
    scaled_lengths = np.array([self.SIRV_lengths[name] for name in found.index]) / ratio
    found.plot(kind="bar")
    pylab.plot(scaled_lengths)
    pylab.tight_layout()
    return found
def plot_gc_content(self, fontsize=16, ec="k", bins=100):
    """Plot the GC content histogram together with a fitted normal density.

    :param fontsize: font size of the x-axis label.
    :param ec: edge colour of the histogram bars.
    :param bins: either a number of bins (edges are then spread evenly
        between 0 and 100) or an array-like of bin edges.

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam'))
        b.plot_gc_content()

    """
    data = self.get_gc_content()
    try:
        X = np.linspace(0, 100, bins)
    except (TypeError, ValueError):
        # bins is not a count: treat it as explicit bin edges (copied)
        X = np.array(bins, dtype=float)

    # 'normed' was removed from matplotlib's hist; 'density' replaces it
    pylab.hist(data, X, density=True, ec=ec)
    pylab.grid(True)

    mu = np.mean(data)
    sigma = np.std(data)
    X = np.linspace(X.min(), X.max(), 100)
    # pylab.normpdf no longer exists in matplotlib: compute the normal pdf here
    pdf = np.exp(-0.5 * ((X - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))
    pylab.plot(X, pdf, lw=2, color="r", ls="--")
    pylab.xlabel("GC content", fontsize=fontsize)
def plot_volcano(self):
    """Volcano plot of ``log2FoldChange`` against ``-log10(padj)``.

    Rows with ``padj>0.05`` are drawn in red, the others in black, and a
    dashed horizontal line marks ``-log10(0.05)``.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_volcano()

    """
    above = self.df.query("padj>0.05")
    below = self.df.query("padj<=0.05")

    pylab.figure()
    for subset, colour in ((above, "r"), (below, "k")):
        pylab.plot(subset.log2FoldChange, -np.log10(subset.padj),
                   marker="o", alpha=0.5, color=colour, lw=0)
    pylab.grid(True)
    pylab.xlabel("fold change")
    pylab.ylabel("log10 adjusted p-value")

    # symmetric x-range around zero
    limit = max(abs(min(self.df.log2FoldChange)), max(self.df.log2FoldChange))
    pylab.xlim([-limit, limit])

    top = pylab.ylim()[1]
    pylab.ylim([0, top])
    pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r",
                  label="pvalue threshold (0.05)")
def plot(self, color_line='r', bgcolor='grey', color='yellow', lw=4, hold=False, ax=None):
    """Plot mean quality per position with a +/- one standard deviation band.

    Three coloured background bands (0-20 red, 20-30 orange, 30-41 green)
    are drawn first, then the band ``mean +/- std`` of ``self.df`` and the
    mean itself.

    :param color_line: colour of the mean curve.
    :param bgcolor: accepted for compatibility; not used in this body.
    :param color: fill colour of the standard-deviation band.
    :param lw: line width of the mean curve.
    :param hold: accepted for compatibility; not used in this body.
    :param ax: optional axes made current before plotting.
    """
    xmax = self.xmax + 1
    if ax:
        pylab.sca(ax)

    for low, high, band in ((0, 20, 'red'), (20, 30, 'orange'), (30, 41, 'green')):
        pylab.fill_between([0, xmax], [low, low], [high, high], color=band, alpha=0.3)

    # X was previously left undefined when self.X was set, which raised
    # UnboundLocalError; fall back to 1..xmax only when no positions exist.
    X = range(1, self.xmax + 1) if self.X is None else self.X

    mean = self.df.mean()
    std = self.df.std()
    pylab.fill_between(X, mean + std, mean - std, color=color, interpolate=False)
    pylab.plot(X, mean, color=color_line, lw=lw)

    pylab.ylim([0, 41])
    pylab.xlim([0, self.xmax + 1])
    pylab.title("Quality scores across all bases")
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("Quality")
    pylab.grid(axis='x')
def plot_alignment(self, bamfile, motif, window=200, global_th=10,title=None,legend=True, legend_fontsize=11, valid_rnames=[], valid_flags=[]):
    """Plot sliding-window motif counts for the alignments of *bamfile*.

    :param bamfile: BAM file to read.
    :param motif: motif counted in each window.
    :param window: window length.
    :param global_th: accepted for compatibility; not used in this body.
    :param title: optional figure title.
    :param legend: draw a legend when fewer than 15 curves were plotted.
    :param legend_fontsize: font size of the legend.
    :param valid_rnames: if non-empty, keep only alignments whose ``rname``
        is listed.
    :param valid_flags: if non-empty, keep only alignments whose ``flag``
        is listed.
    """
    bam = BAM(bamfile)
    print("Found {} hits".format(len(bam)))
    pylab.clf()

    shown = 0
    for record in bam:
        # an empty filter list means "accept everything"
        rejected = (valid_rnames and record.rname not in valid_rnames) or (
            valid_flags and record.flag not in valid_flags)
        if rejected:
            continue
        sequence = record.query_sequence
        if not sequence:
            continue
        shown += 1
        counts = [sequence[pos:pos+window].count(motif) for pos in range(len(sequence))]
        first = record.reference_start
        pylab.plot(range(first, first + len(sequence)), counts, label=record.query_name)

    print("Showing {} entries after filtering".format(shown))

    # theoretical maximum of motif occurrences in a window, with 20% headroom
    pylab.ylim([0, int(1.2*window / len(motif))])
    if legend and shown < 15:
        pylab.legend(fontsize=legend_fontsize)
    if title:
        pylab.title(title, fontsize=16)
def find_motif(bamfile, motif="CAGCAG", window=200, savefig=False, local_th=5, global_th=10):
    """Scan the alignments of a BAM file for regions enriched in a motif.

    For each alignment that has a query sequence, the motif is counted in
    the window of *window* bases starting at every position. The score of
    an alignment is the number of positions whose count is strictly greater
    than *local_th*; the alignment is a hit when its score is strictly
    greater than *global_th*. Each hit is plotted on a cleared figure and
    optionally saved.

    :param bamfile: BAM file to read.
    :param motif: motif to count.
    :param window: window length.
    :param savefig: save one PNG per hit, named from the reference name,
        the score and the query name.
    :param local_th: per-window count threshold.
    :param global_th: score threshold.
    :return: ``(alns, found, Ss)``: the alignments examined, one boolean
        per alignment telling whether it is a hit, and the scores.
    """
    alns = []
    found = []
    Ss = []
    for a in BAM(bamfile):
        seq = a.query_sequence
        if seq is None:
            continue
        X1 = [seq[i:i + window].count(motif) for i in range(len(seq))]
        S = sum(x > local_th for x in X1)
        Ss.append(S)
        # was 'als.append(a)', an undefined name raising NameError
        alns.append(a)
        if S > global_th:
            found.append(True)
            off = a.query_alignment_start
            start = off + a.reference_start
            pylab.clf()
            pylab.plot(range(start, start + len(seq)), X1)
            if savefig:
                pylab.savefig("{}_{}_{}.png".format(
                    a.reference_name, S, a.query_name.replace("/", "_")))
        else:
            found.append(False)
    return alns, found, Ss
def plot_bar(self, spikes_filename=None, ratio=100):
    """Draw the spikes found as bars and overlay the SIRV lengths / *ratio*.

    :param spikes_filename: forwarded to ``self.spikes_found``.
    :param ratio: scaling divisor applied to the SIRV lengths.
    :return: the data returned by ``self.spikes_found``.
    """
    data = self.spikes_found(spikes_filename)
    lengths = []
    for spike in data.index:
        lengths.append(self.SIRV_lengths[spike])
    data.plot(kind="bar")
    overlay = np.array(lengths)/ratio
    pylab.plot(overlay)
    pylab.tight_layout()
    return data
def plot_specific_alignment(self, bamfile, query_name, motif,clf=True, show_figure=True, authorized_flags=[0,16], windows=[10, 50, 100, 150,200, 250,500, 1000], local_threshold=5):
    """Score (and optionally plot) motif counts for one alignment.

    The first alignment of *bamfile* whose name is *query_name* and whose
    flag is in *authorized_flags* is used. For each window size, the score
    is the number of positions whose motif count exceeds *local_threshold*.

    :param bamfile: BAM file to read.
    :param query_name: name of the alignment to look for.
    :param motif: motif counted in each window.
    :param clf: clear the current figure first.
    :param show_figure: draw the curves, legend, y-label and title.
    :param authorized_flags: accepted alignment flags.
    :param windows: window sizes to scan.
    :param local_threshold: per-window count threshold.
    :return: list of ``score - window``, one entry per window size; empty
        when no alignment matched.
    """
    # several entries may share the name: pick the first acceptable one
    match = next(
        (record for record in BAM(bamfile)
         if record.query_name == query_name and record.flag in authorized_flags),
        None,
    )

    sizes = []
    if not match:
        print("{} Not found in {} file".format(query_name, bamfile))
        return sizes

    sequence = match.query_sequence
    if clf:
        pylab.clf()
    for window in windows:
        counts = [sequence[pos:pos+window].count(motif) for pos in range(len(sequence))]
        if show_figure:
            pylab.plot(counts, label=window)
        score = sum(1 for value in counts if value > local_threshold)
        sizes.append(score - window)

    if show_figure:
        pylab.legend()
        pylab.ylabel("# {} in a given sliding window".format(motif))
        pylab.title(query_name)
    return sizes
def get_max_gc_correlation(self, reference):
    """Search the GC window size with ``scipy.optimize.fmin`` and plot the trace.

    The GC content is computed with a moving window of size W, which
    affects the correlation between coverage and GC. Every window size
    tried by the optimiser (10 or above) is recorded with its correlation
    and plotted.

    NOTE(review): ``fmin`` minimises the returned correlation; confirm this
    matches the intended notion of *optimal* window.

    :param reference: forwarded to ``self.bed.compute_gc_content``.
    :return: first component of the solution returned by ``fmin``.
    """
    from scipy.optimize import fmin

    pylab.clf()
    tried_sizes = []
    correlations = []

    def objective(params):
        size = int(round(params[0]))
        # window sizes below 10 are not evaluated
        if size < 10:
            return 0
        self.bed.compute_gc_content(reference, size)
        value = self.get_gc_correlation()
        correlations.append(value)
        tried_sizes.append(size)
        return value

    # the search starts from a window size of 100
    solution = fmin(objective, 100, xtol=1, disp=False)

    pylab.plot(tried_sizes, correlations, "o")
    pylab.xlabel("GC window size")
    pylab.ylabel("Correlation")
    pylab.grid()
    return solution[0]
def plot_idr_vs_peaks(self, filename=None, savefig=False):
    """Plot IDR as a function of the cumulative number of peaks.

    Peaks with ``idr`` below ``self.threshold`` are drawn in red, the
    others in black (shifted by the number of peaks below the threshold).
    A horizontal line at 0.05 and a vertical line at
    ``self.N_significant_peaks`` are added.

    :param filename: output file, used only when *savefig* is true.
    :param savefig: save the figure to *filename*.
    """
    # global_idr is actually -log10(idr)
    pylab.clf()
    X1 = pylab.linspace(0, self.threshold, 100)
    X2 = pylab.linspace(self.threshold, 1, 100)

    # the second query string had been garbled into 'idr>[email protected]';
    # it is the complement of the first selection
    df1 = self.df.query("idr<@self.threshold")
    df2 = self.df.query("idr>=@self.threshold")

    pylab.plot([sum(df1['idr'] < x) for x in X1], X1, '-', color='r', lw=2)
    shift = len(df1)
    pylab.plot([shift + sum(df2['idr'] < x) for x in X2], X2, "-", color='k', lw=2)

    pylab.xlabel('Number of significant peaks')
    pylab.ylabel('IDR')
    pylab.axhline(0.05, color='b', ls='--')
    pylab.axvline(self.N_significant_peaks, color='b', ls='--')
    if savefig:
        pylab.savefig(filename)
def plot_gc_content(self, fontsize=16, ec="k", bins=100):
    """Plot the GC content histogram together with a fitted normal density.

    :param fontsize: font size of the x-axis label.
    :param ec: edge colour of the histogram bars.
    :param bins: either a number of bins (edges are then spread evenly
        between 0 and 100) or an array of bin edges (with a copy() method).

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam'))
        b.plot_gc_content()

    """
    from sequana.misc import normpdf

    data = self.get_gc_content()
    try:
        X = np.linspace(0, 100, bins)
    except (TypeError, ValueError):
        # bins is not a count: use it as explicit bin edges
        X = bins.copy()

    pylab.hist(data, X, density=True, ec=ec)
    pylab.grid(True)

    mu = pylab.mean(data)
    sigma = pylab.std(data)
    X = pylab.linspace(X.min(), X.max(), 100)
    pylab.plot(X, normpdf(X, mu, sigma), lw=2, color="r", ls="--")
    # the fontsize argument was previously ignored (hard-coded 16)
    pylab.xlabel("GC content", fontsize=fontsize)
def check(self, bins=60):
    """Overlay the target distribution on the histogram of ``self.vec``.

    :param bins: bins argument forwarded to the histogram.
    """
    heights = pylab.hist(self.vec, bins, density=True, lw=0.5, ec="k")[0]
    peak = max(heights)
    # this normalisation is an approximation/hack: the target curve is
    # rescaled so that its maximum matches the highest bar
    scale = max(self.Ytarget) / peak
    pylab.plot(self.Xtarget, self.Ytarget / scale, "-ro")
    pylab.title("simulated (blue) and target (red) distributions")
def plot_polymerase_per_barcode(self, fontsize=12, unbarcoded=True):
    """Number Of Polymerase Reads Per Barcode

    Barcodes are ranked by decreasing ``Polymerase Reads``; the average is
    shown as a red line and, when available, the first not-barcoded value
    as a dashed black line.

    :param fontsize: font size of the axis labels.
    :param unbarcoded: also draw the not-barcoded level (only when exactly
        ``True``).
    """
    PR = self.df_barcoded["Polymerase Reads"].sum()
    data = self.df_barcoded['Polymerase Reads'].sort_values(
        ascending=False).values
    pylab.plot(list(range(1, len(data) + 1)), data, label="barcodes")
    pylab.axhline(data.mean(), color="r", label="average")

    if unbarcoded is True:
        # best effort: the not-barcoded table may be missing or empty.
        # Exception (not a bare except) so Ctrl-C is not swallowed.
        try:
            unbar = self.df_not_barcoded['Polymerase Reads'].iloc[0]
            pylab.axhline(unbar, color="k", ls="--", label="not barcoded")
        except Exception:
            pass

    pylab.xlabel("Barcode Rank Order", fontsize=fontsize)
    pylab.ylabel("Counts of Reads", fontsize=fontsize)
    pylab.title("Total Polymerase count: {}".format(PR))
    pylab.legend()
    pylab.ylim(ymin=0)
    # tight_layout may fail depending on the backend/layout; keep the figure
    try:
        pylab.tight_layout()
    except Exception:
        pass
def plot(self, fontsize=16):
    """Plot ``self.quality`` against base position.

    :param fontsize: font size of both axis labels.
    """
    curve_label = "offset: %s" % self.offset
    pylab.plot(self.quality, label=curve_label)
    pylab.xlabel('base position', fontsize=fontsize)
    pylab.ylabel('Quality per base', fontsize=fontsize)
    pylab.grid(True)

    # Setting ylim switches autoscale off; re-enable it first so repeated
    # calls still adapt to the new data before the limits are read back.
    pylab.autoscale()
    low, high = pylab.ylim()
    # one unit of padding on each side, never below zero
    pylab.ylim(max(0, low - 1), high + 1)
def plot(self, fontsize=16):
    """Draw the per-base quality curve, labelled with the offset.

    :param fontsize: font size used for the x and y labels.
    """
    pylab.plot(self.quality, label="offset: %s" % self.offset)
    for set_label, text in ((pylab.xlabel, 'base position'),
                            (pylab.ylabel, 'Quality per base')):
        set_label(text, fontsize=fontsize)
    pylab.grid(True)

    # ylim turns autoscale off, so it is switched back on before querying
    # the limits; this keeps repeated calls on the same axes consistent
    pylab.autoscale()
    current = pylab.ylim()
    bottom = max(0, current[0] - 1)
    top = current[1] + 1
    pylab.ylim(bottom, top)
def plot(self, clf=True):
    """Plot ``shustring_length`` with dashed guide lines every 1000 units.

    The maximum length is printed before plotting.

    :param clf: clear the current figure first.
    """
    if clf:
        pylab.clf()

    longest = self.df_shustring.shustring_length.max()
    print(longest)

    # one horizontal guide per started thousand, beginning at 0
    n_guides = int(longest / 1000) + 1
    for level in range(0, n_guides * 1000, 1000):
        pylab.axhline(level, ls='--', color='grey')

    pylab.plot(self.df_shustring.shustring_length)
    pylab.xlabel('position (bp)')
    pylab.ylabel('Length of repeats')
    pylab.ylim(bottom=0)
def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000, alpha=1, output_filename=None):
    """Subsample the simulated reads with a Metropolis-Hastings style filter.

    Each simulated read length is accepted with probability
    ``min(alpha, target(candidate) / target(current))``; accepted
    candidates become the current state. The per-read decision is stored
    in ``self.tokeep``. The chain and the histogram of accepted lengths
    (after *burn* samples) are plotted with the target curve.

    :param bins: bins used for the input histogram and the final plot.
    :param xmin: start of the range over which the target curve is drawn.
    :param xmax: end of that range.
    :param step: spacing of that range.
    :param burn: number of initial accepted samples left out of the histogram.
    :param alpha: upper bound of the acceptance probability.
    :param output_filename: if given, passed with ``self.tokeep`` to
        ``self.bam_simul.filter_bool``.
    """
    # compute histogram of the input reads once for all to be used
    # in the target_distribution method
    self.bins = bins
    # 'normed' has been removed from numpy.histogram; 'density' replaces it
    self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins, density=True)

    lengths = self.bam_simul.df.read_length.values
    self.tokeep = []
    vec = []
    x = self.bam.df.read_length.mean()
    for i in range(self.bam_simul.df.shape[0]):
        can = lengths[i]
        # acceptance probability
        aprob = min([
            alpha,
            self.target_distribution(can) / self.target_distribution(x)
        ])
        u = pylab.uniform(0, 1)
        if u < aprob:
            x = can
            vec.append(x)
            self.tokeep.append(True)
        else:
            self.tokeep.append(False)

    # plotting the results: theoretical curve
    x = pylab.arange(xmin, xmax, step)
    y = self.target_distribution(x)
    pylab.subplot(211)
    pylab.title('Metropolis-Hastings')
    pylab.plot(vec)
    pylab.subplot(212)
    # 'normed' has been removed from matplotlib's hist as well
    pylab.hist(vec[burn:], bins=bins, density=True)
    pylab.plot(x, y, 'r-')
    pylab.ylabel('Frequency')
    pylab.xlabel('x')
    pylab.legend(('PDF', 'Samples'))

    if output_filename is not None:
        self.bam_simul.filter_bool(output_filename, self.tokeep)
def plot(self, normed=True, N=1000, Xmin=None, Xmax=None, bins=50, color='red', lw=2, hist_kw={'color': '#5F9EA0', "edgecolor": "k"}, ax=None):
    """Plot the data histogram, the fitted model and each mixture component.

    :param normed: draw a density histogram (forwarded as ``density``).
    :param N: number of points at which the curves are evaluated.
    :param Xmin: lower bound of the curves; defaults to the data minimum.
    :param Xmax: upper bound of the curves; defaults to the data maximum.
    :param bins: histogram bins.
    :param color: colour of the full model curve.
    :param lw: line width of the full model curve.
    :param hist_kw: extra keyword arguments for the histogram (only
        unpacked here, never modified).
    :param ax: optional axes to draw on; the pylab state machine is used
        otherwise.
    """
    import scipy.stats as ss

    # Axes and pylab both expose hist() and plot(); choosing the target
    # once keeps the two code paths from drifting apart.
    target = ax if ax else pylab

    # The ax branch used to pass the removed 'normed' keyword while the
    # pylab branch already used 'density'.
    target.hist(self.data, density=normed, bins=bins, **hist_kw)

    if Xmin is None:
        Xmin = self.data.min()
    if Xmax is None:
        Xmax = self.data.max()
    X = pylab.linspace(Xmin, Xmax, N)

    target.plot(X, [self.model.pdf(x, self.results.x) for x in X],
                color=color, lw=lw)

    # The PIs must be normalised
    for i in range(self.k):
        mu = self.results.mus[i]
        sigma = self.results.sigmas[i]
        pi_ = self.results.pis[i]
        target.plot(X, pi_ * ss.norm.pdf(X, mu, sigma), 'k--', alpha=0.7, lw=2)
def plot_coverage(self):
    """Plot ``self.coverage``, computing it first if it is not available.

    Please use :class:`GenomeCov` for more sophisticated tools to plot the
    genome coverage.

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.plot_coverage()

    """
    try:
        self.coverage
    except:  # any failure to read the attribute triggers its computation
        self._set_coverage()

    coverage = self.coverage
    pylab.plot(coverage)
    pylab.xlabel("Coverage")
def plot_coverage(self):
    """Plot ``self.coverage``; ``set_fast_stats`` is called if it is missing.

    Please use :class:`GenomeCov` for more sophisticated tools to plot the
    genome coverage.

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.plot_coverage()

    """
    try:
        self.coverage
    except:  # any failure to read the attribute triggers the statistics pass
        self.set_fast_stats()

    values = self.coverage
    pylab.plot(values)
    pylab.xlabel("Coverage")
def boxplot_quality(self, color_line='r', bgcolor='grey', color='yellow', lw=4, hold=False, ax=None):
    """Plot mean quality per position (1 to 150) with a +/- one std band.

    The columns ``"0"`` to ``"41"`` of ``self.df`` are read as per-score
    counts for each position, and ``self.metadata['ReadNum']`` as the
    normalising count. ``self.xmax`` is set to 150. The lengths of the
    three computed series are printed.

    :param color_line: colour of the mean curve.
    :param bgcolor: accepted for compatibility; not used in this body.
    :param color: fill colour of the band.
    :param lw: line width of the mean curve.
    :param hold: accepted for compatibility; not used in this body.
    :param ax: optional axes made current before plotting.
    """
    # not sure why we have phred score from 0 to 41
    score_columns = [str(score) for score in range(42)]
    quality = self.df[score_columns]
    N = self.metadata['ReadNum']

    self.xmax = 150
    xmax = self.xmax + 1
    if ax:
        pylab.sca(ax)  # pragma no cover

    for low, high, band in ((0, 20, 'red'), (20, 30, 'orange'), (30, 41, 'green')):
        pylab.fill_between([0, xmax], [low, low], [high, high], color=band, alpha=0.3)

    positions, means, stds = [], [], []
    for pos in range(1, 151):
        row = quality.loc[pos]

        # mean uses (score + 1) weighted by the count of that score
        weighted = 0
        for key, count in row.items():
            weighted += (int(key) + 1) * count
        mean_quality = weighted / N

        positions.append(pos)
        means.append(mean_quality)

        proba = row / N
        spread = sum([(x - mean_quality) ** 2 * y for x, y in zip(range(42), proba)])
        stds.append(pylab.sqrt(spread))

    for series in (positions, means, stds):
        print(len(series))

    Q = np.array(means)
    X = np.array(positions)
    S = np.array(stds)
    pylab.fill_between(X, Q + S, Q - S, color=color, interpolate=False)
    pylab.plot(X, Q, color=color_line, lw=lw)

    pylab.ylim([0, 41])
    pylab.xlim([0, self.xmax + 1])
    pylab.title("Quality scores across all bases")
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("Quality")
    pylab.grid(axis='x')
def plot_read_length(self):
    """Plot occurrences of aligned read lengths.

    The legend reports the smallest and largest read length.

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("test.bam"))
        b.plot_read_length()

    """
    lengths, occurrences = self._get_read_length()
    legend_text = "min length:{}; max length:{}".format(min(lengths), max(lengths))
    pylab.plot(lengths, occurrences, label=legend_text)
    pylab.grid()
    pylab.xlabel("Read length", fontsize=16)
    pylab.legend()
def plot_read_length(self):
    """Draw the read-length distribution returned by ``_get_read_length``.

    The curve label shows the minimum and maximum read length.

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("test.bam"))
        b.plot_read_length()

    """
    X, Y = self._get_read_length()
    shortest = min(X)
    longest = max(X)
    pylab.plot(
        X,
        Y,
        label="min length:{}; max length:{}".format(shortest, longest),
    )
    pylab.grid()
    pylab.xlabel("Read length", fontsize=16)
    pylab.legend()
def diagnostics(self, bins=60, clear=True):
    """Three stacked panels: acceptance histogram, chain, and distributions.

    :param bins: bins used for both histograms.
    :param clear: clear the current figure first.
    """
    if clear:
        pylab.clf()

    # panel 1: histogram of self.aprob
    pylab.subplot(3, 1, 1)
    pylab.hist(self.aprob, bins=bins)
    pylab.title("Acceptation")

    # panel 2: the values of self.vec in order
    pylab.subplot(3, 1, 2)
    pylab.plot(self.vec)
    pylab.title("proposition")

    # panel 3: density histogram of self.vec with the rescaled target curve
    pylab.subplot(3, 1, 3)
    heights = pylab.hist(self.vec, bins, density=True, lw=0.5, ec="k")[0]
    peak = max(heights)
    # this normalisation is an approximation/hack
    scale = max(self.Ytarget) / peak
    pylab.plot(self.Xtarget, self.Ytarget / scale, "-ro")
    pylab.title("simulated (blue) and target (red) distributions")
def find_motif(self, motif, window=200, figure=False, savefig=False):
    """Tabulate motif hits for every alignment of ``self.bamfile``.

    For each alignment that has a query sequence, the *hit* value is the
    number of positions whose windowed motif count is at least
    ``self.local_threshold``. Alignments whose hit value reaches
    ``self.global_threshold`` are optionally plotted and saved. The number
    of rows with ``hit>5`` is printed.

    :param motif: motif to count.
    :param window: window length.
    :param figure: plot the counts of alignments above the global threshold.
    :param savefig: when plotting, also save a PNG per alignment.
    :return: DataFrame with columns ``query_name``, ``hit``, ``length``,
        ``start`` and ``end``.
    """
    # key order fixes the column order of the resulting DataFrame
    table = {
        "query_name": [],
        "hit": [],
        "length": [],
        "start": [],
        "end": [],
    }
    for record in BAM(self.bamfile):
        if record.query_sequence is None:
            continue
        sequence = record.query_sequence
        counts = [sequence[pos:pos + window].count(motif)
                  for pos in range(len(sequence))]
        hits = sum([value >= self.local_threshold for value in counts])

        table['query_name'].append(record.query_name)
        table['start'].append(record.reference_start)
        table['end'].append(record.reference_end)
        table['length'].append(record.rlen)
        table['hit'].append(hits)

        if hits < self.global_threshold:
            continue
        offset = record.query_alignment_start
        if figure:
            first = offset + record.reference_start
            pylab.plot(range(first, first + len(sequence)), counts)
            if savefig:
                pylab.savefig("{}_{}_{}.png".format(
                    record.reference_name, hits,
                    record.query_name.replace("/", "_")))

    df = pd.DataFrame(table)
    print(len(df.query("hit>5")))
    return df
def plot_alignment(self, motif, window=200, global_th=10, title=None, legend=True, legend_fontsize=11):
    """Plot alignments that match the motif.

    The alignments are selected with ``self._get_aligments``; each selected
    alignment of ``self.bamfile`` that has a sequence is drawn as a curve
    of windowed motif counts positioned at its reference start.

    :param motif: motif to count.
    :param window: window length.
    :param global_th: forwarded to ``self._get_aligments``.
    :param title: optional figure title.
    :param legend: draw a legend when fewer than 15 curves were plotted.
    :param legend_fontsize: font size of the legend.
    :return: the DataFrame returned by ``self._get_aligments``.
    """
    df = self._get_aligments(motif=motif, window=window, global_th=global_th)
    print("Found {} hits".format(len(df)))

    pylab.clf()
    drawn = 0
    for record in BAM(self.bamfile):
        if record.query_name not in df.query_name.values:
            continue
        sequence = record.query_sequence
        if not sequence:
            continue
        drawn += 1
        counts = []
        for pos in range(len(sequence)):
            counts.append(sequence[pos:pos + window].count(motif))
        first = record.reference_start
        pylab.plot(range(first, first + len(sequence)), counts,
                   label=record.query_name)

    # theoretical maximum of motif occurrences in a window, with 20% headroom
    pylab.ylim([0, int(1.2 * window / len(motif))])
    if legend and drawn < 15:
        pylab.legend(fontsize=legend_fontsize)
    if title:
        pylab.title(title, fontsize=16)
    return df
def plot_contig_length_vs_nreads(self, fontsize=16):
    """Log-log scatter of contig length against number of reads, with a fit.

    A least-squares line ``nread = m * length + c`` is fitted on contigs
    with more than 10 reads and a length above 100000, and drawn between
    the smallest and largest contig lengths.

    :param fontsize: font size of the axis labels.
    """
    # same as plot_scatter_contig_length_nread_cov
    if self._df is None:
        _ = self.get_df()
    pylab.clf()
    df = self._df

    m1 = df.length.min()
    M1 = df.length.max()

    pylab.loglog(df.length, df.nread, "o")
    pylab.xlabel("Contig length", fontsize=fontsize)
    pylab.ylabel("Contig N reads", fontsize=fontsize)
    pylab.grid()

    # evaluate the selection once instead of once per column
    selected = df.query("nread>10 and length>100000")
    X = selected['length']
    Y = selected['nread']

    A = np.vstack([X, np.ones(len(X))]).T
    # Series.as_matrix() was removed in pandas 1.0; .values is equivalent
    m, c = np.linalg.lstsq(A, Y.values)[0]

    x = np.array([m1, M1])
    pylab.plot(x, m * x + c, "o-r")
    pylab.tight_layout()
def plot(self, X=[0, 0.1, 0.2, 0.3, .4, .5, .6, .7, .8, .9, .95, .99, .999, 1], fontsize=16, label=None):
    """Plot the percentage of genes covered above each coverage threshold.

    For every threshold in *X*, the y value is the percentage of rows of
    ``self.df`` whose ``self.coverage_column`` is strictly greater than the
    threshold; the x axis shows the threshold times 100.

    :param X: coverage thresholds (only read; converted to an array).
    :param fontsize: font size of the axis labels.
    :param label: optional curve label.
    """
    column = self.coverage_column
    total = float(len(self.df))
    thresholds = np.array(X)
    percentages = np.array(
        [sum(self.df[column] > value) / total * 100 for value in thresholds])

    # pass a label only when one was requested
    extra = {} if label is None else {"label": label}
    pylab.plot(thresholds * 100, percentages, "o-", **extra)

    pylab.xlabel("Gene coverage (%)", fontsize=fontsize)
    pylab.ylabel("Percentage of genes covered", fontsize=fontsize)

    # quartile guides on both axes
    for quartile in (25, 50, 75):
        pylab.axhline(quartile, color="r", alpha=0.5, ls="--")
        pylab.axvline(quartile, color="r", alpha=0.5, ls="--")
def plot_ranks(self, filename=None, savefig=False):
    """Scatter the peak ranks of replicate 1 against replicate 2.

    Peaks with ``score>540`` are black, the others red; a dashed diagonal
    from ``(0, 0)`` to ``(N, N)`` is added, N being the number of peaks.

    :param filename: output file, used only when *savefig* is true.
    :param savefig: save the figure to *filename*.
    """
    # The *score* column contains the scaled IDR value,
    # min(int(log2(-125IDR), 1000): peaks with an IDR of 0 have a score of
    # 1000, an IDR of 0.05 gives int(-125log2(0.05)) = 540, and an IDR of
    # 1.0 gives 0.
    significant = self.df.query('score>540')
    others = self.df.query('score<=540')

    pylab.clf()
    for subset, fmt, text in ((significant, 'ko', '<0.05 IDR'),
                              (others, 'ro', '>=0.05 IDR')):
        pylab.plot(subset.rep1_rank, subset.rep2_rank, fmt, alpha=0.5, label=text)
    pylab.xlabel("Peak rank - replicate 1")
    pylab.ylabel("Peak rank - replicate 2")

    n_peaks = len(self.df)
    pylab.plot([0, n_peaks], [0, n_peaks], color='blue', alpha=0.5, ls='--')
    pylab.legend(loc='lower right')

    if savefig:
        pylab.savefig(filename)
def plot_dispersion(self):
    """Plot the three dispersion columns of ``self.dds_stats`` on log-log axes.

    ``dispGeneEst``, ``dispersion`` and ``dispFit`` are each drawn against
    ``baseMean``; titles and labels are set through ``self._format_plot``.
    """
    series = (
        ("dispGeneEst", "ok", "Estimate"),
        ("dispersion", "ob", "final"),
        ("dispFit", "or", "Fit"),
    )
    for column, fmt, text in series:
        pylab.plot(
            self.dds_stats.baseMean,
            getattr(self.dds_stats, column),
            fmt,
            label=text,
            ms=1,
        )
    pylab.legend()

    axes = pylab.gca()
    for scale_name in ("yscale", "xscale"):
        axes.set(**{scale_name: "log"})

    self._format_plot(
        title="Dispersion estimation",
        xlabel="Mean of normalized counts",
        ylabel="Dispersion",
    )
def plot_scatter_contig_length_nread_cov(self, fontsize=16, vmin=0, vmax=50, min_nreads=20, min_length=50000):
    """Scatter contig length against number of reads, coloured by ``covStat``.

    A least-squares line ``nread = m * length + c`` is fitted on contigs
    with more than *min_nreads* reads and a length above *min_length*, then
    drawn over the scatter of all contigs on log-log axes. The selected
    length, nread and covStat series are printed.

    :param fontsize: font size of the axis labels.
    :param vmin: lower bound of the colour scale.
    :param vmax: upper bound of the colour scale.
    :param min_nreads: read-count cut-off of the contigs used for the fit.
    :param min_length: length cut-off of the contigs used for the fit.
    """
    if self._df is None:
        _ = self.get_df()
    pylab.clf()
    df = self._df

    m1 = df.length.min()
    M1 = df.length.max()

    # least square: evaluate the selection once instead of once per column
    selected = df.query("nread>@min_nreads and length>@min_length")
    X = selected['length']
    Y = selected['nread']
    Z = selected['covStat']
    print(X)
    print(Y)
    print(Z)

    A = np.vstack([X, np.ones(len(X))]).T
    # Series.as_matrix() was removed in pandas 1.0; .values is equivalent
    m, c = np.linalg.lstsq(A, Y.values)[0]
    x = np.array([m1, M1])

    # the scatter itself shows every contig, not only the fitted ones
    X = df['length']
    Y = df['nread']
    Z = df['covStat']
    pylab.scatter(X, Y, c=Z, vmin=vmin, vmax=vmax)
    pylab.colorbar()
    pylab.xlabel("Contig length", fontsize=fontsize)
    pylab.ylabel("Contig reads", fontsize=fontsize)
    pylab.title("coverage function of contig length and reads used")
    pylab.grid()
    pylab.plot(x, m * x + c, "o-r")
    pylab.loglog()
    pylab.tight_layout()
def plot_pca_vs_max_features(self, step=100, n_components=2, progress=True):
    """Plot the per-component values returned by ``self.plot`` against ``max_features``.

    ``self.plot`` is called (without showing the figure) for
    ``max_features`` going from 10 up to the number of rows of ``self.df``
    in steps of *step*; one panel per component shows the returned values
    multiplied by 100.

    :param step: increment of ``max_features`` (capped at the number of rows).
    :param n_components: number of components, 2, 3 or 4.
    :param progress: animate a progress bar.

    .. plot::
        :include-source:

        from sequana.viz.pca import PCA
        from sequana import sequana_data
        import pandas as pd

        data = sequana_data("test_pca.csv")
        df = pd.read_csv(data)
        df = df.set_index("Id")
        p = PCA(df)
        p.plot_pca_vs_max_features()

    """
    assert n_components in [2,3,4]
    n_rows = len(self.df)
    step = n_rows if step > n_rows else step

    # the scan starts at 10 features
    feature_counts = range(10, n_rows, step)

    from easydev import Progress
    bar = Progress(len(feature_counts))

    results = []
    for done, max_features in enumerate(feature_counts, start=1):
        results.append(self.plot(n_components=n_components,
                                 max_features=max_features,
                                 show_plot=False))
        if progress:
            bar.animate(done)

    labels = ("PC1 (%)", "PC2 (%)", "PC3 (%)", "PC4 (%)")
    for rank, text in enumerate(labels, start=1):
        # the first two panels are always drawn, the others only on request
        if rank > 2 and n_components < rank:
            continue
        pylab.subplot(n_components, 1, rank)
        pylab.plot(feature_counts, [res[rank - 1]*100 for res in results])
        pylab.ylabel(text)
def plot(self):
    """Plot reads in peaks against FRiP, one marker series per label.

    When ``self.design`` is set, the ``label`` column of ``self.df`` is
    first rebuilt as ``type/condition`` from the design table. A dashed
    line joins the origin to the maximum of both columns.
    """
    if self.design:
        design_table = self.design.df
        self.df['label'] = design_table['type'] + "/" + design_table['condition']

    pylab.clf()
    corner_x = self.df.FRiP.max()
    corner_y = self.df['in_peaks'].max()
    pylab.plot([0, corner_x], [0, corner_y], ls='--', color='b', alpha=0.5)

    # the loop variable must stay named 'label': the query string refers to it
    for label in self.df['label'].unique():
        subset = self.df.query('label==@label')
        subset.plot(x='FRiP', y='in_peaks', marker="o", lw=0, label=label,
                    ax=pylab.gca())

    pylab.ylabel('Reads in peaks')
    pylab.xlabel('FRiP')
    # anchor both axes at zero, keeping the automatic upper limits
    x_top = pylab.xlim()[1]
    pylab.xlim(0, x_top)
    y_top = pylab.ylim()[1]
    pylab.ylim(0, y_top)
    pylab.grid()
def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,alpha=1,output_filename=None):
    """Subsample the simulated reads with a Metropolis-Hastings style filter.

    Each simulated read length is accepted with probability
    ``min(alpha, target(candidate) / target(current))``; accepted
    candidates become the current state. The per-read decision is stored
    in ``self.tokeep``, and the chain and its histogram (after *burn*
    samples) are plotted with the target curve.

    :param bins: bins used for the input histogram and the final plot.
    :param xmin: start of the range over which the target curve is drawn.
    :param xmax: end of that range.
    :param step: spacing of that range.
    :param burn: number of initial accepted samples left out of the histogram.
    :param alpha: upper bound of the acceptance probability.
    :param output_filename: if given, passed with ``self.tokeep`` to
        ``self.bam_simul.filter_bool``.
    """
    # The histogram of the input reads is computed once; it is stored on
    # the instance for use by the target_distribution method.
    self.bins = bins
    self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins, density=True)

    candidates = self.bam_simul.df.read_length.values
    self.tokeep = []
    chain = []
    current = self.bam.df.read_length.mean()
    for index in range(self.bam_simul.df.shape[0]):
        candidate = candidates[index]
        ratio = self.target_distribution(candidate)/self.target_distribution(current)
        acceptance = min(alpha, ratio)
        draw = pylab.uniform(0,1)
        if not draw < acceptance:
            self.tokeep.append(False)
            continue
        current = candidate
        chain.append(current)
        self.tokeep.append(True)

    # theoretical curve over the requested range
    grid = pylab.arange(xmin, xmax, step)
    target = self.target_distribution(grid)

    pylab.subplot(211)
    pylab.title('Metropolis-Hastings')
    pylab.plot(chain)

    pylab.subplot(212)
    pylab.hist(chain[burn:], bins=bins, density=1)
    pylab.plot(grid, target, 'r-')
    pylab.ylabel('Frequency')
    pylab.xlabel('x')
    pylab.legend(('PDF','Samples'))

    if output_filename is not None:
        self.bam_simul.filter_bool(output_filename, self.tokeep)