def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
    """Histogram of the consensus isoform read lengths.

    The current figure is cleared, the read lengths are binned and, on a
    twin y-axis labelled "CDF", a black curve shows the total number of
    reads minus the cumulative bin counts.

    :param str mode: "all" uses the low and high quality reads together,
        "lq" the low quality reads only; any other value (e.g. "hq")
        selects the high quality reads only.
    :param bins: forwarded to :func:`pylab.hist`
    :param rwidth: forwarded to :func:`pylab.hist`
    :param align: forwarded to :func:`pylab.hist`
    :param int fontsize: font size of the axis labels
    :param edgecolor: forwarded to :func:`pylab.hist` as *ec*
    :param kwargs: any other keyword is forwarded to :func:`pylab.hist`
    """
    pylab.clf()

    lq_lengths = [len(read['sequence']) for read in self.lq_sequence]
    hq_lengths = [len(read['sequence']) for read in self.hq_sequence]
    if mode == "lq":
        lengths = lq_lengths
    elif mode == "all":
        lengths = lq_lengths + hq_lengths
    else:
        lengths = hq_lengths

    counts, edges, _ = pylab.hist(lengths, bins=bins, rwidth=rwidth,
                                  align=align, ec=edgecolor, **kwargs)
    main_axes = pylab.gca()
    main_axes.set_ylim(bottom=0)
    main_axes.set_xlim(left=0)
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("Number of reads", fontsize=fontsize)
    pylab.grid()

    # Second y-axis: reads left once the cumulative counts are removed
    twin_axes = main_axes.twinx()
    half_bin = (edges[1] - edges[0]) / 2
    remaining = len(lengths) - pylab.cumsum(counts)
    twin_axes.plot(edges[0:-1] - half_bin, remaining, "k")
    twin_axes.set_ylim(bottom=0)
    pylab.ylabel("CDF", fontsize=fontsize)
    pylab.title("Read length of Consensus isoforms reads")
def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
    """Plot the read length distribution of the consensus isoforms.

    Clears the current figure, draws the histogram of the selected read
    lengths and adds, on a twin y-axis labelled "CDF", the total number of
    reads minus the cumulative bin counts.

    :param str mode: "all" (low + high quality reads), "lq" (low quality
        reads). Any other value, such as "hq", selects the high quality
        reads.
    :param bins: forwarded to :func:`pylab.hist`
    :param rwidth: forwarded to :func:`pylab.hist`
    :param align: forwarded to :func:`pylab.hist`
    :param int fontsize: font size used for the axis labels
    :param edgecolor: forwarded to :func:`pylab.hist` as *ec*
    :param kwargs: extra keywords forwarded to :func:`pylab.hist`
    """
    pylab.clf()

    low_quality = [len(item['sequence']) for item in self.lq_sequence]
    high_quality = [len(item['sequence']) for item in self.hq_sequence]
    if mode == "all":
        selected = low_quality + high_quality
    else:
        selected = low_quality if mode == "lq" else high_quality

    heights, bin_edges, _ = pylab.hist(selected, bins=bins, rwidth=rwidth,
                                       align=align, ec=edgecolor, **kwargs)
    hist_axes = pylab.gca()
    hist_axes.set_ylim(bottom=0)
    hist_axes.set_xlim(left=0)
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("Number of reads", fontsize=fontsize)
    pylab.grid()

    # Curve on the twin axis, shifted by half a bin width
    curve_axes = hist_axes.twinx()
    offset = (bin_edges[1] - bin_edges[0]) / 2
    curve_axes.plot(bin_edges[0:-1] - offset,
                    len(selected) - pylab.cumsum(heights), "k")
    curve_axes.set_ylim(bottom=0)
    pylab.ylabel("CDF", fontsize=fontsize)
    pylab.title("Read length of Consensus isoforms reads")
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="SNR", ylabel="#", title="", clip_upper_SNR=30):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: number of bin edges for the histogram. Note that the
        range starts at 0 and ends at the largest SNR found, capped at
        *clip_upper_SNR*
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid
    :param str xlabel:
    :param str ylabel:
    :param str title:
    :param clip_upper_SNR: SNR values above this threshold are set to the
        threshold before being binned

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    # Largest SNR over the four bases, capped at clip_upper_SNR
    maxSNR = 0
    for letter in "ACGT":
        m = self._df.loc[:, "snr_{}".format(letter)].max()
        if m > maxSNR:
            maxSNR = m
    if maxSNR > clip_upper_SNR:
        maxSNR = clip_upper_SNR

    bins = pylab.linspace(0, maxSNR, bins)

    # Series.clip_upper was removed in pandas 1.0; clip(upper=...) is the
    # equivalent call and is available in older releases as well.
    for letter in "ACGT":
        pylab.hist(self._df.loc[:, "snr_{}".format(letter)].clip(upper=maxSNR),
                   alpha=alpha, label=letter, bins=bins)
    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def hist_read_length(self, bins=80, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Read Length", ylabel="#", label="", title=None,
        logy=False, ec="k", hist_kwargs=None):
    """Plot histogram Read length

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
    :param int fontsize:
    :param bool grid:
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: if not provided, a title showing the mean read
        length is used
    :param bool logy: forwarded to the histogram plot as *log*
    :param ec: edge color of the bars
    :param dict hist_kwargs: extra keyword arguments forwarded to the
        ``plot`` method of the HistCumSum helper (none by default)

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_read_length()

    """
    # None instead of a shared mutable {} default
    if hist_kwargs is None:
        hist_kwargs = {}

    read_lengths = self.df.loc[:, 'read_length']
    mean_len = np.mean(read_lengths)

    # set title if not provided
    if title is None:
        title = "Read length \n Mean length : %.2f" % (mean_len)

    if hold is False:
        pylab.clf()

    hist = HistCumSum(read_lengths, fontsize=fontsize, grid=grid)
    hist.title = title
    hist.xlabel = xlabel
    hist.ylabel = ylabel
    hist.plot(bins=bins, alpha=alpha, edgecolor=ec,
              label="%s, mean : %.0f, N : %d" % (label, mean_len, len(self)),
              log=logy, **hist_kwargs)
    pylab.gca().set_ylim(bottom=0)
    pylab.gca().set_xlim(left=0)
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="SNR", ylabel="#", title=""):
    """Overlay the histograms of the A, C, G and T SNRs of all reads

    If the SNR columns hold no value (old pacbio format), a "no data"
    image is shown instead and the method returns.

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid
    :param str xlabel:
    :param str ylabel:
    :param str title:

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # the old pacbio format does not store any SNR
    has_snr = len(self._df['snr_A'].dropna()) != 0
    if not has_snr:
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    # one histogram per base, all sharing the same binning
    for base in "ACGT":
        pylab.hist(self._df.loc[:, 'snr_' + base], alpha=alpha,
                   label=base, bins=bins)
    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=None):
    """Scatter plot of the enriched gene sets.

    Each gene set is drawn at x = -log10(adjusted p-value) on its own row;
    the marker area and colour both follow the gene-set size. A vertical
    dashed line is drawn at 1.3, i.e. an adjusted p-value of about 0.05.

    :param enrich: object whose ``results`` attribute is passed to
        ``self._get_final_df``
    :param float cutoff: forwarded to ``self._get_final_df``
    :param int nmax: forwarded to ``self._get_final_df``
    :param gene_set_size: unused; kept for backward compatibility
    :return: the dataframe that was plotted
    """
    df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

    pylab.clf()
    pylab.scatter(-pylab.log10(df['Adjusted P-value']), range(len(df)),
                  s=10 * df['size'], c=df['size'])

    # The x values are -log10(adjusted p-value), not an odds ratio
    pylab.xlabel("Adjusted p-value (-log10)")
    pylab.ylabel("Gene sets")
    pylab.yticks(range(len(df)), df.name)
    _, xmax = pylab.xlim()
    pylab.xlim([0, xmax])
    pylab.grid(True)
    ax = pylab.gca()

    # Three reference markers for the gene-set size legend
    M = max(df['size'])
    if M > 100:
        l1, l2, l3 = "10", "100", str(M)
    else:
        l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

    handles = [
        pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
        pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
        pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls=""),
    ]
    ax.legend(handles=handles, loc="upper left", title="gene-set size")

    pylab.axvline(1.3, lw=2, ls="--", color="r")
    pylab.tight_layout()
    pylab.colorbar(pylab.gci())
    return df
def plot_dispersion(self):
    """Plot the dispersion columns of ``self.dds_stats`` against baseMean.

    Three series are drawn with small markers: the gene-wise estimate
    (black), the final dispersion (blue) and the fit (red). Both axes are
    then switched to a log scale, and the title and axis labels are
    delegated to ``self._format_plot``.
    """
    stats = self.dds_stats
    series = (
        ("dispGeneEst", "ok", "Estimate"),
        ("dispersion", "ob", "final"),
        ("dispFit", "or", "Fit"),
    )
    for column, fmt, legend_label in series:
        pylab.plot(stats.baseMean, getattr(stats, column), fmt,
                   label=legend_label, ms=1)
    pylab.legend()

    axes = pylab.gca()
    axes.set(yscale="log")
    axes.set(xscale="log")

    self._format_plot(
        title="Dispersion estimation",
        xlabel="Mean of normalized counts",
        ylabel="Dispersion",
    )
def plot_stacked_hist(self, output_filename=None, dpi=200, kind="barh",
        fontsize=10, edgecolor="k", lw=1, width=1, ytick_fontsize=10):
    """Stacked bar plot of the transposed dataframe from ``self.get_df()``.

    :param output_filename: if set, the figure is saved into this file
    :param int dpi: resolution used when saving the figure
    :param str kind: pandas plot kind. The axis labels are written for the
        default horizontal bars ("barh").
    :param int fontsize: font size of the axis labels
    :param edgecolor: edge color of the bars
    :param lw: line width of the bar edges
    :param width: width of the bars
    :param int ytick_fontsize: font size of the y tick labels
    """
    df = self.get_df()
    df.T.plot(kind=kind, stacked=True, edgecolor=edgecolor, lw=lw,
              width=width)

    pylab.xlabel("Percentage (%)", fontsize=fontsize)
    pylab.ylabel("Sample index/name", fontsize=fontsize)
    pylab.yticks(fontsize=ytick_fontsize)
    pylab.legend(title="kingdom")
    pylab.xlim([0, 100])
    if output_filename:
        pylab.savefig(output_filename, dpi=dpi)
def scatter_plot(self, filename=None, hold=False):
    """Scatter plot of the score versus the length of each ortholog

    One series is drawn per status (Complete, Fragmented, Duplicated);
    a status without any entry is skipped. Missing orthologs are not
    shown since there is no information about their contig.

    .. plot::
        :include-source:

        from sequana import BUSCO, sequana_data
        b = BUSCO(sequana_data("test_busco_full_table.tsv"))
        b.scatter_plot()

    :param filename: if set, the figure is saved into this file
    :param bool hold: if False (default), the current figure is cleared
    """
    if hold is False:
        pylab.clf()

    styles = (
        ("Complete", "green", "o"),
        ("Fragmented", "orange", "s"),
        ("Duplicated", "red", "x"),
    )
    for status, color, marker in styles:
        selected = self.df.Status == status
        if selected.sum() > 0:
            self.df[selected].plot(x="Length", y="Score", kind="scatter",
                                   color=color, ax=pylab.gca(),
                                   marker=marker, label=status)

    pylab.legend()
    pylab.grid()
    if filename:
        pylab.savefig(filename)
def plot_unknown_barcodes(self, N=20):
    """Horizontal bar plot of the most frequent unknown barcodes per lane.

    :param int N: number of barcodes to show; they are ranked by their
        total count summed over all lanes
    :return: the dataframe that was plotted (barcodes as rows, one
        "Lane X" column per lane)
    """
    from matplotlib import rc_context

    ub = self.data['UnknownBarcodes']
    df = pd.DataFrame({x['Lane']: x['Barcodes'] for x in ub})

    # if data is made of undetermined only, the dataframe is just made of
    # N lanes with one entry : unknown
    if "unknown" in df.index and len(df) == 1:
        df.loc['known'] = [0 for i in df.columns]

    S = df.sum(axis=1).sort_values(ascending=False).index[0:N]
    data = df.loc[S][::-1]
    data.columns = ["Lane {}".format(x) for x in data.columns]

    # axes.axisbelow is read when the axes are created. The override is
    # scoped so the caller's rcParams value is restored afterwards rather
    # than being forced to False.
    with rc_context({'axes.axisbelow': True}):
        pylab.figure(figsize=(10, 8))
        ax = pylab.gca()
        data.plot(kind="barh", width=1, ec="k", ax=ax)

    pylab.xlabel("Number of reads", fontsize=12)
    pylab.ylabel("")
    pylab.grid(True)
    pylab.legend(
        ["Lane {}".format(x) for x in range(1, len(df.columns) + 1)],
        loc="lower right")
    try:
        pylab.tight_layout()
    except Exception as err:
        # tight_layout is cosmetic only: report and carry on
        print(err)
    return data
def barplot_per_sample(self, alpha=0.5, width=0.8, filename=None):
    """Horizontal bar plot of the number of reads per sample.

    The reads returned by ``self.get_data_reads()`` are summed per sample
    name. The samples are drawn as one series and the "Undetermined"
    reads as a second, red series on the same axes.

    :param float alpha: transparency of the bars
    :param float width: width of the bars
    :param filename: if set, the figure is saved into this file (dpi=200)
    """
    df = self.get_data_reads()

    # this is ugly but will do the job for now
    under = df.query("name=='Undetermined'")
    others = df.query("name!='Undetermined'")

    under = under.groupby("name").sum().reset_index()
    others = others.groupby("name").sum().reset_index()

    under = under[["name", "count"]].set_index("name")
    others = others[["name", "count"]].set_index("name")

    all_data = others.sort_index(ascending=False)
    all_data.columns = ["samples"]

    # appended at the end
    all_data.loc['undetermined'] = 0

    # revert back
    all_data = all_data.loc[::-1]

    # just for legend
    under.columns = ['undetermined']

    # Switch to millions of reads when every series is large enough
    if all_data.sum().min() > 1e6:
        all_data /= 1e6
        under /= 1e6
        M = True
    else:
        M = False

    all_data.plot(kind="barh", alpha=alpha, zorder=1, width=width, ec='k')

    # pandas refuses to plot an empty frame: skip the undetermined series
    # when there are no such reads
    if not under.empty:
        under.plot(kind="barh", alpha=alpha, color="red", ax=pylab.gca(),
                   zorder=1, width=width, ec='k')

    pylab.ylim([-0.5, len(all_data) + 0.5])
    if len(all_data) < 100:
        pylab.yticks(range(len(all_data)), all_data.index)

    pylab.legend()
    pylab.grid(True, zorder=-1)
    if M:
        pylab.xlabel("Number of reads (M)")
    else:
        pylab.xlabel("Number of reads")
    try:
        pylab.tight_layout()
    except Exception:
        # tight_layout is cosmetic only; ignore layout failures but let
        # KeyboardInterrupt/SystemExit propagate
        pass
    if filename:
        pylab.savefig(filename, dpi=200)
def hist_read_length(self, bins=80, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Read Length", ylabel="#", label="", title=None,
        logy=False, ec="k", hist_kwargs=None):
    """Plot histogram Read length

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
    :param int fontsize:
    :param bool grid:
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: if not provided, a title showing the mean read
        length is used
    :param bool logy: forwarded to the histogram plot as *log*
    :param ec: edge color of the bars
    :param dict hist_kwargs: extra keyword arguments forwarded to the
        ``plot`` method of the HistCumSum helper (none by default)

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_read_length()

    """
    # None instead of a shared mutable {} default
    if hist_kwargs is None:
        hist_kwargs = {}

    read_lengths = self.df.loc[:, 'read_length']
    mean_len = np.mean(read_lengths)

    # set title if not provided
    if title is None:
        title = "Read length \n Mean length : %.2f" % (mean_len)

    if hold is False:
        pylab.clf()

    hist = HistCumSum(read_lengths, fontsize=fontsize, grid=grid)
    hist.title = title
    hist.xlabel = xlabel
    hist.ylabel = ylabel
    hist.plot(bins=bins, alpha=alpha, edgecolor=ec,
              label="%s, mean : %.0f, N : %d" % (label, mean_len, len(self)),
              log=logy, **hist_kwargs)
    pylab.gca().set_ylim(bottom=0)
    pylab.gca().set_xlim(left=0)
def plot(self):
    """Draw the clustered heatmap of ``self.data_df`` and its two legends.

    The column and row colour annotations come from
    ``self.sample_groups_col_df`` and ``self.gene_groups_col_df``. Any
    keyword stored in ``self.kwargs`` is forwarded to
    :func:`seaborn.clustermap`.

    :return: the object returned by :func:`seaborn.clustermap`
    """
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    # fully opaque grey for values the colormap flags as "bad"
    palette.set_bad("grey", 1.0)

    clustergrid = sns.clustermap(
        self.data_df,
        cmap=palette,
        col_colors=self.sample_groups_col_df,
        yticklabels=self.yticklabels,
        row_colors=self.gene_groups_col_df,
        **self.kwargs,
    )

    # one legend per colour dictionary; the last argument is handed to
    # self._do_legend unchanged
    current_axes = pylab.gca()
    self._do_legend(current_axes, self.sample_color_dict, (12, 2))
    self._do_legend(current_axes, self.gene_color_dict, (-2, -2))

    return clustergrid
def plot_hist_coverage(self, logx=True, logy=True, fontsize=16, N=20,
        fignum=1, hold=False, alpha=0.5, filename=None, **kw_hist):
    """Plot the histogram of the coverage (``cov`` column, NaN dropped).

    :param bool logx: use N log-spaced bin edges between 1 and the maximum
        coverage, and a logarithmic x-axis
    :param bool logy: use a logarithmic scale for the counts
    :param int fontsize: font size of the axis labels
    :param int N: number of bins (number of bin edges when *logx* is True)
    :param fignum: figure number used when *hold* is False
    :param bool hold: if False (default), figure *fignum* is selected and
        cleared before plotting
    :param float alpha: transparency of the histogram
    :param filename: if set, the figure is saved into this file
    :param kw_hist: any other keyword is forwarded to :func:`pylab.hist`
    """
    if hold is False:
        pylab.figure(fignum)
        pylab.clf()
    ax = pylab.gca()
    ax.set_facecolor('#eeeeee')

    data = self.df['cov'].dropna().values

    maxcov = data.max()
    if logx is True and logy is True:
        bins = pylab.logspace(0, pylab.log10(maxcov), N)
        pylab.hist(data, bins=bins, log=True, label=self.chrom_name,
                   alpha=alpha, **kw_hist)
        pylab.semilogx()
        pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
        pylab.ylabel("Count (log scale)", fontsize=fontsize)
    elif logx is False and logy is True:
        pylab.hist(data, bins=N, log=True, label=self.chrom_name,
                   alpha=alpha, **kw_hist)
        pylab.xlabel("Coverage", fontsize=fontsize)
        pylab.ylabel("Count (log scale)", fontsize=fontsize)
    elif logx is True and logy is False:
        bins = pylab.logspace(0, pylab.log10(maxcov), N)
        # The log-spaced edges must be used here too; passing N gave
        # linear bins on a logarithmic x-axis
        pylab.hist(data, bins=bins, label=self.chrom_name, alpha=alpha,
                   **kw_hist)
        pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
        pylab.semilogx()
    else:
        pylab.hist(data, bins=N, label=self.chrom_name, alpha=alpha,
                   **kw_hist)
        pylab.xlabel("Coverage", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
    pylab.grid(True)
    if filename:
        pylab.savefig(filename)
def plot(self):
    """Plot the number of reads in peaks versus FRiP, one series per label.

    When ``self.design`` is set, the ``label`` column of ``self.df`` is
    (re)built as "<type>/<condition>" from the design dataframe. A dashed
    line joins the origin to the point (max FRiP, max in_peaks). Both
    axes are forced to start at 0.
    """
    if self.design:
        design_df = self.design.df
        self.df['label'] = design_df['type'] + "/" + design_df['condition']

    pylab.clf()

    # reference line from the origin to the two maxima
    frip_max = self.df.FRiP.max()
    in_peaks_max = self.df['in_peaks'].max()
    pylab.plot([0, frip_max], [0, in_peaks_max], ls='--', color='b',
               alpha=0.5)

    axes = pylab.gca()
    for label in self.df['label'].unique():
        subset = self.df.query('label==@label')
        subset.plot(x='FRiP', y='in_peaks', marker="o", lw=0, label=label,
                    ax=axes)

    pylab.ylabel('Reads in peaks')
    pylab.xlabel('FRiP')

    _, right = pylab.xlim()
    pylab.xlim(0, right)
    _, top = pylab.ylim()
    pylab.ylim(0, top)
    pylab.grid()
def _plot(self, Xr, pca=None, pc1=0, pc2=1, colors=None, show_labels=True):
    """Scatter plot of two columns of the reduced data *Xr*.

    :param Xr: 2D array; columns *pc1* and *pc2* are plotted
    :param pca: if provided, its ``explained_variance_ratio_`` attribute
        is used to label the axes
    :param int pc1: column of *Xr* shown on the x-axis
    :param int pc2: column of *Xr* shown on the y-axis
    :param dict colors: maps each label of ``self.labels`` to a color.
        Labels without an entry are drawn in red (with a warning). The
        dictionary is not modified. If None, ``self.colors`` is used.
    :param bool show_labels: annotate the points with ``self.labels``
        (only the first 101 points are annotated)
    """
    if colors is None:
        colors = [self.colors[k] for k in self.labels]
        if len(colors) != len(Xr):
            colors = ["r"] * len(Xr[:, 0])
    else:
        # Build the per-point list without writing the red fallback back
        # into the caller's dictionary
        point_colors = []
        for k in self.labels:
            if k not in colors:
                logger.warning("No key color for this sample: {}. Set to red".format(k))
                point_colors.append("r")
            else:
                point_colors.append(colors[k])
        colors = point_colors

    pylab.scatter(Xr[:, pc1], Xr[:, pc2], c=colors)
    ax = pylab.gca()

    # Pad both axes by 5% of their range. Scaling the limits themselves
    # (X1 + X1*0.05) shrinks the view whenever a lower limit is positive
    # or an upper limit is negative.
    X1, X2 = pylab.xlim()
    dX = X2 - X1
    pylab.xlim([X1 - dX * 0.05, X2 + dX * 0.05])

    Y1, Y2 = pylab.ylim()
    dY = Y2 - Y1
    pylab.ylim([Y1 - dY * 0.05, Y2 + dY * 0.05])

    if show_labels:
        points = zip(Xr[:, pc1], Xr[:, pc2])
        for count, (x, y) in enumerate(points):
            # keep the figure readable: stop after 101 annotations
            if count > 100:
                break
            ax.annotate(self.labels[count], (x + dX / 40, y + dY / 40))

    if pca:
        pylab.xlabel("PC{} ({}%)".format(
            pc1 + 1, round(pca.explained_variance_ratio_[pc1] * 100, 2)))
        pylab.ylabel("PC{} ({}%)".format(
            pc2 + 1, round(pca.explained_variance_ratio_[pc2] * 100, 2)))
    pylab.grid(True)
def plot_go_terms(self, ontologies, max_features=50, log=False, fontsize=8,
        minimum_genes=0, pvalue=0.05, cmap="summer_r",
        sort_by="fold_enrichment", show_pvalues=False,
        include_negative_enrichment=False, fdr_threshold=0.05,
        compute_levels=True, progress=True):
    """Scatter plot of the enriched GO terms.

    Each selected GO term is drawn on its own row at its fold enrichment;
    the marker size follows the number of genes in the list and the
    marker colour follows the FDR.

    :param ontologies: forwarded to ``self.get_data``
    :param int max_features: maximum number of GO terms shown
    :param bool log: plot the log2 of the fold enrichment
    :param int fontsize: font size of the GO term labels (y ticks)
    :param int minimum_genes: keep GO terms with strictly more genes than
        this value (``number_in_list`` column)
    :param float pvalue: keep GO terms with a p-value below or equal to
        this value
    :param cmap: colormap used for the FDR
    :param str sort_by: one of "pValue", "fold_enrichment" or "fdr"
    :param bool show_pvalues: add a top x-axis showing -log10(p-value)
    :param bool include_negative_enrichment: forwarded to
        ``self.get_data``; when True the x range is symmetric around 0
    :param float fdr_threshold: forwarded to ``self.get_data`` as *fdr*
        and used as the upper bound of the colour scale
    :param bool compute_levels: call ``self.get_graph`` to retrieve the
        level of each GO term, which is shown in the tick label
    :param bool progress: forwarded to ``self.get_graph``
    :return: the filtered and sorted dataframe (also stored in
        ``self.df``; the plotted subset is stored in ``self.subdf``). If
        ``self.get_data`` returns no row, the empty dataframe is returned
        at once and nothing is stored.
    """
    assert sort_by in ['pValue', 'fold_enrichment', 'fdr']

    # FIXME: pvalue and fold_enrichment not sorted in same order
    pylab.clf()

    df = self.get_data(
        ontologies,
        include_negative_enrichment=include_negative_enrichment,
        fdr=fdr_threshold)

    if len(df) == 0:
        return df

    df = df.query("pValue<=@pvalue")
    logger.info("Filtering out pvalue>{}. Kept {} GO terms".format(
        pvalue, len(df)))
    df = df.reset_index(drop=True)

    # Select a subset of the data to keep the best max_features in terms of
    # pValue
    subdf = df.query("number_in_list>@minimum_genes").copy()
    logger.info(
        "Filtering out GO terms with less than {} genes: Kept {} GO terms".
        format(minimum_genes, len(subdf)))

    logger.info("Filtering out the 3 parent terms")
    subdf = subdf.query("id not in @self.ontologies")

    # Keeping only a part of the data, sorting by pValue
    if sort_by == "pValue":
        subdf = subdf.sort_values(by="pValue",
                                  ascending=False).iloc[-max_features:]
        df = df.sort_values(by="pValue", ascending=False)
    elif sort_by == "fold_enrichment":
        subdf = subdf.sort_values(by="abs_log2_fold_enrichment",
                                  ascending=True).iloc[-max_features:]
        df = df.sort_values(by="abs_log2_fold_enrichment", ascending=False)
    elif sort_by == "fdr":
        subdf = subdf.sort_values(by="fdr",
                                  ascending=False).iloc[-max_features:]
        df = df.sort_values(by="fdr", ascending=False)

    subdf = subdf.reset_index(drop=True)

    # Nothing survived the filters: the marker scaling below would divide
    # by zero, so store the results and stop here.
    if len(subdf) == 0:
        logger.warning("No GO terms left after filtering; nothing to plot")
        self.subdf = subdf
        self.df = df
        return df

    # We get all levels for each go id.
    # They are stored by MF, CC or BP
    if compute_levels:
        paths = self.get_graph(list(subdf['id'].values), progress=progress)
        keys = list(paths.keys())
        goid_levels = paths[keys[0]]
        if len(keys) > 1:
            for k in keys[1:]:
                goid_levels.update(paths[k])
        levels = [goid_levels[ID] for ID in subdf['id'].values]
        subdf["level"] = levels
    else:
        subdf['level'] = ""

    # Marker sizes: proportional to the number of genes, with a floor at
    # 20% of the largest gene count
    N = len(subdf)
    size_factor = 12000 / len(subdf)
    max_size = subdf.number_in_list.max()
    min_size = subdf.number_in_list.min()
    sizes = [
        max(max_size * 0.2, x) for x in size_factor *
        subdf.number_in_list.values / subdf.number_in_list.max()
    ]

    # smallest, middle and largest marker sizes, used for the legend
    m1 = min(sizes)
    m3 = max(sizes)
    m2 = m1 + (m3 - m1) / 2

    if log:
        pylab.scatter(pylab.log2(subdf.fold_enrichment),
                      range(len(subdf)),
                      c=subdf.fdr,
                      s=sizes,
                      cmap=cmap,
                      alpha=0.8,
                      ec="k",
                      vmin=0,
                      vmax=fdr_threshold,
                      zorder=10)
    else:
        pylab.scatter(subdf.fold_enrichment,
                      range(len(subdf)),
                      c=subdf.fdr,
                      cmap=cmap,
                      s=sizes,
                      ec="k",
                      alpha=.8,
                      vmin=0,
                      vmax=fdr_threshold,
                      zorder=10)
    pylab.grid(zorder=-10)
    ax2 = pylab.colorbar(shrink=0.5)
    ax2.ax.set_ylabel('FDR')

    # Tick labels: "<GO id> (<level>) ; <Name>", names cut at 50 characters
    labels = [
        x if len(x) < 50 else x[0:47] + "..."
        for x in list(subdf.label)
    ]
    ticks = [
        "{} ({}) {}".format(ID, level, "; " + label.title())
        for level, ID, label in zip(subdf['level'], subdf.id, labels)
    ]

    pylab.yticks(range(N), ticks, fontsize=fontsize, ha='left')

    yax = pylab.gca().get_yaxis()
    try:
        pad = [x.label.get_window_extent().width for x in yax.majorTicks]
        yax.set_tick_params(pad=max(pad))
    except Exception:
        yax.set_tick_params(pad=60 * fontsize * 0.7)
    # NOTE(review): this padding is applied unconditionally and overrides
    # the value computed just above -- confirm this is intended
    yax.set_tick_params(pad=60 * fontsize * 0.6)

    fc_max = subdf.fold_enrichment.max(skipna=True)
    fc_min = subdf.fold_enrichment.min(skipna=True)
    # go into log2 space
    fc_max = pylab.log2(fc_max)
    fc_min = pylab.log2(fc_min)
    abs_max = max(fc_max, abs(fc_min), 1)

    if log:
        fc_max = abs_max * 1.5
    else:
        fc_max = 2**abs_max * 1.2

    pylab.axvline(0, color="k", lw=2)
    if log:
        pylab.xlabel("Fold Enrichment (log2)")
    else:
        pylab.xlabel("Fold Enrichment")
    if include_negative_enrichment:
        pylab.xlim([-fc_max, fc_max])
    else:
        pylab.xlim([0, fc_max])
    pylab.tight_layout()

    # The pvalue:
    if show_pvalues:
        ax = pylab.gca().twiny()
        ax.set_xlim([0, max(-pylab.log10(subdf.pValue)) * 1.2])
        ax.set_xlabel("p-values (log10)", fontsize=12)
        ax.plot(-pylab.log10(subdf.pValue),
                range(len(subdf)),
                label="pvalue",
                lw=2,
                color="k")
        ax.axvline(1.33, lw=1, ls="--", color="grey", label="pvalue=0.05")
        pylab.tight_layout()
        pylab.legend(loc="lower right")

    # Empty scatters used as handles for the gene-set size legend
    s1 = pylab.scatter([], [], s=m1, marker='o', color='#555555', ec="k")
    s2 = pylab.scatter([], [], s=m2, marker='o', color='#555555', ec="k")
    s3 = pylab.scatter([], [], s=m3, marker='o', color='#555555', ec="k")

    # Legend spacing depends on the number of rows
    if len(subdf) < 10:
        labelspacing = 1.5 * 4
        borderpad = 4
        handletextpad = 2
    elif len(subdf) < 20:
        labelspacing = 1.5 * 2
        borderpad = 1
        handletextpad = 2
    else:
        labelspacing = 1.5
        borderpad = 2
        handletextpad = 2

    if len(subdf) >= 3:
        leg = pylab.legend(
            (s1, s2, s3),
            (str(int(min_size)),
             str(int(min_size + (max_size - min_size) / 2)),
             str(int(max_size))),
            scatterpoints=1,
            loc='lower right',
            ncol=1,
            frameon=True,
            title="gene-set size",
            labelspacing=labelspacing,
            borderpad=borderpad,
            handletextpad=handletextpad,
            fontsize=8)
    else:
        leg = pylab.legend((s1, ), (str(int(min_size)), ),
                           scatterpoints=1,
                           loc='lower right',
                           ncol=1,
                           frameon=True,
                           title="gene-set size",
                           labelspacing=labelspacing,
                           borderpad=borderpad,
                           handletextpad=handletextpad,
                           fontsize=8)

    frame = leg.get_frame()
    frame.set_facecolor('#b4aeae')
    frame.set_edgecolor('black')
    frame.set_alpha(1)

    self.subdf = subdf
    self.df = df
    return df
def plot_coverage(self, filename=None, fontsize=16, rm_lw=1,
        rm_color="#0099cc", rm_label="Running median", th_lw=1,
        th_color="r", th_ls="--", main_color="k", main_lw=1,
        main_kwargs=None, sample=True, set_ylimits=True):
    """ Plot coverage as a function of base position.

    :param filename: if set, the figure is saved into this file
    :param fontsize: font size of the axis labels
    :param rm_lw: line width of the running median (0 to hide it)
    :param rm_color: line color of the running median
    :param rm_label: label for the running median
    :param th_lw: line width of the thresholds (0 to hide them)
    :param th_color: line color of the thresholds
    :param th_ls: line style of the thresholds
    :param main_color: line color of the coverage
    :param main_lw: line width of the coverage
    :param dict main_kwargs: extra keyword arguments forwarded to the
        plot call of the coverage curve (none by default)
    :param sample: if there are more than 1 000 000 points, we
        use an integer step to skip data points. We can still plot
        all points at your own risk by setting this option to False
    :param set_ylimits: we want to focus on the "normal" coverage ignoring
        unusual excess. To do so, we set the yaxis range between 0 and a
        maximum value. This maximum value is set to the minimum between
        6 times the mean coverage and 1.5 times the maximum of the high
        coverage threshold curve. If you want to let the ylimits free,
        set this argument to False

    .. note:: if there are more than 1,000,000 points, only one point
        every ``int(N / 1,000,000)`` is shown. For instance for 5,000,000
        points, one point out of 5 is plotted.

    In addition to the coverage, the running median and coverage
    confidence corresponding to the lower and upper zscore thresholds
    are shown.

    .. note:: uses the thresholds attribute.
    """
    # None instead of a shared mutable {} default
    if main_kwargs is None:
        main_kwargs = {}

    # z = (X/rm - \mu ) / sigma
    high_zcov = (self.thresholds.high * self.best_gaussian["sigma"] +
                 self.best_gaussian["mu"]) * self.df["rm"]
    low_zcov = (self.thresholds.low * self.best_gaussian["sigma"] +
                self.best_gaussian["mu"]) * self.df["rm"]

    pylab.clf()
    ax = pylab.gca()
    ax.set_facecolor('#eeeeee')
    pylab.xlim(0, self.df["pos"].iloc[-1])
    axes = []
    labels = []

    # 1,000,000 points is a lot for matplotlib. Let us restrict ourself to 1
    # million points for now.
    if len(self.df) > 1000000 and sample is True:
        NN = int(len(self.df) / 1000000)
    else:
        NN = 1

    # the main coverage plot
    p1, = pylab.plot(self.df["cov"][::NN], color=main_color,
                     label="Coverage", linewidth=main_lw, **main_kwargs)
    axes.append(p1)
    labels.append("Coverage")

    # The running median plot
    if rm_lw > 0:
        p2, = pylab.plot(self.df["rm"][::NN], color=rm_color,
                         linewidth=rm_lw, label=rm_label)
        axes.append(p2)
        labels.append(rm_label)

    # The threshold curves (only the upper one gets a legend entry)
    if th_lw > 0:
        p3, = pylab.plot(high_zcov[::NN], linewidth=th_lw, color=th_color,
                         ls=th_ls, label="Thresholds")
        pylab.plot(low_zcov[::NN], linewidth=th_lw, color=th_color,
                   ls=th_ls, label="_nolegend_")
        axes.append(p3)
        labels.append("Thresholds")

    pylab.legend(axes, labels, loc="best")
    pylab.xlabel("Position", fontsize=fontsize)
    pylab.ylabel("Per-base coverage", fontsize=fontsize)
    pylab.grid(True)

    # sometimes there are large coverage value that squeeze the plot.
    # Let us restrict it
    if set_ylimits is True:
        pylab.ylim([0, min([
            high_zcov.max() * 1.5,
            self.df["cov"].mean() * 6])])
    else:
        pylab.ylim([0, pylab.ylim()[1]])

    try:
        pylab.tight_layout()
    except Exception:
        # tight_layout is cosmetic only; ignore layout failures but let
        # KeyboardInterrupt/SystemExit propagate
        pass

    if filename:
        pylab.savefig(filename)
def plot_jaccard_distance(self, mode, padjs=(0.0001, 0.001, 0.01, 0.05, 0.1),
        Nfc=50, smooth=False, window=5):
    """Plot the Jaccard similarity between the two analyses.

    For each adjusted p-value threshold in *padjs* and each log2 fold
    change threshold, the genes selected in ``self.r1.df`` and
    ``self.r2.df`` are compared. The similarity (intersection / union,
    in percent) is drawn on the left axis and the number of common genes
    on the right axis (orange dashed lines). When one of the two
    selections is empty the similarity is set to 100.

    :param str mode: "down" (fold change below a negative threshold),
        "up" (above a positive threshold) or "all" (absolute value above
        the threshold)
    :param padjs: adjusted p-value thresholds (one curve each)
    :param int Nfc: number of fold change thresholds, evenly spaced
        between 0 and the most extreme log2 fold change found in the two
        analyses
    :param bool smooth: replace each curve by its rolling median
    :param int window: window of the rolling median
    """
    assert mode in ['down', 'up', 'all']
    pylab.clf()

    r1_df = self.r1.df
    r2_df = self.r2.df

    if mode == "down":
        m1 = r1_df.log2FoldChange.min()
        m2 = r2_df.log2FoldChange.min()
        X = pylab.linspace(0, min(m1, m2), Nfc)
        selection = "log2FoldChange<=@x and padj<@padj"
    elif mode == "up":
        m1 = r1_df.log2FoldChange.max()
        m2 = r2_df.log2FoldChange.max()
        X = pylab.linspace(0, max(m1, m2), Nfc)
        selection = "log2FoldChange>=@x and padj<@padj"
    else:
        minmax1 = r1_df.log2FoldChange.abs().max()
        minmax2 = r2_df.log2FoldChange.abs().max()
        X = pylab.linspace(0, max(minmax1, minmax2), Nfc)
        selection = "(log2FoldChange>=@x or log2FoldChange<=-@x) and padj<@padj"

    common = {}
    for padj in padjs:
        I = []
        common[padj] = []
        for x in X:
            # the query strings refer to the local names x and padj
            A = set(r1_df.query(selection).index)
            B = set(r2_df.query(selection).index)
            n_common = len(A.intersection(B))
            if len(A) == 0 or len(B) == 0:
                # no overlap yet
                I.append(100)
            else:
                I.append(n_common / (len(A) + len(B) - n_common) * 100)
            common[padj].append(n_common)

        if smooth:
            try:
                I = pd.Series(I).rolling(window).median().values
            except (TypeError, ValueError):
                # invalid window: keep the unsmoothed curve
                pass
        pylab.plot(X, I, 'o-', label=str(padj))

    ax = pylab.gca()
    ax.set_ylabel("Jaccard similarity (intersection/union)")
    ax.set_xlabel("Fold change (log2)")

    ax2 = ax.twinx()
    for padj in padjs:
        ax2.plot(X, common[padj], color='orange', ls='--')
    # the plotted quantity is the size of the intersection of both sets
    ax2.set_ylabel("Cardinality of the intersection")

    ax.legend()
    ax.set_ylim([0, 100])
    if mode == "down":
        ax.axvline(-2, ls='--', color='r')
    else:
        ax.axvline(2, ls='--', color='r')
def plot_venn(subsets, labels=None, title=None, ax=None, alpha=0.8,
        weighted=False, colors=('r', 'b', 'y')):
    """Plot a Venn diagram for 2 or 3 groups.

    :param subsets: a list (or tuple) of 2 or 3 set objects. The value is
        handed over unchanged to the matplotlib_venn functions, and the
        number of groups is taken from ``len(subsets)``.
    :param labels: labels of the sets (same order as *subsets*)
    :param title: optional title, set on the axes used for the diagram
    :param ax: matplotlib axes to draw into (default: current axes)
    :param alpha: transparency of the patches
    :param weighted: if True, the circles are scaled to the size of the
        sets; otherwise all circles have the same size
    :param colors: colors of the sets
    :return: the object returned by the matplotlib_venn function
    :raises IOError: if *subsets* does not hold 2 or 3 groups

    For instance:

    .. plot::
        :include-source:

        from sequana.viz.venn import plot_venn

        A = set([1,2,3,4,5,6,7,8,9])
        B = set([            7,8,9,10,11])
        plot_venn((A, B), labels=("A", "B"))

    This is the unweighted version by default, meaning all circles have
    the same size. If you prefer to have circles scaled to the size of
    the sets, add the relevant parameter as follows:

    .. plot::
        :include-source:

        from sequana.viz.venn import plot_venn

        A = set([1,2,3,4,5,6,7,8,9])
        B = set([            7,8,9,10,11])
        plot_venn((A, B), labels=("A", "B"), weighted=True)

    Similarly for 3 sets, a Venn diagram can be represented as follows.
    Note here that we also use the *title* parameter:

    .. plot::
        :include-source:

        from sequana.viz.venn import plot_venn

        A = set([1,2,3,4,5,6,7,8,9])
        B = set([      4,5,6,7,8,9,10,11,12,13])
        C = set([   3,4,5,6,7,8,9])
        plot_venn((A, B, C), labels=("A", "B", "C"), title="my Venn3 diagram")

    """
    if len(subsets) == 2:
        from matplotlib_venn import venn2_unweighted, venn2
        venn_function = venn2 if weighted else venn2_unweighted
    elif len(subsets) == 3:
        from matplotlib_venn import venn3_unweighted, venn3
        venn_function = venn3 if weighted else venn3_unweighted
    else:
        raise IOError("Venn diagramm supports only 2 or 3 groups.")

    vf = venn_function(subsets, set_labels=labels, ax=ax, alpha=alpha,
                       set_colors=colors)

    # The circle outlines are drawn here, from the centers and radii
    # reported by matplotlib_venn
    if ax is None:
        ax = pylab.gca()
    for center, radius in zip(vf.centers, vf.radii):
        circle = pylab.Circle(center, radius=radius, linestyle="-",
                              edgecolor="k", lw=2, facecolor='none',
                              alpha=1)
        ax.add_patch(circle)

    if title:
        # set on the axes that hold the diagram, which may differ from the
        # current axes when *ax* is provided
        ax.set_title(title)
    return vf
def plot_common_major_counts(self, mode, labels=None,
        switch_up_down_cond2=False, add_venn=True, xmax=None, title="",
        fontsize=12, sortby="log2FoldChange"):
    """Plot the percentage of common top-ranked features as a function of rank.

    The genes of ``gene_lists[mode]`` of each analysis are sorted by
    *sortby* (ascending for "down", descending otherwise). For each rank
    i, the percentage of genes shared by the two top-i lists is plotted.
    The *sortby* values of the first analysis are shown on a twin axis.

    :param mode: down, up or all
    :param labels: labels of the two analyses (default: r1 and r2), used
        in the Venn diagram
    :param switch_up_down_cond2: unused
    :param bool add_venn: add an inset Venn diagram of the two gene lists
    :param xmax: upper limit of the x-axis (default: size of the largest
        gene list)
    :param str title: title of the figure
    :param int fontsize: font size of the labels and title
    :param str sortby: column used to rank the genes

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana.compare import RNADiffCompare

        c = RNADiffCompare(
            sequana_data("rnadiff/rnadiff_onecond_1"),
            sequana_data("rnadiff/rnadiff_onecond_2"))
        c.plot_common_major_counts("down")
    """
    if labels is None:
        labels = ['r1', 'r2']

    # Recent pandas versions reject a set as a .loc indexer, so the
    # de-duplicated gene names are handed over as lists.
    gl1 = list(set(self.r1.gene_lists[mode]))
    gl2 = list(set(self.r2.gene_lists[mode]))

    # down-regulated genes have negative values: most extreme first means
    # ascending order in that case only
    ascending = mode == "down"

    # Each analysis is ranked on its own gene list. The second one used
    # to be indexed with the first list, which made both sets identical.
    A = self.r1.df.loc[gl1].sort_values(by=sortby, ascending=ascending)
    B = self.r2.df.loc[gl2].sort_values(by=sortby, ascending=ascending)

    # sometimes, up and down may be inverted as compared to the other
    # conditions
    N = []
    for i in range(1, max(len(A), len(B))):
        a = A.iloc[0:i].index
        b = B.iloc[0:i].index
        n = len(set(b).intersection(set(a)))
        N.append(n / i * 100)

    max_common = len(set(A.index).intersection(set(B.index)))
    pylab.clf()
    if len(A) > len(B):
        pylab.axhline(max_common / len(A) * 100, color="r", ls='--',
                      label="min set intersection")
        pylab.axvline(len(B), ls="--", color="k", label="rank of minor set")
    else:
        pylab.axhline(max_common / len(B) * 100, color='r', ls='--',
                      label="min set intersect")
        pylab.axvline(len(A), ls="--", color="k", label="rank of minor set")

    pylab.plot(N)
    pylab.xlabel('rank', fontsize=fontsize)
    pylab.ylabel('% common features', fontsize=fontsize)
    pylab.grid(True)
    pylab.ylim([0, 100])
    if xmax:
        pylab.xlim([0, xmax])
    else:
        pylab.xlim([0, max(len(A), len(B))])
    pylab.title(title, fontsize=fontsize)

    ax = pylab.gca()
    ax2 = ax.twinx()
    ax2.plot(A[sortby].values, "orange", label=sortby)
    ax2.set_ylabel(sortby)
    pylab.legend(loc="lower left")
    ax.legend(loc="lower right")

    if add_venn:
        f = pylab.gcf()
        ax = f.add_axes([0.5, 0.5, 0.35, 0.35], facecolor="grey")
        if mode == "down":
            self.plot_venn_down(ax=ax, title=None, labels=labels,
                                mode="two_only")
        elif mode == "up":
            self.plot_venn_up(ax=ax, title=None, labels=labels,
                              mode="two_only")
        elif mode == "all":
            self.plot_venn_all(ax=ax, title=None, labels=labels,
                               mode="two_only")
def plot_volcano(self, labels=None):
    """Volcano plot of log2 fold change versus log10 of adjusted p-value

    The genes of ``gene_lists["all"]`` of both analyses are shown (first
    analysis in red circles, second in black crosses). Picking a point
    writes the gene name and its coordinates in the title; if the gene is
    also found in the other analysis, its coordinates there are appended.
    The last pick event is stored in ``self.event``.

    :param labels: legend labels of the two analyses (default: cond1 and
        cond2)

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana.compare import RNADiffCompare

        c = RNADiffCompare(
            sequana_data("rnadiff/rnadiff_onecond_1"),
            sequana_data("rnadiff/rnadiff_onecond_2"))
        c.plot_volcano()
    """
    if labels is None:
        labels = ["cond1", "cond2"]

    A = self.r1.df.loc[self.r1.gene_lists["all"]]
    B = self.r2.df.loc[self.r2.gene_lists["all"]]

    pylab.clf()
    line_a, = pylab.plot(A.log2FoldChange, -np.log10(A.padj), marker="o",
                         alpha=0.5, color="r", lw=0, label=labels[0],
                         pickradius=4, picker=True)
    pylab.plot(B.log2FoldChange, -np.log10(B.padj), marker="x",
               alpha=0.5, color="k", lw=0, label=labels[1],
               pickradius=4, picker=True)

    pylab.grid(True)
    pylab.xlabel("fold change")
    pylab.ylabel("log10 adjusted p-value")
    pylab.legend(loc="lower right")
    ax = pylab.gca()

    def onpick(event):
        self.event = event

        # Identify the data set from the picked artist itself. Comparing
        # its label to a fixed name fails when custom labels are given.
        if event.artist is line_a:
            picked, other = A, B
        else:
            picked, other = B, A

        gene_name = picked.index[event.ind[0]]
        x1 = round(picked.loc[gene_name].log2FoldChange, 1)
        y1 = round(-np.log10(picked.loc[gene_name].padj), 1)
        try:
            x2 = round(other.loc[gene_name].log2FoldChange, 1)
            y2 = round(-np.log10(other.loc[gene_name].padj), 1)
        except Exception:
            # best effort inside a GUI callback: the gene may be missing
            # from the other analysis
            x2, y2 = None, None

        try:
            if x2 is None:
                ax.title.set_text("{} at pos [{},{}]".format(
                    gene_name, x1, y1))
            else:
                ax.title.set_text("{} at pos [{},{}] and [{},{}]".format(
                    gene_name, x1, y1, x2, y2))
        except Exception:
            print("exception")
            ax.title.set_text("")
        pylab.draw()

    fig = pylab.gcf()
    fig.canvas.mpl_connect('pick_event', onpick)
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="SNR", ylabel="#", title="", clip_upper_SNR=30):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: number of bin edges for the histogram. Note that the
        range starts at 0 and ends at the largest SNR found, capped at
        *clip_upper_SNR*
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid
    :param str xlabel:
    :param str ylabel:
    :param str title:
    :param clip_upper_SNR: SNR values above this threshold are set to the
        threshold before being binned

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    # Largest SNR over the four bases, capped at clip_upper_SNR
    maxSNR = 0
    for letter in "ACGT":
        m = self._df.loc[:, "snr_{}".format(letter)].max()
        if m > maxSNR:
            maxSNR = m
    if maxSNR > clip_upper_SNR:
        maxSNR = clip_upper_SNR

    bins = pylab.linspace(0, maxSNR, bins)

    # Series.clip_upper was removed in pandas 1.0; clip(upper=...) is the
    # equivalent call and is available in older releases as well.
    for letter in "ACGT":
        pylab.hist(self._df.loc[:, "snr_{}".format(letter)].clip(upper=maxSNR),
                   alpha=alpha, label=letter, bins=bins)
    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot(
    self,
    size=10,
    alpha=0.7,
    marker="o",
    fontsize=16,
    xlabel="fold change",
    logy=False,
    threshold_lines={
        "color": "black",
        "ls": "--",
        "width": 0.5
    },
    ylabel="p-value",
    add_broken_axes=False,
    broken_axes={"ylims": ((0, 10), (50, 100))},
):
    """Scatter plot of the p-values versus the fold changes.

    The points come from the ``fold_change``, ``pvalue`` and ``color``
    columns of ``self.df``. A horizontal line is drawn at
    ``self.pvalue_threshold`` and two vertical lines at plus and minus
    ``self.fold_change_threshold``.

    :param size: size of the markers
    :param alpha: transparency of the marker
    :param marker: marker style
    :param fontsize: font size of the axis labels
    :param xlabel:
    :param bool logy: use a log scale on the y-axis of the current axes
    :param dict threshold_lines: style of the threshold lines (keys:
        color, ls, width). Missing keys fall back to the defaults; the
        dictionary is not modified.
    :param ylabel:
    :param bool add_broken_axes: draw on broken axes (requires the
        brokenaxes package)
    :param dict broken_axes: limits of the broken axes (keys: ylims and/or
        xlims); only used if *add_broken_axes* is True
    """
    pylab.clf()

    # Accept partial dictionaries by completing them with the defaults
    line_style = {"color": "black", "ls": "--", "width": 0.5}
    line_style.update(threshold_lines)

    if add_broken_axes:  #pragma: no cover
        from brokenaxes import brokenaxes
        _ylims = broken_axes.get("ylims", None)
        _xlims = broken_axes.get("xlims", None)
        bax = brokenaxes(ylims=_ylims, xlims=_xlims)
    else:
        bax = pylab

    bax.scatter(
        self.df.fold_change,
        self.df.pvalue,
        s=size,
        alpha=alpha,
        c=self.df.color,
        marker=marker,
        edgecolors="None",
    )

    bax.grid()

    # Axes-like objects expose set_xlabel/set_ylabel; the pylab module
    # only provides xlabel/ylabel.
    try:
        bax.set_xlabel(xlabel, fontsize=fontsize)
        bax.set_ylabel(ylabel, fontsize=fontsize)
    except AttributeError:
        bax.xlabel(xlabel, fontsize=fontsize)
        bax.ylabel(ylabel, fontsize=fontsize)

    bax.axhline(
        self.pvalue_threshold,
        color=line_style["color"],
        linestyle=line_style["ls"],
        linewidth=line_style["width"],
    )
    bax.axvline(
        self.fold_change_threshold,
        color=line_style["color"],
        linestyle=line_style["ls"],
        linewidth=line_style["width"],
    )
    bax.axvline(
        -1 * self.fold_change_threshold,
        color=line_style["color"],
        linestyle=line_style["ls"],
        linewidth=line_style["width"],
    )

    if logy is True:
        ax = pylab.gca()
        ax.set(yscale="log")