def histogram_sequence_lengths(self, logy=True):
    """Histogram sequence lengths

    :param bool logy: if True (default), plot log10 of the counts on the
        y-axis instead of the raw counts

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.histogram_sequence_lengths()

    """
    data = [len(x) for x in self.sequences]
    bary, barx = np.histogram(data, bins=range(max(data) + 1))

    # get rid of zeros to avoid warnings (log10 of 0 is undefined)
    bx = [x for x, y in zip(barx, bary) if y != 0]
    by = [y for x, y in zip(barx, bary) if y != 0]

    if logy:
        pylab.bar(bx, pylab.log10(by))
        pylab.ylabel("Count (log scale)", fontsize=self.fontsize)
    else:
        pylab.bar(bx, by)
        # FIX: do not claim a log scale when plotting raw counts
        pylab.ylabel("Count", fontsize=self.fontsize)

    pylab.xlim([1, max(data) + 1])
    pylab.grid(True)
    # FIX: the x-axis shows read lengths, not genomic positions
    pylab.xlabel("Sequence length (bp)", fontsize=self.fontsize)
def scale_data(self, transform_method="log", max_features=500):
    """Transform, filter and scale the count data.

    - Replace zeros with 1 (avoid log issue)
    - transform the data using log10 or anscombe transform
    - scale the data using the scaler attribute (standard scaler by default)
    """
    assert transform_method in ['log', 'anscombe']

    # Zeros would break the log transform, so replace them with ones first.
    cleaned = self.df.copy().replace(0, 1)
    self.data = cleaned

    # First, transform the data with the requested method.
    if transform_method == "log":
        transformed = pylab.log10(cleaned)
    else:  # 'anscombe' — guaranteed by the assert above
        from sequana.vst import VST
        transformed = VST.anscombe(cleaned)

    # Then keep only the max_features most dispersed features.
    dispersion = transformed.std(axis=1).sort_values(ascending=False)
    tokeep = dispersion.index[0:max_features]

    scaled = self.scaler.fit_transform(transformed.loc[tokeep])
    return scaled, tokeep
def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=None):
    """Scatter plot of enriched gene sets versus -log10 adjusted p-value.

    Marker size and colour both encode the gene-set size.

    :param enrich: enrichment object providing a ``results`` dataframe
    :param float cutoff: adjusted p-value cutoff used to filter the results
    :param int nmax: maximum number of gene sets to show
    :param gene_set_size: unused; kept for backward compatibility
    :return: the filtered dataframe that was plotted
    """
    # FIX: avoid the mutable default argument ([]) shared across calls
    if gene_set_size is None:
        gene_set_size = []

    df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)
    pylab.clf()
    pylab.scatter(-pylab.log10(df['Adjusted P-value']), range(len(df)),
                  s=10 * df['size'], c=df['size'])

    # FIX: the x-axis shows -log10(adjusted p-value), not an odd ratio;
    # label made consistent with barplot()
    pylab.xlabel("Adjusted p-value (log10)")
    pylab.ylabel("Gene sets")
    pylab.yticks(range(len(df)), df.name)
    a, b = pylab.xlim()
    pylab.xlim([0, b])
    pylab.grid(True)
    ax = pylab.gca()

    # Build a marker-size legend with three representative sizes.
    M = max(df['size'])
    if M > 100:
        l1, l2, l3 = "10", "100", str(M)
    else:
        l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

    handles = [
        pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
        pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
        pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls="")
    ]
    ax.legend(handles=handles, loc="upper left", title="gene-set size")

    # significance threshold: -log10(0.05) ~ 1.3
    pylab.axvline(1.3, lw=2, ls="--", color="r")
    pylab.tight_layout()
    ax = pylab.colorbar(pylab.gci())
    return df
def plot_hist_coverage(self, logx=True, logy=True, fontsize=16, N=20,
                       fignum=1, hold=False, alpha=0.5, filename=None,
                       **kw_hist):
    """Histogram of the coverage values.

    :param bool logx: use log-spaced bins and a log x-axis
    :param bool logy: use a log-scaled y-axis
    :param int fontsize: axis label font size
    :param int N: number of bins
    :param int fignum: figure number (only used when ``hold`` is False)
    :param bool hold: if False, clear the current figure first
    :param float alpha: transparency of the histogram
    :param filename: if provided, save the figure to this path
    :param kw_hist: extra keyword arguments forwarded to ``pylab.hist``
    """
    if hold is False:
        pylab.figure(fignum)
        pylab.clf()
    ax = pylab.gca()
    ax.set_facecolor('#eeeeee')

    data = self.df['cov'].dropna().values
    maxcov = data.max()
    if logx is True and logy is True:
        bins = pylab.logspace(0, pylab.log10(maxcov), N)
        pylab.hist(data, bins=bins, log=True, label=self.chrom_name,
                   alpha=alpha, **kw_hist)
        pylab.semilogx()
        pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
        pylab.ylabel("Count (log scale)", fontsize=fontsize)
    elif logx is False and logy is True:
        pylab.hist(data, bins=N, log=True, label=self.chrom_name,
                   alpha=alpha, **kw_hist)
        pylab.xlabel("Coverage", fontsize=fontsize)
        pylab.ylabel("Count (log scale)", fontsize=fontsize)
    elif logx is True and logy is False:
        bins = pylab.logspace(0, pylab.log10(maxcov), N)
        # FIX: the log-spaced bins were computed but bins=N was passed,
        # silently discarding them
        pylab.hist(data, bins=bins, label=self.chrom_name,
                   alpha=alpha, **kw_hist)
        pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
        pylab.semilogx()
    else:
        pylab.hist(data, bins=N, label=self.chrom_name,
                   alpha=alpha, **kw_hist)
        pylab.xlabel("Coverage", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
    pylab.grid(True)
    if filename:
        pylab.savefig(filename)
def plot_density(self):
    """Plot one kernel-density estimate of log10 raw counts per sample."""
    import seaborn
    seaborn.set()
    for column in self.counts_raw.columns:
        # clip at 1 so that log10 never sees zeros
        clipped = self.counts_raw[column].clip(lower=1)
        seaborn.kdeplot(pylab.log10(clipped))

    self._format_plot(
        title="Count density distribution",
        xlabel="Raw counts (log10)",
        ylabel="Density",
    )
def barplot(self, enrich, cutoff=0.05, nmax=10):
    """Horizontal bar plot of gene sets versus -log10 adjusted p-value.

    :param enrich: enrichment object providing a ``results`` dataframe
    :param float cutoff: adjusted p-value cutoff used to filter the results
    :param int nmax: maximum number of gene sets to show
    :return: the filtered dataframe that was plotted
    """
    df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

    pylab.clf()
    positions = range(len(df))
    pylab.barh(positions, -pylab.log10(df['Adjusted P-value']))
    pylab.yticks(positions, df.name)
    # significance threshold: -log10(0.05) ~ 1.3
    pylab.axvline(1.3, lw=2, ls="--", color="r")
    pylab.grid(True)
    pylab.xlabel("Adjusted p-value (log10)")
    pylab.ylabel("Gene sets")

    # anchor the x-axis at zero, keeping the automatic upper limit
    a, b = pylab.xlim()
    pylab.xlim([0, b])
    pylab.tight_layout()
    return df
def plot(self, bins=100, cmap="hot_r", fontsize=10, Nlevels=4, xlabel=None,
         ylabel=None, norm=None, range=None, normed=False, colorbar=True,
         contour=True, grid=True, **kargs):
    """plots histogram of mean across replicates versus coefficient variation

    :param int bins: binning for the 2D histogram (either a float or list
        of 2 binning values).
    :param cmap: a valid colormap (defaults to hot_r)
    :param fontsize: fontsize for the labels
    :param int Nlevels: must be more than 2
    :param str xlabel: set the xlabel (overwrites content of the dataframe)
    :param str ylabel: set the ylabel (overwrites content of the dataframe)
    :param norm: set to 'log' to show the log10 of the values.
    :param normed: normalise the data
    :param range: as in pylab.Hist2D : a 2x2 shape [[-3,3],[-4,4]]
    :param contour: show some contours (default to True)
    :param bool grid: Show unerlying grid (defaults to True)

    If the input is a dataframe, the xlabel and ylabel will be populated
    with the column names of the dataframe.
    """
    X = self.df[self.df.columns[0]].values
    Y = self.df[self.df.columns[1]].values
    if len(X) > 10000:
        logger.info("Computing 2D histogram. Please wait")

    pylab.clf()
    if norm == 'log':
        from matplotlib import colors
        # FIX: honour the user-provided *range* in the log-norm branch too
        # (it was silently ignored before)
        res = pylab.hist2d(X, Y, bins=bins, density=normed, cmap=cmap,
                           norm=colors.LogNorm(), range=range)
    else:
        res = pylab.hist2d(X, Y, bins=bins, cmap=cmap, density=normed,
                           range=range)

    if colorbar is True:
        pylab.colorbar()

    if contour:
        # bins may be a scalar or a (nx, ny) pair
        try:
            bins1 = bins[0]
            bins2 = bins[1]
        except (TypeError, IndexError):  # FIX: was a bare except
            bins1 = bins
            bins2 = bins
        X, Y = pylab.meshgrid(res[1][0:bins1], res[2][0:bins2])
        # (redundant nested `if contour:` removed — we are already inside it)
        if res[0].max().max() < 10 and norm == 'log':
            pylab.contour(X, Y, res[0].transpose())
        else:
            # log-spaced contour levels; drop the 2 lowest for readability
            levels = [
                round(x) for x in pylab.logspace(
                    0, pylab.log10(res[0].max().max()), Nlevels)
            ]
            pylab.contour(X, Y, res[0].transpose(), levels[2:])
        #pylab.clabel(C, fontsize=fontsize, inline=1)

    if ylabel is None:
        ylabel = self.df.columns[1]
    if xlabel is None:
        xlabel = self.df.columns[0]

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)

    if grid is True:
        pylab.grid(True)

    return res
def plot_volcano( self, padj=0.05, add_broken_axes=False, markersize=4, limit_broken_line=[20, 40], plotly=False, annotations=None, ): """ .. plot:: :include-source: from sequana.rnadiff import RNADiffResults from sequana import sequana_data r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1")) r.plot_volcano() """ if plotly: from plotly import express as px df = self.df.copy() if annotations is not None: try: df = pd.concat([df, annotations.annotation], axis=1) except Exception as err: logger.warning( f"Could not merge rnadiff table with annotation. Full error is: {err}" ) df["log_adj_pvalue"] = -pylab.log10(df.padj) df["significance"] = [ "<{}".format(padj) if x else ">={}".format(padj) for x in df.padj < padj ] if "Name" in df.columns: hover_name = "Name" elif "gene_id" in df.columns: hover_name = "gene_id" elif "locus_tag" in df.columns: hover_name = "locus_tag" elif "ID" in df.columns: hover_name = "ID" else: hover_name = None fig = px.scatter( df, x="log2FoldChange", y="log_adj_pvalue", hover_name=hover_name, hover_data=["baseMean"], log_y=False, opacity=0.5, color="significance", height=600, labels={"log_adj_pvalue": "log adjusted p-value"}, ) # axes[0].axhline( # -np.log10(0.05), lw=2, ls="--", color="r", label="pvalue threshold (0.05)" # i) # in future version of plotly, a add_hlines will be available. 
For # now, this is the only way to add axhline fig.update_layout(shapes=[ dict( type="line", xref="x", x0=df.log2FoldChange.min(), x1=df.log2FoldChange.max(), yref="y", y0=-pylab.log10(padj), y1=-pylab.log10(padj), line=dict(color="black", width=1, dash="dash"), ) ]) return fig from brokenaxes import brokenaxes M = max(-pylab.log10(self.df.padj.dropna())) br1, br2 = limit_broken_line if M > br1: if add_broken_axes: bax = brokenaxes(ylims=((0, br1), (M - 10, M)), xlims=None) else: bax = pylab else: bax = pylab d1 = self.df.query("padj>@padj") d2 = self.df.query("padj<=@padj") bax.plot( d1.log2FoldChange, -np.log10(d1.padj), marker="o", alpha=0.5, color="k", lw=0, markersize=markersize, ) bax.plot( d2.log2FoldChange, -np.log10(d2.padj), marker="o", alpha=0.5, color="r", lw=0, markersize=markersize, ) bax.grid(True) try: bax.set_xlabel("fold change") bax.set_ylabel("log10 adjusted p-value") except: bax.xlabel("fold change") bax.ylabel("log10 adjusted p-value") m1 = abs(min(self.df.log2FoldChange)) m2 = max(self.df.log2FoldChange) limit = max(m1, m2) try: bax.set_xlim([-limit, limit]) except: bax.xlim([-limit, limit]) try: y1, _ = bax.get_ylim() ax1 = bax.axs[0].set_ylim([br2, y1[1] * 1.1]) except: y1, y2 = bax.ylim() bax.ylim([0, y2]) bax.axhline(-np.log10(0.05), lw=2, ls="--", color="r", label="pvalue threshold (0.05)") return bax if colors is None: colors = {} for sample in self.sample_names: colors[sample] = self.colors[self.get_cond_from_sample(sample)] if plotly is True: assert n_components == 3 variance = p.plot( n_components=n_components, colors=colors, show_plot=False, max_features=max_features, ) from plotly import express as px df = pd.DataFrame(p.Xr) df.columns = ["PC1", "PC2", "PC3"] df["names"] = self.sample_names df["colors"] = [colors[x] for x in self.sample_names] df["size"] = [10] * len(df) df[self.condition] = [ self.get_cond_from_sample(sample) for sample in self.sample_names ] fig = px.scatter_3d( df, x="PC1", y="PC2", z="PC3", 
color=self.condition, labels={ "PC1": "PC1 ({}%)".format(round(100 * variance[0], 2)), "PC2": "PC2 ({}%)".format(round(100 * variance[1], 2)), "PC3": "PC3 ({}%)".format(round(100 * variance[2], 2)), }, height=800, text="names", ) return fig else: variance = p.plot(n_components=n_components, colors=colors, max_features=max_features) return variance
def _get_summary_pathway(self, pathway_ID):
    """Build a per-gene summary dataframe for one KEGG pathway.

    For each gene of the pathway, records whether it is down-regulated
    ("-"), up-regulated ("+") or unchanged ("=") according to the rnadiff
    results (padj <= 0.05 and sign of log2FoldChange), together with
    -log10(pvalue), the fold change, the KEGG id and its description.

    :param pathway_ID: a valid index of ``self.df_pathways``
    :return: a pandas DataFrame with columns type/name/pvalue/fc/keggid/
        description, one row per gene of the pathway
    """
    genes = self.df_pathways.loc[pathway_ID]['GENE']
    # significant DE genes split by direction of regulation
    df_down = self.rnadiff.df.query(
        "padj<=0.05 and log2FoldChange<0").copy()
    df_up = self.rnadiff.df.query("padj<=0.05 and log2FoldChange>0").copy()

    #f_down = self.rnadiff.dr_gene_lists[self.comparison]

    logger.info("Total down-regulated: {}".format(len(df_down)))
    logger.info("Total up-regulated: {}".format(len(df_up)))

    # map gene name (first ';'-separated token of the KEGG entry) -> KEGG id
    mapper = {}
    for k, v in genes.items():
        mapper[v.split(";")[0]] = k
    # keep intermediates on the instance for debugging/inspection
    self.genes = genes
    self.df_down = df_down
    self.df_up = df_up

    # parallel lists, filled in lockstep in the loop below — one entry per
    # gene of the pathway, in mapper order
    summary_names = []
    summary_keggids = []
    summary_types = []
    summary_pvalues = []
    summary_fcs = []

    if self.mapper is not None:
        # When an external id->name mapper is available, (re)build the
        # 'Name' column from it for both dataframes.
        if 'Name' not in df_down.columns:
            # placeholder assignment, immediately overwritten below
            df_down['Name'] = df_down['ID']
            Names = []
            for index in df_down.index:
                Names.append(self.mapper.loc[index]['name'][0])
            df_down['Name'] = Names
        if 'Name' not in df_up.columns:
            df_up['Name'] = df_up['ID']
            Names = []
            for index in df_up.index:
                Names.append(self.mapper.loc[index]['name'][0])
            df_up['Name'] = Names

    for name, kegg_id in mapper.items():
        summary_names.append(name)
        summary_keggids.append(kegg_id)

        # case-insensitive membership test against the DE gene names
        if name.lower() in [x.lower() for x in df_down.Name]:
            pvalue = -pylab.log10(
                df_down.query("Name==@name").pvalue.values[0])
            fc = df_down.query("Name==@name").log2FoldChange.values[0]
            summary_fcs.append(fc)
            summary_pvalues.append(pvalue)
            summary_types.append("-")
        elif name.lower() in [x.lower() for x in df_up.Name]:
            pvalue = -pylab.log10(
                df_up.query("Name==@name").pvalue.values[0])
            summary_pvalues.append(pvalue)
            fc = df_up.query("Name==@name").log2FoldChange.values[0]
            summary_fcs.append(fc)
            summary_types.append("+")
        else:
            # gene not significantly differentially expressed
            summary_pvalues.append(None)
            summary_fcs.append(None)
            summary_types.append("=")

    summary = pd.DataFrame({
        "type": summary_types,
        "name": summary_names,
        "pvalue": summary_pvalues,
        "fc": summary_fcs,
        "keggid": summary_keggids
    })
    # human-readable description fetched from the pathway's GENE record
    summary['description'] = [
        self.pathways[pathway_ID]['GENE'][x] for x in summary.keggid
    ]
    return summary
def plot_go_terms(self, ontologies, max_features=50, log=False, fontsize=8,
                  minimum_genes=0, pvalue=0.05, cmap="summer_r",
                  sort_by="fold_enrichment", show_pvalues=False,
                  include_negative_enrichment=False, fdr_threshold=0.05,
                  compute_levels=True, progress=True):
    """Scatter plot of GO terms: fold enrichment vs term, FDR as colour.

    Marker size encodes the number of genes in the term; an optional
    secondary x-axis shows -log10(p-value).

    :param ontologies: ontologies passed to ``self.get_data``
    :param int max_features: keep at most this many GO terms
    :param bool log: plot log2 of the fold enrichment on the x-axis
    :param int fontsize: y-tick label font size
    :param int minimum_genes: drop terms with fewer genes than this
    :param float pvalue: keep only terms with pValue <= this cutoff
    :param cmap: colormap used for the FDR colour scale
    :param str sort_by: one of 'pValue', 'fold_enrichment', 'fdr'
    :param bool show_pvalues: add a twin x-axis with -log10(pValue)
    :param include_negative_enrichment: forwarded to ``get_data``; also
        makes the x-axis symmetric around zero
    :param float fdr_threshold: forwarded to ``get_data``; also the upper
        bound of the colour scale
    :param bool compute_levels: annotate each GO id with its graph level
    :param bool progress: forwarded to ``self.get_graph``
    :return: the filtered dataframe (also stored in ``self.df``)
    """
    assert sort_by in ['pValue', 'fold_enrichment', 'fdr']

    # FIXME: pvalue and fold_enrichment not sorted in same order
    pylab.clf()

    df = self.get_data(
        ontologies,
        include_negative_enrichment=include_negative_enrichment,
        fdr=fdr_threshold)
    if len(df) == 0:
        # nothing to plot
        return df

    df = df.query("pValue<=@pvalue")
    logger.info("Filtering out pvalue>{}. Kept {} GO terms".format(
        pvalue, len(df)))
    df = df.reset_index(drop=True)

    # Select a subset of the data to keep the best max_features in terms of
    # pValue
    subdf = df.query("number_in_list>@minimum_genes").copy()
    logger.info(
        "Filtering out GO terms with less than {} genes: Kept {} GO terms".
        format(minimum_genes, len(subdf)))

    logger.info("Filtering out the 3 parent terms")
    subdf = subdf.query("id not in @self.ontologies")

    # Keeping only a part of the data, sorting by pValue.
    # subdf (plotted) keeps the tail of an ascending-sort == the best terms;
    # df (returned) is sorted descending.
    if sort_by == "pValue":
        subdf = subdf.sort_values(by="pValue",
                                  ascending=False).iloc[-max_features:]
        df = df.sort_values(by="pValue", ascending=False)
    elif sort_by == "fold_enrichment":
        subdf = subdf.sort_values(by="abs_log2_fold_enrichment",
                                  ascending=True).iloc[-max_features:]
        df = df.sort_values(by="abs_log2_fold_enrichment", ascending=False)
    elif sort_by == "fdr":
        subdf = subdf.sort_values(by="fdr",
                                  ascending=False).iloc[-max_features:]
        df = df.sort_values(by="fdr", ascending=False)

    subdf = subdf.reset_index(drop=True)

    # We get all levels for each go id.
    # They are stored by MF, CC or BP; merge them into one lookup dict.
    if compute_levels:
        paths = self.get_graph(list(subdf['id'].values), progress=progress)
        levels = []
        keys = list(paths.keys())
        goid_levels = paths[keys[0]]
        if len(keys) > 1:
            for k in keys[1:]:
                goid_levels.update(paths[k])
        levels = [goid_levels[ID] for ID in subdf['id'].values]
        subdf["level"] = levels
    else:
        subdf['level'] = ""
    N = len(subdf)

    # marker sizes proportional to gene count, floored at 20% of the max
    size_factor = 12000 / len(subdf)
    max_size = subdf.number_in_list.max()
    min_size = subdf.number_in_list.min()
    sizes = [
        max(max_size * 0.2, x) for x in
        size_factor * subdf.number_in_list.values /
        subdf.number_in_list.max()
    ]
    # three representative sizes for the legend
    m1 = min(sizes)
    m3 = max(sizes)
    m2 = m1 + (m3 - m1) / 2

    if log:
        pylab.scatter(pylab.log2(subdf.fold_enrichment), range(len(subdf)),
                      c=subdf.fdr,
                      s=sizes,
                      cmap=cmap,
                      alpha=0.8,
                      ec="k",
                      vmin=0,
                      vmax=fdr_threshold,
                      zorder=10)
        #pylab.barh(range(N), pylab.log2(subdf.fold_enrichment), color="r",
        #    label="pvalue>0.05; FDR>0.05")
        #pylab.axvline(1, color="gray", ls="--")
        #pylab.axvline(-1, color="gray", ls="--")
    else:
        pylab.scatter(subdf.fold_enrichment, range(len(subdf)),
                      c=subdf.fdr,
                      cmap=cmap,
                      s=sizes,
                      ec="k",
                      alpha=.8,
                      vmin=0,
                      vmax=fdr_threshold,
                      zorder=10)
        # pylab.barh(range(N), subdf.fold_enrichment, color="r",
        #    label="not significant")
    pylab.grid(zorder=-10)
    ax2 = pylab.colorbar(shrink=0.5)
    ax2.ax.set_ylabel('FDR')

    # y-tick labels: "GOID (level) ; Label", truncated to 50 characters
    labels = [
        x if len(x) < 50 else x[0:47] + "..."
        for x in list(subdf.label)
    ]
    ticks = [
        "{} ({}) {}".format(ID, level, "; " + label.title())
        for level, ID, label in zip(subdf['level'], subdf.id, labels)
    ]

    pylab.yticks(range(N), ticks, fontsize=fontsize, ha='left')

    # pad the y-axis labels by the widest tick label so ha='left' works
    yax = pylab.gca().get_yaxis()
    try:
        pad = [x.label.get_window_extent().width for x in yax.majorTicks]
        yax.set_tick_params(pad=max(pad))
    except:
        yax.set_tick_params(pad=60 * fontsize * 0.7)
    # NOTE(review): this unconditional call overrides the pad set in both
    # branches above — looks unintentional; confirm before removing
    yax.set_tick_params(pad=60 * fontsize * 0.6)

    fc_max = subdf.fold_enrichment.max(skipna=True)
    fc_min = subdf.fold_enrichment.min(skipna=True)
    # go into log2 space to compute a symmetric upper bound
    fc_max = pylab.log2(fc_max)
    fc_min = pylab.log2(fc_min)

    abs_max = max(fc_max, abs(fc_min), 1)

    if log:
        fc_max = abs_max * 1.5
    else:
        fc_max = 2**abs_max * 1.2

    pylab.axvline(0, color="k", lw=2)
    if log:
        pylab.xlabel("Fold Enrichment (log2)")
    else:
        pylab.xlabel("Fold Enrichment")
    if include_negative_enrichment:
        pylab.xlim([-fc_max, fc_max])
    else:
        pylab.xlim([0, fc_max])
    pylab.tight_layout()

    # The pvalue: optional secondary x-axis with -log10(pValue)
    if show_pvalues:
        ax = pylab.gca().twiny()
        ax.set_xlim([0, max(-pylab.log10(subdf.pValue)) * 1.2])
        ax.set_xlabel("p-values (log10)", fontsize=12)
        ax.plot(-pylab.log10(subdf.pValue), range(len(subdf)),
                label="pvalue",
                lw=2,
                color="k")
        # -log10(0.05) ~ 1.33: significance threshold
        ax.axvline(1.33, lw=1, ls="--", color="grey", label="pvalue=0.05")
        pylab.tight_layout()
        pylab.legend(loc="lower right")

    # empty scatters used only as legend handles for the size legend
    s1 = pylab.scatter([], [], s=m1, marker='o', color='#555555', ec="k")
    s2 = pylab.scatter([], [], s=m2, marker='o', color='#555555', ec="k")
    s3 = pylab.scatter([], [], s=m3, marker='o', color='#555555', ec="k")

    # legend spacing tuned to the number of plotted terms
    if len(subdf) < 10:
        labelspacing = 1.5 * 4
        borderpad = 4
        handletextpad = 2
    elif len(subdf) < 20:
        labelspacing = 1.5 * 2
        borderpad = 1
        handletextpad = 2
    else:
        labelspacing = 1.5
        borderpad = 2
        handletextpad = 2

    if len(subdf) >= 3:
        leg = pylab.legend(
            (s1, s2, s3),
            (str(int(min_size)),
             str(int(min_size + (max_size - min_size) / 2)),
             str(int(max_size))),
            scatterpoints=1,
            loc='lower right',
            ncol=1,
            frameon=True,
            title="gene-set size",
            labelspacing=labelspacing,
            borderpad=borderpad,
            handletextpad=handletextpad,
            fontsize=8)
    else:
        # fewer than 3 terms: a single-entry size legend
        leg = pylab.legend((s1, ), (str(int(min_size)), ),
                           scatterpoints=1,
                           loc='lower right',
                           ncol=1,
                           frameon=True,
                           title="gene-set size",
                           labelspacing=labelspacing,
                           borderpad=borderpad,
                           handletextpad=handletextpad,
                           fontsize=8)

    frame = leg.get_frame()
    frame.set_facecolor('#b4aeae')
    frame.set_edgecolor('black')
    frame.set_alpha(1)

    self.subdf = subdf
    self.df = df
    return df