Example #1
0
    def histogram_sequence_lengths(self, logy=True):
        """Histogram sequence lengths

        Draws one bar per observed sequence length (lengths that never occur
        are skipped), using the items of ``self.sequences``.

        :param bool logy: if True (default), bar heights are the log10 of the
            counts; otherwise the raw counts are shown.
        :raises ValueError: if ``self.sequences`` is empty (``max`` of an empty
            list).

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.histogram_sequence_lengths()

        """
        data = [len(x) for x in self.sequences]
        longest = max(data)

        # np.histogram closes its last bin on the right. With edges 0..longest
        # the final bin [longest-1, longest] would merge the two largest
        # lengths, so one extra edge is added to give every length its own bin.
        bary, barx = np.histogram(data, bins=range(longest + 2))

        # get rid of zeros to avoid warnings (log10(0)); zip stops at the
        # shorter sequence so the trailing bin edge is ignored
        nonzero = [(x, y) for x, y in zip(barx, bary) if y != 0]
        bx = [x for x, _ in nonzero]
        by = [y for _, y in nonzero]

        if logy:
            pylab.bar(bx, pylab.log10(by))
        else:
            pylab.bar(bx, by)

        pylab.xlim([1, longest + 1])

        pylab.grid(True)
        pylab.xlabel("position (bp)", fontsize=self.fontsize)
        # Label must reflect the scale actually used for the bars
        ylabel = "Count (log scale)" if logy else "Count"
        pylab.ylabel(ylabel, fontsize=self.fontsize)
Example #2
0
    def scale_data(self, transform_method="log", max_features=500):
        """Transform, filter and scale a copy of ``self.df``.

        - zeros are replaced with 1 (avoids log issues); this zero-free copy
          is also stored in ``self.data``
        - the data is transformed with log10 or the anscombe transform
        - the *max_features* rows with the largest standard deviation are kept
        - the result is passed to ``self.scaler.fit_transform``

        :param str transform_method: either 'log' or 'anscombe'
        :param int max_features: number of most dispersed rows to keep
        :return: tuple (scaled data, index of the rows that were kept)
        """
        assert transform_method in ['log', 'anscombe']

        # Work on a zero-free copy so that the original dataframe is untouched
        counts = self.df.copy().replace(0, 1)
        self.data = counts

        transformed = counts
        if transform_method == "anscombe":
            from sequana.vst import VST
            transformed = VST.anscombe(counts)
        elif transform_method == "log":
            transformed = pylab.log10(counts)

        # Rank rows by dispersion and keep the first max_features of them
        dispersion = transformed.std(axis=1)
        ranked = dispersion.sort_values(ascending=False)
        tokeep = ranked.index[0:max_features]

        scaled = self.scaler.fit_transform(transformed.loc[tokeep])
        return scaled, tokeep
Example #3
0
    def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=None):
        """Scatter plot of gene sets against -log10 of their adjusted p-value.

        The data comes from ``self._get_final_df(enrich.results, ...)``. Marker
        area and colour both follow the ``size`` column of that dataframe.

        :param enrich: object whose ``results`` attribute is given to
            ``_get_final_df``
        :param float cutoff: forwarded to ``_get_final_df``
        :param int nmax: forwarded to ``_get_final_df``
        :param gene_set_size: currently unused; kept for backward compatibility
        :return: the dataframe that was plotted
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        pylab.scatter(-pylab.log10(df['Adjusted P-value']),
                      range(len(df)),
                      s=10 * df['size'],
                      c=df['size'])

        # The x values are -log10(adjusted p-value), not an odds ratio; use
        # the same label as the barplot method for the same quantity.
        pylab.xlabel("Adjusted p-value (log10)")
        pylab.ylabel("Gene sets")
        pylab.yticks(range(len(df)), df.name)
        _, upper = pylab.xlim()
        pylab.xlim([0, upper])
        pylab.grid(True)
        ax = pylab.gca()

        # Three legend entries giving an idea of the gene-set sizes
        M = max(df['size'])
        if M > 100:
            l1, l2, l3 = "10", "100", str(M)
        else:
            l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

        handles = [
            pylab.Line2D([0], [0], marker="o", markersize=size, label=label,
                         ls="")
            for size, label in ((5, l1), (10, l2), (15, l3))
        ]
        ax.legend(handles=handles, loc="upper left", title="gene-set size")

        # 1.3 is approximately -log10(0.05)
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.tight_layout()
        pylab.colorbar(pylab.gci())
        return df
Example #4
0
    def plot_hist_coverage(self, logx=True, logy=True, fontsize=16, N=20,
        fignum=1, hold=False, alpha=0.5, filename=None, **kw_hist):
        """Histogram of the ``cov`` column of ``self.df`` (NaN dropped).

        :param bool logx: if true, use N log-spaced bin edges between 1 and
            the maximum coverage, and a logarithmic X axis
        :param bool logy: if true, use a logarithmic Y axis
        :param int fontsize: font size of the axis labels
        :param int N: number of bins (linear X) or of bin edges (log X)
        :param int fignum: figure number used when *hold* is False
        :param bool hold: if False, the figure *fignum* is selected and cleared
            before plotting
        :param float alpha: transparency given to ``pylab.hist``
        :param str filename: if set, the figure is saved to this file
        :param kw_hist: any other keyword argument is given to ``pylab.hist``
        """
        if hold is False:
            pylab.figure(fignum)
            pylab.clf()
        ax = pylab.gca()
        ax.set_facecolor('#eeeeee')

        data = self.df['cov'].dropna().values
        maxcov = data.max()

        # With a log X axis the bins must be log-spaced as well, whatever the
        # Y scale is (the logx-only case used to fall back to linear bins).
        if logx:
            bins = pylab.logspace(0, pylab.log10(maxcov), N)
        else:
            bins = N

        hist_options = dict(bins=bins, label=self.chrom_name, alpha=alpha)
        if logy:
            # only set when requested so that kw_hist may still carry 'log'
            hist_options["log"] = True
        pylab.hist(data, **hist_options, **kw_hist)

        if logx:
            pylab.semilogx()

        xlabel = "Coverage (log scale)" if logx else "Coverage"
        ylabel = "Count (log scale)" if logy else "Count"
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)

        pylab.grid(True)
        if filename:
            pylab.savefig(filename)
Example #5
0
    def plot_density(self):
        """Plot one kernel density curve per column of ``self.counts_raw``.

        Counts are clipped to a minimum of 1 and log10-transformed before the
        density is estimated. Titles and labels are set by
        ``self._format_plot``.
        """
        import seaborn

        seaborn.set()
        for column in self.counts_raw.columns:
            # clip at 1 so that the log10 is defined for zero counts
            clipped = self.counts_raw[column].clip(lower=1)
            log_counts = pylab.log10(clipped)
            seaborn.kdeplot(log_counts)

        labels = {
            "title": "Count density distribution",
            "xlabel": "Raw counts (log10)",
            "ylabel": "Density",
        }
        self._format_plot(**labels)
Example #6
0
    def barplot(self, enrich, cutoff=0.05, nmax=10):
        """Horizontal bar plot of -log10(adjusted p-value) per gene set.

        :param enrich: object whose ``results`` attribute is given to
            ``_get_final_df``
        :param float cutoff: forwarded to ``_get_final_df``
        :param int nmax: forwarded to ``_get_final_df``
        :return: the dataframe that was plotted
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)
        positions = range(len(df))

        pylab.clf()
        scores = -pylab.log10(df['Adjusted P-value'])
        pylab.barh(positions, scores)
        pylab.yticks(positions, df.name)

        # 1.3 is approximately -log10(0.05)
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.grid(True)

        pylab.xlabel("Adjusted p-value (log10)")
        pylab.ylabel("Gene sets")

        # Anchor the X axis at zero, keeping the automatic upper bound
        _, upper = pylab.xlim()
        pylab.xlim([0, upper])

        pylab.tight_layout()
        return df
Example #7
0
    def plot(self,
             bins=100,
             cmap="hot_r",
             fontsize=10,
             Nlevels=4,
             xlabel=None,
             ylabel=None,
             norm=None,
             range=None,
             normed=False,
             colorbar=True,
             contour=True,
             grid=True,
             **kargs):
        """Plot a 2D histogram of the first two columns of ``self.df``

        :param int bins: binning for the 2D histogram (either a single value
            or a list of 2 binning values).
        :param cmap: a valid colormap (defaults to hot_r)
        :param fontsize: fontsize for the labels
        :param int Nlevels: must be more than 2 (the first two computed
            contour levels are discarded)
        :param str xlabel: set the xlabel (defaults to the first column name)
        :param str ylabel: set the ylabel (defaults to the second column name)
        :param norm: set to 'log' to use a logarithmic colour normalisation.
        :param normed: given to ``pylab.hist2d`` as its ``density`` argument
        :param range: as in pylab.hist2d : a 2x2 shape [[-3,3],[-4,4]]
        :param bool colorbar: add a colorbar (defaults to True)
        :param contour: show some contours (defaults to True)
        :param bool grid: show underlying grid (defaults to True)
        :param kargs: accepted but not used
        :return: the value returned by ``pylab.hist2d``

        """
        X = self.df[self.df.columns[0]].values
        Y = self.df[self.df.columns[1]].values
        if len(X) > 10000:
            logger.info("Computing 2D histogram. Please wait")

        pylab.clf()

        # 'range' is honoured for both normalisations (it used to be ignored
        # when norm == 'log').
        hist_options = dict(bins=bins, density=normed, cmap=cmap, range=range)
        if norm == 'log':
            from matplotlib import colors
            hist_options["norm"] = colors.LogNorm()
        res = pylab.hist2d(X, Y, **hist_options)

        if colorbar is True:
            pylab.colorbar()

        if contour:
            # bins is either a pair (one value per axis) or a single value
            try:
                bins1 = bins[0]
                bins2 = bins[1]
            except (TypeError, IndexError):
                bins1 = bins
                bins2 = bins

            counts = res[0]
            X, Y = pylab.meshgrid(res[1][0:bins1], res[2][0:bins2])
            if counts.max() < 10 and norm == 'log':
                pylab.contour(X, Y, counts.transpose())
            else:
                # log-spaced levels between 1 and the maximum; the first two
                # are dropped
                levels = [
                    round(x) for x in pylab.logspace(
                        0, pylab.log10(counts.max()), Nlevels)
                ]
                pylab.contour(X, Y, counts.transpose(), levels[2:])

        if ylabel is None:
            ylabel = self.df.columns[1]
        if xlabel is None:
            xlabel = self.df.columns[0]

        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)

        if grid is True:
            pylab.grid(True)

        return res
Example #8
0
    def plot_volcano(
        self,
        padj=0.05,
        add_broken_axes=False,
        markersize=4,
        limit_broken_line=(20, 40),
        plotly=False,
        annotations=None,
    ):
        """Volcano plot: log2 fold change versus -log10 adjusted p-value

        :param float padj: adjusted p-value threshold used to colour the
            points and to position the threshold line
        :param bool add_broken_axes: if True and the largest -log10(padj)
            exceeds the first value of *limit_broken_line*, draw on a broken
            Y axis (requires the brokenaxes package)
        :param int markersize: marker size (matplotlib version only)
        :param limit_broken_line: pair (br1, br2). br1 is the upper bound of
            the lower part of the broken Y axis, br2 the lower bound set on
            its first sub-axes
        :param bool plotly: if True, build and return a plotly figure instead
            of a matplotlib plot
        :param annotations: optional object whose ``annotation`` attribute is
            concatenated column-wise to the data (plotly version only)
        :return: the plotly figure if *plotly* is True, otherwise the object
            used for plotting (the pylab module or a brokenaxes instance)

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_volcano()

        """

        if plotly:
            from plotly import express as px

            df = self.df.copy()

            if annotations is not None:
                try:
                    df = pd.concat([df, annotations.annotation], axis=1)
                except Exception as err:
                    logger.warning(
                        f"Could not merge rnadiff table with annotation. Full error is: {err}"
                    )
            df["log_adj_pvalue"] = -pylab.log10(df.padj)
            below = "<{}".format(padj)
            above = ">={}".format(padj)
            df["significance"] = [
                below if x else above for x in df.padj < padj
            ]

            # First available identifier column is shown when hovering
            candidates = ("Name", "gene_id", "locus_tag", "ID")
            hover_name = next((c for c in candidates if c in df.columns),
                              None)

            fig = px.scatter(
                df,
                x="log2FoldChange",
                y="log_adj_pvalue",
                hover_name=hover_name,
                hover_data=["baseMean"],
                log_y=False,
                opacity=0.5,
                color="significance",
                height=600,
                labels={"log_adj_pvalue": "log adjusted p-value"},
            )
            # The horizontal threshold line is added as a layout shape
            threshold = -pylab.log10(padj)
            fig.update_layout(shapes=[
                dict(
                    type="line",
                    xref="x",
                    x0=df.log2FoldChange.min(),
                    x1=df.log2FoldChange.max(),
                    yref="y",
                    y0=threshold,
                    y1=threshold,
                    line=dict(color="black", width=1, dash="dash"),
                )
            ])

            return fig

        M = max(-pylab.log10(self.df.padj.dropna()))

        br1, br2 = limit_broken_line
        use_broken_axes = bool(add_broken_axes) and M > br1
        if use_broken_axes:
            # imported lazily so that the package is only needed when used
            from brokenaxes import brokenaxes

            bax = brokenaxes(ylims=((0, br1), (M - 10, M)), xlims=None)
        else:
            bax = pylab

        # padj is referenced from the query strings through @padj
        d1 = self.df.query("padj>@padj")
        d2 = self.df.query("padj<=@padj")
        bax.plot(
            d1.log2FoldChange,
            -np.log10(d1.padj),
            marker="o",
            alpha=0.5,
            color="k",
            lw=0,
            markersize=markersize,
        )
        bax.plot(
            d2.log2FoldChange,
            -np.log10(d2.padj),
            marker="o",
            alpha=0.5,
            color="r",
            lw=0,
            markersize=markersize,
        )

        bax.grid(True)

        # Symmetric X range around zero; Series.abs().max() skips NaN whereas
        # the builtin min/max depend on where NaN values are located.
        limit = self.df.log2FoldChange.abs().max()

        if use_broken_axes:
            # NOTE(review): relies on the brokenaxes object exposing
            # axes-style setters and an 'axs' list -- confirm against the
            # installed brokenaxes version.
            bax.set_xlabel("fold change")
            bax.set_ylabel("log10 adjusted p-value")
            bax.set_xlim([-limit, limit])
            y1, _ = bax.get_ylim()
            bax.axs[0].set_ylim([br2, y1[1] * 1.1])
        else:
            bax.xlabel("fold change")
            bax.ylabel("log10 adjusted p-value")
            bax.xlim([-limit, limit])
            _, y2 = bax.ylim()
            bax.ylim([0, y2])

        # The threshold line follows the padj argument
        bax.axhline(-np.log10(padj),
                    lw=2,
                    ls="--",
                    color="r",
                    label="pvalue threshold ({})".format(padj))
        return bax
Example #9
0
    def _get_summary_pathway(self, pathway_ID):
        """Summarise the regulation of the genes of one pathway.

        Genes are read from ``self.df_pathways.loc[pathway_ID]['GENE']``. Each
        gene name (text before the first ';' of the gene description) is
        searched in the ``Name`` column of the down-regulated, then the
        up-regulated table built from ``self.rnadiff.df`` (padj<=0.05). An
        exact match is preferred; otherwise the comparison ignores case.

        Side effects: sets ``self.genes``, ``self.df_down`` and ``self.df_up``.

        :param pathway_ID: key of the pathway in ``self.df_pathways`` and
            ``self.pathways``
        :return: dataframe with columns type ('-', '+' or '='), name, pvalue
            (-log10 of the p-value, None if not found), fc (log2 fold change,
            None if not found), keggid and description
        """
        genes = self.df_pathways.loc[pathway_ID]['GENE']
        df_down = self.rnadiff.df.query(
            "padj<=0.05 and log2FoldChange<0").copy()
        df_up = self.rnadiff.df.query("padj<=0.05 and log2FoldChange>0").copy()

        logger.info("Total down-regulated: {}".format(len(df_down)))
        logger.info("Total up-regulated: {}".format(len(df_up)))

        # gene name -> kegg identifier
        name_to_kegg = {
            description.split(";")[0]: kegg_id
            for kegg_id, description in genes.items()
        }
        self.genes = genes
        self.df_down = df_down
        self.df_up = df_up

        # Populate a missing Name column from the mapper, when available
        if self.mapper is not None:
            for table in (df_down, df_up):
                if 'Name' not in table.columns:
                    table['Name'] = [
                        self.mapper.loc[index]['name'][0]
                        for index in table.index
                    ]

        def matching_rows(table, name):
            # Rows of *table* whose Name is *name*; falls back to a
            # case-insensitive comparison when there is no exact match.
            exact = table[table.Name == name]
            if not exact.empty:
                return exact
            wanted = name.lower()
            mask = np.array([x.lower() == wanted for x in table.Name],
                            dtype=bool)
            return table[mask]

        summary_names = []
        summary_keggids = []
        summary_types = []
        summary_pvalues = []
        summary_fcs = []

        for name, kegg_id in name_to_kegg.items():
            summary_names.append(name)
            summary_keggids.append(kegg_id)

            # down-regulated table has priority over the up-regulated one
            for table, sign in ((df_down, "-"), (df_up, "+")):
                rows = matching_rows(table, name)
                if not rows.empty:
                    pvalue = -pylab.log10(rows.pvalue.values[0])
                    fc = rows.log2FoldChange.values[0]
                    break
            else:
                pvalue, fc, sign = None, None, "="

            summary_pvalues.append(pvalue)
            summary_fcs.append(fc)
            summary_types.append(sign)

        summary = pd.DataFrame({
            "type": summary_types,
            "name": summary_names,
            "pvalue": summary_pvalues,
            "fc": summary_fcs,
            "keggid": summary_keggids
        })
        summary['description'] = [
            self.pathways[pathway_ID]['GENE'][x] for x in summary.keggid
        ]
        return summary
Example #10
0
    def plot_go_terms(self,
                      ontologies,
                      max_features=50,
                      log=False,
                      fontsize=8,
                      minimum_genes=0,
                      pvalue=0.05,
                      cmap="summer_r",
                      sort_by="fold_enrichment",
                      show_pvalues=False,
                      include_negative_enrichment=False,
                      fdr_threshold=0.05,
                      compute_levels=True,
                      progress=True):
        """Scatter plot of GO terms against their fold enrichment.

        One marker per GO term: X is the fold enrichment, colour is the FDR
        and marker size follows the ``number_in_list`` column.

        :param ontologies: given to ``self.get_data``
        :param int max_features: maximum number of GO terms shown
        :param bool log: plot the log2 of the fold enrichment
        :param int fontsize: font size of the Y tick labels
        :param int minimum_genes: keep terms with number_in_list strictly
            larger than this value
        :param float pvalue: keep terms with pValue <= pvalue
        :param cmap: colormap used for the FDR
        :param str sort_by: one of 'pValue', 'fold_enrichment', 'fdr'
        :param bool show_pvalues: add the -log10(pValue) curve on a twin axis
        :param bool include_negative_enrichment: given to ``self.get_data``;
            also makes the X range symmetric around zero
        :param float fdr_threshold: given to ``self.get_data`` as ``fdr`` and
            used as upper bound of the colour scale
        :param bool compute_levels: call ``self.get_graph`` to add a level to
            each tick label
        :param bool progress: given to ``self.get_graph``
        :return: the filtered and sorted dataframe. It is returned early,
            without plotting, when no GO term is available. Otherwise
            ``self.subdf`` and ``self.df`` are also set.
        """

        assert sort_by in ['pValue', 'fold_enrichment', 'fdr']

        # FIXME: pvalue and fold_enrichment not sorted in same order
        pylab.clf()

        df = self.get_data(
            ontologies,
            include_negative_enrichment=include_negative_enrichment,
            fdr=fdr_threshold)

        if len(df) == 0:
            return df

        df = df.query("pValue<=@pvalue")
        logger.info("Filtering out pvalue>{}. Kept {} GO terms".format(
            pvalue, len(df)))
        df = df.reset_index(drop=True)

        # Select a subset of the data to keep the best max_features in terms of
        # pValue
        subdf = df.query("number_in_list>@minimum_genes").copy()
        logger.info(
            "Filtering out GO terms with less than {} genes: Kept {} GO terms".
            format(minimum_genes, len(subdf)))

        logger.info("Filtering out the 3 parent terms")
        subdf = subdf.query("id not in @self.ontologies")

        # Keeping only a part of the data, sorting by pValue
        if sort_by == "pValue":
            subdf = subdf.sort_values(by="pValue",
                                      ascending=False).iloc[-max_features:]
            df = df.sort_values(by="pValue", ascending=False)
        elif sort_by == "fold_enrichment":
            subdf = subdf.sort_values(by="abs_log2_fold_enrichment",
                                      ascending=True).iloc[-max_features:]
            df = df.sort_values(by="abs_log2_fold_enrichment", ascending=False)
        elif sort_by == "fdr":
            subdf = subdf.sort_values(by="fdr",
                                      ascending=False).iloc[-max_features:]
            df = df.sort_values(by="fdr", ascending=False)

        subdf = subdf.reset_index(drop=True)

        # Nothing survived the filters: the size computations below would
        # divide by zero, so stop here like the empty-data case above.
        if len(subdf) == 0:
            logger.warning("No GO terms left after filtering; nothing to plot")
            return df

        # We get all levels for each go id.
        # They are stored by MF, CC or BP
        if compute_levels:
            paths = self.get_graph(list(subdf['id'].values), progress=progress)
            # merge the per-ontology mappings into a fresh dictionary
            goid_levels = {}
            for mapping in paths.values():
                goid_levels.update(mapping)
            subdf["level"] = [goid_levels[ID] for ID in subdf['id'].values]
        else:
            subdf['level'] = ""
        N = len(subdf)

        size_factor = 12000 / len(subdf)
        max_size = subdf.number_in_list.max()
        min_size = subdf.number_in_list.min()
        sizes = [
            max(max_size * 0.2, x) for x in size_factor *
            subdf.number_in_list.values / subdf.number_in_list.max()
        ]

        # smallest, intermediate and largest marker sizes, for the legend
        m1 = min(sizes)
        m3 = max(sizes)
        m2 = m1 + (m3 - m1) / 2

        if log:
            pylab.scatter(pylab.log2(subdf.fold_enrichment),
                          range(len(subdf)),
                          c=subdf.fdr,
                          s=sizes,
                          cmap=cmap,
                          alpha=0.8,
                          ec="k",
                          vmin=0,
                          vmax=fdr_threshold,
                          zorder=10)
        else:
            pylab.scatter(subdf.fold_enrichment,
                          range(len(subdf)),
                          c=subdf.fdr,
                          cmap=cmap,
                          s=sizes,
                          ec="k",
                          alpha=.8,
                          vmin=0,
                          vmax=fdr_threshold,
                          zorder=10)
        pylab.grid(zorder=-10)
        ax2 = pylab.colorbar(shrink=0.5)
        ax2.ax.set_ylabel('FDR')

        # Long labels are truncated to 50 characters
        labels = [
            x if len(x) < 50 else x[0:47] + "..." for x in list(subdf.label)
        ]
        ticks = [
            "{} ({}) {}".format(ID, level, "; " + label.title())
            for level, ID, label in zip(subdf['level'], subdf.id, labels)
        ]

        pylab.yticks(range(N), ticks, fontsize=fontsize, ha='left')

        yax = pylab.gca().get_yaxis()
        try:
            pad = [x.label.get_window_extent().width for x in yax.majorTicks]
            yax.set_tick_params(pad=max(pad))
        except Exception:
            yax.set_tick_params(pad=60 * fontsize * 0.7)
        # NOTE(review): this unconditionally overrides the padding computed
        # just above -- confirm whether that is intended.
        yax.set_tick_params(pad=60 * fontsize * 0.6)

        fc_max = subdf.fold_enrichment.max(skipna=True)
        fc_min = subdf.fold_enrichment.min(skipna=True)
        # go into log2 space
        fc_max = pylab.log2(fc_max)
        fc_min = pylab.log2(fc_min)
        abs_max = max(fc_max, abs(fc_min), 1)

        if log:
            fc_max = abs_max * 1.5
        else:
            fc_max = 2**abs_max * 1.2

        pylab.axvline(0, color="k", lw=2)
        if log:
            pylab.xlabel("Fold Enrichment (log2)")
        else:
            pylab.xlabel("Fold Enrichment")
        if include_negative_enrichment:
            pylab.xlim([-fc_max, fc_max])
        else:
            pylab.xlim([0, fc_max])
        pylab.tight_layout()

        # The pvalue:
        if show_pvalues:
            ax = pylab.gca().twiny()
            ax.set_xlim([0, max(-pylab.log10(subdf.pValue)) * 1.2])
            ax.set_xlabel("p-values (log10)", fontsize=12)
            ax.plot(-pylab.log10(subdf.pValue),
                    range(len(subdf)),
                    label="pvalue",
                    lw=2,
                    color="k")
            ax.axvline(1.33, lw=1, ls="--", color="grey", label="pvalue=0.05")
            pylab.tight_layout()
            pylab.legend(loc="lower right")

        # Empty scatters used only as legend handles for the marker sizes
        s1 = pylab.scatter([], [], s=m1, marker='o', color='#555555', ec="k")
        s2 = pylab.scatter([], [], s=m2, marker='o', color='#555555', ec="k")
        s3 = pylab.scatter([], [], s=m3, marker='o', color='#555555', ec="k")

        # Legend spacing depends on the number of GO terms shown
        if len(subdf) < 10:
            labelspacing = 1.5 * 4
            borderpad = 4
        elif len(subdf) < 20:
            labelspacing = 1.5 * 2
            borderpad = 1
        else:
            labelspacing = 1.5
            borderpad = 2
        handletextpad = 2

        legend_options = dict(scatterpoints=1,
                              loc='lower right',
                              ncol=1,
                              frameon=True,
                              title="gene-set size",
                              labelspacing=labelspacing,
                              borderpad=borderpad,
                              handletextpad=handletextpad,
                              fontsize=8)

        if len(subdf) >= 3:
            leg = pylab.legend(
                (s1, s2, s3),
                (str(int(min_size)),
                 str(int(min_size +
                         (max_size - min_size) / 2)), str(int(max_size))),
                **legend_options)
        else:
            leg = pylab.legend((s1, ), (str(int(min_size)), ),
                               **legend_options)

        frame = leg.get_frame()
        frame.set_facecolor('#b4aeae')
        frame.set_edgecolor('black')
        frame.set_alpha(1)

        self.subdf = subdf
        self.df = df
        return df