Beispiel #1
0
    def plot_unknown_barcodes(self, N=20):
        """Plot the N most abundant unknown barcodes as horizontal bars per lane.

        The barcodes are read from ``self.data['UnknownBarcodes']`` (one entry
        per lane with a ``Lane`` identifier and a ``Barcodes`` mapping), ranked
        by their total count over all lanes, and the first *N* are drawn.

        :param int N: number of barcodes to show.
        :return: the plotted dataframe (one row per barcode, one column per
            lane, most abundant barcode last so that it is drawn on top).
        """
        from matplotlib import rcParams

        ub = self.data['UnknownBarcodes']
        df = pd.DataFrame({x['Lane']: x['Barcodes'] for x in ub})

        # if data is made of undetermined only, the dataframe is just made of
        # N lanes with one entry : unknown. A zero-filled 'known' row is added
        # in that case.
        if "unknown" in df.index and len(df) == 1:
            df.loc['known'] = [0 for i in df.columns]

        S = df.sum(axis=1).sort_values(ascending=False).index[0:N]
        data = df.loc[S][::-1]
        data.columns = ["Lane {}".format(x) for x in data.columns]

        # The grid must be drawn below the bars. This rc parameter is a global
        # setting, so the caller's value is restored once the axes exist
        # instead of being forced to False.
        previous_axisbelow = rcParams['axes.axisbelow']
        rcParams['axes.axisbelow'] = True
        try:
            pylab.figure(figsize=(10, 8))
            ax = pylab.gca()
            data.plot(kind="barh", width=1, ec="k", ax=ax)
        finally:
            rcParams['axes.axisbelow'] = previous_axisbelow

        pylab.xlabel("Number of reads", fontsize=12)
        pylab.ylabel("")
        pylab.grid(True)
        # Use the real lane names: relabelling with 1..n would be wrong
        # whenever the lane identifiers are not exactly 1..n.
        pylab.legend(list(data.columns), loc="lower right")
        try:
            pylab.tight_layout()
        except Exception as err:
            # tight_layout is cosmetic only; report and carry on
            print(err)
        return data
Beispiel #2
0
    def plot_volcano(self):
        """Volcano plot: log2 fold change versus -log10 of the adjusted p-value.

        Entries of ``self.df`` with ``padj > 0.05`` are drawn in red, those
        with ``padj <= 0.05`` in black. A dashed horizontal line marks the
        0.05 threshold. The x-axis is made symmetric around zero.

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_volcano()

        """
        above = self.df.query("padj>0.05")
        below = self.df.query("padj<=0.05")

        pylab.figure()
        pylab.plot(above.log2FoldChange, -np.log10(above.padj), marker="o",
            alpha=0.5, color="r", lw=0)
        pylab.plot(below.log2FoldChange, -np.log10(below.padj), marker="o",
            alpha=0.5, color="k", lw=0)

        pylab.grid(True)
        # labels now match what is actually plotted
        pylab.xlabel("log2 fold change")
        pylab.ylabel("-log10 adjusted p-value")

        # symmetric x-range; Series.abs().max() skips missing values whereas
        # the builtin min/max give order-dependent results when NaN is present
        limit = self.df.log2FoldChange.abs().max()
        pylab.xlim([-limit, limit])
        _, ymax = pylab.ylim()
        pylab.ylim([0, ymax])

        pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r", label="pvalue threshold (0.05)")
        # without a legend the label given to the threshold line is never shown
        pylab.legend()
Beispiel #3
0
    def run_enrichment_kegg(self,
                            organism,
                            annot_col="Name",
                            out_dir="enrichment"):  # pragma: no cover
        """Run the KEGG pathway enrichment for every comparison and export it.

        For each comparison in ``self.comparisons`` and each direction
        (up, down, all) the enrichment table is stored and a scatter plot is
        saved as PDF and PNG in ``<out_dir>/figures``. The concatenated table
        is stored in ``self.enrichment_kegg`` and written to
        ``<out_dir>/enrichment_kegg.xlsx``.

        :param organism: organism identifier forwarded to
            :class:`KeggPathwayEnrichment`.
        :param str annot_col: annotation column forwarded to
            :meth:`get_gene_lists`.
        :param out_dir: output directory (created if needed).
        """
        out_dir = Path(out_dir) / "figures"
        out_dir.mkdir(exist_ok=True, parents=True)

        gene_lists_dict = self.get_gene_lists(annot_col=annot_col, dropna=True)
        enrichment = {}

        for compa in self.comparisons:
            gene_lists = gene_lists_dict[compa]
            ke = KeggPathwayEnrichment(gene_lists, organism, progress=False)
            ke.compute_enrichment()

            for direction in ["up", "down", "all"]:
                enrichment[(compa, direction)] = ke._get_final_df(
                    ke.enrichment[direction].results, nmax=10000)
                fig = pylab.figure()
                try:
                    ke.scatterplot(direction)
                    pylab.tight_layout()
                    pylab.savefig(out_dir / f"kegg_{compa}_{direction}.pdf")
                    pylab.savefig(out_dir / f"kegg_{compa}_{direction}.png")
                finally:
                    # one figure per comparison/direction: close it once saved
                    # so that figures do not pile up in memory
                    pylab.close(fig)

            logger.info(f"KEGG enrichment for {compa} DONE.")

        df = pd.concat(enrichment).sort_index()
        df.index.rename(["comparison", "direction", "index"], inplace=True)

        self.enrichment_kegg = df

        # Export results (should be moved to enrichment.py at some point I think)
        with pd.ExcelWriter(out_dir.parent / "enrichment_kegg.xlsx") as writer:
            exported = self.enrichment_kegg.reset_index()
            # sheet_name must be given by keyword with recent pandas
            exported.to_excel(writer, sheet_name="kegg", index=False)
            ws = writer.sheets["kegg"]
            try:
                ws.autofilter(0, 0, exported.shape[0], exported.shape[1] - 1)
            except Exception:
                # the autofilter is cosmetic and depends on the Excel engine
                # in use; the data themselves are already written
                logger.warning("XLS formatting issue.")
Beispiel #4
0
    def plot_hist_coverage(self, logx=True, logy=True, fontsize=16, N=20,
        fignum=1, hold=False, alpha=0.5, filename=None, **kw_hist):
        """Plot the histogram of the coverage (``cov`` column of ``self.df``).

        :param bool logx: use logarithmically spaced bins and a log x-axis.
        :param bool logy: use a log scale for the counts.
        :param int fontsize: font size of the axis labels.
        :param int N: number of bins (number of bin edges when *logx* is True).
        :param fignum: figure number used when *hold* is False.
        :param bool hold: if False, the figure *fignum* is selected and
            cleared first; otherwise the histogram is added to the current
            figure.
        :param float alpha: transparency of the bars.
        :param filename: if set, the figure is saved into that file.
        :param kw_hist: any other keyword argument is passed to
            :func:`pylab.hist`.
        """
        if hold is False:
            pylab.figure(fignum)
            pylab.clf()
        ax = pylab.gca()
        ax.set_facecolor('#eeeeee')

        data = self.df['cov'].dropna().values

        maxcov = data.max()
        if logx is True and logy is True:
            bins = pylab.logspace(0, pylab.log10(maxcov), N)
            pylab.hist(data, bins=bins, log=True, label=self.chrom_name,
                alpha=alpha, **kw_hist)
            pylab.semilogx()
            pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
            pylab.ylabel("Count (log scale)", fontsize=fontsize)
        elif logx is False and logy is True:
            pylab.hist(data, bins=N, log=True, label=self.chrom_name,
                alpha=alpha, **kw_hist)
            pylab.xlabel("Coverage", fontsize=fontsize)
            pylab.ylabel("Count (log scale)", fontsize=fontsize)
        elif logx is True and logy is False:
            # the log-spaced edges must actually be used here, as in the
            # log/log case; linear bins shown on a log axis pile up in the
            # last decade
            bins = pylab.logspace(0, pylab.log10(maxcov), N)
            pylab.hist(data, bins=bins, label=self.chrom_name, alpha=alpha,
                **kw_hist)
            pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
            pylab.ylabel("Count", fontsize=fontsize)
            pylab.semilogx()
        else:
            pylab.hist(data, bins=N, label=self.chrom_name, alpha=alpha,
                **kw_hist)
            pylab.xlabel("Coverage", fontsize=fontsize)
            pylab.ylabel("Count", fontsize=fontsize)
        pylab.grid(True)
        if filename:
            pylab.savefig(filename)
Beispiel #5
0
    def plot(self,
             n_components=2,
             n_neighbors=5,
             transform="log",
             switch_x=False,
             switch_y=False,
             switch_z=False,
             colors=None,
             max_features=500,
             show_plot=True):
        """Project the samples with an Isomap embedding and plot the result.

        :param n_components: number of dimensions of the embedding, passed to
            :class:`sklearn.manifold.Isomap`. With 3 or more components, the
            (1st, 3rd) and (2nd, 3rd) projections are plotted as well.
        :param n_neighbors: number of neighbours, passed to Isomap.
        :param transform: transformation method forwarded to
            :meth:`scale_data`.
        :param switch_x: flip the sign of the first component.
        :param switch_y: flip the sign of the second component.
        :param switch_z: flip the sign of the third component.
        :param colors: forwarded to :meth:`_plot`.
        :param max_features: forwarded to :meth:`scale_data`.
        :param show_plot: if False, nothing is drawn.
        :return: the fitted Isomap instance. The embedded coordinates are
            stored in ``self.Xr``.
        """
        from sklearn.manifold import Isomap
        import numpy as np

        pylab.clf()

        data, kept = self.scale_data(transform_method=transform,
                                     max_features=max_features)

        iso = Isomap(n_neighbors=n_neighbors, n_components=n_components)
        samples = data.T
        iso.fit(samples)
        Xr = iso.transform(samples)
        self.Xr = Xr

        # optional mirroring of each axis (done in place, so self.Xr sees it)
        for requested, axis in ((switch_x, 0), (switch_y, 1), (switch_z, 2)):
            if requested:
                Xr[:, axis] *= -1

        # (figure number, component on x, component on y)
        panels = [(1, 0, 1)]
        if n_components >= 3:
            panels.extend([(2, 0, 2), (3, 1, 2)])

        if show_plot:
            for fignum, first, second in panels:
                pylab.figure(fignum)
                self._plot(Xr, pca=None, pc1=first, pc2=second, colors=colors)
        return iso
Beispiel #6
0
    def plot(self, n_components=2, transform="log", switch_x=False,
            switch_y=False, switch_z=False, colors=None,
            max_features=500, show_plot=True):
        """Compute a PCA of the samples and plot the first components.

        :param n_components: at number starting at 2 or a value below 1
            e.g. 0.95 means select automatically the number of components to
            capture 95% of the variance
        :param transform: can be 'log' or 'anscombe', log is just log10. count
            with zeros, are set to 1
        :param switch_x: flip the sign of the first component.
        :param switch_y: flip the sign of the second component.
        :param switch_z: flip the sign of the third component.
        :param colors: forwarded to :meth:`_plot`.
        :param max_features: forwarded to :meth:`scale_data`.
        :param show_plot: if False, nothing is drawn.
        :return: the explained variance ratio of each component. The projected
            coordinates are stored in ``self.Xr``.
        """
        assert transform in ['log', 'anscombe']

        from sklearn.decomposition import PCA

        pylab.clf()
        pca = PCA(n_components)

        data, kept = self.scale_data(transform_method=transform, max_features=max_features)

        pca.fit(data.T)

        Xr = pca.transform(self.scaler.fit_transform(self.df.loc[kept].T))
        self.Xr = Xr

        if switch_x:
            Xr[:,0] *= -1
        if switch_y:
            Xr[:,1] *= -1
        if switch_z:
            Xr[:,2] *= -1

        # PC1 vs PC2
        if show_plot:
            pylab.figure(1)
            self._plot(Xr, pca=pca, pc1=0,pc2=1, colors=colors)

        if show_plot and len(pca.explained_variance_ratio_) >= 3:
            # PC1 vs PC3
            pylab.figure(2)
            self._plot(Xr, pca=pca, pc1=0,pc2=2, colors=colors)
            # PC2 vs PC3: the fitted PCA is passed here as well (it used to be
            # None for this figure only, unlike the two others)
            pylab.figure(3)
            self._plot(Xr, pca=pca, pc1=1,pc2=2, colors=colors)

        return pca.explained_variance_ratio_
Beispiel #7
0
    def plot(self,
             kind="pie",
             cmap="copper",
             threshold=1,
             radius=0.9,
             textcolor="red",
             **kargs):
        """A simple non-interactive plot of taxons

        :param kind: 'pie' or 'barh'.
        :param cmap: colormap used for the pie chart.
        :param threshold: taxons below this percentage (strictly between 0
            and 100) are summed up into a single 'others' entry.
        :param radius: radius of the pie chart.
        :param textcolor: color of the texts of the pie chart.
        :param kargs: any other argument is passed to the pandas plot method.
        :return: None if no taxon were found and a dataframe otherwise

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.
        """
        if len(self._df) == 0:
            return

        # This may have already been called but maybe not. This is not time
        # consuming, so we call it again here
        if not self._data_created:
            self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return

        if len(self.taxons.index) == 0:
            return None

        # NOTE: the .ix indexer does not exist anymore in pandas; .loc is the
        # label-based equivalent and also supports adding a new label
        df = self.get_taxonomy_biokit(list(self.taxons.index))
        df.loc[-1] = ["Unclassified"] * 8
        data = self.taxons.copy()
        data.loc[-1] = self.unclassified

        data = data / data.sum() * 100
        assert threshold > 0 and threshold < 100
        others = data[data < threshold].sum()
        # >= so that a taxon exactly at the threshold is not lost (it is
        # not part of 'others' either)
        data = data[data >= threshold]
        names = df.loc[data.index]['name']

        data.index = names.values
        data.loc['others'] = others
        try:
            data.sort_values(inplace=True)
        except AttributeError:
            # very old pandas versions only provide Series.sort
            data.sort(inplace=True)

        # text may be long so, let us increase the figsize a little bit
        pylab.figure(figsize=(10, 8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind,
                           cmap=cmap,
                           autopct='%1.1f%%',
                           radius=radius,
                           **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                #  large, x-small, small, None, x-large, medium, xx-small,
                #  smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
            self.ax = ax
        elif kind == "barh":
            ax = data.plot(kind=kind, **kargs)
            pylab.xlabel(" percentage ")

        return data
Beispiel #8
0
    def plot(
            self,
            num=1,
            cmap=None,
            colorbar=True,
            vmin=None,
            vmax=None,
            colorbar_position="right",
            gradient_span="None",
            figsize=(12, 8),
            fontsize=None,
    ):
        """Plot the heatmap with optional dendrograms, side bars and colorbar.

        Using as input::

            df = pd.DataFrame({'A':[1,0,1,1],
                               'B':[.9,0.1,.6,1],
                            'C':[.5,.2,0,1],
                            'D':[.5,.2,0,1]})

        we can plot the heatmap + dendogram as follows::

            h = Heatmap(df)
            h.plot(vmin=0, vmax=1.1)


        .. plot::
            :include-source:
            :width: 80%

            from sequana.viz import heatmap
            df = heatmap.get_heatmap_df()
            h = heatmap.Heatmap(df)
            h.category_column['A'] = 1
            h.category_column['C'] = 1
            h.category_column['D'] = 2
            h.category_column['B'] = 2
            h.plot()

        :param num: matplotlib figure number.
        :param cmap: colormap; defaults to ``self.params.cmap``.
        :param colorbar: add a colorbar if True.
        :param vmin: lower bound of the color scale.
        :param vmax: upper bound of the color scale.
        :param colorbar_position: 'right' or 'top left'.
        :param gradient_span: one of 'min_to_max_centered', 'only_max' or
            'only_min' to derive vmin/vmax from the data (overriding the
            *vmin*/*vmax* arguments); any other value keeps them as given.
        :param figsize: size of the figure.
        :param fontsize: font size; if not set, it is chosen from the number
            of rows/columns.
        :return: a dictionary with the matplotlib objects created (keys among
            dendogram1, dendogram2, heatmap, category_column, category_row,
            colorbar, colorbar_scalablemap).
        :raises ValueError: if *colorbar_position* is not supported.

        Side effects: ``self.frame`` is reordered according to the
        dendrograms, ``self.d`` stores the ordered frame and the row/column
        orders, and the global matplotlib ``font.size`` is changed.
        """
        import matplotlib

        # save all parameters in a dict
        layout = {}

        if cmap is None:
            cmap = self.params.cmap
        try:
            import colormap

            cmap = colormap.cmap_builder(cmap)
        except Exception:
            # best effort: colormap is optional and may not know this name;
            # cmap is then handed over to matplotlib unchanged
            pass

        # keep track of row and column names for later.
        row_header = self.frame.index
        column_header = self.frame.columns

        # FIXME something clever for the fontsize
        # The tests must be exclusive and go from the largest to the smallest
        # size: as independent 'if' statements the last one always won, so
        # that 6 and 7 could never be selected.
        nlabels = max(len(row_header), len(column_header))
        if fontsize:
            font_size = fontsize
        elif nlabels > 100:
            font_size = 6
        elif nlabels > 50:
            font_size = 7
        elif nlabels > 30:
            font_size = 8
        else:
            font_size = 12
        matplotlib.rcParams["font.size"] = font_size

        # scaling min/max range
        self.gradient_span = gradient_span  #'only_max'
        # min_to_max, min_to_max_centered, only_max, only_min

        if self.gradient_span == "min_to_max_centered":
            vmax = self.frame.max().max()
            vmin = self.frame.min().min()
            vmax = max([vmax, abs(vmin)])
            vmin = vmax * -1
        if self.gradient_span == "only_max":
            vmin = 0
            vmax = self.frame.max().max()
        if self.gradient_span == "only_min":
            vmin = self.frame.min().min()
            vmax = 0
        norm = matplotlib.colors.Normalize(vmin, vmax)

        # Scale the figure window size #
        fig = pylab.figure(num=num, figsize=figsize)
        fig.clf()

        # LAYOUT --------------------------------------------------
        # All positions are [x, y, width, height] in figure coordinates.
        gap = 0.004  # space left between two adjacent axes
        # Sufficient size to show
        color_bar_w = 0.015

        # ax1 (dendrogram 1) on the left of the heatmap
        [ax1_x, ax1_y, ax1_w, ax1_h] = [0.05, 0.22, 0.2, 0.6]

        # axr, row side colorbar, just right of the row dendrogram
        axr_x = ax1_x + ax1_w + gap
        axr_y = ax1_y
        axr_w = color_bar_w
        axr_h = ax1_h

        # axc, column side colorbar, just above the heatmap
        axc_x = axr_x + axr_w + gap
        axc_y = ax1_y + ax1_h + gap
        axc_w = 0.5
        axc_h = color_bar_w

        # axm, heatmap for the data matrix
        axm_x = axr_x + axr_w + gap
        axm_y = ax1_y
        axm_w = axc_w
        axm_h = ax1_h

        # ax2 (dendrogram 2), on the top of the heatmap
        ax2_x = axr_x + axr_w + gap
        ax2_y = ax1_y + ax1_h + gap + axc_h + gap
        ax2_w = axc_w
        ax2_h = 0.15

        # axcb - placement of the color legend #
        if colorbar_position == "top left":
            [axcb_x, axcb_y, axcb_w, axcb_h] = [0.07, 0.88, 0.18, 0.09]
        elif colorbar_position == "right":
            [axcb_x, axcb_y, axcb_w, axcb_h] = [0.85, 0.2, 0.08, 0.6]
        else:
            raise ValueError("'top left' or 'right' accepted for now")

        # COLUMN DENDROGRAM (top) -------------------------------------
        if self.column_method:
            Y = self.linkage(self.frame.transpose(), self.column_method,
                             self.column_metric)
            # the dendrogram is drawn in the axes that was created last
            ax2 = fig.add_axes([ax2_x, ax2_y, ax2_w, ax2_h], frame_on=True)

            # color_threshold=0 and above_threshold_color='k' colors all
            # dendogram into black
            Z = hierarchy.dendrogram(
                Y,
                color_threshold=0,
                above_threshold_color="k",
                distance_sort="descending",
            )

            ax2.set_xticks([])
            ax2.set_yticks([])
            # apply the clustering for the array-dendrograms to the actual matrix data
            idx2 = Z["leaves"]
            self.frame = self.frame.iloc[:, idx2]
            layout["dendogram2"] = ax2
        else:
            idx2 = range(self.frame.shape[1])

        # ROW DENDROGRAM (left) ---------------------------------
        if self.row_method:
            Y = self.linkage(self.frame, self.row_method, self.row_metric)

            ax1 = fig.add_axes([ax1_x, ax1_y, ax1_w, ax1_h], frame_on=True)
            Z = hierarchy.dendrogram(
                Y,
                orientation="right",
                color_threshold=0,
                above_threshold_color="k",
                distance_sort="descending",
            )

            ax1.set_xticks([])
            ax1.set_yticks([])
            # apply the clustering for the array-dendrograms to the actual matrix data
            idx1 = Z["leaves"]
            self.frame = self.frame.iloc[idx1, :]
            layout["dendogram1"] = ax1
        else:
            idx1 = range(self.frame.shape[0])

        # HEATMAP itself
        axm = fig.add_axes([axm_x, axm_y, axm_w, axm_h])
        axm.imshow(
            self.frame,
            aspect="auto",
            origin="lower",
            interpolation="None",
            cmap=cmap,
            norm=norm,
        )
        axm.set_xticks([])
        axm.set_yticks([])
        layout["heatmap"] = axm

        # TEXT: the headers were saved before the frame was reordered, so
        # idx1/idx2 give the original position of the i-th displayed item
        ncols = self.frame.shape[1]
        for i, original_pos in enumerate(idx1):
            axm.text(
                ncols - 0.5,
                i,
                "  " + str(row_header[original_pos]),
                verticalalignment="center",
            )

        for i, original_pos in enumerate(idx2):
            axm.text(
                i,
                -0.9,
                " " + str(column_header[original_pos]),
                rotation=90,
                verticalalignment="top",
                horizontalalignment="center",
            )

        # CATEGORY column ------------------------------
        if self.category_column:
            axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h])

            category_col = [
                self.category_column[self.df.columns[i]] for i in idx2
            ]

            dc = np.array(category_col, dtype=int)
            # sized from the list itself: the flat clusters previously used
            # here only exist when a column dendrogram is computed
            dc.shape = (1, len(category_col))
            cmap_c = matplotlib.colors.ListedColormap(
                self.params.col_side_colors)
            axc.matshow(dc, aspect="auto", origin="lower", cmap=cmap_c)
            axc.set_xticks([])
            axc.set_yticks([])
            layout["category_column"] = axc

        # CATEGORY row -------------------------------
        if self.category_row:
            axr = fig.add_axes([axr_x, axr_y, axr_w, axr_h])
            # self.category_row must be a dictionary with names as found in
            # the index of the dataframe.

            category_row = [self.category_row[self.df.index[i]] for i in idx1]

            dr = np.array(category_row, dtype=int)
            dr.shape = (len(category_row), 1)
            cmap_r = matplotlib.colors.ListedColormap(
                self.params.col_side_colors)
            axr.matshow(dr, aspect="auto", origin="lower", cmap=cmap_r)
            axr.set_xticks([])
            axr.set_yticks([])
            layout["category_row"] = axr

        # COLORBAR ----------------------
        if colorbar:
            axcb = fig.add_axes([axcb_x, axcb_y, axcb_w, axcb_h],
                                frame_on=False)
            if colorbar_position == "right":
                orientation = "vertical"
            else:
                orientation = "horizontal"
            cb = matplotlib.colorbar.ColorbarBase(ax=axcb,
                                                  cmap=cmap,
                                                  norm=norm,
                                                  orientation=orientation)
            layout["colorbar"] = cb
            layout["colorbar_scalablemap"] = axcb

        #   could be useful
        self.d = {"ordered": self.frame.copy(), "rorder": idx1, "corder": idx2}

        return layout
Beispiel #9
0
    def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
                textcolor="red", **kargs):
        """A simple non-interactive plot of taxons

        :param kind: 'pie' or 'barh'.
        :param cmap: colormap used for the pie chart.
        :param threshold: taxons below this percentage (strictly between 0
            and 100) are summed up into a single 'others' entry.
        :param radius: radius of the pie chart.
        :param textcolor: color of the texts of the pie chart.
        :param kargs: any other argument is passed to the pandas plot method.
        :return: None if no taxon were found and a dataframe otherwise

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.
        """
        if len(self._df) == 0:
            return

        # This may have already been called but maybe not. This is not time
        # consuming, so we call it again here
        if not self._data_created:
            self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return

        if len(self.taxons.index) == 0:
            return None

        # .loc replaces the .ix indexer, which has been removed from pandas
        df = self.get_taxonomy_biokit(list(self.taxons.index))
        df.loc[-1] = ["Unclassified"] * 8
        data = self.taxons.copy()
        data.loc[-1] = self.unclassified

        data = data/data.sum()*100
        assert threshold > 0 and threshold < 100
        others = data[data<threshold].sum()
        # keep the entries equal to the threshold: with a strict comparison
        # they would be neither plotted nor counted in 'others'
        data = data[data>=threshold]
        names = df.loc[data.index]['name']

        data.index = names.values
        data.loc['others'] = others
        try:
            data.sort_values(inplace=True)
        except AttributeError:
            # Series.sort_values is missing from very old pandas versions
            data.sort(inplace=True)

        # text may be long so, let us increase the figsize a little bit
        pylab.figure(figsize=(10,8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                radius=radius, **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                #  large, x-small, small, None, x-large, medium, xx-small,
                #  smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
            self.ax = ax
        elif kind == "barh":
            ax = data.plot(kind=kind,  **kargs)
            pylab.xlabel(" percentage ")

        return data
Beispiel #10
0
    def run_enrichment_go(self,
                          taxon,
                          annot_col="Name",
                          out_dir="enrichment"):  # pragma: no cover
        """Run the GO (Panther) enrichment for every comparison and export it.

        For each comparison in ``self.comparisons``, each direction (up, down,
        all) and each of the three GO root categories, the enriched terms are
        plotted and saved in ``<out_dir>/figures``. Results are stored in
        ``self.enrichment_go``, the list of enrichments that could not be
        computed or were empty in ``self.failed_go_enrichments``, and the
        results are written to ``<out_dir>/enrichment_go.xlsx``.

        :param taxon: taxon identifier forwarded to
            :class:`PantherEnrichment`.
        :param str annot_col: annotation column forwarded to
            :meth:`get_gene_lists`.
        :param out_dir: output directory (created if needed).
        """
        out_dir = Path(out_dir) / "figures"
        out_dir.mkdir(exist_ok=True, parents=True)

        gene_lists_dict = self.get_gene_lists(annot_col=annot_col,
                                              Nmax=2000,
                                              dropna=True)
        enrichment = {}
        # Root terms of the three GO namespaces; the short names are used in
        # the output file names. GO:0003674 is molecular_function and
        # GO:0008150 is biological_process.
        ontologies = {
            "GO:0003674": "MF",
            "GO:0008150": "BP",
            "GO:0005575": "CC"
        }
        failed_enrichments = []

        for compa in self.comparisons:
            gene_lists = gene_lists_dict[compa]
            pe = PantherEnrichment(gene_lists, taxon)
            pe.compute_enrichment(ontologies=ontologies.keys(), progress=False)

            for direction in ["up", "down", "all"]:
                if not pe.enrichment[direction]:
                    # no ontology is selected yet at this level, so the
                    # message only mentions the comparison and the direction
                    logger.warning(
                        f"No enrichment computed, so no plots computed for {compa} {direction}"
                    )
                    failed_enrichments.append({
                        "comparison": compa,
                        "direction": direction,
                        "GO": "all",
                        "reason": "no enrichment computed",
                    })
                    continue

                for ontology in ontologies.keys():
                    fig = pylab.figure()
                    try:
                        enrichment_df = pe.plot_go_terms(direction,
                                                         ontology,
                                                         compute_levels=False)
                        if enrichment_df.empty:
                            failed_enrichments.append({
                                "comparison": compa,
                                "direction": direction,
                                "GO": ontology,
                                "reason": "no enrichment found",
                            })
                        else:
                            enrichment[(compa, direction,
                                        ontology)] = enrichment_df
                            pylab.tight_layout()
                            pylab.savefig(
                                out_dir /
                                f"go_{compa}_{direction}_{ontologies[ontology]}.pdf"
                            )
                            pe.save_chart(
                                enrichment_df,
                                out_dir /
                                f"chart_{compa}_{direction}_{ontologies[ontology]}.png",
                            )
                    finally:
                        # one figure per ontology: close it so that figures
                        # do not accumulate over the three nested loops
                        pylab.close(fig)

            logger.info(f"Panther enrichment for {compa} DONE.")

        # stored first so that the reasons remain available even if nothing
        # could be enriched and the concatenation below fails
        self.failed_go_enrichments = pd.DataFrame(failed_enrichments)

        df = pd.concat(enrichment).sort_index()
        df.index.rename(["comparison", "direction", "GO_category", "index"],
                        inplace=True)

        self.enrichment_go = df

        # Export results (should be moved to enrichment.py at some point I think)
        with pd.ExcelWriter(out_dir.parent / "enrichment_go.xlsx") as writer:
            exported = self.enrichment_go.reset_index()
            # sheet_name must be given by keyword with recent pandas
            exported.to_excel(writer, sheet_name="go", index=False)
            ws = writer.sheets["go"]
            try:
                ws.autofilter(0, 0, exported.shape[0], exported.shape[1] - 1)
            except Exception:
                # the autofilter is cosmetic and depends on the Excel engine
                logger.warning("XLS formatting issue.")
Beispiel #11
0
    def plot(self,
             kind="pie",
             cmap="tab20c",
             threshold=1,
             radius=0.9,
             textcolor="red",
             **kargs):
        """A simple non-interactive plot of taxons

        :param kind: 'pie' or 'barh'.
        :param cmap: colormap used for the pie chart.
        :param threshold: taxons below this percentage (strictly between 0
            and 100) are summed up into a single 'others' entry.
        :param radius: radius of the pie chart.
        :param textcolor: color of the texts of the pie chart.
        :param kargs: any other argument is passed to the pandas plot method.
        :return: None if no taxon were found and a dataframe otherwise

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.


        .. todo:: For a future release, we could use this kind of plot
            https://stackoverflow.com/questions/57720935/how-to-use-correct-cmap-colors-in-nested-pie-chart-in-matplotlib
        """
        if len(self._df) == 0:
            return

        # This may have already been called but maybe not. This is not time
        # consuming, so we call it again here
        if not self._data_created:
            self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return

        if len(self.taxons.index) == 0:
            return None

        df = self.get_taxonomy_db(list(self.taxons.index))
        data = self.taxons.copy()

        # we add the unclassified only if needed (label -1 in both tables)
        if self.unclassified > 0:
            df.loc[-1] = ["Unclassified"] * 8
            data.loc[-1] = self.unclassified

        data = data / data.sum() * 100
        assert threshold > 0 and threshold < 100

        # everything below the threshold (1) is gather together and summarised
        # into 'others'
        others = data[data < threshold].sum()

        data = data[data >= threshold]
        names = df.loc[data.index]['name']

        data.index = names.values

        if others > 0:
            data.loc['others'] = others

        try:
            data.sort_values(inplace=True)
        except AttributeError:
            # only very old pandas versions lack Series.sort_values; any other
            # error must not be hidden behind this fallback
            data.sort(inplace=True)

        pylab.figure(figsize=(10, 8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind,
                           cmap=cmap,
                           autopct='%1.1f%%',
                           radius=radius,
                           **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                #  large, x-small, small, None, x-large, medium, xx-small,
                #  smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
            self.ax = ax
        elif kind == "barh":
            ax = data.plot(kind=kind, **kargs)
            pylab.xlabel(" percentage ")

        return data
    max(df_results["Illumina_score"].dropna()))
# Analyses whose name contains 'Pacbio': their score column is divided by 100
# and rounded to 2 decimals.
# NOTE(review): presumably these scores are percentages brought back to the
# [0, 1] range - confirm against the code producing df_results.
list_pacbio_analysis = [col for col in list_analysis if ('Pacbio' in col)]
for analysis in list_pacbio_analysis:
    df_results[analysis + "_score"] = round(
        df_results[analysis + "_score"] / 100., 2)

# One color per analysis: a user-defined list when more than 5 entries are
# found in sys.argv, otherwise colors evenly sampled from the named colormap.
if len(sys.argv) > 5:
    print("perso")
    colors = custom_colormap
else:
    cmap = pylab.cm.get_cmap(colormap)
    colors = [cmap(i) for i in np.linspace(0, 1, len(list_analysis))]

# get results for curves

# One precision/recall curve per analysis, all drawn in the same figure.
pylab.figure(figsize=(8, 8))
for i in range(len(list_analysis)):
    analysis = list_analysis[i]
    res = compute_table_performance(analysis, df_results)
    print("%s" % analysis)
    # [TP, FP, FN, TN]
    # print(len(res[0]), len(res[1]), res[2], res[3] , sum([len(res[0]), len(res[1]), res[2], res[3]]))
    # TP and FP are sequences of scores (they are concatenated below), whereas
    # FN and TN are counts: they are expanded into that many zero scores.
    TP = res[0]
    FP = res[1]
    FN = [0] * res[2]
    TN = [0] * res[3]
    # Labels follow the same order as the scores: TP, FN (positives, 1) and
    # FP, TN (negatives, 0).
    y_true = np.array([1] * len(TP) + [1] * len(FN) + [0] * len(FP) +
                      [0] * len(TN))
    y_scores = np.array(TP + FN + FP + TN)
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    pylab.plot(recall, precision, color=colors[i], label=analysis)