Ejemplo n.º 1
0
    def plot_bar_flags(self, logy=True, fontsize=16, filename=None):
        """Draw a bar chart of the summed flag counts of the BAM file.

        The table returned by ``get_flags_as_df`` is reduced with ``sum()``
        and the resulting totals are drawn as bars.

        :param logy: a log-scaled y-axis is requested only when this is
            exactly ``True``.
        :param int fontsize: font size of both axis labels.
        :param filename: if set, the figure is also saved to that file.
        :return: the object returned by the pandas ``plot`` call.

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam', "testing"))
            b.plot_bar_flags()

        .. seealso:: :class:`SAMFlags` for meaning of each flag
        """
        flag_totals = self.get_flags_as_df().sum()
        pylab.clf()

        # ``logy`` is forwarded only when it is literally True; in every
        # other case the pandas default is left untouched.
        plot_options = {"kind": "bar", "grid": True}
        if logy is True:
            plot_options["logy"] = logy
        barplot = flag_totals.plot(**plot_options)

        pylab.xlabel("flags", fontsize=fontsize)
        pylab.ylabel("count", fontsize=fontsize)
        pylab.tight_layout()
        if filename:
            pylab.savefig(filename)
        return barplot
Ejemplo n.º 2
0
    def plot_unknown_barcodes(self, N=20):
        """Horizontal bar plot of the most frequent unknown barcodes per lane.

        The ``UnknownBarcodes`` entries of ``self.data`` are turned into a
        dataframe with one column per lane; the *N* barcodes with the largest
        total count over all lanes are plotted.

        :param int N: maximum number of barcodes shown.
        :return: the plotted dataframe (columns renamed to ``"Lane <x>"``,
            rows ordered from smallest to largest total).
        """
        from matplotlib import rcParams

        ub = self.data['UnknownBarcodes']
        df = pd.DataFrame({x['Lane']: x['Barcodes'] for x in ub})

        # if data is made of undetermined only, the dataframe is just made of
        # N lanes with one entry : unknown. A zero-filled 'known' row is
        # added in that case.
        if "unknown" in df.index and len(df) == 1:
            df.loc['known'] = [0 for _ in df.columns]

        top = df.sum(axis=1).sort_values(ascending=False).index[0:N]
        data = df.loc[top][::-1]
        data.columns = ["Lane {}".format(x) for x in data.columns]

        # Temporarily draw the axis elements below the bars. The previous
        # setting is restored even if plotting fails, instead of being
        # overwritten with a hard-coded False.
        previous_axisbelow = rcParams['axes.axisbelow']
        rcParams['axes.axisbelow'] = True
        try:
            pylab.figure(figsize=(10, 8))
            ax = pylab.gca()
            data.plot(kind="barh", width=1, ec="k", ax=ax)
        finally:
            rcParams['axes.axisbelow'] = previous_axisbelow

        pylab.xlabel("Number of reads", fontsize=12)
        pylab.ylabel("")
        pylab.grid(True)
        pylab.legend(
            ["Lane {}".format(x) for x in range(1, len(df.columns) + 1)],
            loc="lower right")
        try:
            pylab.tight_layout()
        except Exception as err:
            # Layout is cosmetic: report the problem but keep the figure.
            print(err)
        return data
Ejemplo n.º 3
0
 def plot_bar(self, spikes_filename=None, ratio=100):
     """Bar plot of the spikes found, overlaid with the scaled SIRV lengths.

     :param spikes_filename: forwarded to ``spikes_found``.
     :param ratio: divisor applied to the lengths before they are drawn.
     :return: the data returned by ``spikes_found``.
     """
     found = self.spikes_found(spikes_filename)
     # One length per entry of the index, looked up in SIRV_lengths.
     sirv_lengths = [self.SIRV_lengths[name] for name in found.index]
     found.plot(kind="bar")
     scaled = np.array(sirv_lengths) / ratio
     pylab.plot(scaled)
     pylab.tight_layout()
     return found
Ejemplo n.º 4
0
 def plot_bar(self, spikes_filename=None, ratio=100):
     """Draw the found spikes as bars and the scaled SIRV lengths as a line.

     :param spikes_filename: passed through to ``spikes_found``.
     :param ratio: the lengths are divided by this value before plotting.
     :return: whatever ``spikes_found`` returned.
     """
     spikes = self.spikes_found(spikes_filename)
     per_spike_length = [self.SIRV_lengths[key] for key in spikes.index]
     spikes.plot(kind="bar")
     # Lengths are rescaled before being drawn on the same axes as the bars.
     rescaled = np.array(per_spike_length) / ratio
     pylab.plot(rescaled)
     pylab.tight_layout()
     return spikes
Ejemplo n.º 5
0
    def plot_polymerase_per_barcode(self, fontsize=12, unbarcoded=True):
        """Plot the number of polymerase reads per barcode, in rank order.

        The ``"Polymerase Reads"`` column of ``self.df_barcoded`` is sorted in
        decreasing order and plotted against its rank. The mean is drawn as a
        red horizontal line and the total is shown in the title.

        :param int fontsize: font size of the axis labels.
        :param unbarcoded: when exactly ``True``, the first
            ``"Polymerase Reads"`` value of ``self.df_not_barcoded`` is added
            as a dashed horizontal line (skipped if it cannot be read).
        """
        reads = self.df_barcoded["Polymerase Reads"]
        PR = reads.sum()
        data = reads.sort_values(ascending=False).values
        ranks = list(range(1, len(data) + 1))
        pylab.plot(ranks, data, label="barcodes")
        pylab.axhline(data.mean(), color="r", label="average")

        if unbarcoded is True:
            try:
                unbar = self.df_not_barcoded['Polymerase Reads'].iloc[0]
                pylab.axhline(unbar, color="k", ls="--", label="not barcoded")
            except Exception:
                # Best effort: the not-barcoded line is optional. Unlike a
                # bare ``except``, this lets KeyboardInterrupt/SystemExit
                # propagate.
                pass

        pylab.xlabel("Barcode Rank Order", fontsize=fontsize)
        pylab.ylabel("Counts of Reads", fontsize=fontsize)
        pylab.title("Total Polymerase count: {}".format(PR))
        pylab.legend()
        pylab.ylim(ymin=0)
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot.
            pass
Ejemplo n.º 6
0
    def plot_hist_normalized_coverage(self, filename=None, binwidth=0.1,
            max_z=4):
        """Histogram of the normalised coverage with the fitted gaussians.

        The ``"scale"`` column of ``self.df`` is restricted to
        ``self.range``, cleaned of NaN and of values further than four
        standard deviations from the mean, then handed to
        ``self.mixture_fitting`` for plotting.

        :param filename: if set, the figure is saved to that file.
        :param binwidth: bin width forwarded to ``self._set_bins``.
        :param max_z: upper limit of the x-axis (also passed as ``Xmax``).
        """
        pylab.clf()
        # if there are a NaN -> can't set up binning
        d = self.df["scale"][self.range[0]:self.range[1]].dropna()
        # remove outlier -> plot crash if range between min and max is too high
        d = d[np.abs(d - d.mean()) <= (4 * d.std())]
        bins = self._set_bins(d, binwidth)
        self.mixture_fitting.data = d
        try:
            self.mixture_fitting.plot(self.gaussians_params, bins=bins, Xmin=0,
                                      Xmax=max_z)
        except ZeroDivisionError:
            # The fit could not be drawn; the axes are still decorated below.
            pass
        pylab.grid(True)
        pylab.xlim([0, max_z])
        pylab.xlabel("Normalised per-base coverage")
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot. Unlike a
            # bare ``except``, KeyboardInterrupt/SystemExit still propagate.
            pass
        if filename:
            pylab.savefig(filename)
0
    def plot_gc_vs_coverage(self, filename=None, bins=None, Nlevels=6,
                            fontsize=20, norm="log", ymin=0, ymax=100,
                            contour=True, **kwargs):
        """2D histogram of the per-base coverage versus the GC content.

        The ``cov`` and ``gc`` columns of ``self.df`` are plotted with
        ``biokit.Hist2D`` (``gc`` is multiplied by 100 first). If the plot
        fails, it is retried once without contour lines.

        :param filename: if set, the figure is saved to that file.
        :param bins: ``[x_bins, y_bins]``; computed from the data and from
            ``self.bed.gc_window_size`` when ``None``.
        :param Nlevels: number of contour levels; ``None`` or ``0`` disables
            the contour.
        :param fontsize: forwarded to ``Hist2D.plot``.
        :param norm: forwarded to ``Hist2D.plot``.
        :param ymin: lower limit of the y-axis.
        :param ymax: upper limit of the y-axis.
        :param contour: draw contour lines on top of the histogram.
        :param kwargs: extra keyword arguments forwarded to ``Hist2D.plot``.
        """
        if Nlevels is None or Nlevels == 0:
            contour = False

        data = self.df[['cov', 'gc']].copy()
        data['gc'] *= 100
        data = data.dropna()
        if bins is None:
            bins = [100, min(int(data['gc'].max() - data['gc'].min() + 1),
                             max(5, self.bed.gc_window_size - 4))]
            bins[0] = max(10, min(bins[0], self.df['cov'].max()))

        from biokit import Hist2D
        h2 = Hist2D(data)

        # Arguments shared by the first attempt and by the fallback.
        common = dict(bins=bins, xlabel="Per-base coverage",
                      ylabel=r'GC content (%)', Nlevels=Nlevels, norm=norm,
                      fontsize=fontsize)
        try:
            h2.plot(contour=contour, **common, **kwargs)
        except Exception:
            # Retry without the contour. KeyboardInterrupt/SystemExit are no
            # longer intercepted here.
            h2.plot(contour=False, **common, **kwargs)
        pylab.ylim([ymin, ymax])
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot.
            pass
        if filename:
            pylab.savefig(filename)
Ejemplo n.º 8
0
    def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=[]):
        """Scatter plot of gene sets against their -log10 adjusted p-value.

        Each row of the dataframe returned by ``self._get_final_df`` is one
        marker: x is ``-log10`` of ``'Adjusted P-value'``, y is the row
        position, and both marker area and colour follow the ``size`` column.
        A dashed red line is drawn at x=1.3.

        :param enrich: object whose ``results`` attribute is passed to
            ``self._get_final_df``.
        :param cutoff: forwarded to ``self._get_final_df``.
        :param nmax: forwarded to ``self._get_final_df``.
        :param gene_set_size: not used by this method; kept so the signature
            stays unchanged (the mutable default is never modified).
        :return: the dataframe that was plotted.
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)
        positions = range(len(df))

        pylab.clf()
        pylab.scatter(-pylab.log10(df['Adjusted P-value']),
                      positions,
                      s=10 * df['size'],
                      c=df['size'])

        # The x-axis shows -log10(adjusted p-value), not an odds ratio.
        pylab.xlabel("Adjusted p-value (-log10)")
        pylab.ylabel("Gene sets")
        pylab.yticks(positions, df.name)
        _, right = pylab.xlim()
        pylab.xlim([0, right])
        pylab.grid(True)
        ax = pylab.gca()

        # Legend labels for three reference marker sizes.
        M = max(df['size'])
        if M > 100:
            l1, l2, l3 = "10", "100", str(M)
        else:
            l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

        handles = [
            pylab.Line2D([0], [0], marker="o", markersize=size, label=label,
                         ls="")
            for size, label in ((5, l1), (10, l2), (15, l3))
        ]
        ax.legend(handles=handles, loc="upper left", title="gene-set size")

        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.tight_layout()
        pylab.colorbar(pylab.gci())
        return df
Ejemplo n.º 9
0
    def plot_bar_flags(self, logy=True, fontsize=16, filename=None):
        """Bar chart of the total count of each flag found in the BAM.

        Totals are obtained by calling ``sum()`` on the result of
        ``get_flags_as_df``.

        :param logy: when exactly ``True`` the y-axis is log-scaled.
        :param int fontsize: font size used for the x and y labels.
        :param filename: optional output file for the figure.
        :return: the value returned by the pandas ``plot`` call.

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam', "testing"))
            b.plot_bar_flags()

        .. seealso:: :class:`SAMFlags` for meaning of each flag
        """
        totals = self.get_flags_as_df().sum()
        pylab.clf()

        # Keyword arguments are assembled first; ``logy`` is added only for
        # the literal value True.
        options = dict(kind='bar', grid=True)
        if logy is True:
            options['logy'] = logy
        barplot = totals.plot(**options)

        pylab.xlabel("flags", fontsize=fontsize)
        pylab.ylabel("count", fontsize=fontsize)
        pylab.tight_layout()
        if filename:
            pylab.savefig(filename)
        return barplot
Ejemplo n.º 10
0
    def plot_bar_mapq(self, fontsize=16, filename=None):
        """Plots bar plots of the MAPQ (quality) of alignments

        The dataframe returned by ``get_mapq_as_df`` is drawn as a histogram
        with unit-width bins from 0 to its maximum value and a log-scaled
        y-axis.

        :param int fontsize: font size of the axis labels.
        :param filename: if set, the figure is saved to that file.

            .. plot::
                :include-source:

                from sequana import BAM, sequana_data
                b = BAM(sequana_data('test.bam', "testing"))
                b.plot_bar_mapq()

        """
        df = self.get_mapq_as_df()
        upper = df.max().values[0] + 1
        df.plot(kind='hist',
                bins=range(0, upper),
                legend=False,
                grid=True,
                logy=True)
        pylab.xlabel("MAPQ", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
        try:
            # This may raise issue on MAC platforms
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; unlike a bare ``except`` this does not
            # swallow KeyboardInterrupt/SystemExit.
            pass
        if filename:
            pylab.savefig(filename)
Ejemplo n.º 11
0
 def plot_subreads_histogram(self, bins=10, fontsize=12):
     """Histogram of the ``'Subreads'`` column of ``self.df_barcoded``.

     :param bins: binning forwarded to the pandas ``hist`` call.
     :param int fontsize: font size of the axis labels.
     """
     self.df_barcoded['Subreads'].hist(bins=bins, ec="k", rwidth=0.8)
     pylab.xlabel("Number of subreads", fontsize=fontsize)
     pylab.ylabel("Number of Barcoded Samples", fontsize=fontsize)
     try:
         pylab.tight_layout()
     except Exception:
         # Layout is cosmetic; a failure must not abort the plot, but
         # KeyboardInterrupt/SystemExit must still propagate.
         pass
Ejemplo n.º 12
0
 def plot_padj_hist(self, bins=60, fontsize=16):
     """Histogram of the non-NaN ``padj`` values of ``self.df``.

     :param bins: binning forwarded to ``pylab.hist``.
     :param int fontsize: font size of the axis labels.
     """
     pylab.hist(self.df.padj.dropna(), bins=bins, ec="k")
     pylab.grid(True)
     pylab.xlabel("Adjusted p-value", fontsize=fontsize)
     pylab.ylabel("Occurences", fontsize=fontsize)
     try:
         pylab.tight_layout()
     except Exception:
         # Layout is cosmetic; a failure must not abort the plot, but
         # KeyboardInterrupt/SystemExit must still propagate.
         pass
Ejemplo n.º 13
0
 def plot_pvalue_hist(self, bins=60, fontsize=16, rotation=0):
     """Histogram of the non-NaN ``pvalue`` values of ``self.df``.

     :param bins: binning forwarded to ``pylab.hist``.
     :param int fontsize: font size of the axis labels.
     :param rotation: currently not used by this method.
     """
     pylab.hist(self.df.pvalue.dropna(), bins=bins, ec="k")
     pylab.grid(True)
     pylab.xlabel("raw p-value", fontsize=fontsize)
     pylab.ylabel("Occurences", fontsize=fontsize)
     try:
         pylab.tight_layout()
     except Exception:
         # Layout is cosmetic; a failure must not abort the plot, but
         # KeyboardInterrupt/SystemExit must still propagate.
         pass
Ejemplo n.º 14
0
    def barplot_per_sample(self, alpha=0.5, width=0.8, filename=None):
        """Horizontal bar plot of the read count of each sample.

        The table returned by ``get_data_reads`` is split into the rows named
        ``'Undetermined'`` and all the others; counts are summed per name.
        Samples and the undetermined reads are drawn on the same axes, the
        latter in red. Counts are shown in millions when the total of the
        samples exceeds 1e6.

        :param alpha: transparency of the bars.
        :param width: width of the bars.
        :param filename: if set, the figure is saved there with ``dpi=200``.
        """
        df = self.get_data_reads()

        # this is ugly but will do the job for now
        under = df.query("name=='Undetermined'")
        others = df.query("name!='Undetermined'")

        under = under.groupby("name").sum().reset_index()
        others = others.groupby("name").sum().reset_index()

        under = under[["name", "count"]].set_index("name")
        others = others[["name", "count"]].set_index("name")

        all_data = others.sort_index(ascending=False)
        all_data.columns = ["samples"]

        # appended at the end: zero-height placeholder row so that the
        # undetermined bar gets its own slot among the samples
        all_data.loc['undetermined'] = 0

        # revert back
        all_data = all_data.loc[::-1]

        # just for legend
        under.columns = ['undetermined']
        in_millions = all_data.sum().min() > 1e6
        if in_millions:
            all_data /= 1e6
            under /= 1e6

        all_data.plot(kind="barh", alpha=alpha, zorder=1, width=width, ec='k')

        under.plot(kind="barh",
                   alpha=alpha,
                   color="red",
                   ax=pylab.gca(),
                   zorder=1,
                   width=width,
                   ec='k')
        pylab.ylim([-0.5, len(all_data) + 0.5])
        if len(all_data) < 100:
            pylab.yticks(range(len(all_data)), all_data.index)

        pylab.legend()
        pylab.grid(True, zorder=-1)
        if in_millions:
            pylab.xlabel("Number of reads (M)")
        else:
            pylab.xlabel("Number of reads")
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot, but
            # KeyboardInterrupt/SystemExit must still propagate.
            pass
        if filename:
            pylab.savefig(filename, dpi=200)
Ejemplo n.º 15
0
 def hist_quality_per_barcode(self, bins=10, fontsize=12):
     """Histogram of ``'Mean Barcode Quality'`` from ``self.df_barcoded``.

     :param bins: binning forwarded to the pandas ``hist`` call.
     :param int fontsize: font size of the axis labels.
     """
     self.df_barcoded['Mean Barcode Quality'].hist(bins=bins,
                                                   ec="k",
                                                   rwidth=0.8)
     pylab.xlabel("Mean Barcode Quality", fontsize=fontsize)
     pylab.ylabel("Number of Barcoded Samples", fontsize=fontsize)
     try:
         pylab.tight_layout()
     except Exception:
         # Layout is cosmetic; a failure must not abort the plot, but
         # KeyboardInterrupt/SystemExit must still propagate.
         pass
Ejemplo n.º 16
0
 def hist_mean_polymerase_read_length(self, bins=10, fontsize=12):
     """Histogram of ``'Mean Read Length'`` from ``self.df_barcoded``.

     :param bins: binning forwarded to the pandas ``hist`` call.
     :param int fontsize: font size of the axis labels.
     """
     self.df_barcoded['Mean Read Length'].hist(bins=bins,
                                               ec="k",
                                               rwidth=0.8)
     pylab.xlabel("Mean Polymerase Read Length", fontsize=fontsize)
     pylab.ylabel("Number of Barcoded Samples", fontsize=fontsize)
     try:
         pylab.tight_layout()
     except Exception:
         # Layout is cosmetic; a failure must not abort the plot, but
         # KeyboardInterrupt/SystemExit must still propagate.
         pass
Ejemplo n.º 17
0
    def hist_GC(self,
                bins=50,
                alpha=0.5,
                hold=False,
                fontsize=12,
                grid=True,
                xlabel="GC %",
                ylabel="#",
                label="",
                title=None):
        """Plot histogram GC content

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold: the current figure is cleared first only when this
            is exactly ``False``
        :param int fontsize: fontsize of the x and y labels and title.
        :param bool grid: add grid or not
        :param str xlabel: label of the x-axis
        :param str ylabel: label of the y-axis
        :param str label: label of the histogram (for the legend); the mean
            GC and ``len(self)`` are appended to it
        :param str title: title of the plot; a default title showing the mean
            GC is used when ``None``

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_GC()

        """
        gc_content = self.df.loc[:, 'GC_content']
        mean_GC = np.mean(gc_content)

        # set title if needed
        if title is None:
            title = "GC %%  \n Mean GC : %.2f" % (mean_GC)

        # histogram GC percent
        if hold is False:
            pylab.clf()
        pylab.hist(gc_content,
                   bins=bins,
                   alpha=alpha,
                   label=label + ", mean : " + str(round(mean_GC, 2)) +
                   ", N : " + str(len(self)))
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
        pylab.xlim([0, 100])
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot, but
            # KeyboardInterrupt/SystemExit must still propagate.
            pass
Ejemplo n.º 18
0
 def plot_piechart(self, df):
     """Pie chart of the rows of *df* whose ``number_in_list`` is non-zero.

     Labels longer than 50 characters are cut to their first 50 characters
     followed by ``"..."``; shorter labels are used unchanged.

     :param df: dataframe with ``number_in_list`` and ``label`` columns.
     """
     # Here we show the GO terms that have number in list > 0
     # Note that it is dangerous to look only at this picture without the
     # reference plot, whose data is not available through the API.
     max_length = 50
     selected = df.query("number_in_list!=0")
     # The original test was inverted: it left long labels untouched and
     # appended "..." to the short ones.
     labels = [
         label if len(label) <= max_length else label[0:max_length] + "..."
         for label in selected.label.values
     ]
     pylab.pie(selected.number_in_list, labels=labels)
     pylab.tight_layout()
Ejemplo n.º 19
0
    def plot_feature_most_present(self):
        """Plot, per sample, the share of counts taken by its top feature.

        For every column of ``self.counts_raw`` the row with the largest
        count is located; its count is expressed as a percentage of the
        column total and drawn as a horizontal bar coloured with
        ``group_color`` from ``self.design_df``. The identifier of the top
        row is written on each bar.
        """
        # Computed once instead of once per sample.
        stacked = self.counts_raw.stack()
        totals = self.counts_raw.sum()

        rows = []
        # ``Series.items`` replaces ``iteritems``, removed in pandas 2.0.
        for sample, gene in self.counts_raw.idxmax().items():
            most_exp_gene_count = stacked.loc[gene, sample]
            total_sample_count = totals.loc[sample]
            rows.append({
                "label": sample,
                "gene_id": gene,
                "count": most_exp_gene_count,
                "total_sample_count": total_sample_count,
                "most_exp_percent":
                most_exp_gene_count / total_sample_count * 100,
            })

        df = pd.DataFrame(rows).set_index("label")
        df = pd.concat([self.design_df, df], axis=1)

        pylab.clf()
        bars = pylab.barh(
            df.index,
            df.most_exp_percent,
            color=df.group_color,
            zorder=10,
            lw=1,
            ec="k",
            height=0.9,
        )

        # One text per bar, at a fixed x position, above the bars (zorder).
        for idx, _ in enumerate(bars):
            pylab.text(
                2,
                idx,
                df.gene_id.iloc[idx],
                ha="center",
                va="center",
                rotation=0,
                zorder=20,
            )

        self._format_plot(xlabel="Percent of total reads")
        pylab.tight_layout()
Ejemplo n.º 20
0
    def barplot(self, enrich, cutoff=0.05, nmax=10):
        """Horizontal bar plot of -log10 adjusted p-values per gene set.

        :param enrich: object whose ``results`` attribute is passed to
            ``self._get_final_df``.
        :param cutoff: forwarded to ``self._get_final_df``.
        :param nmax: forwarded to ``self._get_final_df``.
        :return: the dataframe that was plotted.
        """
        table = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)
        rows = range(len(table))

        pylab.clf()
        bar_lengths = -pylab.log10(table['Adjusted P-value'])
        pylab.barh(rows, bar_lengths)
        pylab.yticks(rows, table.name)
        # Dashed red reference line at x=1.3.
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.grid(True)
        pylab.xlabel("Adjusted p-value (log10)")
        pylab.ylabel("Gene sets")
        # Anchor the x-axis at zero while keeping the automatic upper bound.
        upper = pylab.xlim()[1]
        pylab.xlim([0, upper])
        pylab.tight_layout()
        return table
Ejemplo n.º 21
0
    def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
        """Count reads per reference-name prefix.

        Rows of ``self.df`` whose ``reference_name`` is ``-1`` or ``'-1'``
        are discarded. The first *shift* characters of ``reference_name``
        define the group, and rows with ``mapq`` strictly above *mapq_min*
        are counted per group.

        :param title: title of the bar plot (only used when *plot* is set).
        :param int shift: number of leading characters defining a group.
        :param bool plot: draw the counts as a bar plot when true.
        :param mapq_min: exclusive lower bound on ``mapq``.
        :return: tuple ``(counts, self.df)``; *counts* is an empty Series
            when no row is left after discarding the ``-1`` references.
        """
        kept = self.df.query("reference_name not in [-1, '-1']").copy()
        if not len(kept):
            return pd.Series(), self.df

        def prefix(name):
            return name[0:shift]

        kept['group'] = kept.reference_name.apply(prefix)
        # ``@mapq_min`` is resolved by pandas from this function's locals.
        above_threshold = kept.query("mapq>@mapq_min")
        mapped = above_threshold.groupby("group").count()["mapq"]
        mapped.name = None

        if plot:
            mapped.plot(kind="bar")
            pylab.title(title)
            pylab.tight_layout()
        return mapped, self.df
Ejemplo n.º 22
0
    def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
        """Group reads by reference-name prefix and count them.

        References equal to ``-1`` or ``'-1'`` are dropped from ``self.df``
        first. Groups are the first *shift* characters of
        ``reference_name``; only rows with ``mapq`` greater than *mapq_min*
        are counted.

        :param title: plot title, used only if *plot* is true.
        :param int shift: length of the prefix used as group name.
        :param bool plot: whether to draw a bar plot of the counts.
        :param mapq_min: rows must have ``mapq`` strictly above this value.
        :return: ``(counts, self.df)``; an empty Series replaces *counts* if
            every row was dropped in the first step.
        """
        selection = self.df.query("reference_name not in [-1, '-1']").copy()
        if len(selection) < 1:
            return pd.Series(), self.df

        def leading_chars(reference):
            return reference[0:shift]

        selection['group'] = selection.reference_name.apply(leading_chars)
        # pandas looks up ``@mapq_min`` among the local names of this frame.
        filtered = selection.query("mapq>@mapq_min")
        mapped = filtered.groupby("group").count()["mapq"]
        mapped.name = None

        if plot:
            mapped.plot(kind="bar")
            pylab.title(title)
            pylab.tight_layout()
        return mapped, self.df
Ejemplo n.º 23
0
    def hist_polymerase_per_barcode(self, bins=10, fontsize=12):
        """Histogram of the number of polymerase reads per barcode.

        The sum of the ``"Polymerase Reads"`` column of ``self.df_barcoded``
        is shown in the title.

        :param bins: binning forwarded to the pandas ``hist`` call.
        :param int fontsize: font size of the axis labels.
        """
        reads = self.df_barcoded["Polymerase Reads"]
        PR = reads.sum()
        reads.hist(bins=bins, ec="k", rwidth=0.8)
        pylab.title("Total Polymerase count: {}".format(PR))
        pylab.xlabel("Number of Polymerase Reads", fontsize=fontsize)
        pylab.ylabel("Number of Barcoded Samples", fontsize=fontsize)
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot, but
            # KeyboardInterrupt/SystemExit must still propagate.
            pass
Ejemplo n.º 24
0
    def plot_hist_zscore(self, fontsize=16, filename=None, max_z=6,
            binwidth=0.5, **hist_kargs):
        """Histogram of the z-score values within ``self.range``.

        :param int fontsize: font size of the x label.
        :param filename: if set, the figure is saved to that file.
        :param max_z: currently not used by this method.
        :param binwidth: bin width forwarded to ``self._set_bins``.
        :param hist_kargs: extra keyword arguments forwarded to the pandas
            ``hist`` call.
        """
        pylab.clf()
        # The slice is taken once and reused for the binning and the plot.
        zscores = self.df["zscore"][self.range[0]:self.range[1]]
        bins = self._set_bins(zscores, binwidth)
        zscores.hist(grid=True, bins=bins, **hist_kargs)
        pylab.xlabel("Z-Score", fontsize=fontsize)
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot, but
            # KeyboardInterrupt/SystemExit must still propagate.
            pass
        if filename:
            pylab.savefig(filename)
Ejemplo n.º 25
0
    def hist_transcript(self, hide_unmapped=True):
        """Bar plot of the number of reads per reference name.

        Rows of ``self.df`` with a positive ``reference_length`` are grouped
        by ``reference_name`` and counted.

        :param hide_unmapped: when exactly ``True``, rows whose
            ``reference_name`` equals ``-1`` are also excluded.
        :return: the per-reference counts; returned without plotting when
            empty.
        """
        pylab.clf()

        if hide_unmapped is True:
            query = "reference_length>0 and reference_name!=-1"
        else:
            query = "reference_length>0"

        print(query)
        grouped = self.df.query(query).groupby("reference_name")
        ts = grouped.count().reference_length
        if len(ts) == 0:
            print("nothing to plot")
            return ts

        ts.plot(kind="bar", color="r")
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot, but
            # KeyboardInterrupt/SystemExit must still propagate.
            pass
        return ts
Ejemplo n.º 26
0
    def plot(self, interpolation='None', aspect='auto', cmap='hot', tight_layout=True,
        colorbar=True, fontsize_x=None, fontsize_y=None, rotation_x=90,
        xticks_on=True, yticks_on=True, **kargs):
        """wrapper around imshow to plot a dataframe

        :param interpolation: forwarded to ``imshow`` (defaults to the
            string ``'None'``)
        :param aspect: forwarded to ``imshow`` (defaults to ``'auto'``)
        :param cmap: colormap to be used.
        :param tight_layout: call ``pylab.tight_layout`` at the end
        :param colorbar: add a colorbar (default to True)
        :param fontsize_x: fontsize on xlabels (16 when ``None``)
        :param fontsize_y: fontsize on ylabels (16 when ``None``)
        :param rotation_x: rotate labels on xaxis
        :param xticks_on: switch off the xticks and labels
        :param yticks_on: switch off the yticks and labels
        :param kargs: extra keyword arguments forwarded to ``imshow``

        """

        data = self.df
        pylab.clf()
        pylab.imshow(data, interpolation=interpolation, aspect=aspect, cmap=cmap, **kargs)

        # Identity test instead of ``== None``.
        if fontsize_x is None:
            fontsize_x = 16 #FIXME use default values
        if fontsize_y is None:
            fontsize_y = 16 #FIXME use default values

        # One tick per row/column, labelled with the index/column names.
        if yticks_on is True:
            pylab.yticks(range(0, len(data.index)), data.index,
                fontsize=fontsize_y)
        else:
            pylab.yticks([])
        if xticks_on is True:
            pylab.xticks(range(0, len(data.columns)), data.columns,
                fontsize=fontsize_x, rotation=rotation_x)
        else:
            pylab.xticks([])

        if colorbar is True:
            pylab.colorbar()

        if tight_layout:
            pylab.tight_layout()
Ejemplo n.º 27
0
    def plot_boxplot_normeddata(self,
                                fliersize=2,
                                linewidth=2,
                                rotation=0,
                                **kwargs):
        """Box plot of ``self.counts_norm`` on a log-scaled y-axis.

        Counts are clipped to a minimum of 1 before plotting and boxes are
        coloured with ``group_color`` from ``self.design_df``.

        :param fliersize: forwarded to ``seaborn.boxplot``.
        :param linewidth: forwarded to ``seaborn.boxplot``.
        :param rotation: rotation of the x tick labels.
        :param kwargs: extra keyword arguments for ``seaborn.boxplot``.
        """
        import seaborn as sbn

        clipped_counts = self.counts_norm.clip(1)
        colors = self.design_df.group_color
        ax = sbn.boxplot(
            data=clipped_counts,
            linewidth=linewidth,
            fliersize=fliersize,
            palette=colors,
            **kwargs,
        )
        # Re-apply the current ticks and labels with the requested rotation.
        tick_positions, tick_labels = pylab.xticks()
        pylab.xticks(tick_positions, tick_labels, rotation=rotation)
        ax.set_yscale("log")
        self._format_plot(ylabel="Normalised count distribution")
        pylab.tight_layout()
Ejemplo n.º 28
0
    def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="GC %", ylabel="#", label="",title=None):
        """Plot histogram GC content

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold: the current figure is cleared first only when this
            is exactly ``False``
        :param int fontsize: fontsize of the x and y labels and title.
        :param bool grid: add grid or not
        :param str xlabel: label of the x-axis
        :param str ylabel: label of the y-axis
        :param str label: label of the histogram (for the legend); the mean
            GC and ``len(self)`` are appended to it
        :param str title: title of the plot; a default title showing the mean
            GC is used when ``None``

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_GC()

        """
        gc_content = self.df.loc[:, 'GC_content']
        mean_GC = np.mean(gc_content)

        # set title if needed
        if title is None:
            title = "GC %%  \n Mean GC : %.2f" % (mean_GC)

        # histogram GC percent
        if hold is False:
            pylab.clf()
        legend_label = (label + ", mean : " + str(round(mean_GC, 2))
                        + ", N : " + str(len(self)))
        pylab.hist(gc_content, bins=bins, alpha=alpha, label=legend_label)
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
        pylab.xlim([0, 100])
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot, but
            # KeyboardInterrupt/SystemExit must still propagate.
            pass
Ejemplo n.º 29
0
    def plot_bar_mapq(self, fontsize=16, filename=None):
        """Histogram of the MAPQ (mapping quality) of the alignments.

        The dataframe returned by ``get_mapq_as_df`` is drawn with
        unit-width bins from 0 to its maximum value and a log-scaled y-axis.

        :param int fontsize: font size of the axis labels.
        :param filename: if set, the figure is saved to that file.

            .. plot::
                :include-source:

                from sequana import BAM, sequana_data
                b = BAM(sequana_data('test.bam', "testing"))
                b.plot_bar_mapq()

        """
        mapq = self.get_mapq_as_df()
        # Bin edges 0, 1, ..., max(MAPQ).
        edges = range(0, mapq.max().values[0] + 1)
        hist_options = dict(kind='hist', bins=edges, legend=False,
                            grid=True, logy=True)
        mapq.plot(**hist_options)
        pylab.xlabel("MAPQ", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
        pylab.tight_layout()
        if filename:
            pylab.savefig(filename)
Ejemplo n.º 30
0
    def run_enrichment_kegg(self,
                            organism,
                            annot_col="Name",
                            out_dir="enrichment"):  # pragma: no cover
        """Run the KEGG pathway enrichment for every comparison.

        For each entry of ``self.comparisons`` a ``KeggPathwayEnrichment`` is
        computed; for the ``up``, ``down`` and ``all`` directions the result
        table is stored and a scatter plot is saved as PDF and PNG under
        ``<out_dir>/figures``. All tables are concatenated into
        ``self.enrichment_kegg`` and exported to
        ``<out_dir>/enrichment_kegg.xlsx``.

        :param organism: forwarded to ``KeggPathwayEnrichment``.
        :param annot_col: forwarded to ``self.get_gene_lists``.
        :param out_dir: output directory (created if needed).
        """
        out_dir = Path(out_dir) / "figures"
        out_dir.mkdir(exist_ok=True, parents=True)

        gene_lists_dict = self.get_gene_lists(annot_col=annot_col, dropna=True)
        enrichment = {}

        for compa in self.comparisons:
            gene_lists = gene_lists_dict[compa]
            ke = KeggPathwayEnrichment(gene_lists, organism, progress=False)
            ke.compute_enrichment()

            for direction in ["up", "down", "all"]:
                enrichment[(compa, direction)] = ke._get_final_df(
                    ke.enrichment[direction].results, nmax=10000)
                pylab.figure()
                ke.scatterplot(direction)
                pylab.tight_layout()
                pylab.savefig(out_dir / f"kegg_{compa}_{direction}.pdf")
                pylab.savefig(out_dir / f"kegg_{compa}_{direction}.png")

            logger.info(f"KEGG enrichment for {compa} DONE.")

        df = pd.concat(enrichment).sort_index()
        df.index.rename(["comparison", "direction", "index"], inplace=True)

        self.enrichment_kegg = df

        # Export results (should be moved to enrichment.py at some point I think)
        with pd.ExcelWriter(out_dir.parent / "enrichment_kegg.xlsx") as writer:
            df = self.enrichment_kegg.copy()
            df.reset_index(inplace=True)
            # ``sheet_name`` is passed by keyword: the positional form is
            # deprecated in recent pandas releases.
            df.to_excel(writer, sheet_name="kegg", index=False)
            ws = writer.sheets["kegg"]
            try:
                ws.autofilter(0, 0, df.shape[0], df.shape[1] - 1)
            except Exception as err:
                # The autofilter is optional decoration; report why it could
                # not be set instead of swallowing every exception silently.
                logger.warning(
                    f"Could not set autofilter on the kegg sheet: {err}")
Ejemplo n.º 31
0
    def plot_boxplot_rawdata(self,
                             fliersize=2,
                             linewidth=2,
                             rotation=0,
                             **kwargs):
        """Box plot of ``self.counts_raw`` on a log-scaled y-axis.

        Counts are clipped to a minimum of 1 before plotting and boxes are
        coloured with ``group_color`` from ``self.design_df``.

        :param fliersize: forwarded to ``seaborn.boxplot``.
        :param linewidth: forwarded to ``seaborn.boxplot``.
        :param rotation: rotation of the x tick labels.
        :param kwargs: extra keyword arguments for ``seaborn.boxplot``.
        """
        import seaborn as sbn

        clipped_counts = self.counts_raw.clip(1)
        colors = self.design_df.group_color
        ax = sbn.boxplot(
            data=clipped_counts,
            linewidth=linewidth,
            fliersize=fliersize,
            palette=colors,
            **kwargs,
        )
        # Re-apply the current ticks and labels with the requested rotation.
        tick_positions, tick_labels = pylab.xticks()
        pylab.xticks(tick_positions, tick_labels, rotation=rotation)
        ax.set_ylabel("Counts (raw) in log10 scale")
        ax.set_yscale("log")
        self._format_plot(ylabel="Raw count distribution")
        pylab.tight_layout()
Ejemplo n.º 32
0
    def plot_contig_length_vs_nreads(self, fontsize=16):
        """Log-log plot of contig length versus number of reads.

        A straight line fitted by least squares on the contigs with more
        than 10 reads and a length above 100000 is drawn in red between the
        smallest and the largest contig length.

        :param int fontsize: font size of the axis labels.
        """
        # same as plot_scatter_contig_length_nread_cov
        if self._df is None:
            _ = self.get_df()
        pylab.clf()
        df = self._df

        m1 = df.length.min()
        M1 = df.length.max()
        pylab.loglog(df.length, df.nread, "o")
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("Contig N reads", fontsize=fontsize)
        pylab.grid()

        # Least-squares fit nread = m * length + c on the selected contigs.
        selected = df.query("nread>10 and length>100000")
        X = selected['length']
        Y = selected['nread']
        A = np.vstack([X, np.ones(len(X))]).T
        # ``.values`` replaces ``as_matrix()``, removed in pandas 1.0.
        m, c = np.linalg.lstsq(A, Y.values)[0]
        x = np.array([m1, M1])
        pylab.plot(x, m * x + c, "o-r")
        pylab.tight_layout()
Ejemplo n.º 33
0
    def hist_transcript(self, hide_unmapped=True):
        """Bar plot of the number of reads per reference name.

        Rows of ``self.df`` with a positive ``reference_length`` are grouped
        by ``reference_name`` and counted.

        :param hide_unmapped: when exactly ``True``, rows whose
            ``reference_name`` equals ``-1`` are also excluded.
        :return: the per-reference counts; returned without plotting when
            empty.
        """
        pylab.clf()

        if hide_unmapped is True:
            query = "reference_length>0 and reference_name!=-1"
        else:
            query = "reference_length>0"

        print(query)
        ts = self.df.query(query).groupby(
            "reference_name").count().reference_length
        if len(ts) == 0:
            print("nothing to plot")
            return ts

        ts.plot(kind="bar", color="r")
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot, but
            # KeyboardInterrupt/SystemExit must still propagate.
            pass
        return ts
Ejemplo n.º 34
0
    def plot_bar_grouped(self, normalise=False, ncol=2, N=None):
        """Grouped bar plot of ``self.sirv``, one series per label.

        :param normalise: when true, each series is divided by its entry of
            ``N / max(N)``.
        :param ncol: columns in the legend
        :param N: one value per series; defaults to the length of each
            element of ``self.rawdata``.
        :return: the plotted dataframe.
        """
        if N is None:
            sizes = np.array([len(item) for item in self.rawdata])
        else:
            sizes = np.array(N)

        table = pd.DataFrame(self.sirv).T
        if normalise:
            # Scale relative to the largest entry of ``sizes``.
            relative = sizes / max(sizes)
            table = table / relative
        table.columns = self.labels

        table.plot(kind="bar")
        pylab.xlabel("")
        pylab.legend(self.labels, ncol=ncol)
        pylab.tight_layout()
        return table
Ejemplo n.º 35
0
    def plot(self, bins=80, rwidth=0.8, **kwargs):
        """Histogram of ``self.data``.

        Axis labels, font size, grid and title come from the ``xlabel``,
        ``ylabel``, ``fontsize``, ``grid`` and ``title`` attributes.

        :param bins: binning forwarded to ``pylab.hist``.
        :param rwidth: relative bar width forwarded to ``pylab.hist``.
        :param kwargs: extra keyword arguments forwarded to ``pylab.hist``.
        """
        pylab.clf()
        pylab.hist(self.data, bins=bins, rwidth=rwidth, **kwargs)

        pylab.xlabel(self.xlabel, fontsize=self.fontsize)
        pylab.ylabel(self.ylabel, fontsize=self.fontsize)

        # NOTE: a disabled twin-axis CDF overlay used to be kept here inside
        # a string literal; it was dead code and has been removed.
        pylab.grid(self.grid)
        pylab.title(self.title)
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot, but
            # KeyboardInterrupt/SystemExit must still propagate.
            pass
Ejemplo n.º 36
0
    def plot_bar_grouped(self, normalise=False, ncol=2, N=None):
        """Draw ``self.sirv`` as grouped bars, one series per label.

        :param normalise: divide each series by its entry of ``N / max(N)``
            when true.
        :param ncol: columns in the legend
        :param N: one value per series; when ``None`` the length of each
            element of ``self.rawdata`` is used.
        :return: the dataframe that was plotted.
        """
        weights = (np.array(N) if N is not None
                   else np.array([len(entry) for entry in self.rawdata]))

        frame = pd.DataFrame(self.sirv).T
        if normalise:
            # Each series is scaled relative to the largest weight.
            frame = frame / (weights / max(weights))
        frame.columns = self.labels

        frame.plot(kind="bar")
        pylab.xlabel("")
        pylab.legend(self.labels, ncol=ncol)
        pylab.tight_layout()
        return frame
Ejemplo n.º 37
0
    def plot_scatter_contig_length_nread_cov(self,
                                             fontsize=16,
                                             vmin=0,
                                             vmax=50,
                                             min_nreads=20,
                                             min_length=50000):
        """Log-log scatter of contig length versus reads, coloured by covStat.

        A straight line fitted by least squares on the contigs with more
        than *min_nreads* reads and a length above *min_length* is drawn in
        red between the smallest and the largest contig length.

        :param int fontsize: font size of the axis labels.
        :param vmin: lower bound of the colour scale.
        :param vmax: upper bound of the colour scale.
        :param min_nreads: exclusive lower bound on ``nread`` for the fit.
        :param min_length: exclusive lower bound on ``length`` for the fit.
        """
        if self._df is None:
            _ = self.get_df()
        pylab.clf()
        df = self._df

        m1 = df.length.min()
        M1 = df.length.max()

        # least square, on the selected contigs only (``@name`` is resolved
        # by pandas from this function's locals)
        selected = df.query("nread>@min_nreads and length>@min_length")
        fit_x = selected['length']
        fit_y = selected['nread']
        A = np.vstack([fit_x, np.ones(len(fit_x))]).T
        # ``.values`` replaces ``as_matrix()``, removed in pandas 1.0.
        m, c = np.linalg.lstsq(A, fit_y.values)[0]
        x = np.array([m1, M1])

        # The scatter itself uses every contig.
        pylab.scatter(df['length'], df['nread'], c=df['covStat'],
                      vmin=vmin, vmax=vmax)
        pylab.colorbar()
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("Contig reads", fontsize=fontsize)
        pylab.title("coverage function of contig length and reads used")
        pylab.grid()
        pylab.plot(x, m * x + c, "o-r")
        pylab.loglog()
        pylab.tight_layout()
Ejemplo n.º 38
0
    def plot_count_per_sample(self, fontsize=12, rotation=45):
        """Number of mapped and annotated reads (i.e. counts) per sample.

        The column sums of ``self.counts_raw`` are drawn in millions, one
        bar per sample, coloured with ``group_color`` from
        ``self.design_df``.

        :param int fontsize: font size of the labels and of the title.
        :param rotation: rotation of the x tick labels.

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_count_per_sample()

        """
        pylab.clf()
        df = self.counts_raw.sum().rename("total_counts")
        df = pd.concat([self.design_df, df], axis=1)

        pylab.bar(
            df.index,
            df.total_counts / 1000000,
            color=df.group_color,
            lw=1,
            zorder=10,
            ec="k",
            width=0.9,
        )

        pylab.xlabel("Samples", fontsize=fontsize)
        pylab.ylabel("reads (M)", fontsize=fontsize)
        pylab.grid(True, zorder=0)
        pylab.title("Total read count per sample", fontsize=fontsize)
        pylab.xticks(rotation=rotation, ha="right")
        try:
            pylab.tight_layout()
        except Exception:
            # Layout is cosmetic; a failure must not abort the plot, but
            # KeyboardInterrupt/SystemExit must still propagate.
            pass