Exemple #1
0
    def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
        """

        mode can be all, lq, hq
        """
        pylab.clf()

        L1 = [len(read['sequence']) for read in self.lq_sequence]
        L2 = [len(read['sequence']) for read in self.hq_sequence]
        if mode == "all":
            L = L1 + L2
        elif mode == "lq":
            L = L1
        else:
            L = L2
 
        Y, X, _ = pylab.hist(L, bins=bins, rwidth=rwidth, align=align,
                    ec=edgecolor, **kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        ax_twin = pylab.gca().twinx()

        shift = (X[1] - X[0]) / 2

        ax_twin.plot(X[0:-1]- shift, len(L) - pylab.cumsum(Y), "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)

        pylab.title("Read length of Consensus isoforms reads")
Exemple #2
0
    def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
        """

        mode can be all, lq, hq
        """
        pylab.clf()

        L1 = [len(read['sequence']) for read in self.lq_sequence]
        L2 = [len(read['sequence']) for read in self.hq_sequence]
        if mode == "all":
            L = L1 + L2
        elif mode == "lq":
            L = L1
        else:
            L = L2
 
        Y, X, _ = pylab.hist(L, bins=bins, rwidth=rwidth, align=align,
                    ec=edgecolor, **kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        ax_twin = pylab.gca().twinx()

        shift = (X[1] - X[0]) / 2

        ax_twin.plot(X[0:-1]- shift, len(L) - pylab.cumsum(Y), "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)

        pylab.title("Read length of Consensus isoforms reads")
Exemple #3
0
    def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="SNR", ylabel="#",title="", clip_upper_SNR=30):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram. Note that the range starts
            at 0 and ends at clip_upper_SNR
        :param float alpha: transparency of the histograms
        :param bool hold:
        :param int fontsize:
        :param bool grid:
        :param str xlabel:
        :param str ylabel:
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored
        if len(self._df['snr_A'].dropna()) == 0:
            # nothing to plot
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        maxSNR = 0
        for letter in "ACGT":
            m = self._df.loc[:,"snr_{}".format(letter)].max()
            if m > maxSNR:
                maxSNR = m

        if maxSNR > clip_upper_SNR:
            maxSNR = clip_upper_SNR

        bins = pylab.linspace(0, maxSNR, bins)

        pylab.hist(self._df.loc[:,'snr_A'].clip_upper(maxSNR), alpha=alpha, label="A", bins=bins)
        pylab.hist(self._df.loc[:,'snr_C'].clip_upper(maxSNR), alpha=alpha, label="C", bins=bins)
        pylab.hist(self._df.loc[:,'snr_G'].clip_upper(maxSNR), alpha=alpha, label="G", bins=bins)
        pylab.hist(self._df.loc[:,'snr_T'].clip_upper(maxSNR), alpha=alpha, label="T", bins=bins)
        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title,fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Exemple #4
0
    def hist_read_length(self,
                         bins=80,
                         alpha=0.5,
                         hold=False,
                         fontsize=12,
                         grid=True,
                         xlabel="Read Length",
                         ylabel="#",
                         label="",
                         title=None,
                         logy=False,
                         ec="k",
                         hist_kwargs={}):
        """Plot histogram Read length

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold:
        :param int fontsize:
        :param bool grid:
        :param str xlabel:
        :param str ylabel:
        :param str label: label of the histogram (for the legend)
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_read_length()

        """
        mean_len = np.mean(self.df.loc[:, 'read_length'])

        # set title if not provided
        if title is None:
            title = "Read length  \n Mean length : %.2f" % (mean_len)

        if hold is False:
            pylab.clf()

        hist = HistCumSum(self.df.loc[:, 'read_length'],
                          fontsize=fontsize,
                          grid=grid)
        hist.title = title
        hist.xlabel = xlabel
        hist.ylabel = ylabel
        hist.plot(bins=bins,
                  alpha=alpha,
                  edgecolor=ec,
                  label="%s, mean : %.0f, N : %d" %
                  (label, mean_len, len(self)),
                  log=logy,
                  **hist_kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
Exemple #5
0
    def hist_snr(self,
                 bins=50,
                 alpha=0.5,
                 hold=False,
                 fontsize=12,
                 grid=True,
                 xlabel="SNR",
                 ylabel="#",
                 title=""):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold:
        :param int fontsize:
        :param bool grid:
        :param str xlabel:
        :param str ylabel:
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import BAMPacbio
            from sequana import sequana_data
            b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored
        if len(self._df['snr_A'].dropna()) == 0:
            # nothing to plot
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()
        pylab.hist(self._df.loc[:, 'snr_A'], alpha=alpha, label="A", bins=bins)
        pylab.hist(self._df.loc[:, 'snr_C'], alpha=alpha, label="C", bins=bins)
        pylab.hist(self._df.loc[:, 'snr_G'], alpha=alpha, label="G", bins=bins)
        pylab.hist(self._df.loc[:, 'snr_T'], alpha=alpha, label="T", bins=bins)
        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Exemple #6
0
    def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=[]):
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        pylab.scatter(-pylab.log10(df['Adjusted P-value']),
                      range(len(df)),
                      s=10 * df['size'],
                      c=df['size'])

        pylab.xlabel("Odd ratio")
        pylab.ylabel("Gene sets")
        pylab.yticks(range(len(df)), df.name)
        a, b = pylab.xlim()
        pylab.xlim([0, b])
        pylab.grid(True)
        ax = pylab.gca()

        M = max(df['size'])
        if M > 100:
            l1, l2, l3 = "10", "100", str(M)
        else:
            l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

        handles = [
            pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls="")
        ]
        ax.legend(handles=handles, loc="upper left", title="gene-set size")

        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.tight_layout()
        ax = pylab.colorbar(pylab.gci())
        return df
Exemple #7
0
    def plot_dispersion(self):

        pylab.plot(
            self.dds_stats.baseMean,
            self.dds_stats.dispGeneEst,
            "ok",
            label="Estimate",
            ms=1,
        )
        pylab.plot(
            self.dds_stats.baseMean,
            self.dds_stats.dispersion,
            "ob",
            label="final",
            ms=1,
        )
        pylab.plot(self.dds_stats.baseMean,
                   self.dds_stats.dispFit,
                   "or",
                   label="Fit",
                   ms=1)
        pylab.legend()
        ax = pylab.gca()
        ax.set(yscale="log")
        ax.set(xscale="log")

        self._format_plot(
            title="Dispersion estimation",
            xlabel="Mean of normalized counts",
            ylabel="Dispersion",
        )
Exemple #8
0
    def plot_stacked_hist(self,
                          output_filename=None,
                          dpi=200,
                          kind="barh",
                          fontsize=10,
                          edgecolor="k",
                          lw=1,
                          width=1,
                          ytick_fontsize=10):
        df = self.get_df()
        df.T.plot(kind=kind,
                  stacked=True,
                  edgecolor=edgecolor,
                  lw=lw,
                  width=width)
        ax = pylab.gca()
        positions = pylab.yticks()
        #ax.set_yticklabel(positions, labels, fontsize=ytick_fontsize)
        pylab.xlabel("Percentage (%)", fontsize=fontsize)
        pylab.ylabel("Sample index/name", fontsize=fontsize)
        pylab.yticks(fontsize=ytick_fontsize)
        pylab.legend(title="kingdom")
        pylab.xlim([0, 100])

        if output_filename:
            pylab.savefig(output_filename, dpi=dpi)
Exemple #9
0
    def scatter_plot(self, filename=None, hold=False):
        """Scatter plot of the score versus length of each ortholog

        .. plot::
            :include-source:

            from sequana import BUSCO, sequana_data
            b = BUSCO(sequana_data("test_busco_full_table.tsv"))
            b.scatter_plot()


        Missing are not show since there is no information about contig .
        """
        if hold is False:
            pylab.clf()
        colors = ["green", "orange", "red", "blue"]
        markers = ['o', 's', 'x', 'o']
        for i, this in enumerate(["Complete", "Fragmented", "Duplicated"]):
            mask = self.df.Status == this
            if sum(mask) > 0:
                self.df[mask].plot(x="Length",
                                   y="Score",
                                   kind="scatter",
                                   color=colors[i],
                                   ax=pylab.gca(),
                                   marker=markers[i],
                                   label=this)

        pylab.legend()
        pylab.grid()
        if filename:
            pylab.savefig(filename)
Exemple #10
0
    def plot_unknown_barcodes(self, N=20):
        ub = self.data['UnknownBarcodes']
        df = pd.DataFrame({x['Lane']: x['Barcodes'] for x in ub})
        if "unknown" in df.index and len(df) == 1:
            df.loc['known'] = [0 for i in df.columns]

        # if data is made of undetermined only, the dataframe is just made of
        # N lanes with one entry : unknown
        S = df.sum(axis=1).sort_values(ascending=False).index[0:N]
        data = df.loc[S][::-1]
        #print(data)

        data.columns = ["Lane {}".format(x) for x in data.columns]
        from matplotlib import rcParams
        rcParams['axes.axisbelow'] = True
        pylab.figure(figsize=(10, 8))
        ax = pylab.gca()
        data.plot(kind="barh", width=1, ec="k", ax=ax)
        rcParams['axes.axisbelow'] = False
        pylab.xlabel("Number of reads", fontsize=12)
        pylab.ylabel("")
        pylab.grid(True)
        pylab.legend(
            ["Lane {}".format(x) for x in range(1,
                                                len(df.columns) + 1)],
            loc="lower right")
        try:
            pylab.tight_layout()
        except Exception as err:
            print(err)
        return data
Exemple #11
0
    def barplot_per_sample(self, alpha=0.5, width=0.8, filename=None):
        df = self.get_data_reads()

        # this is ugly but will do the job for now
        under = df.query("name=='Undetermined'")
        others = df.query("name!='Undetermined'")

        under = under.groupby("name").sum().reset_index()
        others = others.groupby("name").sum().reset_index()

        under = under[["name", "count"]].set_index("name")
        others = others[["name", "count"]].set_index("name")

        all_data = others.sort_index(ascending=False)
        all_data.columns = ["samples"]

        # appended at the end
        all_data.loc['undetermined'] = 0

        # revert back
        all_data = all_data.loc[::-1]

        # just for legend
        under.columns = ['undetermined']
        if all_data.sum().min() > 1e6:
            all_data /= 1e6
            under /= 1e6
            M = True
        else:
            M = False

        all_data.plot(kind="barh", alpha=alpha, zorder=1, width=width, ec='k')

        under.plot(kind="barh",
                   alpha=alpha,
                   color="red",
                   ax=pylab.gca(),
                   zorder=1,
                   width=width,
                   ec='k')
        pylab.ylim([-0.5, len(all_data) + 0.5])
        if len(all_data) < 100:
            pylab.yticks(range(len(all_data)), all_data.index)

        pylab.legend()
        pylab.grid(True, zorder=-1)
        if M:
            pylab.xlabel("Number of reads (M)")
        else:
            pylab.xlabel("Number of reads")
        try:
            pylab.tight_layout()
        except:
            pass
        if filename:
            pylab.savefig(filename, dpi=200)
Exemple #12
0
    def hist_read_length(self, bins=80, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="Read Length", ylabel="#", label="",
                title=None, logy=False,  ec="k", hist_kwargs={}):
        """Plot histogram Read length

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold:
        :param int fontsize:
        :param bool grid:
        :param str xlabel:
        :param str ylabel:
        :param str label: label of the histogram (for the legend)
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_read_length()

        """
        mean_len =  np.mean(self.df.loc[:,'read_length'])

        # set title if not provided
        if title is None:
            title = "Read length  \n Mean length : %.2f" %(mean_len)

        if hold is False:
            pylab.clf()

        hist = HistCumSum(self.df.loc[:,'read_length'], fontsize=fontsize,
                    grid=grid)
        hist.title = title
        hist.xlabel = xlabel
        hist.ylabel = ylabel
        hist.plot(bins=bins, alpha=alpha, edgecolor=ec,
            label=  "%s, mean : %.0f, N : %d" % (label, mean_len, len(self)),
            log=logy, **hist_kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
Exemple #13
0
    def plot(self):
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        cmap.set_bad("grey", 1.0)

        p = sns.clustermap(
            self.data_df,
            cmap=cmap,
            col_colors=self.sample_groups_col_df,
            yticklabels=self.yticklabels,
            row_colors=self.gene_groups_col_df,
            **self.kwargs,
        )

        f = pylab.gca()

        self._do_legend(f, self.sample_color_dict, (12, 2))
        self._do_legend(f, self.gene_color_dict, (-2, -2))

        return p
Exemple #14
0
    def plot_hist_coverage(self, logx=True, logy=True, fontsize=16, N=20,
        fignum=1, hold=False, alpha=0.5, filename=None, **kw_hist):
        """


        """
        if hold is False:
            pylab.figure(fignum)
            pylab.clf()
        ax = pylab.gca()
        ax.set_facecolor('#eeeeee')

        data = self.df['cov'].dropna().values

        maxcov = data.max()
        if logx is True and logy is True:
            bins = pylab.logspace(0, pylab.log10(maxcov), N)
            pylab.hist(data, bins=bins, log=True, label=self.chrom_name,
                alpha=alpha, **kw_hist)
            pylab.semilogx()
            pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
            pylab.ylabel("Count (log scale)", fontsize=fontsize)
        elif logx is False and logy is True:
            pylab.hist(data, bins=N, log=True, label=self.chrom_name,
                alpha=alpha, **kw_hist)
            pylab.xlabel("Coverage", fontsize=fontsize)
            pylab.ylabel("Count (log scale)", fontsize=fontsize)
        elif logx is True and logy is False:
            bins = pylab.logspace(0, pylab.log10(maxcov), N)
            pylab.hist(data, bins=N, label=self.chrom_name, alpha=alpha,
                **kw_hist)
            pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
            pylab.ylabel("Count", fontsize=fontsize)
            pylab.semilogx()
        else:
            pylab.hist(data, bins=N, label=self.chrom_name, alpha=alpha,
                **kw_hist)
            pylab.xlabel("Coverage", fontsize=fontsize)
            pylab.ylabel("Count", fontsize=fontsize)
        pylab.grid(True)
        if filename:
            pylab.savefig(filename)
Exemple #15
0
    def plot(self):
        """"""
        if self.design:
            self.df['label'] = self.design.df['type'] + "/" + self.design.df[
                'condition']

        pylab.clf()
        MX = self.df.FRiP.max()
        MY = self.df['in_peaks'].max()
        pylab.plot([0, MX], [0, MY], ls='--', color='b', alpha=0.5)
        for label in self.df['label'].unique():
            self.df.query('label==@label').plot(x='FRiP',
                                                y='in_peaks',
                                                marker="o",
                                                lw=0,
                                                label=label,
                                                ax=pylab.gca())
        pylab.ylabel('Reads in peaks')
        pylab.xlabel('FRiP')
        pylab.xlim(0, pylab.xlim()[1])
        pylab.ylim(0, pylab.ylim()[1])
        pylab.grid()
Exemple #16
0
    def _plot(self, Xr, pca=None, pc1=0, pc2=1, colors=None, show_labels=True):
        if colors is None:
            colors = [self.colors[k] for k in self.labels]
            if len(colors) != len(Xr):
                colors = ["r"] * len(Xr[:,0])
        else:
            for k in self.labels:
                if k not in colors.keys():
                    logger.warning("No key color for this sample: {}. Set to red".format(k))
                    colors[k] = "r"
            colors = [colors[k] for k in self.labels]

        pylab.scatter(Xr[:,pc1], Xr[:,pc2], c=colors)
        ax = pylab.gca()
        X1, X2 = pylab.xlim()
        dX = X2 - X1
        pylab.xlim([X1 + X1*0.05, X2 + X2*0.05])

        Y1, Y2 = pylab.ylim()
        dY = Y2 - Y1
        pylab.ylim([Y1 + Y1*0.05, Y2 + Y2*0.05])

        count = 0
        if show_labels:
            for x,y in zip(Xr[:,pc1], Xr[:,pc2]):
                x += dX / 40
                y += dY / 40
                ax.annotate(self.labels[count], (x,y))
                count += 1
                if count > 100: 
                    break
        if pca:
            pylab.xlabel("PC{} ({}%)".format(pc1+1,
                round(pca.explained_variance_ratio_[pc1]*100, 2)))
            pylab.ylabel("PC{} ({}%)".format(pc2+1,
                round(pca.explained_variance_ratio_[pc2]*100, 2)))
        pylab.grid(True)
Exemple #17
0
    def plot_go_terms(self,
                      ontologies,
                      max_features=50,
                      log=False,
                      fontsize=8,
                      minimum_genes=0,
                      pvalue=0.05,
                      cmap="summer_r",
                      sort_by="fold_enrichment",
                      show_pvalues=False,
                      include_negative_enrichment=False,
                      fdr_threshold=0.05,
                      compute_levels=True,
                      progress=True):

        assert sort_by in ['pValue', 'fold_enrichment', 'fdr']

        # FIXME: pvalue and fold_enrichment not sorted in same order
        pylab.clf()

        df = self.get_data(
            ontologies,
            include_negative_enrichment=include_negative_enrichment,
            fdr=fdr_threshold)

        if len(df) == 0:
            return df

        df = df.query("pValue<=@pvalue")
        logger.info("Filtering out pvalue>{}. Kept {} GO terms".format(
            pvalue, len(df)))
        df = df.reset_index(drop=True)

        # Select a subset of the data to keep the best max_features in terms of
        # pValue
        subdf = df.query("number_in_list>@minimum_genes").copy()
        logger.info(
            "Filtering out GO terms with less than {} genes: Kept {} GO terms".
            format(minimum_genes, len(subdf)))

        logger.info("Filtering out the 3 parent terms")
        subdf = subdf.query("id not in @self.ontologies")

        # Keeping only a part of the data, sorting by pValue
        if sort_by == "pValue":
            subdf = subdf.sort_values(by="pValue",
                                      ascending=False).iloc[-max_features:]
            df = df.sort_values(by="pValue", ascending=False)
        elif sort_by == "fold_enrichment":
            subdf = subdf.sort_values(by="abs_log2_fold_enrichment",
                                      ascending=True).iloc[-max_features:]
            df = df.sort_values(by="abs_log2_fold_enrichment", ascending=False)
        elif sort_by == "fdr":
            subdf = subdf.sort_values(by="fdr",
                                      ascending=False).iloc[-max_features:]
            df = df.sort_values(by="fdr", ascending=False)

        subdf = subdf.reset_index(drop=True)

        # We get all levels for each go id.
        # They are stored by MF, CC or BP
        if compute_levels:
            paths = self.get_graph(list(subdf['id'].values), progress=progress)
            levels = []
            keys = list(paths.keys())
            goid_levels = paths[keys[0]]
            if len(keys) > 1:
                for k in keys[1:]:
                    goid_levels.update(paths[k])
            levels = [goid_levels[ID] for ID in subdf['id'].values]
            subdf["level"] = levels
        else:
            subdf['level'] = ""
        N = len(subdf)

        size_factor = 12000 / len(subdf)
        max_size = subdf.number_in_list.max()
        min_size = subdf.number_in_list.min()
        sizes = [
            max(max_size * 0.2, x) for x in size_factor *
            subdf.number_in_list.values / subdf.number_in_list.max()
        ]

        m1 = min(sizes)
        m3 = max(sizes)
        m2 = m1 + (m3 - m1) / 2

        if log:
            pylab.scatter(pylab.log2(subdf.fold_enrichment),
                          range(len(subdf)),
                          c=subdf.fdr,
                          s=sizes,
                          cmap=cmap,
                          alpha=0.8,
                          ec="k",
                          vmin=0,
                          vmax=fdr_threshold,
                          zorder=10)
            #pylab.barh(range(N), pylab.log2(subdf.fold_enrichment), color="r",
            #    label="pvalue>0.05; FDR>0.05")
            #pylab.axvline(1, color="gray", ls="--")
            #pylab.axvline(-1, color="gray", ls="--")
        else:
            pylab.scatter(subdf.fold_enrichment,
                          range(len(subdf)),
                          c=subdf.fdr,
                          cmap=cmap,
                          s=sizes,
                          ec="k",
                          alpha=.8,
                          vmin=0,
                          vmax=fdr_threshold,
                          zorder=10)
        #    pylab.barh(range(N), subdf.fold_enrichment, color="r",
        #    label="not significant")
        pylab.grid(zorder=-10)
        ax2 = pylab.colorbar(shrink=0.5)
        ax2.ax.set_ylabel('FDR')

        labels = [
            x if len(x) < 50 else x[0:47] + "..." for x in list(subdf.label)
        ]
        ticks = [
            "{} ({}) {}".format(ID, level, "; " + label.title())
            for level, ID, label in zip(subdf['level'], subdf.id, labels)
        ]

        pylab.yticks(range(N), ticks, fontsize=fontsize, ha='left')

        yax = pylab.gca().get_yaxis()
        try:
            pad = [x.label.get_window_extent().width for x in yax.majorTicks]
            yax.set_tick_params(pad=max(pad))
        except:
            yax.set_tick_params(pad=60 * fontsize * 0.7)
        yax.set_tick_params(pad=60 * fontsize * 0.6)

        fc_max = subdf.fold_enrichment.max(skipna=True)
        fc_min = subdf.fold_enrichment.min(skipna=True)
        # go into log2 space
        fc_max = pylab.log2(fc_max)
        fc_min = pylab.log2(fc_min)
        abs_max = max(fc_max, abs(fc_min), 1)

        if log:
            fc_max = abs_max * 1.5
        else:
            fc_max = 2**abs_max * 1.2

        pylab.axvline(0, color="k", lw=2)
        if log:
            pylab.xlabel("Fold Enrichment (log2)")
        else:
            pylab.xlabel("Fold Enrichment")
        if include_negative_enrichment:
            pylab.xlim([-fc_max, fc_max])
        else:
            pylab.xlim([0, fc_max])
        pylab.tight_layout()

        # The pvalue:
        if show_pvalues:
            ax = pylab.gca().twiny()
            ax.set_xlim([0, max(-pylab.log10(subdf.pValue)) * 1.2])
            ax.set_xlabel("p-values (log10)", fontsize=12)
            ax.plot(-pylab.log10(subdf.pValue),
                    range(len(subdf)),
                    label="pvalue",
                    lw=2,
                    color="k")
            ax.axvline(1.33, lw=1, ls="--", color="grey", label="pvalue=0.05")
            pylab.tight_layout()
            pylab.legend(loc="lower right")
        s1 = pylab.scatter([], [], s=m1, marker='o', color='#555555', ec="k")
        s2 = pylab.scatter([], [], s=m2, marker='o', color='#555555', ec="k")
        s3 = pylab.scatter([], [], s=m3, marker='o', color='#555555', ec="k")

        if len(subdf) < 10:
            labelspacing = 1.5 * 4
            borderpad = 4
            handletextpad = 2
        elif len(subdf) < 20:
            labelspacing = 1.5 * 2
            borderpad = 1
            handletextpad = 2
        else:
            labelspacing = 1.5
            borderpad = 2
            handletextpad = 2

        if len(subdf) >= 3:
            leg = pylab.legend(
                (s1, s2, s3),
                (str(int(min_size)),
                 str(int(min_size +
                         (max_size - min_size) / 2)), str(int(max_size))),
                scatterpoints=1,
                loc='lower right',
                ncol=1,
                frameon=True,
                title="gene-set size",
                labelspacing=labelspacing,
                borderpad=borderpad,
                handletextpad=handletextpad,
                fontsize=8)
        else:
            leg = pylab.legend((s1, ), (str(int(min_size)), ),
                               scatterpoints=1,
                               loc='lower right',
                               ncol=1,
                               frameon=True,
                               title="gene-set size",
                               labelspacing=labelspacing,
                               borderpad=borderpad,
                               handletextpad=handletextpad,
                               fontsize=8)

        frame = leg.get_frame()
        frame.set_facecolor('#b4aeae')
        frame.set_edgecolor('black')
        frame.set_alpha(1)

        self.subdf = subdf
        self.df = df
        return df
Exemple #18
0
    def plot_coverage(self, filename=None, fontsize=16,
            rm_lw=1, rm_color="#0099cc", rm_label="Running median",
            th_lw=1, th_color="r", th_ls="--", main_color="k", main_lw=1,
            main_kwargs={}, sample=True, set_ylimits=True):
        """ Plot coverage as a function of base position.

        :param filename:
        :param rm_lw: line width of the running median
        :param rm_color: line color of the running median
        :param rm_color: label for the running median
        :param th_lw: line width of the thresholds
        :param th_color: line color of the thresholds
        :param main_color: line color of the coverage
        :param main_lw: line width of the coverage
        :param sample: if there are more than 1 000 000 points, we 
            use an integer step to skip data points. We can still plot
            all points at your own risk by setting this option to False

        :param set_ylimits: we want to focus on the "normal" coverage ignoring
            unsual excess. To do so, we set the yaxis range between 0 and a
            maximum value. This maximum value is set to the minimum between the
            6 times the mean coverage and 1.5 the maximum of the high coverage
            threshold curve. If you want to let the ylimits free, set this
            argument to False

        .. note:: if there are more than 1,000,000 points, we show only
            1,000,000 by points. For instance for 5,000,000 points,

        In addition to the coverage, the running median and coverage confidence
        corresponding to the lower and upper  zscore thresholds are shown.

        .. note:: uses the thresholds attribute.
        """
        # z = (X/rm - \mu ) / sigma
        high_zcov = (self.thresholds.high * self.best_gaussian["sigma"] +
                self.best_gaussian["mu"]) * self.df["rm"]
        low_zcov = (self.thresholds.low * self.best_gaussian["sigma"] +
                self.best_gaussian["mu"]) * self.df["rm"]

        pylab.clf()
        ax = pylab.gca()
        ax.set_facecolor('#eeeeee')
        pylab.xlim(0,self.df["pos"].iloc[-1])
        axes = []
        labels = []

        # 1,000,000 points is a lot for matplotlib. Let us restrict ourself to 1
        # million points for now.
        if len(self.df) > 1000000 and sample is True:
            NN = int(len(self.df)/1000000)
        else:
            NN = 1

        # the main coverage plot
        p1, = pylab.plot(self.df["cov"][::NN], color=main_color, label="Coverage",
                linewidth=main_lw, **main_kwargs)
        axes.append(p1)
        labels.append("Coverage")

        # The running median plot
        if rm_lw > 0:
            p2, = pylab.plot(self.df["rm"][::NN],
                    color=rm_color,
                    linewidth=rm_lw,
                    label=rm_label)
            axes.append(p2)
            labels.append(rm_label)

        # The threshold curves
        if th_lw > 0:
            p3, = pylab.plot(high_zcov[::NN], linewidth=th_lw, color=th_color, ls=th_ls,
                label="Thresholds")
            p4, = pylab.plot(low_zcov[::NN], linewidth=th_lw, color=th_color, ls=th_ls,
                label="_nolegend_")
            axes.append(p3)
            labels.append("Thresholds")

        pylab.legend(axes, labels, loc="best")
        pylab.xlabel("Position", fontsize=fontsize)
        pylab.ylabel("Per-base coverage", fontsize=fontsize)
        pylab.grid(True)

        # sometimes there are large coverage value that squeeze the plot.
        # Let us restrict it
        if set_ylimits is True:
            pylab.ylim([0, min([
                high_zcov.max() * 1.5,
                self.df["cov"].mean()*6])])
        else:
            pylab.ylim([0, pylab.ylim()[1]])

        try:
            pylab.tight_layout()
        except:
            pass

        if filename:
            pylab.savefig(filename)
Exemple #19
0
    def plot_jaccard_distance(self, mode, padjs=[0.0001,0.001,0.01,0.05,0.1],
            Nfc=50, smooth=False, window=5):
        assert mode in ['down', 'up', 'all']
        pylab.clf()

        if mode == "down":
            m1 = self.r1.df.log2FoldChange.min()
            m2 = self.r2.df.log2FoldChange.min()
            minimum = min(m1,m2)
            print(m1, m2)
            X = pylab.linspace(0, minimum, Nfc)
        elif mode == "up":
            m1 = self.r1.df.log2FoldChange.max()
            m2 = self.r2.df.log2FoldChange.max()
            maximum = max(m1,m2)
            X = pylab.linspace(0, maximum, Nfc)
        else:
            minmax1 = self.r1.df.log2FoldChange.abs().max()
            minmax2 = self.r2.df.log2FoldChange.abs().max()
            maximum = max(minmax1, minmax2)
            X = pylab.linspace(0, maximum, Nfc)

        common = {}
        for padj in padjs:
            I = []
            common[padj] = []
            for x in X:
                if mode == "down":
                    # less than a given fold change that is negative
                    A = set(self.r1.df.query("log2FoldChange<=@x and padj<@padj").index)
                    B = set(self.r2.df.query("log2FoldChange<=@x and padj<@padj").index)
                elif mode == "up":
                    # greater than a given fold change that is positive
                    A = set(self.r1.df.query("log2FoldChange>=@x and padj<@padj").index)
                    B = set(self.r2.df.query("log2FoldChange>=@x and padj<@padj").index)
                else:
                    A = set(self.r1.df.query("(log2FoldChange>=@x or log2FoldChange<=-@x) and padj<@padj").index)
                    B = set(self.r2.df.query("(log2FoldChange>=@x or log2FoldChange<=-@x) and padj<@padj").index)
                if len(A) == 0 or len(B) == 0:
                    # no overlap yet
                    I.append(100)
                else:
                    res = len(A.intersection(B)) / (len(A) + len(B) - len(A.intersection(B)))  * 100
                    I.append(res)   
                common[padj].append(len(A.intersection(B)))

            try:
                if smooth:
                    I = pd.Series(I).rolling(window).median().values
                else:
                    assert False
            except:
                pass
            pylab.plot(X, I, 'o-', label=str(padj))
        ax = pylab.gca()
        ax.set_ylabel("Jaccard similarity (intersection/union)")
        ax.set_xlabel("Fold change (log2)")
        ax2 = ax.twinx()
        for padj in padjs:
            ax2.plot(X, common[padj], color='orange', ls='--')
        ax2.set_ylabel("Cardinality of the union ")
        ax.legend()
        ax.set_ylim([0,100])
        #ax2.set_ylim([0,100])
        if mode == "down":
            ax.axvline(-2, ls='--', color='r')
        else:
            ax.axvline(2, ls='--', color='r')
Exemple #20
0
def plot_venn(subsets,
              labels=None,
              title=None,
              ax=None,
              alpha=0.8,
              weighted=False,
              colors=('r', 'b', 'y')):
    """ Plot venn diagramm according to number of groups.

    :param subsets: This parameter may be (1) a dict, providing sizes of
        three diagram regions. The regions are identified via two-letter
        binary codes ('10', '01', and '11'), hence a valid set could look like:
        {'10': 10, '01': 20, '11': 40}. Unmentioned codes are considered to map to 0.
        (2) a list (or a tuple) with three numbers, denoting the sizes of the
        regions in the following order: (10, 01, 11) and (3) a list containing
        the subsets of values.

    The subsets can be a list (or a tuple) containing two set objects. For
    instance:

    .. plot::
        :include-source:

        from sequana.viz.venn import plot_venn
        A = set([1,2,3,4,5,6,7,8,9])
        B = set([            7,8,9,10,11])
        plot_venn((A, B), labels=("A", "B"))

    This is the unweighted version by default meaning all circles have the same
    size. If you prefer to have circle scaled to the
    size of the sets, add the relevant parameter as follows:

    .. plot::
        :include-source:

        from sequana.viz.venn import plot_venn
        A = set([1,2,3,4,5,6,7,8,9])
        B = set([            7,8,9,10,11])
        plot_venn((A, B), labels=("A", "B"), weighted=True)

    Similarly for 3 sets, a Venn diagram can be represented as follows. Note
    here that we also use the *title* parameter:

    .. plot::
        :include-source:

        from sequana.viz.venn import plot_venn

        A = set([1,2,3,4,5,6,7,8,9])
        B = set([      4,5,6,7,8,9,10,11,12,13])
        C = set([   3,4,5,6,7,8,9])
        plot_venn((A, B, C), labels=("A", "B", "C"), title="my Venn3 diagram")

    Input can be a list/tuple of 2 or 3 sets as described above. 


    """

    #pylab.clf()
    if len(subsets) == 2:
        from matplotlib_venn import venn2_unweighted, venn2_circles, venn2
        if weighted:
            venn_function = venn2
        else:
            venn_function = venn2_unweighted
        venn_circles = venn2_circles
    elif len(subsets) == 3:
        from matplotlib_venn import venn3_unweighted, venn3_circles, venn3
        if weighted:
            venn_function = venn3
        else:
            venn_function = venn3_unweighted
        venn_circles = venn3_circles
    else:
        raise IOError("Venn diagramm supports only 2 or 3 groups.")

    vf = venn_function(subsets,
                       set_labels=labels,
                       ax=ax,
                       alpha=alpha,
                       set_colors=colors)
    # works for weighted, nor for unweighted, so we draw the circles
    # ourselfves
    if ax is None:
        ax = pylab.gca()

    for center, radius in zip(vf.centers, vf.radii):
        circle = pylab.Circle(center,
                              radius=radius,
                              linestyle="-",
                              edgecolor="k",
                              lw=2,
                              facecolor='none',
                              alpha=1)
        ax.add_patch(circle)
    if title:
        pylab.title(title)
    return vf
Exemple #21
0
    def plot_common_major_counts(self, mode, labels=None,
            switch_up_down_cond2=False, add_venn=True, xmax=None, 
            title="", fontsize=12, sortby="log2FoldChange"):
        """

        :param mode: down, up or all


        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana.compare import RNADiffCompare

            c = RNADiffCompare(
                sequana_data("rnadiff/rnadiff_onecond_1"),
                sequana_data("rnadiff/rnadiff_onecond_2"))
            c.plot_common_major_counts("down")
        """
        #cond1, cond2 = self._get_cond1_cond2()
        if labels is None:
            labels = ['r1', 'r2']

        if mode in ["down"]:
            # Negative values !
            gl1 = set(self.r1.gene_lists['down'])
            gl2 =  set(self.r2.gene_lists['down'])
            A = self.r1.df.loc[gl1].sort_values(by=sortby)
            B = self.r2.df.loc[gl1].sort_values(by=sortby)
        else:
            gl1 = set(self.r1.gene_lists[mode])
            gl2 =  set(self.r2.gene_lists[mode])
            A = self.r1.df.loc[gl1].sort_values(by=sortby, ascending=False)
            B = self.r2.df.loc[gl1].sort_values(by=sortby, ascending=False)
        # sometimes, up and down may be inverted as compared to the other
        # conditions

        N = []
        for i in range(1,max(len(A), len(B))):
            a = A.iloc[0:i].index
            b = B.iloc[0:i].index
            n = len(set(b).intersection(set(a)))
            N.append(n / i*100)

        max_common = len(set(A.index).intersection(set(B.index)))
        pylab.clf()
        if len(A) > len(B):
            pylab.axhline(max_common/len(A)*100, color="r", ls='--', label="min set intersection")
            pylab.axvline(len(B), ls="--", color="k", label="rank of minor set")
        else:
            pylab.axhline(max_common/len(B)*100, color='r', ls='--', label="min set intersect")
            pylab.axvline(len(A), ls="--", color="k", label="rank of minor set")

        pylab.plot(N)
        pylab.xlabel('rank', fontsize=fontsize)
        pylab.ylabel('% common features', fontsize=fontsize)
        pylab.grid(True)
        pylab.ylim([0,100])
        if xmax:
            pylab.xlim([0, xmax])
        else:
            pylab.xlim([0, max(len(A),len(B))])
        pylab.title(title, fontsize=fontsize)
        ax = pylab.gca()
        ax2 = ax.twinx()
        ax2.plot(A[sortby].values, "orange", label=sortby)
        ax2.set_ylabel(sortby)
        pylab.legend(loc="lower left")
        ax.legend(loc="lower right")

        if add_venn:
            f = pylab.gcf()
            ax = f.add_axes([0.5,0.5,0.35,0.35], facecolor="grey")
            if mode=="down":
                self.plot_venn_down(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="up":
                self.plot_venn_up(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="all":
                self.plot_venn_all(ax=ax, title=None, labels=labels,
                    mode="two_only")
Exemple #22
0
    def plot_volcano(self, labels=None):
        """Volcano plot of log2 fold change versus log10 of adjusted p-value

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana.compare import RNADiffCompare

            c = RNADiffCompare(
                sequana_data("rnadiff/rnadiff_onecond_1"),
                sequana_data("rnadiff/rnadiff_onecond_2"))
            c.plot_volcano()
        """
        cond1, cond2 = "cond1", "cond2"
        if labels is None:
            labels = [cond1, cond2]

        A = self.r1.df.loc[self.r1.gene_lists["all"]]
        B = self.r2.df.loc[self.r2.gene_lists["all"]]

        if cond1 == cond2:
            cond1 += "(1)"
            cond2 += "(2)"

        pylab.clf()
        pylab.plot(A.log2FoldChange, -np.log10(A.padj), marker="o",
            alpha=0.5, color="r", lw=0, label=labels[0], pickradius=4,
            picker=True)
        pylab.plot(B.log2FoldChange, -np.log10(B.padj), marker="x",
            alpha=0.5, color="k", lw=0, label=labels[1], pickradius=4,
            picker=True)

        genes = list(A.index) + list(B.index)
        pylab.grid(True)
        pylab.xlabel("fold change")
        pylab.ylabel("log10 adjusted p-value")
        pylab.legend(loc="lower right")
        ax = pylab.gca()

        def onpick(event):
            thisline = event.artist
            self.event = event
            label = thisline.get_label()
            if label == cond1:
                gene_name = A.index[event.ind[0]]
                x1 = round(A.loc[gene_name].log2FoldChange,1)
                y1 = round(-np.log10(A.loc[gene_name].padj),1)
                try:
                    x2 = round(B.loc[gene_name].log2FoldChange,1)
                    y2 = round(-np.log10(B.loc[gene_name].padj),1)
                except:
                    x2, y2 = None, None
            else:
                gene_name = B.index[event.ind[0]]
                x1 = round(B.loc[gene_name].log2FoldChange,1)
                y1 = round(-np.log10(B.loc[gene_name].padj),1)
                try:
                    x2 = round(A.loc[gene_name].log2FoldChange,1)
                    y2 = round(-np.log10(A.loc[gene_name].padj),1)
                except:
                    x2, y2 = None, None

            try:
                if x2 is None:
                    ax.title.set_text("{} at pos [{},{}]".format(
                        gene_name,x1,y1))
                else:
                    ax.title.set_text("{} at pos [{},{}] and [{},{}]".format(
                            gene_name,x1,y1,x2,y2))
            except:
                print("exception")
                ax.title.set_text("")
            pylab.draw()
        fig = pylab.gcf()
        fig.canvas.mpl_connect('pick_event', onpick)
Exemple #23
0
    def hist_snr(self,
                 bins=50,
                 alpha=0.5,
                 hold=False,
                 fontsize=12,
                 grid=True,
                 xlabel="SNR",
                 ylabel="#",
                 title="",
                 clip_upper_SNR=30):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram. Note that the range starts
            at 0 and ends at clip_upper_SNR
        :param float alpha: transparency of the histograms
        :param bool hold:
        :param int fontsize:
        :param bool grid:
        :param str xlabel:
        :param str ylabel:
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored
        if len(self._df['snr_A'].dropna()) == 0:
            # nothing to plot
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        maxSNR = 0
        for letter in "ACGT":
            m = self._df.loc[:, "snr_{}".format(letter)].max()
            if m > maxSNR:
                maxSNR = m

        if maxSNR > clip_upper_SNR:
            maxSNR = clip_upper_SNR

        bins = pylab.linspace(0, maxSNR, bins)

        pylab.hist(self._df.loc[:, 'snr_A'].clip_upper(maxSNR),
                   alpha=alpha,
                   label="A",
                   bins=bins)
        pylab.hist(self._df.loc[:, 'snr_C'].clip_upper(maxSNR),
                   alpha=alpha,
                   label="C",
                   bins=bins)
        pylab.hist(self._df.loc[:, 'snr_G'].clip_upper(maxSNR),
                   alpha=alpha,
                   label="G",
                   bins=bins)
        pylab.hist(self._df.loc[:, 'snr_T'].clip_upper(maxSNR),
                   alpha=alpha,
                   label="T",
                   bins=bins)
        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Exemple #24
0
    def plot(
        self,
        size=10,
        alpha=0.7,
        marker="o",
        fontsize=16,
        xlabel="fold change",
        logy=False,
        threshold_lines={
            "color": "black",
            "ls": "--",
            "width": 0.5
        },
        ylabel="p-value",
        add_broken_axes=False,
        broken_axes={"ylims": ((0, 10), (50, 100))},
    ):
        """

        :param size: size of the markers
        :param alpha: transparency of the marker
        :param fontsize:
        :param xlabel:
        :param ylabel:
        :param center: If centering the x axis
        """
        pylab.clf()

        if add_broken_axes:  #pragma: no cover
            from brokenaxes import brokenaxes

            _ylims = broken_axes.get("ylims", None)
            _xlims = broken_axes.get("xlims", None)
            bax = brokenaxes(ylims=_ylims, xlims=_xlims)
        else:
            bax = pylab

        bax.scatter(
            self.df.fold_change,
            self.df.pvalue,
            s=size,
            alpha=alpha,
            c=self.df.color,
            marker=marker,
            edgecolors="None",
        )

        bax.grid()
        # pylab.ylim([0, pylab.ylim()[1]])
        # M = max(abs(self.fold_change)) * 1.1
        # pylab.xlim([-M, M])
        try:
            bax.set_xlabel(xlabel, fontsize=fontsize)
            bax.set_ylabel(ylabel, fontsize=fontsize)
        except:
            bax.xlabel(xlabel, fontsize=fontsize)
            bax.ylabel(ylabel, fontsize=fontsize)

        bax.axhline(
            self.pvalue_threshold,
            color=threshold_lines["color"],
            linestyle=threshold_lines["ls"],
            linewidth=threshold_lines["width"],
        )
        bax.axvline(
            self.fold_change_threshold,
            color=threshold_lines["color"],
            linestyle=threshold_lines["ls"],
            linewidth=threshold_lines["width"],
        )
        bax.axvline(
            -1 * self.fold_change_threshold,
            color=threshold_lines["color"],
            linestyle=threshold_lines["ls"],
            linewidth=threshold_lines["width"],
        )

        if logy is True:
            ax = pylab.gca()
            ax.set(yscale="log")