Esempio n. 1
0
    def plot_idr_vs_peaks(self, filename=None, savefig=False):
        """Plot IDR thresholds against the cumulative number of peaks.

        The current figure is cleared, then two curves are drawn: a red one
        for thresholds in ``[0, self.threshold]`` and a black one for
        thresholds in ``[self.threshold, 1]``. For each threshold, the x value
        is the number of rows of ``self.df`` whose ``idr`` is strictly below
        it. Dashed blue guide lines are drawn at IDR=0.05 and at
        ``self.N_significant_peaks``.

        :param str filename: output file, used only when *savefig* is True.
        :param bool savefig: if True, save the figure into *filename*.
        :raises ValueError: if *savefig* is True and no *filename* is given.
        """
        if savefig and filename is None:
            raise ValueError("filename must be provided when savefig is True")

        pylab.clf()
        threshold = self.threshold
        X1 = pylab.linspace(0, threshold, 100)
        X2 = pylab.linspace(threshold, 1, 100)

        # Split the IDR values into two complementary sets around the
        # threshold (rows with a missing idr fall in neither set).
        idr = self.df['idr']
        below = idr[idr < threshold]
        above = idr[idr >= threshold]

        pylab.plot([int((below < x).sum()) for x in X1],
                   X1,
                   '-',
                   color='r',
                   lw=2)

        # The second curve continues where the first one stops.
        shift = len(below)
        pylab.plot([shift + int((above < x).sum()) for x in X2],
                   X2,
                   "-",
                   color='k',
                   lw=2)
        pylab.xlabel('Number of significant peaks')
        pylab.ylabel('IDR')
        # NOTE(review): the horizontal guide is hard-coded to 0.05 rather than
        # self.threshold -- confirm this is intended.
        pylab.axhline(0.05, color='b', ls='--')
        pylab.axvline(self.N_significant_peaks, color='b', ls='--')
        if savefig:
            pylab.savefig(filename)
Esempio n. 2
0
 def get_percentage_genes_covered_at_this_fraction(self, this):
     """Return the percentage of rows whose coverage exceeds *this*.

     The percentage of rows of ``self.df`` whose value in the column
     ``self.coverage_column`` is strictly greater than x is computed for
     101 evenly spaced x in [0, 1]; the result for *this* is linearly
     interpolated from that curve.

     :param float this: coverage fraction, must lie in [0, 1].
     :return: interpolated percentage (between 0 and 100).
     """
     assert this <= 1 and this >= 0
     coverage = self.df[self.coverage_column]
     total = float(len(self.df))
     fractions = np.linspace(0, 1, 101)
     # Vectorised count per threshold instead of a Python-level sum over
     # every row of the column.
     percentages = np.array(
         [int((coverage > x).sum()) / total * 100 for x in fractions])
     return np.interp(this, fractions, percentages)
Esempio n. 3
0
    def plot_gc_content(self, fontsize=16, ec="k", bins=100):
        """Plot the GC content histogram with a fitted normal curve.

        The histogram of ``self.get_gc_content()`` is drawn as a density and
        overlaid with the normal distribution that has the same mean and
        standard deviation as the data (dashed red line).

        :param int fontsize: font size of the x-axis label.
        :param ec: edge colour of the histogram bars (black by default).
        :param bins: either a number, in which case that many evenly spaced
            bin edges are taken between 0 and 100, or an array of bin edges.

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam'))
            b.plot_gc_content()

        """
        data = self.get_gc_content()
        if np.ndim(bins) == 0:
            X = np.linspace(0, 100, bins)
        else:
            # work on a copy so that the caller's array is left untouched
            X = np.array(bins)

        # 'normed' was removed from matplotlib's hist; 'density' replaces it.
        pylab.hist(data, X, density=True, ec=ec)
        pylab.grid(True)
        mu = np.mean(data)
        sigma = np.std(data)

        X = np.linspace(X.min(), X.max(), 100)
        # pylab.normpdf no longer exists in matplotlib; evaluate the normal
        # probability density function directly.
        pdf = np.exp(-0.5 * ((X - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))
        pylab.plot(X, pdf, lw=2, color="r", ls="--")
        pylab.xlabel("GC content", fontsize=fontsize)
Esempio n. 4
0
    def plot_gc_content(self, fontsize=16, ec="k", bins=100):
        """Plot the GC content histogram with a fitted normal curve.

        The histogram of ``self.get_gc_content()`` is drawn as a density and
        overlaid with the normal distribution that has the same mean and
        standard deviation as the data (dashed red line).

        :param int fontsize: font size of the x-axis label.
        :param ec: edge colour of the histogram bars (black by default).
        :param bins: either a number, in which case that many evenly spaced
            bin edges are taken between 0 and 100, or an array of bin edges.

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam'))
            b.plot_gc_content()

        """
        from sequana.misc import normpdf

        data = self.get_gc_content()
        if np.ndim(bins) == 0:
            X = np.linspace(0, 100, bins)
        else:
            # work on a copy so that the caller's array is left untouched
            X = np.array(bins)

        pylab.hist(data, X, density=True, ec=ec)
        pylab.grid(True)
        mu = np.mean(data)
        sigma = np.std(data)

        X = np.linspace(X.min(), X.max(), 100)
        pylab.plot(X, normpdf(X, mu, sigma), lw=2, color="r", ls="--")
        pylab.xlabel("GC content", fontsize=fontsize)
Esempio n. 5
0
    def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="SNR", ylabel="#",title="", clip_upper_SNR=30):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram. Note that the range starts
            at 0 and ends at the largest observed SNR, capped at
            *clip_upper_SNR*
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the axis labels and title
        :param bool grid: if True, add a grid
        :param str xlabel:
        :param str ylabel:
        :param str title:
        :param float clip_upper_SNR: upper bound of the histogram range; SNR
            values above the range are clipped to its upper end

        If the ``snr_A`` column contains no values, a placeholder image is
        shown instead of the histograms.

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored
        if len(self._df['snr_A'].dropna()) == 0:
            # nothing to plot
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        columns = ["snr_{}".format(letter) for letter in "ACGT"]

        # Largest SNR over the four channels, never above clip_upper_SNR.
        maxSNR = 0
        for column in columns:
            m = self._df.loc[:, column].max()
            if m > maxSNR:
                maxSNR = m
        if maxSNR > clip_upper_SNR:
            maxSNR = clip_upper_SNR

        bins = pylab.linspace(0, maxSNR, bins)

        # Series.clip_upper was removed from pandas; clip(upper=...) is the
        # replacement. Clipping puts the outliers into the last bin.
        for letter, column in zip("ACGT", columns):
            pylab.hist(self._df.loc[:, column].clip(upper=maxSNR),
                       alpha=alpha, label=letter, bins=bins)
        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title,fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Esempio n. 6
0
    def plot(self,
             normed=True,
             N=1000,
             Xmin=None,
             Xmax=None,
             bins=50,
             color='red',
             lw=2,
             hist_kw={
                 'color': '#5F9EA0',
                 "edgecolor": "k"
             },
             ax=None):
        """Plot the data histogram together with the fitted model.

        Three layers are drawn: the histogram of ``self.data``, the model
        density ``self.model.pdf`` evaluated with the parameters
        ``self.results.x``, and one dashed black curve per component (a normal
        pdf scaled by the component weight).

        :param bool normed: if True, the histogram is drawn as a density.
        :param int N: number of points used to draw the curves.
        :param Xmin: lower bound of the curves (default: minimum of the data).
        :param Xmax: upper bound of the curves (default: maximum of the data).
        :param bins: passed to the histogram function.
        :param color: colour of the model curve.
        :param lw: line width of the model curve.
        :param dict hist_kw: extra keyword arguments for the histogram
            function (the default dictionary is only read, never modified).
        :param ax: axes to draw on; if None, the pylab state-machine
            interface (current axes) is used.
        """
        import scipy.stats as ss

        # Both an Axes instance and the pylab module expose hist() and plot()
        # with the same arguments, so a single code path serves both cases.
        target = ax if ax is not None else pylab

        # 'normed' is no longer accepted by matplotlib's hist: use 'density'.
        target.hist(self.data, density=normed, bins=bins, **hist_kw)

        if Xmin is None:
            Xmin = self.data.min()
        if Xmax is None:
            Xmax = self.data.max()
        X = pylab.linspace(Xmin, Xmax, N)

        target.plot(X, [self.model.pdf(x, self.results.x) for x in X],
                    color=color,
                    lw=lw)

        # Individual components, each scaled by its weight.
        for i in range(self.k):
            mu = self.results.mus[i]
            sigma = self.results.sigmas[i]
            pi_ = self.results.pis[i]
            target.plot(X, pi_ * ss.norm.pdf(X, mu, sigma),
                        'k--',
                        alpha=0.7,
                        lw=2)
Esempio n. 7
0
    def hist_snr(self,
                 bins=50,
                 alpha=0.5,
                 hold=False,
                 fontsize=12,
                 grid=True,
                 xlabel="SNR",
                 ylabel="#",
                 title="",
                 clip_upper_SNR=30):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram. Note that the range starts
            at 0 and ends at the largest observed SNR, capped at
            *clip_upper_SNR*
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the axis labels and title
        :param bool grid: if True, add a grid
        :param str xlabel:
        :param str ylabel:
        :param str title:
        :param float clip_upper_SNR: upper bound of the histogram range; SNR
            values above the range are clipped to its upper end

        If the ``snr_A`` column contains no values, a placeholder image is
        shown instead of the histograms.

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored
        if len(self._df['snr_A'].dropna()) == 0:
            # nothing to plot
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        columns = ["snr_{}".format(letter) for letter in "ACGT"]

        # Largest SNR over the four channels, never above clip_upper_SNR.
        maxSNR = 0
        for column in columns:
            m = self._df.loc[:, column].max()
            if m > maxSNR:
                maxSNR = m
        if maxSNR > clip_upper_SNR:
            maxSNR = clip_upper_SNR

        bins = pylab.linspace(0, maxSNR, bins)

        # Series.clip_upper was removed from pandas; clip(upper=...) is the
        # replacement. Clipping puts the outliers into the last bin.
        for letter, column in zip("ACGT", columns):
            pylab.hist(self._df.loc[:, column].clip(upper=maxSNR),
                       alpha=alpha,
                       label=letter,
                       bins=bins)
        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Esempio n. 8
0
    def plot_jaccard_distance(self, mode, padjs=(0.0001, 0.001, 0.01, 0.05, 0.1),
            Nfc=50, smooth=False, window=5):
        """Plot the Jaccard similarity between the gene sets of two results.

        For each adjusted p-value cutoff in *padjs* and each of *Nfc* log2
        fold-change cutoffs, the rows of ``self.r1.df`` and ``self.r2.df``
        passing both cutoffs are selected and compared by index. The Jaccard
        similarity (intersection over union, in percent) is drawn on the left
        axis and the size of the intersection on the right axis (dashed
        orange lines).

        :param str mode: one of 'down' (log2FoldChange <= cutoff, cutoffs
            range from 0 to the smallest fold change), 'up' (log2FoldChange
            >= cutoff, cutoffs range from 0 to the largest fold change) or
            'all' (absolute fold change >= cutoff).
        :param padjs: adjusted p-value cutoffs; one curve is drawn for each.
        :param int Nfc: number of fold-change cutoffs.
        :param bool smooth: if True, the similarity curves are replaced by
            their rolling median.
        :param int window: window size of the rolling median.
        """
        assert mode in ['down', 'up', 'all']
        pylab.clf()

        fc1 = self.r1.df.log2FoldChange
        fc2 = self.r2.df.log2FoldChange
        if mode == "down":
            limit = min(fc1.min(), fc2.min())
        elif mode == "up":
            limit = max(fc1.max(), fc2.max())
        else:
            limit = max(fc1.abs().max(), fc2.abs().max())
        X = pylab.linspace(0, limit, Nfc)

        def select(df, x):
            # index labels of the rows passing the fold-change cutoff x
            fc = df["log2FoldChange"]
            if mode == "down":
                # less than a given fold change that is negative
                mask = fc <= x
            elif mode == "up":
                # greater than a given fold change that is positive
                mask = fc >= x
            else:
                mask = (fc >= x) | (fc <= -x)
            return set(df.index[mask])

        common = {}
        for padj in padjs:
            # The p-value filter does not depend on the fold-change cutoff:
            # apply it once per padj rather than once per (padj, x) pair.
            sig1 = self.r1.df[self.r1.df["padj"] < padj]
            sig2 = self.r2.df[self.r2.df["padj"] < padj]

            similarity = []
            shared = []
            for x in X:
                A = select(sig1, x)
                B = select(sig2, x)
                n_common = len(A & B)
                if not A or not B:
                    # at least one empty set: by convention report 100
                    similarity.append(100)
                else:
                    n_union = len(A) + len(B) - n_common
                    similarity.append(n_common / n_union * 100)
                shared.append(n_common)
            common[padj] = shared

            if smooth:
                similarity = pd.Series(similarity).rolling(window).median().values
            pylab.plot(X, similarity, 'o-', label=str(padj))

        ax = pylab.gca()
        ax.set_ylabel("Jaccard similarity (intersection/union)")
        ax.set_xlabel("Fold change (log2)")
        ax2 = ax.twinx()
        for padj in padjs:
            ax2.plot(X, common[padj], color='orange', ls='--')
        # the plotted quantity is len(A & B), i.e. the intersection size
        ax2.set_ylabel("Cardinality of the intersection")
        ax.legend()
        ax.set_ylim([0, 100])
        if mode == "down":
            ax.axvline(-2, ls='--', color='r')
        else:
            ax.axvline(2, ls='--', color='r')