Esempio n. 1
0
    def plot_volcano(self):
        """Draw a volcano plot from ``self.df``.

        Each row is drawn at (``log2FoldChange``, ``-log10(padj)``). Rows with
        ``padj > 0.05`` are drawn in red and rows with ``padj <= 0.05`` in
        black; rows whose ``padj`` is NaN match neither query and are not
        drawn. The x-axis is made symmetric around zero, the y-axis starts at
        zero and a dashed red line marks ``-log10(0.05)``.

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_volcano()

        """
        d1 = self.df.query("padj>0.05")
        d2 = self.df.query("padj<=0.05")

        # Always start from a fresh figure.
        pylab.figure()
        for subset, color in ((d1, "r"), (d2, "k")):
            pylab.plot(subset.log2FoldChange, -np.log10(subset.padj),
                marker="o", alpha=0.5, color=color, lw=0)

        pylab.grid(True)
        pylab.xlabel("log2 fold change")
        pylab.ylabel("-log10 adjusted p-value")

        # Symmetric x-range. The pandas reduction skips NaN, whereas the
        # builtin min()/max() may return NaN and make xlim() fail.
        limit = self.df.log2FoldChange.abs().max()
        if np.isfinite(limit) and limit > 0:
            pylab.xlim([-limit, limit])

        # Keep the automatic upper bound but anchor the y-axis at zero.
        _, top = pylab.ylim()
        pylab.ylim([0, top])

        pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r", label="pvalue threshold (0.05)")
Esempio n. 2
0
    def plot_idr_vs_peaks(self, filename=None, savefig=False):
        """Plot IDR values against the cumulative number of peaks below them.

        The ``idr`` column of ``self.df`` is split at ``self.threshold``. For
        100 evenly spaced IDR values on each side of the threshold, the number
        of peaks with a strictly smaller ``idr`` is drawn on the x-axis: in
        red below the threshold and in black from the threshold up to 1. The
        black curve is shifted by the number of peaks below the threshold so
        that the two curves join. Dashed blue lines mark IDR = 0.05 and
        ``self.N_significant_peaks``.

        :param filename: path handed to ``pylab.savefig`` when *savefig* is
            true.
        :param savefig: if true, save the figure to *filename*.
        """
        pylab.clf()
        below_grid = pylab.linspace(0, self.threshold, 100)
        above_grid = pylab.linspace(self.threshold, 1, 100)

        below = self.df.query("idr<@self.threshold")
        above = self.df.query("idr>=@self.threshold")

        pylab.plot([(below['idr'] < x).sum() for x in below_grid],
                   below_grid, '-', color='r', lw=2)

        # Every peak below the threshold is already counted once the second
        # curve starts.
        shift = len(below)
        pylab.plot([shift + (above['idr'] < x).sum() for x in above_grid],
                   above_grid,
                   "-",
                   color='k',
                   lw=2)

        pylab.xlabel('Number of significant peaks')
        pylab.ylabel('IDR')
        pylab.axhline(0.05, color='b', ls='--')
        pylab.axvline(self.N_significant_peaks, color='b', ls='--')
        if savefig:
            pylab.savefig(filename)
Esempio n. 3
0
    def plot_polymerase_per_barcode(self, fontsize=12, unbarcoded=True):
        """Plot the number of polymerase reads per barcode, in rank order.

        The "Polymerase Reads" column of ``self.df_barcoded`` is sorted in
        decreasing order and plotted against its rank (starting at 1). A red
        horizontal line marks the mean of those values and the title reports
        their sum.

        :param fontsize: font size of the two axis labels.
        :param unbarcoded: if true, also draw a dashed black line at the
            "Polymerase Reads" value of the first row of
            ``self.df_not_barcoded``. The line is skipped when that value
            cannot be retrieved.
        """
        reads = self.df_barcoded["Polymerase Reads"]
        total = reads.sum()
        data = reads.sort_values(ascending=False).values

        pylab.plot(range(1, len(data) + 1), data, label="barcodes")
        pylab.axhline(data.mean(), color="r", label="average")

        if unbarcoded:
            # Best effort: the non-barcoded table may be absent, lack the
            # column, or be empty.
            try:
                unbar = self.df_not_barcoded['Polymerase Reads'].iloc[0]
            except (AttributeError, KeyError, IndexError, TypeError):
                unbar = None
            if unbar is not None:
                pylab.axhline(unbar, color="k", ls="--", label="not barcoded")

        pylab.xlabel("Barcode Rank Order", fontsize=fontsize)
        pylab.ylabel("Counts of Reads", fontsize=fontsize)
        pylab.title("Total Polymerase count: {}".format(total))
        pylab.legend()
        pylab.ylim(bottom=0)
        try:
            pylab.tight_layout()
        except Exception:
            # The layout adjustment is purely cosmetic; a failure must not
            # abort the plot, but KeyboardInterrupt/SystemExit still
            # propagate.
            pass
Esempio n. 4
0
    def plot_percentage_null_read_counts(self):
        """Bar plot of the percentage of null counts per sample.

        One bar per entry of ``self.sample_names`` gives the percentage of
        rows of ``self.df`` whose count is zero in that sample. The dashed
        horizontal line gives the percentage of rows whose counts sum to zero
        across all samples.

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_percentage_null_read_counts()

        """
        samples = self.sample_names
        positions = range(len(samples))
        n_features = len(self.df)

        # Per-sample percentage of zero counts.
        percent_null = (self.df[samples] == 0).sum()
        percent_null = percent_null / n_features * 100

        # Number of features with no count in any sample.
        n_all_null = (self.df[samples].sum(axis=1) == 0).sum()

        pylab.clf()
        pylab.bar(positions, percent_null)
        pylab.axhline(n_all_null / n_features * 100, lw=2, ls="--", color="k")
        pylab.xticks(positions, samples)
        pylab.xlabel("Sample")
Esempio n. 5
0
 def scatter_length_cov_gc(self, min_length=200, min_cov=10):
     """Log-log scatter plot of ``length`` versus ``cov``, coloured by ``GC``.

     All three columns are read from ``self.df``.

     :param min_length: x position of the dashed red vertical line.
     :param min_cov: y position of the dashed red horizontal line.
     """
     contigs = self.df
     guide_style = dict(lw=2, c="r", ls='--')

     pylab.clf()
     pylab.scatter(contigs.length, contigs['cov'], c=contigs.GC)
     pylab.loglog()

     # Reference lines for the two thresholds.
     pylab.axvline(min_length, **guide_style)
     pylab.axhline(min_cov, **guide_style)

     pylab.xlabel("contig length")
     pylab.ylabel("contig coverage")
     pylab.colorbar(label="GC")
     pylab.grid(True)
Esempio n. 6
0
    def plot_volcano_differences(self, mode="all"):
        """Overlay the volcano plots of two experiments and link shared features.

        The features listed in ``gene_lists[mode]`` of ``self.r1`` and
        ``self.r2`` are looked up in the respective ``df`` and drawn at
        (``log2FoldChange``, ``-log10(padj)``):

        * features selected in both experiments are drawn as circles (red for
          experiment 1, orange for experiment 2) joined by a black segment;
        * features selected in only one experiment are drawn as stars (cyan
          for experiment 1, blue for experiment 2) and joined by a red segment
          to the position of the same feature in the other experiment's full
          ``df`` (dashed when the feature is selected in experiment 2 only).

        A dashed red line marks ``-log10(0.05)``.

        :param mode: key used to index ``gene_lists`` of both experiments.
        :return: tuple ``(Aonly, Bonly, Acommon, Bcommon)`` of dataframes.
        :raises KeyError: if a feature selected in one experiment is missing
            from the other experiment's ``df``.
        """
        A = self.r1.df.loc[self.r1.gene_lists[mode]]
        B = self.r2.df.loc[self.r2.gene_lists[mode]]

        ids_a, ids_b = set(A.index), set(B.index)
        # Recent pandas versions refuse set indexers in .loc, hence the lists.
        common = list(ids_a & ids_b)
        Aonly = A.loc[list(ids_a - ids_b)]
        Bonly = B.loc[list(ids_b - ids_a)]
        Acommon = A.loc[common]
        Bcommon = B.loc[common]

        marker_style = dict(alpha=0.5, lw=0, pickradius=4, picker=True)

        pylab.clf()
        pylab.plot(Acommon.log2FoldChange, -np.log10(Acommon.padj), marker="o",
            color="r", label="Common in experiment 1", **marker_style)
        pylab.plot(Bcommon.log2FoldChange, -np.log10(Bcommon.padj), marker="o",
            color="orange", label="Common in experiment 2", **marker_style)

        # Join the two positions of every shared feature.
        for name in common:
            row_a = A.loc[name]
            row_b = B.loc[name]
            pylab.plot([row_a.log2FoldChange, row_b.log2FoldChange],
                       [-np.log10(row_a.padj), -np.log10(row_b.padj)],
                       'k', alpha=0.5)

        pylab.plot(Bonly.log2FoldChange, -np.log10(Bonly.padj), marker="*",
            color="blue", label="In experiment 2 only", **marker_style)
        pylab.plot(Aonly.log2FoldChange, -np.log10(Aonly.padj), marker="*",
            color="cyan", label="In experiment 1 only", **marker_style)

        # Link each feature selected in one experiment only to where it sits
        # in the other experiment's complete table.
        for only, other_df, linestyle in ((Bonly, self.r1.df, "--"),
                                          (Aonly, self.r2.df, "-")):
            for name, row in only.iterrows():
                other = other_df.loc[name]
                pylab.plot([row.log2FoldChange, other.log2FoldChange],
                           [-np.log10(row.padj), -np.log10(other.padj)],
                           ls=linestyle, color='r')

        # Adjusted p-value threshold of 0.05 on the -log10 scale (about 1.30).
        pylab.axhline(-np.log10(0.05), alpha=0.5, ls="--", color="r")

        pylab.xlabel("log2 fold Change")
        pylab.ylabel("-log10 adjusted p-values")
        pylab.legend()
        pylab.grid(True)

        return Aonly, Bonly, Acommon, Bcommon
Esempio n. 7
0
 def plot(self, clf=True):
     """Plot ``shustring_length`` from ``self.df_shustring``.

     The maximum length is printed, and dashed grey horizontal lines are
     drawn every 1000 units from 0 up to that maximum.

     :param clf: if true, clear the current figure before plotting.
     """
     if clf:
         pylab.clf()

     lengths = self.df_shustring.shustring_length
     longest = lengths.max()
     print(longest)

     # One guide line per started block of 1000.
     n_guides = int(longest / 1000) + 1
     for height in range(0, n_guides * 1000, 1000):
         pylab.axhline(height, ls='--', color='grey')

     pylab.plot(lengths)
     pylab.xlabel('position (bp)')
     pylab.ylabel('Length of repeats')
     pylab.ylim(bottom=0)
Esempio n. 8
0
    def plot(self,
             X=(0, 0.1, 0.2, 0.3, .4, .5, .6, .7, .8, .9, .95, .99, .999, 1),
             fontsize=16,
             label=None):
        """Plot the percentage of genes whose coverage exceeds each threshold.

        For every threshold ``x`` in *X*, the y value is the percentage of
        rows of ``self.df`` whose ``self.coverage_column`` value is strictly
        greater than ``x``. Thresholds are multiplied by 100 for display on
        the x-axis. Dashed red guide lines are drawn at 25, 50 and 75 on both
        axes.

        :param X: sequence of coverage thresholds, compared directly with the
            coverage column.
        :param fontsize: font size of the axis labels.
        :param label: optional legend label for the curve.
        """
        coverage = self.df[self.coverage_column]
        N = float(len(self.df))
        X = np.array(X)
        Y = np.array([(coverage > x).sum() / N * 100 for x in X])

        # Only forward the label when the caller provided one.
        plot_kwargs = {} if label is None else {"label": label}
        pylab.plot(X * 100, Y, "o-", **plot_kwargs)

        pylab.xlabel("Gene coverage (%)", fontsize=fontsize)
        pylab.ylabel("Percentage of genes covered", fontsize=fontsize)
        for this in [25, 50, 75]:
            pylab.axhline(this, color="r", alpha=0.5, ls="--")
            pylab.axvline(this, color="r", alpha=0.5, ls="--")
Esempio n. 9
0
    def plot_percentage_null_read_counts(self):
        """Bar plot of the percentage of null counts per sample.

        One bar per entry of ``self.sample_names`` gives the percentage of
        rows of ``self.df`` whose count is zero in that sample. Each bar takes
        the colour stored in ``self.colors`` for the condition returned by
        ``self.get_cond_from_sample``. The dashed horizontal line gives the
        percentage of rows whose counts sum to zero across all samples.

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_percentage_null_read_counts()

        """
        samples = self.sample_names
        positions = range(len(samples))
        n_features = len(self.df)

        # Per-sample percentage of zero counts.
        percent_null = (self.df[samples] == 0).sum()
        percent_null = percent_null / n_features * 100

        # Number of features with no count in any sample.
        n_all_null = (self.df[samples].sum(axis=1) == 0).sum()

        bar_colors = [
            self.colors[self.get_cond_from_sample(sample)]
            for sample in samples
        ]

        pylab.clf()
        pylab.bar(positions, percent_null,
            color=bar_colors, alpha=1,
            zorder=10, lw=1, ec="k", width=0.9)
        pylab.axhline(n_all_null / n_features * 100, lw=2, ls="--",
            color="k", zorder=20)
        pylab.xticks(positions, samples)
        pylab.xlabel("Sample")
        pylab.ylabel("Proportion of null counts (%)")
        pylab.grid(True, zorder=0)
Esempio n. 10
0
    def plot_percentage_null_read_counts(self):
        """Bar plot of the percentage of null counts per sample.

        One bar per column of ``self.counts_raw`` gives the percentage of rows
        whose count is zero in that sample; bars are coloured using the
        ``group_color`` column of ``self.design_df``. The dashed horizontal
        line gives the percentage of rows that are zero in every sample,
        expressed on the same 0-100 scale as the bars.

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_percentage_null_read_counts()

        """
        pylab.clf()

        is_null = self.counts_raw == 0
        n_features = self.counts_raw.shape[0]

        # Percentage of null counts per sample, joined to the design table so
        # that each sample carries its group colour.
        df = is_null.sum() / n_features * 100
        df = df.rename("percent_null")
        df = pd.concat([self.design_df, df], axis=1)

        pylab.bar(df.index,
                  df.percent_null,
                  color=df.group_color,
                  ec="k",
                  lw=1,
                  zorder=10)

        # Features null in all samples, as a percentage: the y-axis is in
        # percent, so a bare fraction would sit almost on the x-axis.
        all_null = is_null.all(axis=1).sum() / n_features * 100

        pylab.axhline(all_null, ls="--", color="black", alpha=0.5)

        pylab.xticks(rotation=45, ha="right")
        pylab.ylabel("Proportion of null counts (%)")
        pylab.grid(True, zorder=0)
        pylab.tight_layout()
Esempio n. 11
0
 def plot_RSC(self):
     """Bar plot of the ``RSC`` column of ``self.df``.

     A dashed red horizontal line is drawn at 0.8.
     """
     rsc = self.df.RSC
     rsc.plot(kind='bar')
     pylab.axhline(0.8, color='r', ls='--', lw=2)
Esempio n. 12
0
    def plot_common_major_counts(self, mode, labels=None,
            switch_up_down_cond2=False, add_venn=True, xmax=None,
            title="", fontsize=12, sortby="log2FoldChange"):
        """Plot the percentage of shared features among the top-ranked ones.

        The features listed in ``gene_lists[mode]`` of ``self.r1`` and
        ``self.r2`` are each sorted by *sortby* (ascending for ``"down"``,
        descending otherwise). For every rank ``i``, the curve gives the
        percentage of features common to the first ``i`` entries of both
        rankings. The dashed red line is the overall intersection relative to
        the larger list, and the dashed black line marks the size of the
        smaller list. A secondary y-axis shows the sorted *sortby* values of
        the first experiment.

        :param mode: down, up or all
        :param labels: two labels passed to the Venn diagram; defaults to
            ``['r1', 'r2']``.
        :param switch_up_down_cond2: currently unused.
        :param add_venn: if true, add an inset Venn diagram for *mode*.
        :param xmax: optional upper limit of the x-axis; defaults to the size
            of the larger list.
        :param title: figure title.
        :param fontsize: font size of the axis labels and title.
        :param sortby: column used to rank the features.

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana.compare import RNADiffCompare

            c = RNADiffCompare(
                sequana_data("rnadiff/rnadiff_onecond_1"),
                sequana_data("rnadiff/rnadiff_onecond_2"))
            c.plot_common_major_counts("down")
        """
        if labels is None:
            labels = ['r1', 'r2']

        # Down-regulated features have negative values, so the strongest come
        # first in ascending order; every other mode is sorted descending.
        ascending = mode == "down"
        gl1 = set(self.r1.gene_lists[mode])
        gl2 = set(self.r2.gene_lists[mode])
        # Each experiment is ranked on its own list; .loc needs a list since
        # recent pandas versions refuse set indexers.
        A = self.r1.df.loc[list(gl1)].sort_values(by=sortby,
            ascending=ascending)
        B = self.r2.df.loc[list(gl2)].sort_values(by=sortby,
            ascending=ascending)

        largest = max(len(A), len(B))
        smallest = min(len(A), len(B))

        # Percentage of shared features within the top-i of both rankings,
        # up to and including the last rank of the larger list.
        N = []
        for i in range(1, largest + 1):
            top_a = set(A.index[:i])
            top_b = set(B.index[:i])
            N.append(len(top_a & top_b) / i * 100)

        max_common = len(set(A.index).intersection(set(B.index)))
        pylab.clf()
        if largest:
            # Skipped when both lists are empty to avoid dividing by zero.
            pylab.axhline(max_common / largest * 100, color="r", ls='--',
                label="min set intersection")
        pylab.axvline(smallest, ls="--", color="k", label="rank of minor set")

        pylab.plot(N)
        pylab.xlabel('rank', fontsize=fontsize)
        pylab.ylabel('% common features', fontsize=fontsize)
        pylab.grid(True)
        pylab.ylim([0,100])
        if xmax:
            pylab.xlim([0, xmax])
        else:
            pylab.xlim([0, largest])
        pylab.title(title, fontsize=fontsize)

        # Secondary axis with the ranking values of the first experiment.
        ax = pylab.gca()
        ax2 = ax.twinx()
        ax2.plot(A[sortby].values, "orange", label=sortby)
        ax2.set_ylabel(sortby)
        pylab.legend(loc="lower left")
        ax.legend(loc="lower right")

        if add_venn:
            f = pylab.gcf()
            ax = f.add_axes([0.5,0.5,0.35,0.35], facecolor="grey")
            if mode=="down":
                self.plot_venn_down(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="up":
                self.plot_venn_up(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="all":
                self.plot_venn_all(ax=ax, title=None, labels=labels,
                    mode="two_only")