Exemple #1
0
    def hist_concordance(self, method, bins=100, fontsize=16):
        """Plot a histogram of the concordance values.

        Concordance formula, as documented by the original author::

            1 - (in + del + mismatch) / (in + del + mismatch + match)

        For BWA and BLASR, the get_cigar_stats are different !!!
        BWA for instance has no X stored while Pacbio forbids the use of the M
        (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

        Subread Accuracy: The post-mapping accuracy of the basecalls.
        Formula: [1 - (errors/subread length)], where errors = number of
        deletions + insertions + substitutions.

        The mean is drawn as a solid red vertical line and the median as a
        dashed red vertical line.

        :param method: forwarded to ``self._set_concordance`` when the
            concordance has not been computed yet.
        :param bins: number of bins of the histogram.
        :param fontsize: font size of the x-axis label.
        """
        # The concordance is cached in ``self._concordance``; only a missing
        # attribute should trigger the computation, any other error must
        # propagate instead of being silently swallowed.
        try:
            concordance = self._concordance
        except AttributeError:
            self._set_concordance(method)
            concordance = self._concordance

        pylab.hist(concordance, bins=bins)
        pylab.grid()
        mu = np.mean(concordance)
        median = np.median(concordance)
        pylab.axvline(mu, color='r', alpha=0.5)
        pylab.axvline(median, color='r', alpha=0.5, ls="--")
        pylab.xlabel("concordance", fontsize=fontsize)
Exemple #2
0
    def plot_idr_vs_peaks(self, filename=None, savefig=False):
        """Plot the IDR value as a function of the number of peaks.

        Peaks whose ``idr`` is below ``self.threshold`` are drawn in red,
        the remaining ones in black. A horizontal dashed line marks
        IDR=0.05 and a vertical dashed line marks
        ``self.N_significant_peaks``.

        :param filename: output file, only used when *savefig* is True.
        :param savefig: if True, save the figure into *filename*.
        """
        # NOTE(review): the original comment stated that "global_idr is
        # actually -log10(idr)"; only the ``idr`` column is used here.
        pylab.clf()
        X1 = pylab.linspace(0, self.threshold, 100)
        X2 = pylab.linspace(self.threshold, 1, 100)

        # Split the peaks on both sides of the threshold. The second query
        # must be the exact complement of the first one (>=).
        df1 = self.df.query("idr<@self.threshold")
        df2 = self.df.query("idr>=@self.threshold")

        # Cumulative number of peaks below each IDR value (vectorised count)
        pylab.plot([(df1['idr'] < x).sum() for x in X1],
                   X1,
                   '-',
                   color='r',
                   lw=2)

        # Peaks above the threshold are counted on top of all those below it
        shift = len(df1)
        pylab.plot([shift + (df2['idr'] < x).sum() for x in X2],
                   X2,
                   "-",
                   color='k',
                   lw=2)
        pylab.xlabel('Number of significant peaks')
        pylab.ylabel('IDR')
        pylab.axhline(0.05, color='b', ls='--')
        pylab.axvline(self.N_significant_peaks, color='b', ls='--')
        if savefig:
            pylab.savefig(filename)
Exemple #3
0
    def hist_concordance(self,  bins=100, fontsize=16):
        """Plot a histogram of the concordance values.

        Concordance formula, as documented by the original author::

            1 - (in + del + mismatch) / (in + del + mismatch + match)

        For BWA and BLASR, the get_cigar_stats are different !!!
        BWA for instance has no X stored while Pacbio forbids the use of the M
        (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

        Subread Accuracy: The post-mapping accuracy of the basecalls.
        Formula: [1 - (errors/subread length)], where errors = number of
        deletions + insertions + substitutions.

        The mean is drawn as a solid red vertical line and the median as a
        dashed red vertical line.

        :param bins: number of bins of the histogram.
        :param fontsize: font size of the x-axis label.
        """
        # The concordance is cached in ``self._concordance``; only a missing
        # attribute should trigger the computation, any other error must
        # propagate instead of being silently swallowed.
        try:
            concordance = self._concordance
        except AttributeError:
            self._set_concordance()
            concordance = self._concordance

        pylab.hist(concordance, bins=bins)
        pylab.grid()
        mu = np.mean(concordance)
        median = np.median(concordance)
        pylab.axvline(mu, color='r', alpha=0.5)
        pylab.axvline(median, color='r', alpha=0.5, ls="--")
        pylab.xlabel("concordance", fontsize=fontsize)
Exemple #4
0
    def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=[]):
        """Scatter plot of the enriched gene sets.

        Each gene set is one point: the x-axis is ``-log10`` of its adjusted
        p-value, the y-axis its position in the table, and both the marker
        size and colour follow the ``size`` column.

        :param enrich: object whose ``results`` attribute is passed to
            ``self._get_final_df``.
        :param cutoff: forwarded to ``self._get_final_df``.
        :param nmax: forwarded to ``self._get_final_df``.
        :param gene_set_size: currently unused; kept for backward
            compatibility.
        :return: the dataframe returned by ``self._get_final_df``.
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        pylab.scatter(-pylab.log10(df['Adjusted P-value']),
                      range(len(df)),
                      s=10 * df['size'],
                      c=df['size'])

        # The x values are -log10(adjusted p-value), not an odds ratio
        pylab.xlabel("Adjusted p-value (-log10)")
        pylab.ylabel("Gene sets")
        pylab.yticks(range(len(df)), df.name)
        _, xmax = pylab.xlim()
        pylab.xlim([0, xmax])
        pylab.grid(True)
        ax = pylab.gca()

        # Three reference sizes for the legend
        M = max(df['size'])
        if M > 100:
            l1, l2, l3 = "10", "100", str(M)
        else:
            l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

        handles = [
            pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls="")
        ]
        ax.legend(handles=handles, loc="upper left", title="gene-set size")

        # 1.3 is about -log10(0.05)
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.tight_layout()
        pylab.colorbar(pylab.gci())
        return df
Exemple #5
0
 def scatter_length_cov_gc(self, min_length=200, min_cov=10):
     """Scatter the ``cov`` column against the ``length`` column of
     ``self.df`` on log-log axes, coloured by the ``GC`` column.

     :param min_length: position of the vertical dashed red line.
     :param min_cov: position of the horizontal dashed red line.
     """
     contigs = self.df
     # Both cutoff lines share the same appearance
     cutoff_style = {"lw": 2, "c": "r", "ls": '--'}

     pylab.clf()
     pylab.scatter(contigs.length, contigs['cov'], c=contigs.GC)
     pylab.loglog()
     pylab.axvline(min_length, **cutoff_style)
     pylab.axhline(min_cov, **cutoff_style)
     pylab.ylabel("contig coverage")
     pylab.xlabel("contig length")
     pylab.colorbar(label="GC")
     pylab.grid(True)
Exemple #6
0
    def barplot(self, enrich, cutoff=0.05, nmax=10):
        """Horizontal bar plot of the enriched gene sets.

        One bar per row of the table, whose length is ``-log10`` of the
        adjusted p-value.

        :param enrich: object whose ``results`` attribute is passed to
            ``self._get_final_df``.
        :param cutoff: forwarded to ``self._get_final_df``.
        :param nmax: forwarded to ``self._get_final_df``.
        :return: the dataframe returned by ``self._get_final_df``.
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)
        positions = range(len(df))
        scores = -pylab.log10(df['Adjusted P-value'])

        pylab.clf()
        pylab.barh(positions, scores)
        pylab.yticks(positions, df.name)
        # 1.3 is about -log10(0.05)
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.grid(True)
        pylab.ylabel("Gene sets")
        pylab.xlabel("Adjusted p-value (log10)")
        # Anchor the x-axis at zero while keeping the automatic upper bound
        upper = pylab.xlim()[1]
        pylab.xlim([0, upper])
        pylab.tight_layout()
        return df
Exemple #7
0
    def plot(self,
             X=(0, 0.1, 0.2, 0.3, .4, .5, .6, .7, .8, .9, .95, .99, .999, 1),
             fontsize=16,
             label=None):
        """Plot the percentage of genes whose coverage exceeds each threshold.

        For every threshold ``x`` in *X*, the y value is the percentage of
        rows of ``self.df`` whose ``self.coverage_column`` value is strictly
        greater than ``x``. Thresholds are shown in percent on the x-axis.
        Dashed red guide lines are drawn at 25, 50 and 75 on both axes.

        :param X: coverage thresholds. Presumably fractions in [0, 1], since
            they are multiplied by 100 for display; confirm against the
            content of the coverage column.
        :param fontsize: font size of the axis labels.
        :param label: optional legend label of the curve.
        """
        icol = self.coverage_column
        N = float(len(self.df))
        X = np.array(X)
        coverage = self.df[icol]
        # Vectorised count of the genes above each threshold
        Y = np.array([(coverage > x).sum() / N * 100 for x in X])

        # Only forward ``label`` when the caller provided one
        kwargs = {} if label is None else {"label": label}
        pylab.plot(X * 100, Y, "o-", **kwargs)

        pylab.xlabel("Gene coverage (%)", fontsize=fontsize)
        pylab.ylabel("Percentage of genes covered", fontsize=fontsize)
        for this in [25, 50, 75]:
            pylab.axhline(this, color="r", alpha=0.5, ls="--")
            pylab.axvline(this, color="r", alpha=0.5, ls="--")
Exemple #8
0
    def plot_common_major_counts(self, mode, labels=None,
            switch_up_down_cond2=False, add_venn=True, xmax=None, 
            title="", fontsize=12, sortby="log2FoldChange"):
        """Plot the percentage of common features among the top-ranked genes
        of the two analyses ``self.r1`` and ``self.r2``.

        The genes of each analysis are sorted by *sortby*; for each rank
        ``i`` the curve shows the percentage of genes shared by the two
        top-``i`` lists. The *sortby* values of the first analysis are
        drawn on a secondary y-axis.

        :param mode: down, up or all (key of the ``gene_lists`` attribute
            of both analyses)
        :param labels: the two labels used in the Venn diagram (defaults
            to ``['r1', 'r2']``).
        :param switch_up_down_cond2: currently unused.
        :param add_venn: if True, add a Venn diagram as an inset.
        :param xmax: upper limit of the x-axis (defaults to the size of the
            largest gene list).
        :param title: title of the figure.
        :param fontsize: font size of the axis labels and title.
        :param sortby: column used to rank the genes.

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana.compare import RNADiffCompare

            c = RNADiffCompare(
                sequana_data("rnadiff/rnadiff_onecond_1"),
                sequana_data("rnadiff/rnadiff_onecond_2"))
            c.plot_common_major_counts("down")
        """
        if labels is None:
            labels = ['r1', 'r2']

        # For "down" the values are negative (per the original comment), so
        # sorting in ascending order puts the strongest ones first.
        ascending = mode in ["down"]

        # Remove duplicates while keeping a deterministic order; a list is
        # required because pandas does not accept a set as a .loc indexer.
        gl1 = list(dict.fromkeys(self.r1.gene_lists[mode]))
        gl2 = list(dict.fromkeys(self.r2.gene_lists[mode]))

        # Each analysis is restricted to its OWN gene list
        A = self.r1.df.loc[gl1].sort_values(by=sortby, ascending=ascending)
        B = self.r2.df.loc[gl2].sort_values(by=sortby, ascending=ascending)
        # sometimes, up and down may be inverted as compared to the other
        # conditions

        # Percentage of features shared by the two top-i lists, for every
        # rank i from 1 up to (and including) the size of the largest list
        n_max = max(len(A), len(B))
        ranks = list(range(1, n_max + 1))
        N = []
        for i in ranks:
            a = A.iloc[0:i].index
            b = B.iloc[0:i].index
            n = len(set(b).intersection(set(a)))
            N.append(n / i * 100)

        max_common = len(set(A.index).intersection(set(B.index)))
        pylab.clf()
        if len(A) > len(B):
            pylab.axhline(max_common/len(A)*100, color="r", ls='--', label="min set intersection")
            pylab.axvline(len(B), ls="--", color="k", label="rank of minor set")
        else:
            pylab.axhline(max_common/len(B)*100, color='r', ls='--', label="min set intersect")
            pylab.axvline(len(A), ls="--", color="k", label="rank of minor set")

        # Plot against the actual ranks so that the curve is aligned with
        # the vertical "rank of minor set" line
        pylab.plot(ranks, N)
        pylab.xlabel('rank', fontsize=fontsize)
        pylab.ylabel('% common features', fontsize=fontsize)
        pylab.grid(True)
        pylab.ylim([0,100])
        if xmax:
            pylab.xlim([0, xmax])
        else:
            pylab.xlim([0, n_max])
        pylab.title(title, fontsize=fontsize)

        # Secondary axis: ranking metric of the first analysis
        ax = pylab.gca()
        ax2 = ax.twinx()
        ax2.plot(range(1, len(A) + 1), A[sortby].values, color="orange",
                 label=sortby)
        ax2.set_ylabel(sortby)
        pylab.legend(loc="lower left")
        ax.legend(loc="lower right")

        if add_venn:
            f = pylab.gcf()
            ax = f.add_axes([0.5,0.5,0.35,0.35], facecolor="grey")
            if mode=="down":
                self.plot_venn_down(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="up":
                self.plot_venn_up(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="all":
                self.plot_venn_all(ax=ax, title=None, labels=labels,
                    mode="two_only")
Exemple #9
0
    def plot_go_terms(self,
                      ontologies,
                      max_features=50,
                      log=False,
                      fontsize=8,
                      minimum_genes=0,
                      pvalue=0.05,
                      cmap="summer_r",
                      sort_by="fold_enrichment",
                      show_pvalues=False,
                      include_negative_enrichment=False,
                      fdr_threshold=0.05,
                      compute_levels=True,
                      progress=True):
        """Scatter plot of the enriched GO terms.

        One point per GO term: x is the fold enrichment (or its log2), y is
        the position of the term, the colour is the FDR and the marker size
        follows the ``number_in_list`` column.

        :param ontologies: forwarded to ``self.get_data``.
        :param max_features: maximum number of GO terms shown.
        :param log: if True, plot the log2 of the fold enrichment.
        :param fontsize: font size of the y tick labels.
        :param minimum_genes: keep GO terms whose ``number_in_list`` is
            strictly greater than this value.
        :param pvalue: keep GO terms whose ``pValue`` is at most this value.
        :param cmap: colormap used for the FDR.
        :param sort_by: one of 'pValue', 'fold_enrichment' or 'fdr'.
        :param show_pvalues: if True, overlay the -log10 p-values on a
            secondary x-axis.
        :param include_negative_enrichment: forwarded to ``self.get_data``;
            also makes the x-axis limits symmetric around zero.
        :param fdr_threshold: forwarded to ``self.get_data`` as ``fdr`` and
            used as the upper bound of the colour scale.
        :param compute_levels: if True, call ``self.get_graph`` to annotate
            each GO term with its level.
        :param progress: forwarded to ``self.get_graph``.
        :return: the dataframe filtered on *pvalue* (sorted according to
            *sort_by* when at least one GO term was retrieved). When a plot
            is attempted, it is also stored in ``self.df`` and the plotted
            subset in ``self.subdf``.
        """
        assert sort_by in ['pValue', 'fold_enrichment', 'fdr']

        # FIXME: pvalue and fold_enrichment not sorted in same order
        pylab.clf()

        df = self.get_data(
            ontologies,
            include_negative_enrichment=include_negative_enrichment,
            fdr=fdr_threshold)

        if len(df) == 0:
            return df

        df = df.query("pValue<=@pvalue")
        logger.info("Filtering out pvalue>{}. Kept {} GO terms".format(
            pvalue, len(df)))
        df = df.reset_index(drop=True)

        # Select a subset of the data to keep the best max_features in terms of
        # pValue
        subdf = df.query("number_in_list>@minimum_genes").copy()
        logger.info(
            "Filtering out GO terms with less than {} genes: Kept {} GO terms".
            format(minimum_genes, len(subdf)))

        logger.info("Filtering out the 3 parent terms")
        subdf = subdf.query("id not in @self.ontologies")

        # Keeping only a part of the data, sorting by pValue
        if sort_by == "pValue":
            subdf = subdf.sort_values(by="pValue",
                                      ascending=False).iloc[-max_features:]
            df = df.sort_values(by="pValue", ascending=False)
        elif sort_by == "fold_enrichment":
            subdf = subdf.sort_values(by="abs_log2_fold_enrichment",
                                      ascending=True).iloc[-max_features:]
            df = df.sort_values(by="abs_log2_fold_enrichment", ascending=False)
        elif sort_by == "fdr":
            subdf = subdf.sort_values(by="fdr",
                                      ascending=False).iloc[-max_features:]
            df = df.sort_values(by="fdr", ascending=False)

        subdf = subdf.reset_index(drop=True)

        # The filters above may leave nothing to plot; the size computation
        # below would then divide by zero, so stop here.
        if len(subdf) == 0:
            logger.warning("No GO terms left after filtering; nothing to plot")
            self.subdf = subdf
            self.df = df
            return df

        # We get all levels for each go id.
        # They are stored by MF, CC or BP
        if compute_levels:
            paths = self.get_graph(list(subdf['id'].values), progress=progress)
            keys = list(paths.keys())
            goid_levels = paths[keys[0]]
            if len(keys) > 1:
                for k in keys[1:]:
                    goid_levels.update(paths[k])
            levels = [goid_levels[ID] for ID in subdf['id'].values]
            subdf["level"] = levels
        else:
            subdf['level'] = ""
        N = len(subdf)

        # Marker sizes proportional to the number of genes, with a lower
        # bound so that small gene sets remain visible
        size_factor = 12000 / len(subdf)
        max_size = subdf.number_in_list.max()
        min_size = subdf.number_in_list.min()
        sizes = [
            max(max_size * 0.2, x) for x in size_factor *
            subdf.number_in_list.values / subdf.number_in_list.max()
        ]

        # Smallest, intermediate and largest marker sizes, for the legend
        m1 = min(sizes)
        m3 = max(sizes)
        m2 = m1 + (m3 - m1) / 2

        if log:
            pylab.scatter(pylab.log2(subdf.fold_enrichment),
                          range(len(subdf)),
                          c=subdf.fdr,
                          s=sizes,
                          cmap=cmap,
                          alpha=0.8,
                          ec="k",
                          vmin=0,
                          vmax=fdr_threshold,
                          zorder=10)
        else:
            pylab.scatter(subdf.fold_enrichment,
                          range(len(subdf)),
                          c=subdf.fdr,
                          cmap=cmap,
                          s=sizes,
                          ec="k",
                          alpha=.8,
                          vmin=0,
                          vmax=fdr_threshold,
                          zorder=10)
        pylab.grid(zorder=-10)
        ax2 = pylab.colorbar(shrink=0.5)
        ax2.ax.set_ylabel('FDR')

        # Truncate long labels to 50 characters
        labels = [
            x if len(x) < 50 else x[0:47] + "..." for x in list(subdf.label)
        ]
        ticks = [
            "{} ({}) {}".format(ID, level, "; " + label.title())
            for level, ID, label in zip(subdf['level'], subdf.id, labels)
        ]

        pylab.yticks(range(N), ticks, fontsize=fontsize, ha='left')

        yax = pylab.gca().get_yaxis()
        # Best-effort padding from the rendered label widths; the tick API
        # used here may not be available, hence the fallback.
        try:
            pad = [x.label.get_window_extent().width for x in yax.majorTicks]
            yax.set_tick_params(pad=max(pad))
        except Exception:
            yax.set_tick_params(pad=60 * fontsize * 0.7)
        # NOTE(review): this unconditionally overrides the padding computed
        # just above; kept as is to preserve the current layout.
        yax.set_tick_params(pad=60 * fontsize * 0.6)

        fc_max = subdf.fold_enrichment.max(skipna=True)
        fc_min = subdf.fold_enrichment.min(skipna=True)
        # go into log2 space
        fc_max = pylab.log2(fc_max)
        fc_min = pylab.log2(fc_min)
        abs_max = max(fc_max, abs(fc_min), 1)

        if log:
            fc_max = abs_max * 1.5
        else:
            fc_max = 2**abs_max * 1.2

        pylab.axvline(0, color="k", lw=2)
        if log:
            pylab.xlabel("Fold Enrichment (log2)")
        else:
            pylab.xlabel("Fold Enrichment")
        if include_negative_enrichment:
            pylab.xlim([-fc_max, fc_max])
        else:
            pylab.xlim([0, fc_max])
        pylab.tight_layout()

        # The pvalue:
        if show_pvalues:
            ax = pylab.gca().twiny()
            ax.set_xlim([0, max(-pylab.log10(subdf.pValue)) * 1.2])
            ax.set_xlabel("p-values (log10)", fontsize=12)
            ax.plot(-pylab.log10(subdf.pValue),
                    range(len(subdf)),
                    label="pvalue",
                    lw=2,
                    color="k")
            ax.axvline(1.33, lw=1, ls="--", color="grey", label="pvalue=0.05")
            pylab.tight_layout()
            pylab.legend(loc="lower right")

        # Empty scatters used only as legend handles for the marker sizes
        s1 = pylab.scatter([], [], s=m1, marker='o', color='#555555', ec="k")
        s2 = pylab.scatter([], [], s=m2, marker='o', color='#555555', ec="k")
        s3 = pylab.scatter([], [], s=m3, marker='o', color='#555555', ec="k")

        # The legend spacing depends on the number of GO terms shown
        if len(subdf) < 10:
            labelspacing = 1.5 * 4
            borderpad = 4
            handletextpad = 2
        elif len(subdf) < 20:
            labelspacing = 1.5 * 2
            borderpad = 1
            handletextpad = 2
        else:
            labelspacing = 1.5
            borderpad = 2
            handletextpad = 2

        # With fewer than 3 GO terms a single reference size is shown
        if len(subdf) >= 3:
            legend_handles = (s1, s2, s3)
            legend_labels = (str(int(min_size)),
                             str(int(min_size + (max_size - min_size) / 2)),
                             str(int(max_size)))
        else:
            legend_handles = (s1, )
            legend_labels = (str(int(min_size)), )

        leg = pylab.legend(legend_handles,
                           legend_labels,
                           scatterpoints=1,
                           loc='lower right',
                           ncol=1,
                           frameon=True,
                           title="gene-set size",
                           labelspacing=labelspacing,
                           borderpad=borderpad,
                           handletextpad=handletextpad,
                           fontsize=8)

        frame = leg.get_frame()
        frame.set_facecolor('#b4aeae')
        frame.set_edgecolor('black')
        frame.set_alpha(1)

        self.subdf = subdf
        self.df = df
        return df