Example #1
    def plot_volcano(self):
        """
        .. plot::
            :include-source:
    
            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_volcano()

        """
        d1 = self.df.query("padj>0.05")
        d2 = self.df.query("padj<=0.05")

        fig = pylab.figure()
        pylab.plot(d1.log2FoldChange, -np.log10(d1.padj), marker="o",
            alpha=0.5, color="r", lw=0)
        pylab.plot(d2.log2FoldChange, -np.log10(d2.padj), marker="o",
            alpha=0.5, color="k", lw=0)

        pylab.grid(True)
        pylab.xlabel("fold change")
        pylab.ylabel("log10 adjusted p-value")
        m1 = abs(min(self.df.log2FoldChange))
        m2 = max(self.df.log2FoldChange)
        limit = max(m1,m2)
        pylab.xlim([-limit, limit])
        y1,y2 = pylab.ylim()
        pylab.ylim([0,y2])

        pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r", label="pvalue threshold (0.05)")
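
The key trick above is the symmetric x-range around zero plus the dashed significance line at -log10(0.05). A minimal standalone sketch of the same volcano layout, using invented log2 fold changes and adjusted p-values (only numpy and matplotlib are assumed, not the sequana classes):

import numpy as np
import pylab

# invented differential-expression results, for illustration only
rng = np.random.default_rng(42)
log2fc = rng.normal(0, 2, size=500)
padj = rng.uniform(1e-6, 1, size=500)

sig = padj <= 0.05
pylab.plot(log2fc[~sig], -np.log10(padj[~sig]), marker="o", alpha=0.5, color="r", lw=0)
pylab.plot(log2fc[sig], -np.log10(padj[sig]), marker="o", alpha=0.5, color="k", lw=0)

# symmetric x-limits around zero, as in plot_volcano above
limit = max(abs(log2fc.min()), log2fc.max())
pylab.xlim([-limit, limit])
pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r", label="pvalue threshold (0.05)")
pylab.xlabel("fold change")
pylab.ylabel("log10 adjusted p-value")
pylab.grid(True)
pylab.legend()
pylab.show()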
Example #2
    def histogram_sequence_lengths(self, logy=True):
        """Histogram sequence lengths

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.histogram_sequence_lengths()

        """
        data = [len(x) for x in self.sequences]
        bary, barx = np.histogram(data, bins=range(max(data)+1))

        # get rid of zeros to avoid warnings
        bx = [x for x,y in zip(barx, bary) if y!=0]
        by = [y for x,y in zip(barx, bary) if y!=0]
        if logy:
            pylab.bar(bx, pylab.log10(by))
        else:
            pylab.bar(bx, by)

        pylab.xlim([1,max(data)+1])

        pylab.grid(True)
        pylab.xlabel("position (bp)", fontsize=self.fontsize)
        pylab.ylabel("Count (log scale)", fontsize=self.fontsize)
Example #3
    def plot(self,
             color_line='r',
             bgcolor='grey',
             color='yellow',
             lw=4,
             hold=False,
             ax=None):

        xmax = self.xmax + 1
        if ax:
            pylab.sca(ax)
        pylab.fill_between([0, xmax], [0, 0], [20, 20], color='red', alpha=0.3)
        pylab.fill_between([0, xmax], [20, 20], [30, 30],
                           color='orange',
                           alpha=0.3)
        pylab.fill_between([0, xmax], [30, 30], [41, 41],
                           color='green',
                           alpha=0.3)

        # if X coordinates are not provided, default to 1..xmax
        if self.X is None:
            X = range(1, self.xmax + 1)
        else:
            X = self.X

        pylab.fill_between(X,
                           self.df.mean() + self.df.std(),
                           self.df.mean() - self.df.std(),
                           color=color,
                           interpolate=False)

        pylab.plot(X, self.df.mean(), color=color_line, lw=lw)
        pylab.ylim([0, 41])
        pylab.xlim([0, self.xmax + 1])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Quality")
        pylab.grid(axis='x')
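
The banded background (red/orange/green) plus a mean curve with a one-standard-deviation ribbon is the core of this plot. A standalone sketch on invented per-position quality statistics (numpy and matplotlib only; the decay/spread formulas below are arbitrary):

import numpy as np
import pylab

# invented per-position quality statistics (arbitrary decay and spread)
xmax = 150
X = np.arange(1, xmax + 1)
mean_q = 38 - 8 * X / xmax
std_q = 1 + 2 * X / xmax

# red/orange/green background bands, as in the plot() method above
pylab.fill_between([0, xmax + 1], [0, 0], [20, 20], color='red', alpha=0.3)
pylab.fill_between([0, xmax + 1], [20, 20], [30, 30], color='orange', alpha=0.3)
pylab.fill_between([0, xmax + 1], [30, 30], [41, 41], color='green', alpha=0.3)

# mean quality with a +/- one standard deviation ribbon
pylab.fill_between(X, mean_q + std_q, mean_q - std_q, color='yellow')
pylab.plot(X, mean_q, color='r', lw=4)
pylab.ylim([0, 41])
pylab.xlim([0, xmax + 1])
pylab.title("Quality scores across all bases")
pylab.xlabel("Position in read (bp)")
pylab.ylabel("Quality")
pylab.grid(axis='x')
pylab.show()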
Example #4
    def plot_stacked_hist(self,
                          output_filename=None,
                          dpi=200,
                          kind="barh",
                          fontsize=10,
                          edgecolor="k",
                          lw=1,
                          width=1,
                          ytick_fontsize=10):
        df = self.get_df()
        df.T.plot(kind=kind,
                  stacked=True,
                  edgecolor=edgecolor,
                  lw=lw,
                  width=width)
        pylab.xlabel("Percentage (%)", fontsize=fontsize)
        pylab.ylabel("Sample index/name", fontsize=fontsize)
        pylab.yticks(fontsize=ytick_fontsize)
        pylab.legend(title="kingdom")
        pylab.xlim([0, 100])

        if output_filename:
            pylab.savefig(output_filename, dpi=dpi)
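
plot_stacked_hist relies on pandas' stacked barh plotting and clips the x-axis to 0-100 since the values are percentages. A sketch on an invented percentage table (pandas and matplotlib only):

import pandas as pd
import pylab

# invented percentage table: rows are kingdoms, columns are samples
df = pd.DataFrame(
    {"sample_1": [70, 25, 5], "sample_2": [55, 40, 5], "sample_3": [90, 8, 2]},
    index=["Bacteria", "Viruses", "Eukaryota"])

# one stacked horizontal bar per sample, clipped to 0-100%
df.T.plot(kind="barh", stacked=True, edgecolor="k", lw=1, width=1)
pylab.xlabel("Percentage (%)")
pylab.ylabel("Sample index/name")
pylab.legend(title="kingdom")
pylab.xlim([0, 100])
pylab.show()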
Example #5
    def hist_average_quality(self, fontsize=16, bins=None):
        """

        bins is from 0 to 94 
        """

        hq_qv = [pylab.mean([ord(X)-33 for X in read['quality'].decode()]) 
                for read in self.hq_sequence]
        lq_qv = [pylab.mean([ord(X) -33 for X in read['quality'].decode()]) 
            for read in self.lq_sequence]

        if bins is None:
            bins = range(0,94)
        Y1, X = np.histogram(hq_qv, bins=bins)
        Y2, X = np.histogram(lq_qv, bins=bins)
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        N = np.sum(Y1+Y2)
        ax.plot(X, [N] + list(N-np.cumsum(Y1+Y2)), "k")
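
A standalone sketch of the stacked QV histogram with the cumulative count on a twin y-axis; the HQ/LQ quality values below are random numbers for illustration:

import numpy as np
import pylab

# invented average quality values for HQ and LQ isoforms
hq_qv = np.random.normal(80, 5, 1000)
lq_qv = np.random.normal(40, 10, 300)

bins = range(0, 94)
Y1, X = np.histogram(hq_qv, bins=bins)
Y2, X = np.histogram(lq_qv, bins=bins)
pylab.bar(X[1:], Y1, width=1, label="HQ")
pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
pylab.xlim([0.5, 93.5])
pylab.xlabel("Isoform average QV")
pylab.ylabel("# isoforms")
pylab.legend()

# cumulative number of isoforms above a given QV, on a twin y-axis
ax = pylab.twinx()
N = np.sum(Y1 + Y2)
ax.plot(X, [N] + list(N - np.cumsum(Y1 + Y2)), "k")
pylab.show()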
Example #6
    def plot_hist_normalized_coverage(self, filename=None, binwidth=0.1,
            max_z=4):
        """ Barplot of the normalized coverage with gaussian fitting

        """
        pylab.clf()
        # if there are NaNs, the binning cannot be set up
        d = self.df["scale"][self.range[0]:self.range[1]].dropna()
        # remove outliers: the plot crashes if the min-max range is too large
        d = d[np.abs(d - d.mean()) <= (4 * d.std())]
        bins = self._set_bins(d, binwidth)
        self.mixture_fitting.data = d
        try:
            self.mixture_fitting.plot(self.gaussians_params, bins=bins, Xmin=0,
                                      Xmax=max_z)
        except ZeroDivisionError:
            pass
        pylab.grid(True)
        pylab.xlim([0,max_z])
        pylab.xlabel("Normalised per-base coverage")
        try:
            pylab.tight_layout()
        except:
            pass
        if filename:
            pylab.savefig(filename)
Example #7
    def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=[]):
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        pylab.scatter(-pylab.log10(df['Adjusted P-value']),
                      range(len(df)),
                      s=10 * df['size'],
                      c=df['size'])

        pylab.xlabel("Odd ratio")
        pylab.ylabel("Gene sets")
        pylab.yticks(range(len(df)), df.name)
        a, b = pylab.xlim()
        pylab.xlim([0, b])
        pylab.grid(True)
        ax = pylab.gca()

        M = max(df['size'])
        if M > 100:
            l1, l2, l3 = "10", "100", str(M)
        else:
            l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

        handles = [
            pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls="")
        ]
        ax.legend(handles=handles, loc="upper left", title="gene-set size")

        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.tight_layout()
        ax = pylab.colorbar(pylab.gci())
        return df
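
The size legend in scatterplot() is built from empty Line2D proxy artists whose markers only serve as legend entries. A minimal sketch of that trick on invented gene-set sizes and scores:

import numpy as np
import pylab
from matplotlib.lines import Line2D

# invented gene-set sizes and enrichment scores
sizes = np.array([15, 40, 90, 150])
scores = np.array([1.2, 2.5, 3.1, 4.0])

pylab.scatter(scores, range(len(scores)), s=10 * sizes, c=sizes)
pylab.colorbar()

# proxy artists: empty lines whose markers only provide legend entries
handles = [
    Line2D([0], [0], marker="o", markersize=5, ls="", label="small"),
    Line2D([0], [0], marker="o", markersize=10, ls="", label="medium"),
    Line2D([0], [0], marker="o", markersize=15, ls="", label="large"),
]
pylab.gca().legend(handles=handles, loc="upper left", title="gene-set size")
pylab.xlim([0, pylab.xlim()[1]])
pylab.show()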
Example #8
 def plot_contig_length_vs_GC(self, alpha=0.5):
     pylab.plot(self.df["length"], self.df['GC'], "o", alpha=alpha)
     pylab.xlabel("contig length (bp)")
     pylab.ylabel("GC (%)")
     pylab.grid(True)
     pylab.ylim([0, 100])
     pylab.xlim(0, max(self.df['length']) + 10)
Example #10
    def hist_GC(self,
                bins=50,
                alpha=0.5,
                hold=False,
                fontsize=12,
                grid=True,
                xlabel="GC %",
                ylabel="#",
                label="",
                title=None):
        """Plot histogram GC content

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold:
        :param int fontsize: fontsize of the x and y labels and title.
        :param bool grid: add grid or not
        :param str xlabel:
        :param str ylabel:
        :param str label: label of the histogram (for the legend)
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_GC()

        """
        mean_GC = np.mean(self.df.loc[:, 'GC_content'])

        # set title if needed
        if title is None:
            title = "GC %%  \n Mean GC : %.2f" % (mean_GC)

        # histogram GC percent
        if hold is False:
            pylab.clf()
        pylab.hist(self.df.loc[:, 'GC_content'],
                   bins=bins,
                   alpha=alpha,
                   label=label + ", mean : " + str(round(mean_GC, 2)) +
                   ", N : " + str(len(self)))
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
        pylab.xlim([0, 100])
        try:
            pylab.tight_layout()
        except:
            pass
Example #11
 def plot_genesets_hist(self, bins=20):
     N = len(self.gene_sets.keys())
     pylab.clf()
     pylab.hist([len(v) for k, v in self.gene_sets.items()],
                bins=bins,
                lw=1,
                ec="k")
     pylab.title("{} gene sets".format(N))
     pylab.xlabel("Gene set sizes")
     pylab.grid(True)
     a, b = pylab.xlim()
     pylab.xlim([0, b])
Example #12
    def barplot(self, enrich, cutoff=0.05, nmax=10):
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        pylab.barh(range(len(df)), -pylab.log10(df['Adjusted P-value']))
        pylab.yticks(range(len(df)), df.name)
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.grid(True)
        pylab.xlabel("Adjusted p-value (log10)")
        pylab.ylabel("Gene sets")
        a, b = pylab.xlim()
        pylab.xlim([0, b])
        pylab.tight_layout()
        return df
Example #13
    def boxplot_quality(self, color_line='r', bgcolor='grey', color='yellow', lw=4, 
            hold=False, ax=None):


        quality = self.df[[str(x) for x in range(42)]]  # not sure why we have phred score from 0 to 41
        N = self.metadata['ReadNum']
        proba = quality / N

        self.xmax = 150
        xmax = self.xmax + 1
        if ax:
            pylab.sca(ax) # pragma no cover
        pylab.fill_between([0,xmax], [0,0], [20,20], color='red', alpha=0.3)
        pylab.fill_between([0,xmax], [20,20], [30,30], color='orange', alpha=0.3)
        pylab.fill_between([0,xmax], [30,30], [41,41], color='green', alpha=0.3)


        X = []
        Q = []
        S = []
        for pos in range(1, 151):
            qualities = [((int(k)+1)*v) for k,v in quality.loc[pos].items()]
            mean_quality = sum(qualities) / N
            X.append(pos)
            Q.append(mean_quality)
            proba = quality.loc[pos] / N

            std = pylab.sqrt(sum([(x-mean_quality)**2 * y for x, y in zip(range(42), proba)]))
            S.append(std)

        Q = np.array(Q)
        X = np.array(X)
        S = np.array(S)
        pylab.fill_between(X, Q+S, Q-S, 
            color=color, interpolate=False)

        pylab.plot(X, Q, color=color_line, lw=lw)
        pylab.ylim([0, 41])
        pylab.xlim([0, self.xmax+1])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Quality")
        pylab.grid(axis='x')
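
boxplot_quality derives the per-position mean and standard deviation from a table of counts per Phred score. A compact sketch of that weighted-statistics step on an invented count vector (numpy only; the counts are made up):

import numpy as np

# invented counts of reads observed at each Phred score (0..41) for one position
counts = np.zeros(42)
counts[30:40] = [5, 10, 40, 120, 300, 500, 300, 120, 40, 10]
N = counts.sum()

scores = np.arange(42)
mean_quality = (scores * counts).sum() / N                    # weighted mean
proba = counts / N
std = np.sqrt(((scores - mean_quality) ** 2 * proba).sum())   # weighted standard deviation
print(mean_quality, std)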
Example #14
    def histogram_gc_content(self):
        """Plot histogram of GC content

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.histogram_gc_content()

        """
        pylab.hist(self.gc_list, bins=range(0, 100))
        pylab.grid()
        pylab.title("GC content distribution (per sequence)")
        pylab.xlabel(r"Mean GC content (%)", fontsize=self.fontsize)
        pylab.xlim([0,100])
Example #15
    def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="GC %", ylabel="#", label="",title=None):
        """Plot histogram GC content

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold:
        :param int fontsize: fontsize of the x and y labels and title.
        :param bool grid: add grid or not
        :param str xlabel:
        :param str ylabel:
        :param str label: label of the histogram (for the legend)
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_GC()

        """
        mean_GC =  np.mean(self.df.loc[:,'GC_content'])

        # set title if needed
        if title is None:
            title = "GC %%  \n Mean GC : %.2f" %(mean_GC)

        # histogram GC percent
        if hold is False:
            pylab.clf()
        pylab.hist(self.df.loc[:,'GC_content'], bins=bins,
            alpha=alpha, label=label + ", mean : " + str(round(mean_GC, 2))
            + ", N : " + str(len(self)))
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
        pylab.xlim([0, 100])
        try:
            pylab.tight_layout()
        except:
            pass
Example #16
    def plot(self):
        """"""
        if self.design:
            self.df['label'] = self.design.df['type'] + "/" + self.design.df[
                'condition']

        pylab.clf()
        MX = self.df.FRiP.max()
        MY = self.df['in_peaks'].max()
        pylab.plot([0, MX], [0, MY], ls='--', color='b', alpha=0.5)
        for label in self.df['label'].unique():
            self.df.query('label==@label').plot(x='FRiP',
                                                y='in_peaks',
                                                marker="o",
                                                lw=0,
                                                label=label,
                                                ax=pylab.gca())
        pylab.ylabel('Reads in peaks')
        pylab.xlabel('FRiP')
        pylab.xlim(0, pylab.xlim()[1])
        pylab.ylim(0, pylab.ylim()[1])
        pylab.grid()
Example #17
    def plot_sequence_quality(self, max_score=40, ax=None):

        ymax = max_score + 1
        xmax = 0
        for sample in self.fastqc_data.keys():
            if "per_sequence_quality_scores" in self.fastqc_data[sample]:
                data = {
                    self._avg_bp_from_range(d['base']): d['mean']
                    for d in self.fastqc_data[sample]
                    ['per_base_sequence_quality']
                }
                df = pd.Series(data)
                df.plot(color="k", alpha=0.5)

                if df.max() > ymax:
                    ymax = df.max()
                if df.index.max() > xmax:
                    xmax = df.index.max()

        if ax:
            pylab.sca(ax)
        pylab.fill_between([0, xmax], [0, 0], [20, 20], color='red', alpha=0.4)
        pylab.fill_between([0, xmax], [20, 20], [30, 30],
                           color='orange',
                           alpha=0.4)
        pylab.fill_between([0, xmax], [30, 30], [ymax, ymax],
                           color='green',
                           alpha=0.4)

        pylab.ylim([0, ymax])
        if xmax != 0:
            pylab.xlim([0, xmax])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Phred Score", fontsize=12)
        pylab.grid(axis='x')
Example #18
    def _plot(self, Xr, pca=None, pc1=0, pc2=1, colors=None, show_labels=True):
        if colors is None:
            colors = [self.colors[k] for k in self.labels]
            if len(colors) != len(Xr):
                colors = ["r"] * len(Xr[:,0])
        else:
            for k in self.labels:
                if k not in colors.keys():
                    logger.warning("No key color for this sample: {}. Set to red".format(k))
                    colors[k] = "r"
            colors = [colors[k] for k in self.labels]

        pylab.scatter(Xr[:,pc1], Xr[:,pc2], c=colors)
        ax = pylab.gca()
        X1, X2 = pylab.xlim()
        dX = X2 - X1
        pylab.xlim([X1 + X1*0.05, X2 + X2*0.05])

        Y1, Y2 = pylab.ylim()
        dY = Y2 - Y1
        pylab.ylim([Y1 + Y1*0.05, Y2 + Y2*0.05])

        count = 0
        if show_labels:
            for x,y in zip(Xr[:,pc1], Xr[:,pc2]):
                x += dX / 40
                y += dY / 40
                ax.annotate(self.labels[count], (x,y))
                count += 1
                if count > 100: 
                    break
        if pca:
            pylab.xlabel("PC{} ({}%)".format(pc1+1,
                round(pca.explained_variance_ratio_[pc1]*100, 2)))
            pylab.ylabel("PC{} ({}%)".format(pc2+1,
                round(pca.explained_variance_ratio_[pc2]*100, 2)))
        pylab.grid(True)
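
A standalone sketch of the PCA scatter with padded axis limits and per-point annotations; the coordinates and sample labels are invented, and the 5% padding below is symmetric rather than the exact expression used in _plot:

import numpy as np
import pylab

# invented 2D PCA coordinates and sample labels
Xr = np.random.randn(10, 2)
labels = ["sample_{}".format(i) for i in range(10)]

pylab.scatter(Xr[:, 0], Xr[:, 1], c="r")
ax = pylab.gca()

# pad the limits by 5% so that annotations are not clipped
X1, X2 = pylab.xlim()
dX = X2 - X1
pylab.xlim([X1 - dX * 0.05, X2 + dX * 0.05])
Y1, Y2 = pylab.ylim()
dY = Y2 - Y1
pylab.ylim([Y1 - dY * 0.05, Y2 + dY * 0.05])

# annotate each point, slightly offset from its marker
for label, x, y in zip(labels, Xr[:, 0], Xr[:, 1]):
    ax.annotate(label, (x + dX / 40, y + dY / 40))
pylab.grid(True)
pylab.show()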
Example #19
 def hist_passes(self, maxp=50, fontsize=16):
     passes = self.df.nb_passes.copy()
     passes.clip_upper(maxp).hist(bins=maxp)
     pylab.xlim([0, maxp])
     pylab.ylabel("# count", fontsize=fontsize)
     pylab.xlabel("Passes (max {})".format(maxp), fontsize=fontsize)
Example #20
 def bar_mapq(self, logy=True, xmin=0, xmax=60, fontsize=12):
     self.df.mapq.hist()
     if logy:
         pylab.semilogy()
     pylab.xlim([xmin, xmax])
     pylab.xlabel("Mapping quality", fontsize=fontsize)
Example #21
    def plot_coverage(self, filename=None, fontsize=16,
            rm_lw=1, rm_color="#0099cc", rm_label="Running median",
            th_lw=1, th_color="r", th_ls="--", main_color="k", main_lw=1,
            main_kwargs={}, sample=True, set_ylimits=True):
        """ Plot coverage as a function of base position.

        :param filename:
        :param rm_lw: line width of the running median
        :param rm_color: line color of the running median
        :param rm_label: label for the running median
        :param th_lw: line width of the thresholds
        :param th_color: line color of the thresholds
        :param main_color: line color of the coverage
        :param main_lw: line width of the coverage
        :param sample: if there are more than 1,000,000 points, we
            use an integer step to skip data points. You can still plot
            all points at your own risk by setting this option to False

        :param set_ylimits: we want to focus on the "normal" coverage, ignoring
            unusual excess. To do so, we set the y-axis range between 0 and a
            maximum value. This maximum value is the minimum between 6 times
            the mean coverage and 1.5 times the maximum of the high-coverage
            threshold curve. If you want to leave the y-limits free, set this
            argument to False

        .. note:: if there are more than 1,000,000 points, only about
            1,000,000 of them are shown (e.g. every fifth point when there
            are 5,000,000 points).

        In addition to the coverage, the running median and the coverage
        confidence interval corresponding to the lower and upper z-score
        thresholds are shown.

        .. note:: uses the thresholds attribute.
        """
        # z = (X/rm - \mu ) / sigma
        high_zcov = (self.thresholds.high * self.best_gaussian["sigma"] +
                self.best_gaussian["mu"]) * self.df["rm"]
        low_zcov = (self.thresholds.low * self.best_gaussian["sigma"] +
                self.best_gaussian["mu"]) * self.df["rm"]

        pylab.clf()
        ax = pylab.gca()
        ax.set_facecolor('#eeeeee')
        pylab.xlim(0,self.df["pos"].iloc[-1])
        axes = []
        labels = []

        # 1,000,000 points is a lot for matplotlib. Let us restrict ourselves
        # to 1 million points for now.
        if len(self.df) > 1000000 and sample is True:
            NN = int(len(self.df)/1000000)
        else:
            NN = 1

        # the main coverage plot
        p1, = pylab.plot(self.df["cov"][::NN], color=main_color, label="Coverage",
                linewidth=main_lw, **main_kwargs)
        axes.append(p1)
        labels.append("Coverage")

        # The running median plot
        if rm_lw > 0:
            p2, = pylab.plot(self.df["rm"][::NN],
                    color=rm_color,
                    linewidth=rm_lw,
                    label=rm_label)
            axes.append(p2)
            labels.append(rm_label)

        # The threshold curves
        if th_lw > 0:
            p3, = pylab.plot(high_zcov[::NN], linewidth=th_lw, color=th_color, ls=th_ls,
                label="Thresholds")
            p4, = pylab.plot(low_zcov[::NN], linewidth=th_lw, color=th_color, ls=th_ls,
                label="_nolegend_")
            axes.append(p3)
            labels.append("Thresholds")

        pylab.legend(axes, labels, loc="best")
        pylab.xlabel("Position", fontsize=fontsize)
        pylab.ylabel("Per-base coverage", fontsize=fontsize)
        pylab.grid(True)

        # sometimes there are large coverage value that squeeze the plot.
        # Let us restrict it
        if set_ylimits is True:
            pylab.ylim([0, min([
                high_zcov.max() * 1.5,
                self.df["cov"].mean()*6])])
        else:
            pylab.ylim([0, pylab.ylim()[1]])

        try:
            pylab.tight_layout()
        except:
            pass

        if filename:
            pylab.savefig(filename)
Example #22
for i in range(len(list_analysis)):
    analysis = list_analysis[i]
    res = compute_table_performance(analysis, df_results)
    print("%s" % analysis)
    # [TP, FP, FN, TN]
    # print(len(res[0]), len(res[1]), res[2], res[3] , sum([len(res[0]), len(res[1]), res[2], res[3]]))
    TP = res[0]
    FP = res[1]
    FN = [0] * res[2]
    TN = [0] * res[3]
    y_true = np.array([1] * len(TP) + [1] * len(FN) + [0] * len(FP) +
                      [0] * len(TN))
    y_scores = np.array(TP + FN + FP + TN)
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    pylab.plot(recall, precision, color=colors[i], label=analysis)

pylab.xlabel('Recall')
pylab.ylabel('Precision')
pylab.ylim([0.0, 1.05])
pylab.xlim([0.0, 1.05])
pylab.title('Precision-Recall')
#pylab.legend(loc="lower left")

lgd = pylab.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#pylab.tight_layout()

if file_fig != "show":
    pylab.savefig(file_fig, bbox_extra_artists=(lgd, ), bbox_inches='tight')
else:
    pylab.show()
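
The snippet above assumes precision_recall_curve from scikit-learn as well as several names defined elsewhere (list_analysis, df_results, compute_table_performance, colors, file_fig). A self-contained sketch of the plotting part on invented labels and scores:

import numpy as np
import pylab
from sklearn.metrics import precision_recall_curve

# invented binary labels and prediction scores for a single analysis
y_true = np.array([1] * 50 + [0] * 50)
y_scores = np.concatenate([np.random.uniform(0.4, 1.0, 50),
                           np.random.uniform(0.0, 0.6, 50)])

precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
pylab.plot(recall, precision, label="my_analysis")
pylab.xlabel('Recall')
pylab.ylabel('Precision')
pylab.ylim([0.0, 1.05])
pylab.xlim([0.0, 1.05])
pylab.title('Precision-Recall')
pylab.legend(loc="lower left")
pylab.show()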
Example #23
    def plot_common_major_counts(self, mode, labels=None,
            switch_up_down_cond2=False, add_venn=True, xmax=None, 
            title="", fontsize=12, sortby="log2FoldChange"):
        """

        :param mode: down, up or all


        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana.compare import RNADiffCompare

            c = RNADiffCompare(
                sequana_data("rnadiff/rnadiff_onecond_1"),
                sequana_data("rnadiff/rnadiff_onecond_2"))
            c.plot_common_major_counts("down")
        """
        #cond1, cond2 = self._get_cond1_cond2()
        if labels is None:
            labels = ['r1', 'r2']

        if mode in ["down"]:
            # Negative values !
            gl1 = set(self.r1.gene_lists['down'])
            gl2 =  set(self.r2.gene_lists['down'])
            A = self.r1.df.loc[gl1].sort_values(by=sortby)
            B = self.r2.df.loc[gl1].sort_values(by=sortby)
        else:
            gl1 = set(self.r1.gene_lists[mode])
            gl2 =  set(self.r2.gene_lists[mode])
            A = self.r1.df.loc[gl1].sort_values(by=sortby, ascending=False)
            B = self.r2.df.loc[gl1].sort_values(by=sortby, ascending=False)
        # sometimes, up and down may be inverted as compared to the other
        # conditions

        N = []
        for i in range(1,max(len(A), len(B))):
            a = A.iloc[0:i].index
            b = B.iloc[0:i].index
            n = len(set(b).intersection(set(a)))
            N.append(n / i*100)

        max_common = len(set(A.index).intersection(set(B.index)))
        pylab.clf()
        if len(A) > len(B):
            pylab.axhline(max_common/len(A)*100, color="r", ls='--', label="min set intersection")
            pylab.axvline(len(B), ls="--", color="k", label="rank of minor set")
        else:
            pylab.axhline(max_common/len(B)*100, color='r', ls='--', label="min set intersect")
            pylab.axvline(len(A), ls="--", color="k", label="rank of minor set")

        pylab.plot(N)
        pylab.xlabel('rank', fontsize=fontsize)
        pylab.ylabel('% common features', fontsize=fontsize)
        pylab.grid(True)
        pylab.ylim([0,100])
        if xmax:
            pylab.xlim([0, xmax])
        else:
            pylab.xlim([0, max(len(A),len(B))])
        pylab.title(title, fontsize=fontsize)
        ax = pylab.gca()
        ax2 = ax.twinx()
        ax2.plot(A[sortby].values, "orange", label=sortby)
        ax2.set_ylabel(sortby)
        pylab.legend(loc="lower left")
        ax.legend(loc="lower right")

        if add_venn:
            f = pylab.gcf()
            ax = f.add_axes([0.5,0.5,0.35,0.35], facecolor="grey")
            if mode=="down":
                self.plot_venn_down(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="up":
                self.plot_venn_up(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="all":
                self.plot_venn_all(ax=ax, title=None, labels=labels,
                    mode="two_only")
Example #26
    def plot_go_terms(self,
                      ontologies,
                      max_features=50,
                      log=False,
                      fontsize=8,
                      minimum_genes=0,
                      pvalue=0.05,
                      cmap="summer_r",
                      sort_by="fold_enrichment",
                      show_pvalues=False,
                      include_negative_enrichment=False,
                      fdr_threshold=0.05,
                      compute_levels=True,
                      progress=True):

        assert sort_by in ['pValue', 'fold_enrichment', 'fdr']

        # FIXME: pvalue and fold_enrichment not sorted in same order
        pylab.clf()

        df = self.get_data(
            ontologies,
            include_negative_enrichment=include_negative_enrichment,
            fdr=fdr_threshold)

        if len(df) == 0:
            return df

        df = df.query("pValue<=@pvalue")
        logger.info("Filtering out pvalue>{}. Kept {} GO terms".format(
            pvalue, len(df)))
        df = df.reset_index(drop=True)

        # Select a subset of the data to keep the best max_features in terms of
        # pValue
        subdf = df.query("number_in_list>@minimum_genes").copy()
        logger.info(
            "Filtering out GO terms with less than {} genes: Kept {} GO terms".
            format(minimum_genes, len(subdf)))

        logger.info("Filtering out the 3 parent terms")
        subdf = subdf.query("id not in @self.ontologies")

        # Keeping only a part of the data, sorting by pValue
        if sort_by == "pValue":
            subdf = subdf.sort_values(by="pValue",
                                      ascending=False).iloc[-max_features:]
            df = df.sort_values(by="pValue", ascending=False)
        elif sort_by == "fold_enrichment":
            subdf = subdf.sort_values(by="abs_log2_fold_enrichment",
                                      ascending=True).iloc[-max_features:]
            df = df.sort_values(by="abs_log2_fold_enrichment", ascending=False)
        elif sort_by == "fdr":
            subdf = subdf.sort_values(by="fdr",
                                      ascending=False).iloc[-max_features:]
            df = df.sort_values(by="fdr", ascending=False)

        subdf = subdf.reset_index(drop=True)

        # We get all levels for each go id.
        # They are stored by MF, CC or BP
        if compute_levels:
            paths = self.get_graph(list(subdf['id'].values), progress=progress)
            levels = []
            keys = list(paths.keys())
            goid_levels = paths[keys[0]]
            if len(keys) > 1:
                for k in keys[1:]:
                    goid_levels.update(paths[k])
            levels = [goid_levels[ID] for ID in subdf['id'].values]
            subdf["level"] = levels
        else:
            subdf['level'] = ""
        N = len(subdf)

        size_factor = 12000 / len(subdf)
        max_size = subdf.number_in_list.max()
        min_size = subdf.number_in_list.min()
        sizes = [
            max(max_size * 0.2, x) for x in size_factor *
            subdf.number_in_list.values / subdf.number_in_list.max()
        ]

        m1 = min(sizes)
        m3 = max(sizes)
        m2 = m1 + (m3 - m1) / 2

        if log:
            pylab.scatter(pylab.log2(subdf.fold_enrichment),
                          range(len(subdf)),
                          c=subdf.fdr,
                          s=sizes,
                          cmap=cmap,
                          alpha=0.8,
                          ec="k",
                          vmin=0,
                          vmax=fdr_threshold,
                          zorder=10)
            #pylab.barh(range(N), pylab.log2(subdf.fold_enrichment), color="r",
            #    label="pvalue>0.05; FDR>0.05")
            #pylab.axvline(1, color="gray", ls="--")
            #pylab.axvline(-1, color="gray", ls="--")
        else:
            pylab.scatter(subdf.fold_enrichment,
                          range(len(subdf)),
                          c=subdf.fdr,
                          cmap=cmap,
                          s=sizes,
                          ec="k",
                          alpha=.8,
                          vmin=0,
                          vmax=fdr_threshold,
                          zorder=10)
        #    pylab.barh(range(N), subdf.fold_enrichment, color="r",
        #    label="not significant")
        pylab.grid(zorder=-10)
        ax2 = pylab.colorbar(shrink=0.5)
        ax2.ax.set_ylabel('FDR')

        labels = [
            x if len(x) < 50 else x[0:47] + "..." for x in list(subdf.label)
        ]
        ticks = [
            "{} ({}) {}".format(ID, level, "; " + label.title())
            for level, ID, label in zip(subdf['level'], subdf.id, labels)
        ]

        pylab.yticks(range(N), ticks, fontsize=fontsize, ha='left')

        yax = pylab.gca().get_yaxis()
        try:
            pad = [x.label.get_window_extent().width for x in yax.majorTicks]
            yax.set_tick_params(pad=max(pad))
        except:
            yax.set_tick_params(pad=60 * fontsize * 0.7)
        yax.set_tick_params(pad=60 * fontsize * 0.6)

        fc_max = subdf.fold_enrichment.max(skipna=True)
        fc_min = subdf.fold_enrichment.min(skipna=True)
        # go into log2 space
        fc_max = pylab.log2(fc_max)
        fc_min = pylab.log2(fc_min)
        abs_max = max(fc_max, abs(fc_min), 1)

        if log:
            fc_max = abs_max * 1.5
        else:
            fc_max = 2**abs_max * 1.2

        pylab.axvline(0, color="k", lw=2)
        if log:
            pylab.xlabel("Fold Enrichment (log2)")
        else:
            pylab.xlabel("Fold Enrichment")
        if include_negative_enrichment:
            pylab.xlim([-fc_max, fc_max])
        else:
            pylab.xlim([0, fc_max])
        pylab.tight_layout()

        # The pvalue:
        if show_pvalues:
            ax = pylab.gca().twiny()
            ax.set_xlim([0, max(-pylab.log10(subdf.pValue)) * 1.2])
            ax.set_xlabel("p-values (log10)", fontsize=12)
            ax.plot(-pylab.log10(subdf.pValue),
                    range(len(subdf)),
                    label="pvalue",
                    lw=2,
                    color="k")
            ax.axvline(1.33, lw=1, ls="--", color="grey", label="pvalue=0.05")
            pylab.tight_layout()
            pylab.legend(loc="lower right")
        s1 = pylab.scatter([], [], s=m1, marker='o', color='#555555', ec="k")
        s2 = pylab.scatter([], [], s=m2, marker='o', color='#555555', ec="k")
        s3 = pylab.scatter([], [], s=m3, marker='o', color='#555555', ec="k")

        if len(subdf) < 10:
            labelspacing = 1.5 * 4
            borderpad = 4
            handletextpad = 2
        elif len(subdf) < 20:
            labelspacing = 1.5 * 2
            borderpad = 1
            handletextpad = 2
        else:
            labelspacing = 1.5
            borderpad = 2
            handletextpad = 2

        if len(subdf) >= 3:
            leg = pylab.legend(
                (s1, s2, s3),
                (str(int(min_size)),
                 str(int(min_size +
                         (max_size - min_size) / 2)), str(int(max_size))),
                scatterpoints=1,
                loc='lower right',
                ncol=1,
                frameon=True,
                title="gene-set size",
                labelspacing=labelspacing,
                borderpad=borderpad,
                handletextpad=handletextpad,
                fontsize=8)
        else:
            leg = pylab.legend((s1, ), (str(int(min_size)), ),
                               scatterpoints=1,
                               loc='lower right',
                               ncol=1,
                               frameon=True,
                               title="gene-set size",
                               labelspacing=labelspacing,
                               borderpad=borderpad,
                               handletextpad=handletextpad,
                               fontsize=8)

        frame = leg.get_frame()
        frame.set_facecolor('#b4aeae')
        frame.set_edgecolor('black')
        frame.set_alpha(1)

        self.subdf = subdf
        self.df = df
        return df
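
plot_go_terms encodes the FDR as marker colour and the number of genes as marker size, with a colorbar for the FDR scale. A reduced sketch of that encoding on invented GO-term values (the identifiers and numbers below are made up):

import numpy as np
import pylab

# invented fold enrichments, FDR values and gene counts for five GO terms
fold_enrichment = np.array([1.5, 2.2, 3.0, 4.5, 6.0])
fdr = np.array([0.04, 0.03, 0.01, 0.005, 0.001])
n_genes = np.array([10, 25, 40, 80, 120])
names = ["GO:000000{} ; term {}".format(i + 1, i + 1) for i in range(5)]

pylab.scatter(fold_enrichment, range(len(fold_enrichment)),
              c=fdr, s=5 * n_genes, cmap="summer_r",
              ec="k", vmin=0, vmax=0.05, zorder=10)
cb = pylab.colorbar(shrink=0.5)
cb.ax.set_ylabel("FDR")
pylab.yticks(range(len(fold_enrichment)), names)
pylab.xlabel("Fold Enrichment")
pylab.xlim([0, fold_enrichment.max() * 1.2])
pylab.grid(zorder=-10)
pylab.tight_layout()
pylab.show()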
Example #27
    def stats(self, results, df_avc, bw=1):

        stats = {}
        stats['read_fragments'] = len(self.df)
        stats['fragment_length'] = self.read_length

        # average cross correlation across all chromosomes
        print("Read {} fragments".format(stats['read_fragments']))
        print("ChIP data mean length: {}".format(self.read_length))
        #df_avc.sum(axis=1).plot()
        df = df_avc.sum(axis=1)
        corr_max = df.max()
        shift_max = df.idxmax()
        # note that in phantomPeak, they use the last value as min... not the
        # actual min. Not very important.
        corr_min = df.min()
        shift_min = df.idxmin()
        print("Maximum cross-correlation value: {:.5f}".format(corr_max))
        print("Maximum cross-correlation shift: {}".format(shift_max))
        print("Minimum cross-correlation value: {:.5f}".format(corr_min))
        print("Minimum cross-correlation shift: {}".format(shift_min))
        stats['shift_max'] = int(shift_max)  # to make it json serialisable
        stats['corr_max'] = corr_max

        # In the original phantomPeak code, sbw is computed from the separation
        # range, but with the default settings it always equals 1
        #sbw = 2 * floor(ceil(5/15000) / 2) + 1
        sbw = 1

        # here we could use a rolling mean
        #df.rolling(window=5, center=True).mean()

        # so the following running mean is useless
        # cc$y <- runmean(cc$y,sbw,alg="fast")
        #

        # again, computation of bw but always equal to 1 ....
        # Compute cross-correlation peak
        #  bw <- ceiling(2/iparams$sep.range[2]) # crosscorr[i] is compared to crosscorr[i+/-bw] to find peaks
        #bw = 1
        # search for local peaks within bandwidth of bw = 1
        peakidx = df.diff(periods=bw) > 0
        peakidx = peakidx.astype(int).diff(periods=bw) == -1

        # the final bw points are NA and filled with False
        peakidx = peakidx.shift(-bw).fillna(False)

        df_peaks = df[peakidx]
        # when searching for max, exclude peaks from the excluded region
        exclusion_range = [10, self.read_length + 10]
        mask = np.logical_or(df_peaks.index < exclusion_range[0],
                             df_peaks.index > exclusion_range[1])
        df_peaks = df_peaks[mask]

        #
        max_peak = df_peaks.max()
        shift_peak = df_peaks.idxmax()

        # now, we select peaks that are at least 90% of main peak and with shift
        # higher than main shift. why higher ?
        mask = np.logical_and(df_peaks > max_peak * 0.9,
                              df_peaks.index >= shift_peak)
        best_df_peaks = df_peaks[mask]
        best = best_df_peaks.sort_values(ascending=False)[0:3]

        values = ",".join(["{:.5f}".format(x) for x in best.values])
        pos = ",".join([str(x) for x in best.index])
        print("Top 3 cross-correlation values: {}".format(values))
        print("Top 3 estimates for fragment length: {}".format(pos))

        # now the real window half size according to phantom peaks, not spp ...
        # min + (max-min)/3
        threshold = (df_peaks.max() - corr_min) / 3 + corr_min
        whs = df[df > threshold].index.max()

        # coming back to real cross correlation, identify peak in window
        # readlength +- 2*binning  !! not symmetry in phantompeak
        # x >= ( chip.data$read.length - round(2*binning) &
        # x <= ( chip.data$read.length + round(1.5*binning)

        binning = self.binning
        ph_min = self.read_length - round(2 * binning)
        ph_max = self.read_length + round(1.5 * binning)
        phantom = df[np.logical_and(df.index >= ph_min, df.index <= ph_max)]
        print("Phantom peak range detection:{}-{}".format(ph_min, ph_max))
        print("Phantom peak location:{}".format(phantom.idxmax()))
        print("Phantom peak Correlation: {:.5f}".format(phantom.max()))
        stats['phantom_corr'] = phantom.max()
        stats['phantom_location'] = int(phantom.idxmax())  # for json

        NSC = df_peaks.max() / phantom.max()
        # possible error in phantompeaks? It is encoded as follows but has no
        # link with the phantom peak...
        # Another difference with phantom peak is that the min in phantom peak
        # is not the min but last value on the RHS so
        #    phantom_coeff = df_peaks.max() /  df.min()
        # is
        #    phantom_coeff = df_peaks.max() /  df.iloc[-1]
        NSC_spp = df_peaks.max() / df.iloc[-1]
        print(
            "Normalized Strand cross-correlation coefficient (NSC): {:.5f} [{:.5f}]"
            .format(NSC, NSC_spp))
        RSC = (df_peaks.max() - df.min()) / (phantom.max() - df.min())
        RSC_spp = (df_peaks.max() - df.iloc[-1]) / (phantom.max() -
                                                    df.iloc[-1])
        print(
            "Relative Strand cross-correlation Coefficient (RSC): {:.5f} [{:.5f}]"
            .format(RSC, RSC_spp))

        if RSC < 0.25:
            tag = -2
        elif RSC >= 0.25 and RSC < 0.5:
            tag = -1
        elif RSC >= 0.5 and RSC < 1:
            tag = 0
        elif RSC >= 1 and RSC < 1.5:
            tag = 1
        elif RSC >= 1.5:
            tag = 2
        print("Phantom Peak Quality Tag: {}".format(tag))

        pylab.clf()
        df.plot()
        ##df_peaks.plot(marker="o", lw=0)
        ylim = pylab.ylim()
        #pylab.axvline(whs, ls='--', color='k', lw=1)
        Y0, Y1 = pylab.ylim()
        pylab.plot([phantom.idxmax(), phantom.idxmax()],
                   [Y0, phantom.max()],
                   ls='--',
                   color='k',
                   lw=1)
        pylab.plot([df.idxmax(), df.idxmax()], [Y0, df.max()],
                   ls='--',
                   color='r',
                   lw=2)
        #pylab.fill_betweenx(ylim, 10,85, color='grey', alpha=0.5)
        pylab.ylim(ylim)
        pylab.ylabel("Cross-correlation")
        pylab.xlabel(
            "strand-shift: {}bp\nNSC={:.5f}, RSC={:.5f}, Qtag={}".format(
                best.index[0], NSC, RSC, tag))
        pylab.xlim(self.start, self.stop)
        pylab.grid(True, zorder=-20)
        try:
            pylab.tight_layout()
        except:
            pass
        stats['NSC'] = NSC
        stats['RSC'] = RSC
        stats['Qtag'] = tag
        return stats
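
The NSC and RSC reported by stats() reduce to two ratios of the cross-correlation profile (fragment-length peak, phantom peak at the read length, and profile minimum). A small sketch of that arithmetic on invented values:

# invented cross-correlation summary values, for illustration only
corr_max = 0.085      # cross-correlation at the fragment-length peak
corr_min = 0.020      # minimum of the cross-correlation profile
phantom_corr = 0.050  # cross-correlation at the read-length (phantom) peak

NSC = corr_max / phantom_corr                            # as computed in stats() above
RSC = (corr_max - corr_min) / (phantom_corr - corr_min)  # relative strand coefficient
print("NSC={:.3f}, RSC={:.3f}".format(NSC, RSC))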
Example #28
if save_plots:
    pylab.savefig(filename_output.replace(".", "_GC."), dpi=182)
    pylab.clf()
else:
    pylab.show()

# plot ZMW passes
fig, ax = pylab.subplots(1, 1, figsize=figsize_ZMW)
for i in range(len(list_BAM)):
    bam = list_BAM[i]
    bam.hist_ZMW_subreads(hold=True,
                          label=labels[i],
                          title="",
                          grid=False,
                          xlabel="Number of passes")
    pylab.xlim([0, 45])

    pylab.legend()
    fig.tight_layout()
    if save_plots:
        pylab.savefig(filename_output.replace(".", "_ZMW_%d." % i), dpi=182)
        pylab.clf()
    else:
        pylab.show()

# plot snr
for i in range(len(list_BAM)):
    bam = list_BAM[i]
    # plot read length
    fig, ax = pylab.subplots(1, 1, figsize=figsize_SNR)
    bam.hist_snr(grid=False, title="")