Example #1
    def plot_polymerase_per_barcode(self, fontsize=12, unbarcoded=True):
        """Number Of Polymerase Reads Per Barcode"""
        PR = self.df_barcoded["Polymerase Reads"].sum()
        data = self.df_barcoded['Polymerase Reads'].sort_values(
            ascending=False).values
        pylab.plot(range(1, len(data) + 1), data, label="barcodes")
        pylab.axhline(data.mean(), color="r", label="average")

        try:
            if unbarcoded is True:
                unbar = self.df_not_barcoded['Polymerase Reads'].iloc[0]
                pylab.axhline(unbar, color="k", ls="--", label="not barcoded")
        except Exception:
            pass

        pylab.xlabel("Barcode Rank Order", fontsize=fontsize)
        pylab.ylabel("Counts of Reads", fontsize=fontsize)
        pylab.title("Total Polymerase count: {}".format(PR))
        pylab.legend()
        pylab.ylim(bottom=0)
        try:
            pylab.tight_layout()
        except Exception:
            pass
Example #2
    def plot_volcano(self):
        """
        .. plot::
            :include-source:
    
            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_volcano()

        """
        d1 = self.df.query("padj>0.05")
        d2 = self.df.query("padj<=0.05")

        fig = pylab.figure()
        pylab.plot(d1.log2FoldChange, -np.log10(d1.padj), marker="o",
            alpha=0.5, color="r", lw=0)
        pylab.plot(d2.log2FoldChange, -np.log10(d2.padj), marker="o",
            alpha=0.5, color="k", lw=0)

        pylab.grid(True)
        pylab.xlabel("fold change")
        pylab.ylabel("log10 adjusted p-value")
        m1 = abs(min(self.df.log2FoldChange))
        m2 = max(self.df.log2FoldChange)
        limit = max(m1,m2)
        pylab.xlim([-limit, limit])
        y1,y2 = pylab.ylim()
        pylab.ylim([0,y2])

        pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r", label="padj threshold (0.05)")
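
The split on the adjusted p-value is plain pandas; below is a minimal standalone sketch of the same volcano-plot idea on synthetic data (the column names log2FoldChange and padj follow the example, the values are made up):

import numpy as np
import pandas as pd
import pylab

# synthetic table with the two columns used above (values are made up)
rng = np.random.default_rng(42)
df = pd.DataFrame({
    "log2FoldChange": rng.normal(0, 2, 500),
    "padj": rng.uniform(0.001, 1, 500),
})

d1 = df.query("padj>0.05")    # not significant
d2 = df.query("padj<=0.05")   # significant

pylab.plot(d1.log2FoldChange, -np.log10(d1.padj), "o", alpha=0.5, color="r")
pylab.plot(d2.log2FoldChange, -np.log10(d2.padj), "o", alpha=0.5, color="k")
pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r")
pylab.xlabel("fold change")
pylab.ylabel("-log10 adjusted p-value")
pylab.show()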
Example #3
 def plot_contig_length_vs_GC(self, alpha=0.5):
     pylab.plot(self.df["length"], self.df['GC'], "o", alpha=alpha)
     pylab.xlabel("contig length (bp)")
     pylab.ylabel("GC (%)")
     pylab.grid(True)
     pylab.ylim([0, 100])
     pylab.xlim(0, max(self.df['length']) + 10)
Example #4
    def plot(self,
             color_line='r',
             bgcolor='grey',
             color='yellow',
             lw=4,
             hold=False,
             ax=None):

        xmax = self.xmax + 1
        if ax:
            pylab.sca(ax)
        pylab.fill_between([0, xmax], [0, 0], [20, 20], color='red', alpha=0.3)
        pylab.fill_between([0, xmax], [20, 20], [30, 30],
                           color='orange',
                           alpha=0.3)
        pylab.fill_between([0, xmax], [30, 30], [41, 41],
                           color='green',
                           alpha=0.3)

        if self.X is None:
            X = range(1, self.xmax + 1)
        else:
            X = self.X

        pylab.fill_between(X,
                           self.df.mean() + self.df.std(),
                           self.df.mean() - self.df.std(),
                           color=color,
                           interpolate=False)

        pylab.plot(X, self.df.mean(), color=color_line, lw=lw)
        pylab.ylim([0, 41])
        pylab.xlim([0, self.xmax + 1])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Quality")
        pylab.grid(axis='x')
Example #5
    def plot_gc_vs_coverage(self, filename=None, bins=None, Nlevels=6,
                            fontsize=20, norm="log", ymin=0, ymax=100,
                            contour=True, **kwargs):

        if Nlevels is None or Nlevels==0:
            contour = False

        data = self.df[['cov','gc']].copy()
        data['gc'] *= 100
        data = data.dropna()
        if bins is None:
            bins = [100, min(int(data['gc'].max()-data['gc'].min()+1),
                    max(5,self.bed.gc_window_size - 4))]
            bins[0] = max(10, min(bins[0], self.df['cov'].max()))

        from biokit import Hist2D
        h2 = Hist2D(data)

        try:
            h2.plot(bins=bins, xlabel="Per-base coverage",
                    ylabel=r'GC content (%)',
                    Nlevels=Nlevels, contour=contour, norm=norm,
                    fontsize=fontsize, **kwargs)
        except Exception:
            h2.plot(bins=bins, xlabel="Per-base coverage",
                    ylabel=r'GC content (%)' ,
                    Nlevels=Nlevels, contour=False, norm=norm,
                    fontsize=fontsize, **kwargs)
        pylab.ylim([ymin, ymax])
        try:
            pylab.tight_layout()
        except Exception:
            pass
        if filename:
            pylab.savefig(filename)
Example #6
    def plot_alignment(self, bamfile, motif, window=200,
            global_th=10, title=None, legend=True, legend_fontsize=11,
            valid_rnames=[],
            valid_flags=[]):
        """


        plot alignments that match the motif. 

        """

        bam = BAM(bamfile)
        print("Found {} hits".format(len(bam)))
        pylab.clf()
        count = 0
        for aln in bam:
            if valid_rnames and aln.rname not in valid_rnames:
                continue
            if valid_flags and aln.flag not in valid_flags:
                continue

            seq = aln.query_sequence
            if seq:
                count += 1
                X1 = [seq[i:i+window].count(motif) for i in range(len(seq))]
                pylab.plot(range(aln.reference_start,
                    aln.reference_start+len(seq)),X1, label=aln.query_name)
        print("Showing {} entries after filtering".format(count))
        max_theo = int(1.2*window / len(motif))
        pylab.ylim([0, max_theo])
        if legend and count<15:
            pylab.legend(fontsize=legend_fontsize)
        if title:
            pylab.title(title, fontsize=16)
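
The per-read curve plotted above is just the number of motif occurrences in a sliding window over the read sequence. A tiny standalone sketch of that counting step, on a toy sequence rather than a BAM file:

motif = "CAG"
window = 10
seq = "CAGCAGCAGTTTTTCAGTTT"  # toy read sequence

# count the motif (non-overlapping matches) in the window starting at each
# position i, exactly as in the list comprehension of the example above
X1 = [seq[i:i + window].count(motif) for i in range(len(seq))]
print(X1)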
Example #7
    def barplot_per_sample(self, alpha=0.5, width=0.8, filename=None):
        df = self.get_data_reads()

        # this is ugly but will do the job for now
        under = df.query("name=='Undetermined'")
        others = df.query("name!='Undetermined'")

        under = under.groupby("name").sum().reset_index()
        others = others.groupby("name").sum().reset_index()

        under = under[["name", "count"]].set_index("name")
        others = others[["name", "count"]].set_index("name")

        all_data = others.sort_index(ascending=False)
        all_data.columns = ["samples"]

        # appended at the end
        all_data.loc['undetermined'] = 0

        # revert back
        all_data = all_data.loc[::-1]

        # just for legend
        under.columns = ['undetermined']
        if all_data.sum().min() > 1e6:
            all_data /= 1e6
            under /= 1e6
            M = True
        else:
            M = False

        all_data.plot(kind="barh", alpha=alpha, zorder=1, width=width, ec='k')

        under.plot(kind="barh",
                   alpha=alpha,
                   color="red",
                   ax=pylab.gca(),
                   zorder=1,
                   width=width,
                   ec='k')
        pylab.ylim([-0.5, len(all_data) + 0.5])
        if len(all_data) < 100:
            pylab.yticks(range(len(all_data)), all_data.index)

        pylab.legend()
        pylab.grid(True, zorder=-1)
        if M:
            pylab.xlabel("Number of reads (M)")
        else:
            pylab.xlabel("Number of reads")
        try:
            pylab.tight_layout()
        except Exception:
            pass
        if filename:
            pylab.savefig(filename, dpi=200)
Example #8
 def plot(self, fontsize=16):
     """plot quality versus base position"""
     pylab.plot(self.quality, label="offset: %s" % self.offset)
     pylab.xlabel('base position', fontsize=fontsize)
     pylab.ylabel('Quality per base', fontsize=fontsize)
     pylab.grid(True)
     # ylim turns autoscale off, so to allow this function to be called
     # several times we re-enable autoscale before calling ylim
     pylab.autoscale()
     limits = pylab.ylim()
     pylab.ylim(max(0, limits[0] - 1), limits[1] + 1)
Example #10
 def plot(self, clf=True):
     if clf:
         pylab.clf()
     M = self.df_shustring.shustring_length.max()
     print(M)
     M = int(M / 1000) + 1
     for i in range(M):
         pylab.axhline(i * 1000, ls='--', color='grey')
     pylab.plot(self.df_shustring.shustring_length)
     pylab.xlabel('position (bp)')
     pylab.ylabel('Length of repeats')
     pylab.ylim(bottom=0)
Example #11
    def plot_GC_read_len(self,
                         hold=False,
                         fontsize=12,
                         bins=[60, 60],
                         grid=True,
                         xlabel="GC %",
                         ylabel="#",
                         cmap="BrBG"):
        """Plot GC content versus read length

        :param bool hold:
        :param int fontsize: for x and y labels and title
        :param bins: an integer or tuple of 2 integers to specify
            the binning of the x and y axes of the 2D histogram.
        :param bool grid:
        :param str xlabel:
        :param str ylabel:

        .. plot::
            :include-source:

            from sequana.pacbio import BAMPacbio
            from sequana import sequana_data
            b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
            b.plot_GC_read_len(bins=[10, 10])

        """
        if self._df is None:
            self._get_df()
        mean_len = np.mean(self._df.loc[:, 'read_length'])
        mean_GC = np.mean(self._df.loc[:, 'GC_content'])

        if hold is False:
            pylab.clf()

        data = self._df.loc[:, ['read_length', 'GC_content']].dropna()
        h = biokit.viz.hist2d.Hist2D(data)
        res = h.plot(bins=bins,
                     contour=False,
                     norm='log',
                     Nlevels=6,
                     cmap=cmap)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("GC %", fontsize=fontsize)
        pylab.title("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" %
                    (mean_len, mean_GC),
                    fontsize=fontsize)
        pylab.ylim([0, 100])
        if grid is True:
            pylab.grid(True)
Example #12
    def plot_indel_dist(self, fontsize=16):
        """Plot indel count (+ ratio)

        :Return: list of insertions, deletions and ratio insertion/deletion for
            different length starting at 1

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("measles.fa.sorted.bam"))
            b.plot_indel_dist()

        What you see on this figure is the presence of 10 insertions of length
        1, 1 insertion of length 2 and 3 deletions of length 1.

        .. note:: in samtools, several insertions or deletions in a single
            alignment are ignored and only the first one seems to be reported.
            For instance, 10M1I10M1I stores only 1 insertion in its report; the
            same holds for deletions.

        .. todo:: speed up and handle long-read cases more efficiently by
            storing INDELS as histograms rather than lists
        """
        try:
            self.insertions
        except AttributeError:
            self._set_indels()

        if len(self.insertions) ==0 or len(self.deletions) == 0:
            raise ValueError("No deletions or insertions found")

        N = max(max(Counter(self.deletions)), max(Counter(self.insertions))) + 1
        D = [self.deletions.count(i) for i in range(N)]
        I = [self.insertions.count(i) for i in range(N)]
        R = [i/d if d!=0 else 0 for i,d in zip(I, D)]
        fig, ax = pylab.subplots()
        ax.plot(range(N), I, marker="x", label="Insertions")
        ax.plot(range(N), D, marker="x", label="Deletions")
        ax.plot(range(N), R, "--r", label="Ratio insertions/deletions")
        ax.set_yscale("symlog")
        pylab.ylim([1, pylab.ylim()[1]])
        pylab.legend()
        pylab.grid()
        from matplotlib.ticker import MaxNLocator
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        pylab.xlabel("Indel length", fontsize=fontsize)
        pylab.ylabel("Indel count", fontsize=fontsize)
        return I, D, R
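
The .. todo:: note suggests storing indels as histograms rather than lists; a possible sketch with collections.Counter, using made-up indel lengths rather than real CIGAR data:

from collections import Counter

# made-up indel lengths, normally collected from the alignments
insertions = [1] * 10 + [2]
deletions = [1] * 3

ins_hist = Counter(insertions)   # {1: 10, 2: 1}
del_hist = Counter(deletions)    # {1: 3}

N = max(max(ins_hist), max(del_hist)) + 1
I = [ins_hist.get(i, 0) for i in range(N)]
D = [del_hist.get(i, 0) for i in range(N)]
R = [i / d if d != 0 else 0 for i, d in zip(I, D)]
print(I, D, R)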
Example #13
    def boxplot_quality(self, color_line='r', bgcolor='grey', color='yellow', lw=4, 
            hold=False, ax=None):


        quality = self.df[[str(x) for x in range(42)]]  # not sure why we have phred score from 0 to 41
        N = self.metadata['ReadNum']
        proba = quality / N

        self.xmax = 150
        xmax = self.xmax + 1
        if ax:
            pylab.sca(ax) # pragma no cover
        pylab.fill_between([0,xmax], [0,0], [20,20], color='red', alpha=0.3)
        pylab.fill_between([0,xmax], [20,20], [30,30], color='orange', alpha=0.3)
        pylab.fill_between([0,xmax], [30,30], [41,41], color='green', alpha=0.3)


        X = []
        Q = []
        S = []
        for pos in range(1, 151):
            qualities = [((int(k)+1)*v) for k,v in quality.loc[pos].items()]
            mean_quality = sum(qualities) / N
            X.append(pos)
            Q.append(mean_quality)
            proba = quality.loc[pos] / N

            std = pylab.sqrt(sum([(x-mean_quality)**2 * y for x, y in zip(range(42), proba)]))
            S.append(std)

        Q = np.array(Q)
        X = np.array(X)
        S = np.array(S)
        pylab.fill_between(X, Q+S, Q-S, 
            color=color, interpolate=False)

        pylab.plot(X, Q, color=color_line, lw=lw)
        pylab.ylim([0, 41])
        pylab.xlim([0, self.xmax+1])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Quality")
        pylab.grid(axis='x')
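
The loop above derives a mean and standard deviation per position from a table of Phred-score counts. A standalone numpy sketch of that weighted mean/std for a single position, with made-up counts (scores simply taken as 0 to 41 here):

import numpy as np

# toy count table: counts[q] = number of reads with Phred score q at this position
counts = np.zeros(42)
counts[20] = 50
counts[30] = 800
counts[35] = 150

N = counts.sum()
scores = np.arange(42)
proba = counts / N

mean_quality = (scores * proba).sum()                                  # weighted mean
std_quality = np.sqrt(((scores - mean_quality) ** 2 * proba).sum())    # weighted std
print(mean_quality, std_quality)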
Example #14
    def plot_alignment(self,
                       motif,
                       window=200,
                       global_th=10,
                       title=None,
                       legend=True,
                       legend_fontsize=11):
        """


        plot alignments that match the motif. 

        """
        df = self._get_aligments(motif=motif,
                                 window=window,
                                 global_th=global_th)
        print("Found {} hits".format(len(df)))
        bam = BAM(self.bamfile)
        pylab.clf()
        count = 0
        for aln in bam:
            if aln.query_name in df.query_name.values:
                seq = aln.query_sequence
                if seq:
                    count += 1
                    X1 = [
                        seq[i:i + window].count(motif) for i in range(len(seq))
                    ]
                    pylab.plot(range(aln.reference_start,
                                     aln.reference_start + len(seq)),
                               X1,
                               label=aln.query_name)

        max_theo = int(1.2 * window / len(motif))
        pylab.ylim([0, max_theo])
        if legend and count < 15:
            pylab.legend(fontsize=legend_fontsize)
        if title:
            pylab.title(title, fontsize=16)

        return df
Example #15
    def plot(self):
        """"""
        if self.design:
            self.df['label'] = self.design.df['type'] + "/" + self.design.df[
                'condition']

        pylab.clf()
        MX = self.df.FRiP.max()
        MY = self.df['in_peaks'].max()
        pylab.plot([0, MX], [0, MY], ls='--', color='b', alpha=0.5)
        for label in self.df['label'].unique():
            self.df.query('label==@label').plot(x='FRiP',
                                                y='in_peaks',
                                                marker="o",
                                                lw=0,
                                                label=label,
                                                ax=pylab.gca())
        pylab.ylabel('Reads in peaks')
        pylab.xlabel('FRiP')
        pylab.xlim(0, pylab.xlim()[1])
        pylab.ylim(0, pylab.ylim()[1])
        pylab.grid()
Example #16
    def plot_sequence_quality(self, max_score=40, ax=None):

        ymax = max_score + 1
        xmax = 0
        for sample in self.fastqc_data.keys():
            if "per_sequence_quality_scores" in self.fastqc_data[sample]:
                data = {
                    self._avg_bp_from_range(d['base']): d['mean']
                    for d in self.fastqc_data[sample]
                    ['per_base_sequence_quality']
                }
                df = pd.Series(data)
                df.plot(color="k", alpha=0.5)

                if df.max() > ymax:
                    ymax = df.max()
                if df.index.max() > xmax:
                    xmax = df.index.max()

        if ax:
            pylab.sca(ax)
        pylab.fill_between([0, xmax], [0, 0], [20, 20], color='red', alpha=0.4)
        pylab.fill_between([0, xmax], [20, 20], [30, 30],
                           color='orange',
                           alpha=0.4)
        pylab.fill_between([0, xmax], [30, 30], [ymax, ymax],
                           color='green',
                           alpha=0.4)

        X = range(1, xmax + 1)

        pylab.ylim([0, ymax])
        if xmax != 0:
            pylab.xlim([0, xmax])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Phred Score", fontsize=12)
        pylab.grid(axis='x')
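
The helper _avg_bp_from_range is not shown in this snippet. FastQC reports base positions either as single values ("5") or as ranges ("10-14"), so a plausible, purely hypothetical implementation would return the midpoint of the range:

def _avg_bp_from_range(base):
    """Hypothetical helper: '10-14' -> 12.0, '5' -> 5.0 (midpoint of a FastQC base range)."""
    if "-" in str(base):
        low, high = str(base).split("-")
        return (int(low) + int(high)) / 2
    return float(base)

print(_avg_bp_from_range("10-14"))  # 12.0
print(_avg_bp_from_range(5))        # 5.0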
Example #17
    def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
                grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
        """Plot GC content versus read length

        :param bool hold:
        :param int fontsize: for x and y labels and title
        :param bins: an integer or tuple of 2 integers to specify
            the binning of the x and y axes of the 2D histogram.
        :param bool grid:
        :param str xlabel:
        :param str ylabel:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.plot_GC_read_len(bins=[10, 10])

        """
        mean_len =  np.mean(self.df.loc[:,'read_length'])
        mean_GC =  np.mean(self.df.loc[:,'GC_content'])

        if hold is False:
            pylab.clf()

        data = self.df.loc[:,['read_length','GC_content']].dropna()
        h = biokit.viz.hist2d.Hist2D(data)
        res = h.plot(bins=bins, contour=False, norm='log', Nlevels=6, cmap=cmap)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("GC %", fontsize=fontsize)
        pylab.title("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" %
            (mean_len, mean_GC), fontsize=fontsize)
        pylab.ylim([0, 100])
        if grid is True:
            pylab.grid(True)
Example #18
    def _plot(self, Xr, pca=None, pc1=0, pc2=1, colors=None, show_labels=True):
        if colors is None:
            colors = [self.colors[k] for k in self.labels]
            if len(colors) != len(Xr):
                colors = ["r"] * len(Xr[:,0])
        else:
            for k in self.labels:
                if k not in colors.keys():
                    logger.warning("No key color for this sample: {}. Set to red".format(k))
                    colors[k] = "r"
            colors = [colors[k] for k in self.labels]

        pylab.scatter(Xr[:,pc1], Xr[:,pc2], c=colors)
        ax = pylab.gca()
        X1, X2 = pylab.xlim()
        dX = X2 - X1
        pylab.xlim([X1 + X1*0.05, X2 + X2*0.05])

        Y1, Y2 = pylab.ylim()
        dY = Y2 - Y1
        pylab.ylim([Y1 + Y1*0.05, Y2 + Y2*0.05])

        count = 0
        if show_labels:
            for x,y in zip(Xr[:,pc1], Xr[:,pc2]):
                x += dX / 40
                y += dY / 40
                ax.annotate(self.labels[count], (x,y))
                count += 1
                if count > 100: 
                    break
        if pca:
            pylab.xlabel("PC{} ({}%)".format(pc1+1,
                round(pca.explained_variance_ratio_[pc1]*100, 2)))
            pylab.ylabel("PC{} ({}%)".format(pc2+1,
                round(pca.explained_variance_ratio_[pc2]*100, 2)))
        pylab.grid(True)
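
The Xr coordinates and pca object passed to _plot are typically produced beforehand, for instance with scikit-learn (an assumption, not code from the example); a minimal sketch on random data:

import numpy as np
from sklearn.decomposition import PCA

# toy expression matrix: 10 samples x 50 features
rng = np.random.default_rng(0)
data = rng.normal(size=(10, 50))

pca = PCA(n_components=2)
Xr = pca.fit_transform(data)          # coordinates used for the scatter plot
print(pca.explained_variance_ratio_)  # fractions shown on the PC axis labels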
Example #19
 def plot_scores(self, filename=None, savefig=False):
     # scores
     from pylab import log10
     pylab.clf()
     pylab.plot(log10(self.df.query('score>540')['rep1_signal']),
                log10(self.df.query('score>540')['rep2_signal']),
                'ko',
                alpha=0.5,
                label='<0.05 IDR')
     pylab.plot(log10(self.df.query('score<540')['rep1_signal']),
                log10(self.df.query('score<540')['rep2_signal']),
                'ro',
                alpha=0.5,
                label='>=0.05 IDR')
     N = pylab.ylim()[1]
     pylab.plot([0, N], [0, N], color='blue', alpha=0.5, ls='--')
     pylab.xlabel("Rep1 log10 score")
     pylab.ylabel("Rep2 log10 score")
     pylab.legend(loc='lower right')
     if savefig:
         pylab.savefig(filename)
Example #20
    def plot_coverage(self, filename=None, fontsize=16,
            rm_lw=1, rm_color="#0099cc", rm_label="Running median",
            th_lw=1, th_color="r", th_ls="--", main_color="k", main_lw=1,
            main_kwargs={}, sample=True, set_ylimits=True):
        """ Plot coverage as a function of base position.

        :param filename:
        :param rm_lw: line width of the running median
        :param rm_color: line color of the running median
        :param rm_label: label for the running median
        :param th_lw: line width of the thresholds
        :param th_color: line color of the thresholds
        :param main_color: line color of the coverage
        :param main_lw: line width of the coverage
        :param sample: if there are more than 1,000,000 points, an integer
            step is used to skip data points. All points can still be plotted,
            at your own risk, by setting this option to False

        :param set_ylimits: we want to focus on the "normal" coverage, ignoring
            unusual excess. To do so, we set the y-axis range between 0 and a
            maximum value. This maximum value is the minimum of 6 times the
            mean coverage and 1.5 times the maximum of the high coverage
            threshold curve. To leave the y-limits free, set this argument
            to False

        .. note:: if there are more than 1,000,000 points, only about
            1,000,000 are shown, using an integer sub-sampling step; for
            instance, with 5,000,000 points, every 5th point is plotted.

        In addition to the coverage, the running median and coverage confidence
        corresponding to the lower and upper  zscore thresholds are shown.

        .. note:: uses the thresholds attribute.
        """
        # z = (X/rm - \mu ) / sigma
        high_zcov = (self.thresholds.high * self.best_gaussian["sigma"] +
                self.best_gaussian["mu"]) * self.df["rm"]
        low_zcov = (self.thresholds.low * self.best_gaussian["sigma"] +
                self.best_gaussian["mu"]) * self.df["rm"]

        pylab.clf()
        ax = pylab.gca()
        ax.set_facecolor('#eeeeee')
        pylab.xlim(0,self.df["pos"].iloc[-1])
        axes = []
        labels = []

        # 1,000,000 points is a lot for matplotlib. Let us restrict ourself to 1
        # million points for now.
        if len(self.df) > 1000000 and sample is True:
            NN = int(len(self.df)/1000000)
        else:
            NN = 1

        # the main coverage plot
        p1, = pylab.plot(self.df["cov"][::NN], color=main_color, label="Coverage",
                linewidth=main_lw, **main_kwargs)
        axes.append(p1)
        labels.append("Coverage")

        # The running median plot
        if rm_lw > 0:
            p2, = pylab.plot(self.df["rm"][::NN],
                    color=rm_color,
                    linewidth=rm_lw,
                    label=rm_label)
            axes.append(p2)
            labels.append(rm_label)

        # The threshold curves
        if th_lw > 0:
            p3, = pylab.plot(high_zcov[::NN], linewidth=th_lw, color=th_color, ls=th_ls,
                label="Thresholds")
            p4, = pylab.plot(low_zcov[::NN], linewidth=th_lw, color=th_color, ls=th_ls,
                label="_nolegend_")
            axes.append(p3)
            labels.append("Thresholds")

        pylab.legend(axes, labels, loc="best")
        pylab.xlabel("Position", fontsize=fontsize)
        pylab.ylabel("Per-base coverage", fontsize=fontsize)
        pylab.grid(True)

        # sometimes there are large coverage value that squeeze the plot.
        # Let us restrict it
        if set_ylimits is True:
            pylab.ylim([0, min([
                high_zcov.max() * 1.5,
                self.df["cov"].mean()*6])])
        else:
            pylab.ylim([0, pylab.ylim()[1]])

        try:
            pylab.tight_layout()
        except Exception:
            pass

        if filename:
            pylab.savefig(filename)
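
The threshold curves are obtained by inverting the z-score definition given in the comment (z = (X/rm - mu)/sigma, hence X = (z*sigma + mu)*rm). A short numpy sketch of that inversion with made-up values:

import numpy as np

# made-up running median and fitted central Gaussian parameters
rm = np.array([100.0, 102.0, 98.0, 101.0])
mu, sigma = 1.0, 0.08          # best_gaussian["mu"], best_gaussian["sigma"]
z_low, z_high = -4, 4          # thresholds.low, thresholds.high

# invert z = (X/rm - mu) / sigma  =>  X = (z*sigma + mu) * rm
low_zcov = (z_low * sigma + mu) * rm
high_zcov = (z_high * sigma + mu) * rm
print(low_zcov, high_zcov)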
Example #21
for i in range(len(list_analysis)):
    analysis = list_analysis[i]
    res = compute_table_performance(analysis, df_results)
    print("%s" % analysis)
    # [TP, FP, FN, TN]
    # print(len(res[0]), len(res[1]), res[2], res[3] , sum([len(res[0]), len(res[1]), res[2], res[3]]))
    TP = res[0]
    FP = res[1]
    FN = [0] * res[2]
    TN = [0] * res[3]
    y_true = np.array([1] * len(TP) + [1] * len(FN) + [0] * len(FP) +
                      [0] * len(TN))
    y_scores = np.array(TP + FN + FP + TN)
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    pylab.plot(recall, precision, color=colors[i], label=analysis)

pylab.xlabel('Recall')
pylab.ylabel('Precision')
pylab.ylim([0.0, 1.05])
pylab.xlim([0.0, 1.05])
pylab.title('Precision-Recall')
#pylab.legend(loc="lower left")

lgd = pylab.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#pylab.tight_layout()

if file_fig != "show":
    pylab.savefig(file_fig, bbox_extra_artists=(lgd, ), bbox_inches='tight')
else:
    pylab.show()
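
For reference, precision_recall_curve is the scikit-learn function; it only needs binary labels and scores, as in this toy call:

import numpy as np
from sklearn.metrics import precision_recall_curve

y_true = np.array([1, 1, 1, 0, 0, 0])
y_scores = np.array([0.9, 0.8, 0.3, 0.7, 0.2, 0.1])

precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
print(precision, recall, thresholds)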
Example #22
    def plot_common_major_counts(self, mode, labels=None,
            switch_up_down_cond2=False, add_venn=True, xmax=None, 
            title="", fontsize=12, sortby="log2FoldChange"):
        """

        :param mode: down, up or all


        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana.compare import RNADiffCompare

            c = RNADiffCompare(
                sequana_data("rnadiff/rnadiff_onecond_1"),
                sequana_data("rnadiff/rnadiff_onecond_2"))
            c.plot_common_major_counts("down")
        """
        #cond1, cond2 = self._get_cond1_cond2()
        if labels is None:
            labels = ['r1', 'r2']

        if mode in ["down"]:
            # Negative values !
            gl1 = set(self.r1.gene_lists['down'])
            gl2 =  set(self.r2.gene_lists['down'])
            A = self.r1.df.loc[gl1].sort_values(by=sortby)
            B = self.r2.df.loc[gl1].sort_values(by=sortby)
        else:
            gl1 = set(self.r1.gene_lists[mode])
            gl2 =  set(self.r2.gene_lists[mode])
            A = self.r1.df.loc[gl1].sort_values(by=sortby, ascending=False)
            B = self.r2.df.loc[gl1].sort_values(by=sortby, ascending=False)
        # sometimes, up and down may be inverted as compared to the other
        # conditions

        N = []
        for i in range(1,max(len(A), len(B))):
            a = A.iloc[0:i].index
            b = B.iloc[0:i].index
            n = len(set(b).intersection(set(a)))
            N.append(n / i*100)

        max_common = len(set(A.index).intersection(set(B.index)))
        pylab.clf()
        if len(A) > len(B):
            pylab.axhline(max_common/len(A)*100, color="r", ls='--', label="min set intersection")
            pylab.axvline(len(B), ls="--", color="k", label="rank of minor set")
        else:
            pylab.axhline(max_common/len(B)*100, color='r', ls='--', label="min set intersect")
            pylab.axvline(len(A), ls="--", color="k", label="rank of minor set")

        pylab.plot(N)
        pylab.xlabel('rank', fontsize=fontsize)
        pylab.ylabel('% common features', fontsize=fontsize)
        pylab.grid(True)
        pylab.ylim([0,100])
        if xmax:
            pylab.xlim([0, xmax])
        else:
            pylab.xlim([0, max(len(A),len(B))])
        pylab.title(title, fontsize=fontsize)
        ax = pylab.gca()
        ax2 = ax.twinx()
        ax2.plot(A[sortby].values, "orange", label=sortby)
        ax2.set_ylabel(sortby)
        pylab.legend(loc="lower left")
        ax.legend(loc="lower right")

        if add_venn:
            f = pylab.gcf()
            ax = f.add_axes([0.5,0.5,0.35,0.35], facecolor="grey")
            if mode=="down":
                self.plot_venn_down(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="up":
                self.plot_venn_up(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="all":
                self.plot_venn_all(ax=ax, title=None, labels=labels,
                    mode="two_only")
Example #23
    def stats(self, results, df_avc, bw=1):

        stats = {}
        stats['read_fragments'] = len(self.df)
        stats['fragment_length'] = self.read_length

        # average cross correlation across all chromosomes
        print("Read {} fragments".format(stats['read_fragments']))
        print("ChIP data mean length: {}".format(self.read_length))
        #df_avc.sum(axis=1).plot()
        df = df_avc.sum(axis=1)
        corr_max = df.max()
        shift_max = df.idxmax()
        # note that in phantomPeak, they use the last value as min... not the
        # actual min. Not very important.
        corr_min = df.min()
        shift_min = df.idxmin()
        print("Maximum cross-correlation value: {:.5f}".format(corr_max))
        print("Maximum cross-correlation shift: {}".format(shift_max))
        print("Minimum cross-correlation value: {:.5f}".format(corr_min))
        print("Minimum cross-correlation shift: {}".format(shift_min))
        stats['shift_max'] = int(shift_max)  # to make it json serialisable
        stats['corr_max'] = corr_max

        # original phantomPeak code, but sbw is always equal to 1 (range max > 5 ??)
        # default is 500 so sbw=1 whatsoever
        #sbw = 2 * floor(ceil(5/15000) / 2) + 1
        sbw = 1

        # here we could use a rolling mean
        #df.rolling(window=5, center=True).mean()

        # so the following running mean is useless
        # cc$y <- runmean(cc$y,sbw,alg="fast")
        #

        # again, computation of bw but always equal to 1 ....
        # Compute cross-correlation peak
        #  bw <- ceiling(2/iparams$sep.range[2]) # crosscorr[i] is compared to crosscorr[i+/-bw] to find peaks
        #bw = 1
        # search for local peaks within bandwidth of bw = 1
        peakidx = df.diff(periods=bw) > 0
        peakidx = peakidx.astype(int).diff(periods=bw) == -1

        # the final bw points are NA and filled with False
        peakidx = peakidx.shift(-bw).fillna(False)

        df_peaks = df[peakidx]
        # when searching for max, exclude peaks from the excluded region
        exclusion_range = [10, self.read_length + 10]
        mask = np.logical_or(df_peaks.index < exclusion_range[0],
                             df_peaks.index > exclusion_range[1])
        df_peaks = df_peaks[mask]

        #
        max_peak = df_peaks.max()
        shift_peak = df_peaks.idxmax()

        # now, we select peaks that are at least 90% of main peak and with shift
        # higher than main shift. why higher ?
        mask = np.logical_and(df_peaks > max_peak * 0.9,
                              df_peaks.index >= shift_peak)
        best_df_peaks = df_peaks[mask]
        best = best_df_peaks.sort_values(ascending=False)[0:3]

        values = ",".join(["{:.5f}".format(x) for x in best.values])
        pos = ",".join([str(x) for x in best.index])
        print("Top 3 cross-correlation values: {}".format(values))
        print("Top 3 estimates for fragment length: {}".format(pos))

        # now the real window half size according to phantom peaks, not spp ...
        # min + (max-min)/3
        threshold = (df_peaks.max() - corr_min) / 3 + corr_min
        whs = df[df > threshold].index.max()

        # coming back to real cross correlation, identify peak in window
        # readlength +- 2*binning  !! not symmetry in phantompeak
        # x >= ( chip.data$read.length - round(2*binning) &
        # x <= ( chip.data$read.length + round(1.5*binning)

        binning = self.binning
        ph_min = self.read_length - round(2 * binning)
        ph_max = self.read_length + round(1.5 * binning)
        phantom = df[np.logical_and(df.index >= ph_min, df.index <= ph_max)]
        print("Phantom peak range detection:{}-{}".format(ph_min, ph_max))
        print("Phantom peak location:{}".format(phantom.idxmax()))
        print("Phantom peak Correlation: {:.5f}".format(phantom.max()))
        stats['phantom_corr'] = phantom.max()
        stats['phantom_location'] = int(phantom.idxmax())  # for json

        NSC = df_peaks.max() / phantom.max()
        # error in phantomPeak ?? it is encoded as follows but has no link with
        # the phantom peak...
        # Another difference with phantom peak is that the min in phantom peak
        # is not the min but last value on the RHS so
        #    phantom_coeff = df_peaks.max() /  df.min()
        # is
        #    phantom_coeff = df_peaks.max() /  df.iloc[-1]
        NSC_spp = df_peaks.max() / df.iloc[-1]
        print(
            "Normalized Strand cross-correlation coefficient (NSC): {:.5f} [{:.5f}]"
            .format(NSC, NSC_spp))
        RSC = (df_peaks.max() - df.min()) / (phantom.max() - df.min())
        RSC_spp = (df_peaks.max() - df.iloc[-1]) / (phantom.max() -
                                                    df.iloc[-1])
        print(
            "Relative Strand cross-correlation Coefficient (RSC): {:.5f} [{:.5f}]"
            .format(RSC, RSC_spp))

        if RSC < 0.25:
            tag = -2
        elif RSC >= 0.25 and RSC < 0.5:
            tag = -1
        elif RSC >= 0.5 and RSC < 1:
            tag = 0
        elif RSC >= 1 and RSC < 1.5:
            tag = 1
        elif RSC >= 1.5:
            tag = 2
        print("Phantom Peak Quality Tag: {}".format(tag))

        pylab.clf()
        df.plot()
        ##df_peaks.plot(marker="o", lw=0)
        ylim = pylab.ylim()
        #pylab.axvline(whs, ls='--', color='k', lw=1)
        Y0, Y1 = pylab.ylim()
        pylab.plot([phantom.idxmax(), phantom.idxmax()],
                   [Y0, phantom.max()],
                   ls='--',
                   color='k',
                   lw=1)
        pylab.plot([df.idxmax(), df.idxmax()], [Y0, df.max()],
                   ls='--',
                   color='r',
                   lw=2)
        #pylab.fill_betweenx(ylim, 10,85, color='grey', alpha=0.5)
        pylab.ylim(ylim)
        pylab.ylabel("Cross-correlation")
        pylab.xlabel(
            "strand-shift: {}bp\nNSC={:.5f}, RSC={:.5f}, Qtag={}".format(
                best.index[0], NSC, RSC, tag))
        pylab.xlim(self.start, self.stop)
        pylab.grid(True, zorder=-20)
        try:
            pylab.tight_layout()
        except Exception:
            pass
        stats['NSC'] = NSC
        stats['RSC'] = RSC
        stats['Qtag'] = tag
        return stats
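
NSC and RSC, as computed in this example, reduce to ratios of three cross-correlation values (the fragment-length peak, the phantom peak at the read length, and the background minimum); a worked toy computation with made-up numbers:

# made-up cross-correlation summary values
cc_fragment = 0.52   # df_peaks.max(): peak at the fragment length
cc_phantom = 0.48    # phantom.max(): phantom peak at the read length
cc_min = 0.40        # df.min(): background cross-correlation

NSC = cc_fragment / cc_phantom                        # as computed above
RSC = (cc_fragment - cc_min) / (cc_phantom - cc_min)  # as computed above
print(NSC, RSC)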
Example #24
    def plot(self,
             interpolation='None',
             aspect='auto',
             cmap='hot',
             tight_layout=True,
             colorbar=True,
             fontsize_x=None,
             fontsize_y=None,
             rotation_x=90,
             xticks_on=True,
             yticks_on=True,
             **kargs):
        """wrapper around imshow to plot a dataframe

        :param interpolation: set to None
        :param aspect: set to 'auto'
        :param cmap: colormap to be used.
        :param tight_layout:
        :param colorbar: add a colorbar (default to True)
        :param fontsize_x: fontsize on xlabels
        :param fontsize_y: fontsize on ylabels
        :param rotation_x: rotate labels on xaxis
        :param xticks_on: show (True) or hide (False) the xticks and labels
        :param yticks_on: show (True) or hide (False) the yticks and labels

        """

        data = self.df
        pylab.clf()
        pylab.imshow(data,
                     interpolation=interpolation,
                     aspect=aspect,
                     cmap=cmap,
                     **kargs)

        if fontsize_x is None:
            fontsize_x = 16  # FIXME use default values
        if fontsize_y is None:
            fontsize_y = 16  # FIXME use default values

        if yticks_on is True:
            pylab.yticks(range(0, len(data.index)),
                         data.index,
                         fontsize=fontsize_y)
        else:
            pylab.yticks([])
        if xticks_on is True:
            pylab.xticks(range(0, len(data.columns[:])),
                         data.columns,
                         fontsize=fontsize_x,
                         rotation=rotation_x)
        else:
            pylab.xticks([])

        if colorbar is True:
            pylab.colorbar()

        if tight_layout:
            pylab.tight_layout()

        # For some reason, in the newest versions of python/matplotlib, this is
        # required for ylim but not for xlim
        y1, y2 = pylab.ylim()
        pylab.ylim([y1 + 0.5, y2 - 0.5])
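
A short standalone usage sketch of the same imshow-style heatmap on a small DataFrame, including the half-pixel ylim adjustment from the final comment (plain pandas/pylab, not the class above):

import pandas as pd
import pylab

# toy 2 x 3 table
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]],
                  index=["row1", "row2"], columns=["A", "B", "C"])

pylab.imshow(df.values, interpolation="none", aspect="auto", cmap="hot")
pylab.yticks(range(len(df.index)), df.index)
pylab.xticks(range(len(df.columns)), df.columns, rotation=90)
pylab.colorbar()
pylab.tight_layout()

# as in the wrapper above: recent matplotlib may need the y-limits widened by
# half a pixel so the first and last rows are fully visible
y1, y2 = pylab.ylim()
pylab.ylim([y1 + 0.5, y2 - 0.5])
pylab.show()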