Beispiel #1
0
 def plot_contig_length_vs_GC(self, alpha=0.5):
     """Scatter plot of GC content (y) against contig length (x).

     :param alpha: transparency of the markers.

     Reads the ``length`` and ``GC`` columns of ``self.df``. The y-axis is
     fixed to [0, 100]; the x-axis runs from 0 to the largest length + 10.
     """
     lengths = self.df["length"]
     gc_values = self.df['GC']

     pylab.plot(lengths, gc_values, "o", alpha=alpha)
     pylab.xlabel("contig length (bp)")
     pylab.ylabel("GC (%)")
     pylab.grid(True)

     # fixed percentage range on y, small margin after the longest contig on x
     pylab.ylim([0, 100])
     pylab.xlim(0, max(lengths) + 10)
Beispiel #2
0
    def plot_specific_alignment(self,
                                query_name,
                                motif,
                                clf=True,
                                windows=[10, 50, 100, 200, 500, 1000]):
        """Plot sliding-window motif counts for one read of ``self.bamfile``.

        :param query_name: name of the alignment to search for.
        :param motif: sub-string counted inside each window.
        :param clf: clear the current figure before plotting.
        :param windows: window lengths; one curve is drawn per length.

        The whole BAM file is scanned and, if several alignments carry the
        requested name, the last one is used. For each window length the
        method prints the window and a score (number of positions whose count
        exceeds ``window / 6``, divided by 3). Prints ``Not found`` when no
        alignment matches.
        """
        match = None
        for alignment in BAM(self.bamfile):
            if alignment.query_name == query_name:
                match = alignment

        if not match:
            print("Not found")
            return

        # Detection
        sequence = match.query_sequence
        if clf:
            pylab.clf()
        for window in windows:
            counts = [
                sequence[start:start + window].count(motif)
                for start in range(len(sequence))
            ]
            pylab.plot(counts, label=window)
            n_above = sum([value > window / 6 for value in counts])
            print(window, n_above / 3.)
        pylab.legend()
        pylab.ylabel("# {} in a given sliding window".format(motif))
        pylab.title(query_name)
Beispiel #3
0
 def plot_bar(self, spikes_filename=None, ratio=100):
     """Bar plot of the spikes found, with scaled SIRV lengths overlaid.

     :param spikes_filename: forwarded to ``self.spikes_found``.
     :param ratio: divisor applied to the SIRV lengths before plotting.
     :return: the object returned by ``self.spikes_found``.
     """
     found = self.spikes_found(spikes_filename)
     sirv_lengths = [self.SIRV_lengths[name] for name in found.index]

     found.plot(kind="bar")
     # lengths are scaled down so that they fit on the same axis as the bars
     scaled_lengths = np.array(sirv_lengths) / ratio
     pylab.plot(scaled_lengths)
     pylab.tight_layout()
     return found
Beispiel #4
0
    def plot_gc_content(self, fontsize=16, ec="k", bins=100):
        """Plot the GC content histogram together with a normal curve.

        :param fontsize: font size of the x-axis label.
        :param ec: edge colour of the bars (default adds a black contour).
        :param bins: number of points used to build the bin edges between 0
            and 100, or an explicit sequence of bin edges.

        The dashed red curve is the normal density whose mean and standard
        deviation are those of the data returned by ``self.get_gc_content()``.

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam'))
            b.plot_gc_content()

        """
        data = self.get_gc_content()
        try:
            edges = np.linspace(0, 100, bins)
        except TypeError:
            # bins is not an integer: interpret it as explicit bin edges
            # (np.array copies, so the caller's object is left untouched)
            edges = np.array(bins, dtype=float)

        # ``normed`` was removed from matplotlib; ``density`` replaces it
        pylab.hist(data, edges, density=True, ec=ec)
        pylab.grid(True)
        mu = np.mean(data)
        sigma = np.std(data)

        X = np.linspace(edges.min(), edges.max(), 100)
        # pylab.normpdf is no longer available: evaluate the Gaussian directly
        pdf = np.exp(-0.5 * ((X - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))
        pylab.plot(X, pdf, lw=2, color="r", ls="--")
        pylab.xlabel("GC content", fontsize=fontsize)
Beispiel #5
0
    def plot_volcano(self):
        """Volcano plot: log2 fold change against -log10 adjusted p-value.

        Rows of ``self.df`` with ``padj`` above 0.05 are drawn in red, the
        others in black. A dashed horizontal line marks the 0.05 threshold and
        is reported in the legend. The x-axis is symmetric around zero and the
        y-axis starts at zero.

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_volcano()

        """
        not_significant = self.df.query("padj>0.05")
        significant = self.df.query("padj<=0.05")

        pylab.figure()
        for subset, colour in ((not_significant, "r"), (significant, "k")):
            pylab.plot(subset.log2FoldChange, -np.log10(subset.padj),
                marker="o", alpha=0.5, color=colour, lw=0)

        pylab.grid(True)
        # labels describe what is actually plotted on each axis
        pylab.xlabel("log2 fold change")
        pylab.ylabel("-log10 adjusted p-value")

        # symmetric x-range; same value as max(|min|, max) but ignores NaN
        limit = self.df.log2FoldChange.abs().max()
        pylab.xlim([-limit, limit])
        pylab.ylim([0, pylab.ylim()[1]])

        pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r", label="pvalue threshold (0.05)")
        # without a legend the label given to axhline is never displayed
        pylab.legend()
Beispiel #6
0
    def plot(self,
             color_line='r',
             bgcolor='grey',
             color='yellow',
             lw=4,
             hold=False,
             ax=None):
        """Plot the mean quality per position with a +/- one std band.

        :param color_line: colour of the mean-quality line.
        :param bgcolor: not used by this method (kept for compatibility).
        :param color: fill colour of the mean +/- std band.
        :param lw: line width of the mean-quality line.
        :param hold: not used by this method (kept for compatibility).
        :param ax: optional axes made current before plotting.

        Three background bands are drawn first: red for [0, 20], orange for
        [20, 30] and green for [30, 41]. The x positions come from ``self.X``
        or, when it is None, from ``1..self.xmax``.
        """
        xmax = self.xmax + 1
        if ax:
            pylab.sca(ax)

        # background bands
        for low, high, shade in ((0, 20, 'red'), (20, 30, 'orange'), (30, 41, 'green')):
            pylab.fill_between([0, xmax], [low, low], [high, high],
                               color=shade,
                               alpha=0.3)

        # X used to be bound only when self.X was None, which raised
        # UnboundLocalError as soon as self.X had been set.
        X = range(1, self.xmax + 1) if self.X is None else self.X

        mean = self.df.mean()
        std = self.df.std()
        pylab.fill_between(X,
                           mean + std,
                           mean - std,
                           color=color,
                           interpolate=False)

        pylab.plot(X, mean, color=color_line, lw=lw)
        pylab.ylim([0, 41])
        pylab.xlim([0, xmax])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Quality")
        pylab.grid(axis='x')
Beispiel #7
0
    def plot_alignment(self, bamfile, motif, window=200,
            global_th=10,title=None,legend=True, legend_fontsize=11,
            valid_rnames=[],
            valid_flags=[]):
        """Plot the sliding-window motif count of the alignments in a BAM file.

        :param bamfile: BAM file to read.
        :param motif: sub-string counted inside each window.
        :param window: length of the sliding window.
        :param global_th: not used by this method.
        :param title: optional figure title.
        :param legend: show a legend (only when fewer than 15 curves).
        :param legend_fontsize: font size of the legend.
        :param valid_rnames: if not empty, keep only alignments whose
            ``rname`` is in this list.
        :param valid_flags: if not empty, keep only alignments whose ``flag``
            is in this list.

        Alignments without a query sequence are skipped. The y-axis upper
        limit is 1.2 times ``window / len(motif)``.
        """
        bam = BAM(bamfile)
        print("Found {} hits".format(len(bam)))
        pylab.clf()

        shown = 0
        for alignment in bam:
            keep = ((not valid_rnames or alignment.rname in valid_rnames)
                    and (not valid_flags or alignment.flag in valid_flags))
            if not keep:
                continue

            sequence = alignment.query_sequence
            if not sequence:
                continue

            shown += 1
            counts = [sequence[start:start + window].count(motif)
                      for start in range(len(sequence))]
            first = alignment.reference_start
            positions = range(first, alignment.reference_start + len(sequence))
            pylab.plot(positions, counts, label=alignment.query_name)

        print("Showing {} entries after filtering".format(shown))
        pylab.ylim([0, int(1.2 * window / len(motif))])
        if legend and shown < 15:
            pylab.legend(fontsize=legend_fontsize)
        if title:
            pylab.title(title, fontsize=16)
Beispiel #8
0
def find_motif(bamfile, motif="CAGCAG", window=200, savefig=False,
    local_th=5, global_th=10):
    """Scan a BAM file for alignments enriched in a motif.

    For every alignment with a query sequence, the motif is counted in a
    window of length *window* starting at each position. The score of the
    alignment is the number of positions whose count is greater than
    *local_th*. An alignment is a hit when its score is greater than
    *global_th*; hits are plotted and optionally saved as PNG files.

    :param bamfile: BAM file to read.
    :param motif: sub-string to count.
    :param window: length of the sliding window.
    :param savefig: save one PNG per hit, named
        ``<reference_name>_<score>_<query_name>.png``.
    :param local_th: per-position count threshold (strictly greater than).
    :param global_th: score threshold (strictly greater than).
    :return: three lists of equal length: the alignments, a boolean telling
        whether each one is a hit, and the scores. Alignments without a
        sequence are not included.
    """
    b1 = BAM(bamfile)

    found = []
    Ss = []
    alns = []
    for a in b1:
        seq = a.query_sequence
        if seq is None:
            continue
        X1 = [seq[i:i+window].count(motif) for i in range(len(seq))]
        S = sum([x > local_th for x in X1])
        Ss.append(S)
        # the original appended to the undefined name ``als`` (NameError)
        alns.append(a)

        is_hit = S > global_th
        found.append(is_hit)
        if is_hit:
            off = a.query_alignment_start
            start = off + a.reference_start
            pylab.clf()
            pylab.plot(range(start, start + len(seq)), X1)
            if savefig:
                pylab.savefig("{}_{}_{}.png".format(a.reference_name, S, a.query_name.replace("/", "_")))

    return alns, found, Ss
Beispiel #9
0
 def plot_bar(self, spikes_filename=None, ratio=100):
     """Draw the spikes found as bars and overlay the SIRV lengths / ratio.

     :param spikes_filename: passed to ``self.spikes_found``.
     :param ratio: the SIRV lengths are divided by this value before being
         plotted on top of the bars.
     :return: whatever ``self.spikes_found`` returned.
     """
     spikes = self.spikes_found(spikes_filename)
     lengths = []
     for identifier in spikes.index:
         lengths.append(self.SIRV_lengths[identifier])

     spikes.plot(kind="bar")
     pylab.plot(np.array(lengths) / ratio)
     pylab.tight_layout()
     return spikes
Beispiel #10
0
    def plot_specific_alignment(self, bamfile, query_name, motif,clf=True,
            show_figure=True, authorized_flags=[0,16],
            windows=[10, 50, 100, 150,200, 250,500, 1000], local_threshold=5):
        """Compute (and optionally plot) motif counts for one read of a BAM.

        :param bamfile: BAM file to read.
        :param query_name: name of the alignment to look for.
        :param motif: sub-string counted inside each window.
        :param clf: clear the current figure first.
        :param show_figure: draw one curve per window plus legend and labels.
        :param authorized_flags: only alignments with one of these flags are
            considered.
        :param windows: window lengths to try.
        :param local_threshold: per-position count threshold (strictly
            greater than).
        :return: list with, for each window, the number of positions above
            the threshold minus the window length. Empty if no alignment
            matched, in which case a message is printed.
        """
        # several entries may share the name: the first acceptable one is used
        candidates = (
            aln for aln in BAM(bamfile)
            if aln.query_name == query_name and aln.flag in authorized_flags
        )
        match = next(candidates, None)

        sizes = []
        if not match:
            print("{} Not found in {} file".format(query_name, bamfile))
            return sizes

        # Detection
        sequence = match.query_sequence
        if clf:
            pylab.clf()
        for window in windows:
            counts = [sequence[start:start + window].count(motif)
                      for start in range(len(sequence))]
            if show_figure:
                pylab.plot(counts, label=window)
            n_above = sum([value > local_threshold for value in counts])
            sizes.append(n_above - window)

        if show_figure:
            pylab.legend()
            pylab.ylabel("# {} in a given sliding window".format(motif))
            pylab.title(query_name)

        return sizes
Beispiel #11
0
    def get_max_gc_correlation(self, reference):
        """Search the GC window size with scipy's ``fmin`` and plot the trials.

        The GC content is computed with a moving window; its size affects
        the correlation between coverage and GC content. Starting from a
        window of 100, ``fmin`` is run on the value returned by
        ``self.get_gc_correlation()`` after recomputing the GC content with
        the candidate window. Candidates below 10 are given the value 0
        without any computation.

        :param reference: passed to ``self.bed.compute_gc_content``.
        :return: first element of the array returned by ``fmin``.

        NOTE(review): ``fmin`` is a minimiser; confirm that minimising the
        raw correlation is what "max" in the method name intends.
        """
        pylab.clf()
        tried_windows = []
        correlations = []

        def objective(params):
            window = int(round(params[0]))
            if window < 10:
                return 0
            self.bed.compute_gc_content(reference, window)
            value = self.get_gc_correlation()
            # keep track of every evaluation for the plot below
            correlations.append(value)
            tried_windows.append(window)
            return value

        from scipy.optimize import fmin
        optimum = fmin(objective, 100, xtol=1, disp=False)

        pylab.plot(tried_windows, correlations, "o")
        pylab.xlabel("GC window size")
        pylab.ylabel("Correlation")
        pylab.grid()
        return optimum[0]
Beispiel #12
0
    def plot_idr_vs_peaks(self, filename=None, savefig=False):
        """Plot the IDR value against the cumulative number of peaks.

        :param filename: output file, used only when *savefig* is True.
        :param savefig: save the figure into *filename*.

        Peaks with ``idr`` below ``self.threshold`` are drawn in red, the
        others in black (shifted by the number of peaks below the threshold).
        Dashed blue lines mark IDR = 0.05 and ``self.N_significant_peaks``.
        """
        pylab.clf()
        X1 = pylab.linspace(0, self.threshold, 100)
        X2 = pylab.linspace(self.threshold, 1, 100)

        df1 = self.df.query("idr<@self.threshold")
        # the original query string was mangled into "idr>[email protected]"
        # by an e-mail obfuscation filter; this is the complement of df1
        df2 = self.df.query("idr>=@self.threshold")

        pylab.plot([sum(df1['idr'] < x) for x in X1], X1, '-', color='r', lw=2)
        shift = len(df1)

        pylab.plot([shift + sum(df2['idr'] < x) for x in X2],
                   X2,
                   "-",
                   color='k',
                   lw=2)
        pylab.xlabel('Number of significant peaks')
        pylab.ylabel('IDR')
        pylab.axhline(0.05, color='b', ls='--')
        pylab.axvline(self.N_significant_peaks, color='b', ls='--')
        if savefig:
            pylab.savefig(filename)
Beispiel #13
0
    def plot_gc_content(self, fontsize=16, ec="k", bins=100):
        """Plot the GC content histogram together with a normal curve.

        :param fontsize: font size of the x-axis label.
        :param ec: edge colour of the bars (default adds a black contour).
        :param bins: number of points used to build the bin edges between 0
            and 100, or an explicit sequence of bin edges.

        The dashed red curve is ``normpdf`` evaluated with the mean and
        standard deviation of the data from ``self.get_gc_content()``.

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam'))
            b.plot_gc_content()

        """
        data = self.get_gc_content()
        try:
            edges = np.linspace(0, 100, bins)
        except TypeError:
            # bins is not an integer: interpret it as explicit bin edges.
            # np.array copies and also accepts plain lists, which have no
            # min()/max() methods.
            edges = np.array(bins, dtype=float)

        pylab.hist(data, edges, density=True, ec=ec)
        pylab.grid(True)
        mu = pylab.mean(data)
        sigma = pylab.std(data)

        X = pylab.linspace(edges.min(), edges.max(), 100)

        from sequana.misc import normpdf

        pylab.plot(X, normpdf(X, mu, sigma), lw=2, color="r", ls="--")
        # honour the fontsize argument instead of a hard-coded 16
        pylab.xlabel("GC content", fontsize=fontsize)
Beispiel #14
0
    def check(self, bins=60):
        """Overlay the target distribution on the histogram of ``self.vec``.

        :param bins: passed to the histogram of ``self.vec``.

        The target curve (``self.Xtarget``, ``self.Ytarget``) is rescaled so
        that its maximum equals the highest histogram bar.
        """
        heights = pylab.hist(self.vec, bins, density=True, lw=0.5, ec="k")[0]
        tallest_bar = max(heights)

        # this normalisation is an approximation/hack
        scale = max(self.Ytarget) / tallest_bar
        pylab.plot(self.Xtarget, self.Ytarget / scale, "-ro")
        pylab.title("simulated (blue) and target (red) distributions")
Beispiel #15
0
    def plot_polymerase_per_barcode(self, fontsize=12, unbarcoded=True):
        """Plot the number of polymerase reads per barcode, in rank order.

        :param fontsize: font size of the axis labels.
        :param unbarcoded: when True, also draw a dashed line at the first
            ``Polymerase Reads`` value of ``self.df_not_barcoded`` (skipped
            silently if that value cannot be read).

        A red horizontal line shows the average over barcodes and the title
        reports the total count.
        """
        PR = self.df_barcoded["Polymerase Reads"].sum()
        data = self.df_barcoded['Polymerase Reads'].sort_values(
            ascending=False).values
        pylab.plot(range(1, len(data) + 1), data, label="barcodes")
        pylab.axhline(data.mean(), color="r", label="average")

        if unbarcoded is True:
            try:
                unbar = self.df_not_barcoded['Polymerase Reads'].iloc[0]
            except Exception:
                # best effort: the non-barcoded information is optional
                pass
            else:
                pylab.axhline(unbar, color="k", ls="--", label="not barcoded")

        pylab.xlabel("Barcode Rank Order", fontsize=fontsize)
        pylab.ylabel("Counts of Reads", fontsize=fontsize)
        pylab.title("Total Polymerase count: {}".format(PR))
        pylab.legend()
        # ``ymin`` is no longer accepted by matplotlib; ``bottom`` replaces it
        pylab.ylim(bottom=0)
        try:
            pylab.tight_layout()
        except Exception:
            # a failing layout adjustment must not prevent the plot
            pass
Beispiel #16
0
 def plot(self, fontsize=16):
     """Plot ``self.quality`` against the base position.

     :param fontsize: font size of both axis labels.

     The curve is labelled with ``self.offset``. The y-range is the
     autoscaled range widened by one unit on each side, never below 0.
     """
     curve_label = "offset: %s" % self.offset
     pylab.plot(self.quality, label=curve_label)
     pylab.xlabel('base position', fontsize=fontsize)
     pylab.ylabel('Quality per base', fontsize=fontsize)
     pylab.grid(True)

     # Setting ylim switches autoscale off; it is switched back on first so
     # that repeated calls start from the automatic limits.
     pylab.autoscale()
     lower, upper = pylab.ylim()
     pylab.ylim(max(0, lower - 1), upper + 1)
Beispiel #17
0
 def plot(self, fontsize=16):
     """Draw the quality per base (``self.quality``) versus base position.

     :param fontsize: font size used for the x and y labels.

     The legend label of the curve reports ``self.offset``.
     """
     pylab.plot(self.quality, label="offset: %s" % self.offset)
     pylab.xlabel('base position', fontsize=fontsize)
     pylab.ylabel('Quality per base', fontsize=fontsize)
     pylab.grid(True)

     # ylim() disables autoscaling, so re-enable it before reading the
     # limits; otherwise a second call would reuse the previous range
     pylab.autoscale()
     ymin, ymax = pylab.ylim()
     new_bottom = max(0, ymin - 1)
     new_top = ymax + 1
     pylab.ylim(new_bottom, new_top)
Beispiel #18
0
 def plot(self, clf=True):
     """Plot ``shustring_length`` from ``self.df_shustring``.

     :param clf: clear the current figure first.

     Dashed grey horizontal lines are drawn every 1000 units from 0 up to
     the largest value, which is also printed.
     """
     if clf:
         pylab.clf()

     lengths = self.df_shustring.shustring_length
     longest = lengths.max()
     print(longest)

     n_lines = int(longest / 1000) + 1
     for level in range(0, n_lines * 1000, 1000):
         pylab.axhline(level, ls='--', color='grey')

     pylab.plot(lengths)
     pylab.xlabel('position (bp)')
     pylab.ylabel('Length of repeats')
     pylab.ylim(bottom=0)
Beispiel #19
0
    def run(self,
            bins=50,
            xmin=0,
            xmax=30000,
            step=1000,
            burn=1000,
            alpha=1,
            output_filename=None):
        """Select simulated reads with a Metropolis-Hastings like scheme.

        :param bins: number of bins of the read-length histograms.
        :param xmin: start of the grid on which the target curve is drawn.
        :param xmax: end (excluded) of that grid.
        :param step: spacing of that grid.
        :param burn: number of accepted values dropped from the start of the
            chain before drawing its histogram.
        :param alpha: upper bound of the acceptance probability.
        :param output_filename: if given, ``self.bam_simul.filter_bool`` is
            called with this name and the accept/reject flags.

        Each read length of ``self.bam_simul`` is a candidate. It is accepted
        with probability ``min(alpha, target(candidate) / target(current))``
        where ``current`` is the last accepted value (initially the mean read
        length of ``self.bam``). The flags are stored in ``self.tokeep``.
        """
        # compute histogram of the input reads once for all to be used
        # in the target_distribution method
        self.bins = bins
        # ``normed`` was removed from numpy.histogram; ``density`` replaces it
        self.Y, self.X = np.histogram(self.bam.df.read_length,
                                      bins=bins,
                                      density=True)

        lengths = self.bam_simul.df.read_length.values
        self.tokeep = []
        vec = []
        x = self.bam.df.read_length.mean()
        for can in lengths:
            # acceptance probability
            aprob = min([
                alpha,
                self.target_distribution(can) / self.target_distribution(x)
            ])
            u = pylab.uniform(0, 1)
            if u < aprob:
                x = can
                vec.append(x)
                self.tokeep.append(True)
            else:
                self.tokeep.append(False)

        # plotting the results: chain on top, histogram and target below
        x = pylab.arange(xmin, xmax, step)
        y = self.target_distribution(x)
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(vec)
        pylab.subplot(212)

        # ``normed`` was removed from matplotlib's hist as well
        pylab.hist(vec[burn:], bins=bins, density=True)
        pylab.plot(x, y, 'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF', 'Samples'))

        if output_filename is not None:
            self.bam_simul.filter_bool(output_filename, self.tokeep)
Beispiel #20
0
    def plot(self,
             normed=True,
             N=1000,
             Xmin=None,
             Xmax=None,
             bins=50,
             color='red',
             lw=2,
             hist_kw={
                 'color': '#5F9EA0',
                 "edgecolor": "k"
             },
             ax=None):
        """Plot the data histogram, the fitted model and its components.

        :param normed: passed to the histogram as ``density``.
        :param N: number of points of the grid on which curves are evaluated.
        :param Xmin: start of the grid (default: minimum of the data).
        :param Xmax: end of the grid (default: maximum of the data).
        :param bins: number of histogram bins.
        :param color: colour of the fitted model curve.
        :param lw: line width of the fitted model curve.
        :param hist_kw: extra keyword arguments for the histogram (only
            unpacked, never modified).
        :param ax: axes to draw on; the pylab state machine is used if None.

        The model curve is ``self.model.pdf(x, self.results.x)``. Each of the
        ``self.k`` components is drawn as a dashed black normal density
        weighted by its proportion.
        """
        import scipy.stats as ss

        # pylab and Axes expose the same hist/plot interface, so a single
        # code path serves both cases. The ``ax`` branch used to pass the
        # removed ``normed`` keyword to hist.
        target = ax if ax else pylab

        target.hist(self.data, density=normed, bins=bins, **hist_kw)
        if Xmin is None:
            Xmin = self.data.min()
        if Xmax is None:
            Xmax = self.data.max()
        X = pylab.linspace(Xmin, Xmax, N)

        target.plot(X, [self.model.pdf(x, self.results.x) for x in X],
                    color=color,
                    lw=lw)

        for i in range(self.k):
            mu = self.results.mus[i]
            sigma = self.results.sigmas[i]
            pi_ = self.results.pis[i]
            target.plot(X, [pi_ * ss.norm.pdf(x, mu, sigma) for x in X],
                        'k--',
                        alpha=0.7,
                        lw=2)
Beispiel #21
0
    def plot_coverage(self):
        """Plot ``self.coverage``, computing it first if needed.

        If reading ``self.coverage`` fails, ``self._set_coverage()`` is
        called before plotting. See :class:`GenomeCov` for more sophisticated
        tools to plot the genome coverage.

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("measles.fa.sorted.bam"))
            b.plot_coverage()

        """
        try:
            self.coverage
        except:  # noqa: E722 - any failure triggers the computation
            self._set_coverage()

        pylab.plot(self.coverage)
        pylab.xlabel("Coverage")
Beispiel #22
0
    def plot_coverage(self):
        """Plot ``self.coverage``, computing it first if needed.

        If reading ``self.coverage`` fails, ``self.set_fast_stats()`` is
        called before plotting. See :class:`GenomeCov` for more sophisticated
        tools to plot the genome coverage.

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("measles.fa.sorted.bam"))
            b.plot_coverage()

        """
        try:
            self.coverage
        except:  # noqa: E722 - any failure triggers the computation
            self.set_fast_stats()

        pylab.plot(self.coverage)
        pylab.xlabel("Coverage")
Beispiel #23
0
    def boxplot_quality(self, color_line='r', bgcolor='grey', color='yellow', lw=4, 
            hold=False, ax=None):
        """Plot the mean quality per position with a +/- one std band.

        :param color_line: colour of the mean-quality line.
        :param bgcolor: not used by this method.
        :param color: fill colour of the mean +/- std band.
        :param lw: line width of the mean-quality line.
        :param hold: not used by this method.
        :param ax: optional axes made current before plotting.

        Uses the columns ``"0"`` to ``"41"`` of ``self.df`` and
        ``self.metadata['ReadNum']``, for positions 1 to 150. Sets
        ``self.xmax`` to 150 and prints the lengths of the three computed
        series.
        """
        # NOTE(review): the original comment wondered why phred scores span
        # 0 to 41
        quality = self.df[[str(score) for score in range(42)]]
        N = self.metadata['ReadNum']
        # NOTE(review): this table-wide value is replaced inside the loop
        # below before being read
        proba = quality / N

        self.xmax = 150
        xmax = self.xmax + 1
        if ax:
            pylab.sca(ax) # pragma no cover
        for low, high, shade in ((0, 20, 'red'), (20, 30, 'orange'), (30, 41, 'green')):
            pylab.fill_between([0, xmax], [low, low], [high, high], color=shade, alpha=0.3)

        positions = []
        means = []
        spreads = []
        for pos in range(1, 151):
            row = quality.loc[pos]
            weighted = [(int(score) + 1) * count for score, count in row.items()]
            mean_quality = sum(weighted) / N
            positions.append(pos)
            means.append(mean_quality)

            proba = row / N
            squared_dev = [(x - mean_quality) ** 2 * y
                           for x, y in zip(range(42), proba)]
            spreads.append(pylab.sqrt(sum(squared_dev)))

        print(len(positions))
        print(len(means))
        print(len(spreads))

        Q = np.array(means)
        X = np.array(positions)
        S = np.array(spreads)
        pylab.fill_between(X, Q + S, Q - S, color=color, interpolate=False)

        pylab.plot(X, Q, color=color_line, lw=lw)
        pylab.ylim([0, 41])
        pylab.xlim([0, self.xmax + 1])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Quality")
        pylab.grid(axis='x')
Beispiel #24
0
    def plot_read_length(self):
        """Plot the occurrences of aligned read lengths.

        The two series returned by ``self._get_read_length()`` are plotted
        against each other; the legend reports the smallest and largest
        value of the first one.

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("test.bam"))
            b.plot_read_length()

        """
        lengths, counts = self._get_read_length()
        legend_text = "min length:{}; max length:{}".format(
            min(lengths), max(lengths))
        pylab.plot(lengths, counts, label=legend_text)
        pylab.grid()
        pylab.xlabel("Read length", fontsize=16)
        pylab.legend()
Beispiel #25
0
    def plot_read_length(self):
        """Plot how often each aligned read length occurs.

        Data come from ``self._get_read_length()``, which returns the x and
        y series. The curve label gives the minimum and maximum of the x
        series.

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("test.bam"))
            b.plot_read_length()

        """
        X, Y = self._get_read_length()
        shortest = min(X)
        longest = max(X)
        pylab.plot(
            X, Y,
            label="min length:{}; max length:{}".format(shortest, longest))
        pylab.grid()
        pylab.xlabel("Read length", fontsize=16)
        pylab.legend()
Beispiel #26
0
    def diagnostics(self, bins=60, clear=True):
        """Draw three stacked diagnostic panels.

        :param bins: number of bins of the two histograms.
        :param clear: clear the current figure first.

        Panels, top to bottom: histogram of ``self.aprob``, trace of
        ``self.vec``, and histogram of ``self.vec`` with the rescaled target
        curve (``self.Xtarget``, ``self.Ytarget``) on top.
        """
        if clear:
            pylab.clf()

        # panel 1: acceptance values
        pylab.subplot(3, 1, 1)
        pylab.hist(self.aprob, bins=bins)
        pylab.title("Acceptation")

        # panel 2: the chain itself
        pylab.subplot(3, 1, 2)
        pylab.plot(self.vec)
        pylab.title("proposition")

        # panel 3: simulated versus target distribution
        pylab.subplot(3, 1, 3)
        heights = pylab.hist(self.vec, bins, density=True, lw=0.5, ec="k")[0]
        tallest_bar = max(heights)

        # this normalisation is an approximation/hack
        scale = max(self.Ytarget) / tallest_bar
        pylab.plot(self.Xtarget, self.Ytarget / scale, "-ro")
        pylab.title("simulated (blue) and target (red) distributions")
Beispiel #27
0
    def find_motif(self, motif, window=200, figure=False, savefig=False):
        """Score every alignment of ``self.bamfile`` for a motif.

        :param motif: sub-string to count.
        :param window: length of the sliding window.
        :param figure: plot the per-position counts of the alignments whose
            score reaches ``self.global_threshold``.
        :param savefig: with *figure*, also save each plot as
            ``<reference_name>_<score>_<query_name>.png``.
        :return: DataFrame with the columns ``query_name``, ``hit``,
            ``length``, ``start`` and ``end``.

        The score (``hit``) is the number of positions whose window holds at
        least ``self.local_threshold`` motifs. Alignments without a sequence
        are ignored. The number of rows with ``hit`` above 5 is printed.
        """
        names, hits, lengths, starts, ends = [], [], [], [], []

        for aln in BAM(self.bamfile):
            sequence = aln.query_sequence
            if sequence is None:
                continue

            counts = [
                sequence[pos:pos + window].count(motif)
                for pos in range(len(sequence))
            ]
            score = sum([value >= self.local_threshold for value in counts])

            names.append(aln.query_name)
            starts.append(aln.reference_start)
            ends.append(aln.reference_end)
            lengths.append(aln.rlen)
            hits.append(score)

            if score < self.global_threshold:
                continue

            offset = aln.query_alignment_start
            if figure:
                first = offset + aln.reference_start
                last = offset + aln.reference_start + len(sequence)
                pylab.plot(range(first, last), counts)
                if savefig:
                    pylab.savefig("{}_{}_{}.png".format(
                        aln.reference_name, score,
                        aln.query_name.replace("/", "_")))

        table = pd.DataFrame({
            "query_name": names,
            "hit": hits,
            "length": lengths,
            "start": starts,
            "end": ends
        })
        print(len(table.query("hit>5")))
        return table
Beispiel #28
0
    def plot_alignment(self,
                       motif,
                       window=200,
                       global_th=10,
                       title=None,
                       legend=True,
                       legend_fontsize=11):
        """Plot the alignments of ``self.bamfile`` that match the motif.

        :param motif: sub-string counted inside each window.
        :param window: length of the sliding window.
        :param global_th: passed to ``self._get_aligments``.
        :param title: optional figure title.
        :param legend: show a legend (only when fewer than 15 curves).
        :param legend_fontsize: font size of the legend.
        :return: the DataFrame returned by ``self._get_aligments``.

        Only alignments whose name appears in the ``query_name`` column of
        that DataFrame, and which have a query sequence, are drawn.
        """
        df = self._get_aligments(motif=motif,
                                 window=window,
                                 global_th=global_th)
        print("Found {} hits".format(len(df)))
        bam = BAM(self.bamfile)
        pylab.clf()

        selected_names = df.query_name.values
        n_curves = 0
        for alignment in bam:
            if alignment.query_name not in selected_names:
                continue
            sequence = alignment.query_sequence
            if not sequence:
                continue

            n_curves += 1
            counts = [
                sequence[pos:pos + window].count(motif)
                for pos in range(len(sequence))
            ]
            first = alignment.reference_start
            last = alignment.reference_start + len(sequence)
            pylab.plot(range(first, last), counts, label=alignment.query_name)

        # upper limit: 1.2 times window / len(motif)
        pylab.ylim([0, int(1.2 * window / len(motif))])
        if legend and n_curves < 15:
            pylab.legend(fontsize=legend_fontsize)
        if title:
            pylab.title(title, fontsize=16)

        return df
Beispiel #29
0
    def plot_contig_length_vs_nreads(self, fontsize=16):
        """Log-log plot of the number of reads against contig length.

        :param fontsize: font size of the axis labels.

        A straight line fitted by least squares on the contigs with more than
        10 reads and a length above 100000 is drawn in red between the
        smallest and largest contig length. ``self.get_df()`` is called first
        when ``self._df`` is None.
        """
        # same as plot_scatter_contig_length_nread_cov
        if self._df is None:
            _ = self.get_df()
        pylab.clf()
        df = self._df

        m1 = df.length.min()
        M1 = df.length.max()
        pylab.loglog(df.length, df.nread, "o")
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("Contig N reads", fontsize=fontsize)
        pylab.grid()

        # run the selection once and reuse it for both columns
        subset = df.query("nread>10 and length>100000")
        X = subset['length']
        Y = subset['nread']
        A = np.vstack([X, np.ones(len(X))]).T
        # Series.as_matrix() was removed from pandas; .values is equivalent
        m, c = np.linalg.lstsq(A, Y.values)[0]
        x = np.array([m1, M1])
        pylab.plot(x, m * x + c, "o-r")
        pylab.tight_layout()
Beispiel #30
0
    def plot(self,
             X=[0, 0.1, 0.2, 0.3, .4, .5, .6, .7, .8, .9, .95, .99, .999, 1],
             fontsize=16,
             label=None):
        """Plot the percentage of genes whose coverage exceeds each threshold.

        :param X: coverage thresholds (fractions); shown on the x-axis as
            percentages.
        :param fontsize: font size of the axis labels.
        :param label: optional legend label of the curve.

        For each threshold, the y value is the percentage of rows of
        ``self.df`` whose ``self.coverage_column`` value is strictly greater
        than it. Dashed red guide lines are drawn at 25, 50 and 75 on both
        axes.
        """
        column = self.coverage_column
        n_genes = float(len(self.df))
        thresholds = np.array(X)
        percentages = np.array(
            [sum(self.df[column] > value) / n_genes * 100 for value in thresholds])

        # only pass ``label`` when the caller provided one
        extra = {} if label is None else {"label": label}
        pylab.plot(thresholds * 100, percentages, "o-", **extra)

        pylab.xlabel("Gene coverage (%)", fontsize=fontsize)
        pylab.ylabel("Percentage of genes covered", fontsize=fontsize)
        for level in (25, 50, 75):
            pylab.axhline(level, color="r", alpha=0.5, ls="--")
            pylab.axvline(level, color="r", alpha=0.5, ls="--")
Beispiel #31
0
 def plot_ranks(self, filename=None, savefig=False):
     """Scatter plot of the peak ranks in replicate 1 versus replicate 2.

     :param filename: output file, used only when *savefig* is True.
     :param savefig: save the figure into *filename*.

     Rows of ``self.df`` with ``score`` above 540 are drawn in black and
     labelled ``<0.05 IDR``; the others are red and labelled ``>=0.05 IDR``.
     A dashed blue diagonal goes from 0 to the number of rows.

     NOTE(review): the original comment describes ``score`` as a scaled IDR
     (1000 for IDR 0, 540 for IDR 0.05, 0 for IDR 1) -- confirm against the
     producer of that column.
     """
     groups = (
         (self.df.query('score>540'), 'ko', '<0.05 IDR'),
         (self.df.query('score<=540'), 'ro', '>=0.05 IDR'),
     )
     pylab.clf()
     for subset, fmt, legend_label in groups:
         pylab.plot(subset.rep1_rank,
                    subset.rep2_rank,
                    fmt,
                    alpha=0.5,
                    label=legend_label)
     pylab.xlabel("Peak rank - replicate 1")
     pylab.ylabel("Peak rank - replicate 2")

     n_peaks = len(self.df)
     pylab.plot([0, n_peaks], [0, n_peaks], color='blue', alpha=0.5, ls='--')
     pylab.legend(loc='lower right')
     if savefig:
         pylab.savefig(filename)
Beispiel #32
0
    def plot_dispersion(self):
        """Plot three dispersion series of ``self.dds_stats`` on log-log axes.

        ``dispGeneEst`` (black, "Estimate"), ``dispersion`` (blue, "final")
        and ``dispFit`` (red, "Fit") are each plotted against ``baseMean``.
        Title and axis labels are set through ``self._format_plot``.
        """
        series = (
            ("dispGeneEst", "ok", "Estimate"),
            ("dispersion", "ob", "final"),
            ("dispFit", "or", "Fit"),
        )
        for column, fmt, legend_label in series:
            pylab.plot(
                self.dds_stats.baseMean,
                getattr(self.dds_stats, column),
                fmt,
                label=legend_label,
                ms=1,
            )
        pylab.legend()

        # both axes on a logarithmic scale
        axes = pylab.gca()
        axes.set(yscale="log")
        axes.set(xscale="log")

        self._format_plot(
            title="Dispersion estimation",
            xlabel="Mean of normalized counts",
            ylabel="Dispersion",
        )
Beispiel #33
0
    def plot_scatter_contig_length_nread_cov(self,
                                             fontsize=16,
                                             vmin=0,
                                             vmax=50,
                                             min_nreads=20,
                                             min_length=50000):
        """Scatter plot of reads versus contig length, coloured by covStat.

        :param fontsize: font size of the axis labels.
        :param vmin: lower bound of the colour scale.
        :param vmax: upper bound of the colour scale.
        :param min_nreads: contigs need more reads than this to enter the
            fit.
        :param min_length: contigs need to be longer than this to enter the
            fit.

        All contigs are drawn on log-log axes. A least-squares straight line,
        fitted only on the contigs passing both filters, is drawn in red
        between the smallest and largest contig length. ``self.get_df()`` is
        called first when ``self._df`` is None.
        """
        if self._df is None:
            _ = self.get_df()
        pylab.clf()
        df = self._df

        m1 = df.length.min()
        M1 = df.length.max()

        # least square, restricted to the selected contigs; the selection is
        # run once (debugging prints of the selection were removed)
        subset = df.query("nread>@min_nreads and length>@min_length")
        A = np.vstack([subset['length'], np.ones(len(subset))]).T
        # Series.as_matrix() was removed from pandas; .values is equivalent
        m, c = np.linalg.lstsq(A, subset['nread'].values)[0]
        x = np.array([m1, M1])

        pylab.scatter(df['length'], df['nread'], c=df['covStat'],
                      vmin=vmin, vmax=vmax)
        pylab.colorbar()
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("Contig reads", fontsize=fontsize)
        pylab.title("coverage function of contig length and reads used")
        pylab.grid()
        pylab.plot(x, m * x + c, "o-r")
        pylab.loglog()
        pylab.tight_layout()
Beispiel #34
0
    def plot_pca_vs_max_features(self, step=100, n_components=2,
            progress=True):
        """Plot the value of each component against the number of features.

        :param step: increment of the number of features (capped to the
            number of rows of ``self.df``).
        :param n_components: number of components, 2, 3 or 4.
        :param progress: animate a progress bar while computing.

        ``self.plot`` is called with ``show_plot=False`` for ``max_features``
        going from 10 to the number of rows; the entries of each result are
        multiplied by 100 and drawn in one panel per component.

        .. plot::
            :include-source:

            from sequana.viz.pca import PCA
            from sequana import sequana_data
            import pandas as pd

            data = sequana_data("test_pca.csv")
            df = pd.read_csv(data)
            df = df.set_index("Id")

            p = PCA(df)
            p.plot_pca_vs_max_features()

        """
        assert n_components in [2,3,4]
        n_rows = len(self.df)
        if step > n_rows:
            step = n_rows

        # the scan starts with 10 features
        feature_counts = range(10, n_rows, step)
        from easydev import Progress
        bar = Progress(len(feature_counts))
        results = []
        for done, n_features in enumerate(feature_counts, start=1):
            results.append(
                self.plot(n_components=n_components, max_features=n_features,
                          show_plot=False))
            if progress:
                bar.animate(done)

        # panels 1 and 2 are always drawn; 3 and 4 only if requested
        for k in range(4):
            if k >= 2 and n_components <= k:
                continue
            pylab.subplot(n_components, 1, k + 1)
            pylab.plot(feature_counts, [res[k] * 100 for res in results])
            pylab.ylabel("PC{} (%)".format(k + 1))
Beispiel #35
0
    def plot(self):
        """Plot the reads in peaks against FRiP, one series per label.

        When ``self.design`` is set, the ``label`` column of ``self.df`` is
        first rebuilt as ``type/condition`` from ``self.design.df``. A dashed
        blue line joins the origin to the point (max FRiP, max in_peaks) and
        both axes start at 0.
        """
        if self.design:
            design = self.design.df
            self.df['label'] = design['type'] + "/" + design['condition']

        pylab.clf()
        frip_max = self.df.FRiP.max()
        peaks_max = self.df['in_peaks'].max()
        pylab.plot([0, frip_max], [0, peaks_max], ls='--', color='b', alpha=0.5)

        # the loop variable must be called ``label``: the query refers to it
        for label in self.df['label'].unique():
            subset = self.df.query('label==@label')
            subset.plot(x='FRiP',
                        y='in_peaks',
                        marker="o",
                        lw=0,
                        label=label,
                        ax=pylab.gca())

        pylab.ylabel('Reads in peaks')
        pylab.xlabel('FRiP')
        # keep the automatic upper limits but start both axes at zero
        right = pylab.xlim()[1]
        pylab.xlim(0, right)
        top = pylab.ylim()[1]
        pylab.ylim(0, top)
        pylab.grid()
Beispiel #36
0
    def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,alpha=1,output_filename=None):
        """Select simulated reads with a Metropolis-Hastings like scheme.

        :param bins: number of bins of the read-length histograms.
        :param xmin: start of the grid on which the target curve is drawn.
        :param xmax: end (excluded) of that grid.
        :param step: spacing of that grid.
        :param burn: number of accepted values dropped from the start of the
            chain before drawing its histogram.
        :param alpha: upper bound of the acceptance probability.
        :param output_filename: if given, ``self.bam_simul.filter_bool`` is
            called with this name and the accept/reject flags.

        Each read length of ``self.bam_simul`` is a candidate, accepted with
        probability ``min(alpha, target(candidate) / target(current))`` where
        ``current`` is the last accepted value (initially the mean read
        length of ``self.bam``). The flags are stored in ``self.tokeep``.
        """
        # The histogram of the input reads is computed once and stored, to be
        # used in the target_distribution method
        self.bins = bins
        self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins, density=True)

        candidates = self.bam_simul.df.read_length.values
        self.tokeep = []
        chain = []
        current = self.bam.df.read_length.mean()
        for candidate in candidates:
            ratio = self.target_distribution(candidate) / self.target_distribution(current)
            acceptance = min([alpha, ratio])
            draw = pylab.uniform(0, 1)
            if not (draw < acceptance):
                self.tokeep.append(False)
                continue
            current = candidate
            chain.append(current)
            self.tokeep.append(True)

        # theoretical curve
        grid = pylab.arange(xmin, xmax, step)
        target = self.target_distribution(grid)

        # top panel: the chain; bottom panel: its histogram and the target
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(chain)
        pylab.subplot(212)

        pylab.hist(chain[burn:], bins=bins, density=1)
        pylab.plot(grid, target, 'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF', 'Samples'))

        if output_filename is not None:
            self.bam_simul.filter_bool(output_filename, self.tokeep)