Ejemplo n.º 1
0
    def plot(self,
             color_line='r',
             bgcolor='grey',
             color='yellow',
             lw=4,
             hold=False,
             ax=None):

        xmax = self.xmax + 1
        if ax:
            pylab.sca(ax)
        pylab.fill_between([0, xmax], [0, 0], [20, 20], color='red', alpha=0.3)
        pylab.fill_between([0, xmax], [20, 20], [30, 30],
                           color='orange',
                           alpha=0.3)
        pylab.fill_between([0, xmax], [30, 30], [41, 41],
                           color='green',
                           alpha=0.3)

        if self.X is None:
            X = range(1, self.xmax + 1)

        pylab.fill_between(X,
                           self.df.mean() + self.df.std(),
                           self.df.mean() - self.df.std(),
                           color=color,
                           interpolate=False)

        pylab.plot(X, self.df.mean(), color=color_line, lw=lw)
        pylab.ylim([0, 41])
        pylab.xlim([0, self.xmax + 1])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Quality")
        pylab.grid(axis='x')
Ejemplo n.º 2
0
 def plot_contig_length_vs_GC(self, alpha=0.5):
     pylab.plot(self.df["length"], self.df['GC'], "o", alpha=alpha)
     pylab.xlabel("contig length (bp)")
     pylab.ylabel("GC (%)")
     pylab.grid(True)
     pylab.ylim([0, 100])
     pylab.xlim(0, max(self.df['length']) + 10)
Ejemplo n.º 3
0
    def hist_length_repeats(self,
                            bins=None,
                            alpha=0.5,
                            hold=False,
                            fontsize=12,
                            grid=True,
                            label="Repeat length",
                            xlabel="Repeat length",
                            ylabel="#"):
        """Plots histogram of the repeat lengths


        """
        # check that user has set a threshold
        if self._list_len_repeats is None:
            self._get_list_len_repeats()

        if bins is None:
            bins = range(max(0, self.threshold - 1),
                         max(self._list_len_repeats) + 2)

        if hold is False:
            pylab.clf()
        pylab.hist(self._list_len_repeats, alpha=alpha, label=label, bins=bins)
        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Ejemplo n.º 4
0
    def imshow_qualities(self):
        """Qualities

        ::

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.imshow_qualities()
            from pylab import tight_layout; tight_layout()

        """
        tiles = self._get_tile_info()
        d = defaultdict(list)
        for tile, seq in zip(tiles['tiles'], self.qualities):
            d[tile].append(seq)
        self.data_imqual = [pd.DataFrame(d[key]).mean().values for key in sorted(d.keys())]

        from biokit.viz import Imshow
        im = Imshow(self.data_imqual)
        im.plot(xticks_on=False, yticks_on=False, origin='lower')
        pylab.title("Quality per tile", fontsize=self.fontsize)
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("tile number")
Ejemplo n.º 5
0
    def plot_specific_alignment(self,
                                query_name,
                                motif,
                                clf=True,
                                windows=[10, 50, 100, 200, 500, 1000]):

        found = None
        bam = BAM(self.bamfile)
        for aln in bam:
            if aln.query_name == query_name:
                found = aln
        if found:
            # Detection
            seq = found.query_sequence
            if clf: pylab.clf()
            for window in windows:
                X = [seq[i:i + window].count(motif) for i in range(len(seq))]
                pylab.plot(X, label=window)
                score = sum([x > window / 6 for x in X])
                print(window, score / 3.)
            pylab.legend()
            pylab.ylabel("# {} in a given sliding window".format(motif))
            pylab.title(query_name)
        else:
            print("Not found")
Ejemplo n.º 6
0
    def plot_bar_flags(self, logy=True, fontsize=16, filename=None):
        """Plot an histogram of the flags contained in the BAM

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam', "testing"))
            b.plot_bar_flags()

        .. seealso:: :class:`SAMFlags` for meaning of each flag
        """
        df = self.get_flags_as_df()
        df = df.sum()
        pylab.clf()
        if logy is True:
            barplot = df.plot(kind='bar', logy=logy, grid=True)
        else:
            barplot = df.plot(kind='bar', grid=True)
        pylab.xlabel("flags", fontsize=fontsize)
        pylab.ylabel("count", fontsize=fontsize)
        pylab.tight_layout()
        if filename:
            pylab.savefig(filename)
        return barplot
Ejemplo n.º 7
0
    def hist_ORF_CDS_linearscale(self,
                                 alpha=0.5,
                                 bins=40,
                                 xlabel="Length",
                                 ylabel="#"):
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        n_ORF = self._ORF_pos["len_ORF"].dropna().shape[0]
        n_CDS = self._ORF_pos["len_CDS"].dropna().shape[0]

        # plot for all ORF and CDS
        pylab.hist(self._ORF_pos["len_ORF"].dropna(),
                   alpha=alpha,
                   label="ORF, N = " + str(n_ORF),
                   bins=bins)
        pylab.hist(self._ORF_pos["len_CDS"].dropna(),
                   alpha=alpha,
                   label="CDS, N = " + str(n_CDS),
                   bins=bins)
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend()
        pylab.title("Length of ORF and CDS (after filter %s > %d)" \
            %(self._type_filter, self._threshold))
Ejemplo n.º 8
0
    def hist_average_quality(self, fontsize=16, bins=None):
        """

        bins is from 0 to 94 
        """

        hq_qv = [pylab.mean([ord(X)-33 for X in read['quality'].decode()]) 
                for read in self.hq_sequence]
        lq_qv = [pylab.mean([ord(X) -33 for X in read['quality'].decode()]) 
            for read in self.lq_sequence]

        if bins is None:
            bins = range(0,94)
        Y1, X = np.histogram(hq_qv, bins=bins)
        Y2, X = np.histogram(lq_qv, bins=bins)
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        N = np.sum(Y1+Y2)
        ax.plot(X, [N] + list(N-np.cumsum(Y1+Y2)), "k")
Ejemplo n.º 9
0
    def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
        """

        mode can be all, lq, hq
        """
        pylab.clf()

        L1 = [len(read['sequence']) for read in self.lq_sequence]
        L2 = [len(read['sequence']) for read in self.hq_sequence]
        if mode == "all":
            L = L1 + L2
        elif mode == "lq":
            L = L1
        else:
            L = L2
 
        Y, X, _ = pylab.hist(L, bins=bins, rwidth=rwidth, align=align,
                    ec=edgecolor, **kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        ax_twin = pylab.gca().twinx()

        shift = (X[1] - X[0]) / 2

        ax_twin.plot(X[0:-1]- shift, len(L) - pylab.cumsum(Y), "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)

        pylab.title("Read length of Consensus isoforms reads")
Ejemplo n.º 10
0
    def plot_bar_flags(self, logy=True, fontsize=16, filename=None):
        """Plot an histogram of the flags contained in the BAM

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam', "testing"))
            b.plot_bar_flags()

        .. seealso:: :class:`SAMFlags` for meaning of each flag
        """
        df = self.get_flags_as_df()
        df = df.sum()
        pylab.clf()
        if logy is True:
            barplot = df.plot(kind='bar', logy=logy, grid=True)
        else:
            barplot = df.plot(kind='bar', grid=True)
        pylab.xlabel("flags", fontsize=fontsize)
        pylab.ylabel("count", fontsize=fontsize)
        pylab.tight_layout()
        if filename:
            pylab.savefig(filename)
        return barplot
Ejemplo n.º 11
0
    def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
        """

        mode can be all, lq, hq
        """
        pylab.clf()

        L1 = [len(read['sequence']) for read in self.lq_sequence]
        L2 = [len(read['sequence']) for read in self.hq_sequence]
        if mode == "all":
            L = L1 + L2
        elif mode == "lq":
            L = L1
        else:
            L = L2
 
        Y, X, _ = pylab.hist(L, bins=bins, rwidth=rwidth, align=align,
                    ec=edgecolor, **kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        ax_twin = pylab.gca().twinx()

        shift = (X[1] - X[0]) / 2

        ax_twin.plot(X[0:-1]- shift, len(L) - pylab.cumsum(Y), "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)

        pylab.title("Read length of Consensus isoforms reads")
Ejemplo n.º 12
0
    def hist_average_quality(self, fontsize=16, bins=None):
        """

        bins is from 0 to 94 
        """

        hq_qv = [pylab.mean([ord(X)-33 for X in read['quality'].decode()]) 
                for read in self.hq_sequence]
        lq_qv = [pylab.mean([ord(X) -33 for X in read['quality'].decode()]) 
            for read in self.lq_sequence]

        if bins is None:
            bins = range(0,94)
        Y1, X = np.histogram(hq_qv, bins=bins)
        Y2, X = np.histogram(lq_qv, bins=bins)
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        N = np.sum(Y1+Y2)
        ax.plot(X, [N] + list(N-np.cumsum(Y1+Y2)), "k")
Ejemplo n.º 13
0
    def plot_bar_mapq(self, fontsize=16, filename=None):
        """Plots bar plots of the MAPQ (quality) of alignments

            .. plot::
                :include-source:

                from sequana import BAM, sequana_data
                b = BAM(sequana_data('test.bam', "testing"))
                b.plot_bar_mapq()

        """
        df = self.get_mapq_as_df()
        df.plot(kind='hist',
                bins=range(0,
                           df.max().values[0] + 1),
                legend=False,
                grid=True,
                logy=True)
        pylab.xlabel("MAPQ", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
        try:
            # This may raise issue on MAC platforms
            pylab.tight_layout()
        except:
            pass
        if filename:
            pylab.savefig(filename)
Ejemplo n.º 14
0
 def plot_ranks(self, filename=None, savefig=False):
     # ranks
     # the *score* columns contains the scaled IDR value, min(int(log2(-125IDR), 1000).
     # e.g. peaks with an IDR of 0 have a score of 1000, idr 0.05 have a score of
     # int(-125log2(0.05)) = 540, and idr 1.0 has a score of 0.
     df1 = self.df.query('score>540')
     df2 = self.df.query('score<=540')
     pylab.clf()
     pylab.plot(df1.rep1_rank,
                df1.rep2_rank,
                'ko',
                alpha=0.5,
                label='<0.05 IDR')
     pylab.plot(df2.rep1_rank,
                df2.rep2_rank,
                'ro',
                alpha=0.5,
                label='>=0.05 IDR')
     pylab.xlabel("Peak rank - replicate 1")
     pylab.ylabel("Peak rank - replicate 2")
     N = len(self.df)
     pylab.plot([0, N], [0, N], color='blue', alpha=0.5, ls='--')
     #pylab.xlim([0,1.05])
     #pylab.ylim([0,1.05])
     pylab.legend(loc='lower right')
     if savefig:
         pylab.savefig(filename)
Ejemplo n.º 15
0
    def hist_length_repeats(self,
                            bins=20,
                            alpha=0.5,
                            hold=False,
                            fontsize=12,
                            grid=True,
                            title="Repeat length",
                            xlabel="Repeat length",
                            ylabel="#"):
        """Plots histogram of the repeat lengths


        """
        # check that user has set a threshold
        if self._list_len_repeats is None:
            self._get_list_len_repeats()

        if hold is False:
            pylab.clf()
        pylab.hist(self._list_len_repeats, alpha=alpha, bins=bins)
        pylab.title(title)
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Ejemplo n.º 16
0
    def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=[]):
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        pylab.scatter(-pylab.log10(df['Adjusted P-value']),
                      range(len(df)),
                      s=10 * df['size'],
                      c=df['size'])

        pylab.xlabel("Odd ratio")
        pylab.ylabel("Gene sets")
        pylab.yticks(range(len(df)), df.name)
        a, b = pylab.xlim()
        pylab.xlim([0, b])
        pylab.grid(True)
        ax = pylab.gca()

        M = max(df['size'])
        if M > 100:
            l1, l2, l3 = "10", "100", str(M)
        else:
            l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

        handles = [
            pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls="")
        ]
        ax.legend(handles=handles, loc="upper left", title="gene-set size")

        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.tight_layout()
        ax = pylab.colorbar(pylab.gci())
        return df
Ejemplo n.º 17
0
    def barplot_count_ORF_CDS_by_frame(self,
                                       alpha=0.5,
                                       bins=40,
                                       xlabel="Frame",
                                       ylabel="#",
                                       bar_width=0.35):
        if self._ORF_pos is None:
            self._find_ORF_CDS()
        # number of ORF and CDS found by frame
        frames = [-3, -2, -1, 1, 2, 3]
        nb_res_ORF = []
        nb_res_CDS = []
        for fr in frames:
            nb_res_ORF.append(self._ORF_pos[self._ORF_pos["frame"] == fr]
                              ["len_ORF"].dropna().shape[0])
            nb_res_CDS.append(self._ORF_pos[self._ORF_pos["frame"] == fr]
                              ["len_CDS"].dropna().shape[0])

        pylab.bar(np.array(frames) - (bar_width / 2),
                  nb_res_ORF,
                  bar_width,
                  alpha=alpha,
                  label="ORF N = %d" % sum(nb_res_ORF))
        pylab.bar(np.array(frames) + (bar_width / 2),
                  nb_res_CDS,
                  bar_width,
                  alpha=alpha,
                  label="CDS N = %d" % sum(nb_res_CDS))
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend(loc=1)
        pylab.title("Number of ORF and CDS by frame")
Ejemplo n.º 18
0
    def get_max_gc_correlation(self, reference):
        """Plot correlation between coverage and GC content by varying the GC window

         The GC content uses a moving window of size W. This parameter affects
         the correlation bewteen coverage and GC. This function find the
         *optimal* window length.

        """
        pylab.clf()
        corrs = []
        wss = []

        def func(params):
            ws = int(round(params[0]))
            if ws < 10:
                return 0
            self.bed.compute_gc_content(reference, ws)
            corr = self.get_gc_correlation()
            corrs.append(corr)
            wss.append(ws)
            return corr

        from scipy.optimize import fmin
        res = fmin(func, 100, xtol=1, disp=False)  # guess is 200
        pylab.plot(wss, corrs, "o")
        pylab.xlabel("GC window size")
        pylab.ylabel("Correlation")
        pylab.grid()
        return res[0]
Ejemplo n.º 19
0
    def hist_entropy(self, bins=50):
        """Histogram of the entropy of all found repeats

        """
        self.df.entropy.hist(bins=bins)
        pylab.xlabel("Entropy")
        pylab.ylabel("#")
Ejemplo n.º 20
0
    def plot_stacked_hist(self,
                          output_filename=None,
                          dpi=200,
                          kind="barh",
                          fontsize=10,
                          edgecolor="k",
                          lw=1,
                          width=1,
                          ytick_fontsize=10):
        df = self.get_df()
        df.T.plot(kind=kind,
                  stacked=True,
                  edgecolor=edgecolor,
                  lw=lw,
                  width=width)
        ax = pylab.gca()
        positions = pylab.yticks()
        #ax.set_yticklabel(positions, labels, fontsize=ytick_fontsize)
        pylab.xlabel("Percentage (%)", fontsize=fontsize)
        pylab.ylabel("Sample index/name", fontsize=fontsize)
        pylab.yticks(fontsize=ytick_fontsize)
        pylab.legend(title="kingdom")
        pylab.xlim([0, 100])

        if output_filename:
            pylab.savefig(output_filename, dpi=dpi)
Ejemplo n.º 21
0
    def plot_unknown_barcodes(self, N=20):
        ub = self.data['UnknownBarcodes']
        df = pd.DataFrame({x['Lane']: x['Barcodes'] for x in ub})
        if "unknown" in df.index and len(df) == 1:
            df.loc['known'] = [0 for i in df.columns]

        # if data is made of undetermined only, the dataframe is just made of
        # N lanes with one entry : unknown
        S = df.sum(axis=1).sort_values(ascending=False).index[0:N]
        data = df.loc[S][::-1]
        #print(data)

        data.columns = ["Lane {}".format(x) for x in data.columns]
        from matplotlib import rcParams
        rcParams['axes.axisbelow'] = True
        pylab.figure(figsize=(10, 8))
        ax = pylab.gca()
        data.plot(kind="barh", width=1, ec="k", ax=ax)
        rcParams['axes.axisbelow'] = False
        pylab.xlabel("Number of reads", fontsize=12)
        pylab.ylabel("")
        pylab.grid(True)
        pylab.legend(
            ["Lane {}".format(x) for x in range(1,
                                                len(df.columns) + 1)],
            loc="lower right")
        try:
            pylab.tight_layout()
        except Exception as err:
            print(err)
        return data
Ejemplo n.º 22
0
    def plot_specific_alignment(self, bamfile, query_name, motif,clf=True,
            show_figure=True, authorized_flags=[0,16],
            windows=[10, 50, 100, 150,200, 250,500, 1000], local_threshold=5):

        found = None
        bam = BAM(bamfile)
        for aln in bam:
            if aln.query_name == query_name and aln.flag in authorized_flags:
                found = aln
                break  # we may have several entries. let us pick up the first 
            

        sizes = []
        if found:
            # Detection
            seq = found.query_sequence
            if clf:pylab.clf()
            for window in windows:
                X = [seq[i:i+window].count(motif) for i in range(len(seq))]
                if show_figure:
                    pylab.plot(X, label=window)
                score = sum([x>local_threshold for x in X])
                sizes.append(score-window)
            if show_figure:
                pylab.legend()
                pylab.ylabel("# {} in a given sliding window".format(motif))
                pylab.title(query_name)
        else:
            print("{} Not found in {} file".format(query_name, bamfile))
        
        return sizes
Ejemplo n.º 23
0
    def histogram_sequence_lengths(self, logy=True):
        """Histogram sequence lengths

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.histogram_sequence_lengths()

        """
        data = [len(x) for x in self.sequences]
        bary, barx = np.histogram(data, bins=range(max(data)+1))

        # get rid of zeros to avoid warnings
        bx = [x for x,y in zip(barx, bary) if y!=0]
        by = [y for x,y in zip(barx, bary) if y!=0]
        if logy:
            pylab.bar(bx, pylab.log10(by))
        else:
            pylab.bar(bx, by)

        pylab.xlim([1,max(data)+1])

        pylab.grid(True)
        pylab.xlabel("position (bp)", fontsize=self.fontsize)
        pylab.ylabel("Count (log scale)", fontsize=self.fontsize)
Ejemplo n.º 24
0
    def plot_count_per_sample(self, fontsize=12, sample_list=None):
        """"Number of mapped reads per sample. Each color for each replicate

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_count_per_sample()
        """
        sample_names = self.sample_names
        N = len(sample_names)
        dd = self.df[sample_names].sum()
        pylab.clf()

        colors = []
        for sample in self.sample_names:
            colors.append(self.colors[self.get_cond_from_sample(sample)])

        pylab.bar(range(N), (dd/1000000).values, 
            color=colors, alpha=1, 
            zorder=10, lw=1, ec="k", width=0.9)
        pylab.xlabel("Samples", fontsize=fontsize)
        pylab.ylabel("Total read count (millions)", fontsize=fontsize)
        pylab.grid(True, zorder=0)
        pylab.title("Total read count per sample", fontsize=fontsize)
        pylab.xticks(range(N), self.sample_names)
Ejemplo n.º 25
0
 def hist_contig_length(self, bins=30, fontsize=16):
     pylab.clf()
     pylab.hist(self.df.length, lw=1, ec="k", bins=bins)
     pylab.grid()
     pylab.xlabel("Contig length", fontsize=fontsize)
     pylab.ylabel("#", fontsize=fontsize)
     pylab.title("Distribution {} contigs".format(len(self.df)))
Ejemplo n.º 26
0
    def plot_volcano(self):
        """
        .. plot::
            :include-source:
    
            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_volcano()

        """
        d1 = self.df.query("padj>0.05")
        d2 = self.df.query("padj<=0.05")

        fig = pylab.figure()
        pylab.plot(d1.log2FoldChange, -np.log10(d1.padj), marker="o",
            alpha=0.5, color="r", lw=0)
        pylab.plot(d2.log2FoldChange, -np.log10(d2.padj), marker="o",
            alpha=0.5, color="k", lw=0)

        pylab.grid(True)
        pylab.xlabel("fold change")
        pylab.ylabel("log10 adjusted p-value")
        m1 = abs(min(self.df.log2FoldChange))
        m2 = max(self.df.log2FoldChange)
        limit = max(m1,m2)
        pylab.xlim([-limit, limit])
        y1,y2 = pylab.ylim()
        pylab.ylim([0,y2])

        pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r", label="pvalue threshold (0.05)")
Ejemplo n.º 27
0
    def plot_polymerase_per_barcode(self, fontsize=12, unbarcoded=True):
        """Number Of Polymerase Reads Per Barcode"""
        PR = self.df_barcoded["Polymerase Reads"].sum()
        data = self.df_barcoded['Polymerase Reads'].sort_values(
            ascending=False).values
        pylab.plot([int(x) for x in range(1,
                                          len(data) + 1)],
                   data,
                   label="barcodes")
        pylab.axhline(data.mean(), color="r", label="average")

        try:
            if unbarcoded is True:
                unbar = self.df_not_barcoded['Polymerase Reads'].iloc[0]
                pylab.axhline(unbar, color="k", ls="--", label="not barcoded")
        except:
            pass

        pylab.xlabel("Barcode Rank Order", fontsize=fontsize)
        pylab.ylabel("Counts of Reads", fontsize=fontsize)
        pylab.title("Total Polymerase count: {}".format(PR))
        pylab.legend()
        pylab.ylim(ymin=0)
        try:
            pylab.tight_layout()
        except:
            pass
Ejemplo n.º 28
0
    def plot_idr_vs_peaks(self, filename=None, savefig=False):

        # global_idr is actually -log10(idr)
        pylab.clf()
        X1 = pylab.linspace(0, self.threshold, 100)
        X2 = pylab.linspace(self.threshold, 1, 100)
        # convert global idr to proba

        df1 = self.df.query("idr<@self.threshold")
        df2 = self.df.query("idr>[email protected]")

        pylab.plot([sum(df1['idr'] < x) for x in X1], X1, '-', color='r', lw=2)
        shift = len(df1)

        pylab.plot([shift + sum(df2['idr'] < x) for x in X2],
                   X2,
                   "-",
                   color='k',
                   lw=2)
        pylab.xlabel('Number of significant peaks')
        pylab.ylabel('IDR')
        pylab.axhline(0.05, color='b', ls='--')
        pylab.axvline(self.N_significant_peaks, color='b', ls='--')
        if savefig:
            pylab.savefig(filename)
Ejemplo n.º 29
0
    def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="SNR", ylabel="#",title="", clip_upper_SNR=30):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram. Note that the range starts
            at 0 and ends at clip_upper_SNR
        :param float alpha: transparency of the histograms
        :param bool hold:
        :param int fontsize:
        :param bool grid:
        :param str xlabel:
        :param str ylabel:
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored
        if len(self._df['snr_A'].dropna()) == 0:
            # nothing to plot
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        maxSNR = 0
        for letter in "ACGT":
            m = self._df.loc[:,"snr_{}".format(letter)].max()
            if m > maxSNR:
                maxSNR = m

        if maxSNR > clip_upper_SNR:
            maxSNR = clip_upper_SNR

        bins = pylab.linspace(0, maxSNR, bins)

        pylab.hist(self._df.loc[:,'snr_A'].clip_upper(maxSNR), alpha=alpha, label="A", bins=bins)
        pylab.hist(self._df.loc[:,'snr_C'].clip_upper(maxSNR), alpha=alpha, label="C", bins=bins)
        pylab.hist(self._df.loc[:,'snr_G'].clip_upper(maxSNR), alpha=alpha, label="G", bins=bins)
        pylab.hist(self._df.loc[:,'snr_T'].clip_upper(maxSNR), alpha=alpha, label="T", bins=bins)
        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title,fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Ejemplo n.º 30
0
 def hist_plot_contig_length(self, bins=40, fontsize=16):
     """Plot distribution of contig lengths"""
     L = len(self.fasta.sequences)
     pylab.hist(self.fasta.lengths, lw=1, ec="k", bins=bins)
     pylab.grid()
     pylab.xlabel("Contig length", fontsize=fontsize)
     pylab.ylabel("#", fontsize=fontsize)
     pylab.title("Distribution {} contigs".format(L))
Ejemplo n.º 31
0
 def plot_subreads_histogram(self, bins=10, fontsize=12):
     self.df_barcoded['Subreads'].hist(bins=bins, ec="k", rwidth=0.8)
     pylab.xlabel("Number of subreads", fontsize=fontsize)
     pylab.ylabel("Number of Barcoded Samples", fontsize=fontsize)
     try:
         pylab.tight_layout()
     except:
         pass
Ejemplo n.º 32
0
 def plot_pvalue_hist(self, bins=60, fontsize=16, rotation=0):
     pylab.hist(self.df.pvalue.dropna(), bins=bins, ec="k")
     pylab.grid(True)
     pylab.xlabel("raw p-value", fontsize=fontsize)
     pylab.ylabel("Occurences", fontsize=fontsize)
     try:
         pylab.tight_layout()
     except:
         pass
Ejemplo n.º 33
0
 def plot_padj_hist(self, bins=60, fontsize=16):
     pylab.hist(self.df.padj.dropna(), bins=bins, ec="k")
     pylab.grid(True)
     pylab.xlabel("Adjusted p-value", fontsize=fontsize)
     pylab.ylabel("Occurences", fontsize=fontsize)
     try:
         pylab.tight_layout()
     except:
         pass
Ejemplo n.º 34
0
 def scatter_length_cov_gc(self, min_length=200, min_cov=10):
     pylab.clf()
     pylab.scatter(self.df.length, self.df['cov'], c=self.df.GC)
     pylab.loglog()
     pylab.axvline(min_length, lw=2, c="r", ls='--')
     pylab.axhline(min_cov, lw=2, c="r", ls='--')
     pylab.xlabel("contig length")
     pylab.ylabel("contig coverage")
     pylab.colorbar(label="GC")
     pylab.grid(True)
Ejemplo n.º 35
0
 def plot(self, fontsize=16):
     """plot quality versus base position"""
     pylab.plot(self.quality, label="offset: %s" % self.offset)
     pylab.xlabel('base position', fontsize=fontsize)
     pylab.ylabel('Quality per base', fontsize=fontsize)
     pylab.grid(True)
     # ylim set autoscale to off so if we want to call this function  several
     # times, we must reset autoscale to on before calling ylim
     pylab.autoscale()
     limits = pylab.ylim()
     pylab.ylim(max(0,limits[0]-1), limits[1]+1)
Ejemplo n.º 36
0
 def boxplot_mapq_concordance(self):
     # method can only be bwa for now
     assert self.method == "bwa"
     data = self._get_data()
     df = pd.DataFrame(data, columns=["mapq", "length", "concordance"])
     pylab.clf()
     pylab.boxplot([df[df.mapq == i]['concordance'] for i in range(1,61)])
     pylab.xlabel("mapq")
     pylab.ylabel("concordance")
     pylab.grid()
     tt = [10,20,30,40,50,60]
     pylab.xticks(tt, tt)
Ejemplo n.º 37
0
    def plot_indel_dist(self, fontsize=16):
        """Plot indel count (+ ratio)

        :Return: list of insertions, deletions and ratio insertion/deletion for
            different length starting at 1

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("measles.fa.sorted.bam"))
            b.plot_indel_dist()

        What you see on this figure is the presence of 10 insertions of length
        1, 1 insertion of length 2 and 3 deletions of length 1


        # Note that in samtools, several insertions or deletions in a single
        alignment are ignored and only the first one seems to be reported. For
        instance 10M1I10M1I stored only 1 insertion in its report; Same comment
        for deletions.

        .. todo:: speed up and handle long reads cases more effitiently by 
            storing INDELS as histograms rather than lists
        """
        try:
            self.insertions
        except:
            self._set_indels()

        if len(self.insertions) ==0 or len(self.deletions) == 0:
            raise ValueError("No deletions or insertions found")

        N = max(max(Counter(self.deletions)), max(Counter(self.insertions))) + 1
        D = [self.deletions.count(i) for i in range(N)]
        I = [self.insertions.count(i) for i in range(N)]
        R = [i/d if d!=0 else 0 for i,d in zip(I, D)]
        fig, ax = pylab.subplots()
        ax.plot(range(N), I, marker="x", label="Insertions")
        ax.plot(range(N), D, marker="x", label="Deletions")
        ax.plot(range(N), R, "--r", label="Ratio insertions/deletions")
        ax.set_yscale("symlog")
        pylab.ylim([1, pylab.ylim()[1]])
        pylab.legend()
        pylab.grid()
        from matplotlib.ticker import MaxNLocator
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        pylab.xlabel("Indel length", fontsize=fontsize)
        pylab.ylabel("Indel count", fontsize=fontsize)
        return I, D, R
Ejemplo n.º 38
0
    def hist_ORF_CDS_linearscale(self, alpha=0.5, bins=40, xlabel="Length", ylabel="#"):
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        n_ORF = self._ORF_pos["len_ORF"].dropna().shape[0]
        n_CDS = self._ORF_pos["len_CDS"].dropna().shape[0]

        # plot for all ORF and CDS
        pylab.hist(self._ORF_pos["len_ORF"].dropna(),alpha=alpha, label="ORF, N = " + str(n_ORF),bins=bins)
        pylab.hist(self._ORF_pos["len_CDS"].dropna(),alpha=alpha, label="CDS, N = " + str(n_CDS),bins=bins)
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend()
        pylab.title("Length of ORF and CDS (after filter %s > %d)" \
            %(self._type_filter, self._threshold))
Ejemplo n.º 39
0
    def hist_coverage(self, bins=100):
        """

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("measles.fa.sorted.bam"))
            b.hist_coverage()
        """
        try: self.coverage
        except: self._set_coverage()
        pylab.hist(self.coverage, bins=bins)
        pylab.xlabel("Coverage")
        pylab.ylabel("Number of mapped bases")
        pylab.grid()
Ejemplo n.º 40
0
    def hist_length_repeats(self, bins=20, alpha=0.5, hold=False,
            fontsize=12, grid=True, title="Repeat length",
            xlabel="Repeat length", ylabel="#", logy=True):
        """Plots histogram of the repeat lengths

        """
        # check that user has set a threshold
        if hold is False:
            pylab.clf()
        pylab.hist(self.list_len_repeats, alpha=alpha, bins=bins)
        pylab.title(title)
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
        if logy:
            pylab.semilogy()
Ejemplo n.º 41
0
    def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40,
        xlabel="Frame", ylabel="#", bar_width=0.35):
        if self._ORF_pos is None:
                self._find_ORF_CDS()
        # number of ORF and CDS found by frame
        frames = [-3, -2, -1, 1, 2, 3]
        nb_res_ORF = []
        nb_res_CDS = []
        for fr in frames:
            nb_res_ORF.append(self._ORF_pos[self._ORF_pos["frame"] == fr]["len_ORF"].dropna().shape[0])
            nb_res_CDS.append(self._ORF_pos[self._ORF_pos["frame"] == fr]["len_CDS"].dropna().shape[0])

        pylab.bar(np.array(frames)-(bar_width/2), nb_res_ORF, bar_width, alpha=alpha, label="ORF N = %d" %sum(nb_res_ORF))
        pylab.bar(np.array(frames)+(bar_width/2), nb_res_CDS, bar_width, alpha=alpha, label="CDS N = %d" %sum(nb_res_CDS))
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend(loc=1)
        pylab.title("Number of ORF and CDS by frame")
Ejemplo n.º 42
0
    def plot_bar_mapq(self, fontsize=16, filename=None, ):
        """Plots bar plots of the MAPQ (quality) of alignments

            .. plot::
                :include-source:

                from sequana import BAM, sequana_data
                b = BAM(sequana_data('test.bam', "testing"))
                b.plot_bar_mapq()

        """
        df = self.get_mapq_as_df()
        df.plot(kind='hist', bins=range(0,df.max().values[0]+1), legend=False,
            grid=True, logy=True)
        pylab.xlabel("MAPQ", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
        pylab.tight_layout()
        if filename:
            pylab.savefig(filename)
Ejemplo n.º 43
0
    def pie_plot(self, filename=None, hold=False):
        """Plot PIE plot of the status (complete / fragment / missed)

        .. plot::
            :include-source:

            from sequana import BUSCO, sequana_data
            b = BUSCO(sequana_data("test_busco_full_table.tsv"))
            b.pie_plot()

        """
        if hold is False:
            pylab.clf()
        self.df.groupby('Status').count()['# Busco id'].plot(kind="pie")
        pylab.ylabel("")
        #pylab.title("Distribution Complete/Fragmented/Missing")
        #pylab.legend()
        if filename:
            pylab.savefig(filename)
Ejemplo n.º 44
0
    def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="GC %", ylabel="#", label="",title=None):
        """Plot histogram GC content

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold:
        :param int fontsize: fontsize of the x and y labels and title.
        :param bool grid: add grid or not
        :param str xlabel:
        :param str ylabel:
        :param str label: label of the histogram (for the legend)
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_GC()

        """
        mean_GC =  np.mean(self.df.loc[:,'GC_content'])

        # set title if needed
        if title is None:
            title = "GC %%  \n Mean GC : %.2f" %(mean_GC)

        # histogram GC percent
        if hold is False:
            pylab.clf()
        pylab.hist(self.df.loc[:,'GC_content'], bins=bins,
            alpha=alpha, label=label + ", mean : " + str(round(mean_GC, 2))
            + ", N : " + str(len(self)))
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
        pylab.xlim([0, 100])
        try: pylab.tight_layout()
        except:pass
Ejemplo n.º 45
0
    def plot_acgt_content(self, stacked=False):
        """Plot histogram of GC content

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.plot_acgt_content()
        """
        df = self.get_actg_content()
        if stacked is True:
            df.plot.bar(stacked=True)
        else:
            df.plot()
            pylab.grid(True)
        pylab.xlabel("position (bp)", fontsize=self.fontsize)
        pylab.ylabel("percent", fontsize=self.fontsize)
Ejemplo n.º 46
0
    def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
                          grid=True, xlabel="Number of ZMW passes", logy=True,
                          ylabel="#", label="", title="Number of ZMW passes"):
        """Plot histogram of number of reads per ZMW (number of passes)

        :param float alpha: transparency of the histograms
        :param bool hold:
        :param int fontsize:
        :param bool grid:
        :param str xlabel:
        :param str ylabel:
        :param bool logy: use log scale on the y axis (default to True)
        :param str label: label of the histogram (for the legend)
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_nb_passes()
        """
        max_nb_pass = self.df.nb_passes.max()
        if bins is None:
            k = range(1, max_nb_pass+1)

        # histogram nb passes
        if hold is False:
            pylab.clf()
        pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha,
                   label=label, log=logy, width=1)
        if len(k) < 5:
            pylab.xticks(range(6), range(6))

        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Ejemplo n.º 47
0
    def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,alpha=1,output_filename=None):
        # compute histogram of the input reads once for all to be used
        # in the target_distribution method
        self.bins = bins
        self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins, density=True)

        lengths = self.bam_simul.df.read_length.values
        self.tokeep = []
        vec = []
        x = self.bam.df.read_length.mean()
        for i in range(self.bam_simul.df.shape[0]):
            can = lengths[i]
            aprob = min([alpha,self.target_distribution(can)/self.target_distribution(x)])
            #acceptance probability
            u = pylab.uniform(0,1)
            if u < aprob:
                x = can
                vec.append(x)
                self.tokeep.append(True)
            else:
                self.tokeep.append(False)

        #plotting the results:
        #theoretical curve
        x = pylab.arange(xmin, xmax, step)
        y = self.target_distribution(x)
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(vec)
        pylab.subplot(212)

        pylab.hist(vec[burn:], bins=bins, density=1)
        pylab.plot(x,y,'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF','Samples'))

        if output_filename is not None:
            self.bam_simul.filter_bool(output_filename, self.tokeep)
Ejemplo n.º 48
0
    def plot(self, bins=80, rwidth=0.8, **kwargs):
        pylab.clf()
        Y, X, _ = pylab.hist(self.data, bins=bins, rwidth=rwidth, **kwargs)

        pylab.xlabel(self.xlabel, fontsize=self.fontsize)
        pylab.ylabel(self.ylabel, fontsize=self.fontsize)

        """self.Y = Y
        self.X = X

        ax_twin = pylab.gca().twinx()

        shift = (X[1] - X[0]) / 2

        ax_twin.plot(X[0:-1]- shift, len(self.data) - pylab.cumsum(Y), "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=self.fontsize)
        """
        pylab.grid(self.grid)
        pylab.title(self.title)
        try: pylab.tight_layout()
        except:pass
Ejemplo n.º 49
0
    def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
                grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
        """Plot GC content versus read length

        :param bool hold:
        :param int fontsize: for x and y labels and title
        :param bins: a integer or tuple of 2 integers to specify
            the binning of the x and y 2D histogram.
        :param bool grid:
        :param str xlabel:
        :param str ylabel:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.plot_GC_read_len(bins=[10, 10])

        """
        mean_len =  np.mean(self.df.loc[:,'read_length'])
        mean_GC =  np.mean(self.df.loc[:,'GC_content'])

        if hold is False:
            pylab.clf()

        data = self.df.loc[:,['read_length','GC_content']].dropna()
        h = biokit.viz.hist2d.Hist2D(data)
        res = h.plot(bins=bins, contour=False, norm='log', Nlevels=6, cmap=cmap)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("GC %", fontsize=fontsize)
        pylab.title("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" %
            (mean_len, mean_GC), fontsize=fontsize)
        pylab.ylim([0, 100])
        if grid is True:
            pylab.grid(True)
Ejemplo n.º 50
0
    def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
                textcolor="red", **kargs):
        """A simple non-interactive plot of taxons

        :return: None if no taxon were found and a dataframe otherwise

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.
        """
        if len(self._df) == 0:
            return

        if self._data_created == False:
            status = self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return
        # This may have already been called but maybe not. This is not time
        # consuming, so we call it again here

        if len(self.taxons.index) == 0:
            return None

        df = self.get_taxonomy_biokit(list(self.taxons.index))
        df.ix[-1] = ["Unclassified"] * 8
        data = self.taxons.copy()
        data.ix[-1] = self.unclassified

        data = data/data.sum()*100
        assert threshold > 0 and threshold < 100
        others = data[data<threshold].sum()
        data = data[data>threshold]
        names = df.ix[data.index]['name']

        data.index = names.values
        data.ix['others'] = others
        try:
            data.sort_values(inplace=True)
        except:
            data.sort(inplace=True)

        # text may be long so, let us increase the figsize a little bit
        pylab.figure(figsize=(10,8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                radius=radius, **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                #  large, x-small, small, None, x-large, medium, xx-small,
                #  smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
            self.ax = ax
        elif kind == "barh":
            ax = data.plot(kind=kind,  **kargs)
            pylab.xlabel(" percentage ")

        return data
Ejemplo n.º 51
0
 def hist_passes(self, maxp=50, fontsize=16):
     passes = self.df.nb_passes.copy()
     passes.clip_upper(maxp).hist(bins=maxp)
     pylab.xlim([0, maxp])
     pylab.ylabel("# count", fontsize=fontsize)
     pylab.xlabel("Passes (max {})".format(maxp), fontsize=fontsize)