Esempio n. 1
0
    def imshow_qualities(self):
        """Show the mean quality per tile as an image.

        The qualities of the reads are grouped by tile and averaged
        column-wise for each tile. The resulting list (one array per tile,
        in sorted tile order) is stored in ``self.data_imqual`` and drawn
        with :class:`biokit.viz.Imshow`.

        ::

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.imshow_qualities()
            from pylab import tight_layout; tight_layout()

        """
        tile_info = self._get_tile_info()

        # gather the qualities of the reads tile by tile
        per_tile = defaultdict(list)
        for tile, quality in zip(tile_info['tiles'], self.qualities):
            per_tile[tile].append(quality)

        # one averaged profile per tile, tiles being sorted by their key
        self.data_imqual = [
            pd.DataFrame(per_tile[tile]).mean().values
            for tile in sorted(per_tile.keys())
        ]

        from biokit.viz import Imshow
        image = Imshow(self.data_imqual)
        image.plot(xticks_on=False, yticks_on=False, origin='lower')
        pylab.title("Quality per tile", fontsize=self.fontsize)
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("tile number")
Esempio n. 2
0
    def check(self, bins=60):
        """Compare the histogram of ``self.vec`` with the target curve.

        :param int bins: binning of the (density) histogram
        """
        heights, _edges, _patches = pylab.hist(
            self.vec, bins, density=True, lw=0.5, ec="k")
        M1 = max(heights)

        # approximation/hack: the target curve is rescaled so that its
        # maximum coincides with the highest bar of the histogram
        scaling = max(self.Ytarget) / M1
        pylab.plot(self.Xtarget, self.Ytarget / scaling, "-ro")
        pylab.title("simulated (blue) and target (red) distributions")
Esempio n. 3
0
 def hist_contig_length(self, bins=30, fontsize=16):
     """Plot the histogram of the ``length`` column of ``self.df``.

     :param bins: binning of the histogram
     :param int fontsize: font size of the x and y labels
     """
     pylab.clf()
     contig_lengths = self.df.length
     pylab.hist(contig_lengths, lw=1, ec="k", bins=bins)
     pylab.grid()
     pylab.xlabel("Contig length", fontsize=fontsize)
     pylab.ylabel("#", fontsize=fontsize)
     # the title reports the number of rows found in the dataframe
     n_contigs = len(self.df)
     pylab.title("Distribution {} contigs".format(n_contigs))
Esempio n. 4
0
    def imshow_qualities(self):
        """Show the mean quality per tile as an image.

        The qualities of the reads are grouped by tile and averaged
        column-wise for each tile. The resulting list (one array per tile,
        in sorted tile order) is stored in ``self.data_imqual`` and drawn
        with :class:`sequana.viz.Imshow`.

        ::

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.imshow_qualities()
            from pylab import tight_layout; tight_layout()

        """
        tile_info = self._get_tile_info()

        # gather the qualities of the reads tile by tile
        grouped = defaultdict(list)
        for tile, quality in zip(tile_info['tiles'], self.qualities):
            grouped[tile].append(quality)

        # one averaged profile per tile, tiles being sorted by their key
        averaged = []
        for tile in sorted(grouped.keys()):
            averaged.append(pd.DataFrame(grouped[tile]).mean().values)
        self.data_imqual = averaged

        from sequana.viz import Imshow

        image = Imshow(self.data_imqual)
        image.plot(xticks_on=False, yticks_on=False, origin='lower')
        pylab.title("Quality per tile", fontsize=self.fontsize)
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("tile number")
Esempio n. 5
0
    def plot_specific_alignment(self, bamfile, query_name, motif,clf=True,
            show_figure=True, authorized_flags=[0,16],
            windows=[10, 50, 100, 150,200, 250,500, 1000], local_threshold=5):
        """Count a motif along one read using several sliding windows.

        The first alignment of *bamfile* named *query_name* whose flag is in
        *authorized_flags* is selected. For each window size, the number of
        occurrences of *motif* is counted in the window starting at each
        position of the read sequence.

        :param bamfile: file given to ``BAM``
        :param str query_name: name of the read to look for
        :param str motif: motif counted in each window
        :param bool clf: clear the current figure once the read is found
        :param bool show_figure: plot one curve per window size
        :param authorized_flags: flags accepted for the alignment
        :param windows: sizes of the sliding windows
        :param local_threshold: a position is counted when its number of
            motifs is strictly greater than this value
        :return: one value per window (number of counted positions minus the
            window size); an empty list if the read is not found
        """
        # the read may have several entries: keep the first valid one
        found = None
        for alignment in BAM(bamfile):
            if alignment.query_name == query_name and alignment.flag in authorized_flags:
                found = alignment
                break

        if not found:
            print("{} Not found in {} file".format(query_name, bamfile))
            return []

        sequence = found.query_sequence
        if clf:
            pylab.clf()

        sizes = []
        for window in windows:
            counts = [sequence[start:start + window].count(motif)
                      for start in range(len(sequence))]
            if show_figure:
                pylab.plot(counts, label=window)
            n_above = sum([count > local_threshold for count in counts])
            sizes.append(n_above - window)

        if show_figure:
            pylab.legend()
            pylab.ylabel("# {} in a given sliding window".format(motif))
            pylab.title(query_name)

        return sizes
Esempio n. 6
0
    def plot_alignment(self, bamfile, motif, window=200,
            global_th=10,title=None,legend=True, legend_fontsize=11,
            valid_rnames=[],
            valid_flags=[]):
        """Plot the motif count along each alignment of a BAM file.

        For each alignment kept after filtering, the number of occurrences
        of *motif* in the window starting at each position of the read is
        plotted against the reference coordinates.

        :param bamfile: file given to ``BAM``
        :param str motif: motif counted in each window
        :param int window: size of the sliding window
        :param global_th: not used in this method
        :param str title: title of the figure (none by default)
        :param bool legend: add a legend (only if fewer than 15 entries)
        :param int legend_fontsize: font size of the legend
        :param valid_rnames: if not empty, keep only alignments whose
            ``rname`` is in this list
        :param valid_flags: if not empty, keep only alignments whose
            ``flag`` is in this list
        """
        bam = BAM(bamfile)
        print("Found {} hits".format(len(bam)))
        pylab.clf()

        count = 0
        for aln in bam:
            # empty filters accept everything
            if (valid_rnames and aln.rname not in valid_rnames) or \
                    (valid_flags and aln.flag not in valid_flags):
                continue

            sequence = aln.query_sequence
            if not sequence:
                continue

            count += 1
            motif_counts = [sequence[start:start + window].count(motif)
                            for start in range(len(sequence))]
            start = aln.reference_start
            positions = range(start, start + len(sequence))
            pylab.plot(positions, motif_counts, label=aln.query_name)

        print("Showing {} entries after filtering".format(count))

        # upper limit of the y axis: 1.2 times the window size divided by
        # the motif length
        max_theo = int(1.2*window / len(motif))
        pylab.ylim([0, max_theo])
        if legend and count<15:
            pylab.legend(fontsize=legend_fontsize)
        if title:
            pylab.title(title, fontsize=16)
Esempio n. 7
0
    def barplot_count_ORF_CDS_by_frame(self,
                                       alpha=0.5,
                                       bins=40,
                                       xlabel="Frame",
                                       ylabel="#",
                                       bar_width=0.35):
        """Bar plot of the number of ORF and CDS found in each frame.

        :param float alpha: transparency of the bars
        :param bins: not used in this method
        :param str xlabel:
        :param str ylabel:
        :param float bar_width: width of each bar; ORF and CDS bars are
            shifted by half this width on each side of the frame value
        """
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        frames = [-3, -2, -1, 1, 2, 3]

        # number of non-missing ORF / CDS lengths found in each frame
        nb_res_ORF = []
        nb_res_CDS = []
        for frame in frames:
            in_frame = self._ORF_pos["frame"] == frame
            nb_res_ORF.append(
                self._ORF_pos[in_frame]["len_ORF"].dropna().shape[0])
            in_frame = self._ORF_pos["frame"] == frame
            nb_res_CDS.append(
                self._ORF_pos[in_frame]["len_CDS"].dropna().shape[0])

        half_width = bar_width / 2
        series = (
            (np.array(frames) - half_width, nb_res_ORF, "ORF N = %d"),
            (np.array(frames) + half_width, nb_res_CDS, "CDS N = %d"),
        )
        for positions, heights, template in series:
            pylab.bar(positions,
                      heights,
                      bar_width,
                      alpha=alpha,
                      label=template % sum(heights))

        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend(loc=1)
        pylab.title("Number of ORF and CDS by frame")
Esempio n. 8
0
    def plot(self,
             color_line='r',
             bgcolor='grey',
             color='yellow',
             lw=4,
             hold=False,
             ax=None):
        """Plot the mean quality per position with a one-std band.

        Three coloured background bands are drawn first (red 0-20, orange
        20-30, green 30-41), then the band mean +/- standard deviation of
        ``self.df`` and finally the mean itself.

        :param color_line: colour of the mean curve
        :param bgcolor: not used in this method
        :param color: colour of the mean +/- std band
        :param lw: line width of the mean curve
        :param hold: not used in this method
        :param ax: if provided, axes made current before plotting
        """
        xmax = self.xmax + 1
        if ax:
            pylab.sca(ax)
        pylab.fill_between([0, xmax], [0, 0], [20, 20], color='red', alpha=0.3)
        pylab.fill_between([0, xmax], [20, 20], [30, 30],
                           color='orange',
                           alpha=0.3)
        pylab.fill_between([0, xmax], [30, 30], [41, 41],
                           color='green',
                           alpha=0.3)

        # X used to be bound only when self.X was None, which raised an
        # UnboundLocalError for any user-provided self.X
        if self.X is None:
            X = range(1, self.xmax + 1)
        else:
            X = self.X

        # computed once and reused for the band and the curve
        mean = self.df.mean()
        std = self.df.std()

        pylab.fill_between(X,
                           mean + std,
                           mean - std,
                           color=color,
                           interpolate=False)

        pylab.plot(X, mean, color=color_line, lw=lw)
        pylab.ylim([0, 41])
        pylab.xlim([0, self.xmax + 1])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Quality")
        pylab.grid(axis='x')
Esempio n. 9
0
    def hist_ORF_CDS_linearscale(self,
                                 alpha=0.5,
                                 bins=40,
                                 xlabel="Length",
                                 ylabel="#"):
        """Overlay the histograms of the ORF and CDS lengths.

        :param float alpha: transparency of the histograms
        :param bins: binning of both histograms
        :param str xlabel:
        :param str ylabel:
        """
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        orf_lengths = self._ORF_pos["len_ORF"].dropna()
        cds_lengths = self._ORF_pos["len_CDS"].dropna()

        # ORF first, then CDS; the legend reports the number of values
        for lengths, prefix in ((orf_lengths, "ORF, N = "),
                                (cds_lengths, "CDS, N = ")):
            pylab.hist(lengths,
                       alpha=alpha,
                       label=prefix + str(lengths.shape[0]),
                       bins=bins)

        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend()
        filter_info = (self._type_filter, self._threshold)
        pylab.title("Length of ORF and CDS (after filter %s > %d)" % filter_info)
Esempio n. 10
0
    def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
        """Histogram of the read lengths with a cumulative curve on top.

        :param str mode: "all" uses low and high quality sequences, "lq"
            only ``self.lq_sequence``; any other value uses
            ``self.hq_sequence``
        :param bins: binning of the histogram
        :param float rwidth: relative width of the bars
        :param str align: alignment of the bars
        :param int fontsize: font size of the axis labels
        :param edgecolor: edge colour of the bars
        :param kwargs: any other argument is given to ``pylab.hist``
        """
        pylab.clf()

        lq_lengths = [len(read['sequence']) for read in self.lq_sequence]
        hq_lengths = [len(read['sequence']) for read in self.hq_sequence]
        if mode == "all":
            lengths = lq_lengths + hq_lengths
        elif mode == "lq":
            lengths = lq_lengths
        else:
            lengths = hq_lengths

        counts, edges, _ = pylab.hist(lengths, bins=bins, rwidth=rwidth,
                                      align=align, ec=edgecolor, **kwargs)
        axes = pylab.gca()
        axes.set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        ax_twin = pylab.gca().twinx()

        # half a bin width
        shift = (edges[1] - edges[0]) / 2

        # number of reads not yet accumulated after each bin
        remaining = len(lengths) - pylab.cumsum(counts)
        ax_twin.plot(edges[:-1] - shift, remaining, "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)

        pylab.title("Read length of Consensus isoforms reads")
Esempio n. 11
0
    def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="SNR", ylabel="#",title="", clip_upper_SNR=30):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram. Note that the range starts
            at 0 and ends at clip_upper_SNR
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the labels and title
        :param bool grid: add a grid
        :param str xlabel:
        :param str ylabel:
        :param str title:
        :param clip_upper_SNR: SNR values above this limit are clipped to it
            (the limit is the maximum observed SNR when that one is smaller)

        If the ``snr_A`` column holds no value, an image stating that no
        data is available is shown instead.

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored
        if len(self._df['snr_A'].dropna()) == 0:
            # nothing to plot
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        # highest SNR over the four letters, bounded by clip_upper_SNR
        maxSNR = 0
        for letter in "ACGT":
            m = self._df.loc[:, "snr_{}".format(letter)].max()
            if m > maxSNR:
                maxSNR = m

        if maxSNR > clip_upper_SNR:
            maxSNR = clip_upper_SNR

        bins = pylab.linspace(0, maxSNR, bins)

        # Series.clip_upper was removed from pandas (1.0);
        # Series.clip(upper=...) is the supported equivalent
        for letter in "ACGT":
            clipped = self._df.loc[:, "snr_{}".format(letter)].clip(upper=maxSNR)
            pylab.hist(clipped, alpha=alpha, label=letter, bins=bins)
        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title,fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Esempio n. 12
0
    def check(self, bins=60):
        """Draw the histogram of ``self.vec`` and the rescaled target curve.

        :param int bins: binning of the (density) histogram
        """
        hist_output = pylab.hist(self.vec, bins, density=True, lw=0.5, ec="k")
        heights = hist_output[0]
        M1 = max(heights)

        # approximation/hack: bring the maximum of the target curve to the
        # height of the tallest bar
        ratio = max(self.Ytarget) / M1
        rescaled_target = self.Ytarget / ratio
        pylab.plot(self.Xtarget, rescaled_target, "-ro")
        pylab.title("simulated (blue) and target (red) distributions")
Esempio n. 13
0
 def plotter(filename, key):
     """Plot ``histograms[key]`` and save the figure into *filename*.

     ``histograms`` and ``count`` are taken from the enclosing scope.
     """
     label = key.replace(" ", "_")
     pylab.ioff()
     histogram = histograms[key]
     histogram.plot(logy=False, lw=2, marker="o")
     title = label + "(%s)" % count
     pylab.title(title)
     pylab.grid(True)
     pylab.savefig(filename)
     # the figure must be closed, otherwise warnings are raised
     pylab.close()
Esempio n. 14
0
 def hist_plot_contig_length(self, bins=40, fontsize=16):
     """Plot the distribution of the contig lengths.

     :param bins: binning of the histogram
     :param int fontsize: font size of the x and y labels
     """
     # the number of sequences is reported in the title
     n_contigs = len(self.fasta.sequences)
     contig_lengths = self.fasta.lengths
     pylab.hist(contig_lengths, lw=1, ec="k", bins=bins)
     pylab.grid()
     pylab.xlabel("Contig length", fontsize=fontsize)
     pylab.ylabel("#", fontsize=fontsize)
     title = "Distribution {} contigs".format(n_contigs)
     pylab.title(title)
Esempio n. 15
0
 def plotter(filename, key):
     """Save the plot of the histogram named *key* into *filename*.

     Relies on ``histograms`` and ``count`` defined in the enclosing scope.
     """
     underscored = key.replace(" ", "_")
     pylab.ioff()
     histograms[key].plot(logy=False, lw=2, marker="o")
     suffix = "(%s)" % count
     pylab.title(underscored + suffix)
     pylab.grid(True)
     pylab.savefig(filename)
     # closing the figure avoids warnings
     pylab.close()
Esempio n. 16
0
    def hist_GC(self,
                bins=50,
                alpha=0.5,
                hold=False,
                fontsize=12,
                grid=True,
                xlabel="GC %",
                ylabel="#",
                label="",
                title=None):
        """Plot histogram GC content

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: fontsize of the x and y labels and title.
        :param bool grid: add grid or not
        :param str xlabel:
        :param str ylabel:
        :param str label: label of the histogram (for the legend)
        :param str title: if None, a title reporting the mean GC is used

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_GC()

        """
        gc_content = self.df.loc[:, 'GC_content']
        mean_GC = np.mean(gc_content)

        # set title if needed
        if title is None:
            title = "GC %%  \n Mean GC : %.2f" % (mean_GC)

        # histogram GC percent
        if hold is False:
            pylab.clf()
        pylab.hist(gc_content,
                   bins=bins,
                   alpha=alpha,
                   label=label + ", mean : " + str(round(mean_GC, 2)) +
                   ", N : " + str(len(self)))
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
        pylab.xlim([0, 100])
        try:
            pylab.tight_layout()
        except Exception:
            # the layout adjustment is a best effort only; Exception (rather
            # than a bare except) lets KeyboardInterrupt/SystemExit through
            pass
Esempio n. 17
0
 def plot_genesets_hist(self, bins=20):
     """Plot the histogram of the sizes of the gene sets.

     :param bins: binning of the histogram
     """
     n_sets = len(self.gene_sets.keys())
     pylab.clf()
     set_sizes = [len(genes) for _name, genes in self.gene_sets.items()]
     pylab.hist(set_sizes, bins=bins, lw=1, ec="k")
     pylab.title("{} gene sets".format(n_sets))
     pylab.xlabel("Gene set sizes")
     pylab.grid(True)
     # the x axis starts at zero; the upper limit is left unchanged
     _lower, upper = pylab.xlim()
     pylab.xlim([0, upper])
Esempio n. 18
0
    def hist_snr(self,
                 bins=50,
                 alpha=0.5,
                 hold=False,
                 fontsize=12,
                 grid=True,
                 xlabel="SNR",
                 ylabel="#",
                 title=""):
        """Overlay the histograms of the SNR of the four letters (ACGT).

        If the ``snr_A`` column holds no value, an image stating that no
        data is available is shown instead.

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the labels and title
        :param bool grid: add a grid
        :param str xlabel:
        :param str ylabel:
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import BAMPacbio
            from sequana import sequana_data
            b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # the old pacbio format stores no SNR: nothing to plot
        n_valid = len(self._df['snr_A'].dropna())
        if n_valid == 0:
            from sequana import sequana_data
            pylab.clf()
            placeholder = pylab.imread(sequana_data("no_data.jpg"))
            pylab.imshow(placeholder)
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        # one histogram per letter, drawn in the order A, C, G, T
        for letter in "ACGT":
            values = self._df.loc[:, 'snr_' + letter]
            pylab.hist(values, alpha=alpha, label=letter, bins=bins)

        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Esempio n. 19
0
    def hist_nb_passes(self,
                       bins=None,
                       alpha=0.5,
                       hold=False,
                       fontsize=12,
                       grid=True,
                       xlabel="Number of ZMW passes",
                       logy=True,
                       ylabel="#",
                       label="",
                       title="Number of ZMW passes"):
        """Plot histogram of number of reads per ZMW (number of passes)

        :param bins: binning given as is to ``pylab.hist``
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the labels and title
        :param bool grid: add a grid
        :param str xlabel:
        :param str ylabel:
        :param bool logy: use log scale on the y axis (default to True)
        :param str label: label of the histogram (for the legend)
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_nb_passes()
        """
        max_nb_pass = self.df.nb_passes.max()
        # k is needed below whatever the value of bins; it used to be
        # defined only when bins was None, which raised a NameError as soon
        # as the user provided a binning
        k = range(1, max_nb_pass + 1)

        # histogram nb passes
        if hold is False:
            pylab.clf()
        pylab.hist(self.df.nb_passes,
                   bins=bins,
                   alpha=alpha,
                   label=label,
                   log=logy,
                   width=1)
        # with fewer than 5 possible values, force the ticks 0 to 5
        if len(k) < 5:
            pylab.xticks(range(6), range(6))

        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Esempio n. 20
0
    def hist_len(self,
                 bins=50,
                 alpha=0.5,
                 hold=False,
                 fontsize=12,
                 grid=True,
                 xlabel="Read Length",
                 ylabel="#",
                 label="",
                 title=None):
        """Plot the histogram of the read lengths.

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the labels and title
        :param bool grid: add a grid
        :param str xlabel:
        :param str ylabel:
        :param str label: label of the histogram (for the legend)
        :param str title: if None, a title reporting the mean length is used

        .. plot::
            :include-source:

            from sequana.pacbio import BAMPacbio
            from sequana import sequana_data
            b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
            b.hist_len()

        """
        if self._df is None:
            self._get_df()
        mean_len = np.mean(self._df.loc[:, 'read_length'])

        # default title
        if title is None:
            title = "Read length  \n Mean length : %.2f" % (mean_len)

        if hold is False:
            pylab.clf()

        # histogram of the read lengths; the legend reports mean and N
        read_lengths = self._df.loc[:, 'read_length']
        legend_label = "%s, mean : %.0f, N : %d" % (label, mean_len, self._N)
        pylab.hist(read_lengths, bins=bins, alpha=alpha, label=legend_label)

        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Esempio n. 21
0
    def run(self,
            bins=50,
            xmin=0,
            xmax=30000,
            step=1000,
            burn=1000,
            alpha=1,
            output_filename=None):
        """Select simulated reads so that their lengths follow the target.

        Each read length of ``self.bam_simul`` is a candidate accepted with
        the probability ``min(alpha, target(candidate) / target(current))``
        where *current* is the last accepted length (initially the mean read
        length of ``self.bam``). The decisions are stored in
        ``self.tokeep`` (one boolean per simulated read) and the accepted
        lengths are plotted together with the target distribution.

        :param bins: binning of the histograms (input reads and samples)
        :param xmin: start of the range used to draw the target curve
        :param xmax: end (excluded) of the range used to draw the target curve
        :param step: step of the range used to draw the target curve
        :param int burn: number of first accepted samples ignored in the
            histogram of the samples
        :param alpha: upper bound of the acceptance probability
        :param output_filename: if provided, ``self.bam_simul.filter_bool``
            is called with this name and ``self.tokeep``
        """
        # compute histogram of the input reads once for all to be used
        # in the target_distribution method
        self.bins = bins
        # density=True replaces the removed "normed" keyword
        self.Y, self.X = np.histogram(self.bam.df.read_length,
                                      bins=bins,
                                      density=True)

        lengths = self.bam_simul.df.read_length.values
        self.tokeep = []
        vec = []
        x = self.bam.df.read_length.mean()
        for i in range(self.bam_simul.df.shape[0]):
            can = lengths[i]
            # acceptance probability
            aprob = min([
                alpha,
                self.target_distribution(can) / self.target_distribution(x)
            ])
            u = pylab.uniform(0, 1)
            if u < aprob:
                x = can
                vec.append(x)
                self.tokeep.append(True)
            else:
                self.tokeep.append(False)

        # plotting the results:
        # theoretical curve
        x = pylab.arange(xmin, xmax, step)
        y = self.target_distribution(x)
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(vec)
        pylab.subplot(212)

        # "normed" is no longer accepted by matplotlib; density=True is the
        # equivalent (and is what the check method already uses)
        pylab.hist(vec[burn:], bins=bins, density=True)
        pylab.plot(x, y, 'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF', 'Samples'))

        if output_filename is not None:
            self.bam_simul.filter_bool(output_filename, self.tokeep)
Esempio n. 22
0
    def plot_GC_read_len(self,
                         hold=False,
                         fontsize=12,
                         bins=[60, 60],
                         grid=True,
                         xlabel="GC %",
                         ylabel="#",
                         cmap="BrBG"):
        """Plot a 2D histogram of the GC content versus the read length.

        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: for x and y labels and title
        :param bins: a integer or tuple of 2 integers to specify
            the binning of the x and y 2D histogram.
        :param bool grid: add a grid
        :param str xlabel: not used in this method (the label is fixed)
        :param str ylabel: not used in this method (the label is fixed)
        :param cmap: colormap given to the 2D histogram

        .. plot::
            :include-source:

            from sequana.pacbio import BAMPacbio
            from sequana import sequana_data
            b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
            b.plot_GC_read_len(bins=[10, 10])

        """
        if self._df is None:
            self._get_df()

        # means reported in the title
        mean_len = np.mean(self._df.loc[:, 'read_length'])
        mean_GC = np.mean(self._df.loc[:, 'GC_content'])

        if hold is False:
            pylab.clf()

        # rows with a missing length or GC content are dropped
        columns = ['read_length', 'GC_content']
        data = self._df.loc[:, columns].dropna()
        hist2d = biokit.viz.hist2d.Hist2D(data)
        hist2d.plot(bins=bins,
                    contour=False,
                    norm='log',
                    Nlevels=6,
                    cmap=cmap)

        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("GC %", fontsize=fontsize)
        title = "GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" % (
            mean_len, mean_GC)
        pylab.title(title, fontsize=fontsize)
        pylab.ylim([0, 100])
        if grid is True:
            pylab.grid(True)
Esempio n. 23
0
    def hist_ZMW_subreads(self,
                          alpha=0.5,
                          hold=False,
                          fontsize=12,
                          grid=True,
                          xlabel="Number of ZMW passes",
                          logy=True,
                          ylabel="#",
                          label="",
                          title="Number of ZMW passes"):
        """Bar plot of the number of reads per ZMW (number of passes).

        :param float alpha: transparency of the bars
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the labels and title
        :param bool grid: add a grid
        :param str xlabel:
        :param str ylabel:
        :param bool logy: use log scale on the y axis (default to True)
        :param str label: label of the histogram (for the legend)
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import BAMPacbio
            from sequana import sequana_data
            b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
            b.hist_ZMW_subreads()
        """
        if self._nb_pass is None:
            self._get_ZMW_passes()

        # every number of passes from 1 up to the largest key
        largest = max(self._nb_pass.keys())
        passes = range(1, largest + 1)
        occurrences = []
        for nb_pass in passes:
            occurrences.append(self._nb_pass[nb_pass])

        if hold is False:
            pylab.clf()
        pylab.bar(passes, occurrences, alpha=alpha, label=label, log=logy)
        # with fewer than 5 values, force the ticks 0 to 5
        if len(passes) < 5:
            pylab.xticks(range(6), range(6))

        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Esempio n. 24
0
    def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
        """Count the mapped entries per group of reference names.

        Rows whose ``reference_name`` is -1 (or '-1') are ignored. The group
        of a row is made of the first *shift* characters of its reference
        name.

        :param str title: title of the bar plot
        :param int shift: number of leading characters defining the group
        :param bool plot: draw a bar plot of the counts
        :param mapq_min: only rows with ``mapq`` strictly greater than this
            value are counted
        :return: tuple made of the counts per group (an empty Series when no
            row has a valid reference name) and ``self.df``
        """
        aligned = self.df.query("reference_name not in [-1, '-1']").copy()
        if len(aligned) == 0:
            return pd.Series(), self.df

        aligned['group'] = aligned.reference_name.apply(
            lambda name: name[0:shift])
        selected = aligned.query("mapq>@mapq_min")
        mapped = selected.groupby("group").count()["mapq"]
        mapped.name = None

        if plot:
            mapped.plot(kind="bar")
            pylab.title(title)
            pylab.tight_layout()
        return mapped, self.df
Esempio n. 25
0
    def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
        """Return (and optionally plot) the number of mapped rows per group.

        The group is the prefix (first *shift* characters) of the reference
        name; rows whose ``reference_name`` is -1 (or '-1') are discarded.

        :param str title: title of the bar plot
        :param int shift: length of the prefix defining the group
        :param bool plot: draw a bar plot of the counts
        :param mapq_min: rows are counted if ``mapq`` is strictly greater
            than this value
        :return: the counts per group (an empty Series if no row is left
            after discarding) and ``self.df``
        """
        with_reference = self.df.query(
            "reference_name not in [-1, '-1']").copy()
        if not len(with_reference):
            return pd.Series(), self.df

        prefixes = with_reference.reference_name.apply(lambda x: x[0:shift])
        with_reference['group'] = prefixes
        counts = with_reference.query("mapq>@mapq_min").groupby("group").count()
        mapped = counts["mapq"]
        mapped.name = None

        if plot:
            mapped.plot(kind="bar")
            pylab.title(title)
            pylab.tight_layout()
        return mapped, self.df
Esempio n. 26
0
    def hist_ORF_CDS_linearscale(self, alpha=0.5, bins=40, xlabel="Length", ylabel="#"):
        """Plot the histograms of the ORF and CDS lengths on the same axes.

        :param float alpha: transparency of the histograms
        :param bins: binning of both histograms
        :param str xlabel:
        :param str ylabel:
        """
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        # missing lengths are ignored
        orf_lengths = self._ORF_pos["len_ORF"].dropna()
        cds_lengths = self._ORF_pos["len_CDS"].dropna()
        orf_label = "ORF, N = " + str(orf_lengths.shape[0])
        cds_label = "CDS, N = " + str(cds_lengths.shape[0])

        pylab.hist(orf_lengths, alpha=alpha, label=orf_label, bins=bins)
        pylab.hist(cds_lengths, alpha=alpha, label=cds_label, bins=bins)
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend()
        title = "Length of ORF and CDS (after filter %s > %d)" % (
            self._type_filter, self._threshold)
        pylab.title(title)
Esempio n. 27
0
    def boxplot_quality(self, color_line='r', bgcolor='grey', color='yellow', lw=4,
            hold=False, ax=None):
        """Plot the mean quality per position with a one-std band.

        The columns "0" to "41" of ``self.df`` are used as weights of the
        quality scores at each position (rows 1 to 150); they are divided by
        ``self.metadata['ReadNum']`` to get the mean and standard deviation.
        ``self.xmax`` is set to 150 by this method.

        :param color_line: colour of the mean curve
        :param bgcolor: not used in this method
        :param color: colour of the mean +/- std band
        :param lw: line width of the mean curve
        :param hold: not used in this method
        :param ax: if provided, axes made current before plotting
        """
        # not sure why we have phred score from 0 to 41
        quality = self.df[[str(x) for x in range(42)]]
        N = self.metadata['ReadNum']

        self.xmax = 150
        xmax = self.xmax + 1
        if ax:
            pylab.sca(ax) # pragma no cover
        pylab.fill_between([0,xmax], [0,0], [20,20], color='red', alpha=0.3)
        pylab.fill_between([0,xmax], [20,20], [30,30], color='orange', alpha=0.3)
        pylab.fill_between([0,xmax], [30,30], [41,41], color='green', alpha=0.3)

        X = []
        Q = []
        S = []
        # the loop bound follows self.xmax instead of repeating the constant
        for pos in range(1, self.xmax + 1):
            counts = quality.loc[pos]
            mean_quality = sum((int(k) + 1) * v for k, v in counts.items()) / N
            proba = counts / N

            # NOTE(review): the mean uses score+1 whereas the deviation
            # below uses the raw score (0..41) -- confirm which is intended
            std = pylab.sqrt(sum((x - mean_quality)**2 * y
                                 for x, y in zip(range(42), proba)))
            X.append(pos)
            Q.append(mean_quality)
            S.append(std)

        Q = np.array(Q)
        X = np.array(X)
        S = np.array(S)
        pylab.fill_between(X, Q+S, Q-S,
            color=color, interpolate=False)

        pylab.plot(X, Q, color=color_line, lw=lw)
        pylab.ylim([0, 41])
        pylab.xlim([0, self.xmax+1])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Quality")
        pylab.grid(axis='x')
Esempio n. 28
0
    def hist_length_repeats(self, bins=20, alpha=0.5, hold=False,
            fontsize=12, grid=True, title="Repeat length",
            xlabel="Repeat length", ylabel="#", logy=True):
        """Plot the histogram of the repeat lengths.

        :param bins: binning of the histogram
        :param float alpha: transparency of the histogram
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the x and y labels
        :param bool grid: add a grid
        :param str title:
        :param str xlabel:
        :param str ylabel:
        :param bool logy: use a log scale on the y axis
        """
        if hold is False:
            pylab.clf()

        repeat_lengths = self.list_len_repeats
        pylab.hist(repeat_lengths, alpha=alpha, bins=bins)
        pylab.title(title)
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)

        if grid is True:
            pylab.grid(True)
        if logy:
            pylab.semilogy()
Esempio n. 29
0
    def hist_polymerase_per_barcode(self, bins=10, fontsize=12):
        """histogram of number of polymerase per barcode

        Cumulative histogram gives total number of polymerase reads

        :param bins: binning of the histogram
        :param int fontsize: font size of the x and y labels
        """
        polymerase_reads = self.df_barcoded["Polymerase Reads"]
        PR = polymerase_reads.sum()
        polymerase_reads.hist(bins=bins, ec="k", rwidth=0.8)
        pylab.title("Total Polymerase count: {}".format(PR))
        pylab.xlabel("Number of Polymerase Reads", fontsize=fontsize)
        pylab.ylabel("Number of Barcoded Samples", fontsize=fontsize)
        try:
            pylab.tight_layout()
        except Exception:
            # the layout adjustment is a best effort only; Exception (rather
            # than a bare except) lets KeyboardInterrupt/SystemExit through
            pass
Esempio n. 30
0
    def hist_GC(self, bins=50, hold=False, fontsize=12,
                grid=True,xlabel="GC %",ylabel="#"):
        """Plot the histogram of the GC content.

        The title reports the mean GC content.

        :param int bins: binning for the histogram
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the labels and title
        :param bool grid: add a grid
        :param str xlabel:
        :param str ylabel:
        """
        if self._df is None:
            self._get_df()
        mean_GC = np.mean(self._df.loc[:, 'GC_content'])

        if hold is False:
            pylab.clf()

        gc_content = self._df.loc[:, 'GC_content']
        pylab.hist(gc_content, bins=bins)
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        title = "GC %%  \n Mean GC : %.2f" % (mean_GC)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Esempio n. 31
0
    def hist_read_length_consensus_isoform(self,
                                           mode="all",
                                           bins=80,
                                           rwidth=0.8,
                                           align="left",
                                           fontsize=16,
                                           edgecolor="k",
                                           **kwargs):
        """Histogram of the read lengths with a cumulative curve on top.

        :param str mode: "all" uses low and high quality sequences, "lq"
            only ``self.lq_sequence``; any other value uses
            ``self.hq_sequence``
        :param bins: binning of the histogram
        :param float rwidth: relative width of the bars
        :param str align: alignment of the bars
        :param int fontsize: font size of the axis labels
        :param edgecolor: edge colour of the bars
        :param kwargs: any other argument is given to ``pylab.hist``
        """
        pylab.clf()

        lq_lengths = [len(read['sequence']) for read in self.lq_sequence]
        hq_lengths = [len(read['sequence']) for read in self.hq_sequence]
        if mode == "all":
            lengths = lq_lengths + hq_lengths
        elif mode == "lq":
            lengths = lq_lengths
        else:
            lengths = hq_lengths

        hist_options = dict(bins=bins, rwidth=rwidth, align=align,
                            ec=edgecolor)
        counts, edges, _ = pylab.hist(lengths, **hist_options, **kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        ax_twin = pylab.gca().twinx()

        # half a bin width
        shift = (edges[1] - edges[0]) / 2

        # number of reads not yet accumulated after each bin
        remaining = len(lengths) - pylab.cumsum(counts)
        ax_twin.plot(edges[1:] - shift, remaining, "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)

        pylab.title("Read length of Consensus isoforms reads")
Esempio n. 32
0
    def histogram_gc_content(self):
        """Plot histogram of GC content

        The values of ``self.gc_list`` are binned into one-percent-wide
        bins covering the full 0-100 range.

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.histogram_gc_content()

        """
        # A sequence passed as ``bins`` is interpreted as bin edges. The 101
        # edges 0..100 yield 100 bins; with range(0, 100) the last edge was
        # 99 and every value above 99% was silently left out.
        pylab.hist(self.gc_list, bins=range(0, 101))
        pylab.grid()
        pylab.title("GC content distribution (per sequence)")
        pylab.xlabel(r"Mean GC content (%)", fontsize=self.fontsize)
        pylab.xlim([0,100])
Esempio n. 33
0
    def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40,
        xlabel="Frame", ylabel="#", bar_width=0.35):
        """Grouped bar plot of the number of ORF and CDS per frame.

        For each frame in -3, -2, -1, 1, 2, 3 the non-null entries of the
        ``len_ORF`` and ``len_CDS`` columns of ``self._ORF_pos`` are counted
        and drawn side by side.

        :param float alpha: transparency of the bars
        :param bins: not used by this method
        :param str xlabel: label of the x axis
        :param str ylabel: label of the y axis
        :param float bar_width: width of each bar; the two series are
            shifted by half of it on either side of the frame value
        """
        # self._find_ORF_CDS() is invoked when no table is cached yet.
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        frames = [-3, -2, -1, 1, 2, 3]
        orf_counts = []
        cds_counts = []
        for frame in frames:
            in_frame = self._ORF_pos[self._ORF_pos["frame"] == frame]
            orf_counts.append(in_frame["len_ORF"].dropna().shape[0])
            cds_counts.append(in_frame["len_CDS"].dropna().shape[0])

        positions = np.array(frames)
        half_width = bar_width / 2
        pylab.bar(positions - half_width, orf_counts, bar_width,
                  alpha=alpha, label="ORF N = %d" % sum(orf_counts))
        pylab.bar(positions + half_width, cds_counts, bar_width,
                  alpha=alpha, label="CDS N = %d" % sum(cds_counts))
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend(loc=1)
        pylab.title("Number of ORF and CDS by frame")
Esempio n. 34
0
    def plot_GC_read_len(self, alpha=0.07, hold=False, fontsize=12,
                grid=True,xlabel="GC %",ylabel="#"):
        """Plot GC content versus read length as a 2D histogram.

        :param float alpha: not used by this method (kept so that existing
            callers keep working)
        :param bool hold: the current figure is cleared only when this is
            exactly ``False``
        :param int fontsize: font size of the x and y labels
        :param bool grid: not used by this method
        :param str xlabel: not used by this method; the x label is always
            "Read length"
        :param str ylabel: not used by this method; the y label is always
            "GC %"
        """

        # self._get_df() is invoked when no dataframe is cached yet.
        if self._df is None:
            self._get_df()
        mean_len = np.mean(self._df.loc[:, 'read_length'])
        mean_GC = np.mean(self._df.loc[:, 'GC_content'])

        if hold is False:
            pylab.clf()
        data = self._df.loc[:, ['read_length', 'GC_content']]
        h = hist2d.Hist2D(data)
        # NOTE(review): the keyword was spelled ``nnorm``; ``norm`` is assumed
        # to be the intended Hist2D.plot argument -- confirm against biokit.
        h.plot(bins=[40, 40], contour=False, norm='log', Nlevels=6)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("GC %", fontsize=fontsize)
        # The literal percent sign must be doubled: with a single "%" the
        # %-formatting below raised ValueError (unsupported format character).
        pylab.title("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f"
                    % (mean_len, mean_GC))
Esempio n. 35
0
    def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="GC %", ylabel="#", label="",title=None):
        """Plot histogram GC content

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold: the current figure is cleared only when this is
            exactly ``False``
        :param int fontsize: fontsize of the x and y labels and title.
        :param bool grid: add grid or not
        :param str xlabel: label of the x axis
        :param str ylabel: label of the y axis
        :param str label: label of the histogram (for the legend); the mean
            GC and ``len(self)`` are appended to it
        :param str title: title of the plot; when None, a title showing
            the mean GC is built

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_GC()

        """
        gc_values = self.df.loc[:, 'GC_content']
        mean_GC = np.mean(gc_values)

        # set title if needed
        if title is None:
            title = "GC %%  \n Mean GC : %.2f" % (mean_GC)

        # histogram GC percent
        if hold is False:
            pylab.clf()
        legend_label = (label + ", mean : " + str(round(mean_GC, 2))
                        + ", N : " + str(len(self)))
        pylab.hist(gc_values, bins=bins, alpha=alpha, label=legend_label)
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
        pylab.xlim([0, 100])
        # tight_layout is best effort: a layout failure must not abort the
        # plot, but KeyboardInterrupt/SystemExit must not be swallowed as a
        # bare ``except`` would.
        try:
            pylab.tight_layout()
        except Exception:
            pass
Esempio n. 36
0
    def plot_alignment(self,
                       motif,
                       window=200,
                       global_th=10,
                       title=None,
                       legend=True,
                       legend_fontsize=11):
        """Plot alignments that match the motif.

        For every alignment of ``self.bamfile`` whose query name is listed
        in the dataframe returned by ``self._get_aligments`` and that has a
        non-empty query sequence, the number of occurrences of *motif* in
        the window starting at each position is drawn against the reference
        coordinate.

        :param str motif: motif counted in each window
        :param int window: length of the window in which the motif is counted
        :param global_th: forwarded to ``self._get_aligments``
        :param str title: optional title of the plot
        :param bool legend: show a legend (only when fewer than 15 curves
            were drawn)
        :param int legend_fontsize: font size of the legend
        :return: the dataframe returned by ``self._get_aligments``
        """
        df = self._get_aligments(motif=motif,
                                 window=window,
                                 global_th=global_th)
        print("Found {} hits".format(len(df)))
        bam = BAM(self.bamfile)
        pylab.clf()

        # Built once: testing membership against the numpy array inside the
        # loop rescanned every name for every alignment.
        hit_names = set(df.query_name.values)

        count = 0
        for aln in bam:
            if aln.query_name not in hit_names:
                continue
            seq = aln.query_sequence
            if not seq:
                continue
            count += 1
            motif_counts = [
                seq[i:i + window].count(motif) for i in range(len(seq))
            ]
            start = aln.reference_start
            pylab.plot(range(start, start + len(seq)),
                       motif_counts,
                       label=aln.query_name)

        # Upper limit of the y axis: the number of motif lengths fitting in
        # one window, with a 20% margin.
        max_theo = int(1.2 * window / len(motif))
        pylab.ylim([0, max_theo])
        if legend and count < 15:
            pylab.legend(fontsize=legend_fontsize)
        if title:
            pylab.title(title, fontsize=16)

        return df
Esempio n. 37
0
    def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
                          grid=True, xlabel="Number of ZMW passes", logy=True,
                          ylabel="#", label="", title="Number of ZMW passes"):
        """Plot histogram of number of reads per ZMW (number of passes)

        :param bins: binning forwarded to :func:`pylab.hist` (None lets
            matplotlib choose its default binning)
        :param float alpha: transparency of the histograms
        :param bool hold: the current figure is cleared only when this is
            exactly ``False``
        :param int fontsize: font size of the labels and title
        :param bool grid: add grid or not
        :param str xlabel: label of the x axis
        :param str ylabel: label of the y axis
        :param bool logy: use log scale on the y axis (default to True)
        :param str label: label of the histogram (for the legend)
        :param str title: title of the plot

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_nb_passes()
        """
        max_nb_pass = self.df.nb_passes.max()
        # Computed unconditionally: the tick adjustment below needs it whatever
        # the value of ``bins``. It used to be defined only when ``bins`` was
        # None, so passing explicit bins raised a NameError.
        k = range(1, max_nb_pass + 1)

        # histogram nb passes
        if hold is False:
            pylab.clf()
        pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha,
                   label=label, log=logy, width=1)
        # With fewer than 5 distinct pass numbers, force ticks on 0..5.
        if len(k) < 5:
            pylab.xticks(range(6), range(6))

        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Esempio n. 38
0
    def plot_scatter_contig_length_nread_cov(self,
                                             fontsize=16,
                                             vmin=0,
                                             vmax=50,
                                             min_nreads=20,
                                             min_length=50000):
        """Scatter plot of contig length versus number of reads.

        Points are coloured by the ``covStat`` column. A least-squares line
        ``nread = m * length + c`` is fitted on the contigs having more than
        *min_nreads* reads and a length above *min_length*, and drawn
        between the smallest and largest contig lengths. Both axes use a
        log scale.

        :param int fontsize: font size of the axis labels
        :param vmin: lower bound of the colour scale
        :param vmax: upper bound of the colour scale
        :param int min_nreads: contigs need more reads than this to enter
            the fit
        :param int min_length: contigs need to be longer than this to enter
            the fit
        """

        if self._df is None:
            _ = self.get_df()
        pylab.clf()
        df = self._df

        m1 = df.length.min()
        M1 = df.length.max()

        # least square, on the filtered contigs only (query evaluated once)
        selection = df.query("nread>@min_nreads and length>@min_length")
        A = np.vstack([selection['length'], np.ones(len(selection))]).T
        # ``.values`` replaces Series.as_matrix(), removed in pandas 1.0;
        # rcond=None selects NumPy's current default explicitly and avoids
        # the FutureWarning emitted by older releases.
        m, c = np.linalg.lstsq(A, selection['nread'].values, rcond=None)[0]
        x = np.array([m1, M1])

        # the scatter plot itself uses every contig
        X = df['length']
        Y = df['nread']
        Z = df['covStat']
        pylab.scatter(X, Y, c=Z, vmin=vmin, vmax=vmax)
        pylab.colorbar()
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("Contig reads", fontsize=fontsize)
        pylab.title("coverage function of contig length and reads used")
        pylab.grid()
        pylab.plot(x, m * x + c, "o-r")
        pylab.loglog()
        pylab.tight_layout()
Esempio n. 39
0
    def hist_ZMW_subreads(self, hold=False, fontsize=12,
                            grid=True, xlabel="Number of ZMW passes", ylabel="#"):
        """Plot histogram of number of reads per ZMW.

        ``self._nb_pass`` is read for every pass number from 1 up to its
        largest key, and those values are used as histogram weights. The
        y axis uses a log scale.

        :param bool hold: the current figure is cleared only when this is
            exactly ``False``
        :param int fontsize: font size of the labels and title
        :param bool grid: the grid is switched on only when this is
            exactly ``True``
        :param str xlabel: label of the x axis
        :param str ylabel: label of the y axis
        """
        # self._get_ZMW_passes() is invoked when nothing is cached yet.
        if self._nb_pass is None:
            self._get_ZMW_passes()

        highest = max(self._nb_pass.keys())
        pass_numbers = range(1, highest + 1)
        weights = [self._nb_pass[number] for number in pass_numbers]

        if hold is False:
            pylab.clf()

        # one bin per pass number
        pylab.hist(pass_numbers, weights=weights, bins=highest)
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.yscale('log')
        pylab.title("Number of ZMW passes", fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Esempio n. 40
0
    def plot_count_per_sample(self, fontsize=12, sample_list=None):
        """Bar plot of the total read count (in millions) per sample.

        Sample names are taken from the columns of ``self.df`` whose name
        starts with "norm", with the "norm." part removed; the columns
        bearing the resulting names are then summed. The bar colours are
        three red entries followed by three blue ones.

        :param int fontsize: font size of the labels and title
        :param sample_list: not used by this method

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_count_per_sample()
        """
        sample_names = [
            column.replace("norm.", "")
            for column in self.df.columns
            if column.startswith("norm")
        ]
        totals = self.df[sample_names].sum()
        n_samples = len(sample_names)
        bar_colors = ['r'] * 3 + ['b'] * 3

        pylab.clf()
        pylab.bar(range(n_samples), (totals / 1000000).values,
                  color=bar_colors, alpha=1)
        pylab.xlabel("Samples", fontsize=fontsize)
        pylab.ylabel("Total read count (millions)", fontsize=fontsize)
        pylab.grid(True)
        pylab.title("Total read count per sample", fontsize=fontsize)
Esempio n. 41
0
    def barplot(self, filename="lane{}_status.png", lanes=None):
        """Save one bar plot of read counts per lane.

        For each lane, the counts of every entry not named "Undetermined"
        are drawn in blue, followed by one bar for the summed "Undetermined"
        count. That last bar is green when undetermined reads amount to
        less than 20% of the determined total, orange below 50%, and red
        otherwise (also red when the determined total is zero).

        :param str filename: output file name pattern; ``{}`` is replaced
            by the lane
        :param lanes: lanes to process; defaults to the unique values of
            the ``lane`` column returned by ``self.get_data_reads()``
        """
        df = self.get_data_reads()
        if lanes is None:
            lanes = df.lane.unique()

        for lane in lanes:
            pylab.clf()
            query = "lane==@lane and name!='Undetermined'"
            counts = df.query(query)['count']
            total = counts.sum()
            L = len(counts)

            query = "lane==@lane and name=='Undetermined'"
            under = df.query(query)['count'].sum()
            if total > 0:
                pylab.bar(range(L), counts, color="b", label="reads")

            if total == 0:
                color = "red"
            else:
                # percentage of undetermined reads, computed once
                percent_under = 100 * under / total
                if percent_under < 20:
                    color = "green"
                elif percent_under < 50:
                    color = "orange"
                else:
                    color = "red"

            pylab.bar(range(L, L + 1),
                      under,
                      color=color,
                      label="undetermined")
            pylab.xticks([])
            pylab.ylabel("Number of reads")
            # The legend is best effort, but a bare ``except`` would also
            # swallow KeyboardInterrupt/SystemExit.
            try:
                pylab.legend(loc="lower left")
            except Exception:
                pass
            pylab.title("Lane {}".format(lane))
            pylab.savefig(filename.format(lane), dpi=200)
Esempio n. 42
0
    def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,alpha=1,output_filename=None):
        """Accept/reject the simulated read lengths and plot the outcome.

        Each read length of ``self.bam_simul.df`` is taken as a candidate
        and accepted when a uniform draw in [0, 1) is below
        ``min(alpha, target(candidate) / target(current))``; the accepted
        candidate then becomes the current value. One boolean per candidate
        is stored in ``self.tokeep``.

        :param bins: binning of the input histogram (stored in
            ``self.bins``) and of the plotted histogram
        :param xmin: start of the x range of the plotted target curve
        :param xmax: end (excluded) of that range
        :param step: step of that range
        :param int burn: number of leading accepted values left out of the
            plotted histogram
        :param alpha: upper bound of the acceptance probability
        :param output_filename: when not None, ``self.tokeep`` is passed to
            ``self.bam_simul.filter_bool`` together with this name
        """
        # Histogram of the input reads, computed once; presumably what
        # target_distribution relies on -- confirm against that method.
        self.bins = bins
        self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins, density=True)

        candidates = self.bam_simul.df.read_length.values
        self.tokeep = []
        accepted = []
        current = self.bam.df.read_length.mean()
        for candidate in candidates:
            ratio = self.target_distribution(candidate) / self.target_distribution(current)
            # acceptance probability
            aprob = min([alpha, ratio])
            draw = pylab.uniform(0, 1)
            if draw < aprob:
                current = candidate
                accepted.append(current)
                self.tokeep.append(True)
            else:
                self.tokeep.append(False)

        # theoretical curve
        grid_x = pylab.arange(xmin, xmax, step)
        grid_y = self.target_distribution(grid_x)

        # upper panel: trace of the accepted values
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(accepted)

        # lower panel: histogram of the accepted values and target curve
        pylab.subplot(212)
        pylab.hist(accepted[burn:], bins=bins, density=1)
        pylab.plot(grid_x, grid_y, 'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF', 'Samples'))

        if output_filename is not None:
            self.bam_simul.filter_bool(output_filename, self.tokeep)
Esempio n. 43
0
    def plot(self, bins=80, rwidth=0.8, **kwargs):
        """Plot a histogram of ``self.data``.

        Labels, title and grid setting come from ``self.xlabel``,
        ``self.ylabel``, ``self.title``, ``self.grid`` and ``self.fontsize``.

        :param bins: binning forwarded to :func:`pylab.hist`
        :param float rwidth: relative bar width forwarded to
            :func:`pylab.hist`
        :param kwargs: any other keyword is forwarded to :func:`pylab.hist`
        """
        pylab.clf()
        pylab.hist(self.data, bins=bins, rwidth=rwidth, **kwargs)

        pylab.xlabel(self.xlabel, fontsize=self.fontsize)
        pylab.ylabel(self.ylabel, fontsize=self.fontsize)

        # NOTE: a disabled twin-axis cumulative overlay used to sit here as a
        # bare string literal; it never executed and has been removed.
        pylab.grid(self.grid)
        pylab.title(self.title)
        # tight_layout is best effort, but a bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit.
        try:
            pylab.tight_layout()
        except Exception:
            pass
Esempio n. 44
0
    def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
                grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
        """Plot GC content versus read length

        Rows with a missing read length or GC content are dropped before
        the 2D histogram is drawn.

        :param bool hold: the current figure is cleared only when this is
            exactly ``False``
        :param int fontsize: for x and y labels and title
        :param bins: a integer or tuple of 2 integers to specify
            the binning of the x and y 2D histogram.
        :param bool grid: the grid is switched on only when this is
            exactly ``True``
        :param str xlabel: not used by this method
        :param str ylabel: not used by this method
        :param cmap: colormap forwarded to the 2D histogram plot

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.plot_GC_read_len(bins=[10, 10])

        """
        mean_len = np.mean(self.df.loc[:, 'read_length'])
        mean_GC = np.mean(self.df.loc[:, 'GC_content'])
        header = ("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" %
                  (mean_len, mean_GC))

        if hold is False:
            pylab.clf()

        complete_rows = self.df.loc[:, ['read_length', 'GC_content']].dropna()
        histogram = biokit.viz.hist2d.Hist2D(complete_rows)
        histogram.plot(bins=bins, contour=False, norm='log', Nlevels=6,
                       cmap=cmap)

        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("GC %", fontsize=fontsize)
        pylab.title(header, fontsize=fontsize)
        pylab.ylim([0, 100])
        if grid is True:
            pylab.grid(True)
Esempio n. 45
0
    def diagnostics(self, bins=60, clear=True):
        """Three stacked diagnostic panels.

        From top to bottom: histogram of ``self.aprob``, line plot of
        ``self.vec``, and density histogram of ``self.vec`` overlaid with
        the target curve (``self.Xtarget``, ``self.Ytarget``) rescaled so
        that its maximum equals the tallest histogram bar.

        :param bins: binning of the two histograms
        :param bool clear: clear the current figure first
        """
        if clear:
            pylab.clf()

        # top panel
        pylab.subplot(3, 1, 1)
        pylab.hist(self.aprob, bins=bins)
        pylab.title("Acceptation")

        # middle panel
        pylab.subplot(3, 1, 2)
        pylab.plot(self.vec)
        pylab.title("proposition")

        # bottom panel
        pylab.subplot(3, 1, 3)
        heights, _, _ = pylab.hist(self.vec, bins, density=True, lw=0.5, ec="k")
        tallest = max(heights)

        # this normalisation is an approximation/hack
        scale = max(self.Ytarget) / tallest
        pylab.plot(self.Xtarget, self.Ytarget / scale, "-ro")
        pylab.title("simulated (blue) and target (red) distributions")
Esempio n. 46
0
 def hist_median_ccs(self, bins=1000, **kwargs):
     """Histogram of the median read length within each ZMW group.

     :param bins: binning forwarded to the ``hist`` call
     :param kwargs: any other keyword is forwarded to the ``hist`` call
     :return: the ``read_length``/``ZMW`` selection of ``self.df`` grouped
         by ``ZMW`` (the groupby object, not the medians)
     """
     grouped = self.df[['read_length', 'ZMW']].groupby('ZMW')
     medians = grouped.median()
     medians.hist(bins=bins, **kwargs)
     pylab.title("CCS median read length")
     return grouped