コード例 #1
0
ファイル: isoseq.py プロジェクト: sequana/sequana
    def hist_average_quality(self, fontsize=16, bins=None):
        """

        bins is from 0 to 94 
        """

        hq_qv = [pylab.mean([ord(X)-33 for X in read['quality'].decode()]) 
                for read in self.hq_sequence]
        lq_qv = [pylab.mean([ord(X) -33 for X in read['quality'].decode()]) 
            for read in self.lq_sequence]

        if bins is None:
            bins = range(0,94)
        Y1, X = np.histogram(hq_qv, bins=bins)
        Y2, X = np.histogram(lq_qv, bins=bins)
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        N = np.sum(Y1+Y2)
        ax.plot(X, [N] + list(N-np.cumsum(Y1+Y2)), "k")
コード例 #2
0
    def hist_average_quality(self, fontsize=16, bins=None):
        """

        bins is from 0 to 94 
        """

        hq_qv = [pylab.mean([ord(X)-33 for X in read['quality'].decode()]) 
                for read in self.hq_sequence]
        lq_qv = [pylab.mean([ord(X) -33 for X in read['quality'].decode()]) 
            for read in self.lq_sequence]

        if bins is None:
            bins = range(0,94)
        Y1, X = np.histogram(hq_qv, bins=bins)
        Y2, X = np.histogram(lq_qv, bins=bins)
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        N = np.sum(Y1+Y2)
        ax.plot(X, [N] + list(N-np.cumsum(Y1+Y2)), "k")
コード例 #3
0
 def stats(self):
     return {
         "mean_read_length": pylab.mean(self.lengths),
         "ZMW": len(self.passes),
         "N": len(self.lengths),
         "mean_ZMW_passes": pylab.mean(self.passes)
     }
コード例 #4
0
ファイル: bamtools.py プロジェクト: ranjit58/sequana
    def plot_gc_content(self, fontsize=16, ec="k", bins=100):
        """plot GC content histogram

        :params bins: a value for the number of bins or an array (with a copy()
            method)
        :param ec: add black contour on the bars

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam'))
            b.plot_gc_content()

        """
        data = self.get_gc_content()
        try:
            X = np.linspace(0, 100, bins)
        except:
            X = bins.copy()

        pylab.hist(data, X, normed=True, ec=ec)
        pylab.grid(True)
        mu = pylab.mean(data)
        sigma = pylab.std(data)

        X = pylab.linspace(X.min(), X.max(), 100)
        pylab.plot(X, pylab.normpdf(X, mu, sigma), lw=2, color="r", ls="--")
        pylab.xlabel("GC content", fontsize=16)
コード例 #5
0
ファイル: bamtools.py プロジェクト: sequana/sequana
    def plot_gc_content(self, fontsize=16, ec="k", bins=100):
        """plot GC content histogram

        :params bins: a value for the number of bins or an array (with a copy()
            method)
        :param ec: add black contour on the bars

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam'))
            b.plot_gc_content()

        """
        data = self.get_gc_content()
        try:
            X = np.linspace(0, 100, bins)
        except:
            X = bins.copy()

        pylab.hist(data, X, density=True, ec=ec)
        pylab.grid(True)
        mu = pylab.mean(data)
        sigma = pylab.std(data)

        X = pylab.linspace(X.min(), X.max(), 100)

        from sequana.misc import normpdf

        pylab.plot(X, normpdf(X, mu, sigma), lw=2, color="r", ls="--")
        pylab.xlabel("GC content", fontsize=16)
コード例 #6
0
ファイル: bamtools.py プロジェクト: ranjit58/sequana
 def get_metrics_count(self):
     """ Count flags/mapq/read length in one pass."""
     mapq_dict = {}
     read_length_dict = {}
     flag_dict = {}
     mean_qualities = []
     for read in self:
         self._count_item(mapq_dict, read.mapq)
         self._count_item(flag_dict, read.flag)
         if read.is_unmapped is False:
             self._count_item(read_length_dict, read.reference_length)
         mean_qualities.append(pylab.mean(read.query_qualities))
     self.metrics_count = {"mapq": mapq_dict,
                           "read_length": read_length_dict,
                           "flags": flag_dict,
                             "mean_quality": pylab.mean(mean_qualities)}
     return self.metrics_count
コード例 #7
0
ファイル: bamtools.py プロジェクト: wenliangz/sequana
    def _get_summary(self):
        """Count flags/mapq/read length in one pass."""
        if self._summary is not None:
            return self._summary

        mapq_dict = {}
        read_length_dict = {}
        flag_dict = {}
        mean_qualities = []
        for read in self:
            self._count_item(mapq_dict, read.mapq)
            self._count_item(flag_dict, read.flag)
            if read.is_unmapped is False:
                self._count_item(read_length_dict, read.reference_length)
            try:mean_qualities.append(pylab.mean(read.query_qualities))
            except:mean_qualities.append(read.query_qualities)
        self._summary = {"mapq": mapq_dict,
                         "read_length": read_length_dict,
                         "flags": flag_dict,
                         "mean_quality": pylab.mean(mean_qualities)
                         }
        return self._summary
コード例 #8
0
    def stats(self):
        results = {}
        if self.data is not None:
            logger.info("Reading strand")
            results['strand'] = {
                "+": sum(self.data.strand == "+"),
                "-": sum(self.data.strand == "-"),
                "?": sum(self.data.strand.isnull())
            }

            results['classification'] = {
                "total_ccs_reads": len(self.data),
                "five_prime_reads": int(self.data.fiveseen.sum()),
                "three_prime_reads": int(self.data.threeseen.sum()),
                "chimera": int(self.data.chimera.sum()),
                "polyA_reads": int(self.data.polyAseen.sum()),
            }

        if self.lq_isoforms:
            logger.info("Reading LQ isoforms")
            results['lq_isoform'] = self.lq_sequence.stats()  # number of

        if self.hq_isoforms:
            logger.info("Reading HQ isoforms")
            results['hq_isoform'] = self.hq_sequence.stats(
            )  # number of polished HQ isoform

        if self.ccs:
            seq = [len(read.sequence) for read in self.ccs]
            results["CCS"] = {
                "mean_length": pylab.mean(seq),
                "number_ccs_bases": sum(seq),
                "number_ccs_reads": len(seq)
            }

        self.idents_v = []
        self.full_v = []
        self.non_full_v = []
        self.isoform_lengths = []
        for read in self.lq_sequence:
            ident, full, non_full, length = read['identifier'].decode().split(
                ";")
            self.idents_v.append(ident)
            self.full_v.append(int(full.split("=")[1]))
            self.non_full_v.append(int(non_full.split("=")[1]))
            self.isoform_lengths.append(int(length.split("=")[1]))

        return results
コード例 #9
0
ファイル: isoseq.py プロジェクト: sequana/sequana
    def stats(self):
        results = {}
        if self.data is not None:
            logger.info("Reading strand")
            results['strand'] = {
                "+": sum(self.data.strand == "+"),
                "-": sum(self.data.strand == "-"),
                "?": sum(self.data.strand.isnull())
            }

            results['classification'] = {
                "total_ccs_reads" : len(self.data),
                "five_prime_reads" : int(self.data.fiveseen.sum()),
                "three_prime_reads" : int(self.data.threeseen.sum()),
                "chimera" : int(self.data.chimera.sum()),
                "polyA_reads" : int(self.data.polyAseen.sum()),
            }

        if self.lq_isoforms:
            logger.info("Reading LQ isoforms")
            results['lq_isoform'] = self.lq_sequence.stats() # number of 

        if self.hq_isoforms:
            logger.info("Reading HQ isoforms")
            results['hq_isoform'] = self.hq_sequence.stats() # number of polished HQ isoform

        if self.ccs:
            seq = [ len(read.sequence) for read in self.ccs]
            results["CCS"] = {
                "mean_length" : pylab.mean(seq),
                "number_ccs_bases" : sum(seq),
                "number_ccs_reads" : len(seq)
            }

        self.idents_v = []
        self.full_v = []
        self.non_full_v = []
        self.isoform_lengths = []
        for read in self.lq_sequence:
            ident, full, non_full, length = read['identifier'].decode().split(";")
            self.idents_v.append(ident)
            self.full_v.append(int(full.split("=")[1]))
            self.non_full_v.append(int(non_full.split("=")[1]))
            self.isoform_lengths.append(int(length.split("=")[1]))

        return results
コード例 #10
0
ファイル: pacbio.py プロジェクト: wenliangz/sequana
    def random_selection(self,
                         output_filename,
                         nreads=None,
                         expected_coverage=None,
                         reference_length=None,
                         read_lengths=None):
        """Select random reads

        :param nreads: number of reads to select randomly. Must be less than
            number of available reads in the orignal file.
        :param expected_coverage:
        :param reference_length:

        if expected_coverage and reference_length provided, nreads is replaced
        automatically.

        .. note:: to speed up computation (if you need to call random_selection
            many times), you can provide the mean read length manually
        """
        assert output_filename != self.filename, \
            "output filename should be different from the input filename"

        if read_lengths is None:
            self.reset()
            read_lengths = [
                read.query_length for i, read in enumerate(self.data)
            ]

        N = len(read_lengths)

        if expected_coverage and reference_length:
            mu = pylab.mean(read_lengths)
            nreads = int(expected_coverage * reference_length / mu)

        assert nreads < N, "nreads parameter larger than actual Number of reads"
        selector = random.sample(range(N), nreads)
        logger.info("Creating a pacbio BAM file with {} reads".format(nreads))

        with pysam.AlignmentFile(output_filename, "wb",
                                 template=self.data) as fh:
            self.reset()
            for i, read in enumerate(self.data):
                if i in selector:
                    fh.write(read)
コード例 #11
0
ファイル: pacbio.py プロジェクト: sequana/sequana
    def random_selection(self, output_filename, nreads=None,
            expected_coverage=None, reference_length=None, read_lengths=None):
        """Select random reads

        :param nreads: number of reads to select randomly. Must be less than
            number of available reads in the orignal file.
        :param expected_coverage:
        :param reference_length:

        if expected_coverage and reference_length provided, nreads is replaced
        automatically.

        .. note:: to speed up computation (if you need to call random_selection
            many times), you can provide the mean read length manually
        """
        assert output_filename != self.filename, \
            "output filename should be different from the input filename"

        if read_lengths is None:
            self.reset()
            read_lengths = [read.query_length for i, read in enumerate(self.data)]

        N = len(read_lengths)

        if expected_coverage and reference_length:
            mu = pylab.mean(read_lengths)
            nreads = int(expected_coverage * reference_length / mu)

        assert nreads < N, "nreads parameter larger than actual Number of reads"
        selector = random.sample(range(N), nreads)
        logger.info("Creating a pacbio BAM file with {} reads".format(nreads))

        with pysam.AlignmentFile(output_filename,"wb", template=self.data) as fh:
            self.reset()
            for i, read in enumerate(self.data):
                if i in selector:
                    fh.write(read)
コード例 #12
0
ファイル: isoseq.py プロジェクト: sequana/sequana
 def stats(self):
     return {"mean_read_length": pylab.mean(self.lengths), 
             "ZMW": len(self.passes),
             "N": len(self.lengths),
             "mean_ZMW_passes": pylab.mean(self.passes)}
コード例 #13
0
ファイル: fastq.py プロジェクト: wenliangz/sequana
 def stats(self):
     self.rewind()
     data = [len(read['sequence']) for read in self]
     return {"mean_read_length": pylab.mean(data), "N": len(data)}