def hist_average_quality(self, fontsize=16, bins=None):
    """Plot a stacked histogram of the average quality value per isoform.

    For every record of ``self.hq_sequence`` and ``self.lq_sequence`` the
    quality string is decoded, each character is converted to a quality
    value (character code minus 33) and the values are averaged. The HQ
    counts are drawn first and the LQ counts are stacked on top. A twin
    axis shows, as a black line, the total number of isoforms minus the
    cumulative count at each bin edge.

    :param fontsize: font size used for the legend.
    :param bins: bins forwarded to :func:`numpy.histogram`; defaults to
        ``range(0, 94)`` (quality values from 0 to 94).
    """
    def average_qualities(records):
        # one mean quality value per record
        return [
            pylab.mean([ord(char) - 33
                        for char in record['quality'].decode()])
            for record in records
        ]

    hq_means = average_qualities(self.hq_sequence)
    lq_means = average_qualities(self.lq_sequence)

    if bins is None:
        bins = range(0, 94)

    # both histograms share the same bins, hence the same edges
    hq_counts, edges = np.histogram(hq_means, bins=bins)
    lq_counts, edges = np.histogram(lq_means, bins=bins)

    right_edges = edges[1:]
    pylab.bar(right_edges, hq_counts, width=1, label="HQ")
    pylab.bar(right_edges, lq_counts, bottom=hq_counts, width=1, label="LQ")
    pylab.xlim([0.5, 93.5])

    pylab.xlabel("Isoform average QV")
    pylab.ylabel("# Isoform")
    pylab.legend(fontsize=fontsize)

    # remaining number of isoforms at each bin edge, on a second y-axis
    ax = pylab.twinx()
    combined = hq_counts + lq_counts
    N = np.sum(combined)
    remaining = [N] + list(N - np.cumsum(combined))
    ax.plot(edges, remaining, "k")
def stats(self):
    """Return basic statistics computed from ``self.lengths`` and ``self.passes``.

    :return: dictionary with the keys ``mean_read_length`` (mean of
        ``self.lengths``), ``ZMW`` (number of items in ``self.passes``),
        ``N`` (number of items in ``self.lengths``) and
        ``mean_ZMW_passes`` (mean of ``self.passes``).
    """
    mean_length = pylab.mean(self.lengths)
    zmw_count = len(self.passes)
    read_count = len(self.lengths)
    mean_passes = pylab.mean(self.passes)

    return {
        "mean_read_length": mean_length,
        "ZMW": zmw_count,
        "N": read_count,
        "mean_ZMW_passes": mean_passes,
    }
def plot_gc_content(self, fontsize=16, ec="k", bins=100):
    """Plot the GC content histogram together with a fitted normal curve.

    The histogram is normalised (``density=True``) and a normal
    distribution using the mean and standard deviation of the data is
    overlaid as a dashed red line.

    :param fontsize: font size of the x-axis label.
    :param ec: edge colour of the bars (default adds a black contour).
    :param bins: either a number, in which case ``bins`` evenly spaced
        edges between 0 and 100 are used, or an array of bin edges (with
        a ``copy()`` method).

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam'))
        b.plot_gc_content()

    """
    data = self.get_gc_content()

    try:
        edges = np.linspace(0, 100, bins)
    except (TypeError, ValueError):
        # bins is not a scalar count: use it as an explicit array of edges
        edges = bins.copy()

    # 'normed' was removed from matplotlib; 'density' is its replacement
    pylab.hist(data, edges, density=True, ec=ec)
    pylab.grid(True)

    mu = pylab.mean(data)
    sigma = pylab.std(data)

    xs = pylab.linspace(edges.min(), edges.max(), 100)
    # pylab.normpdf no longer exists in matplotlib: evaluate the normal
    # probability density function directly
    pdf = (pylab.exp(-0.5 * ((xs - mu) / sigma) ** 2)
           / (sigma * pylab.sqrt(2 * pylab.pi)))
    pylab.plot(xs, pdf, lw=2, color="r", ls="--")
    pylab.xlabel("GC content", fontsize=fontsize)
def plot_gc_content(self, fontsize=16, ec="k", bins=100):
    """Plot the GC content histogram together with a fitted normal curve.

    The histogram is normalised (``density=True``) and a normal
    distribution using the mean and standard deviation of the data is
    overlaid as a dashed red line.

    :param fontsize: font size of the x-axis label.
    :param ec: edge colour of the bars (default adds a black contour).
    :param bins: either a number, in which case ``bins`` evenly spaced
        edges between 0 and 100 are used, or an array of bin edges (with
        a ``copy()`` method).

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam'))
        b.plot_gc_content()

    """
    from sequana.misc import normpdf

    data = self.get_gc_content()

    try:
        edges = np.linspace(0, 100, bins)
    except (TypeError, ValueError):
        # bins is not a scalar count: use it as an explicit array of edges
        edges = bins.copy()

    pylab.hist(data, edges, density=True, ec=ec)
    pylab.grid(True)

    mu = pylab.mean(data)
    sigma = pylab.std(data)

    # normal curve evaluated over the range covered by the histogram
    xs = pylab.linspace(edges.min(), edges.max(), 100)
    pylab.plot(xs, normpdf(xs, mu, sigma), lw=2, color="r", ls="--")
    pylab.xlabel("GC content", fontsize=fontsize)
def get_metrics_count(self):
    """Count flags, mapping qualities and read lengths in one pass.

    Iterates once over the reads and, using ``self._count_item``, tallies
    the MAPQ value and the flag of every read, and the reference length
    of every read that is not unmapped. The mean of the query qualities
    of each read is collected as well; reads without stored qualities
    (``query_qualities`` is ``None``) are skipped for that metric.

    The result is stored in ``self.metrics_count`` and returned.

    :return: dictionary with the keys ``mapq``, ``read_length``, ``flags``
        (count dictionaries) and ``mean_quality`` (mean of the per-read
        mean qualities; NaN if no read has qualities).
    """
    mapq_dict = {}
    read_length_dict = {}
    flag_dict = {}
    mean_qualities = []

    for read in self:
        self._count_item(mapq_dict, read.mapq)
        self._count_item(flag_dict, read.flag)
        if read.is_unmapped is False:
            self._count_item(read_length_dict, read.reference_length)

        # qualities may be missing; averaging None would raise a TypeError
        qualities = read.query_qualities
        if qualities is not None:
            mean_qualities.append(pylab.mean(qualities))

    self.metrics_count = {
        "mapq": mapq_dict,
        "read_length": read_length_dict,
        "flags": flag_dict,
        "mean_quality": pylab.mean(mean_qualities),
    }
    return self.metrics_count
def _get_summary(self):
    """Count flags, mapping qualities and read lengths in one pass.

    The summary is computed only once: it is cached in ``self._summary``
    and returned directly on subsequent calls.

    Using ``self._count_item``, the MAPQ value and the flag of every read
    are tallied, as well as the reference length of every read that is
    not unmapped. The mean of the query qualities of each read is
    collected too; reads without stored qualities (``query_qualities`` is
    ``None``) are skipped for that metric, since keeping them would make
    the final average fail.

    :return: dictionary with the keys ``mapq``, ``read_length``, ``flags``
        (count dictionaries) and ``mean_quality`` (mean of the per-read
        mean qualities; NaN if no read has qualities).
    """
    if self._summary is not None:
        return self._summary

    mapq_dict = {}
    read_length_dict = {}
    flag_dict = {}
    mean_qualities = []

    for read in self:
        self._count_item(mapq_dict, read.mapq)
        self._count_item(flag_dict, read.flag)
        if read.is_unmapped is False:
            self._count_item(read_length_dict, read.reference_length)

        # explicit check instead of a bare except that stored None values
        qualities = read.query_qualities
        if qualities is not None:
            mean_qualities.append(pylab.mean(qualities))

    self._summary = {
        "mapq": mapq_dict,
        "read_length": read_length_dict,
        "flags": flag_dict,
        "mean_quality": pylab.mean(mean_qualities),
    }
    return self._summary
def stats(self):
    """Gather summary statistics for the inputs that are available.

    A section is added to the returned dictionary for each input that is
    set: ``strand`` and ``classification`` (when ``self.data`` is not
    None), ``lq_isoform`` (when ``self.lq_isoforms``), ``hq_isoform``
    (when ``self.hq_isoforms``) and ``CCS`` (when ``self.ccs``).

    As a side effect, the identifier of every record in
    ``self.lq_sequence`` is split on ``;`` into four fields that populate
    the attributes ``idents_v``, ``full_v``, ``non_full_v`` and
    ``isoform_lengths``.

    :return: dictionary of statistics.
    """
    def int_after_equal(field):
        # the field is expected to look like "key=value" with an integer value
        return int(field.split("=")[1])

    results = {}

    if self.data is not None:
        logger.info("Reading strand")
        strand_counts = {}
        strand_counts["+"] = sum(self.data.strand == "+")
        strand_counts["-"] = sum(self.data.strand == "-")
        strand_counts["?"] = sum(self.data.strand.isnull())
        results['strand'] = strand_counts

        classification = {}
        classification["total_ccs_reads"] = len(self.data)
        classification["five_prime_reads"] = int(self.data.fiveseen.sum())
        classification["three_prime_reads"] = int(self.data.threeseen.sum())
        classification["chimera"] = int(self.data.chimera.sum())
        classification["polyA_reads"] = int(self.data.polyAseen.sum())
        results['classification'] = classification

    if self.lq_isoforms:
        logger.info("Reading LQ isoforms")
        results['lq_isoform'] = self.lq_sequence.stats()

    if self.hq_isoforms:
        logger.info("Reading HQ isoforms")
        results['hq_isoform'] = self.hq_sequence.stats()

    if self.ccs:
        ccs_lengths = [len(read.sequence) for read in self.ccs]
        results["CCS"] = {
            "mean_length": pylab.mean(ccs_lengths),
            "number_ccs_bases": sum(ccs_lengths),
            "number_ccs_reads": len(ccs_lengths),
        }

    # per-isoform information extracted from the LQ identifiers
    self.idents_v = []
    self.full_v = []
    self.non_full_v = []
    self.isoform_lengths = []
    for record in self.lq_sequence:
        fields = record['identifier'].decode().split(";")
        ident, full, non_full, length = fields
        self.idents_v.append(ident)
        self.full_v.append(int_after_equal(full))
        self.non_full_v.append(int_after_equal(non_full))
        self.isoform_lengths.append(int_after_equal(length))

    return results
def stats(self):
    """Return a dictionary of statistics about the loaded inputs.

    Depending on what is set on the instance, the result holds the
    sections ``strand`` and ``classification`` (``self.data`` is not
    None), ``lq_isoform`` (``self.lq_isoforms``), ``hq_isoform``
    (``self.hq_isoforms``) and ``CCS`` (``self.ccs``).

    The method also resets and fills the attributes ``idents_v``,
    ``full_v``, ``non_full_v`` and ``isoform_lengths`` from the
    ``;``-separated identifiers found in ``self.lq_sequence``.

    :return: dictionary of statistics.
    """
    results = {}

    if self.data is not None:
        logger.info("Reading strand")
        results['strand'] = dict([
            ("+", sum(self.data.strand == "+")),
            ("-", sum(self.data.strand == "-")),
            ("?", sum(self.data.strand.isnull())),
        ])
        results['classification'] = dict([
            ("total_ccs_reads", len(self.data)),
            ("five_prime_reads", int(self.data.fiveseen.sum())),
            ("three_prime_reads", int(self.data.threeseen.sum())),
            ("chimera", int(self.data.chimera.sum())),
            ("polyA_reads", int(self.data.polyAseen.sum())),
        ])

    if self.lq_isoforms:
        logger.info("Reading LQ isoforms")
        lq_stats = self.lq_sequence.stats()
        results['lq_isoform'] = lq_stats

    if self.hq_isoforms:
        logger.info("Reading HQ isoforms")
        hq_stats = self.hq_sequence.stats()
        results['hq_isoform'] = hq_stats

    if self.ccs:
        seq = []
        for read in self.ccs:
            seq.append(len(read.sequence))
        results["CCS"] = dict([
            ("mean_length", pylab.mean(seq)),
            ("number_ccs_bases", sum(seq)),
            ("number_ccs_reads", len(seq)),
        ])

    self.idents_v = []
    self.full_v = []
    self.non_full_v = []
    self.isoform_lengths = []

    for read in self.lq_sequence:
        # identifier made of four ';'-separated fields; the last three
        # are "key=value" pairs holding an integer
        identifier = read['identifier'].decode()
        ident, full, non_full, length = identifier.split(";")
        self.idents_v.append(ident)
        for target, field in ((self.full_v, full),
                              (self.non_full_v, non_full),
                              (self.isoform_lengths, length)):
            target.append(int(field.split("=")[1]))

    return results
def random_selection(self, output_filename, nreads=None,
                     expected_coverage=None, reference_length=None,
                     read_lengths=None):
    """Select random reads and save them in a new BAM file.

    :param output_filename: name of the BAM file to create. Must differ
        from the input filename.
    :param nreads: number of reads to select randomly. Must be less than
        number of available reads in the original file. Required unless
        both *expected_coverage* and *reference_length* are provided.
    :param expected_coverage:
    :param reference_length: if expected_coverage and reference_length
        provided, nreads is replaced automatically by
        ``int(expected_coverage * reference_length / mean read length)``.
    :param read_lengths: list of the read lengths. If not provided, the
        file is scanned to compute it.

    .. note:: to speed up computation (if you need to call random_selection
        many times), you can provide the read lengths manually

    """
    assert output_filename != self.filename, \
        "output filename should be different from the input filename"

    if read_lengths is None:
        self.reset()
        read_lengths = [read.query_length for read in self.data]

    N = len(read_lengths)

    if expected_coverage and reference_length:
        mu = pylab.mean(read_lengths)
        nreads = int(expected_coverage * reference_length / mu)

    assert nreads < N, "nreads parameter larger than actual Number of reads"

    # a set gives constant-time membership tests in the loop below; with
    # a list each test is linear in nreads
    selector = set(random.sample(range(N), nreads))

    logger.info("Creating a pacbio BAM file with {} reads".format(nreads))

    with pysam.AlignmentFile(output_filename, "wb",
                             template=self.data) as fh:
        self.reset()
        for i, read in enumerate(self.data):
            if i in selector:
                fh.write(read)
def random_selection(self, output_filename, nreads=None,
                     expected_coverage=None, reference_length=None,
                     read_lengths=None):
    """Select random reads and save them in a new BAM file.

    :param output_filename: name of the BAM file to create. Must differ
        from the input filename.
    :param nreads: number of reads to select randomly. Must be less than
        number of available reads in the original file. Required unless
        both *expected_coverage* and *reference_length* are provided.
    :param expected_coverage:
    :param reference_length: if expected_coverage and reference_length
        provided, nreads is replaced automatically by
        ``int(expected_coverage * reference_length / mean read length)``.
    :param read_lengths: list of the read lengths. If not provided, the
        file is scanned to compute it.

    .. note:: to speed up computation (if you need to call random_selection
        many times), you can provide the read lengths manually

    """
    assert output_filename != self.filename, \
        "output filename should be different from the input filename"

    if read_lengths is None:
        self.reset()
        read_lengths = [read.query_length for read in self.data]

    N = len(read_lengths)

    if expected_coverage and reference_length:
        mu = pylab.mean(read_lengths)
        nreads = int(expected_coverage * reference_length / mu)

    assert nreads < N, "nreads parameter larger than actual Number of reads"

    # indices of the reads to keep; stored in a set so that the lookup
    # performed for every read of the file is O(1) instead of O(nreads)
    selector = set(random.sample(range(N), nreads))

    logger.info("Creating a pacbio BAM file with {} reads".format(nreads))

    with pysam.AlignmentFile(output_filename, "wb",
                             template=self.data) as fh:
        self.reset()
        for i, read in enumerate(self.data):
            if i in selector:
                fh.write(read)
def stats(self):
    """Summarise ``self.lengths`` and ``self.passes``.

    :return: dictionary holding the mean of ``self.lengths``
        (``mean_read_length``), the number of items in ``self.passes``
        (``ZMW``), the number of items in ``self.lengths`` (``N``) and
        the mean of ``self.passes`` (``mean_ZMW_passes``).
    """
    summary = {}
    summary["mean_read_length"] = pylab.mean(self.lengths)
    summary["ZMW"] = len(self.passes)
    summary["N"] = len(self.lengths)
    summary["mean_ZMW_passes"] = pylab.mean(self.passes)
    return summary
def stats(self):
    """Return the mean length and the number of sequences.

    The instance is rewound (``self.rewind()``) and then iterated once to
    collect the length of the ``sequence`` entry of every record.

    :return: dictionary with the keys ``mean_read_length`` and ``N``.
    """
    self.rewind()

    lengths = []
    for record in self:
        lengths.append(len(record['sequence']))

    summary = {}
    summary["mean_read_length"] = pylab.mean(lengths)
    summary["N"] = len(lengths)
    return summary