def extract_from_bam(bam, chromosome, keep_supplementary=True):
    """Extract per-alignment metrics from one reference of a bam file.

    Worker function meant to be called once per chromosome. Unmapped and
    secondary alignments are always skipped; supplementary alignments are
    skipped only when `keep_supplementary` is falsy.

    Args:
        bam: bam file to open with pysam in "rb" mode.
        chromosome: reference name handed to `fetch(reference=...)`.
        keep_supplementary: when truthy (default), supplementary alignments
            are included in the result.

    Returns:
        A list with one tuple per retained alignment, holding in order:
        - read name
        - average quality of the whole read
        - average quality of the aligned part of the read
        - read length
        - aligned length
        - mapping quality
        - the value of `get_pID(read)` (the earlier documentation describes
          this as the edit distance to the reference scaled by read length;
          NOTE(review): confirm against `get_pID`)
    """
    # The context manager closes the bam handle once the list is fully built;
    # the tuples only hold plain values, so nothing refers to the file after.
    with pysam.AlignmentFile(bam, "rb") as samfile:
        return [
            (read.query_name,
             nanomath.ave_qual(read.query_qualities),
             nanomath.ave_qual(read.query_alignment_qualities),
             read.query_length,
             read.query_alignment_length,
             read.mapping_quality,
             get_pID(read))
            for read in samfile.fetch(reference=chromosome,
                                      multiple_iterators=True)
            if not read.is_secondary
            and not read.is_unmapped
            # Single filter replaces the two duplicated comprehensions.
            and (keep_supplementary or not read.is_supplementary)
        ]
def process_fastq_rich(fastq, **kwargs):
    """Extract metrics from a fastq file with a rich header.

    Intended for fastq files generated by albacore or MinKNOW, whose record
    description carries key-value pairs such as:

        read=<int> [72]
        ch=<int> [159]
        start_time=<timestamp> [2016-07-15T14:23:22Z]  # UTC, ISO 8601

    The description of every record is turned into a dict with
    `info_to_dict`, from which the 'ch', 'start_time' and 'runid' entries are
    read.

    Args:
        fastq: input passed to `handle_compressed_input` to obtain the
            handle that is parsed.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The result of `ut.reduce_memory_usage` applied to a DataFrame with
        the columns "quals", "lengths", "channelIDs" (cast to int64),
        "timestamp" and "runIDs"; rows containing missing values are dropped.

    Raises:
        SystemExit: when a record lacks one of the expected header fields.
    """
    logging.info(
        "Nanoget: Starting to collect statistics from rich fastq file.")
    inputfastq = handle_compressed_input(fastq)
    res = []
    for record in SeqIO.parse(inputfastq, "fastq"):
        try:
            read_info = info_to_dict(record.description)
            res.append(
                (nanomath.ave_qual(record.letter_annotations["phred_quality"]),
                 len(record),
                 read_info["ch"],
                 read_info["start_time"],
                 read_info["runid"]))
        except KeyError:
            logging.error("Nanoget: keyerror when processing record {}".format(
                record.description))
            # Build the message from a clean literal: a backslash line
            # continuation inside the string leaked source-code whitespace
            # into the text shown to the user.
            message = ("Unexpected fastq identifier:\n{}\n\n"
                       "missing one or more of expected fields "
                       "'ch', 'start_time' or 'runid'")
            sys.exit(message.format(record.description))
    df = pd.DataFrame(
        data=res,
        columns=["quals", "lengths", "channelIDs", "timestamp",
                 "runIDs"]).dropna()
    # Cast after dropna so no missing value reaches the integer conversion.
    df["channelIDs"] = df["channelIDs"].astype("int64")
    return ut.reduce_memory_usage(df)
def extract_all_from_fastq(rec):
    """Summarise a single fastq record.

    Args:
        rec: a record exposing `id`, `len()` and
            `letter_annotations["phred_quality"]`.

    Returns:
        A tuple of (identifier, read length, average quality,
        median quality).
    """
    phred = rec.letter_annotations["phred_quality"]
    mean_quality = nanomath.ave_qual(phred)
    median_quality = nanomath.median_qual(phred)
    return (rec.id, len(rec), mean_quality, median_quality)
def extract_from_fastq(fq):
    """Lazily yield per-read metrics from a fastq input.

    Args:
        fq: input handed to `SeqIO.parse(..., "fastq")`.

    Yields:
        One (average quality, read length) tuple per record.
    """
    records = SeqIO.parse(fq, "fastq")
    for record in records:
        phred = record.letter_annotations["phred_quality"]
        yield nanomath.ave_qual(phred), len(record)
def splitFq(fq, args):
    '''
    Split a fastq file into a <pass> and a <fail> gzipped fastq file.

    A record goes to the pass file when its average quality is at least
    args.quality, otherwise to the fail file. Both files are written to
    args.outdir and named after args.fastqfile.name with every occurrence of
    '.fastq', '.gz' and '.fq' removed. A summary line is printed at the end.

    NOTE(review): no trimming of nucleotides is performed in this function,
    although earlier documentation mentioned it.
    '''
    stem = os.path.basename(args.fastqfile.name)
    for suffix in ('.fastq', '.gz', '.fq'):
        stem = stem.replace(suffix, '')
    prefix = os.path.join(args.outdir, stem)
    pass_path = prefix + ".pass.fastq.gz"
    fail_path = prefix + ".fail.fastq.gz"

    n_pass = 0
    n_fail = 0
    with gzip.open(pass_path, 'wt') as passed, \
            gzip.open(fail_path, 'wt') as failed:
        for record in SeqIO.parse(fq, "fastq"):
            mean_quality = ave_qual(record.letter_annotations["phred_quality"])
            if mean_quality >= args.quality:
                n_pass += 1
                passed.write(record.format("fastq"))
            else:
                failed.write(record.format("fastq"))
                n_fail += 1

    summary = "Split the file in {} reads in <pass> and {} reads in <fail>"
    print(summary.format(n_pass, n_fail))
def process_ubam(bam, **kwargs):
    """Extract metrics from an unaligned bam file.

    An index is created with `pysam.index` when the file has none. For every
    read the name, the average quality and the length are collected.

    Args:
        bam: bam file to open with pysam ("rb", check_sq=False).
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The result of `ut.reduce_memory_usage` applied to a DataFrame built
        from the columns "readIDs", "quals" and "lengths", after dropping
        columns that are entirely missing and then rows with any missing
        value.
    """
    logging.info(
        "Nanoget: Starting to collect statistics from ubam file {}.".format(
            bam))
    # Probe for an index with a short-lived handle so it is closed before
    # the file is (re)opened for reading.
    with pysam.AlignmentFile(bam, "rb", check_sq=False) as probe:
        indexed = probe.has_index()
    if not indexed:
        pysam.index(bam)
        logging.info(
            "Nanoget: No index for bam file could be found, created index.")
    # Open after any indexing so the handle sees the new index; the context
    # manager closes it once all reads have been consumed.
    with pysam.AlignmentFile(bam, "rb", check_sq=False) as samfile:
        datadf = pd.DataFrame(
            data=[(read.query_name,
                   nanomath.ave_qual(read.query_qualities),
                   read.query_length)
                  for read in samfile.fetch(until_eof=True)],
            columns=["readIDs", "quals", "lengths"]) \
            .dropna(axis='columns', how='all') \
            .dropna(axis='index', how='any')
    # Count rows on the frame itself so the message does not depend on the
    # "lengths" column surviving the column-wise dropna above.
    logging.info("Nanoget: ubam {} contains {} reads.".format(
        bam, len(datadf)))
    return ut.reduce_memory_usage(datadf)
def test_ave_qual(self):
    """Check the rounded average quality of a repeated 0..128 score ramp."""
    # One pass over the scores 0 through 128, repeated 100 times.
    ramp = list(range(129))
    scores = ramp * 100
    rounded_average = nm.ave_qual(scores, qround=True)
    self.assertEqual(rounded_average, 14)