def process_cram(cram, **kwargs):
    """Combines metrics from a cram file after extraction.

    Processing function: calls a pool of worker functions to extract
    the following metrics from a cram file:
    -lengths
    -aligned lengths
    -qualities
    -aligned qualities
    -mapping qualities
    -edit distances to the reference genome scaled by read length
    Returned in a pandas DataFrame
    """
    logging.info("Nanoget: Starting to collect statistics from cram file {}.".format(cram))
    samfile = check_bam(cram, samtype="cram")
    chromosomes = samfile.references
    with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
        # Pass the arguments as parallel iterables so each worker receives
        # extract_from_bam(cram, chromosome, keep_supp) as three positional
        # arguments, consistent with the calls in process_bam below.
        # (Mapping over a single zip of tuples would hand each worker one
        # tuple argument instead.)
        datadf = pd.DataFrame(
            data=[res for sublist in executor.map(extract_from_bam,
                                                  repeat(cram),
                                                  chromosomes,
                                                  repeat(kwargs["keep_supp"]))
                  for res in sublist],
            columns=["readIDs", "quals", "aligned_quals", "lengths",
                     "aligned_lengths", "mapQ", "percentIdentity"]) \
            .dropna(axis='columns', how='all') \
            .dropna(axis='index', how='any')
    logging.info("Nanoget: cram {} contains {} primary alignments.".format(
        cram, datadf["lengths"].size))
    return ut.reduce_memory_usage(datadf)

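# Usage sketch (hypothetical path and values, not part of the module): the
# cram path expects "threads" and "keep_supp" keyword arguments, e.g.
#
#   df = process_cram("alignments.cram", threads=8, keep_supp=False)
#   df["percentIdentity"].median()
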
def process_fastq_rich(fastq, **kwargs):
    """Extract metrics from a rich fastq file.

    Extract information from fastq files generated by albacore or MinKNOW,
    which contain richer information in the header (key-value pairs):
    read=<int> [72]
    ch=<int> [159]
    start_time=<timestamp> [2016-07-15T14:23:22Z]  # UTC ISO 8601 / RFC 3339 timestamp
    Z indicates UTC time, T is the delimiter between the date and time expressions

    dateutil.parser.parse("2016-07-15T14:23:22Z")  # imported as dparse
    -> datetime.datetime(2016, 7, 15, 14, 23, 22, tzinfo=tzutc())
    """
    logging.info("Nanoget: Starting to collect statistics from rich fastq file.")
    inputfastq = handle_compressed_input(fastq)
    res = []
    for record in SeqIO.parse(inputfastq, "fastq"):
        try:
            read_info = info_to_dict(record.description)
            res.append(
                (ut.ave_qual(record.letter_annotations["phred_quality"]),
                 len(record),
                 read_info["ch"],
                 read_info["start_time"],
                 read_info["runid"]))
        except KeyError:
            logging.error("Nanoget: KeyError when processing record {}".format(
                record.description))
            sys.exit("Unexpected fastq identifier:\n{}\n\n"
                     "missing one or more of the expected fields "
                     "'ch', 'start_time' or 'runid'".format(record.description))
    df = pd.DataFrame(
        data=res,
        columns=["quals", "lengths", "channelIDs", "timestamp", "runIDs"]).dropna()
    df["channelIDs"] = df["channelIDs"].astype("int64")
    return ut.reduce_memory_usage(df)

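# Illustrative only: the header parsing that info_to_dict (defined elsewhere
# in nanoget) is assumed to perform boils down to splitting space-separated
# key=value pairs, e.g.
#
#   desc = "read=72 ch=159 start_time=2016-07-15T14:23:22Z runid=abc123"
#   dict(f.split("=", 1) for f in desc.split(" ") if "=" in f)
#   # -> {'read': '72', 'ch': '159',
#   #     'start_time': '2016-07-15T14:23:22Z', 'runid': 'abc123'}
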
def process_fasta(fasta, **kwargs):
    """Combine metrics extracted from a fasta file."""
    logging.info("Nanoget: Starting to collect statistics from a fasta file.")
    inputfasta = handle_compressed_input(fasta, file_type="fasta")
    return ut.reduce_memory_usage(
        pd.DataFrame(
            data=[len(rec) for rec in SeqIO.parse(inputfasta, "fasta")],
            columns=["lengths"]).dropna())

def process_fastq_plain(fastq, **kwargs):
    """Combine metrics extracted from a fastq file."""
    logging.info("Nanoget: Starting to collect statistics from plain fastq file.")
    inputfastq = handle_compressed_input(fastq)
    return ut.reduce_memory_usage(
        pd.DataFrame(
            data=[res for res in extract_from_fastq(inputfastq) if res],
            columns=["quals", "lengths"]).dropna())

def process_fastq_minimal(fastq, **kwargs):
    """Swiftly extract minimal features (length and timestamp) from a rich fastq file."""
    infastq = handle_compressed_input(fastq)
    try:
        df = pd.DataFrame(
            data=[rec for rec in fq_minimal(infastq) if rec],
            columns=["timestamp", "lengths"])
    except IndexError:
        logging.error("Fatal: Incorrect file structure for fastq_minimal")
        sys.exit("Error: file does not match expected structure for fastq_minimal")
    return ut.reduce_memory_usage(df)

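# Illustrative sketch, not used by the module: fq_minimal (defined elsewhere
# in nanoget) is assumed to boil down to something like the hypothetical
# helper below, reading records four lines at a time and keeping only the
# start_time field and the sequence length. A missing start_time field
# raises the IndexError handled above.
def _fq_minimal_sketch(fq):
    """Yield (start_time, read length) tuples from a rich fastq handle."""
    from itertools import islice
    while True:
        rec = list(islice(fq, 4))
        if not rec:
            return
        header, seq = rec[0], rec[1]
        # IndexError here if no start_time=<timestamp> field is present
        time = [f for f in header.split(" ")
                if f.startswith("start_time=")][0].split("=", 1)[1]
        yield time, len(seq.rstrip())
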
def process_bam(bam, **kwargs):
    """Combines metrics from a bam file after extraction.

    Processing function: calls a pool of worker functions to extract
    the following metrics from a bam file:
    -lengths
    -aligned lengths
    -qualities
    -aligned qualities
    -mapping qualities
    -edit distances to the reference genome scaled by read length
    Returned in a pandas DataFrame
    """
    logging.info("Nanoget: Starting to collect statistics from bam file {}.".format(bam))
    samfile = check_bam(bam)
    chromosomes = samfile.references
    if len(chromosomes) > 100 or kwargs["huge"]:
        logging.info("Nanoget: lots of contigs (>100) or --huge, not running in separate processes")
        datadf = pd.DataFrame(
            data=extract_from_bam(bam, None, kwargs["keep_supp"]),
            columns=["readIDs", "quals", "aligned_quals", "lengths",
                     "aligned_lengths", "mapQ", "percentIdentity"]) \
            .dropna(axis='columns', how='all') \
            .dropna(axis='index', how='any')
    else:
        with cfutures.ProcessPoolExecutor(max_workers=kwargs["threads"]) as executor:
            datadf = pd.DataFrame(
                data=[res for sublist in executor.map(extract_from_bam,
                                                      repeat(bam),
                                                      chromosomes,
                                                      repeat(kwargs["keep_supp"]))
                      for res in sublist],
                columns=["readIDs", "quals", "aligned_quals", "lengths",
                         "aligned_lengths", "mapQ", "percentIdentity"]) \
                .dropna(axis='columns', how='all') \
                .dropna(axis='index', how='any')
    logging.info(f"Nanoget: bam {bam} contains {datadf['lengths'].size} primary alignments.")
    return ut.reduce_memory_usage(datadf)

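# Usage sketch (hypothetical path and values, not part of the module):
# "huge" forces single-process extraction and "keep_supp" retains
# supplementary alignments, e.g.
#
#   df = process_bam("alignments.bam", threads=8, keep_supp=True, huge=False)
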
def process_ubam(bam, **kwargs):
    """Extract metrics from an unaligned bam file.

    Extracts read identifiers, average qualities and lengths.
    """
    logging.info("Nanoget: Starting to collect statistics from ubam file {}.".format(bam))
    samfile = pysam.AlignmentFile(bam, "rb", check_sq=False)
    if not samfile.has_index():
        pysam.index(bam)
        # Reload the samfile after creating the index
        samfile = pysam.AlignmentFile(bam, "rb", check_sq=False)
        logging.info("Nanoget: No index for bam file could be found, created index.")
    datadf = pd.DataFrame(
        data=[(read.query_name, ut.ave_qual(read.query_qualities), read.query_length)
              for read in samfile.fetch(until_eof=True)],
        columns=["readIDs", "quals", "lengths"]) \
        .dropna(axis='columns', how='all') \
        .dropna(axis='index', how='any')
    logging.info("Nanoget: ubam {} contains {} reads.".format(bam, datadf["lengths"].size))
    return ut.reduce_memory_usage(datadf)

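# For reference: ut.ave_qual averages Phred scores in probability space
# rather than arithmetically, since Phred scores are logarithmic. A minimal
# sketch of that idea (the actual helper lives in nanoget's utils module and
# may differ in detail):
def _ave_qual_sketch(quals):
    """Convert Phred scores to error probabilities, average, convert back."""
    from math import log10
    if not quals:
        return None
    prob = sum(10 ** (q / -10) for q in quals) / len(quals)
    return -10 * log10(prob)
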
def process_summary(summaryfile, **kwargs):
    """Extract information from an albacore summary file.

    Only reads with a length >0 are returned.

    The fields below may or may not exist, depending on the type of
    sequencing performed.
    Fields 1-14 are for 1D sequencing.
    Fields 1-23 are for 2D sequencing.
    Fields 24-27, 2-5 and 22-23 are for 1D^2 (1D2) sequencing.
    Fields 28-38 are for barcoded workflows.
     1  filename
     2  read_id
     3  run_id
     4  channel
     5  start_time
     6  duration
     7  num_events
     8  template_start
     9  num_events_template
    10  template_duration
    11  num_called_template
    12  sequence_length_template
    13  mean_qscore_template
    14  strand_score_template
    15  complement_start
    16  num_events_complement
    17  complement_duration
    18  num_called_complement
    19  sequence_length_complement
    20  mean_qscore_complement
    21  strand_score_complement
    22  sequence_length_2d
    23  mean_qscore_2d
    24  filename1
    25  filename2
    26  read_id1
    27  read_id2
    28  barcode_arrangement
    29  barcode_score
    30  barcode_full_arrangement
    31  front_score
    32  rear_score
    33  front_begin_index
    34  front_foundseq_length
    35  rear_end_index
    36  rear_foundseq_length
    37  kit
    38  variant
    """
    logging.info("Nanoget: Collecting metrics from summary file {} for {} sequencing".format(
        summaryfile, kwargs["readtype"]))
    ut.check_existance(summaryfile)
    if kwargs["readtype"] == "1D":
        cols = ["channel", "start_time", "duration",
                "sequence_length_template", "mean_qscore_template"]
    elif kwargs["readtype"] in ["2D", "1D2"]:
        cols = ["channel", "start_time", "duration",
                "sequence_length_2d", "mean_qscore_2d"]
    if kwargs["barcoded"]:
        cols.append("barcode_arrangement")
        logging.info("Nanoget: Extracting metrics per barcode.")
    try:
        datadf = pd.read_csv(
            filepath_or_buffer=summaryfile,
            sep="\t",
            usecols=cols)
    except ValueError:
        logging.error("Nanoget: did not find expected columns in summary file {}:\n {}".format(
            summaryfile, ', '.join(cols)))
        sys.exit("ERROR: expected columns in summary file {} not found:\n {}".format(
            summaryfile, ', '.join(cols)))
    if kwargs["barcoded"]:
        datadf.columns = ["channelIDs", "time", "duration", "lengths", "quals", "barcode"]
    else:
        datadf.columns = ["channelIDs", "time", "duration", "lengths", "quals"]
    logging.info("Nanoget: Finished collecting statistics from summary file {}".format(
        summaryfile))
    return ut.reduce_memory_usage(datadf.loc[datadf["lengths"] != 0].copy())

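# Usage sketch (hypothetical filename and values, not part of the module):
# collect 1D metrics per barcode from a sequencing summary, then summarise
# read lengths per barcode, e.g.
#
#   df = process_summary("sequencing_summary.txt", readtype="1D", barcoded=True)
#   df.groupby("barcode")["lengths"].describe()
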