def _debug_dummy_plot(
        taxonomy: Taxonomy,
        htmlfile: Filename,
        scoring: Scoring = Scoring.SHEL,
):
    """Generate dummy Krona plot via Krona 2.0 XML spec and exit"""
    print(gray(f'Generating dummy Krona plot {htmlfile}...'), end='')
    sys.stdout.flush()
    samples: List[Sample] = [Sample('SINGLE'), ]
    krona: KronaTree = KronaTree(samples,
                                 min_score=Score(35),
                                 max_score=Score(100),
                                 scoring=scoring,
                                 )
    polytree: MultiTree = MultiTree(samples=samples)
    polytree.grow(ontology=taxonomy)
    polytree.toxml(ontology=taxonomy, krona=krona)
    krona.tohtml(htmlfile, pretty=True)
    print(green('OK!'))

def generate_krona():
    """Generate Krona plot with all the results via Krona 2.0 XML spec"""
    # NOTE: nested helper; samples, raw_samples, stats, scores, scoring,
    #  polytree, ncbi, counts, accs and htmlfile come from the enclosing scope
    print(gray('\nBuilding the taxonomy multiple tree... '), end='')
    sys.stdout.flush()
    krona: KronaTree = KronaTree(
        samples,
        num_raw_samples=len(raw_samples),
        stats=stats,
        min_score=Score(
            min([min(scores[sample].values())
                 for sample in samples if len(scores[sample])])),
        max_score=Score(
            max([max(scores[sample].values())
                 for sample in samples if len(scores[sample])])),
        scoring=scoring,
    )
    polytree.grow(ontology=ncbi, abundances=counts, accs=accs, scores=scores)
    print(green('OK!'))
    print(gray('Generating final plot (') + magenta(htmlfile) + gray(')... '),
          end='')
    sys.stdout.flush()
    polytree.toxml(ontology=ncbi, krona=krona)
    krona.tohtml(htmlfile, pretty=False)
    print(green('OK!'))

def swmean(cnt1: int, sco1: Score, cnt2: int, sco2: Score) -> Score:
    """Weighted mean of scores by counts"""
    if sco1 == NO_SCORE:
        return sco2
    elif sco2 == NO_SCORE:
        return sco1
    return Score((cnt1 * sco1 + cnt2 * sco2) / (cnt1 + cnt2))

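
# Illustrative example (editor's addition, not in the original source):
# swmean returns the count-weighted average of two mean scores, with
# NO_SCORE acting as a "no data" sentinel that yields the other operand
# unchanged. Assumes Score wraps float and NO_SCORE compares by equality.
def _example_swmean() -> None:
    """Doctest-style demo of swmean.

    >>> swmean(3, Score(40.0), 1, Score(80.0))  # (3*40 + 1*80) / (3 + 1)
    50.0
    >>> swmean(5, NO_SCORE, 2, Score(60.0))  # NO_SCORE yields the other score
    60.0
    """
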
def read_output(output_file: Filename,
                scoring: Scoring = Scoring.SHEL,
                minscore: Score = None,
                ) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read Centrifuge output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for output_line in file:
                try:
                    _, _, _tid, _score, _, _, _length, *_ = \
                        output_line.split('\t')
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                tid = Id(_tid)
                try:
                    # From Centrifuge score get "single hit equivalent length"
                    shel = Score(float(_score) ** 0.5 + 15)
                    length = int(_length)
                except ValueError:
                    print(yellow('Failure'), f'parsing score ({_score}) for',
                          f'query length {_length} for taxid {_tid}',
                          f'in {output_file}. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                if tid == UNCLASSIFIED:  # Just count unclassified reads
                    num_uncl += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and shel < minscore:
                    continue  # Ignore read if low confidence
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter({tid: len(all_scores[tid])
                                       for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, scores=all_scores,
        lens=all_length, seq_read=num_read, seq_unclas=num_uncl,
        seq_filt=filt_seqs, tid_clas=len(taxids)
    )
    # Output statistics
    if num_errors:
        output.write(gray('  Seqs fail: ') + red(f'{num_errors:_d}\t')
                     + gray('(Last error in read ')
                     + red(f'{last_error_read}') + gray(')\n'))
    output.write(gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[')
                 + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(')
                 + f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(')
                 + f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(gray('  Scores: min = ') + f'{stat.sco.mini:.1f}'
                 + gray(', max = ') + f'{stat.sco.maxi:.1f}'
                 + gray(', avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray('  Length: min = ') + f'{stat.len.mini}'
                 + gray(', max = ') + f'{stat.len.maxi}'
                 + gray(', avr = ') + f'{stat.len.mean}\n')
    output.write(gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        print(red('ERROR!'), f'Centrifuge: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores

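
# Illustrative example (editor's addition, not part of the original module):
# the SHEL ("single hit equivalent length") transform above maps Centrifuge's
# roughly length-squared hit scores back to a length-like scale, so that
# scores become comparable with those derived from other classifiers.
def _example_shel(cf_score: float) -> float:
    """Doctest-style demo of the SHEL mapping used in read_output.

    >>> _example_shel(625.0)  # sqrt(625) + 15 = 40 equivalent matched bases
    40.0
    >>> _example_shel(0.0)  # the 15-base offset is the floor of the scale
    15.0
    """
    return cf_score ** 0.5 + 15
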
def read_clark_output(output_file: Filename,
                      scoring: Scoring = Scoring.CLARK_C,
                      minscore: Score = None,
                      ) -> Tuple[str, SampleStats, Counter[Id],
                                 Dict[Id, Score]]:
    """
    Read CLARK(-l)(-S) full mode output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_confs: Dict[Id, List[Score]] = {}
    all_gammas: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split(',')
            if len(header) != 8:
                print(red('\nERROR! ') + 'CLARK output format of ',
                      yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'ID,Length,Gamma,1st,score1,2nd,score2,conf')
                print(magenta('Found:'), ','.join(header), end='')
                print(blue('HINT:'), 'Use CLARK, CLARK-l, or CLARK-S '
                      'with full mode (', blue('-m 0'), ')')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_label, _length, _gamma, _tid1, _score1,
                     _tid2, _score2, _conf) = output_line.split(',')
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = int(_length)
                    gamma: Score = Score(float(_gamma))
                    tid1: Id = Id(_tid1)
                    score1: Score = Score(float(_score1))
                    tid2: Id = Id(_tid2)
                    score2: Score = Score(float(_score2))
                    conf: Score = Score(float(_conf))
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                # Select tid and score between CLARK assignments 1 and 2
                tid: Id = tid1
                score: Score = score1
                if tid1 == UNCLASSIFIED:
                    if tid2 == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    else:  # Majority of read unclassified
                        tid = tid2
                        score = score2
                        conf = Score(1 - conf)  # Get CLARK's h2/(h1+h2)
                # From CLARK_C(S) score get "single hit equivalent length"
                shel: Score = Score(score + K_MER_SIZE)
                taxids.add(tid)  # Save all the selected tids (tid1 or tid2)
                if minscore is not None:  # Decide if ignore read if low score
                    if scoring is Scoring.CLARK_C:
                        if conf < minscore:
                            continue
                    elif scoring is Scoring.CLARK_G:
                        if gamma < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_confs[tid].append(conf)
                except KeyError:
                    all_confs[tid] = [conf, ]
                try:
                    all_gammas[tid].append(gamma)
                except KeyError:
                    all_gammas[tid] = [gamma, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter({tid: len(all_scores[tid])
                                       for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, lens=all_length,
        scores=all_scores, scores2=all_confs, scores3=all_gammas,
        seq_read=num_read, seq_unclas=num_uncl, seq_filt=filt_seqs,
        tid_clas=len(taxids)
    )
    # Output statistics
    if num_errors:
        output.write(gray('  Seqs fail: ') + red(f'{num_errors:_d}\t')
                     + gray('(Last error in read ')
                     + red(f'{last_error_read}') + gray(')\n'))
    output.write(gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[')
                 + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(')
                 + f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(')
                 + f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(gray('  Hit (score): min = ') + f'{stat.sco.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray('  Conf. score: min = ') + f'{stat.sco2.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco2.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco2.mean:.1f}\n')
    output.write(gray('  Gamma score: min = ') + f'{stat.sco3.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco3.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco3.mean:.1f}\n')
    output.write(gray('  Read length: min = ') + f'{stat.len.mini},'
                 + gray(' max = ') + f'{stat.len.maxi},'
                 + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.CLARK_C:
        out_scores = {tid: Score(mean(all_confs[tid]) * 100)
                      for tid in all_confs}
    elif scoring is Scoring.CLARK_G:
        out_scores = {tid: Score(mean(all_gammas[tid]))
                      for tid in all_gammas}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        print(red('ERROR!'), f'CLARK: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores

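
# Illustrative example (editor's addition): the selection rule above between
# CLARK's first and second assignments. When only the first hit is
# unclassified, the second is taken and the confidence is flipped, since
# conf = h1/(h1+h2) refers to the first hit and 1 - conf = h2/(h1+h2) is the
# corresponding figure for the second. 'NA' stands in for UNCLASSIFIED here.
def _example_clark_selection(tid1: str, score1: float, tid2: str,
                             score2: float, conf: float) -> tuple:
    """Doctest-style demo of the CLARK assignment-selection rule.

    >>> _example_clark_selection('NA', 0.0, '562', 30.0, 0.25)
    ('562', 30.0, 0.75)
    >>> _example_clark_selection('561', 40.0, '562', 30.0, 0.25)
    ('561', 40.0, 0.25)
    """
    if tid1 == 'NA' and tid2 != 'NA':
        return tid2, score2, 1 - conf
    return tid1, score1, conf
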
def read_lmat_output(output_file: Filename,
                     scoring: Scoring = Scoring.LMAT,
                     minscore: Score = None,
                     ) -> Tuple[str, SampleStats, Counter[Id],
                                Dict[Id, Score]]:
    """
    Read LMAT output (iterate over all the output files)

    Args:
        output_file: output file name (prefix)
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    nt_read: int = 0
    matchings: Counter[Match] = Counter()
    output_files: List[Filename] = []
    # Select files to process depending on whether the output files are
    #  explicitly given or a directory name is provided (in which case all
    #  the output files there are taken)
    if os.path.isdir(output_file):  # Just the directory name is provided
        dirname = os.path.normpath(output_file)
        for file in os.listdir(dirname):  # Add all LMAT output files in dir
            if ('_output' in file and file.endswith('.out')
                    and 'canVfin' not in file and 'pyLCA' not in file):
                output_files.append(Filename(file))
    else:  # Explicit path and file name prefix is given
        dirname, basename = os.path.split(output_file)
        for file in os.listdir(dirname):  # Add selected output files in dir
            if (file.startswith(basename) and file.endswith('.out')
                    and 'canVfin' not in file and 'pyLCA' not in file):
                output_files.append(Filename(file))
    if not output_files:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read from "{output_file}"')
    # Read LMAT output files
    for output_name in output_files:
        path: Filename = Filename(os.path.join(dirname, output_name))
        output.write(gray(f'Loading output file {path}... '))
        try:
            with open(path, 'r') as io_file:
                for seq in SeqIO.parse(io_file, "lmat"):
                    tid: Id = seq.annotations['final_taxid']
                    score: Score = seq.annotations['final_score']
                    match: Match = Match.lmat(seq.annotations['final_match'])
                    matchings[match] += 1
                    length: int = len(seq)
                    nt_read += length
                    if minscore is not None:
                        if score < minscore:  # Ignore read if low score
                            continue
                    if match in [Match.DIRECT, Match.MULTI]:
                        try:
                            all_scores[tid].append(score)
                        except KeyError:
                            all_scores[tid] = [score, ]
                        try:
                            all_length[tid].append(length)
                        except KeyError:
                            all_length[tid] = [length, ]
        except FileNotFoundError:
            raise Exception(red('\nERROR! ') + f'Cannot read "{path}"')
        output.write(green('OK!\n'))
    abundances: Counter[Id] = Counter({tid: len(all_scores[tid])
                                       for tid in all_scores})
    # Basic output statistics
    read_seqs: int = sum(matchings.values())
    if read_seqs == 0:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, scores=all_scores,
        lens=all_length, seq_read=read_seqs, seq_filt=filt_seqs,
        seq_clas=matchings[Match.DIRECT] + matchings[Match.MULTI]
    )
    output.write(gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[')
                 + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(')
                 + f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(')
                 + f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    multi_rel: float = matchings[Match.MULTI] / read_seqs
    direct_rel: float = matchings[Match.DIRECT] / read_seqs
    nodbhits_rel: float = matchings[Match.NODBHITS] / read_seqs
    tooshort_rel: float = matchings[Match.READTOOSHORT] / read_seqs
    lowscore_rel: float = matchings[Match.LOWSCORE] / read_seqs
    output.write(gray('  DB Matching: Multi =') + f' {multi_rel:.1%} '
                 + gray('Direct =') + f' {direct_rel:.1%} '
                 + gray('ReadTooShort =') + f' {tooshort_rel:.1%} '
                 + gray('LowScore =') + f' {lowscore_rel:.1%} '
                 + gray('NoDbHits =') + f' {nodbhits_rel:.1%}\n')
    output.write(gray('  Scores: min = ') + f'{stat.sco.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray('  Length: min = ') + f'{stat.len.mini},'
                 + gray(' max = ') + f'{stat.len.maxi},'
                 + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(f'  {stat.num_taxa}' + gray(' taxa with assigned reads\n'))
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.LMAT:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    else:
        print(red('ERROR!'), f'LMAT: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, abundances, out_scores

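
# Illustrative example (editor's addition): a standalone rendition of the
# filename filter used above when a directory of LMAT results is scanned.
def _example_is_lmat_output(filename: str) -> bool:
    """Doctest-style demo of the LMAT output-file selection predicate.

    >>> _example_is_lmat_output('sample_output123.fastq.out')
    True
    >>> _example_is_lmat_output('sample_output.canVfin.out')  # intermediate
    False
    >>> _example_is_lmat_output('sample_notes.txt')  # not an LMAT .out file
    False
    """
    return ('_output' in filename and filename.endswith('.out')
            and 'canVfin' not in filename and 'pyLCA' not in filename)
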
def main():
    """Main entry point to script."""
    # Argument Parser Configuration
    parser = argparse.ArgumentParser(
        description='Extract reads following Centrifuge/Kraken output',
        epilog=f'%(prog)s - {__author__} - {__date__}')
    parser.add_argument(
        '-V', '--version',
        action='version',
        version=f'%(prog)s release {__version__} ({__date__})')
    parser.add_argument(
        '-f', '--file',
        action='store',
        metavar='FILE',
        required=True,
        help='Centrifuge output file.')
    parser.add_argument(
        '-l', '--limit',
        action='store',
        metavar='NUMBER',
        type=int,
        default=None,
        help='Limit of FASTQ reads to extract. Default: no limit')
    parser.add_argument(
        '-m', '--maxreads',
        action='store',
        metavar='NUMBER',
        type=int,
        default=None,
        help=('Maximum number of FASTQ reads to search for the taxa. '
              'Default: no maximum'))
    parser.add_argument(
        '-n', '--nodespath',
        action='store',
        metavar='PATH',
        default=TAXDUMP_PATH,
        help=('path for the nodes information files '
              '(nodes.dmp and names.dmp from NCBI)'))
    parser.add_argument(
        '-i', '--include',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to include a taxon and all underneath '
              '(multiple -i is available to include several taxids); '
              'by default, all taxa are considered for inclusion'))
    parser.add_argument(
        '-x', '--exclude',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to exclude a taxon and all underneath '
              '(multiple -x is available to exclude several taxids)'))
    parser.add_argument(
        '-y', '--minscore',
        action='store',
        metavar='NUMBER',
        type=lambda txt: Score(float(txt)),
        default=None,
        help=('minimum score/confidence of the classification of a read '
              'to pass the quality filter; all pass by default'))
    filein = parser.add_mutually_exclusive_group(required=True)
    filein.add_argument(
        '-q', '--fastq',
        action='store',
        metavar='FILE',
        default=None,
        help='Single FASTQ file (no paired-ends)')
    filein.add_argument(
        '-1', '--mate1',
        action='store',
        metavar='FILE',
        default=None,
        help='Paired-ends FASTQ file for mate 1s '
             '(filename usually includes _1)')
    parser.add_argument(
        '-2', '--mate2',
        action='store',
        metavar='FILE',
        default=None,
        help='Paired-ends FASTQ file for mate 2s '
             '(filename usually includes _2)')

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} =-= {__date__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    args = parser.parse_args()
    output_file = args.file
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    excluding: Set[TaxId] = set(args.exclude)
    including: Set[TaxId] = set(args.include)
    fastq_1: Filename
    fastq_2: Filename = args.mate2
    if not fastq_2:
        fastq_1 = args.fastq
    else:
        fastq_1 = args.mate1

    # Load NCBI nodes, names and build children
    plasmidfile: Filename = None
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile, False,
                              excluding, including)

    # Build taxonomy tree
    print(gray('Building taxonomy tree...'), end='')
    sys.stdout.flush()
    tree = TaxTree()
    tree.grow(taxonomy=ncbi, look_ancestors=False)
    print(green(' OK!'))

    # Get the taxa
    print(gray('Filtering taxa...'), end='')
    sys.stdout.flush()
    ranks: Ranks = Ranks({})
    tree.get_taxa(ranks=ranks, include=including, exclude=excluding)
    print(green(' OK!'))
    taxids: Set[TaxId] = set(ranks)
    taxlevels: TaxLevels = Rank.ranks_to_taxlevels(ranks)
    num_taxlevels = Counter({rank: len(taxlevels[rank])
                             for rank in taxlevels})
    num_taxlevels = +num_taxlevels  # Drop ranks with no selected taxids

    # Statistics about the included taxa
    print(f'  {len(taxids)}' + gray(' taxids selected in '), end='')
    print(f'{len(num_taxlevels)}' + gray(' different taxonomic levels:'))
    for rank in num_taxlevels:
        print(f'  Number of different {rank}: {num_taxlevels[rank]}')
    assert taxids, red('ERROR! No taxids to search for!')

    # Get the records
    records: List[SeqRecord] = []
    num_seqs: int = 0
    # timing initialization
    start_time_load: float = time.perf_counter()
    print(gray(f'Loading output file {output_file}...'), end='')
    sys.stdout.flush()
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for num_seqs, record in enumerate(SeqIO.parse(file,
                                                          'centrifuge')):
                tid: TaxId = record.annotations['taxID']
                if tid not in taxids:
                    continue  # Ignore read if not in the selected taxa
                score: Score = Score(record.annotations['score'])
                if args.minscore is not None and score < args.minscore:
                    continue  # Ignore read if low confidence
                records.append(record)
    except FileNotFoundError:
        raise Exception(red('ERROR! ') + f'Cannot read "{output_file}"')
    print(green(' OK!'))

    # Basic records statistics
    print(gray('  Load elapsed time: ')
          + f'{time.perf_counter() - start_time_load:.3g}' + gray(' sec'))
    print(gray('  Matching reads: ') + f'{len(records):_d} '
          + gray(f'\t({len(records)/num_seqs:.4%} of sample)'))
    sys.stdout.flush()

    # FASTQ sequence dealing
    # records_ids: List[str] = [record.id for record in records]
    records_ids: Set[str] = {record.id for record in records}
    seqs1: List[SeqRecord] = []
    seqs2: List[SeqRecord] = []
    extracted: int = 0
    i: int = 0
    if fastq_2:
        print(gray(f'Loading FASTQ files {fastq_1} and {fastq_2}...\n'
                   'Mseqs: '), end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1, open(fastq_2, 'r') as file2:
                for i, (rec1, rec2) in enumerate(
                        zip(SeqIO.parse(file1, 'quickfastq'),
                            SeqIO.parse(file2, 'quickfastq'))):
                    if (not records_ids
                            or (args.maxreads and i >= args.maxreads)
                            or (args.limit and extracted >= args.limit)):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        seqs2.append(rec2)
                        extracted += 1
        except FileNotFoundError:
            raise Exception(red('\nERROR! ') + 'Cannot read FASTQ files')
    else:
        print(gray(f'Loading FASTQ files {fastq_1}...\n'
                   'Mseqs: '), end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1:
                for i, rec1 in enumerate(SeqIO.parse(file1, 'quickfastq')):
                    if (not records_ids
                            or (args.maxreads and i >= args.maxreads)
                            or (args.limit and extracted >= args.limit)):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        extracted += 1
        except FileNotFoundError:
            raise Exception(red('\nERROR! ') + 'Cannot read FASTQ file')
    print(cyan(f' {i/1e+6:.3g} Mseqs'), green('OK! '))

    def format_filename(fastq: Filename) -> Filename:
        """Auxiliary function to properly format the output filenames.

        Args:
            fastq: Complete filename of the fastq input file

        Returns:
            Filename of the rextracted fastq output file
        """
        fastq_filename, _ = os.path.splitext(fastq)
        output_list: List[str] = [fastq_filename, '_rxtr']
        if including:
            output_list.append('_incl')
            output_list.append('_'.join(including))
        if excluding:
            output_list.append('_excl')
            output_list.append('_'.join(excluding))
        output_list.append('.fastq')
        return Filename(''.join(output_list))

    filename1: Filename = format_filename(fastq_1)
    SeqIO.write(seqs1, filename1, 'quickfastq')
    print(gray('Wrote'), magenta(f'{len(seqs1)}'), gray('reads in'),
          filename1)
    if fastq_2:
        filename2: Filename = format_filename(fastq_2)
        SeqIO.write(seqs2, filename2, 'quickfastq')
        print(gray('Wrote'), magenta(f'{len(seqs2)}'), gray('reads in'),
              filename2)

    # Timing results
    print(gray('Total elapsed time:'),
          time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))

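
# Illustrative example (editor's addition): a standalone rendition of the
# output naming convention implemented by format_filename above; the sample
# filenames are placeholders.
def _example_output_name(fastq: str, incl: list, excl: list) -> str:
    """Doctest-style demo of the rextract output-filename convention.

    >>> _example_output_name('reads_1.fastq', ['561'], [])
    'reads_1_rxtr_incl561.fastq'
    >>> _example_output_name('reads_1.fastq', [], ['9606'])
    'reads_1_rxtr_excl9606.fastq'
    """
    base, _ = os.path.splitext(fastq)
    parts = [base, '_rxtr']
    if incl:
        parts.append('_incl')
        parts.append('_'.join(incl))
    if excl:
        parts.append('_excl')
        parts.append('_'.join(excl))
    parts.append('.fastq')
    return ''.join(parts)
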
def read_output(output_file: Filename,
                scoring: Scoring = Scoring.SHEL,
                minscore: Score = None,
                ) -> Tuple[str, SampleStats, Counter[TaxId],
                           Dict[TaxId, Score]]:
    """
    Read Centrifuge output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[TaxId, List[Score]] = {}
    all_length: Dict[TaxId, List[int]] = {}
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    error_read: int = None  # Read number of the last parsing error, if any
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for output_line in file:
                try:
                    _, _, _tid, _score, _, _, _length, *_ = \
                        output_line.split('\t')
                except ValueError:
                    print(red('Error'), f'parsing line: ({output_line}) '
                          f'in {output_file}. Ignoring line!')
                    error_read = num_read + 1
                    continue
                tid = TaxId(_tid)
                try:
                    # From Centrifuge score get "single hit equivalent length"
                    shel = Score(float(_score) ** 0.5 + 15)
                    length = int(_length)
                except ValueError:
                    print(red('Error'), f'parsing score ({_score}) for query',
                          f'length ({_length}) for taxid {_tid}',
                          f'in {output_file}. Ignoring line!')
                    continue
                num_read += 1
                nt_read += length
                if tid == UNCLASSIFIED:  # Just count unclassified reads
                    num_uncl += 1
                    continue
                elif minscore is not None and shel < minscore:
                    continue  # Ignore read if low confidence
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if error_read == num_read + 1:  # Check if error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[TaxId] = Counter({tid: len(all_scores[tid])
                                      for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, scores=all_scores,
        lens=all_length, seq_read=num_read, seq_unclas=num_uncl,
        seq_filt=filt_seqs
    )
    # Output statistics
    output.write(gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[')
                 + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(')
                 + f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(')
                 + f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(gray('  Scores: min = ') + f'{stat.sco.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray('  Length: min = ') + f'{stat.len.mini},'
                 + gray(' max = ') + f'{stat.len.maxi},'
                 + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(f'  {stat.num_taxa}' + gray(' taxa with assigned reads\n'))
    # Select score output
    out_scores: Dict[TaxId, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[TaxId, Score] = {tid: Score(mean(all_scores[tid]))
                                      for tid in all_scores}
        lengths: Dict[TaxId, Score] = {tid: Score(mean(all_length[tid]))
                                       for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        raise Exception(red('\nERROR! ') + f'Unknown Scoring "{scoring}"')
    # Return
    return output.getvalue(), stat, counts, out_scores

def configure_parser():
    """Argument Parser Configuration"""
    parser = argparse.ArgumentParser(
        description='Analyze results of metagenomic taxonomic classifiers',
        epilog=f'%(prog)s - Release {__version__} - {__date__}' + LICENSE,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-V', '--version',
        action='version',
        version=f'%(prog)s version {__version__} released in {__date__}')
    parser_in = parser.add_argument_group(
        'input', 'Define Recentrifuge input files and formats')
    parser_in.add_argument(
        '-n', '--nodespath',
        action='store',
        metavar='PATH',
        default=TAXDUMP_PATH,
        help=('path for the nodes information files '
              '(nodes.dmp and names.dmp from NCBI)'))
    parser_filein = parser_in.add_mutually_exclusive_group(required=True)
    parser_filein.add_argument(
        '-f', '--file',
        action='append',
        metavar='FILE',
        type=Filename,
        help=('Centrifuge output files. If a single directory is entered, '
              'every .out file inside will be taken as a different sample.'
              ' Multiple -f is available to include several samples.'))
    parser_filein.add_argument(
        '-l', '--lmat',
        action='append',
        metavar='FILE',
        type=Filename,
        default=None,
        help=('LMAT output dir or file prefix. If just "." is entered, '
              'every subdirectory under the current directory will be '
              'taken as a sample and scanned looking for LMAT output files'
              '. Multiple -l is available to include several samples.'))
    parser_filein.add_argument(
        '-k', '--clark',
        action='append',
        metavar='FILE',
        type=Filename,
        help=('CLARK(S) output files. If a single directory is entered, '
              'every .csv file inside will be taken as a different sample.'
              ' Multiple -k is available to include several samples.'))
    parser_filein.add_argument(
        '-r', '--report',
        action='append',
        metavar='FILE',
        type=Filename,
        help=('Centrifuge/Kraken report files '
              '(multiple -r is available to include several samples)'))
    parser_out = parser.add_argument_group(
        'output', 'Related to the Recentrifuge output files')
    parser_out.add_argument(
        '-o', '--outhtml',
        action='store',
        metavar='FILE',
        type=Filename,
        help=('HTML output file (if not given, the filename will be '
              'inferred from input files)'))
    parser_out.add_argument(
        '-e', '--excel',
        action='store',
        metavar='OUTPUT_TYPE',
        choices=[str(excel) for excel in Excel],
        default=str(Excel(0)),
        help=(f'type of excel report to be generated, and can be one of '
              f'{[str(excel) for excel in Excel]}'))
    parser_coarse = parser.add_argument_group(
        'tuning', 'Coarse tuning of algorithm parameters')
    parser_cross = parser_coarse.add_mutually_exclusive_group(required=False)
    parser_cross.add_argument(
        '-c', '--controls',
        action='store',
        metavar='CONTROLS_NUMBER',
        type=int,
        default=0,
        help=('this number of first samples will be treated as negative '
              'controls; default is no controls'))
    parser_coarse.add_argument(
        '-s', '--scoring',
        action='store',
        metavar='SCORING',
        choices=[str(each_score) for each_score in Scoring],
        default=str(Scoring(0)),
        help=(f'type of scoring to be applied, and can be one of '
              f'{[str(scoring) for scoring in Scoring]}'))
    parser_coarse.add_argument(
        '-y', '--minscore',
        action='store',
        metavar='NUMBER',
        type=lambda txt: Score(float(txt)),
        default=None,
        help=('minimum score/confidence of the classification of a read '
              'to pass the quality filter; all pass by default'))
    parser_coarse.add_argument(
        '-m', '--mintaxa',
        action='store',
        metavar='INT',
        type=int,
        default=DEFMINTAXA,
        help='minimum taxa to avoid collapsing one level into the parent one')
    parser_coarse.add_argument(
        '-x', '--exclude',
        action='append',
        metavar='TAXID',
        type=Id,
        default=[],
        help=('NCBI taxid code to exclude a taxon and all underneath '
              '(multiple -x is available to exclude several taxids)'))
    parser_coarse.add_argument(
        '-i', '--include',
        action='append',
        metavar='TAXID',
        type=Id,
        default=[],
        help=('NCBI taxid code to include a taxon and all underneath '
              '(multiple -i is available to include several taxids); '
              'by default, all the taxa are considered for inclusion'))
    parser_cross.add_argument(
        '-a', '--avoidcross',
        action='store_true',
        help='avoid cross analysis')
    parser_fine = parser.add_argument_group(
        'fine tuning', 'Fine tuning of algorithm parameters')
    parser_fine.add_argument(
        '-z', '--ctrlminscore',
        action='store',
        metavar='NUMBER',
        type=lambda txt: Score(float(txt)),
        default=None,
        help=('minimum score/confidence of the classification of a read '
              'in control samples to pass the quality filter; it defaults '
              'to "minscore"'))
    parser_fine.add_argument(
        '-w', '--ctrlmintaxa',
        action='store',
        metavar='INT',
        type=int,
        default=None,
        help=('minimum taxa to avoid collapsing one level into the parent '
              'one in control samples; it defaults to "mintaxa"'))
    parser_fine.add_argument(
        '-u', '--summary',
        action='store',
        metavar='OPTION',
        choices=['add', 'only', 'avoid'],
        default='add',
        help=('select to "add" summary samples to other samples, or to '
              '"only" show summary samples, or to "avoid" summaries at all'))
    parser_fine.add_argument(
        '-t', '--takeoutroot',
        action='store_true',
        help='remove counts directly assigned to the "root" level')
    parser_fine.add_argument(
        '--nokollapse',
        action='store_true',
        help='show the "cellular organisms" taxon')
    parser_mode = parser.add_argument_group(
        'advanced', 'Advanced modes of running')
    parser_mode.add_argument(
        '--dummy',  # hidden flag: just generate a dummy plot for JS debug
        action='store_true',
        help=argparse.SUPPRESS)
    parser_mode.add_argument(
        '-g', '--debug',
        action='store_true',
        help='increase output verbosity and perform additional checks')
    parser_mode.add_argument(
        '--sequential',
        action='store_true',
        help='deactivate parallel processing')
    return parser

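
# Illustrative usage sketch (editor's addition; filenames are placeholders):
#
#     parser = configure_parser()
#     args = parser.parse_args(['-f', 'smpl1.out', '-f', 'smpl2.out',
#                               '-o', 'report.html'])
#     assert args.file == ['smpl1.out', 'smpl2.out']
#
# Keeping parser construction in its own function lets tests exercise the
# CLI without touching sys.argv.
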
def process_output(*args, **kwargs
                   ) -> Tuple[Sample, TaxTree, SampleDataById,
                              SampleStats, Err]:
    """
    Process classifiers output files (to be usually called in parallel!).
    """
    # timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    ontology: Ontology = kwargs['ontology']
    mintaxa: Optional[int] = (kwargs['ctrlmintaxa'] if is_ctrl
                              else kwargs['mintaxa'])
    minscore: Score = (kwargs['ctrlminscore'] if is_ctrl
                       else kwargs['minscore'])
    including: Union[Tuple, Set[Id]] = ontology.including
    excluding: Union[Tuple, Set[Id]] = ontology.excluding
    scoring: Scoring = kwargs['scoring']
    classifier: Classifier = kwargs['classifier']
    genfmt: GenericFormat = kwargs['genfmt']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read taxonomic classifier output files to get abundances
    read_method: Callable[  # Format: [[Input], Output]
        [Filename, Scoring, Optional[Score]],
        Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]]
    log: str
    stat: SampleStats
    counts: Counter[Id]
    scores: Dict[Id, Score]
    if classifier is Classifier.GENERIC:  # Direct call to generic method
        log, stat, counts, scores = read_generic_output(
            target_file, scoring, minscore, genfmt)
    else:  # Use read_method
        if classifier is Classifier.KRAKEN:
            read_method = read_kraken_output
        elif classifier is Classifier.CLARK:
            read_method = read_clark_output
        elif classifier is Classifier.LMAT:
            read_method = read_lmat_output
        elif classifier is Classifier.CENTRIFUGE:
            read_method = read_output
        else:
            raise Exception(red('\nERROR! ')
                            + f'taxclass: Unknown classifier "{classifier}".')
        log, stat, counts, scores = read_method(target_file, scoring,
                                                minscore)
    output.write(log)
    # Complete/Update fields in stats
    stat.is_ctrl = is_ctrl  # set control nature of the sample
    if mintaxa is not None:  # manual mintaxa has precedence over automatic
        stat.mintaxa = mintaxa
    else:  # update local value with the automatically guessed value
        mintaxa = stat.mintaxa
    # Move cellular_organisms counts to root, in case
    if ontology.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... \n'))
        if counts[ontology.ROOT]:
            stat.decrease_filtered_taxids()
            scores[ontology.ROOT] = Score(
                (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS]
                 + scores[ontology.ROOT] * counts[ontology.ROOT])
                / (counts[CELLULAR_ORGANISMS] + counts[ontology.ROOT]))
        else:
            scores[ontology.ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ontology.ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, in case
    if kwargs['root'] and counts[ontology.ROOT]:
        vwrite(gray('Removing'), counts[ontology.ROOT],
               gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt
                                     - counts[ontology.ROOT])
        stat.decrease_filtered_taxids()
        counts[ontology.ROOT] = 0
        scores[ontology.ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')
    # Building ontology tree
    output.write(gray('Building from raw data with mintaxa = ')
                 + f'{mintaxa:_d}' + gray(' ... \n'))
    vwrite(gray('  Building ontology tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[Id]
    orphans: Set[Id]
    ancestors, orphans = ontology.get_ancestors(counts.keys())
    out = SampleDataById(['all'])
    tree.allin1(ontology=ontology, counts=counts, scores=scores,
                ancestors=ancestors, min_taxa=mintaxa,
                include=including, exclude=excluding, out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')
    # Stats: Complete final value for TaxIDs after tree building and folding
    final_taxids: int = len(out.counts) if out.counts is not None else 0
    stat.set_final_taxids(final_taxids)
    # Check for additional loss of reads (due to include/exclude and orphans)
    output.write(gray('  Check for more seqs lost ([in/ex]clude affects)... '))
    if out.counts is not None:
        discard: int = sum(counts.values()) - sum(out.counts.values())
        if discard:
            output.write(blue('\n  Info:') + f' {discard} '
                         + gray('additional seqs discarded (')
                         + f'{discard/sum(counts.values()):.3%} '
                         + gray('of accepted)\n'))
        else:
            output.write(green('OK!\n'))
    else:
        output.write(red('No counts in sample tree!\n'))
    # Warn or give detailed stats about orphan taxids and orphan seqs
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('  Warning!'), gray('Orphan taxid'),
                       f'{orphan}\n')
                lost += counts[orphan]
            vwrite(yellow('  WARNING!'),
                   f'{len(orphans)} orphan taxids ('
                   f'{len(orphans)/len(counts):.2%} of accepted)\n'
                   f'  and {lost} orphan sequences ('
                   f'{lost/sum(counts.values()):.3%} of accepted)\n')
        else:
            vwrite(green('OK!\n'))
    elif orphans:
        output.write(yellow('\n  Warning!')
                     + f' {len(orphans)} orphan taxids'
                     + gray(' (rerun with --debug for details)\n'))
    # Check the removal of TaxIDs (accumulation of leaves in parents)
    if debug and not excluding and including == {ontology.ROOT}:
        vwrite(gray('  Assess accumulation due to "folding the tree"...\n'))
        migrated: int = 0
        if out.counts is not None:
            for taxid in counts:
                if out.counts[taxid] == 0:
                    migrated += 1
                    vwrite(blue('  Info:'),
                           gray(f'Folded TaxID {taxid} (')
                           + f'{ontology.get_name(taxid)}' + gray(') with ')
                           + f'{counts[taxid]}' + gray(' original seqs\n'))
        if migrated:
            vwrite(blue('  INFO:'),
                   f'{migrated} TaxIDs folded ('
                   f'{migrated/len(+counts):.2%} of TAF '
                   '--TaxIDs after filtering--)\n')
            vwrite(blue('  INFO:'),
                   f'Final assigned TaxIDs: {final_taxids} '
                   f'(reduced to {final_taxids/len(+counts):.2%} of '
                   'number of TAF)\n')
        else:
            vwrite(blue('  INFO:'), gray('No migration!'), green('OK!\n'))
    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ')
                     + green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE
    # Timing results
    output.write(gray('Load elapsed time: ')
                 + f'{time.perf_counter() - start_time:.3g}'
                 + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error

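
# Illustrative sketch (editor's addition, not the project's actual driver):
# process_output is designed to be mapped over samples in parallel. This is
# a minimal driver under the assumption that 'targets' is a list of
# (Filename, is_ctrl) pairs and 'settings' is a picklable dict carrying the
# keyword parameters read by process_output (debug, ontology, mintaxa,
# ctrlmintaxa, minscore, ctrlminscore, scoring, classifier, genfmt, root).
def _example_call_process(target_file, is_ctrl, settings):
    """Demo helper: adapt positional starmap args to process_output."""
    return process_output(target_file, is_ctrl, **settings)


def _example_parallel_driver(targets, settings):
    """Demo only: fan process_output out over samples with a process pool."""
    import multiprocessing as mp
    from functools import partial
    with mp.Pool() as pool:
        worker = partial(_example_call_process, settings=settings)
        return pool.starmap(worker, targets)
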
def read_kraken_output(output_file: Filename,
                       scoring: Scoring = Scoring.KRAKEN,
                       minscore: Score = None,
                       ) -> Tuple[str, SampleStats, Counter[Id],
                                  Dict[Id, Score]]:
    """
    Read Kraken output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_kmerel: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split('\t')
            if len(header) != 5:
                print(red('\nERROR! ') + 'Kraken output format of ',
                      yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'C/U, ID, taxid, length, list of mappings')
                print(magenta('Found:'), '\t'.join(header), end='')
                print(blue('HINT:'), 'Use Kraken or Kraken2 direct output.')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_clas, _label, _tid, _length,
                     _maps) = output_line.split('\t')
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = sum(map(int, _length.split('|')))
                    num_read += 1
                    nt_read += length
                    if _clas == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    tid: Id = Id(_tid)
                    maps: List[str] = _maps.split()
                    try:
                        maps.remove('|:|')
                    except ValueError:
                        pass
                    mappings: Counter[Id] = col.Counter()
                    for pair in maps:
                        couple: List[str] = pair.split(':')
                        mappings[Id(couple[0])] += int(couple[1])
                    # From Kraken score get "single hit equivalent length"
                    shel: Score = Score(mappings[tid] + K_MER_SIZE)
                    score: Score = Score(mappings[tid]
                                         / sum(mappings.values())
                                         * 100)  # % relative to all k-mers
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None:  # Decide if ignore read if low score
                    if scoring is Scoring.KRAKEN:
                        if score < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_kmerel[tid].append(score)
                except KeyError:
                    all_kmerel[tid] = [score, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter({tid: len(all_scores[tid])
                                       for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, lens=all_length,
        scores=all_scores, scores2=all_kmerel, seq_read=num_read,
        seq_unclas=num_uncl, seq_filt=filt_seqs, tid_clas=len(taxids)
    )
    # Output statistics
    if num_errors:
        output.write(gray('  Seqs fail: ') + red(f'{num_errors:_d}\t')
                     + gray('(Last error in read ')
                     + red(f'{last_error_read}') + gray(')\n'))
    output.write(gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[')
                 + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(')
                 + f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(')
                 + f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(gray('  Scores SHEL: min = ') + f'{stat.sco.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray('  Coverage(%): min = ') + f'{stat.sco2.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco2.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco2.mean:.1f}\n')
    output.write(gray('  Read length: min = ') + f'{stat.len.mini},'
                 + gray(' max = ') + f'{stat.len.maxi},'
                 + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.KRAKEN:
        out_scores = {tid: Score(mean(all_kmerel[tid])) for tid in all_kmerel}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        print(red('ERROR!'), f'Kraken: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores

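
# Illustrative example (editor's addition): how the k-mer mapping field of a
# Kraken read line ('taxid:count' pairs, with '|:|' separating the mates of
# a pair) folds into per-taxid counts, mirroring the loop above.
def _example_kraken_mappings(maps_field: str) -> dict:
    """Doctest-style demo of Kraken LCA-mapping parsing.

    >>> sorted(_example_kraken_mappings('562:13 561:4 |:| 562:8 0:2').items())
    [('0', 2), ('561', 4), ('562', 21)]
    """
    import collections
    counts = collections.Counter()
    for pair in maps_field.split():
        if pair == '|:|':  # paired-end separator carries no counts
            continue
        taxid, num = pair.split(':')
        counts[taxid] += int(num)
    return dict(counts)
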
def read_generic_output(output_file: Filename,
                        scoring: Scoring = Scoring.GENERIC,
                        minscore: Score = None,
                        genfmt: GenericFormat = None
                        ) -> Tuple[str, SampleStats, Counter[Id],
                                   Dict[Id, Score]]:
    """
    Read an output file from a generic classifier

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification
        genfmt: GenericFormat object specifying the file format

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    # Initialization of variables
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    # Check format
    if not isinstance(genfmt, GenericFormat):
        raise Exception(
            red('\nERROR! ')
            + 'Missing GenericFormat when reading a generic output.')
    # Get the field separator from the format type (it is constant per file)
    splitting: str
    if genfmt.typ is GenericType.CSV:
        splitting = ','
    elif genfmt.typ is GenericType.TSV:
        splitting = '\t'
    elif genfmt.typ is GenericType.SSV:
        splitting = ' '
    else:
        raise Exception(f'ERROR! Unknown GenericType {genfmt.typ}')
    try:
        with open(output_file, 'r') as file:
            # Main loop processing each file line
            for raw_line in file:
                raw_line = raw_line.strip(' \n\t')
                output_line: List[str] = raw_line.split(splitting)
                if len(output_line) < GenericFormat.MIN_COLS:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'),
                              f'Skipping header of {output_file}')
                        continue  # Do not count the header as an error
                    raise Exception(
                        red('\nERROR!') + ' Line ' + yellow(f'{output_line}')
                        + '\n\tin ' + yellow(f'{output_file}') + ' has < '
                        + blue(f'{GenericFormat.MIN_COLS}') + ' required '
                        + 'columns.\n\tPlease check the file.')
                try:
                    tid: Id = Id(output_line[genfmt.tid - 1].strip(' "'))
                    length: int = int(output_line[genfmt.len - 1].strip(' "'))
                    if tid == genfmt.unc:  # Avoid read score for unclas reads
                        num_read += 1
                        nt_read += length
                        num_uncl += 1
                        continue
                    score: Score = Score(
                        float(output_line[genfmt.sco - 1].strip(' "')))
                except ValueError:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'),
                              f'Skipping header of {output_file}')
                        continue  # Do not count the header as a failure
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    if num_read > 100 and num_errors > 0.5 * num_read:
                        print(red('ERROR!'),
                              'Unreliable file processing: rate of problematic'
                              f' reads is {num_errors/num_read:.1%}, beyond'
                              ' 50%, after 100 reads. Please check the format '
                              f'of the file "{output_file}".')
                        raise
                    else:
                        continue
                num_read += 1
                nt_read += length
                taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and score < minscore:
                    continue  # Discard read if low confidence
                try:
                    all_scores[tid].append(score)
                except KeyError:
                    all_scores[tid] = [score, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter({tid: len(all_scores[tid])
                                       for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, lens=all_length,
        scores=all_scores, seq_read=num_read, seq_unclas=num_uncl,
        seq_filt=filt_seqs, tid_clas=len(taxids)
    )
    # Output statistics
    if num_errors:
        output.write(gray('  Seqs fail: ') + red(f'{num_errors:_d}\t')
                     + gray('(Last error in read ')
                     + red(f'{last_error_read}') + gray(')\n'))
    output.write(gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[')
                 + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(')
                 + f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(')
                 + f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(gray('  Scores: min = ') + f'{stat.sco.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray('  Read length: min = ') + f'{stat.len.mini},'
                 + gray(' max = ') + f'{stat.len.maxi},'
                 + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.GENERIC:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        raise Exception(red('\nERROR! ')
                        + f'Generic: Unsupported Scoring "{scoring}"')
    # Return
    return output.getvalue(), stat, counts, out_scores

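
# Illustrative example (editor's addition): the 1-based column convention
# behind the '- 1' offsets in read_generic_output, shown on a single CSV
# line; the column numbers and sample line are made up for the demo.
def _example_generic_line(line: str, sep: str, tid_col: int,
                          len_col: int, sco_col: int) -> tuple:
    """Doctest-style demo of 1-based column extraction.

    >>> _example_generic_line('read1,562,150,87.5', ',', 2, 3, 4)
    ('562', 150, 87.5)
    """
    fields = [field.strip(' "') for field in line.split(sep)]
    return (fields[tid_col - 1], int(fields[len_col - 1]),
            float(fields[sco_col - 1]))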