def read_nodes(self, nodes_file: Filename) -> None:
    """Build dicts of parent and rank for a given taxid (key)"""
    print('\033[90mLoading NCBI nodes...\033[0m', end='')
    sys.stdout.flush()
    try:
        with open(nodes_file, 'r') as file:
            for line in file:
                _tid, _parent, _rank, *_ = line.split('\t|\t')
                tid = Id(_tid)
                parent = Id(_parent)
                if self.collapse and parent == CELLULAR_ORGANISMS:
                    self.parents[tid] = ROOT
                else:
                    self.parents[tid] = parent
                rank: Rank
                try:
                    rank = Rank[_rank.upper().replace(" ", "_")]
                except KeyError:
                    raise UnsupportedTaxLevelError(
                        f'Unknown tax level {_rank}')
                self.ranks[tid] = rank
    except OSError:
        print(red('ERROR!'), f'Cannot read {nodes_file}.')
        print(magenta('TIP:'),
              'Did you select the right path with the "-n" option?')
        print(magenta('TIP:'),
              'Did you use "Retaxdump" to install the dump files?')
        raise
    else:
        print('\033[92m OK! \033[0m')
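
# Illustrative sketch (not part of the original module): how a single
# nodes.dmp record is split by read_nodes() above. The sample values are
# made up; real records carry more '\t|\t'-separated fields.
_sample_nodes_line = '1300\t|\t186826\t|\tgenus\t|\t\n'
_tid, _parent, _rank, *_ = _sample_nodes_line.split('\t|\t')
assert (_tid, _parent, _rank) == ('1300', '186826', 'genus')
# The rank lookup then mirrors the code above:
# Rank[_rank.upper().replace(" ", "_")]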
def read_report(report_file: str) -> Tuple[str, Counter[Id], Dict[Id, Rank]]:
    """
    Read Centrifuge/Kraken report file

    Args:
        report_file: report file name

    Returns:
        log string, abundances counter, taxlevel dict

    """
    # TODO: Discontinued method, to be erased in a future release
    output: io.StringIO = io.StringIO(newline='')
    abundances: Counter[Id] = col.Counter()
    level_dic = {}
    output.write(f'\033[90mLoading report file {report_file}...\033[0m')
    try:
        with open(report_file, 'r') as file:
            for report_line in file:
                _, _, taxnum, taxlev, _tid, _ = report_line.split('\t')
                tid = Id(_tid)
                abundances[tid] = int(taxnum)
                level_dic[tid] = Rank.centrifuge(taxlev)
    except KeyboardInterrupt:
        print(gray(' User'), yellow('interrupted!'))
        raise
    except Exception:
        print(red('ERROR!'), 'Cannot read "' + report_file + '"')
        raise
    else:
        output.write('\033[92m OK! \033[0m\n')
    return output.getvalue(), abundances, level_dic
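
# Illustrative sketch (sample data, not from a real run): one report line as
# consumed by read_report() above. In the usual kreport layout the
# tab-separated columns are percent, clade reads, taxon reads, rank code,
# taxid, and name; only the third, fourth, and fifth columns are used here.
_report_line = '10.50\t1500\t1000\tG\t1301\tStreptococcus\n'
_, _, _taxnum, _taxlev, _tid, _ = _report_line.split('\t')
assert _taxnum == '1000' and _taxlev == 'G' and _tid == '1301'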
def read_report(report_file: str) -> Tuple[str, Counter[Id], Dict[Id, Rank]]:
    """
    Read Centrifuge/Kraken report file

    Args:
        report_file: report file name

    Returns:
        log string, abundances counter, taxlevel dict

    """
    output: io.StringIO = io.StringIO(newline='')
    abundances: Counter[Id] = col.Counter()
    level_dic = {}
    output.write(f'\033[90mLoading report file {report_file}...\033[0m')
    try:
        with open(report_file, 'r') as file:
            for report_line in file:
                _, _, taxnum, taxlev, _tid, _ = report_line.split('\t')
                tid = Id(_tid)
                abundances[tid] = int(taxnum)
                level_dic[tid] = Rank.centrifuge(taxlev)
    except Exception:  # Avoid bare except so KeyboardInterrupt can propagate
        raise Exception('\n\033[91mERROR!\033[0m Cannot read "'
                        + report_file + '"')
    else:
        output.write('\033[92m OK! \033[0m\n')
    return output.getvalue(), abundances, level_dic
def read_mock_files(mock: Filename) -> Counter[Id]:
    """Read a mock layout (.mck) file"""
    mock_layout: Counter[Id] = col.Counter()
    with open(mock, 'r') as file:
        vprint(gray('\nProcessing'), blue(mock), gray('file:\n'))
        for line in file:
            if line.startswith('#'):
                continue
            _tid, _num = line.split('\t')
            tid = Id(_tid)
            num = int(_num)
            mock_layout[tid] = num
            vprint(num, gray('\treads for taxid\t'), tid, '\t(',
                   cyan(ncbi.get_name(tid)), ')\n')
    return mock_layout
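
# Illustrative sketch of a mock layout (.mck) file accepted by
# read_mock_files() above: '#' lines are comments, data lines are
# taxid<TAB>number_of_reads (the taxids below are arbitrary examples).
_sample_mck = '# taxid\treads\n9606\t100\n562\t50\n'
_layout = col.Counter()
for _line in _sample_mck.splitlines(keepends=True):
    if _line.startswith('#'):
        continue
    _tid, _num = _line.split('\t')
    _layout[Id(_tid)] = int(_num)
assert _layout[Id('9606')] == 100 and _layout[Id('562')] == 50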
def read_names(self, names_file: Filename) -> None:
    """Build dict with name for a given taxid (key)."""
    print('\033[90mLoading NCBI names...\033[0m', end='')
    sys.stdout.flush()
    try:
        with open(names_file, 'r') as file:
            for line in file:
                if 'scientific name' in line:
                    tid, scientific_name, *_ = line.split('\t|\t')
                    self.names[Id(tid)] = scientific_name
    except OSError:
        print(red('ERROR!'), f'Cannot read {names_file}.')
        print(magenta('TIP:'),
              'Did you use "Retaxdump" to install the dump files?')
        raise
    else:
        print('\033[92m OK! \033[0m')
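
# Illustrative sketch (made-up record): names.dmp rows are also '\t|\t'
# delimited; read_names() above keeps only the 'scientific name' rows and
# stores taxid -> name.
_sample_names_line = '562\t|\tEscherichia coli\t|\t\t|\tscientific name\t|\n'
if 'scientific name' in _sample_names_line:
    _tid, _scientific_name, *_ = _sample_names_line.split('\t|\t')
    assert (_tid, _scientific_name) == ('562', 'Escherichia coli')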
def mock_from_scratch(out: Filename, mock_layout: Counter[Id]) -> None:
    """Generate a mock Centrifuge output file from scratch"""
    with open(out, 'w') as fout:
        vprint(gray('Generating'), blue(out), gray('file... '))
        fout.write('readID\tseqID\ttaxID\tscore\t2ndBestScore\t'
                   'hitLength\tqueryLength\tnumMatches\n')
        reads_written: int = 0
        for numtid in mock_layout:
            tid = Id(numtid)  # Convert the Excel integer to Id
            maxhl: int = random.randint(args.random + 1, MAX_HIT_LENGTH)
            rank: str = str(ncbi.get_rank(tid)).lower()
            for _ in range(int(mock_layout[numtid])):
                hit_length = random.randint(args.random + 1, maxhl)
                fout.write(f'test{reads_written}\t{rank}\t'
                           f'{tid}\t{(hit_length - 15) ** 2}\t'
                           f'0\t{hit_length}\t{MAX_HIT_LENGTH}\t1\n')
                reads_written += 1
        vprint(reads_written, 'reads', green('OK!\n'))
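
# Note on the synthetic score above: it is written as (hit_length - 15)**2 so
# that it round-trips through the "single hit equivalent length" formula used
# when reading Centrifuge output (shel = sqrt(score) + 15, see read_output()).
# Minimal arithmetic sketch with a made-up hit length:
_hit_length = 120
_score = (_hit_length - 15) ** 2
assert abs(_score ** 0.5 + 15 - _hit_length) < 1e-9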
def mock_from_source(out: Filename, mock_layout: Counter[Id]) -> None:
    """Generate a mock Centrifuge output file from a source file"""
    with open(out, 'w') as fout, open(args.file) as fcfg:
        vprint(gray('Generating'), blue(out), gray('file... '))
        fout.write(fcfg.readline())  # Copy the Centrifuge output file header
        reads_written: int = 0
        for line in fcfg:
            tid = Id(line.split('\t')[2])
            if mock_layout[tid]:
                fout.write(line)
                mock_layout[tid] -= 1
                reads_written += 1
                if not sum(mock_layout.values()):
                    vprint(reads_written, 'reads', green('OK!\n'))
                    break
    if sum(mock_layout.values()):
        print(red('ERROR!\n'))
        print(gray('Incomplete read copy by taxid:'))
        mock_layout = +mock_layout  # Delete zero-count elements
        for tid in mock_layout:
            print(yellow(mock_layout[tid]), gray('reads missing for tid'),
                  tid, '(', cyan(ncbi.get_name(tid)), ')\n')
def read_output(output_file: Filename,
                scoring: Scoring = Scoring.SHEL,
                minscore: Score = None,
                ) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read Centrifuge output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for output_line in file:
                try:
                    _, _, _tid, _score, _, _, _length, *_ = output_line.split(
                        '\t')
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                tid = Id(_tid)
                try:
                    # From Centrifuge score get "single hit equivalent length"
                    shel = Score(float(_score) ** 0.5 + 15)
                    length = int(_length)
                except ValueError:
                    print(yellow('Failure'), f'parsing score ({_score}) for ',
                          f'query length {_length} for taxid {_tid}',
                          f'in {output_file}. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                if tid == UNCLASSIFIED:  # Just count unclassified reads
                    num_uncl += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and shel < minscore:
                    continue  # Ignore read if low confidence
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter({tid: len(all_scores[tid])
                                       for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, scores=all_scores,
        lens=all_length, seq_read=num_read, seq_unclas=num_uncl,
        seq_filt=filt_seqs, tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(gray(' Seqs fail: ') + red(f'{num_errors:_d}\t')
                     + gray('(Last error in read ')
                     + red(f'{last_error_read}') + gray(')\n'))
    output.write(gray(' Seqs read: ') + f'{stat.seq.read:_d}\t'
                 + gray('[') + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray(' Seqs clas: ') + f'{stat.seq.clas:_d}\t'
                 + gray('(') + f'{stat.get_unclas_ratio():.2%}'
                 + gray(' unclassified)\n'))
    output.write(gray(' Seqs pass: ') + f'{stat.seq.filt:_d}\t'
                 + gray('(') + f'{stat.get_reject_ratio():.2%}'
                 + gray(' rejected)\n'))
    output.write(gray(' Scores: min = ') + f'{stat.sco.mini:.1f}'
                 + gray(', max = ') + f'{stat.sco.maxi:.1f}'
                 + gray(', avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray(' Length: min = ') + f'{stat.len.mini}'
                 + gray(', max = ') + f'{stat.len.maxi}'
                 + gray(', avr = ') + f'{stat.len.mean}\n')
    output.write(gray(' TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        print(red('ERROR!'), f'Centrifuge: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
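
# Sketch of the Scoring.NORMA branch above with made-up numbers: the mean SHEL
# score of a taxid is normalized by its mean read length and expressed as a
# percentage.
_norma_scores = [60.0, 80.0]   # hypothetical per-read SHEL scores for one tid
_norma_lengths = [100, 100]    # hypothetical read lengths for the same tid
assert abs(mean(_norma_scores) / mean(_norma_lengths) * 100 - 70.0) < 1e-9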
import collections as col
import io
import os
from math import log10
from statistics import mean
from typing import Tuple, Counter, Dict, List, Set

from Bio import SeqIO

from recentrifuge.config import Filename, Id, Score, Scoring
from recentrifuge.config import gray, red, green, yellow
from recentrifuge.rank import Rank
from recentrifuge.stats import SampleStats

# Centrifuge specific constants
UNCLASSIFIED: Id = Id('0')
def read_clark_output(output_file: Filename,
                      scoring: Scoring = Scoring.CLARK_C,
                      minscore: Score = None,
                      ) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read CLARK(-l)(-S) full mode output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_confs: Dict[Id, List[Score]] = {}
    all_gammas: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split(',')
            if len(header) != 8:
                print(red('\nERROR! ') + 'CLARK output format of ',
                      yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'ID,Length,Gamma,1st,score1,2nd,score2,conf')
                print(magenta('Found:'), ','.join(header), end='')
                print(blue('HINT:'), 'Use CLARK, CLARK-l, or CLARK-S '
                      'with full mode (', blue('-m 0'), ')')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_label, _length, _gamma, _tid1, _score1, _tid2, _score2,
                     _conf) = output_line.split(',')
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = int(_length)
                    gamma: Score = Score(float(_gamma))
                    tid1: Id = Id(_tid1)
                    score1: Score = Score(float(_score1))
                    tid2: Id = Id(_tid2)
                    score2: Score = Score(float(_score2))
                    conf: Score = Score(float(_conf))
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                # Select tid and score between CLARK assignments 1 and 2
                tid: Id = tid1
                score: Score = score1
                if tid1 == UNCLASSIFIED:
                    if tid2 == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    else:  # Majority of read unclassified
                        tid = tid2
                        score = score2
                        conf = Score(1 - conf)  # Get CLARK's h2/(h1+h2)
                # From CLARK_C(S) score get "single hit equivalent length"
                shel: Score = Score(score + K_MER_SIZE)
                taxids.add(tid)  # Save all the selected tids (tid1 or tid2)
                if minscore is not None:  # Decide if read ignored by low score
                    if scoring is Scoring.CLARK_C:
                        if conf < minscore:
                            continue
                    elif scoring is Scoring.CLARK_G:
                        if gamma < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_confs[tid].append(conf)
                except KeyError:
                    all_confs[tid] = [conf, ]
                try:
                    all_gammas[tid].append(gamma)
                except KeyError:
                    all_gammas[tid] = [gamma, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter({tid: len(all_scores[tid])
                                       for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, lens=all_length,
        scores=all_scores, scores2=all_confs, scores3=all_gammas,
        seq_read=num_read, seq_unclas=num_uncl, seq_filt=filt_seqs,
        tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(gray(' Seqs fail: ') + red(f'{num_errors:_d}\t')
                     + gray('(Last error in read ')
                     + red(f'{last_error_read}') + gray(')\n'))
    output.write(gray(' Seqs read: ') + f'{stat.seq.read:_d}\t'
                 + gray('[') + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray(' Seqs clas: ') + f'{stat.seq.clas:_d}\t'
                 + gray('(') + f'{stat.get_unclas_ratio():.2%}'
                 + gray(' unclassified)\n'))
    output.write(gray(' Seqs pass: ') + f'{stat.seq.filt:_d}\t'
                 + gray('(') + f'{stat.get_reject_ratio():.2%}'
                 + gray(' rejected)\n'))
    output.write(gray(' Hit (score): min = ') + f'{stat.sco.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray(' Conf. score: min = ') + f'{stat.sco2.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco2.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco2.mean:.1f}\n')
    output.write(gray(' Gamma score: min = ') + f'{stat.sco3.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco3.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco3.mean:.1f}\n')
    output.write(gray(' Read length: min = ') + f'{stat.len.mini},'
                 + gray(' max = ') + f'{stat.len.maxi},'
                 + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(gray(' TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.CLARK_C:
        out_scores = {tid: Score(mean(all_confs[tid]) * 100)
                      for tid in all_confs}
    elif scoring is Scoring.CLARK_G:
        out_scores = {tid: Score(mean(all_gammas[tid])) for tid in all_gammas}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        print(red('ERROR!'), f'clark: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
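
# Sketch of the assignment selection above (made-up values): when CLARK's
# first hit is unclassified ('NA') but the second is not, the second hit is
# taken and the reported confidence is flipped (per the source comment,
# h1/(h1+h2) becomes h2/(h1+h2)).
_tid1, _tid2, _conf = 'NA', '1301', 0.8
_tid = _tid1
if _tid1 == 'NA' and _tid2 != 'NA':
    _tid, _conf = _tid2, 1 - _conf
assert _tid == '1301' and abs(_conf - 0.2) < 1e-9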
""" import collections as col import io import os from math import log10 from statistics import mean from typing import Tuple, Counter, Dict, List, Set from recentrifuge.config import Filename, Id, Score, Scoring from recentrifuge.config import gray, red, green, yellow, blue, magenta from recentrifuge.stats import SampleStats # CLARK specific constants UNCLASSIFIED: Id = Id('NA') K_MER_SIZE: int = 31 # Default k-mer size for CLARK(S) def read_clark_output( output_file: Filename, scoring: Scoring = Scoring.CLARK_C, minscore: Score = None, ) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]: """ Read CLARK(-l)(-S) full mode output file Args: output_file: output file name scoring: type of scoring to be applied (see Scoring class) minscore: minimum confidence level for the classification
def read_plasmids(self, plasmid_file: Filename) -> None:
    """Read, check and include plasmid data"""
    print('\033[90mLoading LMAT plasmids...\033[0m', end='')
    sys.stdout.flush()
    pattern1 = re.compile(
        r"""((?:"([\w\-\.\(\)/+=':,%\*\s]*)"$)|(?:^([\w\-\.\(\)/+=':\*\s]*(?:, (?:strain|isolate|plasmid) [\w\-/\.]*)*(?:, fragment \w*)?(?:, contig \w)?)(?:, a cloning vector)?(?=(?=(?:, (?:complete|partial) (?:plasmid |genomic )*(?:sequence|genome|cds|replicon))(?:\[sequence_id)*)|(?:, complete sequence)*, whole genome shotgun sequence|\[sequence_id)))"""  # pylint: disable=line-too-long
    )
    pattern2 = re.compile(
        r"""(^(?:[A-Za-z0-9/=\-\.{},]*(?: |.)){1,8})"""
    )
    match: Counter = col.Counter()
    try:
        with open(plasmid_file, 'r') as file:
            for line in file:
                _tid, _parent, *_, last = line.rstrip('\n').split('\t')
                last = last.split(r'|')[-1]
                tid = Id(_tid)
                parent = Id(_parent)
                # Plasmids sanity checks
                if tid in self.parents:  # if plasmid tid already in NCBI
                    match['ERR1'] += 1
                    if self.debug:
                        print(f'\033[93mPlasmid taxid ERROR!\033[0m'
                              f' Taxid={tid} already a NCBI taxid. '
                              f'Declared parent is {parent} but '
                              f'NCBI parent is {self.parents[tid]}.')
                        print('\tPlasmid details: ', last)
                    continue
                elif tid == parent:  # if plasmid and parent tids are equal
                    match['ERR2'] += 1
                    if self.debug:
                        print(f'\033[93mPlasmid parent taxid ERROR!\033[0m'
                              f' Taxid={tid} and parent={parent}.')
                        print('\t\t Plasmid details: ', last)
                    continue
                else:  # No problem, go ahead and add the plasmid!
                    self.parents[tid] = parent
                # Plasmid name extraction by regular expressions
                name: str
                try:
                    name = pattern1.search(last).group(1)  # type: ignore
                    name = 'Plasmid ' + name.strip(r'"').strip(',')
                except AttributeError:
                    try:
                        name = pattern2.search(  # type: ignore
                            last).group(1).strip()
                        name = 'Plasmid ' + name
                    except AttributeError:
                        name = 'Plasmid ' + tid
                        match['FAIL'] += 1
                    else:
                        match['PAT2'] += 1
                else:
                    match['PAT1'] += 1
                self.names[tid] = name
    except OSError:
        print('\033[93mWARNING\033[0m: Cannot read "' + plasmid_file
              + '". Plasmid taxids not loaded!')
        print(magenta('TIP:'),
              'Manual installation of the plasmids file required.')
        raise
    else:  # Statistics about plasmids
        print('\033[92m OK! \033[0m\n',
              '\033[90mPlasmid sanity check:\033[0m',
              f'\033[93m rejected\033[0m (taxid error) = {match["ERR1"]}',
              f'\033[93m rejected\033[0m (parent error) = {match["ERR2"]}')
        print('\033[90m Plasmid pattern matching:\033[0m',
              f'\033[90m 1st type =\033[0m {match["PAT1"]} ',
              f'\033[90m 2nd type =\033[0m {match["PAT2"]} ',
              f'\033[90m other =\033[0m {match["FAIL"]}')
def read_kraken_output(output_file: Filename,
                       scoring: Scoring = Scoring.KRAKEN,
                       minscore: Score = None,
                       ) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read Kraken output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_kmerel: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split('\t')
            if len(header) != 5:
                print(red('\nERROR! ') + 'Kraken output format of ',
                      yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'C/U, ID, taxid, length, list of mappings')
                print(magenta('Found:'), '\t'.join(header), end='')
                print(blue('HINT:'), 'Use Kraken or Kraken2 direct output.')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_clas, _label, _tid, _length,
                     _maps) = output_line.split('\t')
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = sum(map(int, _length.split('|')))
                    num_read += 1
                    nt_read += length
                    if _clas == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    tid: Id = Id(_tid)
                    maps: List[str] = _maps.split()
                    try:
                        maps.remove('|:|')
                    except ValueError:
                        pass
                    mappings: Counter[Id] = col.Counter()
                    for pair in maps:
                        couple: List[str] = pair.split(':')
                        mappings[Id(couple[0])] += int(couple[1])
                    # From Kraken score get "single hit equivalent length"
                    shel: Score = Score(mappings[tid] + K_MER_SIZE)
                    score: Score = Score(
                        mappings[tid] / sum(mappings.values())
                        * 100)  # % relative to all k-mers
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None:  # Decide if read ignored by low score
                    if scoring is Scoring.KRAKEN:
                        if score < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_kmerel[tid].append(score)
                except KeyError:
                    all_kmerel[tid] = [score, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter({tid: len(all_scores[tid])
                                       for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, lens=all_length,
        scores=all_scores, scores2=all_kmerel, seq_read=num_read,
        seq_unclas=num_uncl, seq_filt=filt_seqs, tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(gray(' Seqs fail: ') + red(f'{num_errors:_d}\t')
                     + gray('(Last error in read ')
                     + red(f'{last_error_read}') + gray(')\n'))
    output.write(gray(' Seqs read: ') + f'{stat.seq.read:_d}\t'
                 + gray('[') + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray(' Seqs clas: ') + f'{stat.seq.clas:_d}\t'
                 + gray('(') + f'{stat.get_unclas_ratio():.2%}'
                 + gray(' unclassified)\n'))
    output.write(gray(' Seqs pass: ') + f'{stat.seq.filt:_d}\t'
                 + gray('(') + f'{stat.get_reject_ratio():.2%}'
                 + gray(' rejected)\n'))
    output.write(gray(' Scores SHEL: min = ') + f'{stat.sco.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray(' Coverage(%): min = ') + f'{stat.sco2.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco2.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco2.mean:.1f}\n')
    output.write(gray(' Read length: min = ') + f'{stat.len.mini},'
                 + gray(' max = ') + f'{stat.len.maxi},'
                 + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(gray(' TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.KRAKEN:
        out_scores = {tid: Score(mean(all_kmerel[tid])) for tid in all_kmerel}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        print(red('ERROR!'), f'kraken: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
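
# Sketch of the k-mer mapping parsing above (sample fragment, made up): each
# 'taxid:count' pair is accumulated per taxid, and the '|:|' separator between
# paired-end mates is dropped before counting.
_maps = '562:10 |:| 562:5 0:3'.split()
_mappings = col.Counter()
for _pair in _maps:
    if _pair == '|:|':
        continue
    _taxid, _count = _pair.split(':')
    _mappings[Id(_taxid)] += int(_count)
assert _mappings[Id('562')] == 15 and _mappings[Id('0')] == 3
# The relative score is then mappings[tid] / sum(mappings.values()) * 100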
def __init__(self, frmt: str):

    def print_error(specifier):
        """GenericFormat constructor: print an informative error message"""
        print(red('ERROR!'), 'Generic --format string malformed:',
              blue(specifier), '\n\tPlease rerun with --help for details.')

    blocks: List[str] = [fld.strip() for fld in frmt.split(',')]
    if len(blocks) < self.MIN_FIELDS:
        print_error(f'Wrong number of fields (expected {self.MIN_FIELDS}, '
                    f'found {len(blocks)}).')
        exit(2)
    try:
        fmt: Dict[str, str] = {pair.split(':')[0].strip():
                               pair.split(':')[1].strip() for pair in blocks}
    except IndexError:
        print_error('All fields need ":" separator.')
        exit(2)
    # Populate fields
    try:
        typ = fmt['TYP']
    except KeyError:
        print_error('TYPe field is mandatory.')
        exit(2)
    try:
        self.typ: GenericType = GenericType[typ.upper()]
    except KeyError:
        print_error('Unknown file TYPe, valid options are '
                    + ' or '.join([str(t) for t in GenericType]))
        exit(2)
    try:
        self.tid: int = int(fmt['TID'])
    except KeyError:
        print_error('TaxID field is mandatory.')
        exit(2)
    except ValueError:
        print_error('TaxID field must be an integer column number.')
        exit(2)
    try:
        self.len: int = int(fmt['LEN'])
    except KeyError:
        print_error('LENgth field is mandatory.')
        exit(2)
    except ValueError:
        print_error('LENgth field must be an integer column number.')
        exit(2)
    try:
        self.sco: int = int(fmt['SCO'])
    except KeyError:
        print_error('SCOre field is mandatory.')
        exit(2)
    except ValueError:
        print_error('SCOre field must be an integer column number.')
        exit(2)
    try:
        self.unc: Id = Id(fmt['UNC'])
    except KeyError:
        print_error('UNClassified field is mandatory.')
        exit(2)
    # Check columns are different
    if (self.tid == self.len or self.tid == self.sco
            or self.len == self.sco):
        print_error('Different fields need different columns.')
        exit(2)
    # Check column numbers are positive
    if self.tid < 1 or self.len < 1 or self.sco < 1:
        print_error('Column numbers should be positive integers.')
        exit(2)
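
# Illustrative sketch of a generic --format string as parsed above (the
# column numbers are hypothetical, 1-based): a TSV file with the taxid in
# column 2, the score in column 3, the read length in column 4, and 'NA'
# marking unclassified reads, i.e. GenericFormat('TYP:tsv,TID:2,SCO:3,LEN:4,UNC:NA').
_frmt = 'TYP:tsv, TID:2, SCO:3, LEN:4, UNC:NA'
_blocks = [fld.strip() for fld in _frmt.split(',')]
_fmt = {pair.split(':')[0].strip(): pair.split(':')[1].strip()
        for pair in _blocks}
assert _fmt['TYP'] == 'tsv' and int(_fmt['TID']) == 2 and _fmt['UNC'] == 'NA'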
def read_generic_output(output_file: Filename,
                        scoring: Scoring = Scoring.GENERIC,
                        minscore: Score = None,
                        genfmt: GenericFormat = None
                        ) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read an output file from a generic classifier

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification
        genfmt: GenericFormat object specifying the file format

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    # Initialization of variables
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    # Check format
    if not isinstance(genfmt, GenericFormat):
        raise Exception(red('\nERROR!'),
                        'Missing GenericFormat when reading a generic output.')
    try:
        with open(output_file, 'r') as file:
            # Main loop processing each file line
            for raw_line in file:
                raw_line = raw_line.strip(' \n\t')
                splitting: str
                if genfmt.typ is GenericType.CSV:
                    splitting = ','
                elif genfmt.typ is GenericType.TSV:
                    splitting = '\t'
                elif genfmt.typ is GenericType.SSV:
                    splitting = ' '
                else:
                    raise Exception(f'ERROR! Unknown GenericType {genfmt.typ}')
                output_line: List[str] = raw_line.split(splitting)
                if len(output_line) < GenericFormat.MIN_COLS:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                              f'{output_file}')
                        continue  # Do not account for the header as an error
                    raise Exception(
                        red('\nERROR!') + ' Line ' + yellow(f'{output_line}')
                        + '\n\tin ' + yellow(f'{output_file}') + ' has < '
                        + blue(f'{GenericFormat.MIN_COLS}') + ' required '
                        + 'columns.\n\tPlease check the file.')
                try:
                    tid: Id = Id(output_line[genfmt.tid - 1].strip(' "'))
                    length: int = int(output_line[genfmt.len - 1].strip(' "'))
                    if tid == genfmt.unc:  # Avoid read score for unclass reads
                        num_read += 1
                        nt_read += length
                        num_uncl += 1
                        continue
                    score: Score = Score(
                        float(output_line[genfmt.sco - 1].strip(' "')))
                except ValueError:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                              f'{output_file}')
                        continue  # Do not account for the header as a failure
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    if num_read > 100 and num_errors > 0.5 * num_read:
                        print(red('ERROR!'),
                              'Unreliable file processing: the rate of '
                              'problematic reads is '
                              f'{num_errors / num_read:.0%}, beyond 50%, '
                              'after 100 reads. Please check the format '
                              f'of the file "{output_file}".')
                        raise
                    else:
                        continue
                num_read += 1
                nt_read += length
                taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and score < minscore:
                    continue  # Discard read if low confidence
                try:
                    all_scores[tid].append(score)
                except KeyError:
                    all_scores[tid] = [score, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter({tid: len(all_scores[tid])
                                       for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(red('\nERROR! ')
                        + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, lens=all_length,
        scores=all_scores, seq_read=num_read, seq_unclas=num_uncl,
        seq_filt=filt_seqs, tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(gray(' Seqs fail: ') + red(f'{num_errors:_d}\t')
                     + gray('(Last error in read ')
                     + red(f'{last_error_read}') + gray(')\n'))
    output.write(gray(' Seqs read: ') + f'{stat.seq.read:_d}\t'
                 + gray('[') + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray(' Seqs clas: ') + f'{stat.seq.clas:_d}\t'
                 + gray('(') + f'{stat.get_unclas_ratio():.2%}'
                 + gray(' unclassified)\n'))
    output.write(gray(' Seqs pass: ') + f'{stat.seq.filt:_d}\t'
                 + gray('(') + f'{stat.get_reject_ratio():.2%}'
                 + gray(' rejected)\n'))
    output.write(gray(' Scores: min = ') + f'{stat.sco.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray(' Read length: min = ') + f'{stat.len.mini},'
                 + gray(' max = ') + f'{stat.len.maxi},'
                 + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(gray(' TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.GENERIC:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        raise Exception(red('\nERROR!'),
                        f'Generic: Unsupported Scoring "{scoring}"')
    # Return
    return output.getvalue(), stat, counts, out_scores
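
# Hedged usage sketch (the file name and format string are hypothetical, not
# from the source): the generic reader is driven by a GenericFormat instance:
#
#   genfmt = GenericFormat('TYP:tsv, TID:2, SCO:3, LEN:4, UNC:NA')
#   log, stats, counts, scores = read_generic_output(
#       Filename('classifier.out'), scoring=Scoring.GENERIC,
#       minscore=Score(30), genfmt=genfmt)
#
# counts maps each taxid to its number of reads passing the filter, and
# scores maps each taxid to the value selected by the Scoring mode.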