def check_debug():
    """Report debugging mode status and dump the active parameters."""
    if not args.debug:
        return
    print(blue('INFO:'), gray('Debugging mode activated'))
    print(blue('INFO:'), gray('Active parameters:'))
    # Only show parameters with a truthy value (set by the user or default)
    for name, val in vars(args).items():
        if val:
            print(gray(f'\t{name} ='), f'{val}')
def check_controls():
    """Validate the number of control samples and list them."""
    if not args.controls:
        return
    # There cannot be more controls than input samples
    if args.controls > len(input_files):
        print(red(' ERROR!'), gray('More controls than samples'))
        exit(1)
    print(gray('Control(s) sample(s) for subtractions:'))
    # By convention, the first args.controls input files are the controls
    for ctrl in input_files[:args.controls]:
        print(blue(f'\t{ctrl}'))
def read_mock_files(mock: Filename) -> Counter[Id]:
    """Parse a mock layout (.mck) file into a taxid -> read-count Counter."""
    layout: Counter[Id] = col.Counter()
    with open(mock, 'r') as mck:
        vprint(gray('\nProcessing'), blue(mock), gray('file:\n'))
        for raw in mck:
            if raw.startswith('#'):  # skip comment lines
                continue
            # Each data line is "<taxid>\t<number of reads>"
            tid_field, num_field = raw.split('\t')
            taxid = Id(tid_field)
            reads = int(num_field)
            layout[taxid] = reads
            vprint(reads, gray('\treads for taxid\t'), taxid, '\t(',
                   cyan(ncbi.get_name(taxid)), ')\n')
    return layout
def mock_from_scratch(out: Filename, mock_layout: Counter[TaxId]) -> None:
    """Generate a mock Centrifuge output file from scratch"""
    with open(out, 'w') as handle:
        vprint(gray('Generating'), blue(out), gray('file... '))
        handle.write('readID\tseqID\ttaxID\tscore\t2ndBestScore\t'
                     'hitLength\tqueryLength\tnumMatches\n')
        written: int = 0
        for tid, num_reads in mock_layout.items():
            # Random per-taxid ceiling for the simulated hit length
            maxhl: int = random.randint(args.random + 1, MAX_HIT_LENGTH)
            rank: str = str(ncbi.get_rank(tid)).lower()
            for _ in range(num_reads):
                hlen = random.randint(args.random + 1, maxhl)
                # Centrifuge-style score is (hitLength - 15)^2
                handle.write(f'test{written}\t{rank}\t'
                             f'{tid}\t{(hlen - 15) ** 2}\t'
                             f'0\t{hlen}\t{MAX_HIT_LENGTH}\t1\n')
                written += 1
        vprint(written, 'reads', green('OK!\n'))
def mock_from_scratch(out: Filename, mock_layout: Counter[Id]) -> None:
    """Generate a mock Centrifuge output file from scratch"""
    with open(out, 'w') as handle:
        vprint(gray('Generating'), blue(out), gray('file... '))
        handle.write('readID\tseqID\ttaxID\tscore\t2ndBestScore\t'
                     'hitLength\tqueryLength\tnumMatches\n')
        written: int = 0
        for raw_tid in mock_layout:
            tid = Id(raw_tid)  # Convert to Id the excel integer
            # Random per-taxid ceiling for the simulated hit length
            maxhl: int = random.randint(rnd + 1, MAX_HIT_LENGTH)
            rank: str = str(ncbi.get_rank(tid)).lower()
            for _ in range(int(mock_layout[raw_tid])):
                hlen = random.randint(rnd + 1, maxhl)
                # Centrifuge-style score is (hitLength - 15)^2
                handle.write(f'test{written}\t{rank}\t'
                             f'{tid}\t{(hlen - 15) ** 2}\t'
                             f'0\t{hlen}\t{MAX_HIT_LENGTH}\t1\n')
                written += 1
        vprint(written, 'reads', green('OK!\n'))
    if out == TEST_REXT_SMPL:  # Test mode: create mock FASTQ for smpl
        mock_fastq(written)
def mock_from_source(out: Filename, mock_layout: Counter[Id]) -> None:
    """Generate a mock Centrifuge output file from source file"""
    with open(out, 'w') as fout, open(args.file) as fcfg:
        vprint(gray('Generating'), blue(out), gray('file... '))
        fout.write(fcfg.readline())  # copy cfg output file header
        copied: int = 0
        for line in fcfg:
            tid = Id(line.split('\t')[2])  # taxID is the 3rd column
            if not mock_layout[tid]:
                continue  # this taxid already has enough reads copied
            fout.write(line)
            mock_layout[tid] -= 1
            copied += 1
            if not sum(mock_layout.values()):
                # Every requested read has been copied: done
                vprint(copied, 'reads', green('OK!\n'))
                break
    if sum(mock_layout.values()):
        # Source file exhausted before fulfilling the layout
        print(red('ERROR!\n'))
        print(gray('Incomplete read copy by taxid:'))
        mock_layout = +mock_layout  # Delete zero counts elements
        for tid in mock_layout:
            print(yellow(mock_layout[tid]), gray('reads missing for tid'),
                  tid, '(', cyan(ncbi.get_name(tid)), ')\n')
def read_clark_output(output_file: Filename,
                      scoring: Scoring = Scoring.CLARK_C,
                      minscore: Score = None,
                      ) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read CLARK(-l)(-S) full mode output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict
    """
    output: io.StringIO = io.StringIO(newline='')
    # Per-taxid accumulators for the different score flavors and read lengths
    all_scores: Dict[Id, List[Score]] = {}
    all_confs: Dict[Id, List[Score]] = {}
    all_gammas: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()  # every taxid the classifier assigned
    num_read: int = 0  # reads successfully parsed
    nt_read: int = 0  # total nucleotides in parsed reads
    num_uncl: int = 0  # unclassified reads
    last_error_read: int = -1  # Number of read of the last error
    num_errors: int = 0  # Number or reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header (CLARK full mode has 8 columns)
            header = file.readline().split(',')
            if len(header) != 8:
                print(red('\nERROR! ') + 'CLARK output format of ',
                      yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'ID,Length,Gamma,1st,score1,2nd,score2,conf')
                print(magenta('Found:'), ','.join(header), end='')
                print(blue('HINT:'), 'Use CLARK, CLARK-l, or CLARK-S '
                      'with full mode (', blue('-m 0'), ')')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                # 1st parsing stage: split the CSV fields
                try:
                    output_line = raw_line.strip()
                    (_label, _length, _gamma, _tid1, _score1, _tid2, _score2,
                     _conf) = output_line.split(',')
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                # 2nd parsing stage: convert fields to their numeric types
                try:
                    length: int = int(_length)
                    gamma: Score = Score(float(_gamma))
                    tid1: Id = Id(_tid1)
                    score1: Score = Score(float(_score1))
                    tid2: Id = Id(_tid2)
                    score2: Score = Score(float(_score2))
                    conf: Score = Score(float(_conf))
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                # Select tid and score between CLARK assignments 1 and 2
                tid: Id = tid1
                score: Score = score1
                if tid1 == UNCLASSIFIED:
                    if tid2 == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    else:  # Majority of read unclassified
                        tid = tid2
                        score = score2
                        conf = Score(1 - conf)  # Get CLARK's h2/(h1+h2)
                # From CLARK_C(S) score get "single hit equivalent length"
                shel: Score = Score(score + K_MER_SIZE)
                taxids.add(tid)  # Save all the selected tids (tid1 or tid2)
                if minscore is not None:  # Decide if ignore read if low score
                    if scoring is Scoring.CLARK_C:
                        if conf < minscore:
                            continue
                    elif scoring is Scoring.CLARK_G:
                        if gamma < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                # EAFP append-or-create for each per-taxid accumulator
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_confs[tid].append(conf)
                except KeyError:
                    all_confs[tid] = [conf, ]
                try:
                    all_gammas[tid].append(gamma)
                except KeyError:
                    all_gammas[tid] = [gamma, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    # Abundances: number of filtered reads per taxid
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid]) for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ')
            + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, lens=all_length,
        scores=all_scores, scores2=all_confs, scores3=all_gammas,
        seq_read=num_read, seq_unclas=num_uncl, seq_filt=filt_seqs,
        tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(gray(' Seqs fail: ') + red(f'{num_errors:_d}\t')
                     + gray('(Last error in read ') + red(f'{last_error_read}')
                     + gray(')\n'))
    output.write(gray(' Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[')
                 + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray(' Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(')
                 + f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(gray(' Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(')
                 + f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(gray(' Hit (score): min = ') + f'{stat.sco.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray(' Conf. score: min = ') + f'{stat.sco2.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco2.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco2.mean:.1f}\n')
    output.write(gray(' Gamma score: min = ') + f'{stat.sco3.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco3.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco3.mean:.1f}\n')
    output.write(gray(' Read length: min = ') + f'{stat.len.mini},'
                 + gray(' max = ') + f'{stat.len.maxi},'
                 + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(gray(' TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output depending on the requested scoring scheme
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.CLARK_C:
        out_scores = {tid: Score(mean(all_confs[tid]) * 100)
                      for tid in all_confs}
    elif scoring is Scoring.CLARK_G:
        out_scores = {tid: Score(mean(all_gammas[tid])) for tid in all_gammas}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        # Mean score normalized by mean read length, as a percentage
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        print(red('ERROR!'), f'clark: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
def robust_contamination_removal():
    """Implement robust contamination removal algorithm.

    Nested closure: reads raws, controls, accs, ontology,
    exclude_candidates and the module thresholds from the enclosing
    scope, and rebinds exclude_sets / shared_crossover via nonlocal.
    """
    nonlocal exclude_sets, shared_crossover

    def compute_qn(data: List[float], dist: str = "Gauss") -> float:
        """Compute Qn robust estimator of scale (Rousseeuw, 1993)"""
        c_d: float  # Select d parameter depending on the distribution
        if dist == "Gauss":
            c_d = 2.2219
        elif dist == "Cauchy":  # Heavy-tailed distribution
            c_d = 1.2071
        elif dist == "NegExp":  # Negative exponential (asymetric)
            c_d = 3.4760
        else:
            raise Exception(red('\nERROR! ') + 'Unknown distribution')
        num: int = len(data)
        sort_data = sorted(data)
        # All pairwise absolute differences (O(n^2) pairs)
        pairwisedifs: List[float] = []
        for (i, x_val) in enumerate(sort_data):
            for y_val in sort_data[i + 1:]:
                pairwisedifs.append(abs(x_val - y_val))
        # Qn is the k-th order statistic of the pairwise differences
        k: int = int(num * (num / 2 + 1) / 4)
        return c_d * sorted(pairwisedifs)[k - 1]

    exclude_sets = {smpl: set() for smpl in raws[controls:]}
    vwrite(gray('Robust contamination removal: '
                'Searching for contaminants...\n'))
    for tid in exclude_candidates:
        # Relative frequency of the taxon in controls and in samples
        relfreq_ctrl: List[float] = [
            accs[ctrl][tid] / accs[ctrl][ontology.ROOT]
            for ctrl in raws[:controls]]
        relfreq_smpl: List[float] = [
            accs[smpl][tid] / accs[smpl][ontology.ROOT]
            for smpl in raws[controls:]]
        relfreq: List[float] = relfreq_ctrl + relfreq_smpl
        crossover: List[bool]  # Crossover source (yes/no)
        # Just-controls contamination check
        if all([rf < EPS for rf in relfreq_smpl]):
            vwrite(cyan('just-ctrl:\t'), tid, ontology.get_name(tid),
                   gray('relfreq:'),
                   fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
            continue  # Go for next candidate
        # Critical contamination check: in ALL controls above severe threshold
        if all([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
            vwrite(red('critical:\t'), tid, ontology.get_name(tid),
                   gray('relfreq:'),
                   fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
            for exclude_set in exclude_sets.values():
                exclude_set.add(tid)
            continue  # Go for next candidate
        # Severe contamination check: in ANY control above severe threshold
        if any([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
            vwrite(yellow('severe: \t'), tid, ontology.get_name(tid),
                   gray('relfreq:'),
                   fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
            for exclude_set in exclude_sets.values():
                exclude_set.add(tid)
            continue  # Go for next candidate
        # Mild contamination check: in ALL controls above mild threshold
        if all([rf > MILD_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
            vwrite(blue('mild cont:\t'), tid, ontology.get_name(tid),
                   gray('relfreq:'),
                   fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
            for exclude_set in exclude_sets.values():
                exclude_set.add(tid)
            continue  # Go for next candidate
        # Calculate median and MAD median but including controls
        mdn: float = statistics.median(relfreq)
        # mad:float=statistics.mean([abs(mdn - rf) for rf in relfreq])
        q_n: float = compute_qn(relfreq, dist="NegExp")
        # Calculate crossover in samples: outlier AND orders of magnitude
        # above the maximum control relative frequency
        outlier_lim: float = mdn + ROBUST_XOVER_OUTLIER * q_n
        ordomag_lim: float = max(relfreq_ctrl) * 10 ** ROBUST_XOVER_ORD_MAG
        crossover = [rf > outlier_lim and rf > ordomag_lim
                     for rf in relfreq[controls:]]
        # Crossover contamination check
        if any(crossover):
            vwrite(magenta('crossover:\t'), tid, ontology.get_name(tid),
                   green(f'lims: [{outlier_lim:.1g}]'
                         + ('<' if outlier_lim < ordomag_lim else '>')
                         + f'[{ordomag_lim:.1g}]'),
                   gray('relfreq:'),
                   fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                   gray('crossover:'), blst2str(crossover), '\n')
            # Exclude just for contaminated samples (not the source)
            vwrite(magenta('\t->'), gray(f'Include {tid} just in:'))
            for i in range(len(raws[controls:])):
                if not crossover[i]:
                    exclude_sets[raws[i + controls]].add(tid)
                else:
                    vwrite(f' {raws[i + controls]}')
            if all(crossover):  # Shared taxon contaminating control(s)
                vwrite(' (', yellow('Shared crossover taxon!'), ')')
                shared_crossover.add(tid)
            vwrite('\n')
            continue
        # Other contamination: remove from all samples
        vwrite(gray('other cont:\t'), tid, ontology.get_name(tid),
               green(f'lims: [{outlier_lim:.1g}]'
                     + ('<' if outlier_lim < ordomag_lim else '>')
                     + f'[{ordomag_lim:.1g}]'),
               gray('relfreq:'),
               fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
        for exclude_set in exclude_sets.values():
            exclude_set.add(tid)
def process_output(*args, **kwargs
                   ) -> Tuple[Sample, TaxTree, SampleDataByTaxId,
                              SampleStats, Err]:
    """
    Process Centrifuge/LMAT output files (to be usually called in parallel!).

    Positional args: args[0] is the target file name, args[1] flags a
    control sample. Keyword args carry the run configuration (taxonomy,
    mintaxa/minscore, scoring, lmat, debug, root, ...).

    Returns:
        sample name, taxonomy tree, per-sample data, statistics, error code
    """
    # timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    taxonomy: Taxonomy = kwargs['taxonomy']
    # Controls may use their own thresholds (ctrlmintaxa/ctrlminscore)
    mintaxa: int = kwargs['ctrlmintaxa'] if is_ctrl else kwargs['mintaxa']
    minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore']
    including: Set[TaxId] = taxonomy.including
    excluding: Set[TaxId] = taxonomy.excluding
    scoring: Scoring = kwargs['scoring']
    lmat: bool = kwargs['lmat']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read Centrifuge/LMAT output files to get abundances
    read_method: Callable[[Filename, Scoring, Optional[Score]],  # Input
                          Tuple[str, SampleStats,
                                Counter[TaxId], Dict[TaxId, Score]]  # Output
                          ]
    if lmat:
        read_method = read_lmat_output
    else:
        read_method = read_output
    log: str
    counts: Counter[TaxId]
    scores: Dict[TaxId, Score]
    log, stat, counts, scores = read_method(target_file, scoring, minscore)
    output.write(log)
    # Update field in stat about control nature of the sample
    stat.is_ctrl = is_ctrl
    # Move cellular_organisms counts to root, in case
    if taxonomy.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... '))
        if counts[ROOT]:
            stat.num_taxa -= 1
            # Merge scores as a read-count weighted average
            scores[ROOT] = (
                (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS]
                 + scores[ROOT] * counts[ROOT])
                / (counts[CELLULAR_ORGANISMS] + counts[ROOT]))
        else:
            scores[ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, in case
    if kwargs['root'] and counts[ROOT]:
        vwrite(gray('Removing'), counts[ROOT], gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt - counts[ROOT])
        stat.num_taxa -= 1
        counts[ROOT] = 0
        scores[ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')
    # Building taxonomy tree
    output.write(gray('Building from raw data... '))
    vwrite(gray('\n  Building taxonomy tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[TaxId]
    orphans: Set[TaxId]
    ancestors, orphans = taxonomy.get_ancestors(counts.keys())
    out = SampleDataByTaxId(['all'])
    tree.allin1(taxonomy=taxonomy, counts=counts, scores=scores,
                ancestors=ancestors, min_taxa=mintaxa,
                include=including, exclude=excluding, out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')
    # Give stats about orphan taxid (taxids with no known ancestry)
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('Warning!'), f'Orphan taxid={orphan}\n')
                lost += counts[orphan]
            vwrite(yellow('WARNING!'),
                   f'{len(orphans)} orphan taxids ('
                   f'{len(orphans)/len(counts):.2%} of total)\n'
                   f'{lost} orphan sequences ('
                   f'{lost/sum(counts.values()):.3%} of total)\n')
        else:
            vwrite(green('OK!\n'))
    # Check the lost of taxids (plasmids typically) under some conditions
    if debug and not excluding and not including:
        vwrite(gray('  Additional checking of taxid loss... '))
        lost = 0
        for taxid in counts:
            if not out.counts[taxid]:
                lost += 1
                vwrite(yellow('Warning!'), f'Lost taxid={taxid}: '
                       f'{taxonomy.get_name(taxid)}\n')
        if lost:
            vwrite(yellow('WARNING!'), f'Lost {lost} taxids ('
                   f'{lost/len(counts):.2%} of total)'
                   '\n')
        else:
            vwrite(green('OK!\n'))
    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ')
                     + green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE
    # Timing results
    output.write(gray('Load elapsed time: ')
                 + f'{time.perf_counter() - start_time:.3g}' + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
def process_output(*args, **kwargs
                   ) -> Tuple[Sample, TaxTree, SampleDataById,
                              SampleStats, Err]:
    """
    Process classifiers output files (to be usually called in parallel!).

    Positional args: args[0] is the target file name, args[1] flags a
    control sample. Keyword args carry the run configuration (ontology,
    mintaxa/minscore, scoring, classifier, genfmt, debug, root, ...).

    Returns:
        sample name, taxonomy tree, per-sample data, statistics, error code
    """
    # timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    ontology: Ontology = kwargs['ontology']
    # Controls may use their own thresholds (ctrlmintaxa/ctrlminscore)
    mintaxa: Optional[int] = (kwargs['ctrlmintaxa'] if is_ctrl
                              else kwargs['mintaxa'])
    minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore']
    including: Union[Tuple, Set[Id]] = ontology.including
    excluding: Union[Tuple, Set[Id]] = ontology.excluding
    scoring: Scoring = kwargs['scoring']
    classifier: Classifier = kwargs['classifier']
    genfmt: GenericFormat = kwargs['genfmt']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read taxonomic classifier output files to get abundances
    read_method: Callable[  # Format: [[Input], Output]
        [Filename, Scoring, Optional[Score]],
        Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]]
    log: str
    stat: SampleStats
    counts: Counter[Id]
    scores: Dict[Id, Score]
    if classifier is Classifier.GENERIC:  # Direct call to generic method
        log, stat, counts, scores = read_generic_output(
            target_file, scoring, minscore, genfmt)
    else:  # Use read_method dispatched on the classifier
        if classifier is Classifier.KRAKEN:
            read_method = read_kraken_output
        elif classifier is Classifier.CLARK:
            read_method = read_clark_output
        elif classifier is Classifier.LMAT:
            read_method = read_lmat_output
        elif classifier is Classifier.CENTRIFUGE:
            read_method = read_output
        else:
            raise Exception(red('\nERROR!'),
                            f'taxclass: Unknown classifier "{classifier}".')
        log, stat, counts, scores = read_method(target_file, scoring, minscore)
    output.write(log)
    # Complete/Update fields in stats
    stat.is_ctrl = is_ctrl  # set control nature of the sample
    if mintaxa is not None:  # manual mintaxa has precedence over automatic
        stat.mintaxa = mintaxa
    else:  # update local value with the automatically guessed value
        mintaxa = stat.mintaxa
    # Move cellular_organisms counts to root, in case
    if ontology.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... \n'))
        if counts[ontology.ROOT]:
            stat.decrease_filtered_taxids()
            # Merge scores as a read-count weighted average
            scores[ontology.ROOT] = Score(
                (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS]
                 + scores[ontology.ROOT] * counts[ontology.ROOT])
                / (counts[CELLULAR_ORGANISMS] + counts[ontology.ROOT]))
        else:
            scores[ontology.ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ontology.ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, in case
    if kwargs['root'] and counts[ontology.ROOT]:
        vwrite(gray('Removing'), counts[ontology.ROOT],
               gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt
                                     - counts[ontology.ROOT])
        stat.decrease_filtered_taxids()
        counts[ontology.ROOT] = 0
        scores[ontology.ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')
    # Building ontology tree
    output.write(gray('Building from raw data with mintaxa = ')
                 + f'{mintaxa:_d}' + gray(' ... \n'))
    vwrite(gray('  Building ontology tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[Id]
    orphans: Set[Id]
    ancestors, orphans = ontology.get_ancestors(counts.keys())
    out = SampleDataById(['all'])
    tree.allin1(ontology=ontology, counts=counts, scores=scores,
                ancestors=ancestors, min_taxa=mintaxa,
                include=including, exclude=excluding, out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')
    # Stats: Complete final value for TaxIDs after tree building and folding
    final_taxids: int = len(out.counts) if out.counts is not None else 0
    stat.set_final_taxids(final_taxids)
    # Check for additional loss of reads (due to include/exclude an orphans)
    output.write(gray('  Check for more seqs lost ([in/ex]clude affects)... '))
    if out.counts is not None:
        discard: int = sum(counts.values()) - sum(out.counts.values())
        if discard:
            output.write(blue('\n  Info:') + f' {discard} '
                         + gray('additional seqs discarded (')
                         + f'{discard/sum(counts.values()):.3%} '
                         + gray('of accepted)\n'))
        else:
            output.write(green('OK!\n'))
    else:
        output.write(red('No counts in sample tree!\n'))
    # Warn or give detailed stats about orphan taxid and orphan seqs
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('  Warning!'), gray('Orphan taxid'),
                       f'{orphan}\n')
                lost += counts[orphan]
            vwrite(yellow('  WARNING!'),
                   f'{len(orphans)} orphan taxids ('
                   f'{len(orphans)/len(counts):.2%} of accepted)\n'
                   f' and {lost} orphan sequences ('
                   f'{lost/sum(counts.values()):.3%} of accepted)\n')
        else:
            vwrite(green('OK!\n'))
    elif orphans:  # Not debug but there are orphans: just warn
        output.write(yellow('\n  Warning!') + f' {len(orphans)} orphan taxids'
                     + gray(' (rerun with --debug for details)\n'))
    # Check the removal of TaxIDs (accumulation of leaves in parents)
    if debug and not excluding and including == {ontology.ROOT}:
        vwrite(gray('  Assess accumulation due to "folding the tree"...\n'))
        migrated: int = 0
        if out.counts is not None:
            for taxid in counts:
                if out.counts[taxid] == 0:
                    migrated += 1
                    vwrite(blue('  Info:'),
                           gray(f'Folded TaxID {taxid} (')
                           + f'{ontology.get_name(taxid)}' + gray(') with ')
                           + f'{counts[taxid]}' + gray(' original seqs\n'))
        if migrated:
            vwrite(blue('  INFO:'),
                   f'{migrated} TaxIDs folded ('
                   f'{migrated/len(+counts):.2%} of TAF '
                   '—TaxIDs after filtering—)'
                   '\n')
            vwrite(blue('  INFO:'),
                   f'Final assigned TaxIDs: {final_taxids} '
                   f'(reduced to {final_taxids/len(+counts):.2%} of '
                   'number of TAF)\n')
        else:
            vwrite(blue('  INFO:'), gray('No migration!'), green('OK!\n'))
    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ')
                     + green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE
    # Timing results
    output.write(gray('Load elapsed time: ')
                 + f'{time.perf_counter() - start_time:.3g}' + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
def read_kraken_output(output_file: Filename,
                       scoring: Scoring = Scoring.KRAKEN,
                       minscore: Score = None,
                       ) -> Tuple[str, SampleStats,
                                  Counter[Id], Dict[Id, Score]]:
    """
    Read Kraken output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict
    """
    output: io.StringIO = io.StringIO(newline='')
    # Per-taxid accumulators for scores, k-mer coverage and read lengths
    all_scores: Dict[Id, List[Score]] = {}
    all_kmerel: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()  # every taxid assigned to a classified read
    num_read: int = 0  # reads successfully parsed
    nt_read: int = 0  # total nucleotides in parsed reads
    num_uncl: int = 0  # unclassified reads
    last_error_read: int = -1  # Number of read of the last error
    num_errors: int = 0  # Number or reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header (Kraken direct output has 5)
            header = file.readline().split('\t')
            if len(header) != 5:
                print(red('\nERROR! ') + 'Kraken output format of ',
                      yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'C/U, ID, taxid, length, list of mappings')
                print(magenta('Found:'), '\t'.join(header), end='')
                print(blue('HINT:'), 'Use Kraken or Kraken2 direct output.')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                # 1st parsing stage: split the TSV fields
                try:
                    output_line = raw_line.strip()
                    (_clas, _label, _tid, _length,
                     _maps) = output_line.split('\t')
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                # 2nd parsing stage: convert fields and derive the scores
                try:
                    # Paired-end lengths come as "len1|len2": sum them
                    length: int = sum(map(int, _length.split('|')))
                    num_read += 1
                    nt_read += length
                    if _clas == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    tid: Id = Id(_tid)
                    maps: List[str] = _maps.split()
                    try:
                        maps.remove('|:|')  # paired-end mappings separator
                    except ValueError:
                        pass
                    mappings: Counter[Id] = col.Counter()
                    for pair in maps:
                        couple: List[str] = pair.split(':')
                        mappings[Id(couple[0])] += int(couple[1])
                    # From Kraken score get "single hit equivalent length"
                    shel: Score = Score(mappings[tid] + K_MER_SIZE)
                    score: Score = Score(mappings[tid] / sum(mappings.values())
                                         * 100)  # % relative to all k-mers
                except ValueError:
                    print(yellow('Failure'), 'parsing line elements:'
                          f' {output_line} in {output_file}'
                          '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None:  # Decide if ignore read if low score
                    if scoring is Scoring.KRAKEN:
                        if score < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                # EAFP append-or-create for each per-taxid accumulator
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [shel, ]
                try:
                    all_kmerel[tid].append(score)
                except KeyError:
                    all_kmerel[tid] = [score, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    # Abundances: number of filtered reads per taxid
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid]) for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ')
            + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read, lens=all_length,
        scores=all_scores, scores2=all_kmerel,
        seq_read=num_read, seq_unclas=num_uncl, seq_filt=filt_seqs,
        tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(gray(' Seqs fail: ') + red(f'{num_errors:_d}\t')
                     + gray('(Last error in read ') + red(f'{last_error_read}')
                     + gray(')\n'))
    output.write(gray(' Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[')
                 + f'{stat.nt_read}' + gray(']\n'))
    output.write(gray(' Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(')
                 + f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(gray(' Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(')
                 + f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(gray(' Scores SHEL: min = ') + f'{stat.sco.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(gray(' Coverage(%): min = ') + f'{stat.sco2.mini:.1f},'
                 + gray(' max = ') + f'{stat.sco2.maxi:.1f},'
                 + gray(' avr = ') + f'{stat.sco2.mean:.1f}\n')
    output.write(gray(' Read length: min = ') + f'{stat.len.mini},'
                 + gray(' max = ') + f'{stat.len.maxi},'
                 + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(gray(' TaxIds: by classifier = ') + f'{stat.tid.clas}'
                 + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output depending on the requested scoring scheme
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.KRAKEN:
        out_scores = {tid: Score(mean(all_kmerel[tid])) for tid in all_kmerel}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        # Mean score normalized by mean read length, as a percentage
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        print(red('ERROR!'), f'kraken: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
def print_error(specifier):
    """GenericFormat constructor: print an informative error message"""
    # Show the offending --format specifier and point the user to --help
    message = (red('ERROR!'), 'Generic --format string malformed:',
               blue(specifier), '\n\tPlease rerun with --help for details.')
    print(*message)
def read_generic_output(
        output_file: Filename,
        scoring: Scoring = Scoring.GENERIC,
        minscore: Score = None,
        genfmt: GenericFormat = None
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read an output file from a generic classifier

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification
            (None disables the score filter)
        genfmt: GenericFormat object specifying the files format
            (mandatory despite the default; checked at runtime)

    Returns:
        log string, statistics, abundances counter, scores dict

    Raises:
        Exception: if genfmt is missing, the file cannot be read or
            contains no sequence, a line has fewer than the required
            columns, no sequence passes the filter, or the scoring
            scheme is unsupported
    """
    # Initialization of variables
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Number of read of the last error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    # Check format
    if not isinstance(genfmt, GenericFormat):
        raise Exception(
            red('\nERROR!'),
            'Missing GenericFormat when reading a generic output.')
    # The field separator only depends on the format: select it once,
    # outside the per-line loop (it was recomputed for every line)
    splitting: str
    if genfmt.typ is GenericType.CSV:
        splitting = ','
    elif genfmt.typ is GenericType.TSV:
        splitting = '\t'
    elif genfmt.typ is GenericType.SSV:
        splitting = ' '
    else:
        raise Exception(f'ERROR! Unknown GenericType {genfmt.typ}')
    try:
        with open(output_file, 'r') as file:
            # Main loop processing each file line
            for raw_line in file:
                raw_line = raw_line.strip(' \n\t')
                output_line: List[str] = raw_line.split(splitting)
                if len(output_line) < GenericFormat.MIN_COLS:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                                                  f'{output_file}')
                        continue  # Not account for the header as an error
                    raise Exception(
                        red('\nERROR!') + ' Line ' + yellow(f'{output_line}')
                        + '\n\tin ' + yellow(f'{output_file}') + ' has < '
                        + blue(f'{GenericFormat.MIN_COLS}') + ' required '
                        + 'columns.\n\tPlease check the file.')
                try:
                    # Column numbers in genfmt are 1-based, hence the -1
                    tid: Id = Id(output_line[genfmt.tid - 1].strip(' "'))
                    length: int = int(output_line[genfmt.len - 1].strip(' "'))
                    if tid == genfmt.unc:  # Avoid read score for unclass reads
                        num_read += 1
                        nt_read += length
                        num_uncl += 1
                        continue
                    score: Score = Score(
                        float(output_line[genfmt.sco - 1].strip(' "')))
                except ValueError:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                                                  f'{output_file}')
                        continue  # Not account for the header as a failure
                    print(yellow('Failure'), 'parsing line elements:'
                                             f' {output_line} in {output_file}'
                                             '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    # Give up if, after 100 reads, more than half failed
                    if num_read > 100 and num_errors > 0.5 * num_read:
                        # NOTE: integer division keeps the ':_d' format
                        #  valid (true division yields a float, which
                        #  would raise ValueError with format code 'd')
                        print(
                            red('ERROR!'),
                            'Unreliable file processing: rate of problematic'
                            f' reads is {100 * num_errors // num_read:_d},'
                            ' beyond 50%, after 100 reads. Please check the'
                            f' format of the file "{output_file}".')
                        raise
                    else:
                        continue
                num_read += 1
                nt_read += length
                taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and score < minscore:
                    continue  # Discard read if low confidence
                # EAFP: append to the per-taxid lists, creating on 1st hit
                try:
                    all_scores[tid].append(score)
                except KeyError:
                    all_scores[tid] = [score, ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [length, ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid]) for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ')
            + f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum(len(scores) for scores in all_scores.values())
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(
        minscore=minscore, nt_read=nt_read,
        lens=all_length, scores=all_scores,
        seq_read=num_read, seq_unclas=num_uncl,
        seq_filt=filt_seqs, tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray(' Seqs fail: ') + red(f'{num_errors:_d}\t')
            + gray('(Last error in read ') + red(f'{last_error_read}')
            + gray(')\n'))
    output.write(
        gray(' Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[')
        + f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray(' Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(')
        + f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    # NOTE(review): the next write had its "') + f'" tokens destroyed by
    #  a scrubber ('******' artifact); restored from the sibling lines
    output.write(
        gray(' Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(')
        + f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray(' Scores: min = ') + f'{stat.sco.mini:.1f},'
        + gray(' max = ') + f'{stat.sco.maxi:.1f},'
        + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray(' Read length: min = ') + f'{stat.len.mini},'
        + gray(' max = ') + f'{stat.len.maxi},'
        + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray(' TaxIds: by classifier = ') + f'{stat.tid.clas}'
        + gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output according to the scoring scheme
    out_scores: Dict[Id, Score]
    if scoring is Scoring.GENERIC:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {tid: Score(log10(mean(all_length[tid])))
                      for tid in all_length}
    elif scoring is Scoring.NORMA:
        # Score normalized by read length, as a percentage
        scores: Dict[Id, Score] = {tid: Score(mean(all_scores[tid]))
                                   for tid in all_scores}
        lengths: Dict[Id, Score] = {tid: Score(mean(all_length[tid]))
                                    for tid in all_length}
        out_scores = {tid: Score(scores[tid] / lengths[tid] * 100)
                      for tid in scores}
    else:
        raise Exception(red('\nERROR!'),
                        f'Generic: Unsupported Scoring "{scoring}"')
    # Return
    return output.getvalue(), stat, counts, out_scores