def cross_analysis(iteration, raw):
    """Cross analysis: exclusive and part of shared & ctrl"""
    nonlocal shared_counts, shared_score
    nonlocal shared_ctrl_counts, shared_ctrl_score

    def partial_shared_update(i):
        """Perform shared and shared-control taxa partial evaluations"""
        nonlocal shared_counts, shared_score
        nonlocal shared_ctrl_counts, shared_ctrl_score
        if i == 0:  # 1st iteration: Initialize shared abundance and score
            shared_counts.update(sub_shared_counts)
            shared_score.update(sub_shared_score)
        elif i < controls:  # Just update shared abundance and score
            shared_counts &= sub_shared_counts
            shared_score &= sub_shared_score
        elif i == controls:  # Initialize shared-control counters
            shared_counts &= sub_shared_counts
            shared_score &= sub_shared_score
            shared_ctrl_counts.update(sub_shared_counts)
            shared_ctrl_score.update(sub_shared_score)
        elif controls:  # Accumulate both shared and shared-control counters
            shared_counts &= sub_shared_counts
            shared_score &= sub_shared_score
            shared_ctrl_counts &= sub_shared_counts
            shared_ctrl_score &= sub_shared_score
        else:  # No controls: Accumulate shared abundance and score only
            shared_counts &= sub_shared_counts
            shared_score &= sub_shared_score

    exclude: Set[Id] = set()
    # Get taxids at this rank that are present in the other samples
    for sample in (smpl for smpl in raws if smpl != raw):
        exclude.update(taxids[sample][rank])
    exclude.update(excluding)  # Add explicit excluding taxa, if any
    output.write(f'  \033[90mExclusive: From \033[0m{raw}\033[90m '
                 f'excluding {len(exclude)} taxa. '
                 f'Generating sample...\033[0m')
    exclude_tree = TaxTree()
    exclude_out = SampleDataById(['counts', 'scores', 'accs'])
    exclude_tree.allin1(ontology=ontology,
                        counts=counts[raw],
                        scores=scores[raw],
                        min_taxa=mintaxas[raw],
                        min_rank=rank,
                        just_min_rank=True,
                        include=including,
                        exclude=exclude,
                        out=exclude_out)
    exclude_out.purge_counters()
    if exclude_out.counts:  # Avoid adding empty samples
        sample = Sample(f'{raw}_{STR_EXCLUSIVE}_{rank.name.lower()}')
        samples.append(sample)
        counts[sample] = exclude_out.get_counts()
        accs[sample] = exclude_out.get_accs()
        scores[sample] = exclude_out.get_scores()
        output.write('\033[92m OK! \033[0m\n')
    else:
        output.write('\033[93m VOID \033[0m\n')
    # Get partial abundance and score for the shared analysis
    sub_shared_tree = TaxTree()
    sub_shared_out = SampleDataById(['shared', 'accs'])
    sub_shared_tree.allin1(ontology=ontology,
                           counts=counts[raw],
                           scores=scores[raw],
                           min_taxa=mintaxas[raw],
                           min_rank=rank,
                           just_min_rank=True,
                           include=including,
                           exclude=excluding,
                           out=sub_shared_out)
    sub_shared_out.purge_counters()
    # Scale scores by abundance
    sub_shared_counts: SharedCounter = sub_shared_out.get_shared_counts()
    sub_shared_score: SharedCounter = sub_shared_out.get_shared_scores()
    sub_shared_score *= sub_shared_counts
    partial_shared_update(iteration)
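# --- Illustrative sketch (not part of the pipeline above) ---
# partial_shared_update() accumulates the taxa shared across samples by
# combining counters: update() on the first iteration, then '&=' afterwards.
# Assuming SharedCounter is Counter-like under '&' (keep only keys present in
# both operands, at the minimum value), the accumulation reduces to:

def _demo_shared_accumulation() -> None:
    """Sketch of intersection-with-minimum across successive samples."""
    from collections import Counter

    samples = [Counter({'tax_A': 5, 'tax_B': 2}),  # hypothetical abundances
               Counter({'tax_A': 3, 'tax_C': 9}),
               Counter({'tax_A': 7, 'tax_B': 1})]
    shared: Counter = Counter()
    for i, sub in enumerate(samples):
        if i == 0:
            shared.update(sub)  # 1st iteration: initialize
        else:
            shared &= sub       # later iterations: intersect (min count)
    # Only taxa present in every sample survive, at their minimum abundance
    assert shared == Counter({'tax_A': 3})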
def process_output(
        *args, **kwargs
) -> Tuple[Sample, TaxTree, SampleDataByTaxId, SampleStats, Err]:
    """
    Process Centrifuge/LMAT output files (usually called in parallel!).
    """
    # Timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    taxonomy: Taxonomy = kwargs['taxonomy']
    mintaxa: int = kwargs['ctrlmintaxa'] if is_ctrl else kwargs['mintaxa']
    minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore']
    including: Set[TaxId] = taxonomy.including
    excluding: Set[TaxId] = taxonomy.excluding
    scoring: Scoring = kwargs['scoring']
    lmat: bool = kwargs['lmat']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read Centrifuge/LMAT output files to get abundances
    read_method: Callable[[Filename, Scoring, Optional[Score]],  # Input
                          Tuple[str, SampleStats,
                                Counter[TaxId], Dict[TaxId, Score]]  # Output
                          ]
    if lmat:
        read_method = read_lmat_output
    else:
        read_method = read_output
    log: str
    stat: SampleStats
    counts: Counter[TaxId]
    scores: Dict[TaxId, Score]
    log, stat, counts, scores = read_method(target_file, scoring, minscore)
    output.write(log)
    # Update field in stat about control nature of the sample
    stat.is_ctrl = is_ctrl
    # Move cellular_organisms counts to root, if needed
    if taxonomy.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... '))
        if counts[ROOT]:
            stat.num_taxa -= 1
            scores[ROOT] = (
                (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS]
                 + scores[ROOT] * counts[ROOT])
                / (counts[CELLULAR_ORGANISMS] + counts[ROOT]))
        else:
            scores[ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, if requested
    if kwargs['root'] and counts[ROOT]:
        vwrite(gray('Removing'), counts[ROOT], gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt - counts[ROOT])
        stat.num_taxa -= 1
        counts[ROOT] = 0
        scores[ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')
    # Building taxonomy tree
    output.write(gray('Building from raw data... '))
    vwrite(gray('\n  Building taxonomy tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[TaxId]
    orphans: Set[TaxId]
    ancestors, orphans = taxonomy.get_ancestors(counts.keys())
    out = SampleDataByTaxId(['all'])
    tree.allin1(taxonomy=taxonomy, counts=counts, scores=scores,
                ancestors=ancestors, min_taxa=mintaxa,
                include=including, exclude=excluding, out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')
    # Give stats about orphan taxids
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('Warning!'), f'Orphan taxid={orphan}\n')
                lost += counts[orphan]
            vwrite(yellow('WARNING!'),
                   f'{len(orphans)} orphan taxids ('
                   f'{len(orphans)/len(counts):.2%} of total)\n'
                   f'{lost} orphan sequences ('
                   f'{lost/sum(counts.values()):.3%} of total)\n')
        else:
            vwrite(green('OK!\n'))
    # Check the loss of taxids (plasmids typically) under some conditions
    if debug and not excluding and not including:
        vwrite(gray('  Additional checking of taxid loss... '))
        lost = 0
        for taxid in counts:
            if not out.counts[taxid]:
                lost += 1
                vwrite(yellow('Warning!'), f'Lost taxid={taxid}: '
                       f'{taxonomy.get_name(taxid)}\n')
        if lost:
            vwrite(yellow('WARNING!'), f'Lost {lost} taxids ('
                   f'{lost/len(counts):.2%} of total)\n')
        else:
            vwrite(green('OK!\n'))
    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ')
                     + green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE
    # Timing results
    output.write(gray('Load elapsed time: ')
                 + f'{time.perf_counter() - start_time:.3g}'
                 + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
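# --- Illustrative sketch (not part of the library) ---
# When collapsing CELLULAR_ORGANISMS into ROOT above, the new ROOT score is
# the read-count-weighted mean of the two scores, so the merged score stays
# consistent with the total number of reads assigned. With hypothetical
# numbers:

def _demo_weighted_score_merge() -> float:
    """Sketch of the count-weighted mean used when folding into ROOT."""
    cell_score, cell_counts = 32.0, 300  # hypothetical CELLULAR_ORGANISMS
    root_score, root_counts = 40.0, 100  # hypothetical ROOT
    merged = ((cell_score * cell_counts + root_score * root_counts)
              / (cell_counts + root_counts))
    assert merged == 34.0  # pulled toward the score of the more abundant taxon
    return merged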
def control_analysis():
    """Perform last steps of control and shared-controls analysis"""
    nonlocal shared_ctrl_counts, shared_ctrl_score

    def robust_contamination_removal():
        """Implement robust contamination removal algorithm."""
        nonlocal exclude_sets, shared_crossover

        def compute_qn(data: List[float], dist: str = "Gauss") -> float:
            """Compute Qn robust estimator of scale (Rousseeuw, 1993)"""
            c_d: float  # Select d parameter depending on the distribution
            if dist == "Gauss":
                c_d = 2.2219
            elif dist == "Cauchy":  # Heavy-tailed distribution
                c_d = 1.2071
            elif dist == "NegExp":  # Negative exponential (asymmetric)
                c_d = 3.4760
            else:
                raise Exception(red('\nERROR! ') + 'Unknown distribution')
            num: int = len(data)
            sort_data = sorted(data)
            pairwisedifs: List[float] = []
            for (i, x_val) in enumerate(sort_data):
                for y_val in sort_data[i + 1:]:
                    pairwisedifs.append(abs(x_val - y_val))
            k: int = int(num * (num / 2 + 1) / 4)
            return c_d * sorted(pairwisedifs)[k - 1]

        exclude_sets = {smpl: set() for smpl in raws[controls:]}
        vwrite(gray('Robust contamination removal: '
                    'Searching for contaminants...\n'))
        for tid in exclude_candidates:
            relfreq_ctrl: List[float] = [accs[ctrl][tid]
                                         / accs[ctrl][ontology.ROOT]
                                         for ctrl in raws[:controls]]
            relfreq_smpl: List[float] = [accs[smpl][tid]
                                         / accs[smpl][ontology.ROOT]
                                         for smpl in raws[controls:]]
            relfreq: List[float] = relfreq_ctrl + relfreq_smpl
            crossover: List[bool]  # Crossover source (yes/no)
            # Just-controls contamination check
            if all([rf < EPS for rf in relfreq_smpl]):
                vwrite(cyan('just-ctrl:\t'), tid, ontology.get_name(tid),
                       gray('relfreq:'),
                       fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                       '\n')
                continue  # Go for next candidate
            # Critical contamination check
            if all([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                vwrite(red('critical:\t'), tid, ontology.get_name(tid),
                       gray('relfreq:'),
                       fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                       '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)
                continue  # Go for next candidate
            # Severe contamination check
            if any([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                vwrite(yellow('severe: \t'), tid, ontology.get_name(tid),
                       gray('relfreq:'),
                       fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                       '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)
                continue  # Go for next candidate
            # Mild contamination check
            if all([rf > MILD_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                vwrite(blue('mild cont:\t'), tid, ontology.get_name(tid),
                       gray('relfreq:'),
                       fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                       '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)
                continue  # Go for next candidate
            # Calculate the median and Qn estimator of scale (all samples,
            #  including controls)
            mdn: float = statistics.median(relfreq)
            # mad: float = statistics.mean([abs(mdn - rf) for rf in relfreq])
            q_n: float = compute_qn(relfreq, dist="NegExp")
            # Calculate crossover in samples
            outlier_lim: float = mdn + ROBUST_XOVER_OUTLIER * q_n
            ordomag_lim: float = (max(relfreq_ctrl)
                                  * 10 ** ROBUST_XOVER_ORD_MAG)
            crossover = [rf > outlier_lim and rf > ordomag_lim
                         for rf in relfreq[controls:]]
            # Crossover contamination check
            if any(crossover):
                vwrite(magenta('crossover:\t'), tid, ontology.get_name(tid),
                       green(f'lims: [{outlier_lim:.1g}]'
                             + ('<' if outlier_lim < ordomag_lim else '>')
                             + f'[{ordomag_lim:.1g}]'),
                       gray('relfreq:'),
                       fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                       gray('crossover:'), blst2str(crossover), '\n')
                # Exclude just for contaminated samples (not the source)
                vwrite(magenta('\t->'), gray(f'Include {tid} just in:'))
                for i in range(len(raws[controls:])):
                    if not crossover[i]:
                        exclude_sets[raws[i + controls]].add(tid)
                    else:
                        vwrite(f' {raws[i + controls]}')
                if all(crossover):  # Shared taxon contaminating control(s)
                    vwrite(' (', yellow('Shared crossover taxon!'), ')')
                    shared_crossover.add(tid)
                vwrite('\n')
                continue
            # Other contamination: remove from all samples
            vwrite(gray('other cont:\t'), tid, ontology.get_name(tid),
                   green(f'lims: [{outlier_lim:.1g}]'
                         + ('<' if outlier_lim < ordomag_lim else '>')
                         + f'[{ordomag_lim:.1g}]'),
                   gray('relfreq:'),
                   fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
            for exclude_set in exclude_sets.values():
                exclude_set.add(tid)

    # Get taxids at this rank that are present in the control samples
    exclude_candidates: Set[Id] = set()
    for i in range(controls):
        exclude_candidates.update(taxids[raws[i]][rank])
    exclude_sets: Dict[Sample, Set[Id]]
    shared_crossover: Set[Id] = set()  # Shared taxa contaminating controls
    if controls and (len(raws) - controls >= ROBUST_MIN_SAMPLES):
        robust_contamination_removal()
    else:  # Otherwise, just apply the strict control approach
        exclude_sets = {file: exclude_candidates
                        for file in raws[controls:]}
    # Add explicit excluding taxa (if any) to exclude sets
    for exclude_set in exclude_sets.values():
        exclude_set.update(excluding)
    exclude_candidates.update(excluding)
    # Process each sample, excluding control taxa
    for raw in raws[controls:]:
        output.write(gray('  Ctrl: From') + f' {raw} '
                     + gray(f'excluding {len(exclude_sets[raw])} ctrl taxa. '
                            f'Generating sample... '))
        ctrl_tree = TaxTree()
        ctrl_out = SampleDataById(['counts', 'scores', 'accs'])
        ctrl_tree.allin1(ontology=ontology,
                         counts=counts[raw],
                         scores=scores[raw],
                         min_taxa=mintaxas[raw],
                         min_rank=rank,
                         just_min_rank=True,
                         include=including,
                         exclude=exclude_sets[raw],
                         out=ctrl_out)
        ctrl_out.purge_counters()
        if ctrl_out.counts:  # Avoid adding empty samples
            sample = Sample(f'{raw}_{STR_CONTROL}_{rank.name.lower()}')
            samples.append(sample)
            counts[sample] = ctrl_out.get_counts()
            accs[sample] = ctrl_out.get_accs()
            scores[sample] = ctrl_out.get_scores()
            output.write(green('OK!\n'))
        else:
            output.write(yellow('VOID\n'))

    def shared_ctrl_analysis():
        """Perform last steps of shared taxa analysis"""
        shared_ctrl_tree: TaxTree = TaxTree()
        shared_ctrl_out: SampleDataById = SampleDataById(['shared', 'accs'])
        shared_ctrl_tree.allin1(ontology=ontology,
                                counts=shared_ctrl_counts,
                                scores=shared_ctrl_score,
                                min_taxa=get_shared_mintaxa(),
                                include=including,
                                exclude=(exclude_candidates
                                         - shared_crossover),
                                out=shared_ctrl_out)
        shared_ctrl_out.purge_counters()
        out_counts: SharedCounter = shared_ctrl_out.get_shared_counts()
        output.write(gray(f'  Ctrl-shared: Including {len(out_counts)}'
                          ' shared taxa. Generating sample... '))
        if out_counts:
            sample = Sample(f'{STR_CONTROL_SHARED}_{rank.name.lower()}')
            samples.append(sample)
            counts[sample] = out_counts
            accs[sample] = shared_ctrl_out.get_accs()
            scores[sample] = shared_ctrl_out.get_shared_scores()
            output.write(green('OK!\n'))
        else:
            output.write(yellow('VOID\n'))

    # Shared-control taxa final analysis
    if shared_ctrl_counts:
        # Normalize scaled scores by total abundance
        shared_ctrl_score /= (+shared_ctrl_counts)
        # Get averaged abundance by number of samples minus control samples
        shared_ctrl_counts //= (len(raws) - controls)
        shared_ctrl_analysis()
    else:
        output.write(gray('  Ctrl-shared: No taxa! ') + yellow('VOID')
                     + gray(' sample.\n'))
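# --- Illustrative sketch (not part of recentrifuge) ---
# robust_contamination_removal() flags a sample as a crossover-contamination
# source when its relative frequency exceeds both a robust outlier limit
# (median + factor * Qn) and an order-of-magnitude limit over the controls.
# A standalone model of that decision follows; the outlier factor (5) and the
# order of magnitude (2) are hypothetical stand-ins for ROBUST_XOVER_OUTLIER
# and ROBUST_XOVER_ORD_MAG:

def _demo_crossover_limits() -> None:
    """Sketch of the Qn scale estimate and the two crossover thresholds."""
    import statistics

    def qn_scale(data, c_d=3.4760):  # c_d for negative exponential, as above
        """Qn estimator of scale (Rousseeuw, 1993), naive O(n^2) version."""
        srt = sorted(data)
        difs = sorted(abs(x - y)
                      for i, x in enumerate(srt) for y in srt[i + 1:])
        k = int(len(data) * (len(data) / 2 + 1) / 4)
        return c_d * difs[k - 1]

    relfreq = [1e-6, 2e-6, 1.5e-6, 2e-6, 8e-4]  # last sample is suspicious
    relfreq_ctrl = relfreq[:2]                  # hypothetical: 2 controls
    outlier_lim = statistics.median(relfreq) + 5 * qn_scale(relfreq)
    ordomag_lim = max(relfreq_ctrl) * 10 ** 2
    crossover = [rf > outlier_lim and rf > ordomag_lim for rf in relfreq[2:]]
    assert crossover == [False, False, True]    # only the outlier is flagged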
def process_output(
        *args, **kwargs
) -> Tuple[Sample, TaxTree, SampleDataById, SampleStats, Err]:
    """
    Process classifier output files (usually called in parallel!).
    """
    # Timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    ontology: Ontology = kwargs['ontology']
    mintaxa: Optional[int] = (kwargs['ctrlmintaxa'] if is_ctrl
                              else kwargs['mintaxa'])
    minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore']
    including: Union[Tuple, Set[Id]] = ontology.including
    excluding: Union[Tuple, Set[Id]] = ontology.excluding
    scoring: Scoring = kwargs['scoring']
    classifier: Classifier = kwargs['classifier']
    genfmt: GenericFormat = kwargs['genfmt']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read taxonomic classifier output files to get abundances
    read_method: Callable[  # Format: [[Input], Output]
        [Filename, Scoring, Optional[Score]],
        Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]]
    log: str
    stat: SampleStats
    counts: Counter[Id]
    scores: Dict[Id, Score]
    if classifier is Classifier.GENERIC:  # Direct call to generic method
        log, stat, counts, scores = read_generic_output(
            target_file, scoring, minscore, genfmt)
    else:  # Use read_method
        if classifier is Classifier.KRAKEN:
            read_method = read_kraken_output
        elif classifier is Classifier.CLARK:
            read_method = read_clark_output
        elif classifier is Classifier.LMAT:
            read_method = read_lmat_output
        elif classifier is Classifier.CENTRIFUGE:
            read_method = read_output
        else:
            raise Exception(red('\nERROR!'),
                            f'taxclass: Unknown classifier "{classifier}".')
        log, stat, counts, scores = read_method(target_file, scoring,
                                                minscore)
    output.write(log)
    # Complete/Update fields in stats
    stat.is_ctrl = is_ctrl  # Set the control nature of the sample
    if mintaxa is not None:  # Manual mintaxa has precedence over automatic
        stat.mintaxa = mintaxa
    else:  # Update local value with the automatically guessed value
        mintaxa = stat.mintaxa
    # Move cellular_organisms counts to root, if needed
    if ontology.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... \n'))
        if counts[ontology.ROOT]:
            stat.decrease_filtered_taxids()
            scores[ontology.ROOT] = Score(
                (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS]
                 + scores[ontology.ROOT] * counts[ontology.ROOT])
                / (counts[CELLULAR_ORGANISMS] + counts[ontology.ROOT]))
        else:
            scores[ontology.ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ontology.ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, if requested
    if kwargs['root'] and counts[ontology.ROOT]:
        vwrite(gray('Removing'), counts[ontology.ROOT],
               gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(
            filt=stat.seq.filt - counts[ontology.ROOT])
        stat.decrease_filtered_taxids()
        counts[ontology.ROOT] = 0
        scores[ontology.ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')
    # Building ontology tree
    output.write(gray('Building from raw data with mintaxa = ')
                 + f'{mintaxa:_d}' + gray(' ... \n'))
    vwrite(gray('  Building ontology tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[Id]
    orphans: Set[Id]
    ancestors, orphans = ontology.get_ancestors(counts.keys())
    out = SampleDataById(['all'])
    tree.allin1(ontology=ontology, counts=counts, scores=scores,
                ancestors=ancestors, min_taxa=mintaxa,
                include=including, exclude=excluding, out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')
    # Stats: Complete final value for TaxIDs after tree building and folding
    final_taxids: int = len(out.counts) if out.counts is not None else 0
    stat.set_final_taxids(final_taxids)
    # Check for additional loss of reads (due to include/exclude and orphans)
    output.write(gray('  Check for more seqs lost ([in/ex]clude affects)... '))
    if out.counts is not None:
        discard: int = sum(counts.values()) - sum(out.counts.values())
        if discard:
            output.write(blue('\n  Info:') + f' {discard} '
                         + gray('additional seqs discarded (')
                         + f'{discard/sum(counts.values()):.3%} '
                         + gray('of accepted)\n'))
        else:
            output.write(green('OK!\n'))
    else:
        output.write(red('No counts in sample tree!\n'))
    # Warn or give detailed stats about orphan taxids and orphan seqs
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('  Warning!'), gray('Orphan taxid'),
                       f'{orphan}\n')
                lost += counts[orphan]
            vwrite(yellow('  WARNING!'),
                   f'{len(orphans)} orphan taxids ('
                   f'{len(orphans)/len(counts):.2%} of accepted)\n'
                   f' and {lost} orphan sequences ('
                   f'{lost/sum(counts.values()):.3%} of accepted)\n')
        else:
            vwrite(green('OK!\n'))
    elif orphans:
        output.write(yellow('\n  Warning!') + f' {len(orphans)} orphan taxids'
                     + gray(' (rerun with --debug for details)\n'))
    # Check the removal of TaxIDs (accumulation of leaves in parents)
    if debug and not excluding and including == {ontology.ROOT}:
        vwrite(gray('  Assess accumulation due to "folding the tree"...\n'))
        migrated: int = 0
        if out.counts is not None:
            for taxid in counts:
                if out.counts[taxid] == 0:
                    migrated += 1
                    vwrite(blue('  Info:'),
                           gray(f'Folded TaxID {taxid} (')
                           + f'{ontology.get_name(taxid)}' + gray(') with ')
                           + f'{counts[taxid]}' + gray(' original seqs\n'))
        if migrated:
            vwrite(blue('  INFO:'),
                   f'{migrated} TaxIDs folded ('
                   f'{migrated/len(+counts):.2%} of TAF '
                   '—TaxIDs after filtering—)\n')
            vwrite(blue('  INFO:'),
                   f'Final assigned TaxIDs: {final_taxids} '
                   f'(reduced to {final_taxids/len(+counts):.2%} of '
                   'number of TAF)\n')
        else:
            vwrite(blue('  INFO:'), gray('No migration!'), green('OK!\n'))
    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ')
                     + green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE
    # Timing results
    output.write(gray('Load elapsed time: ')
                 + f'{time.perf_counter() - start_time:.3g}'
                 + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
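# --- Illustrative sketch (not part of recentrifuge) ---
# The docstring notes that process_output() is usually called in parallel.
# Since it takes the target file and control flag as positional args and the
# shared settings as keyword args, one possible dispatch pattern (the helper
# name, 'files', 'num_ctrls', and 'settings' are hypothetical) is a process
# pool; the settings values must be picklable for this to work:

def _demo_parallel_dispatch(files, num_ctrls, settings):
    """Sketch: fan process_output out over samples with a process pool."""
    import multiprocessing as mp
    from functools import partial

    worker = partial(process_output, **settings)  # bind the shared kwargs
    with mp.Pool() as pool:
        # Per-file positional args: (target_file, is_ctrl);
        #  controls are assumed to come first in 'files'
        async_results = [pool.apply_async(worker, (fil, i < num_ctrls))
                         for i, fil in enumerate(files)]
        return [res.get() for res in async_results]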