def cross_analysis(iteration, raw):
    """Cross analysis: exclusive and part of shared&ctrl"""
    nonlocal shared_counts, shared_score
    nonlocal shared_ctrl_counts, shared_ctrl_score

    def partial_shared_update(i):
        """Perform shared and shared-control taxa partial evaluations"""
        nonlocal shared_counts, shared_score
        nonlocal shared_ctrl_counts, shared_ctrl_score
        if i == 0:  # 1st iteration: Initialize shared abundance and score
            shared_counts.update(sub_shared_counts)
            shared_score.update(sub_shared_score)
        elif i < controls:  # Just update shared abundance and score
            shared_counts &= sub_shared_counts
            shared_score &= sub_shared_score
        elif i == controls:  # Initialize shared-control counters
            shared_counts &= sub_shared_counts
            shared_score &= sub_shared_score
            shared_ctrl_counts.update(sub_shared_counts)
            shared_ctrl_score.update(sub_shared_score)
        elif controls:  # Both: Accumulate shared abundance and score
            shared_counts &= sub_shared_counts
            shared_score &= sub_shared_score
            shared_ctrl_counts &= sub_shared_counts
            shared_ctrl_score &= sub_shared_score
        else:  # Both: Accumulate shared abundance and score (no controls)
            shared_counts &= sub_shared_counts
            shared_score &= sub_shared_score

    exclude: Set[Id] = set()
    # Get taxids at this rank that are present in the other samples
    for sample in (smpl for smpl in raws if smpl != raw):
        exclude.update(taxids[sample][rank])
    exclude.update(excluding)  # Add explicit excluding taxa, if any
    output.write(gray(' Exclusive: From') + f' {raw} ' +
                 gray(f'excluding {len(exclude)} taxa. '
                      f'Generating sample... '))
    exclude_tree = TaxTree()
    exclude_out = SampleDataById(['counts', 'scores', 'accs'])
    exclude_tree.allin1(ontology=ontology,
                        counts=counts[raw],
                        scores=scores[raw],
                        min_taxa=mintaxas[raw],
                        min_rank=rank,
                        just_min_rank=True,
                        include=including,
                        exclude=exclude,
                        out=exclude_out)
    exclude_out.purge_counters()
    if exclude_out.counts:  # Avoid adding empty samples
        sample = Sample(f'{raw}_{STR_EXCLUSIVE}_{rank.name.lower()}')
        samples.append(sample)
        counts[sample] = exclude_out.get_counts()
        accs[sample] = exclude_out.get_accs()
        scores[sample] = exclude_out.get_scores()
        output.write(green('OK!\n'))
    else:
        output.write(yellow('VOID\n'))
    # Get partial abundance and score for the shared analysis
    sub_shared_tree = TaxTree()
    sub_shared_out = SampleDataById(['shared', 'accs'])
    sub_shared_tree.allin1(ontology=ontology,
                           counts=counts[raw],
                           scores=scores[raw],
                           min_taxa=mintaxas[raw],
                           min_rank=rank,
                           just_min_rank=True,
                           include=including,
                           exclude=excluding,
                           out=sub_shared_out)
    sub_shared_out.purge_counters()
    # Scale scores by abundance
    sub_shared_counts: SharedCounter = sub_shared_out.get_shared_counts()
    sub_shared_score: SharedCounter = sub_shared_out.get_shared_scores()
    sub_shared_score *= sub_shared_counts
    partial_shared_update(iteration)
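# Note on partial_shared_update above: SharedCounter extends
# collections.Counter, and Counter intersection ('&=') keeps only the
# keys present in both operands, each with the minimum of its counts,
# so the successive '&=' steps retain just the taxa shared by every
# sample processed so far. A minimal sketch of that semantics with a
# plain Counter (illustrative values, not project data):
#
#     from collections import Counter
#     shared = Counter({'tid1': 5, 'tid2': 3})
#     shared &= Counter({'tid1': 2, 'tid3': 9})
#     assert shared == Counter({'tid1': 2})  # common key, minimum count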
def control_analysis():
    """Perform last steps of control and shared controls analysis"""
    nonlocal shared_ctrl_counts, shared_ctrl_score

    def robust_contamination_removal():
        """Implement robust contamination removal algorithm."""
        nonlocal exclude_sets, shared_crossover

        def compute_qn(data: List[float], dist: str = "Gauss") -> float:
            """Compute Qn robust estimator of scale (Rousseeuw, 1993)"""
            # (see the illustrative usage sketch at the end of this section)
            c_d: float  # Select d parameter depending on the distribution
            if dist == "Gauss":
                c_d = 2.2219
            elif dist == "Cauchy":  # Heavy-tailed distribution
                c_d = 1.2071
            elif dist == "NegExp":  # Negative exponential (asymmetric)
                c_d = 3.4760
            else:
                raise Exception(red('\nERROR! ') + 'Unknown distribution')
            num: int = len(data)
            sort_data = sorted(data)
            pairwisedifs: List[float] = []
            for (i, x_val) in enumerate(sort_data):
                for y_val in sort_data[i + 1:]:
                    pairwisedifs.append(abs(x_val - y_val))
            k: int = int(num * (num / 2 + 1) / 4)
            return c_d * sorted(pairwisedifs)[k - 1]

        exclude_sets = {smpl: set() for smpl in raws[controls:]}
        vwrite(gray('Robust contamination removal: '
                    'Searching for contaminants...\n'))
        for tid in exclude_candidates:
            relfreq_ctrl: List[float] = [
                accs[ctrl][tid] / accs[ctrl][ontology.ROOT]
                for ctrl in raws[:controls]]
            relfreq_smpl: List[float] = [
                accs[smpl][tid] / accs[smpl][ontology.ROOT]
                for smpl in raws[controls:]]
            relfreq: List[float] = relfreq_ctrl + relfreq_smpl
            crossover: List[bool]  # Crossover source (yes/no)
            # Just-controls contamination check
            if all([rf < EPS for rf in relfreq_smpl]):
                vwrite(cyan('just-ctrl:\t'), tid, ontology.get_name(tid),
                       gray('relfreq:'),
                       fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                       '\n')
                continue  # Go for next candidate
            # Critical contamination check
            if all([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                vwrite(red('critical:\t'), tid, ontology.get_name(tid),
                       gray('relfreq:'),
                       fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                       '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)
                continue  # Go for next candidate
            # Severe contamination check
            if any([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                vwrite(yellow('severe: \t'), tid, ontology.get_name(tid),
                       gray('relfreq:'),
                       fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                       '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)
                continue  # Go for next candidate
            # Mild contamination check
            if all([rf > MILD_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                vwrite(blue('mild cont:\t'), tid, ontology.get_name(tid),
                       gray('relfreq:'),
                       fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                       '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)
                continue  # Go for next candidate
            # Calculate robust location (median) and scale (Qn),
            #  including the controls in the data
            mdn: float = statistics.median(relfreq)
            # mad: float = statistics.mean([abs(mdn - rf) for rf in relfreq])
            q_n: float = compute_qn(relfreq, dist="NegExp")
            # Calculate crossover in samples
            outlier_lim: float = mdn + ROBUST_XOVER_OUTLIER * q_n
            ordomag_lim: float = max(
                relfreq_ctrl) * 10 ** ROBUST_XOVER_ORD_MAG
            crossover = [rf > outlier_lim and rf > ordomag_lim
                         for rf in relfreq[controls:]]
            # Crossover contamination check
            if any(crossover):
                vwrite(magenta('crossover:\t'), tid,
                       ontology.get_name(tid),
                       green(f'lims: [{outlier_lim:.1g}]' +
                             ('<' if outlier_lim < ordomag_lim else '>') +
                             f'[{ordomag_lim:.1g}]'),
                       gray('relfreq:'),
                       fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                       gray('crossover:'), blst2str(crossover), '\n')
                # Exclude just for contaminated samples (not the source)
                vwrite(magenta('\t->'), gray(f'Include {tid} just in:'))
                for i in range(len(raws[controls:])):
                    if not crossover[i]:
                        exclude_sets[raws[i + controls]].add(tid)
                    else:
                        vwrite(f' {raws[i + controls]}')
                if all(crossover):  # Shared taxon contaminating control(s)
                    vwrite(' (', yellow('Shared crossover taxon!'), ')')
                    shared_crossover.add(tid)
                vwrite('\n')
                continue  # Go for next candidate
            # Other contamination: remove from all samples
            vwrite(gray('other cont:\t'), tid, ontology.get_name(tid),
                   green(f'lims: [{outlier_lim:.1g}]' +
                         ('<' if outlier_lim < ordomag_lim else '>') +
                         f'[{ordomag_lim:.1g}]'),
                   gray('relfreq:'),
                   fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                   '\n')
            for exclude_set in exclude_sets.values():
                exclude_set.add(tid)

    # Get taxids at this rank that are present in the control samples
    exclude_candidates: Set[Id] = set()
    for i in range(controls):
        exclude_candidates.update(taxids[raws[i]][rank])
    exclude_sets: Dict[Sample, Set[Id]]
    shared_crossover: Set[Id] = set()  # Shared taxa contaminating controls
    if controls and (len(raws) - controls >= ROBUST_MIN_SAMPLES):
        robust_contamination_removal()
    else:  # In this case, just apply strict control
        exclude_sets = {file: exclude_candidates
                        for file in raws[controls:]}
    # Add explicit excluding taxa (if any) to the exclude sets
    for exclude_set in exclude_sets.values():
        exclude_set.update(excluding)
    exclude_candidates.update(excluding)
    # Process each sample excluding control taxa
    for raw in raws[controls:]:
        output.write(gray(' Ctrl: From') + f' {raw} ' +
                     gray(f'excluding {len(exclude_sets[raw])} ctrl taxa. '
                          f'Generating sample... '))
        ctrl_tree = TaxTree()
        ctrl_out = SampleDataById(['counts', 'scores', 'accs'])
        ctrl_tree.allin1(ontology=ontology,
                         counts=counts[raw],
                         scores=scores[raw],
                         min_taxa=mintaxas[raw],
                         min_rank=rank,
                         just_min_rank=True,
                         include=including,
                         exclude=exclude_sets[raw],
                         out=ctrl_out)
        ctrl_out.purge_counters()
        if ctrl_out.counts:  # Avoid adding empty samples
            sample = Sample(f'{raw}_{STR_CONTROL}_{rank.name.lower()}')
            samples.append(sample)
            counts[sample] = ctrl_out.get_counts()
            accs[sample] = ctrl_out.get_accs()
            scores[sample] = ctrl_out.get_scores()
            output.write(green('OK!\n'))
        else:
            output.write(yellow('VOID\n'))

    def shared_ctrl_analysis():
        """Perform last steps of shared taxa analysis"""
        shared_ctrl_tree: TaxTree = TaxTree()
        shared_ctrl_out: SampleDataById = SampleDataById(['shared', 'accs'])
        shared_ctrl_tree.allin1(ontology=ontology,
                                counts=shared_ctrl_counts,
                                scores=shared_ctrl_score,
                                min_taxa=get_shared_mintaxa(),
                                include=including,
                                exclude=(exclude_candidates
                                         - shared_crossover),
                                out=shared_ctrl_out)
        shared_ctrl_out.purge_counters()
        out_counts: SharedCounter = shared_ctrl_out.get_shared_counts()
        output.write(gray(f' Ctrl-shared: Including {len(out_counts)}'
                          ' shared taxa. Generating sample... '))
        if out_counts:
            sample = Sample(f'{STR_CONTROL_SHARED}_{rank.name.lower()}')
            samples.append(sample)
            counts[sample] = out_counts
            accs[sample] = shared_ctrl_out.get_accs()
            scores[sample] = shared_ctrl_out.get_shared_scores()
            output.write(green('OK!\n'))
        else:
            output.write(yellow('VOID\n'))

    # Shared-control taxa final analysis
    if shared_ctrl_counts:
        # Normalize scaled scores by total abundance
        shared_ctrl_score /= (+shared_ctrl_counts)
        # Get averaged abundance by number of samples minus ctrl samples
        shared_ctrl_counts //= (len(raws) - controls)
        shared_ctrl_analysis()
    else:
        output.write(gray(' Ctrl-shared: No taxa! ') + yellow('VOID') +
                     gray(' sample.\n'))
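# Illustrative sketch of the Qn scale estimator defined in
# robust_contamination_removal above (compute_qn is nested there, so this
# assumes it were exposed at module level; the values are made up, not
# project data):
#
#     data = [0.10, 0.11, 0.12, 0.13, 0.95]  # one clear outlier
#     q_n = compute_qn(data, dist="Gauss")   # ~0.044
#     # statistics.stdev(data) is ~0.37 here: Qn is barely inflated by
#     # the outlier, which is why the crossover outlier limit is set to
#     # mdn + ROBUST_XOVER_OUTLIER * q_n rather than using the std dev.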