Example #1
0
    def cross_analysis(iteration, raw):
        """Cross analysis: exclusive and part of shared&ctrl"""
        nonlocal shared_counts, shared_score
        nonlocal shared_ctrl_counts, shared_ctrl_score

        def partial_shared_update(i):
            """Perform shared and shared-control taxa partial evaluations"""
            nonlocal shared_counts, shared_score
            nonlocal shared_ctrl_counts, shared_ctrl_score
            if i == 0:  # 1st iteration: Initialize shared abundance and score
                shared_counts.update(sub_shared_counts)
                shared_score.update(sub_shared_score)
            elif i < controls:  # Just update shared abundance and score
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
            elif i == controls:  # Initialize shared-control counters
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
                shared_ctrl_counts.update(sub_shared_counts)
                shared_ctrl_score.update(sub_shared_score)
            elif controls:  # Both: Accumulate shared abundance and score
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
                shared_ctrl_counts &= sub_shared_counts
                shared_ctrl_score &= sub_shared_score
            else:  # Both: Accumulate shared abundance and score (no controls)
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score

        exclude: Set[Id] = set()
        # Get taxids at this rank that are present in the other samples
        for sample in (smpl for smpl in raws if smpl != raw):
            exclude.update(taxids[sample][rank])
        exclude.update(excluding)  # Add explicit excluding taxa if any
        output.write(f'  \033[90mExclusive: From \033[0m{raw}\033[90m '
                     f'excluding {len(exclude)} taxa. '
                     f'Generating sample...\033[0m')

        exclude_tree = TaxTree()
        exclude_out = SampleDataById(['counts', 'scores', 'accs'])
        exclude_tree.allin1(ontology=ontology,
                            counts=counts[raw],
                            scores=scores[raw],
                            min_taxa=mintaxas[raw],
                            min_rank=rank,
                            just_min_rank=True,
                            include=including,
                            exclude=exclude,
                            out=exclude_out)
        exclude_out.purge_counters()
        if exclude_out.counts:  # Avoid adding empty samples
            sample = Sample(f'{raw}_{STR_EXCLUSIVE}_{rank.name.lower()}')
            samples.append(sample)
            counts[sample] = exclude_out.get_counts()
            accs[sample] = exclude_out.get_accs()
            scores[sample] = exclude_out.get_scores()
            output.write('\033[92m OK! \033[0m\n')
        else:
            output.write('\033[93m VOID \033[0m\n')

        # Get partial abundance and score for the shared analysis
        sub_shared_tree = TaxTree()
        sub_shared_out = SampleDataById(['shared', 'accs'])
        sub_shared_tree.allin1(ontology=ontology,
                               counts=counts[raw],
                               scores=scores[raw],
                               min_taxa=mintaxas[raw],
                               min_rank=rank,
                               just_min_rank=True,
                               include=including,
                               exclude=excluding,
                               out=sub_shared_out)
        sub_shared_out.purge_counters()
        # Scale scores by abundance
        sub_shared_counts: SharedCounter = sub_shared_out.get_shared_counts()
        sub_shared_score: SharedCounter = sub_shared_out.get_shared_scores()
        sub_shared_score *= sub_shared_counts
        partial_shared_update(iteration)
Example #2
0
    def control_analysis():
        """Perform last steps of control and shared controls analysis"""
        nonlocal shared_ctrl_counts, shared_ctrl_score

        def robust_contamination_removal():
            """Implement robust contamination removal algorithm."""
            nonlocal exclude_sets, shared_crossover

            def compute_qn(data: List[float], dist: str = "Gauss") -> float:
                """Compute Qn robust estimator of scale (Rousseeuw, 1993)"""
                c_d: float  # Select d parameter depending on the distribution
                if dist == "Gauss":
                    c_d = 2.2219
                elif dist == "Cauchy":  # Heavy-tailed distribution
                    c_d = 1.2071
                elif dist == "NegExp":  # Negative exponential (asymetric)
                    c_d = 3.4760
                else:
                    raise Exception(red('\nERROR! ') + 'Unknown distribution')
                num: int = len(data)
                sort_data = sorted(data)
                pairwisedifs: List[float] = []
                for (i, x_val) in enumerate(sort_data):
                    for y_val in sort_data[i + 1:]:
                        pairwisedifs.append(abs(x_val - y_val))
                k: int = int(num * (num / 2 + 1) / 4)
                return c_d * sorted(pairwisedifs)[k - 1]

            exclude_sets = {smpl: set() for smpl in raws[controls:]}
            vwrite(
                gray('Robust contamination removal: '
                     'Searching for contaminants...\n'))
            for tid in exclude_candidates:
                relfreq_ctrl: List[float] = [
                    accs[ctrl][tid] / accs[ctrl][ontology.ROOT]
                    for ctrl in raws[:controls]
                ]
                relfreq_smpl: List[float] = [
                    accs[smpl][tid] / accs[smpl][ontology.ROOT]
                    for smpl in raws[controls:]
                ]
                relfreq: List[float] = relfreq_ctrl + relfreq_smpl
                crossover: List[bool]  # Crossover source (yes/no)
                # Just-controls contamination check
                if all([rf < EPS for rf in relfreq_smpl]):
                    vwrite(cyan('just-ctrl:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    continue  # Go for next candidate
                # Critical contamination check
                if all([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(red('critical:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Severe contamination check
                if any([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(yellow('severe: \t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Mild contamination check
                if all([rf > MILD_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(blue('mild cont:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Calculate median and MAD median but including controls
                mdn: float = statistics.median(relfreq)
                # mad:float=statistics.mean([abs(mdn - rf) for rf in relfreq])
                q_n: float = compute_qn(relfreq, dist="NegExp")
                # Calculate crossover in samples
                outlier_lim: float = mdn + ROBUST_XOVER_OUTLIER * q_n
                ordomag_lim: float = max(
                    relfreq_ctrl) * 10**ROBUST_XOVER_ORD_MAG
                crossover = [
                    rf > outlier_lim and rf > ordomag_lim
                    for rf in relfreq[controls:]
                ]
                # Crossover contamination check
                if any(crossover):
                    vwrite(
                        magenta('crossover:\t'), tid, ontology.get_name(tid),
                        green(f'lims: [{outlier_lim:.1g}]' +
                              ('<' if outlier_lim < ordomag_lim else '>') +
                              f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                        fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                        gray('crossover:'), blst2str(crossover), '\n')
                    # Exclude just for contaminated samples (not the source)
                    vwrite(magenta('\t->'), gray(f'Include {tid} just in:'))
                    for i in range(len(raws[controls:])):
                        if not crossover[i]:
                            exclude_sets[raws[i + controls]].add(tid)
                        else:
                            vwrite(f' {raws[i + controls]}')
                    if all(crossover):  # Shared taxon contaminating control(s)
                        vwrite(' (', yellow('Shared crossover taxon!'), ')')
                        shared_crossover.add(tid)
                    vwrite('\n')
                    continue
                # Other contamination: remove from all samples
                vwrite(
                    gray('other cont:\t'), tid, ontology.get_name(tid),
                    green(f'lims: [{outlier_lim:.1g}]' +
                          ('<' if outlier_lim < ordomag_lim else '>') +
                          f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                    fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)

        # Get taxids at this rank that are present in the control samples
        exclude_candidates: Set[Id] = set()
        for i in range(controls):
            exclude_candidates.update(taxids[raws[i]][rank])
        exclude_sets: Dict[Sample, Set[Id]]
        shared_crossover: Set[Id] = set()  # Shared taxa contaminating controls
        if controls and (len(raws) - controls >= ROBUST_MIN_SAMPLES):
            robust_contamination_removal()
        else:  # If this case, just apply strict control
            exclude_sets = {
                file: exclude_candidates
                for file in raws[controls::]
            }
        # Add explicit excluding taxa (if any) to exclude sets
        for exclude_set in exclude_sets.values():
            exclude_set.update(excluding)
        exclude_candidates.update(excluding)
        # Process each sample excluding control taxa
        for raw in raws[controls:]:
            output.write(
                gray('  Ctrl: From') + f' {raw} ' +
                gray(f'excluding {len(exclude_sets[raw])} ctrl taxa. '
                     f'Generating sample... '))
            ctrl_tree = TaxTree()
            ctrl_out = SampleDataById(['counts', 'scores', 'accs'])
            ctrl_tree.allin1(ontology=ontology,
                             counts=counts[raw],
                             scores=scores[raw],
                             min_taxa=mintaxas[raw],
                             min_rank=rank,
                             just_min_rank=True,
                             include=including,
                             exclude=exclude_sets[raw],
                             out=ctrl_out)
            ctrl_out.purge_counters()
            if ctrl_out.counts:  # Avoid adding empty samples
                sample = Sample(f'{raw}_{STR_CONTROL}_{rank.name.lower()}')
                samples.append(sample)
                counts[sample] = ctrl_out.get_counts()
                accs[sample] = ctrl_out.get_accs()
                scores[sample] = ctrl_out.get_scores()
                output.write(green('OK!\n'))
            else:
                output.write(yellow('VOID\n'))

        def shared_ctrl_analysis():
            """Perform last steps of shared taxa analysis"""
            shared_ctrl_tree: TaxTree = TaxTree()
            shared_ctrl_out: SampleDataById = SampleDataById(
                ['shared', 'accs'])
            shared_ctrl_tree.allin1(ontology=ontology,
                                    counts=shared_ctrl_counts,
                                    scores=shared_ctrl_score,
                                    min_taxa=get_shared_mintaxa(),
                                    include=including,
                                    exclude=(exclude_candidates -
                                             shared_crossover),
                                    out=shared_ctrl_out)
            shared_ctrl_out.purge_counters()
            out_counts: SharedCounter = shared_ctrl_out.get_shared_counts()
            output.write(
                gray(f'  Ctrl-shared: Including {len(out_counts)}'
                     ' shared taxa. Generating sample... '))
            if out_counts:
                sample = Sample(f'{STR_CONTROL_SHARED}_{rank.name.lower()}')
                samples.append(sample)
                counts[Sample(sample)] = out_counts
                accs[Sample(sample)] = shared_ctrl_out.get_accs()
                scores[sample] = shared_ctrl_out.get_shared_scores()
                output.write(green('OK!\n'))
            else:
                output.write(yellow('VOID\n'))

        # Shared-control taxa final analysis
        if shared_ctrl_counts:
            # Normalize scaled scores by total abundance
            shared_ctrl_score /= (+shared_ctrl_counts)
            # Get averaged abundance by number of samples minus ctrl samples
            shared_ctrl_counts //= (len(raws) - controls)
            shared_ctrl_analysis()
        else:
            output.write(
                gray('  Ctrl-shared: No taxa! ') + yellow('VOID') +
                gray(' sample.\n'))