Example No. 1
def summarize_analysis(
        *args,
        **kwargs) -> Tuple[Optional[Sample], Counter[Id], Counter[Id], Scores]:
    """
    Summarize samples for a cross-analysis (usually called in parallel!).
    """
    # Recover input and parameters
    analysis: str = args[0]
    ontology: Ontology = kwargs['ontology']
    # TODO: Delete the following comment lines in a future release
    # including = ontology.including   # See comment below for the reason
    # excluding = ontology.excluding   # in/excluding are not used anymore
    counts: Dict[Sample, Counter[Id]] = kwargs['counts']
    scores: Dict[Sample, Dict[Id, Score]] = kwargs['scores']
    samples: List[Sample] = kwargs['samples']
    output: io.StringIO = io.StringIO(newline='')

    # Declare/define variables
    summary_counts: Counter[Id] = col.Counter()
    summary_acc: Counter[Id] = col.Counter()
    summary_score: Scores = Scores({})
    summary: Optional[Sample] = None

    output.write(gray('Summary for ') + analysis + gray('... '))

    target_samples: List[Sample] = [
        smpl for smpl in samples if smpl.startswith(analysis)
    ]
    assert len(target_samples) >= 1, \
        red('ERROR! ') + analysis + gray(' has no samples to summarize!')
    for smpl in target_samples:
        summary_counts += counts[smpl]
        summary_score.update(scores[smpl])

    tree = TaxTree()
    tree.grow(ontology=ontology, counts=summary_counts, scores=summary_score)
    tree.subtract()
    tree.shape()
    summary_counts.clear()
    summary_score.clear()
    # Avoid including/excluding here as get_taxa is not as 'clever' as allin1
    #  and taxa are already included/excluded in the derived samples
    tree.get_taxa(counts=summary_counts,
                  accs=summary_acc,
                  scores=summary_score)
    summary_counts = +summary_counts  # remove counts <= 0
    if summary_counts:  # Avoid returning empty sample (summary would be None)
        summary = Sample(f'{analysis}_{STR_SUMMARY}')
        output.write(
            gray('(') + cyan(f'{len(target_samples)}') + gray(' samples)') +
            green(' OK!\n'))
    else:
        output.write(yellow(' VOID\n'))
    # Print output and return
    print(output.getvalue(), end='')
    sys.stdout.flush()
    return summary, summary_counts, summary_acc, summary_score
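
The docstring above notes that summarize_analysis is usually called in parallel, one call per cross-analysis. A minimal sketch of such a driver, assuming a multiprocessing.Pool and a hypothetical summarize_all helper; the analyses, ontology, counts, scores and samples objects are taken for granted here, not defined in the source.

import multiprocessing as mp
from functools import partial

def summarize_all(analyses, ontology, counts, scores, samples):
    """Hypothetical driver: one summarize_analysis call per analysis."""
    worker = partial(summarize_analysis, ontology=ontology, counts=counts,
                     scores=scores, samples=samples)
    with mp.Pool() as pool:
        # Each analysis name becomes args[0] of summarize_analysis
        results = pool.map(worker, analyses)
    return results  # list of (summary, counts, accs, scores) tuples
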
Example No. 2
def summarize_analysis(
        *args,
        **kwargs) -> Tuple[Optional[Sample], Counter[Id], Counter[Id], Scores]:
    """
    Summarize samples for a cross-analysis (usually called in parallel!).
    """
    # Recover input and parameters
    analysis: str = args[0]
    ontology: Ontology = kwargs['ontology']
    including = ontology.including
    excluding = ontology.excluding
    counts: Dict[Sample, Counter[Id]] = kwargs['counts']
    scores: Dict[Sample, Dict[Id, Score]] = kwargs['scores']
    samples: List[Sample] = kwargs['samples']
    output: io.StringIO = io.StringIO(newline='')

    # Declare/define variables
    summary_counts: Counter[Id] = Counter()
    summary_acc: Counter[Id] = Counter()
    summary_score: Scores = Scores({})
    summary: Optional[Sample] = None

    output.write(gray('Summary for ') + analysis + gray('... '))

    target_samples: List[Sample] = [
        smpl for smpl in samples if smpl.startswith(analysis)
    ]
    assert len(target_samples) >= 1, \
        red('ERROR! ') + analysis + gray(' has no samples to summarize!')
    for smpl in target_samples:
        summary_counts += counts[smpl]
        summary_score.update(scores[smpl])

    tree = TaxTree()
    tree.grow(ontology=ontology, counts=summary_counts, scores=summary_score)
    tree.subtract()
    tree.shape()
    summary_counts.clear()
    summary_score.clear()
    tree.get_taxa(counts=summary_counts,
                  accs=summary_acc,
                  scores=summary_score,
                  include=including,
                  exclude=excluding)
    summary_counts = +summary_counts  # remove counts <= 0
    if summary_counts:  # Avoid returning empty sample (summary would be None)
        summary = Sample(f'{analysis}_{STR_SUMMARY}')
        output.write(
            gray('(') + cyan(f'{len(target_samples)}') + gray(' samples)') +
            green(' OK!\n'))
    else:
        output.write(yellow(' VOID\n'))
    # Print output and return
    print(output.getvalue(), end='')
    sys.stdout.flush()
    return summary, summary_counts, summary_acc, summary_score
Example No. 3
def shared_ctrl_analysis():
    """Perform last steps of shared taxa analysis"""
    shared_ctrl_tree: TaxTree = TaxTree()
    shared_ctrl_out: SampleDataById = SampleDataById(
        ['shared', 'accs'])
    shared_ctrl_tree.allin1(ontology=ontology,
                            counts=shared_ctrl_counts,
                            scores=shared_ctrl_score,
                            min_taxa=get_shared_mintaxa(),
                            include=including,
                            exclude=(exclude_candidates -
                                     shared_crossover),
                            out=shared_ctrl_out)
    shared_ctrl_out.purge_counters()
    out_counts: SharedCounter = shared_ctrl_out.get_shared_counts()
    output.write(
        gray(f'  Ctrl-shared: Including {len(out_counts)}'
             ' shared taxa. Generating sample... '))
    if out_counts:
        sample = Sample(f'{STR_CONTROL_SHARED}_{rank.name.lower()}')
        samples.append(sample)
        counts[Sample(sample)] = out_counts
        accs[Sample(sample)] = shared_ctrl_out.get_accs()
        scores[sample] = shared_ctrl_out.get_shared_scores()
        output.write(green('OK!\n'))
    else:
        output.write(yellow('VOID\n'))
Example No. 4
def write_lineage(
    ontology: Ontology,
    parents: Parents,
    names: Dict[Id, str],
    tree: TaxTree,
    lineage_file: str,
    nodes: Counter[Id],
    collapse: bool = True,
) -> str:
    """
    Writes a lineage file understandable by Krona.

    Args:
        ontology: any Ontology object.
        parents: dictionary of parents for every Id.
        names: dictionary of names for every Id.
        tree: a TaxTree structure.
        lineage_file: name of the lineage file
        nodes: a counter for TaxIds
        collapse: This bool controls the collapse of taxid 131567
            (cellular organisms) and is True by default

    Returns: A string with the output messages

    """
    log, taxids_dic = tree.get_lineage(ontology, parents, iter(nodes))
    output: io.StringIO = io.StringIO(newline='')
    output.write(log)
    # TODO: Generalize this for non-NCBI taxonomies (collapse is specific)
    if collapse:  # Collapse taxid 131567 (cellular organisms) if desired
        for tid in taxids_dic:
            if len(taxids_dic[tid]) > 2:  # Not collapse for unclassified
                try:
                    taxids_dic[tid].remove(CELLULAR_ORGANISMS)
                except ValueError:
                    pass
    lineage_dic = {
        tax_id: [names[tid] for tid in taxids_dic[tax_id]]
        for tax_id in taxids_dic
    }
    output.write(f'  \033[90mSaving lineage file {lineage_file} with '
                 f'{len(nodes)} nodes...\033[0m')
    with open(lineage_file, 'w', newline='') as tsv_handle:
        tsvwriter = csv.writer(tsv_handle, dialect='krona')
        tsvwriter.writerow(["#taxID", "Lineage"])
        for tid in nodes:
            counts: str = str(nodes[tid])  # nodes[tid] is an int
            row: List[Union[Id, str]] = [
                counts,
            ]
            row.extend(lineage_dic[tid])
            tsvwriter.writerow(row)
    output.write('\033[92m OK! \033[0m\n')
    return output.getvalue()
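
write_lineage above writes the TSV through csv.writer(..., dialect='krona'), so that dialect must be registered before the call. A minimal sketch, assuming the dialect is simply tab-separated with minimal quoting (the project's actual definition lives elsewhere and may differ):

import csv

# Hypothetical registration of the 'krona' dialect used by write_lineage
csv.register_dialect('krona', delimiter='\t', lineterminator='\n',
                     quoting=csv.QUOTE_MINIMAL)

with open('lineage.tsv', 'w', newline='') as handle:
    writer = csv.writer(handle, dialect='krona')
    writer.writerow(['#taxID', 'Lineage'])
    writer.writerow(['10', 'root', 'Bacteria', 'Proteobacteria'])  # toy row
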
Example No. 5
    def cross_analysis(iteration, raw):
        """Cross analysis: exclusive and part of shared&ctrl"""
        nonlocal shared_counts, shared_score
        nonlocal shared_ctrl_counts, shared_ctrl_score

        def partial_shared_update(i):
            """Perform shared and shared-control taxa partial evaluations"""
            nonlocal shared_counts, shared_score
            nonlocal shared_ctrl_counts, shared_ctrl_score
            if i == 0:  # 1st iteration: Initialize shared abundance and score
                shared_counts.update(sub_shared_counts)
                shared_score.update(sub_shared_score)
            elif i < controls:  # Just update shared abundance and score
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
            elif i == controls:  # Initialize shared-control counters
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
                shared_ctrl_counts.update(sub_shared_counts)
                shared_ctrl_score.update(sub_shared_score)
            elif controls:  # Both: Accumulate shared abundance and score
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score
                shared_ctrl_counts &= sub_shared_counts
                shared_ctrl_score &= sub_shared_score
            else:  # Both: Accumulate shared abundance and score (no controls)
                shared_counts &= sub_shared_counts
                shared_score &= sub_shared_score

        exclude: Set[Id] = set()
        # Get taxids at this rank that are present in the other samples
        for sample in (smpl for smpl in raws if smpl != raw):
            exclude.update(taxids[sample][rank])
        exclude.update(excluding)  # Add explicit excluding taxa if any
        output.write(f'  \033[90mExclusive: From \033[0m{raw}\033[90m '
                     f'excluding {len(exclude)} taxa. '
                     f'Generating sample...\033[0m')

        exclude_tree = TaxTree()
        exclude_out = SampleDataById(['counts', 'scores', 'accs'])
        exclude_tree.allin1(ontology=ontology,
                            counts=counts[raw],
                            scores=scores[raw],
                            min_taxa=mintaxas[raw],
                            min_rank=rank,
                            just_min_rank=True,
                            include=including,
                            exclude=exclude,
                            out=exclude_out)
        exclude_out.purge_counters()
        if exclude_out.counts:  # Avoid adding empty samples
            sample = Sample(f'{raw}_{STR_EXCLUSIVE}_{rank.name.lower()}')
            samples.append(sample)
            counts[sample] = exclude_out.get_counts()
            accs[sample] = exclude_out.get_accs()
            scores[sample] = exclude_out.get_scores()
            output.write('\033[92m OK! \033[0m\n')
        else:
            output.write('\033[93m VOID \033[0m\n')

        # Get partial abundance and score for the shared analysis
        sub_shared_tree = TaxTree()
        sub_shared_out = SampleDataById(['shared', 'accs'])
        sub_shared_tree.allin1(ontology=ontology,
                               counts=counts[raw],
                               scores=scores[raw],
                               min_taxa=mintaxas[raw],
                               min_rank=rank,
                               just_min_rank=True,
                               include=including,
                               exclude=excluding,
                               out=sub_shared_out)
        sub_shared_out.purge_counters()
        # Scale scores by abundance
        sub_shared_counts: SharedCounter = sub_shared_out.get_shared_counts()
        sub_shared_score: SharedCounter = sub_shared_out.get_shared_scores()
        sub_shared_score *= sub_shared_counts
        partial_shared_update(iteration)
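
The partial_shared_update helper above accumulates shared taxa with '&='. Assuming SharedCounter intersects like collections.Counter (keep only keys present in both operands, at their minimum count), this toy standard-library sketch shows the effect of chaining those updates across samples:

from collections import Counter

sample_a = Counter({'562': 10, '1280': 3, '1773': 7})  # made-up taxid counts
sample_b = Counter({'562': 4, '1773': 1})
sample_c = Counter({'562': 6, '1280': 2, '1773': 5})

shared = Counter(sample_a)  # 1st iteration: initialize with the first sample
shared &= sample_b          # later iterations: keep common taxa at their minima
shared &= sample_c
print(shared)               # Counter({'562': 4, '1773': 1})
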
Example No. 6
    def control_analysis():
        """Perform last steps of control and shared controls analysis"""
        nonlocal shared_ctrl_counts, shared_ctrl_score

        def robust_contamination_removal():
            """Implement robust contamination removal algorithm."""
            nonlocal exclude_sets, shared_crossover

            def compute_qn(data: List[float], dist: str = "Gauss") -> float:
                """Compute Qn robust estimator of scale (Rousseeuw, 1993)"""
                c_d: float  # Select d parameter depending on the distribution
                if dist == "Gauss":
                    c_d = 2.2219
                elif dist == "Cauchy":  # Heavy-tailed distribution
                    c_d = 1.2071
                elif dist == "NegExp":  # Negative exponential (asymetric)
                    c_d = 3.4760
                else:
                    raise Exception(red('\nERROR! ') + 'Unknown distribution')
                num: int = len(data)
                sort_data = sorted(data)
                pairwisedifs: List[float] = []
                for (i, x_val) in enumerate(sort_data):
                    for y_val in sort_data[i + 1:]:
                        pairwisedifs.append(abs(x_val - y_val))
                k: int = int(num * (num / 2 + 1) / 4)
                return c_d * sorted(pairwisedifs)[k - 1]

            exclude_sets = {smpl: set() for smpl in raws[controls:]}
            vwrite(
                gray('Robust contamination removal: '
                     'Searching for contaminants...\n'))
            for tid in exclude_candidates:
                relfreq_ctrl: List[float] = [
                    accs[ctrl][tid] / accs[ctrl][ontology.ROOT]
                    for ctrl in raws[:controls]
                ]
                relfreq_smpl: List[float] = [
                    accs[smpl][tid] / accs[smpl][ontology.ROOT]
                    for smpl in raws[controls:]
                ]
                relfreq: List[float] = relfreq_ctrl + relfreq_smpl
                crossover: List[bool]  # Crossover source (yes/no)
                # Just-controls contamination check
                if all([rf < EPS for rf in relfreq_smpl]):
                    vwrite(cyan('just-ctrl:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    continue  # Go for next candidate
                # Critical contamination check
                if all([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(red('critical:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Severe contamination check
                if any([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(yellow('severe: \t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Mild contamination check
                if all([rf > MILD_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(blue('mild cont:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Calculate median and MAD median but including controls
                mdn: float = statistics.median(relfreq)
                # mad:float=statistics.mean([abs(mdn - rf) for rf in relfreq])
                q_n: float = compute_qn(relfreq, dist="NegExp")
                # Calculate crossover in samples
                outlier_lim: float = mdn + ROBUST_XOVER_OUTLIER * q_n
                ordomag_lim: float = max(
                    relfreq_ctrl) * 10**ROBUST_XOVER_ORD_MAG
                crossover = [
                    rf > outlier_lim and rf > ordomag_lim
                    for rf in relfreq[controls:]
                ]
                # Crossover contamination check
                if any(crossover):
                    vwrite(
                        magenta('crossover:\t'), tid, ontology.get_name(tid),
                        green(f'lims: [{outlier_lim:.1g}]' +
                              ('<' if outlier_lim < ordomag_lim else '>') +
                              f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                        fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                        gray('crossover:'), blst2str(crossover), '\n')
                    # Exclude just for contaminated samples (not the source)
                    vwrite(magenta('\t->'), gray(f'Include {tid} just in:'))
                    for i in range(len(raws[controls:])):
                        if not crossover[i]:
                            exclude_sets[raws[i + controls]].add(tid)
                        else:
                            vwrite(f' {raws[i + controls]}')
                    if all(crossover):  # Shared taxon contaminating control(s)
                        vwrite(' (', yellow('Shared crossover taxon!'), ')')
                        shared_crossover.add(tid)
                    vwrite('\n')
                    continue
                # Other contamination: remove from all samples
                vwrite(
                    gray('other cont:\t'), tid, ontology.get_name(tid),
                    green(f'lims: [{outlier_lim:.1g}]' +
                          ('<' if outlier_lim < ordomag_lim else '>') +
                          f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                    fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)

        # Get taxids at this rank that are present in the control samples
        exclude_candidates: Set[Id] = set()
        for i in range(controls):
            exclude_candidates.update(taxids[raws[i]][rank])
        exclude_sets: Dict[Sample, Set[Id]]
        shared_crossover: Set[Id] = set()  # Shared taxa contaminating controls
        if controls and (len(raws) - controls >= ROBUST_MIN_SAMPLES):
            robust_contamination_removal()
        else:  # In this case, just apply strict control
            exclude_sets = {
                file: exclude_candidates
                for file in raws[controls::]
            }
        # Add explicit excluding taxa (if any) to exclude sets
        for exclude_set in exclude_sets.values():
            exclude_set.update(excluding)
        exclude_candidates.update(excluding)
        # Process each sample excluding control taxa
        for raw in raws[controls:]:
            output.write(
                gray('  Ctrl: From') + f' {raw} ' +
                gray(f'excluding {len(exclude_sets[raw])} ctrl taxa. '
                     f'Generating sample... '))
            ctrl_tree = TaxTree()
            ctrl_out = SampleDataById(['counts', 'scores', 'accs'])
            ctrl_tree.allin1(ontology=ontology,
                             counts=counts[raw],
                             scores=scores[raw],
                             min_taxa=mintaxas[raw],
                             min_rank=rank,
                             just_min_rank=True,
                             include=including,
                             exclude=exclude_sets[raw],
                             out=ctrl_out)
            ctrl_out.purge_counters()
            if ctrl_out.counts:  # Avoid adding empty samples
                sample = Sample(f'{raw}_{STR_CONTROL}_{rank.name.lower()}')
                samples.append(sample)
                counts[sample] = ctrl_out.get_counts()
                accs[sample] = ctrl_out.get_accs()
                scores[sample] = ctrl_out.get_scores()
                output.write(green('OK!\n'))
            else:
                output.write(yellow('VOID\n'))

        def shared_ctrl_analysis():
            """Perform last steps of shared taxa analysis"""
            shared_ctrl_tree: TaxTree = TaxTree()
            shared_ctrl_out: SampleDataById = SampleDataById(
                ['shared', 'accs'])
            shared_ctrl_tree.allin1(ontology=ontology,
                                    counts=shared_ctrl_counts,
                                    scores=shared_ctrl_score,
                                    min_taxa=get_shared_mintaxa(),
                                    include=including,
                                    exclude=(exclude_candidates -
                                             shared_crossover),
                                    out=shared_ctrl_out)
            shared_ctrl_out.purge_counters()
            out_counts: SharedCounter = shared_ctrl_out.get_shared_counts()
            output.write(
                gray(f'  Ctrl-shared: Including {len(out_counts)}'
                     ' shared taxa. Generating sample... '))
            if out_counts:
                sample = Sample(f'{STR_CONTROL_SHARED}_{rank.name.lower()}')
                samples.append(sample)
                counts[Sample(sample)] = out_counts
                accs[Sample(sample)] = shared_ctrl_out.get_accs()
                scores[sample] = shared_ctrl_out.get_shared_scores()
                output.write(green('OK!\n'))
            else:
                output.write(yellow('VOID\n'))

        # Shared-control taxa final analysis
        if shared_ctrl_counts:
            # Normalize scaled scores by total abundance
            shared_ctrl_score /= (+shared_ctrl_counts)
            # Get averaged abundance by number of samples minus ctrl samples
            shared_ctrl_counts //= (len(raws) - controls)
            shared_ctrl_analysis()
        else:
            output.write(
                gray('  Ctrl-shared: No taxa! ') + yellow('VOID') +
                gray(' sample.\n'))
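
compute_qn above implements the Qn robust estimator of scale, and the crossover check compares each relative frequency against mdn + ROBUST_XOVER_OUTLIER * Qn. A standalone sketch of that outlier limit on toy relative frequencies; the data and the multiplier 5 are made up, not the project's constants:

import statistics

def qn_scale(data, c_d=3.4760):  # c_d for "NegExp", as in compute_qn above
    srt = sorted(data)
    difs = sorted(abs(x - y) for i, x in enumerate(srt) for y in srt[i + 1:])
    k = int(len(srt) * (len(srt) / 2 + 1) / 4)
    return c_d * difs[k - 1]

relfreq = [1e-5, 2e-5, 1.5e-5, 1.8e-5, 9e-4]   # last value looks contaminated
outlier_lim = statistics.median(relfreq) + 5 * qn_scale(relfreq)
print([rf > outlier_lim for rf in relfreq])    # only the last one is flagged
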
Example No. 7
def main():
    """Main entry point to script."""
    # Argument Parser Configuration
    parser = argparse.ArgumentParser(
        description='Extract reads following Centrifuge/Kraken output',
        epilog=f'%(prog)s  - {__author__} - {__date__}')
    parser.add_argument('-V',
                        '--version',
                        action='version',
                        version=f'%(prog)s release {__version__} ({__date__})')
    parser.add_argument('-f',
                        '--file',
                        action='store',
                        metavar='FILE',
                        required=True,
                        help='Centrifuge output file.')
    parser.add_argument('-l',
                        '--limit',
                        action='store',
                        metavar='NUMBER',
                        type=int,
                        default=None,
                        help=('Limit of FASTQ reads to extract. '
                              'Default: no limit'))
    parser.add_argument(
        '-m',
        '--maxreads',
        action='store',
        metavar='NUMBER',
        type=int,
        default=None,
        help=('Maximum number of FASTQ reads to search for the taxa. '
              'Default: no maximum'))
    parser.add_argument(
        '-n',
        '--nodespath',
        action='store',
        metavar='PATH',
        default=TAXDUMP_PATH,
        help=('path for the nodes information files (nodes.dmp and names.dmp' +
              ' from NCBI)'))
    parser.add_argument(
        '-i',
        '--include',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to include a taxon and all underneath ' +
              '(repeat -i to include several taxids). ' +
              'By default, all taxa are considered for inclusion.'))
    parser.add_argument(
        '-x',
        '--exclude',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to exclude a taxon and all underneath ' +
              '(repeat -x to exclude several taxids)'))
    parser.add_argument(
        '-y',
        '--minscore',
        action='store',
        metavar='NUMBER',
        type=lambda txt: Score(float(txt)),
        default=None,
        help=('minimum score/confidence of the classification of a read '
              'to pass the quality filter; all pass by default'))
    filein = parser.add_mutually_exclusive_group(required=True)
    filein.add_argument('-q',
                        '--fastq',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Single FASTQ file (no paired-ends)')
    filein.add_argument('-1',
                        '--mate1',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Paired-ends FASTQ file for mate 1s '
                        '(filename usually includes _1)')
    parser.add_argument('-2',
                        '--mate2',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Paired-ends FASTQ file for mate 2s '
                        '(filename usually includes _2)')

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} =-= {__date__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    args = parser.parse_args()
    output_file = args.file
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    excluding: Set[TaxId] = set(args.exclude)
    including: Set[TaxId] = set(args.include)
    fastq_1: Filename
    fastq_2: Filename = args.mate2
    if not fastq_2:
        fastq_1 = args.fastq
    else:
        fastq_1 = args.mate1

    # Load NCBI nodes, names and build children
    plasmidfile: Filename = None
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile, False,
                              excluding, including)

    # Build taxonomy tree
    print(gray('Building taxonomy tree...'), end='')
    sys.stdout.flush()
    tree = TaxTree()
    tree.grow(taxonomy=ncbi, look_ancestors=False)
    print(green(' OK!'))

    # Get the taxa
    print(gray('Filtering taxa...'), end='')
    sys.stdout.flush()
    ranks: Ranks = Ranks({})
    tree.get_taxa(ranks=ranks, include=including, exclude=excluding)
    print(green(' OK!'))
    taxids: Set[TaxId] = set(ranks)
    taxlevels: TaxLevels = Rank.ranks_to_taxlevels(ranks)
    num_taxlevels = Counter({rank: len(taxlevels[rank]) for rank in taxlevels})
    num_taxlevels = +num_taxlevels

    # Statistics about including taxa
    print(f'  {len(taxids)}\033[90m taxid selected in \033[0m', end='')
    print(f'{len(num_taxlevels)}\033[90m different taxonomical levels:\033[0m')
    for rank in num_taxlevels:
        print(f'  Number of different {rank}: {num_taxlevels[rank]}')
    assert taxids, red('ERROR! No taxids to search for!')

    # Get the records
    records: List[SeqRecord] = []
    num_seqs: int = 0
    # timing initialization
    start_time_load: float = time.perf_counter()
    print(gray(f'Loading output file {output_file}...'), end='')
    sys.stdout.flush()
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for num_seqs, record in enumerate(SeqIO.parse(file, 'centrifuge')):
                tid: TaxId = record.annotations['taxID']
                if tid not in taxids:
                    continue  # Ignore read if low confidence
                score: Score = Score(record.annotations['score'])
                if args.minscore is not None and score < args.minscore:
                    continue
                records.append(record)
    except FileNotFoundError:
        raise Exception(red('ERROR!') + 'Cannot read "' + output_file + '"')
    print(green(' OK!'))

    # Basic records statistics
    print(
        gray('  Load elapsed time: ') +
        f'{time.perf_counter() - start_time_load:.3g}' + gray(' sec'))
    print(f'  \033[90mMatching reads: \033[0m{len(records):_d} \033[90m\t'
          f'(\033[0m{len(records)/num_seqs:.4%}\033[90m of sample)')
    sys.stdout.flush()

    # FASTQ sequence dealing
    # records_ids: List[SeqRecord] = [record.id for record in records]
    records_ids: Set[SeqRecord] = {record.id for record in records}
    seqs1: List[SeqRecord] = []
    seqs2: List[SeqRecord] = []
    extracted: int = 0
    i: int = 0
    if fastq_2:
        print(
            f'\033[90mLoading FASTQ files {fastq_1} and {fastq_2}...\n'
            f'Mseqs: \033[0m',
            end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1, open(fastq_2, 'r') as file2:
                for i, (rec1, rec2) in enumerate(
                        zip(SeqIO.parse(file1, 'quickfastq'),
                            SeqIO.parse(file2, 'quickfastq'))):
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit
                                                 and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        seqs2.append(rec2)
                        extracted += 1

        except FileNotFoundError:
            raise Exception('\n\033[91mERROR!\033[0m Cannot read FASTQ files')
    else:
        print(f'\033[90mLoading FASTQ files {fastq_1}...\n'
              f'Mseqs: \033[0m',
              end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1:
                for i, rec1 in enumerate(SeqIO.parse(file1, 'quickfastq')):
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit
                                                 and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        extracted += 1
        except FileNotFoundError:
            raise Exception('\n\033[91mERROR!\033[0m Cannot read FASTQ file')
    print(cyan(f' {i/1e+6:.3g} Mseqs'), green('OK! '))

    def format_filename(fastq: Filename) -> Filename:
        """Auxiliary function to properly format the output filenames.

        Args:
            fastq: Complete filename of the fastq input file

        Returns: Filename of the rextracted fastq output file
        """
        fastq_filename, _ = os.path.splitext(fastq)
        output_list: List[str] = [fastq_filename, '_rxtr']
        if including:
            output_list.append('_incl')
            output_list.append('_'.join(including))
        if excluding:
            output_list.append('_excl')
            output_list.append('_'.join(excluding))
        output_list.append('.fastq')
        return Filename(''.join(output_list))

    filename1: Filename = format_filename(fastq_1)
    SeqIO.write(seqs1, filename1, 'quickfastq')
    print(gray('Wrote'), magenta(f'{len(seqs1)}'), gray('reads in'), filename1)
    if fastq_2:
        filename2: Filename = format_filename(fastq_2)
        SeqIO.write(seqs2, filename2, 'quickfastq')
        print(gray('Wrote'), magenta(f'{len(seqs2)}'), gray('reads in'),
              filename2)

    # Timing results
    print(gray('Total elapsed time:'),
          time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
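
The FASTQ loops above test each read id against records_ids with set.remove inside try/except, so a hit both selects the read and shrinks the set, and the loop can stop as soon as the set is empty. A toy sketch of that idiom with fabricated records instead of SeqIO.parse:

wanted = {'read3', 'read7'}                          # ids still to be extracted
stream = [(f'read{i}', 'ACGT') for i in range(10)]   # stand-in for SeqIO.parse
extracted = []
for rec_id, seq in stream:
    if not wanted:        # everything already found: stop early
        break
    try:
        wanted.remove(rec_id)
    except KeyError:
        continue          # not a read we are looking for
    extracted.append((rec_id, seq))
print(extracted)          # [('read3', 'ACGT'), ('read7', 'ACGT')]
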
Example No. 8
def process_report(
        *args, **kwargs
) -> Tuple[Sample, TaxTree, SampleDataByTaxId, SampleStats, Err]:
    """
    Process Centrifuge/Kraken report files (usually called in parallel!).
    """
    # TODO: Full review to report support
    # Recover input and parameters
    filerep: Filename = args[0]
    taxonomy: Taxonomy = kwargs['taxonomy']
    mintaxa: int = kwargs['mintaxa']
    collapse: bool = taxonomy.collapse
    including: Set[TaxId] = taxonomy.including
    excluding: Set[TaxId] = taxonomy.excluding
    debug: bool = kwargs['debug']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(filerep)

    # Read Centrifuge/Kraken report file to get abundances
    log: str
    abundances: Counter[TaxId]
    log, abundances, _ = read_report(filerep)
    output.write(log)
    # Remove root counts, in case
    if kwargs['root']:
        vwrite(gray('Removing'), abundances[ROOT], gray('"ROOT" reads... '))
        abundances[ROOT] = 0
        vwrite(green('OK!'), '\n')

    # Build taxonomy tree
    output.write('  \033[90mBuilding taxonomy tree...\033[0m')
    tree = TaxTree()
    tree.grow(taxonomy=taxonomy,
              counts=abundances)  # Grow tax tree from root node
    output.write('\033[92m OK! \033[0m\n')

    # Prune the tree
    output.write('  \033[90mPruning taxonomy tree...\033[0m')
    tree.prune(mintaxa, None, collapse, debug)
    tree.shape()
    output.write('\033[92m OK! \033[0m\n')

    # Get the taxa with their abundances and taxonomical levels
    output.write('  \033[90mFiltering taxa...\033[0m')
    new_abund: Counter[TaxId] = col.Counter()
    new_accs: Counter[TaxId] = col.Counter()
    ranks: Ranks = Ranks({})
    tree.get_taxa(abundance=new_abund,
                  accs=new_accs,
                  ranks=ranks,
                  mindepth=0,
                  maxdepth=0,
                  include=including,
                  exclude=excluding)
    new_abund = +new_abund  # remove zero and negative counts
    if including or excluding:  # Recalculate accumulated counts
        new_tree = TaxTree()
        new_tree.grow(taxonomy, new_abund)  # Grow tree with new abund
        new_tree.shape()
        new_abund = col.Counter()  # Reset abundances
        new_accs = col.Counter()  # Reset accumulated
        new_tree.get_taxa(new_abund, new_accs)  # Get new accumulated counts
    out: SampleDataByTaxId = SampleDataByTaxId()
    out.set(counts=new_abund, ranks=ranks, accs=new_accs)
    output.write('\033[92m OK! \033[0m\n')
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, SampleStats(), Err.NO_ERROR
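
The new_accs counter above holds accumulated counts: the reads assigned to a taxon plus those of all its descendants. A hypothetical mini-example of that quantity computed directly from a parents mapping (toy taxonomy and counts, not the TaxTree API):

from collections import Counter

parents = {'B': 'A', 'C': 'A', 'D': 'B'}        # 'A' is the root
counts = Counter({'A': 1, 'B': 2, 'C': 3, 'D': 4})

accs = Counter()
for taxid, num in counts.items():
    node = taxid
    while True:
        accs[node] += num                       # credit the node and ancestors
        if node not in parents:
            break
        node = parents[node]
print(accs)  # Counter({'A': 10, 'B': 6, 'D': 4, 'C': 3})
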
Example No. 9
def process_output(
        *args, **kwargs
) -> Tuple[Sample, TaxTree, SampleDataByTaxId, SampleStats, Err]:
    """
    Process Centrifuge/LMAT output files (usually called in parallel!).
    """
    # timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    taxonomy: Taxonomy = kwargs['taxonomy']
    mintaxa: int = kwargs['ctrlmintaxa'] if is_ctrl else kwargs['mintaxa']
    minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore']
    including: Set[TaxId] = taxonomy.including
    excluding: Set[TaxId] = taxonomy.excluding
    scoring: Scoring = kwargs['scoring']
    lmat: bool = kwargs['lmat']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read Centrifuge/LMAT output files to get abundances
    read_method: Callable[[Filename, Scoring, Optional[Score]],  # Input
                          Tuple[str, SampleStats, Counter[TaxId],
                                Dict[TaxId, Score]]  # Output
                          ]
    if lmat:
        read_method = read_lmat_output
    else:
        read_method = read_output
    log: str
    counts: Counter[TaxId]
    scores: Dict[TaxId, Score]
    log, stat, counts, scores = read_method(target_file, scoring, minscore)
    output.write(log)
    # Update field in stat about control nature of the sample
    stat.is_ctrl = is_ctrl
    # Move cellular_organisms counts to root, in case
    if taxonomy.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... '))
        if counts[ROOT]:
            stat.num_taxa -= 1
            scores[ROOT] = (
                (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS] +
                 scores[ROOT] * counts[ROOT]) /
                (counts[CELLULAR_ORGANISMS] + counts[ROOT]))
        else:
            scores[ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, in case
    if kwargs['root'] and counts[ROOT]:
        vwrite(gray('Removing'), counts[ROOT], gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt - counts[ROOT])
        stat.num_taxa -= 1
        counts[ROOT] = 0
        scores[ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')

    # Building taxonomy tree
    output.write(gray('Building from raw data... '))
    vwrite(gray('\n  Building taxonomy tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[TaxId]
    orphans: Set[TaxId]
    ancestors, orphans = taxonomy.get_ancestors(counts.keys())
    out = SampleDataByTaxId(['all'])
    tree.allin1(taxonomy=taxonomy,
                counts=counts,
                scores=scores,
                ancestors=ancestors,
                min_taxa=mintaxa,
                include=including,
                exclude=excluding,
                out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')

    # Give stats about orphan taxid
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('Warning!'), f'Orphan taxid={orphan}\n')
                lost += counts[orphan]
            vwrite(
                yellow('WARNING!'), f'{len(orphans)} orphan taxids ('
                f'{len(orphans)/len(counts):.2%} of total)\n'
                f'{lost} orphan sequences ('
                f'{lost/sum(counts.values()):.3%} of total)\n')
        else:
            vwrite(green('OK!\n'))
    # Check the loss of taxids (typically plasmids) under some conditions
    if debug and not excluding and not including:
        vwrite(gray('  Additional checking of taxid loss... '))
        lost = 0
        for taxid in counts:
            if not out.counts[taxid]:
                lost += 1
                vwrite(yellow('Warning!'), f'Lost taxid={taxid}: '
                       f'{taxonomy.get_name(taxid)}\n')
        if lost:
            vwrite(
                yellow('WARNING!'), f'Lost {lost} taxids ('
                f'{lost/len(counts):.2%} of total)'
                '\n')
        else:
            vwrite(green('OK!\n'))

    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ') +
                     green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE

    # Timing results
    output.write(
        gray('Load elapsed time: ') +
        f'{time.perf_counter() - start_time:.3g}' + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
Example No. 10
def process_output(
        *args,
        **kwargs) -> Tuple[Sample, TaxTree, SampleDataById, SampleStats, Err]:
    """
    Process classifier output files (usually called in parallel!).
    """
    # timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    ontology: Ontology = kwargs['ontology']
    mintaxa: Optional[int] = (kwargs['ctrlmintaxa']
                              if is_ctrl else kwargs['mintaxa'])
    minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore']
    including: Union[Tuple, Set[Id]] = ontology.including
    excluding: Union[Tuple, Set[Id]] = ontology.excluding
    scoring: Scoring = kwargs['scoring']
    classifier: Classifier = kwargs['classifier']
    genfmt: GenericFormat = kwargs['genfmt']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read taxonomic classifier output files to get abundances
    read_method: Callable[  # Format: [[Input], Output]
        [Filename, Scoring, Optional[Score]],
        Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]]
    log: str
    stat: SampleStats
    counts: Counter[Id]
    scores: Dict[Id, Score]
    if classifier is Classifier.GENERIC:  # Direct call to generic method
        log, stat, counts, scores = read_generic_output(
            target_file, scoring, minscore, genfmt)
    else:  # Use read_method
        if classifier is Classifier.KRAKEN:
            read_method = read_kraken_output
        elif classifier is Classifier.CLARK:
            read_method = read_clark_output
        elif classifier is Classifier.LMAT:
            read_method = read_lmat_output
        elif classifier is Classifier.CENTRIFUGE:
            read_method = read_output
        else:
            raise Exception(red('\nERROR!'),
                            f'taxclass: Unknown classifier "{classifier}".')
        log, stat, counts, scores = read_method(target_file, scoring, minscore)
    output.write(log)
    # Complete/Update fields in stats
    stat.is_ctrl = is_ctrl  # set control nature of the sample
    if mintaxa is not None:  # manual mintaxa has precedence over automatic
        stat.mintaxa = mintaxa
    else:  # update local value with the automatically guessed value
        mintaxa = stat.mintaxa
    # Move cellular_organisms counts to root, in case
    if ontology.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... \n'))
        if counts[ontology.ROOT]:
            stat.decrease_filtered_taxids()
            scores[ontology.ROOT] = Score(
                (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS] +
                 scores[ontology.ROOT] * counts[ontology.ROOT]) /
                (counts[CELLULAR_ORGANISMS] + counts[ontology.ROOT]))
        else:
            scores[ontology.ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ontology.ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, in case
    if kwargs['root'] and counts[ontology.ROOT]:
        vwrite(gray('Removing'), counts[ontology.ROOT],
               gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt -
                                     counts[ontology.ROOT])
        stat.decrease_filtered_taxids()
        counts[ontology.ROOT] = 0
        scores[ontology.ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')

    # Building ontology tree
    output.write(
        gray('Building from raw data with mintaxa = ') + f'{mintaxa:_d}' +
        gray(' ... \n'))
    vwrite(gray('  Building ontology tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[Id]
    orphans: Set[Id]
    ancestors, orphans = ontology.get_ancestors(counts.keys())
    out = SampleDataById(['all'])
    tree.allin1(ontology=ontology,
                counts=counts,
                scores=scores,
                ancestors=ancestors,
                min_taxa=mintaxa,
                include=including,
                exclude=excluding,
                out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')

    # Stats: Complete final value for TaxIDs after tree building and folding
    final_taxids: int = len(out.counts) if out.counts is not None else 0
    stat.set_final_taxids(final_taxids)

    # Check for additional loss of reads (due to include/exclude and orphans)
    output.write(gray('  Check for more seqs lost ([in/ex]clude affects)... '))
    if out.counts is not None:
        discard: int = sum(counts.values()) - sum(out.counts.values())
        if discard:
            output.write(
                blue('\n  Info:') + f' {discard} ' +
                gray('additional seqs discarded (') +
                f'{discard/sum(counts.values()):.3%} ' +
                gray('of accepted)\n'))
        else:
            output.write(green('OK!\n'))
    else:
        output.write(red('No counts in sample tree!\n'))
    # Warn or give detailed stats about orphan taxid and orphan seqs
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('  Warning!'), gray('Orphan taxid'),
                       f'{orphan}\n')
                lost += counts[orphan]
            vwrite(
                yellow('  WARNING!'), f'{len(orphans)} orphan taxids ('
                f'{len(orphans)/len(counts):.2%} of accepted)\n'
                f'    and {lost} orphan sequences ('
                f'{lost/sum(counts.values()):.3%} of accepted)\n')
        else:
            vwrite(green('OK!\n'))
    elif orphans:
        output.write(
            yellow('\n  Warning!') + f' {len(orphans)} orphan taxids' +
            gray(' (rerun with --debug for details)\n'))
    # Check the removal of TaxIDs (accumulation of leaves in parents)
    if debug and not excluding and including == {ontology.ROOT}:
        vwrite(gray('  Assess accumulation due to "folding the tree"...\n'))
        migrated: int = 0
        if out.counts is not None:
            for taxid in counts:
                if out.counts[taxid] == 0:
                    migrated += 1
                    vwrite(
                        blue('  Info:'),
                        gray(f'Folded TaxID {taxid} (') +
                        f'{ontology.get_name(taxid)}' + gray(') with ') +
                        f'{counts[taxid]}' + gray(' original seqs\n'))
        if migrated:
            vwrite(
                blue('  INFO:'), f'{migrated} TaxIDs folded ('
                f'{migrated/len(+counts):.2%} of TAF —TaxIDs after filtering—)'
                '\n')
            vwrite(
                blue('  INFO:'), f'Final assigned TaxIDs: {final_taxids} '
                f'(reduced to {final_taxids/len(+counts):.2%} of '
                'number of TAF)\n')
        else:
            vwrite(blue('  INFO:'), gray('No migration!'), green('OK!\n'))
    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ') +
                     green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE

    # Timing results
    output.write(
        gray('Load elapsed time: ') +
        f'{time.perf_counter() - start_time:.3g}' + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
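
When collapsing CELLULAR_ORGANISMS into ROOT, the code above pools the two scores with a count-weighted mean. A toy check of that arithmetic with made-up numbers:

counts = {'root': 100, 'cellular': 300}
scores = {'root': 0.8, 'cellular': 0.6}
total = counts['cellular'] + counts['root']
merged = (scores['cellular'] * counts['cellular'] +
          scores['root'] * counts['root']) / total
print(merged)  # 0.65, the mean score of the 400 reads now assigned to root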