def _debug_dummy_plot(
    taxonomy: Taxonomy,
    htmlfile: Filename,
    scoring: Scoring = Scoring.SHEL,
):
    """

    Generate dummy Krona plot via Krona 2.0 XML spec and exit

    """
    print(gray(f'Generating dummy Krona plot {htmlfile}...'), end='')
    sys.stdout.flush()
    samples: List[Sample] = [
        Sample('SINGLE'),
    ]
    krona: KronaTree = KronaTree(
        samples,
        min_score=Score(35),
        max_score=Score(100),
        scoring=scoring,
    )
    polytree: MultiTree = MultiTree(samples=samples)
    polytree.grow(ontology=taxonomy)
    polytree.toxml(ontology=taxonomy, krona=krona)
    krona.tohtml(htmlfile, pretty=True)
    print(green('OK!'))
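    # NOTE: generate_krona below appears to be a closure pasted in from the
    #  enclosing analysis script: besides the locals above, it relies on names
    #  defined there (raw_samples, stats, scores, counts, accs, ncbi), so it
    #  is not runnable standalone.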
    def generate_krona():
        """Generate Krona plot with all the results via Krona 2.0 XML spec"""

        print(gray('\nBuilding the taxonomy multiple tree... '), end='')
        sys.stdout.flush()
        krona: KronaTree = KronaTree(
            samples,
            num_raw_samples=len(raw_samples),
            stats=stats,
            min_score=Score(
                min([
                    min(scores[sample].values()) for sample in samples
                    if len(scores[sample])
                ])),
            max_score=Score(
                max([
                    max(scores[sample].values()) for sample in samples
                    if len(scores[sample])
                ])),
            scoring=scoring,
        )
        polytree.grow(ontology=ncbi,
                      abundances=counts,
                      accs=accs,
                      scores=scores)
        print(green('OK!'))
        print(gray('Generating final plot (') + magenta(htmlfile) +
              gray(')... '),
              end='')
        sys.stdout.flush()
        polytree.toxml(ontology=ncbi, krona=krona)
        krona.tohtml(htmlfile, pretty=False)
        print(green('OK!'))
Example #3
def swmean(cnt1: int, sco1: Score, cnt2: int,
           sco2: Score) -> Score:
    """Weighted mean of scores by counts"""
    if sco1 == NO_SCORE:
        return sco2
    elif sco2 == NO_SCORE:
        return sco1
    return Score((cnt1 * sco1 + cnt2 * sco2) / (cnt1 + cnt2))
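# Hypothetical sanity check of the weighted mean:
#   swmean(3, Score(10.0), 1, Score(30.0)) == Score(15.0)
# and any count paired with NO_SCORE simply returns the other score.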
Example #4
def read_output(
    output_file: Filename,
    scoring: Scoring = Scoring.SHEL,
    minscore: Optional[Score] = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read Centrifuge output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last parsing error
    num_errors: int = 0  # Number of reads discarded due to error

    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for output_line in file:
                try:
                    _, _, _tid, _score, _, _, _length, *_ = output_line.split(
                        '\t')
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                tid = Id(_tid)
                try:
                    # From Centrifuge score get "single hit equivalent length"
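                    # (Centrifuge presumably scores a hit roughly as
                    #  (match_len - 15)**2, so sqrt(score) + 15 recovers an
                    #  equivalent match length in bases)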
                    shel = Score(float(_score)**0.5 + 15)
                    length = int(_length)
                except ValueError:
                    print(yellow('Failure'), f'parsing score ({_score}) for ',
                          f'query length {_length} for taxid {_tid}',
                          f'in {output_file}. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                if tid == UNCLASSIFIED:  # Just count unclassified reads
                    num_uncl += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and shel < minscore:
                    continue  # Ignore read if low confidence
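                # Accumulate per-taxid score and length lists (plain-dict
                #  equivalent of a defaultdict(list))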
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [
                        shel,
                    ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [
                        length,
                    ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    scores=all_scores,
                                    lens=all_length,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f}' + gray(', max = ') +
        f'{stat.sco.maxi:.1f}' + gray(', avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Length: min = ') + f'{stat.len.mini}' + gray(', max = ') +
        f'{stat.len.maxi}' + gray(', avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        print(red('ERROR!'), f'Centrifuge: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
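# A hypothetical call, assuming a Centrifuge per-read output file on disk:
#   log, stats, counts, scores = read_output(Filename('sample.out'),
#                                            scoring=Scoring.SHEL,
#                                            minscore=Score(35))
#   sys.stdout.write(log)  # the report assembled in the StringIO above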
Example #5
def read_clark_output(
    output_file: Filename,
    scoring: Scoring = Scoring.CLARK_C,
    minscore: Optional[Score] = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read CLARK(-l)(-S) full mode output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_confs: Dict[Id, List[Score]] = {}
    all_gammas: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last parsing error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split(',')
            if len(header) != 8:
                print(
                    red('\nERROR! ') + 'CLARK output format of ',
                    yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'ID,Length,Gamma,1st,score1,2nd,score2,conf')
                print(magenta('Found:'), ','.join(header), end='')
                print(blue('HINT:'), 'Use CLARK, CLARK-l, or CLARK-S '
                      'with full mode (', blue('-m 0'), ')')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_label, _length, _gamma, _tid1, _score1, _tid2, _score2,
                     _conf) = output_line.split(',')
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = int(_length)
                    gamma: Score = Score(float(_gamma))
                    tid1: Id = Id(_tid1)
                    score1: Score = Score(float(_score1))
                    tid2: Id = Id(_tid2)
                    score2: Score = Score(float(_score2))
                    conf: Score = Score(float(_conf))
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                # Select tid and score between CLARK assignments 1 and 2
                tid: Id = tid1
                score: Score = score1
                if tid1 == UNCLASSIFIED:
                    if tid2 == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    else:  # Majority of read unclassified
                        tid = tid2
                        score = score2
                        conf = Score(1 - conf)  # Get CLARK's h2/(h1+h2)
                # From CLARK_C(S) score get "single hit equivalent length"
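                # (the CLARK hit count is in k-mers, so adding the assumed
                #  K_MER_SIZE approximates the matched length in bases)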
                shel: Score = Score(score + K_MER_SIZE)
                taxids.add(tid)  # Save all the selected tids (tid1 or tid2)
                if minscore is not None:  # Decide if ignore read if low score
                    if scoring is Scoring.CLARK_C:
                        if conf < minscore:
                            continue
                    elif scoring is Scoring.CLARK_G:
                        if gamma < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [
                        shel,
                    ]
                try:
                    all_confs[tid].append(conf)
                except KeyError:
                    all_confs[tid] = [
                        conf,
                    ]
                try:
                    all_gammas[tid].append(gamma)
                except KeyError:
                    all_gammas[tid] = [
                        gamma,
                    ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [
                        length,
                    ]

    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    lens=all_length,
                                    scores=all_scores,
                                    scores2=all_confs,
                                    scores3=all_gammas,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Hit (score): min = ') + f'{stat.sco.mini:.1f},' +
        gray(' max = ') + f'{stat.sco.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Conf. score: min = ') + f'{stat.sco2.mini:.1f},' +
        gray(' max = ') + f'{stat.sco2.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco2.mean:.1f}\n')
    output.write(
        gray('  Gamma score: min = ') + f'{stat.sco3.mini:.1f},' +
        gray(' max = ') + f'{stat.sco3.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco3.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.CLARK_C:
        out_scores = {
            tid: Score(mean(all_confs[tid]) * 100)
            for tid in all_confs
        }
    elif scoring is Scoring.CLARK_G:
        out_scores = {tid: Score(mean(all_gammas[tid])) for tid in all_gammas}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        print(red('ERROR!'), f'clark: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
Example #6
def read_lmat_output(
    output_file: Filename,
    scoring: Scoring = Scoring.LMAT,
    minscore: Optional[Score] = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read LMAT output (iterate over all the output files)

    Args:
        output_file: output file name (prefix)
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    nt_read: int = 0
    matchings: Counter[Match] = Counter()
    output_files: List[Filename] = []
    # Select files to process depending on if the output files are explicitly
    #  given or directory name is provided (all the output files there)
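    #  (e.g. a hypothetical 'sample1_output.fastq.out' would be selected)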
    if os.path.isdir(output_file):  # Just the directory name is provided
        dirname = os.path.normpath(output_file)
        for file in os.listdir(dirname):  # Add all LMAT output files in dir
            if ('_output' in file and file.endswith('.out')
                    and 'canVfin' not in file and 'pyLCA' not in file):
                output_files.append(Filename(file))
    else:  # Explicit path and file name prefix is given
        dirname, basename = os.path.split(output_file)
        for file in os.listdir(dirname):  # Add selected output files in dir
            if (file.startswith(basename) and file.endswith('.out')
                    and 'canVfin' not in file and 'pyLCA' not in file):
                output_files.append(Filename(file))
    if not output_files:
        raise Exception(
            f'\n\033[91mERROR!\033[0m Cannot read from "{output_file}"')
    # Read LMAT output files
    for output_name in output_files:
        path: Filename = Filename(os.path.join(dirname, output_name))
        output.write(f'\033[90mLoading output file {path}...\033[0m')
        try:
            with open(path, 'r') as io_file:
                for seq in SeqIO.parse(io_file, "lmat"):
                    tid: Id = seq.annotations['final_taxid']
                    score: Score = seq.annotations['final_score']
                    match: Match = Match.lmat(seq.annotations['final_match'])
                    matchings[match] += 1
                    length: int = len(seq)
                    nt_read += length
                    if minscore is not None:
                        if score < minscore:  # Ignore read if low score
                            continue
                    if match in [Match.DIRECT, Match.MULTI]:
                        try:
                            all_scores[tid].append(score)
                        except KeyError:
                            all_scores[tid] = [
                                score,
                            ]
                        try:
                            all_length[tid].append(length)
                        except KeyError:
                            all_length[tid] = [
                                length,
                            ]
        except FileNotFoundError:
            raise Exception(red('\nERROR! ') + f'Cannot read "{path}"')
        output.write(green('OK!\n'))
    abundances: Counter[Id] = Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    # Basic output statistics
    read_seqs: int = sum(matchings.values())
    if read_seqs == 0:
        raise Exception(
            red('\nERROR! ') + f'Cannot read any sequence from"{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    scores=all_scores,
                                    lens=all_length,
                                    seq_read=read_seqs,
                                    seq_filt=filt_seqs,
                                    seq_clas=matchings[Match.DIRECT] +
                                    matchings[Match.MULTI])
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    multi_rel: float = matchings[Match.MULTI] / read_seqs
    direct_rel: float = matchings[Match.DIRECT] / read_seqs
    nodbhits_rel: float = matchings[Match.NODBHITS] / read_seqs
    tooshort_rel: float = matchings[Match.READTOOSHORT] / read_seqs
    lowscore_rel: float = matchings[Match.LOWSCORE] / read_seqs
    output.write(f'\033[90m  DB Matching: '
                 f'Multi =\033[0m {multi_rel:.1%}\033[90m  '
                 f'Direct =\033[0m {direct_rel:.1%}\033[90m  '
                 f'ReadTooShort =\033[0m {tooshort_rel:.1%}\033[90m  '
                 f'LowScore =\033[0m {lowscore_rel:.1%}\033[90m  '
                 f'NoDbHits =\033[0m {nodbhits_rel:.1%}\033[90m\n')
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f},' + gray(' max = ') +
        f'{stat.sco.maxi:.1f},' + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(f'  {stat.num_taxa}' + gray(' taxa with assigned reads\n'))
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.LMAT:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    else:
        print(red('ERROR!'), f'LMAT: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, abundances, out_scores
Example #7
def main():
    """Main entry point to script."""
    # Argument Parser Configuration
    parser = argparse.ArgumentParser(
        description='Extract reads following Centrifuge/Kraken output',
        epilog=f'%(prog)s  - {__author__} - {__date__}')
    parser.add_argument('-V',
                        '--version',
                        action='version',
                        version=f'%(prog)s release {__version__} ({__date__})')
    parser.add_argument('-f',
                        '--file',
                        action='store',
                        metavar='FILE',
                        required=True,
                        help='Centrifuge output file.')
    parser.add_argument('-l',
                        '--limit',
                        action='store',
                        metavar='NUMBER',
                        type=int,
                        default=None,
                        help=('Limit of FASTQ reads to extract. '
                              'Default: no limit'))
    parser.add_argument(
        '-m',
        '--maxreads',
        action='store',
        metavar='NUMBER',
        type=int,
        default=None,
        help=('Maximum number of FASTQ reads to search for the taxa. '
              'Default: no maximum'))
    parser.add_argument(
        '-n',
        '--nodespath',
        action='store',
        metavar='PATH',
        default=TAXDUMP_PATH,
        help=('path for the nodes information files (nodes.dmp and names.dmp' +
              ' from NCBI)'))
    parser.add_argument(
        '-i',
        '--include',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to include a taxon and all underneath ' +
              '(multiple -i is available to include several taxid). ' +
              'By default all the taxa are considered for inclusion.'))
    parser.add_argument(
        '-x',
        '--exclude',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to exclude a taxon and all underneath ' +
              '(multiple -x is available to exclude several taxid)'))
    parser.add_argument(
        '-y',
        '--minscore',
        action='store',
        metavar='NUMBER',
        type=lambda txt: Score(float(txt)),
        default=None,
        help=('minimum score/confidence of the classification of a read '
              'to pass the quality filter; all pass by default'))
    filein = parser.add_mutually_exclusive_group(required=True)
    filein.add_argument('-q',
                        '--fastq',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Single FASTQ file (no paired-ends)')
    filein.add_argument('-1',
                        '--mate1',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Paired-ends FASTQ file for mate 1s '
                        '(filename usually includes _1)')
    parser.add_argument('-2',
                        '--mate2',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Paired-ends FASTQ file for mate 2s '
                        '(filename usually includes _2)')

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} =-= {__date__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    args = parser.parse_args()
    output_file = args.file
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    excluding: Set[TaxId] = set(args.exclude)
    including: Set[TaxId] = set(args.include)
    fastq_1: Filename
    fastq_2: Filename = args.mate2
    if not fastq_2:
        fastq_1 = args.fastq
    else:
        fastq_1 = args.mate1

    # Load NCBI nodes, names and build children
    plasmidfile: Optional[Filename] = None
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile, False,
                              excluding, including)

    # Build taxonomy tree
    print(gray('Building taxonomy tree...'), end='')
    sys.stdout.flush()
    tree = TaxTree()
    tree.grow(taxonomy=ncbi, look_ancestors=False)
    print(green(' OK!'))

    # Get the taxa
    print(gray('Filtering taxa...'), end='')
    sys.stdout.flush()
    ranks: Ranks = Ranks({})
    tree.get_taxa(ranks=ranks, include=including, exclude=excluding)
    print(green(' OK!'))
    taxids: Set[TaxId] = set(ranks)
    taxlevels: TaxLevels = Rank.ranks_to_taxlevels(ranks)
    num_taxlevels = Counter({rank: len(taxlevels[rank]) for rank in taxlevels})
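    # Unary plus on a Counter keeps only the positive counts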
    num_taxlevels = +num_taxlevels

    # Statistics about including taxa
    print(f'  {len(taxids)}\033[90m taxid selected in \033[0m', end='')
    print(f'{len(num_taxlevels)}\033[90m different taxonomical levels:\033[0m')
    for rank in num_taxlevels:
        print(f'  Number of different {rank}: {num_taxlevels[rank]}')
    assert taxids, red('ERROR! No taxids to search for!')

    # Get the records
    records: List[SeqRecord] = []
    num_seqs: int = 0
    # timing initialization
    start_time_load: float = time.perf_counter()
    print(gray(f'Loading output file {output_file}...'), end='')
    sys.stdout.flush()
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for num_seqs, record in enumerate(SeqIO.parse(file, 'centrifuge')):
                tid: TaxId = record.annotations['taxID']
                if tid not in taxids:
                    continue  # Ignore read if low confidence
                score: Score = Score(record.annotations['score'])
                if args.minscore is not None and score < args.minscore:
                    continue
                records.append(record)
    except FileNotFoundError:
        raise Exception(red('ERROR!') + 'Cannot read "' + output_file + '"')
    print(green(' OK!'))

    # Basic records statistics
    print(
        gray('  Load elapsed time: ') +
        f'{time.perf_counter() - start_time_load:.3g}' + gray(' sec'))
    print(f'  \033[90mMatching reads: \033[0m{len(records):_d} \033[90m\t'
          f'(\033[0m{len(records)/(num_seqs + 1):.4%}\033[90m of sample)')
    sys.stdout.flush()

    # FASTQ sequence dealing
    # records_ids: List[str] = [record.id for record in records]
    records_ids: Set[str] = {record.id for record in records}
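    # (a set gives O(1) remove() in the extraction loops below)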
    seqs1: List[SeqRecord] = []
    seqs2: List[SeqRecord] = []
    extracted: int = 0
    i: int = 0
    if fastq_2:
        print(
            f'\033[90mLoading FASTQ files {fastq_1} and {fastq_2}...\n'
            f'Mseqs: \033[0m',
            end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1, open(fastq_2, 'r') as file2:
                for i, (rec1, rec2) in enumerate(
                        zip(SeqIO.parse(file1, 'quickfastq'),
                            SeqIO.parse(file2, 'quickfastq'))):
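                    # Stop once every wanted read was extracted or the scan
                    #  (maxreads) or extraction (limit) limits are reached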
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit
                                                 and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        seqs2.append(rec2)
                        extracted += 1

        except FileNotFoundError:
            raise Exception('\n\033[91mERROR!\033[0m Cannot read FASTQ files')
    else:
        print(f'\033[90mLoading FASTQ files {fastq_1}...\n'
              f'Mseqs: \033[0m',
              end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1:
                for i, rec1 in enumerate(SeqIO.parse(file1, 'quickfastq')):
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit
                                                 and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        extracted += 1
        except FileNotFoundError:
            raise Exception('\n\033[91mERROR!\033[0m Cannot read FASTQ file')
    print(cyan(f' {i/1e+6:.3g} Mseqs'), green('OK! '))

    def format_filename(fastq: Filename) -> Filename:
        """Auxiliary function to properly format the output filenames.

        Args:
            fastq: Complete filename of the fastq input file

        Returns: Filename of the extracted fastq output file
        """
        fastq_filename, _ = os.path.splitext(fastq)
        output_list: List[str] = [fastq_filename, '_rxtr']
        if including:
            output_list.append('_incl')
            output_list.append('_'.join(including))
        if excluding:
            output_list.append('_excl')
            output_list.append('_'.join(excluding))
        output_list.append('.fastq')
        return Filename(''.join(output_list))

    filename1: Filename = format_filename(fastq_1)
    SeqIO.write(seqs1, filename1, 'quickfastq')
    print(gray('Wrote'), magenta(f'{len(seqs1)}'), gray('reads in'), filename1)
    if fastq_2:
        filename2: Filename = format_filename(fastq_2)
        SeqIO.write(seqs2, filename2, 'quickfastq')
        print(gray('Wrote'), magenta(f'{len(seqs2)}'), gray('reads in'),
              filename2)

    # Timing results
    print(gray('Total elapsed time:'),
          time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
Example #8
def read_output(
    output_file: Filename,
    scoring: Scoring = Scoring.SHEL,
    minscore: Optional[Score] = None,
) -> Tuple[str, SampleStats, Counter[TaxId], Dict[TaxId, Score]]:
    """
    Read Centrifuge output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[TaxId, List[Score]] = {}
    all_length: Dict[TaxId, List[int]] = {}
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    error_read: Optional[int] = None
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for output_line in file:
                try:
                    _, _, _tid, _score, _, _, _length, *_ = output_line.split(
                        '\t')
                except ValueError:
                    print(
                        red('Error'), f'parsing line: ({output_line}) '
                        f'in {output_file}. Ignoring line!')
                    error_read = num_read + 1
                    continue
                tid = TaxId(_tid)
                try:
                    # From Centrifuge score get "single hit equivalent length"
                    shel = Score(float(_score)**0.5 + 15)
                    length = int(_length)
                except ValueError:
                    print(red('Error'), f'parsing score ({_score}) for query',
                          f'length ({_length}) for taxid {_tid}',
                          f'in {output_file}. Ignoring line!')
                    continue
                num_read += 1
                nt_read += length
                if tid == UNCLASSIFIED:  # Just count unclassified reads
                    num_uncl += 1
                    continue
                elif minscore is not None and shel < minscore:
                    continue  # Ignore read if low confidence
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [
                        shel,
                    ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [
                        length,
                    ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if error_read == num_read + 1:  # Check if error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[TaxId] = Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') + f'Cannot read any sequence from"{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    scores=all_scores,
                                    lens=all_length,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs)
    # Output statistics
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f},' + gray(' max = ') +
        f'{stat.sco.maxi:.1f},' + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(f'  {stat.num_taxa}' + gray(' taxa with assigned reads\n'))
    # Select score output
    out_scores: Dict[TaxId, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        scores: Dict[TaxId, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[TaxId, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        raise Exception(f'\n\033[91mERROR!\033[0m Unknown Scoring "{scoring}"')
    # Return
    return output.getvalue(), stat, counts, out_scores
Example #9
def configure_parser():
    """Argument Parser Configuration"""
    parser = argparse.ArgumentParser(
        description='Analyze results of metagenomic taxonomic classifiers',
        epilog=f'%(prog)s  - Release {__version__} - {__date__}' + LICENSE,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-V',
        '--version',
        action='version',
        version=f'%(prog)s version {__version__} released in {__date__}')
    parser_in = parser.add_argument_group(
        'input', 'Define Recentrifuge input files and formats')
    parser_in.add_argument('-n',
                           '--nodespath',
                           action='store',
                           metavar='PATH',
                           default=TAXDUMP_PATH,
                           help=('path for the nodes information files '
                                 '(nodes.dmp and names.dmp from NCBI)'))
    parser_filein = parser_in.add_mutually_exclusive_group(required=True)
    parser_filein.add_argument(
        '-f',
        '--file',
        action='append',
        metavar='FILE',
        type=Filename,
        help=('Centrifuge output files. If a single directory is entered, '
              'every .out file inside will be taken as a different sample.'
              ' Multiple -f is available to include several samples.'))
    parser_filein.add_argument(
        '-l',
        '--lmat',
        action='append',
        metavar='FILE',
        type=Filename,
        default=None,
        help=('LMAT output dir or file prefix. If just "." is entered, '
              'every subdirectory under the current directory will be '
              'taken as a sample and scanned looking for LMAT output files'
              '. Multiple -l is available to include several samples.'))
    parser_filein.add_argument(
        '-k',
        '--clark',
        action='append',
        metavar='FILE',
        type=Filename,
        help=('CLARK(S) output files. If a single directory is entered, '
              'every .csv file inside will be taken as a different sample.'
              ' Multiple -k is available to include several samples.'))
    parser_filein.add_argument(
        '-r',
        '--report',
        action='append',
        metavar='FILE',
        type=Filename,
        help=('Centrifuge/Kraken report files '
              '(multiple -r is available to include several samples)'))
    parser_out = parser.add_argument_group(
        'output', 'Related to the Recentrifuge output files')
    parser_out.add_argument(
        '-o',
        '--outhtml',
        action='store',
        metavar='FILE',
        type=Filename,
        help='HTML output file (if not given, the filename will be '
        'inferred from input files)')
    parser_out.add_argument(
        '-e',
        '--excel',
        action='store',
        metavar='OUTPUT_TYPE',
        choices=[str(excel) for excel in Excel],
        default=str(Excel(0)),
        help=(f'type of excel report to be generated, and can be one of '
              f'{[str(excel) for excel in Excel]}'))
    parser_coarse = parser.add_argument_group(
        'tuning', 'Coarse tuning of algorithm parameters')
    parser_cross = parser_coarse.add_mutually_exclusive_group(
        required=False)
    parser_cross.add_argument(
        '-c',
        '--controls',
        action='store',
        metavar='CONTROLS_NUMBER',
        type=int,
        default=0,
        help=('this number of first samples will be treated as negative '
              'controls; default is no controls'))
    parser_coarse.add_argument(
        '-s',
        '--scoring',
        action='store',
        metavar='SCORING',
        choices=[str(each_score) for each_score in Scoring],
        default=str(Scoring(0)),
        help=(f'type of scoring to be applied, and can be one of '
              f'{[str(scoring) for scoring in Scoring]}'))
    parser_coarse.add_argument(
        '-y',
        '--minscore',
        action='store',
        metavar='NUMBER',
        type=lambda txt: Score(float(txt)),
        default=None,
        help=('minimum score/confidence of the classification of a read '
              'to pass the quality filter; all pass by default'))
    parser_coarse.add_argument(
        '-m',
        '--mintaxa',
        action='store',
        metavar='INT',
        type=int,
        default=DEFMINTAXA,
        help='minimum taxa to avoid collapsing one level to the parent one'
    )
    parser_coarse.add_argument(
        '-x',
        '--exclude',
        action='append',
        metavar='TAXID',
        type=Id,
        default=[],
        help=('NCBI taxid code to exclude a taxon and all underneath '
              '(multiple -x is available to exclude several taxid)'))
    parser_coarse.add_argument(
        '-i',
        '--include',
        action='append',
        metavar='TAXID',
        type=Id,
        default=[],
        help=('NCBI taxid code to include a taxon and all underneath '
              '(multiple -i is available to include several taxid); '
              'by default, all the taxa are considered for inclusion'))
    parser_cross.add_argument('-a',
                              '--avoidcross',
                              action='store_true',
                              help='avoid cross analysis')
    parser_fine = parser.add_argument_group(
        'fine tuning', 'Fine tuning of algorithm parameters')
    parser_fine.add_argument(
        '-z',
        '--ctrlminscore',
        action='store',
        metavar='NUMBER',
        type=lambda txt: Score(float(txt)),
        default=None,
        help=('minimum score/confidence of the classification of a read '
              'in control samples to pass the quality filter; it defaults '
              'to "minscore"'))
    parser_fine.add_argument(
        '-w',
        '--ctrlmintaxa',
        action='store',
        metavar='INT',
        type=int,
        default=None,
        help='minimum taxa to avoid collapsing one level to the parent one'
        ' in control samples; it defaults to "mintaxa"')
    parser_fine.add_argument(
        '-u',
        '--summary',
        action='store',
        metavar='OPTION',
        choices=['add', 'only', 'avoid'],
        default='add',
        help=(
            'select to "add" summary samples to other samples, or to '
            '"only" show summary samples or to "avoid" summaries at all'))
    parser_fine.add_argument(
        '-t',
        '--takeoutroot',
        action='store_true',
        help='remove counts directly assigned to the "root" level')
    parser_fine.add_argument('--nokollapse',
                             action='store_true',
                             help='show the "cellular organisms" taxon')
    parser_mode = parser.add_argument_group('advanced',
                                            'Advanced modes of running')
    parser_mode.add_argument(
        '--dummy',  # hidden flag: just generate a dummy plot for JS debug
        action='store_true',
        help=argparse.SUPPRESS)
    parser_mode.add_argument(
        '-g',
        '--debug',
        action='store_true',
        help='increase output verbosity and perform additional checks')
    parser_mode.add_argument('--sequential',
                             action='store_true',
                             help='deactivate parallel processing')
    return parser
Example #10
def process_output(
        *args,
        **kwargs) -> Tuple[Sample, TaxTree, SampleDataById, SampleStats, Err]:
    """
    Process classifiers output files (to be usually called in parallel!).
    """
    # timing initialization
    start_time: float = time.perf_counter()
    # Recover input and parameters
    target_file: Filename = args[0]
    debug: bool = kwargs['debug']
    is_ctrl: bool = args[1]
    if debug:
        print(gray('Processing'), blue('ctrl' if is_ctrl else 'sample'),
              target_file, gray('...'))
        sys.stdout.flush()
    ontology: Ontology = kwargs['ontology']
    mintaxa: Optional[int] = (kwargs['ctrlmintaxa']
                              if is_ctrl else kwargs['mintaxa'])
    minscore: Score = kwargs['ctrlminscore'] if is_ctrl else kwargs['minscore']
    including: Union[Tuple, Set[Id]] = ontology.including
    excluding: Union[Tuple, Set[Id]] = ontology.excluding
    scoring: Scoring = kwargs['scoring']
    classifier: Classifier = kwargs['classifier']
    genfmt: GenericFormat = kwargs['genfmt']
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(os.path.splitext(target_file)[0])
    error: Err = Err.NO_ERROR
    # Read taxonomic classifier output files to get abundances
    read_method: Callable[  # Format: [[Input], Output]
        [Filename, Scoring, Optional[Score]],
        Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]]
    log: str
    stat: SampleStats
    counts: Counter[Id]
    scores: Dict[Id, Score]
    if classifier is Classifier.GENERIC:  # Direct call to generic method
        log, stat, counts, scores = read_generic_output(
            target_file, scoring, minscore, genfmt)
    else:  # Use read_method
        if classifier is Classifier.KRAKEN:
            read_method = read_kraken_output
        elif classifier is Classifier.CLARK:
            read_method = read_clark_output
        elif classifier is Classifier.LMAT:
            read_method = read_lmat_output
        elif classifier is Classifier.CENTRIFUGE:
            read_method = read_output
        else:
            raise Exception(red('\nERROR! ') +
                            f'taxclass: Unknown classifier "{classifier}".')
        log, stat, counts, scores = read_method(target_file, scoring, minscore)
    output.write(log)
    # Complete/Update fields in stats
    stat.is_ctrl = is_ctrl  # set control nature of the sample
    if mintaxa is not None:  # manual mintaxa has precedence over automatic
        stat.mintaxa = mintaxa
    else:  # update local value with the automatically guessed value
        mintaxa = stat.mintaxa
    # Move cellular_organisms counts to root, in case
    if ontology.collapse and counts[CELLULAR_ORGANISMS]:
        vwrite(gray('Moving'), counts[CELLULAR_ORGANISMS],
               gray('"CELLULAR_ORGANISMS" reads to "ROOT"... \n'))
        if counts[ontology.ROOT]:
            stat.decrease_filtered_taxids()
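            # counts-weighted mean of both scores (same formula as swmean)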
            scores[ontology.ROOT] = Score(
                (scores[CELLULAR_ORGANISMS] * counts[CELLULAR_ORGANISMS] +
                 scores[ontology.ROOT] * counts[ontology.ROOT]) /
                (counts[CELLULAR_ORGANISMS] + counts[ontology.ROOT]))
        else:
            scores[ontology.ROOT] = scores[CELLULAR_ORGANISMS]
        counts[ontology.ROOT] += counts[CELLULAR_ORGANISMS]
        counts[CELLULAR_ORGANISMS] = 0
        scores[CELLULAR_ORGANISMS] = NO_SCORE
    # Remove root counts, in case
    if kwargs['root'] and counts[ontology.ROOT]:
        vwrite(gray('Removing'), counts[ontology.ROOT],
               gray('"ROOT" reads... '))
        stat.seq = stat.seq._replace(filt=stat.seq.filt -
                                     counts[ontology.ROOT])
        stat.decrease_filtered_taxids()
        counts[ontology.ROOT] = 0
        scores[ontology.ROOT] = NO_SCORE
        vwrite(green('OK!'), '\n')

    # Building ontology tree
    output.write(
        gray('Building from raw data with mintaxa = ') + f'{mintaxa:_d}' +
        gray(' ... \n'))
    vwrite(gray('  Building ontology tree with all-in-1... '))
    tree = TaxTree()
    ancestors: Set[Id]
    orphans: Set[Id]
    ancestors, orphans = ontology.get_ancestors(counts.keys())
    out = SampleDataById(['all'])
    tree.allin1(ontology=ontology,
                counts=counts,
                scores=scores,
                ancestors=ancestors,
                min_taxa=mintaxa,
                include=including,
                exclude=excluding,
                out=out)
    out.purge_counters()
    vwrite(green('OK!'), '\n')

    # Stats: Complete final value for TaxIDs after tree building and folding
    final_taxids: int = len(out.counts) if out.counts is not None else 0
    stat.set_final_taxids(final_taxids)

    # Check for additional loss of reads (due to include/exclude and orphans)
    output.write(gray('  Check for more seqs lost ([in/ex]clude affects)... '))
    if out.counts is not None:
        discard: int = sum(counts.values()) - sum(out.counts.values())
        if discard:
            output.write(
                blue('\n  Info:') + f' {discard} ' +
                gray('additional seqs discarded (') +
                f'{discard/sum(counts.values()):.3%} ' +
                gray('of accepted)\n'))
        else:
            output.write(green('OK!\n'))
    else:
        output.write(red('No counts in sample tree!\n'))
    # Warn or give detailed stats about orphan taxid and orphan seqs
    if debug:
        vwrite(gray('  Checking taxid loss (orphans)... '))
        lost: int = 0
        if orphans:
            for orphan in orphans:
                vwrite(yellow('  Warning!'), gray('Orphan taxid'),
                       f'{orphan}\n')
                lost += counts[orphan]
            vwrite(
                yellow('  WARNING!'), f'{len(orphans)} orphan taxids ('
                f'{len(orphans)/len(counts):.2%} of accepted)\n'
                f'    and {lost} orphan sequences ('
                f'{lost/sum(counts.values()):.3%} of accepted)\n')
        else:
            vwrite(green('OK!\n'))
    elif orphans:
        output.write(
            yellow('\n  Warning!') + f' {len(orphans)} orphan taxids' +
            gray(' (rerun with --debug for details)\n'))
    # Check the removal of TaxIDs (accumulation of leaves in parents)
    if debug and not excluding and including == {ontology.ROOT}:
        vwrite(gray('  Assess accumulation due to "folding the tree"...\n'))
        migrated: int = 0
        if out.counts is not None:
            for taxid in counts:
                if out.counts[taxid] == 0:
                    migrated += 1
                    vwrite(
                        blue('  Info:'),
                        gray(f'Folded TaxID {taxid} (') +
                        f'{ontology.get_name(taxid)}' + gray(') with ') +
                        f'{counts[taxid]}' + gray(' original seqs\n'))
        if migrated:
            vwrite(
                blue('  INFO:'), f'{migrated} TaxIDs folded ('
                f'{migrated/len(+counts):.2%} of TAF —TaxIDs after filtering—)'
                '\n')
            vwrite(
                blue('  INFO:'), f'Final assigned TaxIDs: {final_taxids} '
                f'(reduced to {final_taxids/len(+counts):.2%} of '
                'number of TAF)\n')
        else:
            vwrite(blue('  INFO:'), gray('No migration!'), green('OK!\n'))
    # Print last message and check if the sample is void
    if out.counts:
        output.write(sample + blue(' ctrl ' if is_ctrl else ' sample ') +
                     green('OK!\n'))
    elif is_ctrl:
        output.write(sample + red(' ctrl VOID!\n'))
        error = Err.VOID_CTRL
    else:
        output.write(sample + blue(' sample ') + yellow('VOID\n'))
        error = Err.VOID_SAMPLE

    # Timing results
    output.write(
        gray('Load elapsed time: ') +
        f'{time.perf_counter() - start_time:.3g}' + gray(' sec\n'))
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, stat, error
Example #11
def read_kraken_output(
    output_file: Filename,
    scoring: Scoring = Scoring.KRAKEN,
    minscore: Optional[Score] = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read Kraken output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_kmerel: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last parsing error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split('\t')
            if len(header) != 5:
                print(
                    red('\nERROR! ') + 'Kraken output format of',
                    yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'C/U, ID, taxid, length, list of mappings')
                print(magenta('Found:'), '\t'.join(header), end='')
                print(blue('HINT:'), 'Use Kraken or Kraken2 direct output.')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_clas, _label, _tid, _length,
                     _maps) = output_line.split('\t')
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = sum(map(int, _length.split('|')))
                    num_read += 1
                    nt_read += length
                    if _clas == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    tid: Id = Id(_tid)
                    maps: List[str] = _maps.split()
                    try:
                        maps.remove('|:|')
                    except ValueError:
                        pass
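                    # For paired-end reads, Kraken joins both mates' LCA
                    # mapping lists with the token '|:|' (and their lengths
                    # with '|', handled above), so that separator must be
                    # dropped before tallying k-mer hits per taxid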
                    mappings: Counter[Id] = col.Counter()
                    for pair in maps:
                        couple: List[str] = pair.split(':')
                        mappings[Id(couple[0])] += int(couple[1])
                    # From Kraken score get "single hit equivalent length"
                    shel: Score = Score(mappings[tid] + K_MER_SIZE)
                    score: Score = Score(mappings[tid] /
                                         sum(mappings.values()) *
                                         100)  # % relative to all k-mers
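                    # Assumed worked example: with maps '561:13 562:4 A:31'
                    # and tid 562, mappings[tid] == 4, so shel = 4 + k
                    # (k = K_MER_SIZE) and score = 4/(13+4+31)*100 ≈ 8.3,
                    # the percent of this read's k-mers mapped to tid itself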
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None:  # Decide whether to skip low-score reads
                    if scoring is Scoring.KRAKEN:
                        if score < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [
                        shel,
                    ]
                try:
                    all_kmerel[tid].append(score)
                except KeyError:
                    all_kmerel[tid] = [
                        score,
                    ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [
                        length,
                    ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
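    # Note: counts holds, per taxid, the number of reads that survived the
    # minscore filter (the length of its scores list), not raw assignments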
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    lens=all_length,
                                    scores=all_scores,
                                    scores2=all_kmerel,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores SHEL: min = ') + f'{stat.sco.mini:.1f},' +
        gray(' max = ') + f'{stat.sco.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Coverage(%): min = ') + f'{stat.sco2.mini:.1f},' +
        gray(' max = ') + f'{stat.sco2.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco2.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.KRAKEN:
        out_scores = {tid: Score(mean(all_kmerel[tid])) for tid in all_kmerel}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
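        # NORMA: mean SHEL normalized by mean read length, on a percent-like
        # scale; a read whose k-mers all hit its taxid scores close to 100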
    else:
        print(red('ERROR!'), f'kraken: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
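A hypothetical usage sketch for read_kraken_output; the file name, scoring choice, and threshold below are assumptions for illustration, not values taken from the project:

log, stats, counts, scores = read_kraken_output(
    Filename('sample.kraken'),  # hypothetical Kraken2 per-read output file
    scoring=Scoring.SHEL,       # report per-taxid mean SHEL scores
    minscore=Score(35),         # with SHEL scoring, keep reads with shel >= 35
)
print(log, end='')              # the loading log built in the StringIO buffer
tid, reads = counts.most_common(1)[0]
print(f'Top taxid {tid}: {reads} reads, mean score {scores[tid]:.1f}')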
Example #12
def read_generic_output(
    output_file: Filename,
    scoring: Scoring = Scoring.GENERIC,
    minscore: Score = None,
    genfmt: GenericFormat = None
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read an output file from a generic classifier

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification
        genfmt: GenericFormat object specifying the files format

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    # Initialization of variables
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last parsing error
    num_errors: int = 0  # Number of reads discarded due to errors
    output.write(gray(f'Loading output file {output_file}... '))
    # Check format
    if not isinstance(genfmt, GenericFormat):
        raise Exception(
            red('\nERROR! ') +
            'Missing GenericFormat when reading a generic output.')
    try:
        with open(output_file, 'r') as file:
            # Select the column delimiter once, before the main loop
            splitting: str
            if genfmt.typ is GenericType.CSV:
                splitting = ','
            elif genfmt.typ is GenericType.TSV:
                splitting = '\t'
            elif genfmt.typ is GenericType.SSV:
                splitting = ' '
            else:
                raise Exception(f'ERROR! Unknown GenericType {genfmt.typ}')
            # Main loop processing each file line
            for raw_line in file:
                raw_line = raw_line.strip(' \n\t')
                output_line: List[str] = raw_line.split(splitting)
                if len(output_line) < GenericFormat.MIN_COLS:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                              f'{output_file}')
                        continue  # Don't count the header as an error
                    raise Exception(
                        red('\nERROR!') + ' Line ' + yellow(f'{output_line}') +
                        '\n\tin ' + yellow(f'{output_file}') + ' has < ' +
                        blue(f'{GenericFormat.MIN_COLS}') + ' required ' +
                        'columns.\n\tPlease check the file.')
                try:
                    tid: Id = Id(output_line[genfmt.tid - 1].strip(' "'))
                    length: int = int(output_line[genfmt.len - 1].strip(' "'))
                    if tid == genfmt.unc:  # Skip score of unclassified reads
                        num_read += 1
                        nt_read += length
                        num_uncl += 1
                        continue
                    score: Score = Score(
                        float(output_line[genfmt.sco - 1].strip(' "')))
                except ValueError:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                              f'{output_file}')
                        continue  # Don't count the header as a failure
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    if num_read > 100 and num_errors > 0.5 * num_read:
                        print(
                            red('ERROR!'),
                            'Unreliable file processing: rate of problematic'
                            f' reads is {num_errors/num_read:.2%}, beyond'
                            ' 50%, after 100 reads. Please check the format '
                            f'of the file "{output_file}".')
                        raise  # Re-raise the ValueError and abort
                    else:
                        continue
                num_read += 1
                nt_read += length
                taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and score < minscore:
                    continue  # Discard read if low confidence
                try:
                    all_scores[tid].append(score)
                except KeyError:
                    all_scores[tid] = [
                        score,
                    ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [
                        length,
                    ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    lens=all_length,
                                    scores=all_scores,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f},' + gray(' max = ') +
        f'{stat.sco.maxi:.1f},' + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.GENERIC:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        raise Exception(red('\nERROR! ') +
                        f'Generic: Unsupported Scoring "{scoring}"')
    # Return
    return output.getvalue(), stat, counts, out_scores
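To make the 1-based column mapping above concrete, here is a minimal standalone sketch of the field extraction; types.SimpleNamespace stands in for GenericFormat (whose real constructor is not shown here), so all values are illustrative assumptions:

from types import SimpleNamespace

# Stand-in for GenericFormat (assumed): 1-based columns for taxid,
# read length and score, plus the label marking unclassified reads
fmt = SimpleNamespace(tid=2, len=3, sco=4, unc='0')

line = 'read_001,"562",150,38.5'
cols = line.split(',')                            # GenericType.CSV
tid = cols[fmt.tid - 1].strip(' "')               # -> '562'
length = int(cols[fmt.len - 1].strip(' "'))       # -> 150
if tid != fmt.unc:                                # read is classified
    score = float(cols[fmt.sco - 1].strip(' "'))  # -> 38.5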