Example #1
    def read_nodes(self, nodes_file: Filename) -> None:
        """Build dicts of parent and rank for a given taxid (key)"""
        print('\033[90mLoading NCBI nodes...\033[0m', end='')
        sys.stdout.flush()
        try:
            with open(nodes_file, 'r') as file:
                for line in file:
                    _tid, _parent, _rank, *_ = line.split('\t|\t')
                    tid = Id(_tid)
                    parent = Id(_parent)
                    if self.collapse and parent == CELLULAR_ORGANISMS:
                        self.parents[tid] = ROOT
                    else:
                        self.parents[tid] = parent
                    rank: Rank
                    try:
                        rank = Rank[_rank.upper().replace(" ", "_")]
                    except KeyError:
                        raise UnsupportedTaxLevelError(
                            f'Unknown tax level {_rank}')
                    self.ranks[tid] = rank

        except OSError:
            print(red('ERROR!'), f'Cannot read {nodes_file}.')
            print(magenta('TIP:'),
                  'Did you select the right path with the "-n" option?')
            print(magenta('TIP:'),
                  'Did you use "Retaxdump" to install the dump files?')
            raise
        else:
            print('\033[92m OK! \033[0m')
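
For reference, a minimal sketch of the nodes.dmp line format this parser expects (fields are '\t|\t'-separated; the taxids shown are illustrative, taken from the public NCBI taxonomy):

# Only the first three fields (taxid, parent taxid, rank) are used:
line = '9606\t|\t9605\t|\tspecies\t|\t...'
_tid, _parent, _rank, *_ = line.split('\t|\t')
assert (_tid, _parent, _rank) == ('9606', '9605', 'species')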
Example #2
def read_report(report_file: str) -> Tuple[str, Counter[Id], Dict[Id, Rank]]:
    """
    Read Centrifuge/Kraken report file

    Args:
        report_file: report file name

    Returns:
        log string, abundances counter, taxlevel dict

    """
    # TODO: Discontinued method, to be erased in a future release
    output: io.StringIO = io.StringIO(newline='')
    abundances: Counter[Id] = col.Counter()
    level_dic: Dict[Id, Rank] = {}
    output.write(f'\033[90mLoading report file {report_file}...\033[0m')
    try:
        with open(report_file, 'r') as file:
            for report_line in file:
                _, _, taxnum, taxlev, _tid, _ = report_line.split('\t')
                tid = Id(_tid)
                abundances[tid] = int(taxnum)
                level_dic[tid] = Rank.centrifuge(taxlev)
    except KeyboardInterrupt:
        print(gray(' User'), yellow('interrupted!'))
        raise
    except Exception:
        print(red('ERROR!'), 'Cannot read "' + report_file + '"')
        raise
    else:
        output.write('\033[92m OK! \033[0m\n')
    return output.getvalue(), abundances, level_dic
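
The six-column split above matches the usual Centrifuge/Kraken report layout. A sketch with an illustrative line:

# Columns: percentage, clade reads, taxon reads, rank code, taxid, name
report_line = '45.30\t10000\t9000\tS\t9606\tHomo sapiens\n'
_, _, taxnum, taxlev, _tid, _ = report_line.split('\t')
assert (taxnum, taxlev, _tid) == ('9000', 'S', '9606')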
Example #3
def read_report(report_file: str) -> Tuple[str, Counter[Id], Dict[Id, Rank]]:
    """
    Read Centrifuge/Kraken report file

    Args:
        report_file: report file name

    Returns:
        log string, abundances counter, taxlevel dict

    """
    output: io.StringIO = io.StringIO(newline='')
    abundances: Counter[Id] = col.Counter()
    level_dic: Dict[Id, Rank] = {}
    output.write(f'\033[90mLoading report file {report_file}...\033[0m')
    try:
        with open(report_file, 'r') as file:
            for report_line in file:
                _, _, taxnum, taxlev, _tid, _ = report_line.split('\t')
                tid = Id(_tid)
                abundances[tid] = int(taxnum)
                level_dic[tid] = Rank.centrifuge(taxlev)
    except Exception as err:
        raise Exception('\n\033[91mERROR!\033[0m Cannot read "' + report_file
                        + '"') from err
    else:
        output.write('\033[92m OK! \033[0m\n')
    return output.getvalue(), abundances, level_dic
Example #4
 def read_mock_files(mock: Filename) -> Counter[Id]:
     """Read a mock layout (.mck) file"""
     mock_layout: Counter[Id] = col.Counter()
     with open(mock, 'r') as file:
         vprint(gray('\nProcessing'), blue(mock), gray('file:\n'))
         for line in file:
             if line.startswith('#'):
                 continue
             _tid, _num = line.split('\t')
             tid = Id(_tid)
             num = int(_num)
             mock_layout[tid] = num
             vprint(num, gray('\treads for taxid\t'), tid, '\t(',
                    cyan(ncbi.get_name(tid)), ')\n')
     return mock_layout
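
A sketch of the mock layout (.mck) format parsed above: comment lines start with '#', and every other line holds a tab-separated taxid and read count (the values are illustrative):

mck_text = '''# mock layout: taxid <tab> number of reads
9606\t1000
562\t250
'''
mock_layout = col.Counter()
for line in mck_text.splitlines():
    if line.startswith('#'):
        continue
    _tid, _num = line.split('\t')
    mock_layout[Id(_tid)] = int(_num)
# mock_layout -> Counter({Id('9606'): 1000, Id('562'): 250})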
Example #5
 def read_names(self, names_file: Filename) -> None:
     """Build dict with name for a given taxid (key)."""
     print('\033[90mLoading NCBI names...\033[0m', end='')
     sys.stdout.flush()
     try:
         with open(names_file, 'r') as file:
             for line in file:
                 if 'scientific name' in line:
                     tid, scientific_name, *_ = line.split('\t|\t')
                     self.names[Id(tid)] = scientific_name
     except OSError:
         print(red('ERROR!'), f'Cannot read {names_file}.')
         print(magenta('TIP:'),
               'Did you use "Retaxdump" to install the dump files?')
         raise
     else:
         print('\033[92m OK! \033[0m')
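
A sketch of a names.dmp line that passes the 'scientific name' filter above (same '\t|\t' field separator as nodes.dmp; the taxid is illustrative):

line = '9606\t|\tHomo sapiens\t|\t\t|\tscientific name\t|\n'
if 'scientific name' in line:
    tid, scientific_name, *_ = line.split('\t|\t')
    assert (tid, scientific_name) == ('9606', 'Homo sapiens')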
Example #6
 def mock_from_scratch(out: Filename, mock_layout: Counter[Id]) -> None:
     """Generate a mock Centrifuge output file from scratch"""
     with open(out, 'w') as fout:
         vprint(gray('Generating'), blue(out), gray('file... '))
         fout.write('readID\tseqID\ttaxID\tscore\t2ndBestScore\t'
                    'hitLength\tqueryLength\tnumMatches\n')
         reads_written: int = 0
         for numtid in mock_layout:
             tid = Id(numtid)  # Convert the Excel integer to Id
             maxhl: int = random.randint(args.random + 1, MAX_HIT_LENGTH)
             rank: str = str(ncbi.get_rank(tid)).lower()
             for _ in range(int(mock_layout[numtid])):
                 hit_length = random.randint(args.random + 1, maxhl)
                 fout.write(f'test{reads_written}\t{rank}\t'
                            f'{tid}\t{(hit_length-15)**2}\t'
                            f'0\t{hit_length}\t{MAX_HIT_LENGTH}\t1\n')
                 reads_written += 1
         vprint(reads_written, 'reads', green('OK!\n'))
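
The mock score (hit_length - 15)**2 written above is the inverse of the "single hit equivalent length" transform that read_output() applies to Centrifuge scores (shel = score**0.5 + 15), so generated reads round-trip cleanly:

hit_length = 100
score = (hit_length - 15) ** 2  # as written by mock_from_scratch()
shel = score ** 0.5 + 15        # as recovered by read_output()
assert shel == hit_length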
Example #7
 def mock_from_source(out: Filename, mock_layout: Counter[Id]) -> None:
     """Generate a mock Centrifuge output file from source file"""
     with open(out, 'w') as fout, open(args.file) as fcfg:
         vprint(gray('Generating'), blue(out), gray('file... '))
         fout.write(fcfg.readline())  # copy cfg output file header
         reads_written: int = 0
         for line in fcfg:
             tid = Id(line.split('\t')[2])
             if mock_layout[tid]:
                 fout.write(line)
                 mock_layout[tid] -= 1
                 reads_written += 1
                 if not sum(mock_layout.values()):
                     vprint(reads_written, 'reads', green('OK!\n'))
                     break
     if sum(mock_layout.values()):
         print(red('ERROR!\n'))
         print(gray('Incomplete read copy by taxid:'))
         mock_layout = +mock_layout  # Delete zero counts elements
         for tid in mock_layout:
             print(yellow(mock_layout[tid]), gray('reads missing for tid'),
                   tid, '(', cyan(ncbi.get_name(tid)), ')\n')
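
The +mock_layout statement above relies on Counter's unary plus, which keeps only the elements with positive counts; a minimal illustration:

c = col.Counter({'a': 2, 'b': 0, 'c': -1})
assert +c == col.Counter({'a': 2})  # zero and negative counts dropped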
Example #8
def read_output(
    output_file: Filename,
    scoring: Scoring = Scoring.SHEL,
    minscore: Optional[Score] = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read Centrifuge output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last parsing error
    num_errors: int = 0  # Number of reads discarded due to error

    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for output_line in file:
                try:
                    _, _, _tid, _score, _, _, _length, *_ = output_line.split(
                        '\t')
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                tid = Id(_tid)
                try:
                    # From Centrifuge score get "single hit equivalent length"
                    shel = Score(float(_score)**0.5 + 15)
                    length = int(_length)
                except ValueError:
                    print(yellow('Failure'), f'parsing score ({_score}) for ',
                          f'query length {_length} for taxid {_tid}',
                          f'in {output_file}. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                if tid == UNCLASSIFIED:  # Just count unclassified reads
                    num_uncl += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and shel < minscore:
                    continue  # Ignore read if low confidence
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [
                        shel,
                    ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [
                        length,
                    ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    scores=all_scores,
                                    lens=all_length,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f}' + gray(', max = ') +
        f'{stat.sco.maxi:.1f}' + gray(', avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Length: min = ') + f'{stat.len.mini}' + gray(', max = ') +
        f'{stat.len.maxi}' + gray(', avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        print(red('ERROR!'), f'Centrifuge: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
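
A hypothetical call of this reader (the file name and threshold are illustrative only):

log, stats, counts, scores = read_output(
    Filename('sample.out'), scoring=Scoring.SHEL, minscore=Score(60))
print(log)                    # formatted loading log and statistics
print(counts.most_common(3))  # three most abundant taxids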
Example #9
import collections as col
import io
import os
from math import log10
from statistics import mean
from typing import Tuple, Counter, Dict, List, Optional, Set

from Bio import SeqIO

from recentrifuge.config import Filename, Id, Score, Scoring
from recentrifuge.config import gray, red, green, yellow
from recentrifuge.rank import Rank
from recentrifuge.stats import SampleStats

# Centrifuge specific constants
UNCLASSIFIED: Id = Id('0')


def read_report(report_file: str) -> Tuple[str, Counter[Id], Dict[Id, Rank]]:
    """
    Read Centrifuge/Kraken report file

    Args:
        report_file: report file name

    Returns:
        log string, abundances counter, taxlevel dict

    """
    # TODO: Discontinued method, to be erased in a future release
    output: io.StringIO = io.StringIO(newline='')
Example #10
def read_clark_output(
    output_file: Filename,
    scoring: Scoring = Scoring.CLARK_C,
    minscore: Optional[Score] = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read CLARK(-l)(-S) full mode output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_confs: Dict[Id, List[Score]] = {}
    all_gammas: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last parsing error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split(',')
            if len(header) != 8:
                print(
                    red('\nERROR! ') + 'CLARK output format of ',
                    yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'ID,Length,Gamma,1st,score1,2nd,score2,conf')
                print(magenta('Found:'), ','.join(header), end='')
                print(blue('HINT:'), 'Use CLARK, CLARK-l, or CLARK-S '
                      'with full mode (', blue('-m 0'), ')')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_label, _length, _gamma, _tid1, _score1, _tid2, _score2,
                     _conf) = output_line.split(',')
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = int(_length)
                    gamma: Score = Score(float(_gamma))
                    tid1: Id = Id(_tid1)
                    score1: Score = Score(float(_score1))
                    tid2: Id = Id(_tid2)
                    score2: Score = Score(float(_score2))
                    conf: Score = Score(float(_conf))
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                # Select tid and score between CLARK assignments 1 and 2
                tid: Id = tid1
                score: Score = score1
                if tid1 == UNCLASSIFIED:
                    if tid2 == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    else:  # Majority of read unclassified
                        tid = tid2
                        score = score2
                        conf = Score(1 - conf)  # Get CLARK's h2/(h1+h2)
                # From CLARK_C(S) score get "single hit equivalent length"
                shel: Score = Score(score + K_MER_SIZE)
                taxids.add(tid)  # Save all the selected tids (tid1 or tid2)
                if minscore is not None:  # Decide if ignore read if low score
                    if scoring is Scoring.CLARK_C:
                        if conf < minscore:
                            continue
                    elif scoring is Scoring.CLARK_G:
                        if gamma < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [
                        shel,
                    ]
                try:
                    all_confs[tid].append(conf)
                except KeyError:
                    all_confs[tid] = [
                        conf,
                    ]
                try:
                    all_gammas[tid].append(gamma)
                except KeyError:
                    all_gammas[tid] = [
                        gamma,
                    ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [
                        length,
                    ]

    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    lens=all_length,
                                    scores=all_scores,
                                    scores2=all_confs,
                                    scores3=all_gammas,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Hit (score): min = ') + f'{stat.sco.mini:.1f},' +
        gray(' max = ') + f'{stat.sco.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Conf. score: min = ') + f'{stat.sco2.mini:.1f},' +
        gray(' max = ') + f'{stat.sco2.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco2.mean:.1f}\n')
    output.write(
        gray('  Gamma score: min = ') + f'{stat.sco3.mini:.1f},' +
        gray(' max = ') + f'{stat.sco3.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco3.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.CLARK_C:
        out_scores = {
            tid: Score(mean(all_confs[tid]) * 100)
            for tid in all_confs
        }
    elif scoring is Scoring.CLARK_G:
        out_scores = {tid: Score(mean(all_gammas[tid])) for tid in all_gammas}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        print(red('ERROR!'), f'clark: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
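
When CLARK's first assignment is unclassified but the second is not, the code above keeps tid2 and flips the confidence, since CLARK reports conf = h1/(h1+h2); a sketch of that step with illustrative hit counts:

h1, h2 = 10.0, 30.0
conf = h1 / (h1 + h2)  # CLARK's reported confidence: 0.25
conf_tid2 = 1 - conf   # confidence of the second assignment: 0.75
assert conf_tid2 == h2 / (h1 + h2)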
Example #11
"""

import collections as col
import io
import os
from math import log10
from statistics import mean
from typing import Tuple, Counter, Dict, List, Optional, Set

from recentrifuge.config import Filename, Id, Score, Scoring
from recentrifuge.config import gray, red, green, yellow, blue, magenta
from recentrifuge.stats import SampleStats

# CLARK specific constants
UNCLASSIFIED: Id = Id('NA')
K_MER_SIZE: int = 31  # Default k-mer size for CLARK(S)


def read_clark_output(
    output_file: Filename,
    scoring: Scoring = Scoring.CLARK_C,
    minscore: Optional[Score] = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read CLARK(-l)(-S) full mode output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification
Example #12
 def read_plasmids(self, plasmid_file: Filename) -> None:
     """Read, check and include plasmid data"""
     print('\033[90mLoading LMAT plasmids...\033[0m', end='')
     sys.stdout.flush()
     pattern1 = re.compile(
         r"""((?:"([\w\-\.\(\)/+=':,%\*\s]*)"$)|(?:^([\w\-\.\(\)/+=':\*\s]*(?:, (?:strain|isolate|plasmid) [\w\-/\.]*)*(?:, fragment \w*)?(?:, contig \w)?)(?:, a cloning vector)?(?=(?=(?:, (?:complete|partial) (?:plasmid |genomic )*(?:sequence|genome|cds|replicon))(?:\[sequence_id)*)|(?:, complete sequence)*, whole genome shotgun sequence|\[sequence_id)))"""  # pylint: disable=line-too-long
     )
     pattern2 = re.compile(r"""(^(?:[A-Za-z0-9/=\-\.{},]*(?: |.)){1,8})""")
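     # pattern1 targets full GenBank-style sequence titles (quoted, or
     # ending in tails like ', complete sequence'); pattern2 is a looser
     # fallback capturing roughly the first eight tokens of the title.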
     match: Counter = col.Counter()
     try:
         with open(plasmid_file, 'r') as file:
             for line in file:
                 _tid, _parent, *_, last = line.rstrip('\n').split('\t')
                 last = last.split(r'|')[-1]
                 tid = Id(_tid)
                 parent = Id(_parent)
                 # Plasmids sanity checks
                 if tid in self.parents:  # if plasmid tid already in NCBI
                     match['ERR1'] += 1
                     if self.debug:
                         print(f'\033[93mPlasmid taxid ERROR!\033[0m'
                               f' Taxid={tid} already a NCBI taxid. '
                               f'Declared parent is {parent} but '
                               f'NCBI parent is {self.parents[tid]}.')
                         print('\tPlasmid details: ', last)
                     continue
                 elif tid == parent:  # if plasmid and parent tids are equal
                     match['ERR2'] += 1
                     if self.debug:
                         print(f'\033[93mPlasmid parent taxid ERROR!\033[0m'
                               f' Taxid={tid} and parent={parent}.')
                         print('\t\t   Plasmid details: ', last)
                     continue
                 else:  # No problem, go ahead and add the plasmid!
                     self.parents[tid] = parent
                 # Plasmid name extraction by regular expressions
                 name: str
                 try:
                     name = pattern1.search(last).group(1)  # type: ignore
                     name = 'Plasmid ' + name.strip(r'"').strip(',')
                 except AttributeError:
                     try:
                         name = pattern2.search(  # type: ignore
                             last).group(1).strip()
                         name = 'Plasmid ' + name
                     except AttributeError:
                         name = 'Plasmid ' + tid
                         match['FAIL'] += 1
                     else:
                         match['PAT2'] += 1
                 else:
                     match['PAT1'] += 1
                 self.names[tid] = name
     except OSError:
         print('\033[93mWARNING\033[0m: Cannot read "' + plasmid_file +
               '". Plasmid taxids not loaded!')
         print(magenta('TIP:'),
               'Manual installation of the plasmids file required.')
         raise
     else:  # Statistics about plasmids
         print(
             '\033[92m OK! \033[0m\n',
             '\033[90mPlasmid sanity check:\033[0m',
             f'\033[93m rejected\033[0m (taxid error) = {match["ERR1"]}',
             f'\033[93m rejected\033[0m (parent error) = {match["ERR2"]}')
         print('\033[90m Plasmid pattern matching:\033[0m',
               f'\033[90m 1st type =\033[0m {match["PAT1"]} ',
               f'\033[90m 2nd type =\033[0m {match["PAT2"]} ',
               f'\033[90m other =\033[0m {match["FAIL"]}')
Example #13
def read_kraken_output(
    output_file: Filename,
    scoring: Scoring = Scoring.KRAKEN,
    minscore: Optional[Score] = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read Kraken output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_kmerel: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last parsing error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split('\t')
            if len(header) != 5:
                print(
                    red('\nERROR! ') + 'Kraken output format of ',
                    yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'C/U, ID, taxid, length, list of mappings')
                print(magenta('Found:'), '\t'.join(header), end='')
                print(blue('HINT:'), 'Use Kraken or Kraken2 direct output.')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_clas, _label, _tid, _length,
                     _maps) = output_line.split('\t')
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = sum(map(int, _length.split('|')))
                    num_read += 1
                    nt_read += length
                    if _clas == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    tid: Id = Id(_tid)
                    maps: List[str] = _maps.split()
                    try:
                        maps.remove('|:|')
                    except ValueError:
                        pass
                    mappings: Counter[Id] = col.Counter()
                    for pair in maps:
                        couple: List[str] = pair.split(':')
                        mappings[Id(couple[0])] += int(couple[1])
                    # From Kraken score get "single hit equivalent length"
                    shel: Score = Score(mappings[tid] + K_MER_SIZE)
                    score: Score = Score(mappings[tid] /
                                         sum(mappings.values()) *
                                         100)  # % relative to all k-mers
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None:  # Decide if ignore read if low score
                    if scoring is Scoring.KRAKEN:
                        if score < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [
                        shel,
                    ]
                try:
                    all_kmerel[tid].append(score)
                except KeyError:
                    all_kmerel[tid] = [
                        score,
                    ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [
                        length,
                    ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    lens=all_length,
                                    scores=all_scores,
                                    scores2=all_kmerel,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores SHEL: min = ') + f'{stat.sco.mini:.1f},' +
        gray(' max = ') + f'{stat.sco.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Coverage(%): min = ') + f'{stat.sco2.mini:.1f},' +
        gray(' max = ') + f'{stat.sco2.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco2.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.KRAKEN:
        out_scores = {tid: Score(mean(all_kmerel[tid])) for tid in all_kmerel}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        print(red('ERROR!'), f'kraken: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
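
A sketch of a Kraken output line and the k-mer mapping parse used above (read ID and counts are illustrative):

raw_line = 'C\tread_1\t9606\t100\t9606:55 0:15\n'
_clas, _label, _tid, _length, _maps = raw_line.strip().split('\t')
mappings = {}
for pair in _maps.split():
    taxid, cnt = pair.split(':')
    mappings[taxid] = mappings.get(taxid, 0) + int(cnt)
score = mappings[_tid] / sum(mappings.values()) * 100
assert round(score, 1) == 78.6  # % of k-mers supporting taxid 9606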
Example #14
    def __init__(self, frmt: str):
        def print_error(specifier):
            """GenericFormat constructor: print an informative error message"""
            print(red('ERROR!'), 'Generic --format string malformed:',
                  blue(specifier), '\n\tPlease rerun with --help for details.')

        blocks: List[str] = [fld.strip() for fld in frmt.split(',')]
        if len(blocks) < self.MIN_FIELDS:
            print_error(f'Wrong number of fields (expected {self.MIN_FIELDS}, '
                        f'found {len(blocks)}).')
            exit(2)
        try:
            fmt: Dict[str, str] = {
                pair.split(':')[0].strip(): pair.split(':')[1].strip()
                for pair in blocks
            }
        except IndexError:
            print_error('All fields need ":" separator.')
            exit(2)
        # Populate fields
        try:
            typ = fmt['TYP']
        except KeyError:
            print_error('TYPe field is mandatory.')
            exit(2)
        try:
            self.typ: GenericType = GenericType[typ.upper()]
        except KeyError:
            print_error('Unknown file TYPe, valid options are ' +
                        ' or '.join([str(t) for t in GenericType]))
            exit(2)
        try:
            self.tid: int = int(fmt['TID'])
        except KeyError:
            print_error('TaxID field is mandatory.')
            exit(2)
        except ValueError:
            print_error('TaxID field must be an integer column number.')
            exit(2)
        try:
            self.len: int = int(fmt['LEN'])
        except KeyError:
            print_error('LENgth field is mandatory.')
            exit(2)
        except ValueError:
            print_error('LENgth field must be an integer column number.')
            exit(2)
        try:
            self.sco: int = int(fmt['SCO'])
        except KeyError:
            print_error('SCOre field is mandatory.')
            exit(2)
        except ValueError:
            print_error('SCOre field must be an integer column number.')
            exit(2)
        try:
            self.unc: Id = Id(fmt['UNC'])
        except KeyError:
            print_error('UNClassified field is mandatory.')
            exit(2)
        # Check columns are different
        if (self.tid == self.len or self.tid == self.sco
                or self.len == self.sco):
            print_error('Different fields need different columns.')
            exit(2)
        # Check column numbers are positive
        if self.tid < 1 or self.len < 1 or self.sco < 1:
            print_error('Column numbers should be positive integers.')
            exit(2)
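
A format string this constructor accepts, sketched with illustrative column numbers (comma-separated FIELD:VALUE pairs; TYP, TID, LEN, SCO, and UNC are all mandatory):

genfmt = GenericFormat('TYP:TSV, TID:1, LEN:2, SCO:3, UNC:0')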
Example #15
def read_generic_output(
    output_file: Filename,
    scoring: Scoring = Scoring.GENERIC,
    minscore: Optional[Score] = None,
    genfmt: Optional[GenericFormat] = None
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read an output file from a generic classifier

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification
        genfmt: GenericFormat object specifying the files format

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    # Initialization of variables
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last parsing error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    # Check format
    if not isinstance(genfmt, GenericFormat):
        raise Exception(
            red('\nERROR! ') +
            'Missing GenericFormat when reading a generic output.')
    try:
        with open(output_file, 'r') as file:
            # Main loop processing each file line
            for raw_line in file:
                raw_line = raw_line.strip(' \n\t')
                splitting: str
                if genfmt.typ is GenericType.CSV:
                    splitting = ','
                elif genfmt.typ is GenericType.TSV:
                    splitting = '\t'
                elif genfmt.typ is GenericType.SSV:
                    splitting = ' '
                else:
                    raise Exception(f'ERROR! Unknown GenericType {genfmt.typ}')
                output_line: List[str] = raw_line.split(splitting)
                if len(output_line) < GenericFormat.MIN_COLS:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                              f'{output_file}')
                        continue  # Not account for the header as an error
                    raise Exception(
                        red('\nERROR!') + ' Line ' + yellow(f'{output_line}') +
                        '\n\tin ' + yellow(f'{output_file}') + ' has < ' +
                        blue(f'{GenericFormat.MIN_COLS}') + ' required ' +
                        'columns.\n\tPlease check the file.')
                try:
                    tid: Id = Id(output_line[genfmt.tid - 1].strip(' "'))
                    length: int = int(output_line[genfmt.len - 1].strip(' "'))
                    if tid == genfmt.unc:  # Avoid read score for unclass reads
                        num_read += 1
                        nt_read += length
                        num_uncl += 1
                        continue
                    score: Score = Score(
                        float(output_line[genfmt.sco - 1].strip(' "')))
                except ValueError:
                    if num_read == 0 and last_error_read < 0:
                        last_error_read = 0
                        print(yellow('Warning!'), 'Skipping header of '
                              f'{output_file}')
                        continue  # Not account for the header as a failure
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    if num_read > 100 and num_errors > 0.5 * num_read:
                        print(
                            red('ERROR!'),
                            'Unreliable file processing: rate of problematic'
                            f' reads is {num_errors/num_read:.2%}, beyond'
                            ' 50%, after 100 reads. Please check the format '
                            f'of the file "{output_file}".')
                        raise
                    else:
                        continue
                num_read += 1
                nt_read += length
                taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None and score < minscore:
                    continue  # Discard read if low confidence
                try:
                    all_scores[tid].append(score)
                except KeyError:
                    all_scores[tid] = [
                        score,
                    ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [
                        length,
                    ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    lens=all_length,
                                    scores=all_scores,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f},' + gray(' max = ') +
        f'{stat.sco.maxi:.1f},' + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.GENERIC:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        raise Exception(red('\nERROR! ') +
                        f'Generic: Unsupported Scoring "{scoring}"')
    # Return
    return output.getvalue(), stat, counts, out_scores
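
A hypothetical end-to-end use of the two generic pieces together (the file name is illustrative):

genfmt = GenericFormat('TYP:CSV, TID:1, LEN:2, SCO:3, UNC:unclassified')
log, stats, counts, scores = read_generic_output(
    Filename('classifier_output.csv'), minscore=Score(50), genfmt=genfmt)
print(log)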