Example No. 1
    def read_nodes(self, nodes_file: Filename) -> None:
        """Build dicts of parent and rank for a given taxid (key)"""
        print('\033[90mLoading NCBI nodes...\033[0m', end='')
        sys.stdout.flush()
        try:
            with open(nodes_file, 'r') as file:
                for line in file:
                    _tid, _parent, _rank, *_ = line.split('\t|\t')
                    tid = Id(_tid)
                    parent = Id(_parent)
                    if self.collapse and parent == CELLULAR_ORGANISMS:
                        self.parents[tid] = ROOT
                    else:
                        self.parents[tid] = parent
                    rank: Rank
                    try:
                        rank = Rank[_rank.upper().replace(" ", "_")]
                    except KeyError:
                        raise UnsupportedTaxLevelError(
                            f'Unknown tax level {_rank}')
                    self.ranks[tid] = rank

        except OSError:
            print(red('ERROR!'), f'Cannot read {nodes_file}.')
            print(magenta('TIP:'),
                  'Did you select the right path with the "-n" option?')
            print(magenta('TIP:'),
                  'Did you use "Retaxdump" to install the dump files?')
            raise
        else:
            print('\033[92m OK! \033[0m')
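
Note: each record in NCBI's nodes.dmp is a tab-pipe-tab ('\t|\t') separated line whose first three fields are taxid, parent taxid, and rank, which is exactly what the loop above consumes. Below is a minimal, self-contained sketch of that parsing step using plain strings and dicts instead of the project's Id and Rank types; the two sample records are tiny stand-ins for the real dump file.

# Minimal sketch of the nodes.dmp parsing done by read_nodes (illustrative).
def parse_nodes_line(line: str):
    """Split one nodes.dmp record into (taxid, parent taxid, rank)."""
    taxid, parent, rank, *_ = line.split('\t|\t')
    return taxid, parent, rank

parents = {}
ranks = {}
for record in ('2\t|\t131567\t|\tsuperkingdom\t|\t\t|\n',
               '9606\t|\t9605\t|\tspecies\t|\tHS\t|\n'):
    tid, parent, rank = parse_nodes_line(record)
    parents[tid] = parent
    ranks[tid] = rank
assert parents['9606'] == '9605' and ranks['9606'] == 'species'
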
Example No. 2
    def generate_krona():
        """Generate Krona plot with all the results via Krona 2.0 XML spec"""

        print(gray('\nBuilding the taxonomy multiple tree... '), end='')
        sys.stdout.flush()
        krona: KronaTree = KronaTree(
            samples,
            num_raw_samples=len(raw_samples),
            stats=stats,
            min_score=Score(
                min([
                    min(scores[sample].values()) for sample in samples
                    if len(scores[sample])
                ])),
            max_score=Score(
                max([
                    max(scores[sample].values()) for sample in samples
                    if len(scores[sample])
                ])),
            scoring=scoring,
        )
        polytree.grow(ontology=ncbi,
                      abundances=counts,
                      accs=accs,
                      scores=scores)
        print(green('OK!'))
        print(gray('Generating final plot (') + magenta(htmlfile) +
              gray(')... '),
              end='')
        sys.stdout.flush()
        polytree.toxml(ontology=ncbi, krona=krona)
        krona.tohtml(htmlfile, pretty=False)
        print(green('OK!'))
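
The nested min()/max() comprehensions above take the extreme values over every score of every sample while skipping samples without scored taxa. A small equivalent sketch, assuming (as in the call above) that scores maps each sample to a {taxid: score} dict:

# Flatten the per-sample score dicts in a single pass; like the original
# min()/max() calls, this raises ValueError if every sample is empty.
from itertools import chain

def score_range(scores):
    """Return (min, max) over all scores of all non-empty samples."""
    values = list(chain.from_iterable(
        sample_scores.values() for sample_scores in scores.values()
        if sample_scores))
    return min(values), max(values)
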
Example No. 3
 def read_names(self, names_file: Filename) -> None:
     """Build dict with name for a given taxid (key)."""
     print('\033[90mLoading NCBI names...\033[0m', end='')
     sys.stdout.flush()
     try:
         with open(names_file, 'r') as file:
             for line in file:
                 if 'scientific name' in line:
                     tid, scientific_name, *_ = line.split('\t|\t')
                     self.names[Id(tid)] = scientific_name
     except OSError:
         print(red('ERROR!'), f'Cannot read {names_file}.')
         print(magenta('TIP:'),
               'Did you use "Retaxdump" to install the dump files?')
         raise
     else:
         print('\033[92m OK! \033[0m')
Example No. 4
    def mock_fastq(num_reads: int) -> None:
        """Do the job in case of Excel file with all the details"""
        def fastq_seqs(alphabet=single_letter_alphabet):
            """Generator function that creates mock fastq sequences
            """
            for seq in range(num_reads):
                yield SeqRecord(Seq('AGTC', alphabet),
                                id=f'test{seq}',
                                name=f'test{seq}',
                                description=f'test{seq}',
                                annotations={'quality': '@@@@'})

        print(gray('Writing'),
              magenta(f'{num_reads}'),
              gray('reads in'),
              TEST_REXT_FSTQ,
              gray('...'),
              end='',
              flush=True)
        SeqIO.write((sq for sq in fastq_seqs()), TEST_REXT_FSTQ, 'quickfastq')
        print(green(' OK!'))
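
This helper targets an older Biopython: single_letter_alphabet belongs to Bio.Alphabet, which was removed in Biopython 1.78, and 'quickfastq' is a SeqIO format registered by the project itself. Below is a hedged sketch of the same mock-read generation with current Biopython and the standard 'fastq' writer; the output name 'mock.fastq' is illustrative.

# Mock FASTQ generation with Biopython >= 1.78: no alphabets, and per-letter
# Phred qualities instead of an 'annotations' quality string ('@' is Q31).
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def mock_records(num_reads: int):
    """Yield simple 4-base records with constant quality."""
    for i in range(num_reads):
        rec = SeqRecord(Seq('AGTC'), id=f'test{i}', description=f'test{i}')
        rec.letter_annotations['phred_quality'] = [31] * 4
        yield rec

SeqIO.write(mock_records(10), 'mock.fastq', 'fastq')
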
Example No. 5
def read_clark_output(
    output_file: Filename,
    scoring: Scoring = Scoring.CLARK_C,
    minscore: Score = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read CLARK(-l)(-S) full mode output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_confs: Dict[Id, List[Score]] = {}
    all_gammas: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last parsing error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split(',')
            if len(header) != 8:
                print(
                    red('\nERROR! ') + 'CLARK output format of ',
                    yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'ID,Length,Gamma,1st,score1,2nd,score2,conf')
                print(magenta('Found:'), ','.join(header), end='')
                print(blue('HINT:'), 'Use CLARK, CLARK-l, or CLARK-S '
                      'with full mode (', blue('-m 0'), ')')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_label, _length, _gamma, _tid1, _score1, _tid2, _score2,
                     _conf) = output_line.split(',')
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = int(_length)
                    gamma: Score = Score(float(_gamma))
                    tid1: Id = Id(_tid1)
                    score1: Score = Score(float(_score1))
                    tid2: Id = Id(_tid2)
                    score2: Score = Score(float(_score2))
                    conf: Score = Score(float(_conf))
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                num_read += 1
                nt_read += length
                # Select tid and score between CLARK assignments 1 and 2
                tid: Id = tid1
                score: Score = score1
                if tid1 == UNCLASSIFIED:
                    if tid2 == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    else:  # Majority of read unclassified
                        tid = tid2
                        score = score2
                        conf = Score(1 - conf)  # Get CLARK's h2/(h1+h2)
                # From CLARK_C(S) score get "single hit equivalent length"
                shel: Score = Score(score + K_MER_SIZE)
                taxids.add(tid)  # Save all the selected tids (tid1 or tid2)
                if minscore is not None:  # Decide if ignore read if low score
                    if scoring is Scoring.CLARK_C:
                        if conf < minscore:
                            continue
                    elif scoring is Scoring.CLARK_G:
                        if gamma < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [
                        shel,
                    ]
                try:
                    all_confs[tid].append(conf)
                except KeyError:
                    all_confs[tid] = [
                        conf,
                    ]
                try:
                    all_gammas[tid].append(gamma)
                except KeyError:
                    all_gammas[tid] = [
                        gamma,
                    ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [
                        length,
                    ]

    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    lens=all_length,
                                    scores=all_scores,
                                    scores2=all_confs,
                                    scores3=all_gammas,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Hit (score): min = ') + f'{stat.sco.mini:.1f},' +
        gray(' max = ') + f'{stat.sco.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Conf. score: min = ') + f'{stat.sco2.mini:.1f},' +
        gray(' max = ') + f'{stat.sco2.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco2.mean:.1f}\n')
    output.write(
        gray('  Gamma score: min = ') + f'{stat.sco3.mini:.1f},' +
        gray(' max = ') + f'{stat.sco3.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco3.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.CLARK_C:
        out_scores = {
            tid: Score(mean(all_confs[tid]) * 100)
            for tid in all_confs
        }
    elif scoring is Scoring.CLARK_G:
        out_scores = {tid: Score(mean(all_gammas[tid])) for tid in all_gammas}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        print(red('ERROR!'), f'clark: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
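
A hypothetical call of read_clark_output, to show how the returned tuple is meant to be consumed (the file name and threshold are illustrative; Filename, Scoring, and Score are the project types from the signature above):

# Read a CLARK full-mode output, keep reads with confidence >= 0.5, and list
# the five most abundant taxids with their scores (mean confidence x 100
# under Scoring.CLARK_C).
log, stats, abundances, scores = read_clark_output(
    Filename('sample.csv'), scoring=Scoring.CLARK_C, minscore=Score(0.5))
print(log, end='')
for taxid, count in abundances.most_common(5):
    print(taxid, count, f'{scores[taxid]:.1f}')
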
Example No. 6
 def blst2str(lst: List[bool]) -> str:
     """Convert a list of booleans into a nice string"""
     return ('[' + (', '.join(magenta('T') if elm else 'F'
                              for elm in lst)) + ']')
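
Quick usage: True entries are rendered as a magenta 'T' and False entries as a plain 'F', so a crossover vector prints compactly:

print(blst2str([True, False, True]))  # -> [T, F, T]  (with 'T' in ANSI magenta)
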
Example No. 7
        def robust_contamination_removal():
            """Implement robust contamination removal algorithm."""
            nonlocal exclude_sets, shared_crossover

            def compute_qn(data: List[float], dist: str = "Gauss") -> float:
                """Compute Qn robust estimator of scale (Rousseeuw, 1993)"""
                c_d: float  # Select d parameter depending on the distribution
                if dist == "Gauss":
                    c_d = 2.2219
                elif dist == "Cauchy":  # Heavy-tailed distribution
                    c_d = 1.2071
                elif dist == "NegExp":  # Negative exponential (asymetric)
                    c_d = 3.4760
                else:
                    raise Exception(red('\nERROR! ') + 'Unknown distribution')
                num: int = len(data)
                sort_data = sorted(data)
                pairwisedifs: List[float] = []
                for (i, x_val) in enumerate(sort_data):
                    for y_val in sort_data[i + 1:]:
                        pairwisedifs.append(abs(x_val - y_val))
                k: int = int(num * (num / 2 + 1) / 4)
                return c_d * sorted(pairwisedifs)[k - 1]

            exclude_sets = {smpl: set() for smpl in raws[controls:]}
            vwrite(
                gray('Robust contamination removal: '
                     'Searching for contaminants...\n'))
            for tid in exclude_candidates:
                relfreq_ctrl: List[float] = [
                    accs[ctrl][tid] / accs[ctrl][ontology.ROOT]
                    for ctrl in raws[:controls]
                ]
                relfreq_smpl: List[float] = [
                    accs[smpl][tid] / accs[smpl][ontology.ROOT]
                    for smpl in raws[controls:]
                ]
                relfreq: List[float] = relfreq_ctrl + relfreq_smpl
                crossover: List[bool]  # Crossover source (yes/no)
                # Just-controls contamination check
                if all([rf < EPS for rf in relfreq_smpl]):
                    vwrite(cyan('just-ctrl:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    continue  # Go for next candidate
                # Critical contamination check
                if all([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(red('critical:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Severe contamination check
                if any([rf > SEVR_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(yellow('severe: \t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Mild contamination check
                if all([rf > MILD_CONTM_MIN_RELFREQ for rf in relfreq_ctrl]):
                    vwrite(blue('mild cont:\t'), tid, ontology.get_name(tid),
                           gray('relfreq:'),
                           fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                           '\n')
                    for exclude_set in exclude_sets.values():
                        exclude_set.add(tid)
                    continue  # Go for next candidate
                # Calculate median and MAD median but including controls
                mdn: float = statistics.median(relfreq)
                # mad:float=statistics.mean([abs(mdn - rf) for rf in relfreq])
                q_n: float = compute_qn(relfreq, dist="NegExp")
                # Calculate crossover in samples
                outlier_lim: float = mdn + ROBUST_XOVER_OUTLIER * q_n
                ordomag_lim: float = max(
                    relfreq_ctrl) * 10**ROBUST_XOVER_ORD_MAG
                crossover = [
                    rf > outlier_lim and rf > ordomag_lim
                    for rf in relfreq[controls:]
                ]
                # Crossover contamination check
                if any(crossover):
                    vwrite(
                        magenta('crossover:\t'), tid, ontology.get_name(tid),
                        green(f'lims: [{outlier_lim:.1g}]' +
                              ('<' if outlier_lim < ordomag_lim else '>') +
                              f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                        fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl),
                        gray('crossover:'), blst2str(crossover), '\n')
                    # Exclude just for contaminated samples (not the source)
                    vwrite(magenta('\t->'), gray(f'Include {tid} just in:'))
                    for i in range(len(raws[controls:])):
                        if not crossover[i]:
                            exclude_sets[raws[i + controls]].add(tid)
                        else:
                            vwrite(f' {raws[i + controls]}')
                    if all(crossover):  # Shared taxon contaminating control(s)
                        vwrite(' (', yellow('Shared crossover taxon!'), ')')
                        shared_crossover.add(tid)
                    vwrite('\n')
                    continue
                # Other contamination: remove from all samples
                vwrite(
                    gray('other cont:\t'), tid, ontology.get_name(tid),
                    green(f'lims: [{outlier_lim:.1g}]' +
                          ('<' if outlier_lim < ordomag_lim else '>') +
                          f'[{ordomag_lim:.1g}]'), gray('relfreq:'),
                    fltlst2str(relfreq_ctrl) + fltlst2str(relfreq_smpl), '\n')
                for exclude_set in exclude_sets.values():
                    exclude_set.add(tid)
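
The nested compute_qn implements the Qn robust scale estimator of Rousseeuw and Croux (1993), which keeps the crossover limit from being inflated by the very contaminations it is trying to detect. A quick numeric illustration, assuming the helper were lifted out of the closure to module scope (the data values are made up):

# Qn stays near the spread of the well-behaved points, whereas the standard
# deviation is dominated by the single wild value.
import statistics

data = [1.0, 1.1, 0.9, 1.2, 50.0]
print(compute_qn(data, dist="Gauss"))  # ~0.44: scale of the regular points
print(statistics.stdev(data))          # ~21.9: dragged up by the outlier
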
Example No. 8
def main():
    """Main entry point to script."""
    # Argument Parser Configuration
    parser = argparse.ArgumentParser(
        description='Extract reads following Centrifuge/Kraken output',
        epilog=f'%(prog)s  - {__author__} - {__date__}')
    parser.add_argument('-V',
                        '--version',
                        action='version',
                        version=f'%(prog)s release {__version__} ({__date__})')
    parser.add_argument('-f',
                        '--file',
                        action='store',
                        metavar='FILE',
                        required=True,
                        help='Centrifuge output file.')
    parser.add_argument('-l',
                        '--limit',
                        action='store',
                        metavar='NUMBER',
                        type=int,
                        default=None,
                        help=('Limit of FASTQ reads to extract. '
                              'Default: no limit'))
    parser.add_argument(
        '-m',
        '--maxreads',
        action='store',
        metavar='NUMBER',
        type=int,
        default=None,
        help=('Maximum number of FASTQ reads to search for the taxa. '
              'Default: no maximum'))
    parser.add_argument(
        '-n',
        '--nodespath',
        action='store',
        metavar='PATH',
        default=TAXDUMP_PATH,
        help=('path for the nodes information files (nodes.dmp and names.dmp '
              'from NCBI)'))
    parser.add_argument(
        '-i',
        '--include',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to include a taxon and all underneath ' +
              '(multiple -i options may be given to include several taxids). ' +
              'By default, all taxa are considered for inclusion.'))
    parser.add_argument(
        '-x',
        '--exclude',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to exclude a taxon and all underneath ' +
              '(multiple -x options may be given to exclude several taxids)'))
    parser.add_argument(
        '-y',
        '--minscore',
        action='store',
        metavar='NUMBER',
        type=lambda txt: Score(float(txt)),
        default=None,
        help=('minimum score/confidence of the classification of a read '
              'to pass the quality filter; all pass by default'))
    filein = parser.add_mutually_exclusive_group(required=True)
    filein.add_argument('-q',
                        '--fastq',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Single FASTQ file (no paired-ends)')
    filein.add_argument('-1',
                        '--mate1',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Paired-ends FASTQ file for mate 1s '
                        '(filename usually includes _1)')
    parser.add_argument('-2',
                        '--mate2',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Paired-ends FASTQ file for mate 2s '
                        '(filename usually includes _2)')

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} =-= {__date__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    args = parser.parse_args()
    output_file = args.file
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    excluding: Set[TaxId] = set(args.exclude)
    including: Set[TaxId] = set(args.include)
    fastq_1: Filename
    fastq_2: Filename = args.mate2
    if not fastq_2:
        fastq_1 = args.fastq
    else:
        fastq_1 = args.mate1

    # Load NCBI nodes, names and build children
    plasmidfile: Filename = None
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile, False,
                              excluding, including)

    # Build taxonomy tree
    print(gray('Building taxonomy tree...'), end='')
    sys.stdout.flush()
    tree = TaxTree()
    tree.grow(taxonomy=ncbi, look_ancestors=False)
    print(green(' OK!'))

    # Get the taxa
    print(gray('Filtering taxa...'), end='')
    sys.stdout.flush()
    ranks: Ranks = Ranks({})
    tree.get_taxa(ranks=ranks, include=including, exclude=excluding)
    print(green(' OK!'))
    taxids: Set[TaxId] = set(ranks)
    taxlevels: TaxLevels = Rank.ranks_to_taxlevels(ranks)
    num_taxlevels = Counter({rank: len(taxlevels[rank]) for rank in taxlevels})
    num_taxlevels = +num_taxlevels  # Unary plus drops ranks with zero count

    # Statistics about including taxa
    print(f'  {len(taxids)}\033[90m taxid selected in \033[0m', end='')
    print(f'{len(num_taxlevels)}\033[90m different taxonomical levels:\033[0m')
    for rank in num_taxlevels:
        print(f'  Number of different {rank}: {num_taxlevels[rank]}')
    assert taxids, red('ERROR! No taxids to search for!')

    # Get the records
    records: List[SeqRecord] = []
    num_seqs: int = 0
    # timing initialization
    start_time_load: float = time.perf_counter()
    print(gray(f'Loading output file {output_file}...'), end='')
    sys.stdout.flush()
    try:
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            for num_seqs, record in enumerate(SeqIO.parse(file, 'centrifuge')):
                tid: TaxId = record.annotations['taxID']
                if tid not in taxids:
                    continue  # Skip reads whose taxid was not selected
                score: Score = Score(record.annotations['score'])
                if args.minscore is not None and score < args.minscore:
                    continue
                records.append(record)
    except FileNotFoundError:
        raise Exception(red('ERROR!') + 'Cannot read "' + output_file + '"')
    print(green(' OK!'))

    # Basic records statistics
    print(
        gray('  Load elapsed time: ') +
        f'{time.perf_counter() - start_time_load:.3g}' + gray(' sec'))
    print(f'  \033[90mMatching reads: \033[0m{len(records):_d} \033[90m\t'
          f'(\033[0m{len(records)/num_seqs:.4%}\033[90m of sample)')
    sys.stdout.flush()

    # FASTQ sequence dealing
    # records_ids: List[SeqRecord] = [record.id for record in records]
    records_ids: Set[SeqRecord] = {record.id for record in records}
    seqs1: List[SeqRecord] = []
    seqs2: List[SeqRecord] = []
    extracted: int = 0
    i: int = 0
    if fastq_2:
        print(
            f'\033[90mLoading FASTQ files {fastq_1} and {fastq_2}...\n'
            f'Mseqs: \033[0m',
            end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1, open(fastq_2, 'r') as file2:
                for i, (rec1, rec2) in enumerate(
                        zip(SeqIO.parse(file1, 'quickfastq'),
                            SeqIO.parse(file2, 'quickfastq'))):
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit
                                                 and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        seqs2.append(rec2)
                        extracted += 1

        except FileNotFoundError:
            raise Exception('\n\033[91mERROR!\033[0m Cannot read FASTQ files')
    else:
        print(f'\033[90mLoading FASTQ files {fastq_1}...\n'
              f'Mseqs: \033[0m',
              end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1:
                for i, rec1 in enumerate(SeqIO.parse(file1, 'quickfastq')):
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit
                                                 and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        extracted += 1
        except FileNotFoundError:
            raise Exception('\n\033[91mERROR!\033[0m Cannot read FASTQ file')
    print(cyan(f' {i/1e+6:.3g} Mseqs'), green('OK! '))

    def format_filename(fastq: Filename) -> Filename:
        """Auxiliary function to properly format the output filenames.

        Args:
            fastq: Complete filename of the fastq input file

        Returns: Filename of the rextracted fastq output file
        """
        fastq_filename, _ = os.path.splitext(fastq)
        output_list: List[str] = [fastq_filename, '_rxtr']
        if including:
            output_list.append('_incl')
            output_list.extend('_'.join(including))
        if excluding:
            output_list.append('_excl')
            output_list.extend('_'.join(excluding))
        output_list.append('.fastq')
        return Filename(''.join(output_list))

    filename1: Filename = format_filename(fastq_1)
    SeqIO.write(seqs1, filename1, 'quickfastq')
    print(gray('Wrote'), magenta(f'{len(seqs1)}'), gray('reads in'), filename1)
    if fastq_2:
        filename2: Filename = format_filename(fastq_2)
        SeqIO.write(seqs2, filename2, 'quickfastq')
        print(gray('Wrote'), magenta(f'{len(seqs2)}'), gray('reads in'),
              filename2)

    # Timing results
    print(gray('Total elapsed time:'),
          time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
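
The core of the extraction above is simple: keep the FASTQ records whose id is still in the wanted set, and shrink that set as matches are found so the scan can stop early. A minimal standalone sketch of that idea with plain Biopython and the standard 'fastq' format (file names and read ids are illustrative; the real script also honors --limit/--maxreads and paired files):

# Extract the wanted reads from a FASTQ file, stopping once all are found.
from Bio import SeqIO

wanted = {'read_000001', 'read_000042'}
kept = []
for rec in SeqIO.parse('sample_1.fastq', 'fastq'):
    if not wanted:
        break                      # nothing left to look for
    if rec.id in wanted:
        wanted.discard(rec.id)
        kept.append(rec)
SeqIO.write(kept, 'sample_1_rxtr.fastq', 'fastq')
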
Example No. 9
 def read_plasmids(self, plasmid_file: Filename) -> None:
     """Read, check and include plasmid data"""
     print('\033[90mLoading LMAT plasmids...\033[0m', end='')
     sys.stdout.flush()
     pattern1 = re.compile(
         r"""((?:"([\w\-\.\(\)/+=':,%\*\s]*)"$)|(?:^([\w\-\.\(\)/+=':\*\s]*(?:, (?:strain|isolate|plasmid) [\w\-/\.]*)*(?:, fragment \w*)?(?:, contig \w)?)(?:, a cloning vector)?(?=(?=(?:, (?:complete|partial) (?:plasmid |genomic )*(?:sequence|genome|cds|replicon))(?:\[sequence_id)*)|(?:, complete sequence)*, whole genome shotgun sequence|\[sequence_id)))"""  # pylint: disable=line-too-long
     )
     pattern2 = re.compile(r"""(^(?:[A-Za-z0-9/=\-\.{},]*(?: |.)){1,8})""")
     match: Counter = col.Counter()
     try:
         with open(plasmid_file, 'r') as file:
             for line in file:
                 _tid, _parent, *_, last = line.rstrip('\n').split('\t')
                 last = last.split(r'|')[-1]
                 tid = Id(_tid)
                 parent = Id(_parent)
                 # Plasmids sanity checks
                 if tid in self.parents:  # if plasmid tid already in NCBI
                     match['ERR1'] += 1
                     if self.debug:
                         print(f'\033[93mPlasmid taxid ERROR!\033[0m'
                               f' Taxid={tid} already an NCBI taxid. '
                               f'Declared parent is {parent} but '
                               f'NCBI parent is {self.parents[tid]}.')
                         print('\tPlasmid details: ', last)
                     continue
                 elif tid == parent:  # if plasmid and parent tids are equal
                     match['ERR2'] += 1
                     if self.debug:
                         print(f'\033[93mPlasmid parent taxid ERROR!\033[0m'
                               f' Taxid={tid} and parent={parent}.')
                         print('\t\t   Plasmid details: ', last)
                     continue
                 else:  # No problem, go ahead and add the plasmid!
                     self.parents[tid] = parent
                 # Plasmid name extraction by regular expressions
                 name: str
                 try:
                     name = pattern1.search(last).group(1)  # type: ignore
                     name = 'Plasmid ' + name.strip(r'"').strip(',')
                 except AttributeError:
                     try:
                         name = pattern2.search(  # type: ignore
                             last).group(1).strip()
                         name = 'Plasmid ' + name
                     except AttributeError:
                         name = 'Plasmid ' + tid
                         match['FAIL'] += 1
                     else:
                         match['PAT2'] += 1
                 else:
                     match['PAT1'] += 1
                 self.names[tid] = name
     except OSError:
         print('\033[93mWARNING\033[0m: Cannot read "' + plasmid_file +
               '". Plasmid taxids not loaded!')
         print(magenta('TIP:'),
               'Manual installation of the plasmids file required.')
         raise
     else:  # Statistics about plasmids
         print(
             '\033[92m OK! \033[0m\n',
             '\033[90mPlasmid sanity check:\033[0m',
             f'\033[93m rejected\033[0m (taxid error) = {match["ERR1"]}',
             f'\033[93m rejected\033[0m (parent error) = {match["ERR2"]}')
         print('\033[90m Plasmid pattern matching:\033[0m',
               f'\033[90m 1st type =\033[0m {match["PAT1"]} ',
               f'\033[90m 2nd type =\033[0m {match["PAT2"]} ',
               f'\033[90m other =\033[0m {match["FAIL"]}')
Example No. 10
    def generate_excel():
        """Generate Excel with results via pandas DataFrame"""

        xlsx_name: Filename = Filename(htmlfile.split('.html')[0] + '.xlsx')
        print(gray(f'Generating Excel {str(excel).lower()} summary (') +
              magenta(xlsx_name) + gray(')... '),
              end='')
        sys.stdout.flush()
        xlsxwriter = pd.ExcelWriter(xlsx_name)
        list_rows: List = []

        # Save raw samples basic statistics
        data_frame: pd.DataFrame = pd.DataFrame.from_dict(
            {raw: stats[raw].to_dict()
             for raw in raw_samples})
        data_frame.to_excel(xlsxwriter, sheet_name='_sample_stats')

        # Save taxid related statistics per sample
        if excel is Excel.FULL:
            polytree.to_items(ontology=ncbi, items=list_rows)
            # Generate the pandas DataFrame from items and export to Excel
            iterable_1 = [samples, [COUNT, UNASSIGNED, SCORE]]
            cols1 = pd.MultiIndex.from_product(iterable_1,
                                               names=['Samples', 'Stats'])
            iterable_2 = [['Details'], ['Rank', 'Name']]
            cols2 = pd.MultiIndex.from_product(iterable_2)
            cols = cols1.append(cols2)
            data_frame = pd.DataFrame.from_items(list_rows,
                                                 orient='index',
                                                 columns=cols)
            data_frame.index.names = ['Id']
            data_frame.to_excel(xlsxwriter, sheet_name=str(excel))
        elif excel is Excel.CMPLXCRUNCHER:
            target_ranks: List = [Rank.NO_RANK]
            if args.controls:  # if controls, add specific sheet for rank
                target_ranks.extend(Rank.selected_ranks)
            for rank in target_ranks:  # Once for no rank dependency (NO_RANK)
                indexes: List[int]
                sheet_name: str
                columns: List[str]
                if args.controls:
                    indexes = [
                        i for i in range(len(raw_samples), len(samples))
                        # Check if sample ends in _(STR_CONTROL)_(rank)
                        if (STR_CONTROL in samples[i].split('_')[-2:] and
                            rank.name.lower() in samples[i].split('_')[-1:])
                    ]
                    sheet_name = f'{STR_CONTROL}_{rank.name.lower()}'
                    columns = [
                        samples[i].replace(
                            '_' + STR_CONTROL + '_' + rank.name.lower(), '')
                        for i in indexes
                    ]
                if rank is Rank.NO_RANK:  # No rank dependency
                    indexes = list(range(len(raw_samples)))
                    sheet_name = f'raw_samples_{rank.name.lower()}'
                    columns = raw_samples
                list_rows = []
                polytree.to_items(ontology=ncbi,
                                  items=list_rows,
                                  sample_indexes=indexes)
                data_frame = pd.DataFrame.from_items(list_rows,
                                                     orient='index',
                                                     columns=columns)
                data_frame.index.names = ['Id']
                data_frame.to_excel(xlsxwriter, sheet_name=sheet_name)
        else:
            raise Exception(red('\nERROR!'), f'Unknown Excel option "{excel}"')
        xlsxwriter.save()
        print(green('OK!'))
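
Note that pd.DataFrame.from_items was removed in pandas 1.0 and ExcelWriter.save() in pandas 2.0, so this snippet requires an older pandas. A hedged sketch of an equivalent export with current pandas, assuming list_rows is the same list of (taxid, row values) pairs filled by polytree.to_items() and cols the MultiIndex built above ('results.xlsx' and the sheet name are illustrative):

# Current-pandas equivalent of the from_items()/save() calls used above.
import pandas as pd

def items_to_sheet(writer, list_rows, cols, sheet_name):
    """Write (key, row) pairs as one index-oriented sheet of an open writer."""
    frame = pd.DataFrame.from_dict(dict(list_rows), orient='index',
                                   columns=cols)
    frame.index.names = ['Id']
    frame.to_excel(writer, sheet_name=sheet_name)

with pd.ExcelWriter('results.xlsx') as writer:  # saved and closed on exit
    items_to_sheet(writer, list_rows, cols, 'FULL')
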
Example No. 11
def read_kraken_output(
    output_file: Filename,
    scoring: Scoring = Scoring.KRAKEN,
    minscore: Score = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read Kraken output file

    Args:
        output_file: output file name
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, statistics, abundances counter, scores dict

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_kmerel: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    taxids: Set[Id] = set()
    num_read: int = 0
    nt_read: int = 0
    num_uncl: int = 0
    last_error_read: int = -1  # Read number of the last parsing error
    num_errors: int = 0  # Number of reads discarded due to error
    output.write(gray(f'Loading output file {output_file}... '))
    try:
        with open(output_file, 'r') as file:
            # Check number of cols in header
            header = file.readline().split('\t')
            if len(header) != 5:
                print(
                    red('\nERROR! ') + 'Kraken output format of ',
                    yellow(f'"{output_file}"'), 'not supported.')
                print(magenta('Expected:'),
                      'C/U, ID, taxid, length, list of mappings')
                print(magenta('Found:'), '\t'.join(header), end='')
                print(blue('HINT:'), 'Use Kraken or Kraken2 direct output.')
                raise Exception('Unsupported file format. Aborting.')
            for raw_line in file:
                try:
                    output_line = raw_line.strip()
                    (_clas, _label, _tid, _length,
                     _maps) = output_line.split('\t')
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                try:
                    length: int = sum(map(int, _length.split('|')))
                    num_read += 1
                    nt_read += length
                    if _clas == UNCLASSIFIED:  # Just count unclassified reads
                        num_uncl += 1
                        continue
                    tid: Id = Id(_tid)
                    maps: List[str] = _maps.split()
                    try:
                        maps.remove('|:|')
                    except ValueError:
                        pass
                    mappings: Counter[Id] = col.Counter()
                    for pair in maps:
                        couple: List[str] = pair.split(':')
                        mappings[Id(couple[0])] += int(couple[1])
                    # From Kraken score get "single hit equivalent length"
                    shel: Score = Score(mappings[tid] + K_MER_SIZE)
                    score: Score = Score(mappings[tid] /
                                         sum(mappings.values()) *
                                         100)  # % relative to all k-mers
                except ValueError:
                    print(
                        yellow('Failure'), 'parsing line elements:'
                        f' {output_line} in {output_file}'
                        '. Ignoring line!')
                    last_error_read = num_read + 1
                    num_errors += 1
                    continue
                else:
                    taxids.add(tid)  # Save all the tids of classified reads
                if minscore is not None:  # Decide if ignore read if low score
                    if scoring is Scoring.KRAKEN:
                        if score < minscore:
                            continue
                    else:
                        if shel < minscore:
                            continue
                try:
                    all_scores[tid].append(shel)
                except KeyError:
                    all_scores[tid] = [
                        shel,
                    ]
                try:
                    all_kmerel[tid].append(score)
                except KeyError:
                    all_kmerel[tid] = [
                        score,
                    ]
                try:
                    all_length[tid].append(length)
                except KeyError:
                    all_length[tid] = [
                        length,
                    ]
    except FileNotFoundError:
        raise Exception(red('\nERROR! ') + f'Cannot read "{output_file}"')
    if last_error_read == num_read + 1:  # Check error in last line: truncated!
        print(yellow('Warning!'), f'{output_file} seems truncated!')
    counts: Counter[Id] = col.Counter(
        {tid: len(all_scores[tid])
         for tid in all_scores})
    output.write(green('OK!\n'))
    if num_read == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    # Get statistics
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    lens=all_length,
                                    scores=all_scores,
                                    scores2=all_kmerel,
                                    seq_read=num_read,
                                    seq_unclas=num_uncl,
                                    seq_filt=filt_seqs,
                                    tid_clas=len(taxids))
    # Output statistics
    if num_errors:
        output.write(
            gray('  Seqs fail: ') + red(f'{num_errors:_d}\t') +
            gray('(Last error in read ') + red(f'{last_error_read}') +
            gray(')\n'))
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    output.write(
        gray('  Scores SHEL: min = ') + f'{stat.sco.mini:.1f},' +
        gray(' max = ') + f'{stat.sco.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Coverage(%): min = ') + f'{stat.sco2.mini:.1f},' +
        gray(' max = ') + f'{stat.sco2.maxi:.1f},' + gray(' avr = ') +
        f'{stat.sco2.mean:.1f}\n')
    output.write(
        gray('  Read length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(
        gray('  TaxIds: by classifier = ') + f'{stat.tid.clas}' +
        gray(', by filter = ') + f'{stat.tid.filt}\n')
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.SHEL:
        out_scores = {tid: Score(mean(all_scores[tid])) for tid in all_scores}
    elif scoring is Scoring.KRAKEN:
        out_scores = {tid: Score(mean(all_kmerel[tid])) for tid in all_kmerel}
    elif scoring is Scoring.LENGTH:
        out_scores = {tid: Score(mean(all_length[tid])) for tid in all_length}
    elif scoring is Scoring.LOGLENGTH:
        out_scores = {
            tid: Score(log10(mean(all_length[tid])))
            for tid in all_length
        }
    elif scoring is Scoring.NORMA:
        scores: Dict[Id, Score] = {
            tid: Score(mean(all_scores[tid]))
            for tid in all_scores
        }
        lengths: Dict[Id, Score] = {
            tid: Score(mean(all_length[tid]))
            for tid in all_length
        }
        out_scores = {
            tid: Score(scores[tid] / lengths[tid] * 100)
            for tid in scores
        }
    else:
        print(red('ERROR!'), f'kraken: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    # Return
    return output.getvalue(), stat, counts, out_scores
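
For one classified line, the k-mer mapping field (fifth column) yields both per-read scores: the "single hit equivalent length" (k-mers assigned to the read's taxid plus the k-mer size) and the percentage of all counted k-mers assigned to that taxid. A worked standalone sketch with a made-up mapping string and an illustrative k-mer size standing in for the project's K_MER_SIZE constant:

# Turn a Kraken-style mapping field into the SHEL and percent scores.
import collections as col

K = 31                                  # illustrative k-mer size
maps = '9606:45 0:10 |:| 9606:30'.split()
maps.remove('|:|')                      # paired-end separator, when present
mappings = col.Counter()
for pair in maps:
    taxid, hits = pair.split(':')
    mappings[taxid] += int(hits)
shel = mappings['9606'] + K                            # 75 + 31 = 106
pct = mappings['9606'] / sum(mappings.values()) * 100  # 75 / 85 = 88.2%
print(shel, round(pct, 1))                             # -> 106 88.2
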
Example No. 12
    def generate_excel():
        """Generate Excel with results via pandas DataFrame"""

        xlsx_name: Filename = Filename(htmlfile.split('.html')[0] + '.xlsx')
        print(gray(f'Generating Excel {str(excel).lower()} summary (') +
              magenta(xlsx_name) + gray(')... '),
              end='')
        sys.stdout.flush()
        xlsxwriter = pd.ExcelWriter(xlsx_name)
        list_rows: List = []

        # Save raw samples basic statistics
        data_frame: pd.DataFrame = pd.DataFrame.from_dict(
            {raw: stats[raw].to_dict()
             for raw in raw_samples})
        data_frame.to_excel(xlsxwriter, sheet_name='_sample_stats')

        # Save taxid related statistics per sample
        if excel is Excel.FULL:
            polytree.to_items(taxonomy=ncbi, items=list_rows)
            # Generate the pandas DataFrame from items and export to Excel
            iterable_1 = [samples, [COUNT, UNASSIGNED, SCORE]]
            cols1 = pd.MultiIndex.from_product(iterable_1,
                                               names=['Samples', 'Stats'])
            iterable_2 = [['Details'], ['Rank', 'Name']]
            cols2 = pd.MultiIndex.from_product(iterable_2)
            cols = cols1.append(cols2)
            data_frame = pd.DataFrame.from_items(list_rows,
                                                 orient='index',
                                                 columns=cols)
            data_frame.index.names = ['TaxId']
            data_frame.to_excel(xlsxwriter, sheet_name=str(excel))
        elif excel is Excel.CMPLXCRUNCHER:
            target_ranks: List = [Rank.NO_RANK]
            if args.controls:
                target_ranks = [
                    Rank.SPECIES,
                    Rank.GENUS,  # Ranks of interest
                    Rank.FAMILY,
                    Rank.ORDER
                ]  # for cmplxcruncher
            for rank in target_ranks:  # Once for no rank dependency (NO_RANK)
                indexes: List[int]
                sheet_name: str
                columns: List[str]
                if args.controls:
                    indexes = [
                        i for i in range(len(raw_samples), len(samples))
                        if (samples[i].startswith(STR_CONTROL)
                            and rank.name.lower() in samples[i])
                    ]
                    sheet_name = f'{STR_CONTROL}_{rank.name.lower()}'
                    columns = [samples[i].split('_')[2] for i in indexes]
                else:  # No rank dependency
                    indexes = list(range(len(raw_samples)))
                    sheet_name = f'raw_samples_{rank.name.lower()}'
                    columns = [samples[i].split('_')[0] for i in indexes]
                list_rows = []
                polytree.to_items(taxonomy=ncbi,
                                  items=list_rows,
                                  sample_indexes=indexes)
                data_frame = pd.DataFrame.from_items(list_rows,
                                                     orient='index',
                                                     columns=columns)
                data_frame.index.names = ['TaxId']
                data_frame.to_excel(xlsxwriter, sheet_name=sheet_name)
        else:
            raise Exception(red('\nERROR!'), f'Unknown Excel option "{excel}"')
        xlsxwriter.save()
        print(green('OK!'))