コード例 #1
0
ファイル: clark.py プロジェクト: pythseq/recentrifuge
def select_clark_inputs(clarks: List[Filename], ext: str = '.csv') -> None:
    """Expand a directory into the CLARK, CLARK-l or CLARK-S files inside it.

    The first element of ``clarks`` is taken as the directory to scan. The
    list is emptied and then refilled, in place, with the sorted names of
    the non-hidden directory entries ending with ``ext``.

    Args:
        clarks: list whose first item is the directory name; overwritten
            in place with the files found.
        ext: suffix that a file name must have to be selected.
    """
    folder = clarks[0]
    clarks.clear()
    scanning_cwd = folder == '.'
    with os.scandir(folder) as entries:
        for entry in entries:
            fname = entry.name
            if fname.startswith('.') or not fname.endswith(ext):
                continue  # Hidden entry or wrong extension
            if scanning_cwd:
                # Bare name, so sample names do not start with just the dot
                clarks.append(Filename(fname))
            else:
                clarks.append(Filename(os.path.join(folder, fname)))
    clarks.sort()
    print(gray(f'CLARK {ext} files to analyze:'), clarks)
コード例 #2
0
ファイル: kraken.py プロジェクト: oatesa/recentrifuge
def select_kraken_inputs(krakens: List[Filename], ext: str = '.krk') -> None:
    """Expand a directory into the Kraken files found inside it.

    The first element of ``krakens`` is used as the directory to scan; the
    list is then cleared and refilled in place with the sorted, non-hidden
    entries whose names end with ``ext``.

    Args:
        krakens: list whose first item is the directory name; overwritten
            in place with the files found.
        ext: suffix that a file name must have to be selected.
    """
    source_dir = krakens[0]
    krakens.clear()
    with os.scandir(source_dir) as listing:
        names = [
            item.name for item in listing
            if not item.name.startswith('.') and item.name.endswith(ext)
        ]
    if source_dir != '.':
        krakens.extend(
            Filename(os.path.join(source_dir, name)) for name in names)
    else:  # Avoid sample names starting with just the dot
        krakens.extend(Filename(name) for name in names)
    krakens.sort()
    print(gray(f'Kraken {ext} files to analyze:'), krakens)
コード例 #3
0
ファイル: centrifuge.py プロジェクト: pythseq/recentrifuge
def select_centrifuge_inputs(outputs: List[Filename],
                             ext: str = '.out') -> None:
    """Expand a directory into the files with extension ``ext`` inside it.

    ``outputs[0]`` is read as the directory to scan. The list is cleared
    and refilled in place with the sorted names of the non-hidden entries
    that end with ``ext``.

    Args:
        outputs: list whose first item is the directory name; overwritten
            in place with the files found.
        ext: suffix that a file name must have to be selected.
    """
    base = outputs[0]
    outputs.clear()
    with os.scandir(base) as scan:
        for item in scan:
            hidden = item.name.startswith('.')
            if hidden or not item.name.endswith(ext):
                continue
            # In the current dir keep the bare name, so that sample names
            #  do not start with just the dot
            outputs.append(Filename(
                item.name if base == '.' else os.path.join(base, item.name)))
    outputs.sort()
    print(gray(f'Centrifuge {ext} files to analyze:'), outputs)
コード例 #4
0
    def select_inputs():
        """Choose right classifier, input and output files"""
        nonlocal process, scoring, input_files, plasmidfile, classifier

        if reports:
            classifier = Classifier.KRAKEN
            process = process_report
            input_files = reports
        elif clarks:
            classifier = Classifier.CLARK
            process = process_output
            input_files = clarks
            if len(clarks) == 1 and os.path.isdir(clarks[0]):
                select_clark_inputs(clarks)
        elif lmats:
            classifier = Classifier.LMAT
            scoring = Scoring.LMAT
            process = process_output
            input_files = lmats
            plasmidfile = Filename(os.path.join(args.nodespath, PLASMID_FILE))
            select_lmat_inputs(lmats)
        elif outputs:
            classifier = Classifier.CENTRIFUGE
            process = process_output
            input_files = outputs
            if len(outputs) == 1 and os.path.isdir(outputs[0]):
                select_centrifuge_inputs(outputs)
コード例 #5
0
ファイル: core.py プロジェクト: oatesa/recentrifuge
def krona_from_text(
        samples: List[Sample],
        outputs: Dict[Rank, Filename],
        htmlfile: Filename = Filename('Output' + HTML_SUFFIX),
):
    """Build the Krona html file by running the ktImportText program.

    Superseded by krona.krona_from_xml().

    Args:
        samples: first arguments passed to ktImportText.
        outputs: per-rank values; the items of the value stored for each
            rank in Rank.selected_ranks are added as further arguments.
        htmlfile: name given to ktImportText with the -o option.

    """
    command = ["ktImportText", *samples]
    try:
        per_rank = [
            item
            for level in list(Rank.selected_ranks)
            for item in outputs[level]
        ]
    except KeyError:
        # A selected rank is missing in outputs: add no per-rank argument
        per_rank = []
    command += per_rank
    command += ["-o", htmlfile]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError:
        print('\n\033[91mERROR!\033[0m ktImportText: '
              'returned a non-zero exit status (Krona plot built failed)')
コード例 #6
0
 def select_html_file():
     """HTML filename selection"""
     nonlocal htmlfile
     if lmats:  # Select case for dir name or filename prefix
         if os.path.isdir(lmats[0]):  # Dir name
             dirname = os.path.dirname(os.path.normpath(lmats[0]))
             if not dirname or dirname == '.':
                 basename = 'output'
             else:
                 basename = os.path.basename(dirname)
         else:  # Explicit path and file name prefix is provided
             dirname, basename = os.path.split(lmats[0])
         htmlfile = Filename(os.path.join(dirname, basename + HTML_SUFFIX))
     elif reports:
         htmlfile = Filename(reports[0].split('_mhl')[0] + HTML_SUFFIX)
     else:
         htmlfile = Filename(outputs[0].split('_mhl')[0] + HTML_SUFFIX)
コード例 #7
0
 def by_excel_file(dirname: Filename = None) -> None:
     """Generate one mock file per sample column of the Excel file.

     Args:
         dirname: directory where the mock files are written; it is
             created if needed. By default, the directory part of the
             Excel file name.
     """
     if dirname is None:
         dirname = Filename(os.path.dirname(xcel))
     if dirname:
         # The directory part is '' for a bare file name, and
         #  os.makedirs('') raises FileNotFoundError
         os.makedirs(dirname, exist_ok=True)
     # Expected index (taxids) in column after taxa name, and last row will
     #  be removed (reserved for sum of reads in Excel file)
     mock_df = pd.read_excel(xcel, index_col=1, skipfooter=1, dtype=str)
     del mock_df['RECENTRIFUGE MOCK']
     vprint(gray('Layout to generate the mock files:\n'), mock_df, '\n')
     # items() replaces iteritems(), which was removed in pandas 2.0
     for name, series in mock_df.items():
         # Build a plain dict first (the default of to_dict), as
         #  series.to_dict(col.Counter) failed in previous versions
         mock_layout: Counter[Id] = col.Counter(series.to_dict())
         test: Filename = Filename(os.path.join(dirname, name + '.out'))
         if file:
             mock_from_source(test, mock_layout)
         else:
             mock_from_scratch(test, mock_layout)
コード例 #8
0
def select_lmat_inputs(lmats: List[Filename]) -> None:
    """Expand '.' into the LMAT sample subdirectories of the current dir.

    Only when ``lmats`` is exactly ``['.']`` the list is cleared and
    refilled in place with the sorted names of the non-hidden
    subdirectories of the current directory, leaving out the one named
    like the last component of TAXDUMP_PATH. In any case the resulting
    list is printed.

    Args:
        lmats: list of LMAT inputs, possibly modified in place.
    """
    if lmats == ['.']:
        lmats.clear()
        taxdump_name = os.path.basename(TAXDUMP_PATH)
        with os.scandir() as entries:
            for item in entries:
                if item.name.startswith('.') or not item.is_dir():
                    continue  # Hidden entry or not a directory
                if item.name == taxdump_name:
                    continue  # The taxdump directory is not a sample
                lmats.append(Filename(item.name))
        lmats.sort()
    print(gray('LMAT subdirs to analyze:'), lmats)
コード例 #9
0
 def by_mock_files() -> None:
     """Generate a mock .out file for every mock layout (.mck) file."""
     layouts = args.mock
     if len(layouts) == 1 and os.path.isdir(layouts[0]):
         # A single directory: the list is replaced in place by the .mck
         #  files found inside that directory
         select_centrifuge_inputs(layouts, ext='.mck')
     for mck_name in layouts:
         counts: Counter[Id] = read_mock_files(mck_name)
         target: Filename = Filename(mck_name.split('.mck')[0] + '.out')
         builder = mock_from_source if args.file else mock_from_scratch
         builder(target, counts)
コード例 #10
0
ファイル: krona.py プロジェクト: pythseq/recentrifuge
def krona_from_xml(
        xmlfile: Filename,
        htmlfile: Filename = Filename('Output' + HTML_SUFFIX),
):
    """Build the Krona html file by running the ktImportXML program.

    Args:
        xmlfile: XML file passed as input to ktImportXML.
        htmlfile: name given to ktImportXML with the -o option.

    A non-zero exit status of the program is reported with a message on
    the standard output; no exception is propagated in that case.
    """
    command = ["ktImportXML", xmlfile, "-o", htmlfile]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError:
        print('\n\033[91mERROR!\033[0m ktImportXML: '
              'returned a non-zero exit status (Krona plot built failed)')
コード例 #11
0
    def format_filename(fastq: Filename) -> Filename:
        """Auxiliary function to properly format the output filenames.

        The name is the input name without its extension, followed by
        '_rxtr', then '_incl' plus the included taxids and '_excl' plus
        the excluded taxids (each group joined by '_' and only present if
        not empty), and finally '.fastq'.

        Args:
            fastq: Complete filename of the fastq input file

        Returns: Filename of the rextracted fastq output file
        """
        fastq_filename, _ = os.path.splitext(fastq)
        output_list: List[str] = [fastq_filename, '_rxtr']
        # The taxids are sorted because the iteration order of a set of
        #  strings changes between runs, and so would the file name
        if including:
            output_list.append('_incl')
            output_list.append('_'.join(sorted(including)))
        if excluding:
            output_list.append('_excl')
            output_list.append('_'.join(sorted(excluding)))
        output_list.append('.fastq')
        return Filename(''.join(output_list))
コード例 #12
0
    def select_inputs():
        """Choose right input and output files"""
        nonlocal process, scoring, input_files, plasmidfile

        if outputs and len(outputs) == 1 and os.path.isdir(outputs[0]):
            select_centrifuge_inputs(outputs)
        if lmats:
            plasmidfile = Filename(os.path.join(args.nodespath, PLASMID_FILE))
            select_lmat_inputs(lmats)

        # Select method and arguments depending on type of files to analyze
        if lmats:
            process = process_output
            input_files = lmats
            scoring = Scoring.LMAT
        elif reports:
            process = process_report
            input_files = reports
        else:
            process = process_output
            input_files = outputs
コード例 #13
0
def read_lmat_output(
    output_file: Filename,
    scoring: Scoring = Scoring.LMAT,
    minscore: Score = None,
) -> Tuple[str, SampleStats, Counter[Id], Dict[Id, Score]]:
    """
    Read LMAT output (iterate over all the output files)

    Args:
        output_file: directory with the LMAT output files, or path plus
            file name prefix shared by the output files to read
        scoring: type of scoring to be applied (see Scoring class)
        minscore: minimum confidence level for the classification

    Returns:
        log string, sample statistics, abundances counter, scores dict

    Raises:
        Exception: if no output file is selected, a selected file is not
            found, no sequence is read, no sequence passes the filter, or
            the scoring is not Scoring.LMAT

    """
    output: io.StringIO = io.StringIO(newline='')
    all_scores: Dict[Id, List[Score]] = {}
    all_length: Dict[Id, List[int]] = {}
    nt_read: int = 0
    matchings: Counter[Match] = Counter()
    output_files: List[Filename] = []
    # Select files to process depending on if the output files are explicitly
    #  given or directory name is provided (all the output files there)
    if os.path.isdir(output_file):  # Just the directory name is provided
        dirname = os.path.normpath(output_file)
        for file in os.listdir(dirname):  # Add all LMAT output files in dir
            if ('_output' in file and file.endswith('.out')
                    and 'canVfin' not in file and 'pyLCA' not in file):
                output_files.append(Filename(file))
    else:  # Explicit path and file name prefix is given
        dirname, basename = os.path.split(output_file)
        # A bare prefix has an empty dirname and os.listdir('') raises
        #  FileNotFoundError, so the current directory is listed instead
        for file in os.listdir(dirname or os.curdir):
            if (file.startswith(basename) and file.endswith('.out')
                    and 'canVfin' not in file and 'pyLCA' not in file):
                output_files.append(Filename(file))
    if not output_files:
        raise Exception(
            f'\n\033[91mERROR!\033[0m Cannot read from "{output_file}"')
    # Read LMAT output files
    for output_name in output_files:
        path: Filename = Filename(os.path.join(dirname, output_name))
        output.write(f'\033[90mLoading output file {path}...\033[0m')
        try:
            with open(path, 'r') as io_file:
                for seq in SeqIO.parse(io_file, "lmat"):
                    tid: Id = seq.annotations['final_taxid']
                    score: Score = seq.annotations['final_score']
                    match: Match = Match.lmat(seq.annotations['final_match'])
                    # Every read counts for the totals, filtered or not
                    matchings[match] += 1
                    length: int = len(seq)
                    nt_read += length
                    if minscore is not None and score < minscore:
                        continue  # Ignore read if low score
                    # NOTE(review): DIRECTMATCH/MULTIMATCH are used here but
                    #  DIRECT/MULTI below; presumably aliases in Match,
                    #  confirm against the Match class
                    if match in [Match.DIRECTMATCH, Match.MULTIMATCH]:
                        all_scores.setdefault(tid, []).append(score)
                        all_length.setdefault(tid, []).append(length)
        except FileNotFoundError as err:
            raise Exception(
                red('\nERROR! ') + f'Cannot read "{path}"') from err
        output.write(green('OK!\n'))
    abundances: Counter[Id] = Counter(
        {tid: len(scores) for tid, scores in all_scores.items()})
    # Basic output statistics
    read_seqs: int = sum(matchings.values())
    if read_seqs == 0:
        raise Exception(
            red('\nERROR! ') +
            f'Cannot read any sequence from "{output_file}"')
    filt_seqs: int = sum(len(scores) for scores in all_scores.values())
    if filt_seqs == 0:
        raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
    stat: SampleStats = SampleStats(minscore=minscore,
                                    nt_read=nt_read,
                                    scores=all_scores,
                                    lens=all_length,
                                    seq_read=read_seqs,
                                    seq_filt=filt_seqs,
                                    seq_clas=matchings[Match.DIRECT] +
                                    matchings[Match.MULTI])
    output.write(
        gray('  Seqs read: ') + f'{stat.seq.read:_d}\t' + gray('[') +
        f'{stat.nt_read}' + gray(']\n'))
    output.write(
        gray('  Seqs clas: ') + f'{stat.seq.clas:_d}\t' + gray('(') +
        f'{stat.get_unclas_ratio():.2%}' + gray(' unclassified)\n'))
    output.write(
        gray('  Seqs pass: ') + f'{stat.seq.filt:_d}\t' + gray('(') +
        f'{stat.get_reject_ratio():.2%}' + gray(' rejected)\n'))
    # Fraction of the reads in each kind of match (read_seqs is not zero)
    multi_rel: float = matchings[Match.MULTI] / read_seqs
    direct_rel: float = matchings[Match.DIRECT] / read_seqs
    nodbhits_rel: float = matchings[Match.NODBHITS] / read_seqs
    tooshort_rel: float = matchings[Match.READTOOSHORT] / read_seqs
    lowscore_rel: float = matchings[Match.LOWSCORE] / read_seqs
    output.write(f'\033[90m  DB Matching: '
                 f'Multi =\033[0m {multi_rel:.1%}\033[90m  '
                 f'Direct =\033[0m {direct_rel:.1%}\033[90m  '
                 f'ReadTooShort =\033[0m {tooshort_rel:.1%}\033[90m  '
                 f'LowScore =\033[0m {lowscore_rel:.1%}\033[90m  '
                 f'NoDbHits =\033[0m {nodbhits_rel:.1%}\033[90m\n')
    output.write(
        gray('  Scores: min = ') + f'{stat.sco.mini:.1f},' + gray(' max = ') +
        f'{stat.sco.maxi:.1f},' + gray(' avr = ') + f'{stat.sco.mean:.1f}\n')
    output.write(
        gray('  Length: min = ') + f'{stat.len.mini},' + gray(' max = ') +
        f'{stat.len.maxi},' + gray(' avr = ') + f'{stat.len.mean}\n')
    output.write(f'  {stat.num_taxa}' + gray(' taxa with assigned reads\n'))
    # Select score output
    out_scores: Dict[Id, Score]
    if scoring is Scoring.LMAT:
        # The score of a taxid is the mean of the scores of its reads
        out_scores = {
            tid: Score(mean(scores)) for tid, scores in all_scores.items()
        }
    else:
        print(red('ERROR!'), f' LMAT: Unsupported Scoring "{scoring}"')
        raise Exception('Unsupported scoring')
    return output.getvalue(), stat, abundances, out_scores
コード例 #14
0
def generate_mock(
    ncbi: Taxonomy,
    file: Filename,
    rnd: int,
    mocks: List[Filename],
    xcel: Filename,
    debug: bool,
):
    """Generate mock Centrifuge output files from mock layouts.

    The layouts come from the mock (.mck) files in ``mocks`` or, if there
    are none, from the Excel file ``xcel``. If neither is given, test mode
    is entered: the test Excel file located beside this module is used
    with a fixed random seed.

    Args:
        ncbi: taxonomy used to get the name and rank of the taxids
        file: Centrifuge output file used as source of reads; if empty,
            the reads are generated from scratch
        rnd: the random hit lengths are drawn above this value
        mocks: mock layout files, or a single directory containing them
        xcel: Excel file with the layout of the mock files
        debug: print additional information
    """
    def vprint(*args):
        """Print only if verbose/debug mode is enabled"""
        if debug:
            print(*args, end='')
            sys.stdout.flush()

    def read_mock_files(mock: Filename) -> Counter[Id]:
        """Read a mock layout (.mck) file"""
        mock_layout: Counter[Id] = col.Counter()
        with open(mock, 'r') as mck:
            vprint(gray('\nProcessing'), blue(mock), gray('file:\n'))
            for line in mck:
                # Skip comments, and also blank lines that would break
                #  the unpacking of the two fields below
                if line.startswith('#') or not line.strip():
                    continue
                _tid, _num = line.split('\t')
                tid = Id(_tid)
                num = int(_num)
                mock_layout[tid] = num
                vprint(num, gray('\treads for taxid\t'), tid, '\t(',
                       cyan(ncbi.get_name(tid)), ')\n')
        return mock_layout

    def mock_from_source(out: Filename, mock_layout: Counter[Id]) -> None:
        """Generate a mock Centrifuge output file from source file"""
        with open(out, 'w') as fout, open(file) as fcfg:
            vprint(gray('Generating'), blue(out), gray('file... '))
            fout.write(fcfg.readline())  # copy cfg output file header
            reads_writen: int = 0
            for line in fcfg:
                tid = Id(line.split('\t')[2])
                if mock_layout[tid]:
                    # Copy the read and discount it from the pending ones
                    fout.write(line)
                    mock_layout[tid] -= 1
                    reads_writen += 1
                    if not sum(mock_layout.values()):
                        vprint(reads_writen, 'reads', green('OK!\n'))
                        break
        if sum(mock_layout.values()):
            # The source file ended with reads still pending for some taxid
            print(red('ERROR!\n'))
            print(gray('Incomplete read copy by taxid:'))
            mock_layout = +mock_layout  # Delete zero counts elements
            for tid in mock_layout:
                print(yellow(mock_layout[tid]), gray('reads missing for tid'),
                      tid, '(', cyan(ncbi.get_name(tid)), ')\n')

    def mock_from_scratch(out: Filename, mock_layout: Counter[Id]) -> None:
        """Generate a mock Centrifuge output file from scratch"""
        with open(out, 'w') as fout:
            vprint(gray('Generating'), blue(out), gray('file... '))
            fout.write('readID\tseqID\ttaxID\tscore\t2ndBestScore\t'
                       'hitLength\tqueryLength\tnumMatches\n')
            reads_writen: int = 0
            for numtid in mock_layout:
                tid = Id(numtid)  # Convert to Id the excel integer
                maxhl: int = random.randint(rnd + 1, MAX_HIT_LENGTH)
                rank: str = str(ncbi.get_rank(tid)).lower()
                for _ in range(int(mock_layout[numtid])):
                    hit_length = random.randint(rnd + 1, maxhl)
                    fout.write(f'test{reads_writen}\t{rank}\t'
                               f'{tid}\t{(hit_length - 15) ** 2}\t'
                               f'0\t{hit_length}\t{MAX_HIT_LENGTH}\t1\n')
                    reads_writen += 1
            vprint(reads_writen, 'reads', green('OK!\n'))
            if out == TEST_REXT_SMPL:  # Test mode: create mock FASTQ for smpl
                mock_fastq(reads_writen)

    def by_mock_files() -> None:
        """Do the job in case of mock files"""
        if len(mocks) == 1 and os.path.isdir(mocks[0]):
            # The list is replaced in place by the .mck files in the dir
            select_centrifuge_inputs(mocks, ext='.mck')
        for mock in mocks:
            mock_layout: Counter[Id] = read_mock_files(mock)
            test: Filename = Filename(mock.split('.mck')[0] + '.out')
            if file:
                mock_from_source(test, mock_layout)
            else:
                mock_from_scratch(test, mock_layout)

    def by_excel_file(dirname: Filename = None) -> None:
        """Do the job in case of Excel file with all the details"""
        if dirname is None:
            dirname = Filename(os.path.dirname(xcel))
        if dirname:
            # The directory part is '' for a bare file name, and
            #  os.makedirs('') raises FileNotFoundError
            os.makedirs(dirname, exist_ok=True)
        # Expected index (taxids) in column after taxa name, and last row will
        #  be removed (reserved for sum of reads in Excel file)
        mock_df = pd.read_excel(xcel, index_col=1, skipfooter=1, dtype=str)
        del mock_df['RECENTRIFUGE MOCK']
        vprint(gray('Layout to generate the mock files:\n'), mock_df, '\n')
        # items() replaces iteritems(), which was removed in pandas 2.0
        for name, series in mock_df.items():
            # Build a plain dict first (the default of to_dict), as
            #  series.to_dict(col.Counter) failed in previous versions
            mock_layout: Counter[Id] = col.Counter(series.to_dict())
            test: Filename = Filename(os.path.join(dirname, name + '.out'))
            if file:
                mock_from_source(test, mock_layout)
            else:
                mock_from_scratch(test, mock_layout)

    def mock_fastq(num_reads: int) -> None:
        """Write a mock FASTQ file with num_reads reads for the test sample"""
        def fastq_seqs(alphabet=single_letter_alphabet):
            """Generator function that creates mock fastq sequences
            """
            for seq in range(num_reads):
                yield SeqRecord(Seq('AGTC', alphabet),
                                id=f'test{seq}',
                                name=f'test{seq}',
                                description=f'test{seq}',
                                annotations={'quality': '@@@@'})

        print(gray('Writing'),
              magenta(f'{num_reads}'),
              gray('reads in'),
              TEST_REXT_FSTQ,
              gray('...'),
              end='',
              flush=True)
        SeqIO.write((sq for sq in fastq_seqs()), TEST_REXT_FSTQ, 'quickfastq')
        print(green(' OK!'))

    if mocks:
        by_mock_files()
    elif xcel:
        by_excel_file()
    else:  # Test mode
        path = os.path.dirname(os.path.realpath(__file__))
        xcel = Filename(os.path.join(path, TEST_MOCK_XLSX))
        vprint(gray('Test mode! Processing'), xcel, '\n')
        random.seed(18490)  # Fixed seed for reproducible test files
        by_excel_file(dirname=TEST_OUTPUT_DIR)
コード例 #15
0
def main():
    """Main entry point to script.

    Parse the command line, select the taxa to extract with the help of
    the NCBI taxonomy, collect the reads of the Centrifuge output file
    assigned to those taxa (and passing the minimum score, if any), and
    write the matching reads of the FASTQ file(s) to new FASTQ file(s).
    """
    # Argument Parser Configuration
    parser = argparse.ArgumentParser(
        description='Extract reads following Centrifuge/Kraken output',
        epilog=f'%(prog)s  - {__author__} - {__date__}')
    parser.add_argument('-V',
                        '--version',
                        action='version',
                        version=f'%(prog)s release {__version__} ({__date__})')
    parser.add_argument('-f',
                        '--file',
                        action='store',
                        metavar='FILE',
                        required=True,
                        help='Centrifuge output file.')
    parser.add_argument('-l',
                        '--limit',
                        action='store',
                        metavar='NUMBER',
                        type=int,
                        default=None,
                        help=('Limit of FASTQ reads to extract. '
                              'Default: no limit'))
    parser.add_argument(
        '-m',
        '--maxreads',
        action='store',
        metavar='NUMBER',
        type=int,
        default=None,
        help=('Maximum number of FASTQ reads to search for the taxa. '
              'Default: no maximum'))
    parser.add_argument(
        '-n',
        '--nodespath',
        action='store',
        metavar='PATH',
        default=TAXDUMP_PATH,
        help=('path for the nodes information files (nodes.dmp and names.dmp' +
              ' from NCBI)'))
    parser.add_argument(
        '-i',
        '--include',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to include a taxon and all underneath ' +
              '(multiple -i is available to include several taxid). ' +
              'By default all the taxa is considered for inclusion.'))
    parser.add_argument(
        '-x',
        '--exclude',
        action='append',
        metavar='TAXID',
        type=TaxId,
        default=[],
        help=('NCBI taxid code to exclude a taxon and all underneath ' +
              '(multiple -x is available to exclude several taxid)'))
    parser.add_argument(
        '-y',
        '--minscore',
        action='store',
        metavar='NUMBER',
        type=lambda txt: Score(float(txt)),
        default=None,
        help=('minimum score/confidence of the classification of a read '
              'to pass the quality filter; all pass by default'))
    filein = parser.add_mutually_exclusive_group(required=True)
    filein.add_argument('-q',
                        '--fastq',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Single FASTQ file (no paired-ends)')
    filein.add_argument('-1',
                        '--mate1',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Paired-ends FASTQ file for mate 1s '
                        '(filename usually includes _1)')
    parser.add_argument('-2',
                        '--mate2',
                        action='store',
                        metavar='FILE',
                        default=None,
                        help='Paired-ends FASTQ file for mate 2s '
                        '(filename usually includes _2)')

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} =-= {__date__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    args = parser.parse_args()
    output_file = args.file
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    excluding: Set[TaxId] = set(args.exclude)
    including: Set[TaxId] = set(args.include)
    fastq_2: Filename = args.mate2
    if fastq_2 and not args.mate1:
        # -q and -1 are mutually exclusive, so there is no mate 1 file
        parser.error('argument -2/--mate2 requires -1/--mate1')
    # Exactly one of -1 and -q is set; -1 alone works as a single FASTQ
    fastq_1: Filename = args.mate1 if args.mate1 else args.fastq

    # Load NCBI nodes, names and build children
    plasmidfile: Filename = None
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile, False,
                              excluding, including)

    # Build taxonomy tree
    print(gray('Building taxonomy tree...'), end='')
    sys.stdout.flush()
    tree = TaxTree()
    tree.grow(taxonomy=ncbi, look_ancestors=False)
    print(green(' OK!'))

    # Get the taxa
    print(gray('Filtering taxa...'), end='')
    sys.stdout.flush()
    ranks: Ranks = Ranks({})
    tree.get_taxa(ranks=ranks, include=including, exclude=excluding)
    print(green(' OK!'))
    taxids: Set[TaxId] = set(ranks)
    taxlevels: TaxLevels = Rank.ranks_to_taxlevels(ranks)
    num_taxlevels = Counter({rank: len(taxlevels[rank]) for rank in taxlevels})
    num_taxlevels = +num_taxlevels  # Delete zero counts elements

    # Statistics about including taxa
    print(f'  {len(taxids)}\033[90m taxid selected in \033[0m', end='')
    print(f'{len(num_taxlevels)}\033[90m different taxonomical levels:\033[0m')
    for rank in num_taxlevels:
        print(f'  Number of different {rank}: {num_taxlevels[rank]}')
    assert taxids, red('ERROR! No taxids to search for!')

    # Get the records
    records: List[SeqRecord] = []
    num_seqs: int = 0
    # timing initialization
    start_time_load: float = time.perf_counter()
    print(gray(f'Loading output file {output_file}...'), end='')
    sys.stdout.flush()
    try:
        # Plain 'r' mode: the 'U' flag was removed in Python 3.11
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            # Count from 1 so that num_seqs is the number of reads parsed
            for num_seqs, record in enumerate(
                    SeqIO.parse(file, 'centrifuge'), start=1):
                tid: TaxId = record.annotations['taxID']
                if tid not in taxids:
                    continue  # Ignore read if its taxid is not selected
                score: Score = Score(record.annotations['score'])
                if args.minscore is not None and score < args.minscore:
                    continue  # Ignore read if low score
                records.append(record)
    except FileNotFoundError:
        raise Exception(red('ERROR!') + 'Cannot read "' + output_file + '"')
    print(green(' OK!'))

    # Basic records statistics
    print(
        gray('  Load elapsed time: ') +
        f'{time.perf_counter() - start_time_load:.3g}' + gray(' sec'))
    # Avoid dividing by zero if the output file has no reads
    match_ratio: float = len(records) / num_seqs if num_seqs else 0.0
    print(f'  \033[90mMatching reads: \033[0m{len(records):_d} \033[90m\t'
          f'(\033[0m{match_ratio:.4%}\033[90m of sample)')
    sys.stdout.flush()

    # FASTQ sequence dealing
    # Set of the ids still to be found: an id is removed once extracted
    records_ids: Set[str] = {record.id for record in records}
    seqs1: List[SeqRecord] = []
    seqs2: List[SeqRecord] = []
    extracted: int = 0
    i: int = 0
    if fastq_2:
        print(
            f'\033[90mLoading FASTQ files {fastq_1} and {fastq_2}...\n'
            f'Mseqs: \033[0m',
            end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1, open(fastq_2, 'r') as file2:
                for i, (rec1, rec2) in enumerate(
                        zip(SeqIO.parse(file1, 'quickfastq'),
                            SeqIO.parse(file2, 'quickfastq'))):
                    # Stop if all found or any of the limits is reached
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit
                                                 and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass  # Read not selected
                    else:
                        seqs1.append(rec1)
                        seqs2.append(rec2)
                        extracted += 1

        except FileNotFoundError:
            raise Exception('\n\033[91mERROR!\033[0m Cannot read FASTQ files')
    else:
        print(f'\033[90mLoading FASTQ files {fastq_1}...\n'
              f'Mseqs: \033[0m',
              end='')
        sys.stdout.flush()
        try:
            with open(fastq_1, 'r') as file1:
                for i, rec1 in enumerate(SeqIO.parse(file1, 'quickfastq')):
                    # Stop if all found or any of the limits is reached
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit
                                                 and extracted >= args.limit):
                        break
                    elif not i % 1000000:
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass  # Read not selected
                    else:
                        seqs1.append(rec1)
                        extracted += 1
        except FileNotFoundError:
            raise Exception('\n\033[91mERROR!\033[0m Cannot read FASTQ file')
    print(cyan(f' {i/1e+6:.3g} Mseqs'), green('OK! '))

    def format_filename(fastq: Filename) -> Filename:
        """Auxiliary function to properly format the output filenames.

        Args:
            fastq: Complete filename of the fastq input file

        Returns: Filename of the rextracted fastq output file
        """
        fastq_filename, _ = os.path.splitext(fastq)
        output_list: List[str] = [fastq_filename, '_rxtr']
        # The taxids are sorted because the iteration order of a set of
        #  strings changes between runs, and so would the file name
        if including:
            output_list.append('_incl')
            output_list.append('_'.join(sorted(including)))
        if excluding:
            output_list.append('_excl')
            output_list.append('_'.join(sorted(excluding)))
        output_list.append('.fastq')
        return Filename(''.join(output_list))

    filename1: Filename = format_filename(fastq_1)
    SeqIO.write(seqs1, filename1, 'quickfastq')
    print(gray('Wrote'), magenta(f'{len(seqs1)}'), gray('reads in'), filename1)
    if fastq_2:
        filename2: Filename = format_filename(fastq_2)
        SeqIO.write(seqs2, filename2, 'quickfastq')
        print(gray('Wrote'), magenta(f'{len(seqs2)}'), gray('reads in'),
              filename2)

    # Timing results
    print(gray('Total elapsed time:'),
          time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
コード例 #16
0
def main():
    """Main entry point to Recentrifuge."""
    def configure_parser():
        """Argument Parser Configuration

        Builds the command line interface, organized in the argument
        groups 'input', 'output', 'tuning', 'fine tuning' and 'advanced'.

        Returns: the configured argparse.ArgumentParser (nothing is parsed
            here; the caller is expected to call parse_args() on it).
        """
        parser = argparse.ArgumentParser(
            description='Analyze results of metagenomic taxonomic classifiers',
            epilog=f'%(prog)s  - Release {__version__} - {__date__}' + LICENSE,
            formatter_class=argparse.RawDescriptionHelpFormatter)
        parser.add_argument(
            '-V',
            '--version',
            action='version',
            version=f'%(prog)s version {__version__} released in {__date__}')
        # --- Input group ---
        parser_in = parser.add_argument_group(
            'input', 'Define Recentrifuge input files and formats')
        parser_in.add_argument('-n',
                               '--nodespath',
                               action='store',
                               metavar='PATH',
                               default=TAXDUMP_PATH,
                               help=('path for the nodes information files '
                                     '(nodes.dmp and names.dmp from NCBI)'))
        # Exactly one kind of input (-f, -l, -k or -r) must be given; each
        # of them may be repeated as they use the 'append' action.
        parser_filein = parser_in.add_mutually_exclusive_group(required=True)
        parser_filein.add_argument(
            '-f',
            '--file',
            action='append',
            metavar='FILE',
            type=Filename,
            help=('Centrifuge output files. If a single directory is entered, '
                  'every .out file inside will be taken as a different sample.'
                  ' Multiple -f is available to include several samples.'))
        parser_filein.add_argument(
            '-l',
            '--lmat',
            action='append',
            metavar='FILE',
            type=Filename,
            default=None,
            help=('LMAT output dir or file prefix. If just "." is entered, '
                  'every subdirectory under the current directory will be '
                  'taken as a sample and scanned looking for LMAT output files'
                  '. Multiple -l is available to include several samples.'))
        parser_filein.add_argument(
            '-k',
            '--clark',
            action='append',
            metavar='FILE',
            type=Filename,
            help=('CLARK(S) output files. If a single directory is entered, '
                  'every .csv file inside will be taken as a different sample.'
                  ' Multiple -k is available to include several samples.'))
        parser_filein.add_argument(
            '-r',
            '--report',
            action='append',
            metavar='FILE',
            type=Filename,
            help=('Centrifuge/Kraken report files '
                  '(multiple -r is available to include several samples)'))
        # --- Output group ---
        parser_out = parser.add_argument_group(
            'output', 'Related to the Recentrifuge output files')
        parser_out.add_argument(
            '-o',
            '--outhtml',
            action='store',
            metavar='FILE',
            type=Filename,
            help='HTML output file (if not given, the filename will be '
            'inferred from input files)')
        # The valid values are the string forms of the Excel enum members
        parser_out.add_argument(
            '-e',
            '--excel',
            action='store',
            metavar='OUTPUT_TYPE',
            choices=[str(excel) for excel in Excel],
            default=str(Excel(0)),
            help=(f'type of excel report to be generated, and can be one of '
                  f'{[str(excel) for excel in Excel]}'))
        # --- Coarse tuning group ---
        parser_coarse = parser.add_argument_group(
            'tuning', 'Coarse tuning of algorithm parameters')
        # -c/--controls and -a/--avoidcross (added further below) cannot be
        # given together, but none of them is mandatory.
        parser_cross = parser_coarse.add_mutually_exclusive_group(
            required=False)
        parser_cross.add_argument(
            '-c',
            '--controls',
            action='store',
            metavar='CONTROLS_NUMBER',
            type=int,
            default=0,
            help=('this number of first samples will be treated as negative '
                  'controls; default is no controls'))
        # The valid values are the string forms of the Scoring enum members
        parser_coarse.add_argument(
            '-s',
            '--scoring',
            action='store',
            metavar='SCORING',
            choices=[str(each_score) for each_score in Scoring],
            default=str(Scoring(0)),
            help=(f'type of scoring to be applied, and can be one of '
                  f'{[str(scoring) for scoring in Scoring]}'))
        # The text given is converted to float and then wrapped in Score
        parser_coarse.add_argument(
            '-y',
            '--minscore',
            action='store',
            metavar='NUMBER',
            type=lambda txt: Score(float(txt)),
            default=None,
            help=('minimum score/confidence of the classification of a read '
                  'to pass the quality filter; all pass by default'))
        parser_coarse.add_argument(
            '-m',
            '--mintaxa',
            action='store',
            metavar='INT',
            type=int,
            default=DEFMINTAXA,
            help='minimum taxa to avoid collapsing one level to the parent one'
        )
        parser_coarse.add_argument(
            '-x',
            '--exclude',
            action='append',
            metavar='TAXID',
            type=Id,
            default=[],
            help=('NCBI taxid code to exclude a taxon and all underneath '
                  '(multiple -x is available to exclude several taxid)'))
        parser_coarse.add_argument(
            '-i',
            '--include',
            action='append',
            metavar='TAXID',
            type=Id,
            default=[],
            help=('NCBI taxid code to include a taxon and all underneath '
                  '(multiple -i is available to include several taxid); '
                  'by default, all the taxa are considered for inclusion'))
        parser_cross.add_argument('-a',
                                  '--avoidcross',
                                  action='store_true',
                                  help='avoid cross analysis')
        # --- Fine tuning group ---
        parser_fine = parser.add_argument_group(
            'fine tuning', 'Fine tuning of algorithm parameters')
        parser_fine.add_argument(
            '-z',
            '--ctrlminscore',
            action='store',
            metavar='NUMBER',
            type=lambda txt: Score(float(txt)),
            default=None,
            help=('minimum score/confidence of the classification of a read '
                  'in control samples to pass the quality filter; if defaults '
                  'to "minscore"'))
        parser_fine.add_argument(
            '-w',
            '--ctrlmintaxa',
            action='store',
            metavar='INT',
            type=int,
            default=None,
            help='minimum taxa to avoid collapsing one level to the parent one'
            ' in control samples; it defaults to "mintaxa"')
        parser_fine.add_argument(
            '-u',
            '--summary',
            action='store',
            metavar='OPTION',
            choices=['add', 'only', 'avoid'],
            default='add',
            help=(
                'select to "add" summary samples to other samples, or to '
                '"only" show summary samples or to "avoid" summaries at all'))
        parser_fine.add_argument(
            '-t',
            '--takeoutroot',
            action='store_true',
            help='remove counts directly assigned to the "root" level')
        parser_fine.add_argument('--nokollapse',
                                 action='store_true',
                                 help='show the "cellular organisms" taxon')
        # --- Advanced group ---
        parser_mode = parser.add_argument_group('advanced',
                                                'Advanced modes of running')
        # argparse.SUPPRESS keeps this flag out of the help message
        parser_mode.add_argument(
            '--dummy',  # hidden flag: just generate a dummy plot for JS debug
            action='store_true',
            help=argparse.SUPPRESS)
        parser_mode.add_argument(
            '-g',
            '--debug',
            action='store_true',
            help='increase output verbosity and perform additional checks')
        parser_mode.add_argument('--sequential',
                                 action='store_true',
                                 help='deactivate parallel processing')
        return parser

    def check_debug():
        """Report the active parameters when running in debugging mode.

        Nothing is printed unless args.debug is set. Only the parsed
        arguments with a truthy value are listed.
        """
        if not args.debug:
            return
        print(blue('INFO:'), gray('Debugging mode activated'))
        print(blue('INFO:'), gray('Active parameters:'))
        for name, setting in vars(args).items():
            if not setting:
                continue  # Unset or empty option: nothing to show
            print(gray(f'\t{name} ='), f'{setting}')

    def select_inputs():
        """Choose right classifier, input and output files"""
        nonlocal process, scoring, input_files, plasmidfile, classifier

        if reports:
            classifier = Classifier.KRAKEN
            process = process_report
            input_files = reports
        elif clarks:
            classifier = Classifier.CLARK
            process = process_output
            input_files = clarks
            if len(clarks) == 1 and os.path.isdir(clarks[0]):
                select_clark_inputs(clarks)
        elif lmats:
            classifier = Classifier.LMAT
            scoring = Scoring.LMAT
            process = process_output
            input_files = lmats
            plasmidfile = Filename(os.path.join(args.nodespath, PLASMID_FILE))
            select_lmat_inputs(lmats)
        elif outputs:
            classifier = Classifier.CENTRIFUGE
            process = process_output
            input_files = outputs
            if len(outputs) == 1 and os.path.isdir(outputs[0]):
                select_centrifuge_inputs(outputs)

    def check_controls():
        """Validate the number of control samples and list them.

        Does nothing if no controls were requested. Exits with status 1
        if more controls than input files were requested; otherwise, the
        first args.controls input files are printed as control samples.
        """
        num_ctrl = args.controls
        if not num_ctrl:
            return
        if num_ctrl > len(input_files):
            print(red(' ERROR!'), gray('More controls than samples'))
            exit(1)
        print(gray('Control(s) sample(s) for subtractions:'))
        for pos in range(num_ctrl):
            print(blue(f'\t{input_files[pos]}'))

    def select_html_file():
        """HTML filename selection"""
        nonlocal htmlfile
        if lmats:  # Select case for dir name or filename prefix
            if os.path.isdir(lmats[0]):  # Dir name
                dirname = os.path.dirname(os.path.normpath(lmats[0]))
                if not dirname or dirname == '.':
                    basename = 'output'
                else:
                    basename = os.path.basename(dirname)
            else:  # Explicit path and file name prefix is provided
                dirname, basename = os.path.split(lmats[0])
            htmlfile = Filename(os.path.join(dirname, basename + HTML_SUFFIX))
        elif reports:
            htmlfile = Filename(reports[0].split('_mhl')[0] + HTML_SUFFIX)
        else:
            htmlfile = Filename(outputs[0].split('_mhl')[0] + HTML_SUFFIX)

    def read_samples():
        """Read samples

        Calls `process` on every input file, flagging the first
        args.controls files as controls, and stores the results in the
        containers of the enclosing scope (samples, trees, taxids, counts,
        accs, scores and stats). The program exits with status 1 if any
        file yields Err.VOID_CTRL. Finally, the names of the samples read
        are appended to raw_samples.

        The files are processed by a pool of 'fork'ed workers unless
        args.sequential is set or platform.system() returns an empty
        string (unknown platform).
        """
        def store_result(result):
            """Save the outcome of one sample or abort on a void control"""
            sample, tree, out, stat, err = result
            if err is Err.NO_ERROR:
                samples.append(sample)
                trees[sample] = tree
                taxids[sample] = out.get_taxlevels()
                counts[sample] = out.counts
                accs[sample] = out.accs
                scores[sample] = out.scores
                stats[sample] = stat
            elif err is Err.VOID_CTRL:
                print('There were void controls.', red('Aborting!'))
                exit(1)

        print(gray('\nPlease, wait, processing files in parallel...\n'))
        # Enable parallelization with 'fork' under known platforms
        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')
            # os.cpu_count() returns None if the number of CPUs cannot be
            # determined: fall back to a single worker in that case
            workers = min(os.cpu_count() or 1, len(input_files))
            with mpctx.Pool(processes=workers) as pool:
                async_results = [
                    pool.apply_async(
                        process,
                        args=[file, num < args.controls],  # name, is ctrl?
                        kwds=kwargs)
                    for num, file in enumerate(input_files)
                ]
                # Wait for every worker before storing any result
                for result in [r.get() for r in async_results]:
                    store_result(result)
        else:  # sequential processing of each sample
            for num, file in enumerate(input_files):
                store_result(process(file, num < args.controls, **kwargs))
        raw_samples.extend(samples)  # Store raw sample names

    def analyze_samples():
        """Cross analysis of samples in parallel by taxlevel

        Extends kwargs with the data collected so far (taxids, counts,
        scores, accs and raw_samples) and calls process_rank for every rank
        in Rank.selected_ranks. The samples returned are appended to
        `samples` and the returned mappings are merged into `counts`,
        `accs` and `scores`.

        The ranks are processed by a pool of 'fork'ed workers unless
        args.sequential is set or platform.system() returns an empty
        string (unknown platform).
        """
        def merge_result(result):
            """Add the outcome of the analysis of one rank to the totals"""
            smpls, abunds, accumulators, score = result
            samples.extend(smpls)
            counts.update(abunds)
            accs.update(accumulators)
            scores.update(score)

        print(gray('Please, wait. Performing cross analysis in parallel...\n'))
        # Update kwargs with more parameters for the followings func calls
        kwargs.update({
            'taxids': taxids,
            'counts': counts,
            'scores': scores,
            'accs': accs,
            'raw_samples': raw_samples
        })
        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')
            # os.cpu_count() returns None if the number of CPUs cannot be
            # determined: fall back to a single worker in that case
            workers = min(os.cpu_count() or 1, len(Rank.selected_ranks))
            with mpctx.Pool(processes=workers) as pool:
                async_results = [
                    pool.apply_async(process_rank, args=[level], kwds=kwargs)
                    for level in Rank.selected_ranks
                ]
                # Wait for every worker before merging any result
                for result in [r.get() for r in async_results]:
                    merge_result(result)
        else:  # sequential processing of each selected rank
            for level in Rank.selected_ranks:
                merge_result(process_rank(level, **kwargs))

    def summarize_samples():
        """Summary of samples in parallel by type of cross-analysis

        Collects the analyses to summarize: one '{raw}_{study}' entry for
        each raw sample and study (STR_EXCLUSIVE, STR_CONTROL) having at
        least one sample whose name starts that way, plus STR_SHARED and
        STR_CONTROL_SHARED if any sample name starts with them. Then,
        summarize_analysis is called for each entry and every non-empty
        summary is appended to `summaries` and saved in `counts`, `accs`
        and `scores`. The elapsed time is printed at the end.

        The analyses are processed by a pool of 'fork'ed workers unless
        args.sequential is set or platform.system() returns an empty
        string (unknown platform).
        """
        def store_summary(result):
            """Save the outcome of one summary unless it is empty"""
            summary, abund, acc, score = result
            if summary:  # Avoid adding empty samples
                summaries.append(summary)
                counts[summary] = abund
                accs[summary] = acc
                scores[summary] = score

        # timing initialization
        summ_start_time: float = time.perf_counter()
        print(gray('Please, wait. Generating summaries in parallel...'))
        # Update kwargs with more parameters for the followings func calls
        kwargs.update({'samples': samples})
        # Get list of set of samples to summarize (note pylint bug #776)
        # pylint: disable=unsubscriptable-object
        target_analysis: col.OrderedDict[str, None] = col.OrderedDict({
            f'{raw}_{study}': None
            for study in [STR_EXCLUSIVE, STR_CONTROL] for raw in raw_samples
            for smpl in samples if smpl.startswith(f'{raw}_{study}')
        })
        # pylint: enable=unsubscriptable-object
        # Add shared and control_shared analysis if they exist (are not void)
        for study in [STR_SHARED, STR_CONTROL_SHARED]:
            if any(smpl.startswith(study) for smpl in samples):
                target_analysis[study] = None

        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')
            # os.cpu_count() returns None if the number of CPUs cannot be
            # determined: fall back to a single worker in that case.
            # NOTE(review): the pool is sized by the number of input files
            # and not by the number of analyses to summarize -- confirm.
            workers = min(os.cpu_count() or 1, len(input_files))
            with mpctx.Pool(processes=workers) as pool:
                async_results = [
                    pool.apply_async(summarize_analysis,
                                     args=[analysis],
                                     kwds=kwargs)
                    for analysis in target_analysis
                ]
                # Wait for every worker before storing any summary
                for result in [r.get() for r in async_results]:
                    store_summary(result)
        else:  # sequential processing of each analysis
            for analysis in target_analysis:
                store_summary(summarize_analysis(analysis, **kwargs))
        # Timing results
        print(gray('Summary elapsed time:'),
              f'{time.perf_counter() - summ_start_time:.3g}', gray('sec'))

    def generate_krona():
        """Build the multiple taxonomy tree and write the Krona HTML plot.

        The KronaTree is created with the global minimum and maximum score
        found among the samples having any score. Then, `polytree` is grown
        with the counts, accs and scores, it is dumped to the Krona tree as
        XML, and the result is saved as HTML in `htmlfile`.
        """

        print(gray('\nBuilding the taxonomy multiple tree... '), end='')
        sys.stdout.flush()
        # Samples without scores are left out of the global score range
        scored = [scores[smpl] for smpl in samples if len(scores[smpl])]
        lowest = Score(min([min(by_id.values()) for by_id in scored]))
        highest = Score(max([max(by_id.values()) for by_id in scored]))
        krona: KronaTree = KronaTree(
            samples,
            num_raw_samples=len(raw_samples),
            stats=stats,
            min_score=lowest,
            max_score=highest,
            scoring=scoring,
        )
        polytree.grow(ontology=ncbi,
                      abundances=counts,
                      accs=accs,
                      scores=scores)
        print(green('OK!'))
        plot_msg = (gray('Generating final plot (') + magenta(htmlfile) +
                    gray(')... '))
        print(plot_msg, end='')
        sys.stdout.flush()
        polytree.toxml(ontology=ncbi, krona=krona)
        krona.tohtml(htmlfile, pretty=False)
        print(green('OK!'))

    def generate_excel():
        """Generate Excel with results via pandas DataFrame

        The workbook is named after `htmlfile`, replacing '.html' by
        '.xlsx'. It always has a '_sample_stats' sheet with the statistics
        of the raw samples, followed by:
          - Excel.FULL: a single sheet with the items of `polytree` for all
            the samples, with COUNT, UNASSIGNED and SCORE columns for each
            sample plus 'Rank' and 'Name' columns under 'Details'.
          - Excel.CMPLXCRUNCHER: a sheet for the raw samples and, if there
            are controls, a sheet for each rank in Rank.selected_ranks with
            the samples whose name ends in _(STR_CONTROL)_(rank).

        Raises:
            Exception: if `excel` is none of the options above.
        """

        def frame_from_rows(rows, columns):
            """Build a DataFrame with a row for each (id, values) pair"""
            # DataFrame.from_items() was removed in pandas 1.0: from_dict()
            # with orient='index' is its replacement (a dict preserves the
            # insertion order of the rows).
            # NOTE(review): duplicated ids in rows would collapse into one
            # row here; ids are presumably unique -- confirm.
            frame = pd.DataFrame.from_dict(dict(rows),
                                           orient='index',
                                           columns=columns)
            frame.index.names = ['Id']
            return frame

        xlsx_name: Filename = Filename(htmlfile.split('.html')[0] + '.xlsx')
        print(gray(f'Generating Excel {str(excel).lower()} summary (') +
              magenta(xlsx_name) + gray(')... '),
              end='')
        sys.stdout.flush()
        # The writer is used as context manager so the workbook is saved
        # and closed on exit (ExcelWriter.save() was removed in pandas 2.0)
        with pd.ExcelWriter(xlsx_name) as writer:
            list_rows: List = []

            # Save raw samples basic statistics
            data_frame: pd.DataFrame = pd.DataFrame.from_dict(
                {raw: stats[raw].to_dict()
                 for raw in raw_samples})
            data_frame.to_excel(writer, sheet_name='_sample_stats')

            # Save taxid related statistics per sample
            if excel is Excel.FULL:
                polytree.to_items(ontology=ncbi, items=list_rows)
                # Generate the pandas DataFrame from items and export to Excel
                iterable_1 = [samples, [COUNT, UNASSIGNED, SCORE]]
                cols1 = pd.MultiIndex.from_product(iterable_1,
                                                   names=['Samples', 'Stats'])
                iterable_2 = [['Details'], ['Rank', 'Name']]
                cols2 = pd.MultiIndex.from_product(iterable_2)
                cols = cols1.append(cols2)
                data_frame = frame_from_rows(list_rows, cols)
                data_frame.to_excel(writer, sheet_name=str(excel))
            elif excel is Excel.CMPLXCRUNCHER:
                target_ranks: List = [Rank.NO_RANK]
                if args.controls:  # if controls, add specific sheet for rank
                    target_ranks.extend(Rank.selected_ranks)
                for rank in target_ranks:
                    indexes: List[int]
                    sheet_name: str
                    columns: List[str]
                    if rank is Rank.NO_RANK:  # No rank dependency
                        indexes = list(range(len(raw_samples)))
                        sheet_name = f'raw_samples_{rank.name.lower()}'
                        columns = raw_samples
                    else:  # Ranks other than NO_RANK imply there are controls
                        indexes = [
                            i for i in range(len(raw_samples), len(samples))
                            # Check if sample ends in _(STR_CONTROL)_(rank)
                            if (STR_CONTROL in samples[i].split('_')[-2:]
                                and rank.name.lower()
                                in samples[i].split('_')[-1:])
                        ]
                        sheet_name = f'{STR_CONTROL}_{rank.name.lower()}'
                        columns = [
                            samples[i].replace(
                                '_' + STR_CONTROL + '_' + rank.name.lower(),
                                '') for i in indexes
                        ]
                    list_rows = []
                    polytree.to_items(ontology=ncbi,
                                      items=list_rows,
                                      sample_indexes=indexes)
                    data_frame = frame_from_rows(list_rows, columns)
                    data_frame.to_excel(writer, sheet_name=sheet_name)
            else:
                raise Exception(red('\nERROR!'),
                                f'Unknown Excel option "{excel}"')
        print(green('OK!'))

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} - {__date__}'
          f' =-= by {__author__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    argparser = configure_parser()
    args = argparser.parse_args()
    # The four input options belong to a required mutually exclusive group
    outputs: List[Filename] = args.file
    reports: List[Filename] = args.report
    lmats: List[Filename] = args.lmat
    clarks: List[Filename] = args.clark
    # Just declared here: it is assigned by select_inputs() below
    input_files: List[Filename]
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    htmlfile: Filename = args.outhtml
    collapse: bool = not args.nokollapse
    excluding: Set[Id] = set(args.exclude)
    including: Set[Id] = set(args.include)
    scoring: Scoring = Scoring[args.scoring]
    excel: Excel = Excel[args.excel]

    check_debug()

    # plasmidfile is only set by select_inputs() for LMAT input
    plasmidfile: Filename = None
    # classifier and process are just declared: select_inputs() assigns them
    classifier: Classifier
    process: Callable[..., Tuple[Sample, TaxTree, SampleDataById, SampleStats,
                                 Err]]
    select_inputs()
    check_controls()
    # Infer the HTML output filename only if not given with -o/--outhtml
    if not htmlfile:
        select_html_file()

    # Load NCBI nodes, names and build children
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile, collapse,
                              excluding, including, args.debug)

    # If dummy flag enabled, just create dummy krona and exit
    if args.dummy:
        _debug_dummy_plot(ncbi, htmlfile, scoring)
        exit(0)

    # Declare variables that will hold results for the samples analyzed
    # (they are filled in by the nested functions called below)
    trees: Dict[Sample, TaxTree] = {}
    counts: Dict[Sample, Counter[Id]] = {}
    accs: Dict[Sample, Counter[Id]] = {}
    taxids: Dict[Sample, TaxLevels] = {}
    scores: Dict[Sample, Dict[Id, Score]] = {}
    stats: Dict[Sample, SampleStats] = {}
    samples: List[Sample] = []
    raw_samples: List[Sample] = []

    # Define dictionary of parameters for methods to be called (to be extended)
    # Control-specific thresholds fall back to the general ones if not given
    kwargs = {
        'controls':
        args.controls,
        'ctrlminscore': (args.ctrlminscore
                         if args.ctrlminscore is not None else args.minscore),
        'ctrlmintaxa':
        (args.ctrlmintaxa if args.ctrlmintaxa is not None else args.mintaxa),
        'debug':
        args.debug,
        'root':
        args.takeoutroot,
        'classifier':
        classifier,
        'minscore':
        args.minscore,
        'mintaxa':
        args.mintaxa,
        'scoring':
        scoring,
        'ontology':
        ncbi,
    }
    # The big stuff (done in parallel)
    read_samples()
    # Avoid cross analysis if just one report file or explicitly stated by flag
    if len(raw_samples) > 1 and not args.avoidcross:
        analyze_samples()
        if args.summary != 'avoid':
            # summaries has to exist before the call: summarize_samples()
            # appends the summary samples to it
            summaries: List[Sample] = []
            summarize_samples()
            if args.summary == 'only':
                # Drop the cross analysis samples: keep raw and summaries
                samples = raw_samples + summaries
            else:
                samples.extend(summaries)
    # Final result generation is done in sequential mode

    # polytree is used by generate_krona() and generate_excel()
    polytree: MultiTree = MultiTree(samples=samples)
    generate_krona()
    if _USE_PANDAS:
        generate_excel()
    else:
        print(yellow('WARNING!'),
              'Pandas not installed: Excel cannot be created.')

    # Timing results
    print(gray('Total elapsed time:'),
          time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
コード例 #17
0
    def generate_excel():
        """Generate Excel with results via pandas DataFrame

        The workbook is named after `htmlfile`, replacing '.html' by
        '.xlsx'. It always has a '_sample_stats' sheet with the statistics
        of the raw samples, followed by:
          - Excel.FULL: a single sheet with the items of `polytree` for all
            the samples, with COUNT, UNASSIGNED and SCORE columns for each
            sample plus 'Rank' and 'Name' columns under 'Details'.
          - Excel.CMPLXCRUNCHER: with controls, a sheet for each rank among
            species, genus, family and order, with the samples whose name
            starts with STR_CONTROL and contains the rank name; without
            controls, a single sheet with the raw samples.

        Raises:
            Exception: if `excel` is none of the options above.
        """

        def frame_from_rows(rows, columns):
            """Build a DataFrame with a row for each (taxid, values) pair"""
            # DataFrame.from_items() was removed in pandas 1.0: from_dict()
            # with orient='index' is its replacement (a dict preserves the
            # insertion order of the rows).
            # NOTE(review): duplicated taxids in rows would collapse into
            # one row here; taxids are presumably unique -- confirm.
            frame = pd.DataFrame.from_dict(dict(rows),
                                           orient='index',
                                           columns=columns)
            frame.index.names = ['TaxId']
            return frame

        xlsx_name: Filename = Filename(htmlfile.split('.html')[0] + '.xlsx')
        print(gray(f'Generating Excel {str(excel).lower()} summary (') +
              magenta(xlsx_name) + gray(')... '),
              end='')
        sys.stdout.flush()
        # The writer is used as context manager so the workbook is saved
        # and closed on exit (ExcelWriter.save() was removed in pandas 2.0)
        with pd.ExcelWriter(xlsx_name) as writer:
            list_rows: List = []

            # Save raw samples basic statistics
            data_frame: pd.DataFrame = pd.DataFrame.from_dict(
                {raw: stats[raw].to_dict()
                 for raw in raw_samples})
            data_frame.to_excel(writer, sheet_name='_sample_stats')

            # Save taxid related statistics per sample
            if excel is Excel.FULL:
                polytree.to_items(taxonomy=ncbi, items=list_rows)
                # Generate the pandas DataFrame from items and export to Excel
                iterable_1 = [samples, [COUNT, UNASSIGNED, SCORE]]
                cols1 = pd.MultiIndex.from_product(iterable_1,
                                                   names=['Samples', 'Stats'])
                iterable_2 = [['Details'], ['Rank', 'Name']]
                cols2 = pd.MultiIndex.from_product(iterable_2)
                cols = cols1.append(cols2)
                data_frame = frame_from_rows(list_rows, cols)
                data_frame.to_excel(writer, sheet_name=str(excel))
            elif excel is Excel.CMPLXCRUNCHER:
                target_ranks: List = [Rank.NO_RANK]
                if args.controls:
                    target_ranks = [
                        Rank.SPECIES,
                        Rank.GENUS,  # Ranks of interest
                        Rank.FAMILY,
                        Rank.ORDER
                    ]  # for cmplxcruncher
                for rank in target_ranks:
                    indexes: List[int]
                    sheet_name: str
                    columns: List[str]
                    if args.controls:
                        indexes = [
                            i for i in range(len(raw_samples), len(samples))
                            if (samples[i].startswith(STR_CONTROL)
                                and rank.name.lower() in samples[i])
                        ]
                        sheet_name = f'{STR_CONTROL}_{rank.name.lower()}'
                        columns = [samples[i].split('_')[2] for i in indexes]
                    else:  # No rank dependency
                        indexes = list(range(len(raw_samples)))
                        sheet_name = f'raw_samples_{rank.name.lower()}'
                        columns = [samples[i].split('_')[0] for i in indexes]
                    list_rows = []
                    polytree.to_items(taxonomy=ncbi,
                                      items=list_rows,
                                      sample_indexes=indexes)
                    data_frame = frame_from_rows(list_rows, columns)
                    data_frame.to_excel(writer, sheet_name=sheet_name)
            else:
                raise Exception(red('\nERROR!'),
                                f'Unknown Excel option "{excel}"')
        print(green('OK!'))
コード例 #18
0
def main():
    """Main entry point to script.

    Print the program header, parse the command line, load the NCBI
    taxonomy from the nodes and names files and then generate the mock
    Centrifuge output files, either from mock layout (.mck) files or
    from an Excel file holding the layout.
    """

    def vprint(*args):
        """Print only if verbose/debug mode is enabled"""
        if debug:
            print(*args, end='')
            sys.stdout.flush()

    def configure_parser():
        """Argument Parser Configuration

        Returns:
            The configured argparse.ArgumentParser. Exactly one of
            -f/-r (source mode) is required, as well as exactly one of
            the layout inputs (-m, plus -x if pandas is available).
        """
        parser = argparse.ArgumentParser(
            description='Generate mock samples for Recentrifuge testing',
            epilog=f'%(prog)s  - Release {__version__} - {__date__}' + LICENSE,
            formatter_class=argparse.RawDescriptionHelpFormatter)
        parser_mode = parser.add_mutually_exclusive_group(required=True)
        parser_mode.add_argument(
            '-f',
            '--file',
            action='store',
            metavar='FILE',
            type=Filename,
            help='Explicit source: Centrifuge output file as source')
        parser_mode.add_argument(
            '-r',
            '--random',
            action='store',
            metavar='MHL',
            type=int,
            default=15,
            help=('Random score generated. Please provide the minimum hit '
                  'length (mhl) of the classification; 15 by default'))
        parser.add_argument(
            '-g',
            '--debug',
            action='store_true',
            help='increase output verbosity and perform additional checks')
        parser_input = parser.add_mutually_exclusive_group(required=True)
        parser_input.add_argument(
            '-m',
            '--mock',
            action='append',
            metavar='FILE',
            type=Filename,
            help=('Mock files to be read for mock Centrifuge sequences layout.'
                  ' If a single directory is entered, every .mck file inside '
                  'will be taken as a different sample. '
                  'Multiple -m is available to include several samples.'))
        if _USE_PANDAS:  # The Excel option needs pandas to read the file
            parser_input.add_argument('-x',
                                      '--xcel',
                                      action='store',
                                      metavar='FILE',
                                      type=Filename,
                                      help='Excel file with the mock layout.')
        parser.add_argument('-n',
                            '--nodespath',
                            action='store',
                            metavar='PATH',
                            default=TAXDUMP_PATH,
                            help=('path for the nodes information files '
                                  '(nodes.dmp and names.dmp from NCBI)'))
        parser.add_argument(
            '-V',
            '--version',
            action='version',
            version=f'%(prog)s release {__version__} ({__date__})')
        return parser

    def check_debug():
        """Check debugging mode"""
        if args.debug:
            print(gray('INFO: Debugging mode activated\n'))

    def read_mock_files(mock: Filename) -> Counter[Id]:
        """Read a mock layout (.mck) file

        Lines starting with '#' and blank lines are skipped. Any other
        line must hold a taxid and a number of reads separated by a tab.

        Returns:
            Counter mapping every taxid to its requested number of reads.
        """
        mock_layout: Counter[Id] = col.Counter()
        with open(mock, 'r') as file:
            vprint(gray('\nProcessing'), blue(mock), gray('file:\n'))
            for line in file:
                # Ignore comments and empty lines (e.g. a trailing newline)
                if line.startswith('#') or not line.strip():
                    continue
                _tid, _num = line.split('\t')
                tid = Id(_tid)
                num = int(_num)
                mock_layout[tid] = num
                vprint(num, gray('\treads for taxid\t'), tid, '\t(',
                       cyan(ncbi.get_name(tid)), ')\n')
        return mock_layout

    def mock_from_source(out: Filename, mock_layout: Counter[Id]) -> None:
        """Generate a mock Centrifuge output file from source file

        Copy lines from the source file (args.file) into `out` until the
        number of reads requested per taxid in `mock_layout` is reached.
        The counts in `mock_layout` are decremented in place. If the
        source runs out of reads, the missing ones are reported.
        """
        with open(out, 'w') as fout, open(args.file) as fcfg:
            vprint(gray('Generating'), blue(out), gray('file... '))
            fout.write(fcfg.readline())  # copy cfg output file header
            reads_written: int = 0
            for line in fcfg:
                tid = Id(line.split('\t')[2])  # taxid is the third column
                if mock_layout[tid]:
                    fout.write(line)
                    mock_layout[tid] -= 1
                    reads_written += 1
                    if not sum(mock_layout.values()):  # Layout completed
                        vprint(reads_written, 'reads', green('OK!\n'))
                        break
        if sum(mock_layout.values()):
            print(red('ERROR!\n'))
            print(gray('Incomplete read copy by taxid:'))
            mock_layout = +mock_layout  # Delete zero counts elements
            for tid in mock_layout:
                print(yellow(mock_layout[tid]), gray('reads missing for tid'),
                      tid, '(', cyan(ncbi.get_name(tid)), ')\n')

    def mock_from_scratch(out: Filename, mock_layout: Counter[Id]) -> None:
        """Generate a mock Centrifuge output file from scratch

        For each taxid in `mock_layout` write as many reads as requested,
        each with a random hit length between args.random + 1 and a
        per-taxid random maximum that is capped by MAX_HIT_LENGTH.
        """
        with open(out, 'w') as fout:
            vprint(gray('Generating'), blue(out), gray('file... '))
            fout.write('readID\tseqID\ttaxID\tscore\t2ndBestScore\t'
                       'hitLength\tqueryLength\tnumMatches\n')
            reads_written: int = 0
            for numtid in mock_layout:
                tid = Id(numtid)  # Convert to Id the excel integer
                maxhl: int = random.randint(args.random + 1, MAX_HIT_LENGTH)
                rank: str = str(ncbi.get_rank(tid)).lower()
                for _ in range(int(mock_layout[numtid])):
                    hit_length = random.randint(args.random + 1, maxhl)
                    # The rank name is written in the seqID column and the
                    #  score is the squared excess of the hit length over 15
                    fout.write(f'test{reads_written}\t{rank}\t'
                               f'{tid}\t{(hit_length-15)**2}\t'
                               f'0\t{hit_length}\t{MAX_HIT_LENGTH}\t1\n')
                    reads_written += 1
            vprint(reads_written, 'reads', green('OK!\n'))

    def by_mock_files() -> None:
        """Do the job in case of mock files"""
        # A single directory is expanded to the .mck files found inside it
        if len(args.mock) == 1 and os.path.isdir(args.mock[0]):
            select_centrifuge_inputs(args.mock, ext='.mck')
        for mock in args.mock:
            mock_layout: Counter[Id] = read_mock_files(mock)
            test: Filename = Filename(mock.split('.mck')[0] + '.out')
            if args.file:
                mock_from_source(test, mock_layout)
            else:
                mock_from_scratch(test, mock_layout)

    def by_excel_file() -> None:
        """Do the job in case of Excel file with all the details

        Every column of the Excel sheet (but 'RECENTRIFUGE MOCK') is a
        sample; its mock output is written next to the Excel file.
        """
        dirname = os.path.dirname(args.xcel)
        # Expected index (taxids) in column after taxa name, and last row will
        #  be removed (reserved for sum of reads in Excel file)
        # NOTE(review): cells are read as str (dtype=str); mock_from_scratch
        #  converts the counts with int() but mock_from_source decrements
        #  them directly -- confirm the Excel + source file combination.
        mock_df = pd.read_excel(args.xcel,
                                index_col=1,
                                skipfooter=1,
                                dtype=str)
        del mock_df['RECENTRIFUGE MOCK']
        vprint(gray('Layout to generate the mock files:\n'), mock_df, '\n')
        for name, series in mock_df.items():
            # Build the Counter from a plain dict, as asking to_dict() for
            #  a Counter directly was found to fail
            mock_layout: Counter[Id] = col.Counter(series.to_dict())
            test: Filename = Filename(os.path.join(dirname, name + '.out'))
            if args.file:
                mock_from_source(test, mock_layout)
            else:
                mock_from_scratch(test, mock_layout)

    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} - {__date__}'
          f' =-= by {__author__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    argparser = configure_parser()
    args = argparser.parse_args()
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    debug: bool = args.debug

    check_debug()

    # Load NCBI nodes, names and build children
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, None, False)

    # One of the layout inputs is guaranteed by the required parser group
    if args.mock:
        by_mock_files()
    elif args.xcel:
        by_excel_file()
コード例 #19
0
    def generate_excel():
        """Generate Excel with results via pandas DataFrame

        The workbook is named after the HTML output file (with .xlsx
        extension). It always holds a '_sample_stats' sheet with the
        statistics of the raw samples; the remaining sheets depend on
        the selected Excel option (full or cmplxcruncher).

        Raises:
            Exception: if the Excel option is not a known one.
        """

        xlsx_name: Filename = Filename(htmlfile.split('.html')[0] + '.xlsx')
        print(gray(f'Generating Excel {str(excel).lower()} summary (') +
              magenta(xlsx_name) + gray(')... '),
              end='')
        sys.stdout.flush()
        xlsxwriter = pd.ExcelWriter(xlsx_name)
        list_rows: List = []

        # Save raw samples basic statistics
        data_frame: pd.DataFrame = pd.DataFrame.from_dict(
            {raw: stats[raw].to_dict()
             for raw in raw_samples})
        data_frame.to_excel(xlsxwriter, sheet_name='_sample_stats')

        # Save taxid related statistics per sample
        if excel is Excel.FULL:
            polytree.to_items(ontology=ncbi, items=list_rows)
            # Generate the pandas DataFrame from items and export to Excel
            iterable_1 = [samples, [COUNT, UNASSIGNED, SCORE]]
            cols1 = pd.MultiIndex.from_product(iterable_1,
                                               names=['Samples', 'Stats'])
            iterable_2 = [['Details'], ['Rank', 'Name']]
            cols2 = pd.MultiIndex.from_product(iterable_2)
            cols = cols1.append(cols2)
            # DataFrame.from_items() is no longer available in pandas;
            #  from_dict() over the (key, values) items is its replacement
            data_frame = pd.DataFrame.from_dict(dict(list_rows),
                                                orient='index',
                                                columns=cols)
            data_frame.index.names = ['Id']
            data_frame.to_excel(xlsxwriter, sheet_name=str(excel))
        elif excel is Excel.CMPLXCRUNCHER:
            target_ranks: List = [Rank.NO_RANK]
            if args.controls:  # if controls, add specific sheet for rank
                target_ranks.extend(Rank.selected_ranks)
            for rank in target_ranks:  # Once for no rank dependency (NO_RANK)
                indexes: List[int]
                sheet_name: str
                columns: List[str]
                if rank is Rank.NO_RANK:  # No rank dependency
                    indexes = list(range(len(raw_samples)))
                    sheet_name = f'raw_samples_{rank.name.lower()}'
                    columns = raw_samples
                else:  # Specific ranks are only targeted if there are controls
                    indexes = [
                        i for i in range(len(raw_samples), len(samples))
                        # Check if sample ends in _(STR_CONTROL)_(rank)
                        if (STR_CONTROL in samples[i].split('_')[-2:] and
                            rank.name.lower() in samples[i].split('_')[-1:])
                    ]
                    sheet_name = f'{STR_CONTROL}_{rank.name.lower()}'
                    # Column name is the sample name without control suffix
                    columns = [
                        samples[i].replace(
                            '_' + STR_CONTROL + '_' + rank.name.lower(), '')
                        for i in indexes
                    ]
                list_rows = []
                polytree.to_items(ontology=ncbi,
                                  items=list_rows,
                                  sample_indexes=indexes)
                data_frame = pd.DataFrame.from_dict(dict(list_rows),
                                                    orient='index',
                                                    columns=columns)
                data_frame.index.names = ['Id']
                data_frame.to_excel(xlsxwriter, sheet_name=sheet_name)
        else:
            raise Exception(red('\nERROR!'), f'Unknown Excel option "{excel}"')
        # close() writes the workbook to disk (save() is gone in pandas 2)
        xlsxwriter.close()
        print(green('OK!'))
コード例 #20
0
from recentrifuge.config import TEST_INPUT_DIR, TEST_OUTPUT_DIR, MOCK_XLSX
from recentrifuge.config import REXTRACT_TEST_SAMPLE, REXTRACT_TEST_FASTQ
from recentrifuge.config import gray, blue, green, red, yellow, cyan, magenta
from recentrifuge.taxonomy import Taxonomy

# pandas is an optional dependency (it reads the Excel with the mock layout)
try:
    import pandas as pd
except ImportError:
    pd = None
    _USE_PANDAS = False
else:
    _USE_PANDAS = True

# Max hit length for random score generation
MAX_HIT_LENGTH: int = 200
# Mock Excel layout: relative path and path anchored at this module's directory
TEST_MOCK_XLSX = os.path.join(TEST_INPUT_DIR, MOCK_XLSX)
TEST_XCEL = Filename(os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    TEST_MOCK_XLSX,
))
# Rextract test files, under the test output directory
TEST_REXT_SMPL = os.path.join(TEST_OUTPUT_DIR, REXTRACT_TEST_SAMPLE)
TEST_REXT_FSTQ = os.path.join(TEST_OUTPUT_DIR, REXTRACT_TEST_FASTQ)


def generate_mock(
    ncbi: Taxonomy,
    file: Filename,
    rnd: int,
    mocks: List[Filename],
    xcel: Filename,
    debug: bool,
):
    def vprint(*args):
        """Print only if verbose/debug mode is enabled"""
        if debug: