Python download_blast_binaries Examples

Programming Language: Python

Namespace/Package Name: quast_libs.qutils

Method/Function: download_blast_binaries

Examples at hotexamples.com: 7

Python download_blast_binaries - 7 examples found. These are the top-rated real-world Python examples of quast_libs.qutils.download_blast_binaries, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: run_busco.py Project: svpipaliya/quast

def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO for every assembly and store completeness in the reports.

    Prepares the dependencies (``download_augustus``/``compile_tool``,
    ``download_blast_binaries``, ``download_db``), runs
    ``busco_main_handler`` for all assemblies through ``run_parallel`` and
    parses each produced summary file.

    :param contigs_fpaths: paths of the assemblies to evaluate.
    :param output_dir: directory for results; it and its ``tmp``
        subdirectory are created when missing.
    :param logger: logger used for all progress, warning and error messages.
    :return: None. Percentages are stored with ``report.add_field``.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus',
                          augustus_dirpath, [join('bin', 'augustus')],
                          logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    # Clamp to at least one job: an empty assembly list would otherwise make
    # n_jobs zero and the division below raise ZeroDivisionError.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger,
                                is_prokaryote=qconfig.prokaryote,
                                is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    config_fpath = make_config(output_dir, tmp_dir, busco_threads,
                               clade_dirpath, augustus_dirpath)
    logger.info('Logs and results will be saved under ' + output_dir + '...')

    os.environ['BUSCO_CONFIG_FILE'] = config_fpath
    # Validate the path before exporting it: os.environ only accepts strings,
    # so assigning a None result directly would raise TypeError and hide the
    # intended error message.
    augustus_config_dirpath = copy_augustus_configs(augustus_dirpath, tmp_dir)
    os.environ['AUGUSTUS_CONFIG_PATH'] = augustus_config_dirpath or ''
    if not augustus_config_dirpath:
        logger.error(
            'Augustus configs not found, failed to run BUSCO without them.')
    busco_args = [[
        contigs_fpath,
        qutils.label_from_fpath_for_fname(contigs_fpath)
    ] for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco_main_handler, busco_args,
                                  qconfig.max_threads)
    if not any(summary_fpaths):
        logger.error(
            'Failed running BUSCO for all the assemblies. See log files in ' +
            output_dir + ' for information '
            '(rerun with --debug to keep all intermediate files).')
        return

    # saving results
    zero_output_for_all = True
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        summary_fpath = summary_fpaths[i]

        if summary_fpath and os.path.isfile(summary_fpath):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpath) as f:
                for line in f:
                    # The count is the first whitespace-separated token of
                    # each matching summary line.
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' %
                                  (float(part_buscos) * 100.0 / total_buscos)))
            if complete_buscos + part_buscos > 0:
                zero_output_for_all = False
            shutil.copy(summary_fpath, output_dir)
        else:
            logger.error(
                'Failed running BUSCO for ' + contigs_fpath +
                '. See the log for detailed information'
                ' (rerun with --debug to keep all intermediate files).')
    if zero_output_for_all:
        logger.warning(
            'BUSCO did not fail explicitly but found nothing for all assemblies! '
            'Possible reasons and workarounds:\n'
            '  1. Provided assemblies are so small that they do not contain even a single partial BUSCO gene. Not likely but may happen -- nothing to worry then.\n'
            '  2. Incorrect lineage database was used. To run with fungi DB use --fungus, to run with eukaryota DB use --eukaryote, otherwise BUSCO uses bacteria DB.\n'
            '  3. Problem with BUSCO dependencies, most likely Augustus. Check that the binaries in '
            + augustus_dirpath + '/bin/ are working properly.\n'
            '     If something is wrong with Augustus, you may try to install it yourself (https://github.com/Gaius-Augustus/Augustus) and add "augustus" binary to PATH.\n'
            '  4. Some other problem with BUSCO. Check the logs (you may need to rerun QUAST with --debug to see all intermediate files).\n'
            '     If you cannot solve the problem yourself, post an issue at https://github.com/ablab/quast/issues or write to [email protected]'
        )
    if not qconfig.debug:
        cleanup(output_dir)
    logger.info('Done.')

Example #2

Show file

def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath,
                  labels, blast_check_fpath, err_fpath):
    """Run BlastN for the assemblies and pick candidate reference species.

    :param blast_assemblies: assemblies (objects with ``fpath`` and ``label``)
        that still need a BLAST run; may be empty.
    :param downloaded_dirpath: directory where ``blast.res`` outputs live.
    :param corrected_dirpath: passed through to ``parallel_blast``.
    :param labels: labels of all assemblies whose BLAST output is parsed.
    :param blast_check_fpath: passed through to ``parallel_blast``.
    :param err_fpath: passed through to ``parallel_blast``.
    :return: ``(species_scores, species_by_assembly, replacement_dict)``, or
        ``(None, None, None)`` when binaries/database are unavailable or no
        species passed the thresholds.
    """
    if not download_blast_binaries(filenames=blast_filenames):
        return None, None, None

    if qconfig.custom_blast_db_fpath:
        global db_fpath
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [
                f for f in os.listdir(db_fpath) if f.endswith('.nsq')
            ]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath,
                                db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error(
                'You should specify path to BLAST database obtained by running makeblastdb command: '
                'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                exit_with_code=2)

    elif not download_blastdb():
        return None, None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(
            assembly.fpath, assembly.label, corrected_dirpath, err_fpath,
            blast_res_fpath, blast_check_fpath, blast_threads)
                                for assembly in blast_assemblies)

    logger.main_info()
    species_scores = []
    species_by_assembly = dict()
    max_entries = 4
    replacement_dict = defaultdict(list)
    for label in labels:
        assembly_scores = []
        assembly_species = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                # Start from the standard tabular layout so that hit lines
                # seen before any "Fields" header are still parsed, instead
                # of comparing an int with None (TypeError on Python 3).
                query_id_col, subj_id_col, idy_col, len_col, score_col = 0, 1, 2, 3, 11
                for line in res_file:
                    fs = line.split()
                    if line.startswith('#'):
                        # Every comment line starts a new query section.
                        refs_for_query = 0
                        # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                        if 'Fields' in line:
                            fs = line.strip().split('Fields: ')[-1].split(', ')
                            query_id_col = fs.index(
                                'query id') if 'query id' in fs else 0
                            subj_id_col = fs.index(
                                'subject id') if 'subject id' in fs else 1
                            idy_col = fs.index(
                                '% identity') if '% identity' in fs else 2
                            len_col = fs.index(
                                'alignment length'
                            ) if 'alignment length' in fs else 3
                            score_col = fs.index(
                                'bit score') if 'bit score' in fs else 11
                        continue
                    if refs_for_query >= max_entries or len(fs) <= score_col:
                        continue

                    query_id = fs[query_id_col]
                    organism_id = fs[subj_id_col]
                    idy = float(fs[idy_col])
                    length = int(fs[len_col])
                    score = float(fs[score_col])
                    if not (idy >= qconfig.identity_threshold
                            and length >= qconfig.min_length
                            and score >= qconfig.min_bitscore):
                        continue
                    seqname, taxons = parse_organism_id(organism_id)
                    if not seqname:
                        continue
                    species_name = get_species_name(seqname)
                    if not species_name or 'uncultured' in seqname:
                        continue

                    if refs_for_query != 0:
                        # Later hits of the same query are kept only as
                        # possible replacements for its best hit.
                        if seqname not in replacement_dict[query_id]:
                            replacement_dict[query_id].append(seqname)
                            refs_for_query += 1
                        continue

                    if species_name not in assembly_species:
                        assembly_scores.append((seqname, query_id, score))
                        if taxons:
                            taxons_for_krona[correct_name(seqname)] = taxons
                        assembly_species.append(species_name)
                        refs_for_query += 1
                    else:
                        # Species already recorded: keep whichever sequence
                        # has the higher bit score.
                        seq_scores = [
                            (query_name, seq_query_id, seq_score)
                            for query_name, seq_query_id, seq_score
                            in assembly_scores
                            if get_species_name(query_name) == species_name
                        ]
                        if seq_scores and score > seq_scores[0][2]:
                            assembly_scores.remove(seq_scores[0])
                            assembly_scores.append((seqname, query_id, score))
                            if taxons:
                                taxons_for_krona[correct_name(
                                    seqname)] = taxons
                            refs_for_query += 1
        # Rank by bit score (name as tie-breaker) before truncating; sorting
        # the raw (seqname, query_id, score) tuples would order by name and
        # make the max_references cut drop high-scoring references.
        assembly_scores = sorted(assembly_scores,
                                 key=lambda entry: (entry[2], entry[0]),
                                 reverse=True)
        assembly_scores = assembly_scores[:qconfig.max_references]
        for seqname, query_id, score in assembly_scores:
            if not species_by_assembly or not any(
                    seqname in species_list
                    for species_list in species_by_assembly.values()):
                species_scores.append((seqname, query_id, score))
        species_by_assembly[label] = [
            seqname for seqname, query_id, score in assembly_scores
        ]
    if not species_scores:
        return None, None, None
    return species_scores, species_by_assembly, replacement_dict

Example #3

Show file

File: run_busco.py Project: treangen/quast_frc

def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO for every assembly and store completeness in the reports.

    Prepares the dependencies (``download_augustus``/``compile_tool``,
    ``download_blast_binaries``, ``download_db``), runs ``busco.main`` for
    all assemblies through ``run_parallel`` and parses each summary file.

    :param contigs_fpaths: paths of the assemblies to evaluate.
    :param output_dir: directory for results; it and its ``tmp``
        subdirectory are created when missing.
    :param logger: logger used for all progress and error messages.
    :return: None. Percentages are stored with ``report.add_field``.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus',
                          augustus_dirpath, [join('bin', 'augustus')],
                          logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    set_augustus_dir(augustus_dirpath)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    # Clamp to at least one job: an empty assembly list would otherwise make
    # n_jobs zero and the division below raise ZeroDivisionError.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    log_fpath = join(output_dir, 'busco.log')
    logger.info('Logging to ' + log_fpath + '...')
    # One (command-line arguments, output directory) pair per assembly.
    busco_args = [([
        '-i', contigs_fpath, '-o',
        qutils.label_from_fpath_for_fname(contigs_fpath), '-l', clade_dirpath,
        '-m', 'genome', '-f', '-z', '-c',
        str(busco_threads), '-t', tmp_dir,
        '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' +
        join(augustus_dirpath, 'config') + '\''
    ], output_dir) for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads)
    if not any(summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See ' +
                     log_fpath + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        summary_fpath = summary_fpaths[i]

        if summary_fpath and os.path.isfile(summary_fpath):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpath) as f:
                for line in f:
                    # The count is the first whitespace-separated token of
                    # each matching summary line.
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' %
                                  (float(part_buscos) * 100.0 / total_buscos)))
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath +
                         '. See ' + log_fpath + ' for information.')
    logger.info('Done.')

Example #4

Show file

def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO for every assembly and store completeness in the reports.

    Prepares the dependencies (``download_augustus``/``compile_tool``,
    ``download_blast_binaries``, ``download_db``), runs
    ``busco_main_handler`` for all assemblies through ``run_parallel`` and
    parses each produced summary file.

    :param contigs_fpaths: paths of the assemblies to evaluate.
    :param output_dir: directory for results; it and its ``tmp``
        subdirectory are created when missing.
    :param logger: logger used for all progress and error messages.
    :return: None. Percentages are stored with ``report.add_field``.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus',
                          augustus_dirpath, [join('bin', 'augustus')],
                          logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    # Clamp to at least one job: an empty assembly list would otherwise make
    # n_jobs zero and the division below raise ZeroDivisionError.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger,
                                is_prokaryote=qconfig.prokaryote,
                                is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    config_fpath = make_config(output_dir, tmp_dir, busco_threads,
                               clade_dirpath, augustus_dirpath)
    logger.info('Logs and results will be saved under ' + output_dir + '...')

    os.environ['BUSCO_CONFIG_FILE'] = config_fpath
    # Validate the path before exporting it: os.environ only accepts strings,
    # so assigning a None result directly would raise TypeError and hide the
    # intended error message.
    augustus_config_dirpath = copy_augustus_contigs(augustus_dirpath, tmp_dir)
    os.environ['AUGUSTUS_CONFIG_PATH'] = augustus_config_dirpath or ''
    if not augustus_config_dirpath:
        logger.error(
            'Augustus configs not found, failed to run BUSCO without them.')
    busco_args = [[
        contigs_fpath,
        qutils.label_from_fpath_for_fname(contigs_fpath)
    ] for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco_main_handler, busco_args,
                                  qconfig.max_threads)
    if not any(summary_fpaths):
        logger.error(
            'Failed running BUSCO for all the assemblies. See log files in ' +
            output_dir + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        summary_fpath = summary_fpaths[i]

        if summary_fpath and os.path.isfile(summary_fpath):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpath) as f:
                for line in f:
                    # The count is the first whitespace-separated token of
                    # each matching summary line.
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' %
                                  (float(part_buscos) * 100.0 / total_buscos)))
            shutil.copy(summary_fpath, output_dir)
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath +
                         '. See the log for detailed information.')
    if not qconfig.debug:
        cleanup(output_dir)
    logger.info('Done.')

Example #5

Show file

File: run_busco.py Project: student-t/quast

def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO for every assembly and store completeness in the reports.

    Prepares the dependencies (``download_augustus``/``compile_tool``,
    ``download_blast_binaries``, ``download_db``), runs ``busco.main`` for
    all assemblies through ``run_parallel`` and parses each summary file.

    :param contigs_fpaths: paths of the assemblies to evaluate.
    :param output_dir: directory for results; it and its ``tmp``
        subdirectory are created when missing.
    :param logger: logger used for all progress and error messages.
    :return: None. Percentages are stored with ``report.add_field``.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    set_augustus_dir(augustus_dirpath)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    # Clamp to at least one job: an empty assembly list would otherwise make
    # n_jobs zero and the division below raise ZeroDivisionError.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote, is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    log_fpath = join(output_dir, 'busco.log')
    logger.info('Logging to ' + log_fpath + '...')
    # One (command-line arguments, output directory) pair per assembly.
    busco_args = [(['-i', contigs_fpath, '-o', qutils.label_from_fpath_for_fname(contigs_fpath), '-l', clade_dirpath,
                    '-m', 'genome', '-f', '-z', '-c', str(busco_threads), '-t', tmp_dir,
                    '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' + join(augustus_dirpath, 'config') + '\''], output_dir)
                  for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads)
    if not any(summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See ' + log_fpath + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        summary_fpath = summary_fpaths[i]

        if summary_fpath and os.path.isfile(summary_fpath):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpath) as f:
                for line in f:
                    # The count is the first whitespace-separated token of
                    # each matching summary line.
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(reporting.Fields.BUSCO_COMPLETE, ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART, ('%.2f' % (float(part_buscos) * 100.0 / total_buscos)))
        else:
            logger.error(
                'Failed running BUSCO for ' + contigs_fpath + '. See ' + log_fpath + ' for information.')
    logger.info('Done.')

Example #6

Show file

File: search_references_meta.py Project: student-t/quast

def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN for the assemblies and pick candidate reference species.

    :param blast_assemblies: assemblies (objects with ``fpath`` and ``label``)
        that still need a BLAST run; may be empty.
    :param downloaded_dirpath: directory where ``blast.res`` outputs live.
    :param corrected_dirpath: passed through to ``parallel_blast``.
    :param labels: labels of all assemblies whose BLAST output is parsed.
    :param blast_check_fpath: passed through to ``parallel_blast``.
    :param err_fpath: passed through to ``parallel_blast``.
    :return: ``(species_scores, species_by_assembly, replacement_dict)``, or
        ``(None, None, None)`` on failure. Every path returns three values so
        callers can always unpack the result the same way.
    """
    if not download_blast_binaries(filenames=blast_filenames):
        return None, None, None

    if qconfig.custom_blast_db_fpath:
        global db_fpath
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)

    elif not download_blastdb():
        return None, None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(assembly.fpath, assembly.label, corrected_dirpath,
                                                        err_fpath, blast_res_fpath, blast_check_fpath, blast_threads)
                                for assembly in blast_assemblies)

    logger.main_info()
    species_scores = []
    species_by_assembly = dict()
    max_entries = 4
    replacement_dict = defaultdict(list)
    for label in labels:
        assembly_scores = []
        assembly_species = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                # Start from the standard tabular layout so that hit lines
                # seen before any "Fields" header are still parsed, instead
                # of comparing an int with None (TypeError on Python 3).
                query_id_col, subj_id_col, idy_col, len_col, score_col = 0, 1, 2, 3, 11
                for line in res_file:
                    fs = line.split()
                    if line.startswith('#'):
                        # Every comment line starts a new query section.
                        refs_for_query = 0
                        # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                        if 'Fields' in line:
                            fs = line.strip().split('Fields: ')[-1].split(', ')
                            # Fall back to the standard position when a column
                            # name is absent rather than raising ValueError.
                            query_id_col = fs.index('query id') if 'query id' in fs else 0
                            subj_id_col = fs.index('subject id') if 'subject id' in fs else 1
                            idy_col = fs.index('% identity') if '% identity' in fs else 2
                            len_col = fs.index('alignment length') if 'alignment length' in fs else 3
                            score_col = fs.index('bit score') if 'bit score' in fs else 11
                        continue
                    if refs_for_query >= max_entries or len(fs) <= score_col:
                        continue

                    query_id = fs[query_id_col]
                    organism_id = fs[subj_id_col]
                    idy = float(fs[idy_col])
                    length = int(fs[len_col])
                    score = float(fs[score_col])
                    if not (idy >= qconfig.identity_threshold and length >= qconfig.min_length
                            and score >= qconfig.min_bitscore):
                        continue
                    seqname, taxons = parse_organism_id(organism_id)
                    if not seqname:
                        continue
                    name_parts = seqname.split('_')
                    if len(name_parts) <= 1 or 'uncultured' in seqname:
                        continue
                    # Species name is the first two underscore-separated parts.
                    species_name = name_parts[0] + '_' + name_parts[1]

                    if refs_for_query != 0:
                        # Later hits of the same query are kept only as
                        # possible replacements for its best hit.
                        if seqname not in replacement_dict[query_id]:
                            replacement_dict[query_id].append(seqname)
                            refs_for_query += 1
                        continue

                    if species_name not in assembly_species:
                        assembly_scores.append((seqname, query_id, score))
                        if taxons:
                            taxons_for_krona[correct_name(seqname)] = taxons
                        # Record the species whether or not taxonomy is known;
                        # otherwise the same species is appended repeatedly.
                        assembly_species.append(species_name)
                        refs_for_query += 1
                    else:
                        # Distinct loop names: on Python 2 comprehension
                        # variables leak and would overwrite the current
                        # seqname/query_id/score.
                        seq_scores = [(known_seqname, known_query_id, known_score)
                                      for known_seqname, known_query_id, known_score in assembly_scores
                                      if species_name in known_seqname]
                        if seq_scores and score > seq_scores[0][2]:
                            assembly_scores.remove(seq_scores[0])
                            assembly_scores.append((seqname, query_id, score))
                            if taxons:
                                taxons_for_krona[correct_name(seqname)] = taxons
                            refs_for_query += 1
        # Rank by bit score (name as tie-breaker) before truncating; sorting
        # the raw (seqname, query_id, score) tuples would order by name and
        # make the max_references cut drop high-scoring references.
        assembly_scores = sorted(assembly_scores, key=lambda entry: (entry[2], entry[0]), reverse=True)
        assembly_scores = assembly_scores[:qconfig.max_references]
        for seqname, query_id, score in assembly_scores:
            if not species_by_assembly or not any(seqname in species_list for species_list in species_by_assembly.values()):
                species_scores.append((seqname, query_id, score))
        species_by_assembly[label] = [seqname for seqname, query_id, score in assembly_scores]
    if not species_scores:
        return None, None, None
    return species_scores, species_by_assembly, replacement_dict

Example #7

Show file

def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath,
                  labels, blast_check_fpath, err_fpath):
    """Run BlastN for the assemblies and pick candidate reference organisms.

    :param blast_assemblies: assemblies (objects with ``fpath`` and ``label``)
        that still need a BLAST run; may be empty.
    :param downloaded_dirpath: directory where ``blast.res`` outputs live.
    :param corrected_dirpath: passed through to ``parallel_blast``.
    :param labels: labels of all assemblies whose BLAST output is parsed.
    :param blast_check_fpath: passed through to ``parallel_blast``.
    :param err_fpath: passed through to ``parallel_blast``.
    :return: ``(scores_organisms, organisms_assemblies)`` where the first is
        a list of ``(score, seqname)`` tuples and the second maps each label
        to its sequence names, or ``(None, None)`` on failure.
    """
    if not download_blast_binaries(filenames=blast_filenames):
        return None, None

    if qconfig.custom_blast_db_fpath:
        global db_fpath
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [
                f for f in os.listdir(db_fpath) if f.endswith('.nsq')
            ]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath,
                                db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error(
                'You should specify path to BLAST database obtained by running makeblastdb command: '
                'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                exit_with_code=2)

    elif not download_blastdb():
        return None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(
            assembly.fpath, assembly.label, corrected_dirpath, err_fpath,
            blast_res_fpath, blast_check_fpath, blast_threads)
                                for assembly in blast_assemblies)

    logger.main_info('')
    # Fixed column positions of the tabular BLAST output:
    # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
    # TODO: find and parse the "Fields" line to detect each column index.
    subj_id_col, idy_col, len_col, score_col = 1, 2, 3, 11
    scores_organisms = []
    organisms_assemblies = {}
    for label in labels:
        all_scores = []
        organisms = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                for line in res_file:
                    if line.startswith('#'):
                        # Every comment line starts a new query section.
                        refs_for_query = 0
                        continue
                    if refs_for_query != 0:
                        # Only the first accepted hit per query is used.
                        continue
                    fields = line.split()
                    # The bit score sits at index 11, so at least 12 columns
                    # are needed; an 11-column line would raise IndexError.
                    if len(fields) <= score_col:
                        continue
                    organism_id = fields[subj_id_col]
                    idy = float(fields[idy_col])
                    length = int(fields[len_col])
                    score = float(fields[score_col])
                    if not (idy >= qconfig.identity_threshold
                            and length >= qconfig.min_length
                            and score >= qconfig.min_bitscore):
                        continue
                    seqname, taxons = parse_organism_id(organism_id)
                    if not seqname:
                        continue
                    name_parts = seqname.split('_')
                    if len(name_parts) <= 1 or 'uncultured' in seqname:
                        continue
                    # Species name is the first two underscore-separated parts.
                    specie = name_parts[0] + '_' + name_parts[1]
                    if specie not in organisms:
                        all_scores.append((score, seqname))
                        if taxons:
                            taxons_for_krona[correct_name(seqname)] = taxons
                        organisms.append(specie)
                        refs_for_query += 1
                    else:
                        # Species already recorded: keep whichever sequence
                        # has the higher bit score.
                        tuple_scores = [
                            known for known in all_scores
                            if specie in known[1]
                        ]
                        if tuple_scores and score > tuple_scores[0][0]:
                            all_scores.remove(tuple_scores[0])
                            all_scores.append((score, seqname))
                            if taxons:
                                taxons_for_krona[correct_name(
                                    seqname)] = taxons
                            refs_for_query += 1
        all_scores = sorted(all_scores, reverse=True)
        all_scores = all_scores[:qconfig.max_references]
        for score in all_scores:
            # Skip organisms already selected for a previous assembly.
            if not any(score[1] in seqnames
                       for seqnames in organisms_assemblies.values()):
                scores_organisms.append(score)
        organisms_assemblies[label] = [score[1] for score in all_scores]
    if not scores_organisms:
        return None, None
    return scores_organisms, organisms_assemblies