コード例 #1
0
ファイル: run_busco.py プロジェクト: svpipaliya/quast
def download_db(logger, is_prokaryote, is_fungus=False, only_clean=False):
    """Fetch the BUSCO clade database matching the organism type.

    Picks bacteria/fungi/eukaryota based on the flags, downloads and
    unpacks the archive unless it is already present. With only_clean=True
    the unpacked database directory is removed instead.

    Returns the database directory path on success, True when cleaning,
    or None on failure.
    """
    if is_prokaryote:
        clade, url = 'bacteria', bacteria_db_url
    elif is_fungus:
        clade, url = 'fungi', fungi_db_url
    else:
        clade, url = 'eukaryota', eukaryota_db_url

    dirpath = get_dir_for_download('busco', 'Busco databases', [clade],
                                   logger, only_clean=only_clean)
    if not dirpath:
        return None

    db_dirpath = join(dirpath, clade)
    if only_clean:
        # Cleanup mode: drop the unpacked database and report success.
        if os.path.isdir(db_dirpath):
            shutil.rmtree(db_dirpath, ignore_errors=True)
        return True

    if os.path.exists(db_dirpath):
        # Already unpacked from a previous run — nothing to do.
        return db_dirpath

    downloaded_fpath = join(dirpath, clade + '.tar.gz')
    logger.main_info('  Downloading BUSCO database...')
    download_unpack_compressed_tar(clade + ' database', url,
                                   downloaded_fpath, db_dirpath, logger)

    if not os.path.exists(db_dirpath):
        logger.warning('Failed to download ' + clade + ' database from ' +
                       url + ' and unpack it into ' + dirpath)
        return None
    return db_dirpath
コード例 #2
0
ファイル: run_busco.py プロジェクト: svpipaliya/quast
def download_tool(tool,
                  tool_version,
                  required_files,
                  logger,
                  url,
                  only_clean=False):
    """Download and unpack a third-party tool archive.

    Parameters:
        tool: tool name, used for directory/file naming and log messages.
        tool_version: version string appended to the download dir name.
        required_files: paths (relative to the tool dir) that must exist
            for the tool to be considered installed.
        logger: QUAST logger instance.
        url: URL of the gzipped tar archive.
        only_clean: if True, remove the tool directory instead of
            downloading.

    Returns the tool directory path on success (or after cleaning),
    or None on failure.
    """
    tool_dirpath = get_dir_for_download(tool + tool_version,
                                        tool,
                                        required_files,
                                        logger,
                                        only_clean=only_clean)
    if not tool_dirpath:
        return None

    if only_clean:
        if os.path.isdir(tool_dirpath):
            shutil.rmtree(tool_dirpath, ignore_errors=True)
        return tool_dirpath

    failed_compilation_flag = join(tool_dirpath, 'make.failed')
    # Download only when required files are missing AND a previous attempt
    # has not already failed (avoids re-downloading on every run).
    if not all(os.path.exists(join(tool_dirpath, fpath)) for fpath in required_files) and not \
            check_prev_compilation_failed(tool, failed_compilation_flag):
        downloaded_fpath = join(tool_dirpath, tool + '.tar.gz')
        logger.main_info('  Downloading third-party tools...')
        download_unpack_compressed_tar(tool, url, downloaded_fpath,
                                       tool_dirpath, logger)

        if not all(
                os.path.exists(join(tool_dirpath, fpath))
                for fpath in required_files):
            # Bug fix: the original message lacked a space between the URL
            # and 'and unpack', producing e.g. "...tar.gzand unpack...".
            logger.warning('Failed to download ' + tool + ' from ' + url +
                           ' and unpack it into ' + tool_dirpath)
            return None
    return tool_dirpath
コード例 #3
0
ファイル: run_busco.py プロジェクト: student-t/quast
def download_db(logger, is_prokaryote, is_fungus=False, only_clean=False):
    """Download the BUSCO database for the appropriate clade.

    Chooses bacteria/fungi/eukaryota from the flags; downloads and unpacks
    the archive unless already present. With only_clean=True the unpacked
    database directory is removed instead.

    Returns the database directory path on success, True when cleaning,
    or None on failure.
    """
    if is_prokaryote:
        url = bacteria_db_url
        clade = 'bacteria'
    elif is_fungus:
        url = fungi_db_url
        clade = 'fungi'
    else:
        url = eukaryota_db_url
        clade = 'eukaryota'
    dirpath = get_dir_for_download('busco', 'Busco databases', [clade], logger, only_clean=only_clean)
    if not dirpath:
        return None

    db_dirpath = join(dirpath, clade)
    if only_clean:
        if os.path.isdir(db_dirpath):
            shutil.rmtree(db_dirpath, ignore_errors=True)
        return True

    if not os.path.exists(db_dirpath):
        downloaded_fpath = join(dirpath, clade + '.tar.gz')
        logger.main_info('  Downloading ' + clade + ' database...')
        download_unpack_compressed_tar(clade + ' database', url, downloaded_fpath, db_dirpath, logger)

        if not os.path.exists(db_dirpath):
            # Bug fix: add the missing space before 'and unpack' (the
            # original concatenated the URL directly with 'and').
            logger.warning('Failed to download ' + clade + ' database from ' + url + ' and unpack it into ' + dirpath)
            return None
    return db_dirpath
コード例 #4
0
def download_manta(logger, bed_fpath=None, only_clean=False):
    """Obtain the Manta SV-caller binary distribution.

    Copies a bundled local archive when available, otherwise downloads the
    platform-specific archive, then unpacks it into <manta_dir>/build.
    The download is skipped when SV search is disabled, a BED file is
    supplied, or Manta is already unpacked. With only_clean=True the build
    directory is removed instead.

    Returns True on success (or when nothing needed doing), False on failure.
    """
    global manta_dirpath
    manta_dirpath = get_dir_for_download('manta' + manta_version,
                                         'Manta', [config_manta_relpath],
                                         logger,
                                         only_clean=only_clean)
    if not manta_dirpath:
        return False

    manta_build_dirpath = join(manta_dirpath, 'build')
    config_manta_fpath = get_manta_fpath()
    if only_clean:
        # Cleanup mode: remove the build dir and report success.
        if os.path.isdir(manta_build_dirpath):
            shutil.rmtree(manta_build_dirpath, ignore_errors=True)
        return True

    if not qconfig.no_sv and bed_fpath is None and not isfile(
            config_manta_fpath):
        # Choose the platform-specific archive; Manta binaries are only
        # shipped for 64-bit Linux and macOS.
        if qconfig.platform_name == 'linux_64':
            url = manta_linux_url
            fpath = manta_ext_linux_fpath
        elif qconfig.platform_name == 'macosx':
            url = manta_osx_url
            fpath = manta_ext_osx_fpath
        else:
            logger.warning('Manta is not available for your platform.')
            return False

        if not exists(manta_build_dirpath):
            os.makedirs(manta_build_dirpath)
        manta_downloaded_fpath = join(manta_build_dirpath, 'manta.tar.bz2')

        if isfile(fpath):
            # A locally bundled archive exists — prefer it over downloading.
            logger.info('Copying manta from ' + fpath)
            shutil.copy(fpath, manta_downloaded_fpath)
            logger.info('Unpacking ' + manta_downloaded_fpath + ' into ' +
                        manta_build_dirpath)
            unpack_tar(manta_downloaded_fpath, manta_build_dirpath)
        else:
            failed_compilation_flag = join(manta_dirpath, 'make.failed')
            if check_prev_compilation_failed('Manta', failed_compilation_flag):
                # A previous attempt already failed; don't retry every run.
                print_manta_warning(logger)
                return False

            logger.main_info('  Downloading binary distribution of Manta...')
            download_unpack_tar_bz('Manta', url, manta_downloaded_fpath,
                                   manta_build_dirpath, logger)

        # Demo data is not needed at runtime; drop it to save disk space.
        manta_demo_dirpath = join(manta_build_dirpath, 'share', 'demo')
        if os.path.isdir(manta_demo_dirpath):
            shutil.rmtree(manta_demo_dirpath, ignore_errors=True)
        if not isfile(config_manta_fpath):
            logger.warning(
                'Failed to download binary distribution from https://github.com/ablab/quast/external_tools/manta '
                'and unpack it into ' + join(manta_dirpath, 'build/'))
            print_manta_warning(logger)
            return False
    return True
コード例 #5
0
ファイル: misc.py プロジェクト: student-t/quast
def download_gridss(logger, bed_fpath=None, only_clean=False):
    """Locate or fetch the GRIDSS jar; return True if it is usable."""
    global gridss_dirpath
    gridss_dirpath = get_dir_for_download(
        'gridss', 'GRIDSS', [gridss_fname], logger, only_clean=only_clean)
    if not gridss_dirpath:
        return False

    gridss_fpath = get_gridss_fpath()
    # The download is needed only when SV search is on, no BED file was
    # supplied, and the jar is not already on disk.
    need_download = (not qconfig.no_sv and bed_fpath is None
                     and not isfile(gridss_fpath))
    if need_download and not download_external_tool(gridss_fname, gridss_dirpath, 'gridss'):
        logger.warning('Failed to download binary distribution from https://github.com/ablab/quast/external_tools/gridss. '
                       'QUAST SV module will be able to search trivial deletions only.')
        return False
    return True
コード例 #6
0
def download_gridss(logger, bed_fpath=None, only_clean=False):
    """Fetch the GRIDSS jar into its download directory when missing."""
    global gridss_dirpath
    gridss_dirpath = get_dir_for_download(
        'gridss', 'GRIDSS', [gridss_fname], logger, only_clean=only_clean)
    if not gridss_dirpath:
        return False

    gridss_fpath = get_gridss_fpath()
    if qconfig.no_sv or bed_fpath is not None or isfile(gridss_fpath):
        # SV search disabled, BED provided, or jar already present.
        return True
    if download_external_tool(gridss_fname, gridss_dirpath, 'gridss'):
        return True
    logger.warning(
        'Failed to download binary distribution from https://github.com/ablab/quast/external_tools/gridss. '
        'QUAST SV module will be able to search trivial deletions only.'
    )
    return False
コード例 #7
0
ファイル: run_busco.py プロジェクト: student-t/quast
def download_tool(tool, tool_version, required_files, logger, url, only_clean=False):
    """Download and unpack a third-party tool archive.

    Parameters:
        tool: tool name, used for directory/file naming and log messages.
        tool_version: version string appended to the download dir name.
        required_files: paths (relative to the tool dir) that must exist
            for the tool to be considered installed.
        logger: QUAST logger instance.
        url: URL of the gzipped tar archive.
        only_clean: if True, remove the tool directory instead.

    Returns the tool directory path on success (or after cleaning),
    or None on failure.
    """
    tool_dirpath = get_dir_for_download(tool + tool_version, tool, required_files, logger, only_clean=only_clean)
    if not tool_dirpath:
        return None

    if only_clean:
        if os.path.isdir(tool_dirpath):
            shutil.rmtree(tool_dirpath, ignore_errors=True)
        return tool_dirpath

    failed_compilation_flag = join(tool_dirpath, 'make.failed')
    # Download only when required files are missing AND a previous attempt
    # has not already failed (avoids re-downloading on every run).
    if not all(os.path.exists(join(tool_dirpath, fpath)) for fpath in required_files) and not \
            check_prev_compilation_failed(tool, failed_compilation_flag):
        downloaded_fpath = join(tool_dirpath, tool + '.tar.gz')
        logger.main_info('  Downloading ' + tool + '...')
        download_unpack_compressed_tar(tool, url, downloaded_fpath, tool_dirpath, logger)

        if not all(os.path.exists(join(tool_dirpath, fpath)) for fpath in required_files):
            # Bug fix: add the missing space before 'and unpack' (the
            # original concatenated the URL directly with 'and').
            logger.warning('Failed to download ' + tool + ' from ' + url + ' and unpack it into ' + tool_dirpath)
            return None
    return tool_dirpath
コード例 #8
0
def download_gridss(logger, bed_fpath=None, only_clean=False):
    """Make sure the GRIDSS jar is present, downloading it when needed.

    With only_clean=True the download directory is removed instead.
    Returns True on success (or nothing to do), False otherwise.
    """
    global gridss_dirpath
    gridss_dirpath = get_dir_for_download(
        'gridss', 'GRIDSS', [gridss_fname], logger, only_clean=only_clean)
    if not gridss_dirpath:
        return False

    if only_clean:
        # Cleanup mode: wipe the whole download dir and succeed.
        if os.path.isdir(gridss_dirpath):
            shutil.rmtree(gridss_dirpath, ignore_errors=True)
        return True

    gridss_fpath = get_gridss_fpath()
    skip_download = (qconfig.no_sv or bed_fpath is not None
                     or isfile(gridss_fpath))
    if skip_download or download_external_tool(gridss_fname, gridss_dirpath, 'gridss'):
        return True
    logger.warning(
        'Failed to download binary distribution from https://github.com/ablab/quast/tree/master/external_tools/gridss. '
        'QUAST SV module will be able to search trivial deletions only. '
        'You can try to download it manually, save the jar archive under %s, and restart QUAST.'
        % gridss_dirpath)
    return False
コード例 #9
0
def download_all_blast_binaries(logger=logger, only_clean=False):
    """Download any missing BLAST binaries and make them executable.

    Parameters:
        logger: QUAST logger (defaults to the module-level logger).
        only_clean: if True, remove the BLAST download directory instead.

    Returns True on success (or when nothing was missing), False on failure.
    """
    global blast_dirpath

    required_files = [cmd for cmd in blast_filenames if not get_blast_fpath(cmd)]
    if not required_files and not only_clean:
        # All binaries already resolvable — nothing to download.
        return True

    blast_dirpath = get_dir_for_download('blast', 'BLAST', blast_filenames, logger, only_clean=only_clean)
    if not blast_dirpath:
        return False

    if only_clean:
        if os.path.isdir(blast_dirpath):
            shutil.rmtree(blast_dirpath, ignore_errors=True)
        return True

    # Fix: the original used enumerate() but never used the index.
    for cmd in required_files:
        return_code = download_blast_binary(cmd, logger=logger)
        logger.info()
        if return_code != 0:
            return False
        blast_file = get_blast_fpath(cmd)
        # Ensure the freshly downloaded binary has the owner-execute bit.
        os.chmod(blast_file, os.stat(blast_file).st_mode | stat.S_IEXEC)
    return True
コード例 #10
0
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Generate the theoretical Upper Bound Assembly for a reference.

    Aligns the provided reads against the reference, identifies uniquely
    covered regions (masking repeats found by Red), optionally scaffolds
    them with mate-pair or long reads, and writes the resulting FASTA to
    output_dirpath.

    Returns the path of the produced (or previously prepared) assembly
    FASTA, or None when a prerequisite step fails.
    """
    logger.print_timestamp()
    logger.main_info("Generating Upper Bound Assembly...")

    # Prerequisite 1: read-processing tools must compile on this machine.
    if not reads_analyzer.compile_reads_analyzer_tools(logger):
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly '
            '(failed to compile necessary third-party read processing tools [bwa, bedtools, minimap2]), skipping...'
        )
        return None

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly on this platform '
            '(only linux64 and macOS are supported), skipping...')
        return None

    # Prerequisite 2: the Red repeat finder binary must be downloadable.
    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red',
                                          red_dirpath,
                                          'red',
                                          platform_specific=True,
                                          is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly '
            '(failed to install/download third-party repeat finding tool [Red]), skipping...'
        )
        return None

    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    # Build the result filename, encoding the insert size and (when
    # applicable) a polishing suffix for long-read or mate-pair input.
    ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename,
                                     long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    # A pre-computed assembly may already exist next to the original
    # reference; reuse it if so.
    original_ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    if long_reads:
        prepared_optimal_assembly_basename = add_suffix(
            prepared_optimal_assembly_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        prepared_optimal_assembly_basename = add_suffix(
            prepared_optimal_assembly_basename, mp_polished_suffix)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath),
        prepared_optimal_assembly_basename)
    already_done_fpath = check_prepared_optimal_assembly(
        insert_size, result_fpath, ref_prepared_optimal_assembly)
    if already_done_fpath:
        return already_done_fpath

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath),
                              qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath,
            reads_analyzer_dir,
            using_reads='all',
            calculate_coverage=True)

    # Alignment may have refined the insert size estimate; if so, rename
    # the result paths and re-check for a prepared assembly.
    if qconfig.optimal_assembly_insert_size != 'auto' and qconfig.optimal_assembly_insert_size != insert_size:
        calculated_insert_size = qconfig.optimal_assembly_insert_size
        result_fpath = result_fpath.replace('is' + str(insert_size),
                                            'is' + str(calculated_insert_size))
        prepared_optimal_assembly_basename = prepared_optimal_assembly_basename.replace(
            'is' + str(insert_size), 'is' + str(calculated_insert_size))
        insert_size = calculated_insert_size
        ref_prepared_optimal_assembly = os.path.join(
            os.path.dirname(original_ref_fpath),
            prepared_optimal_assembly_basename)
        already_done_fpath = check_prepared_optimal_assembly(
            insert_size, result_fpath, ref_prepared_optimal_assembly)
        if already_done_fpath:
            return already_done_fpath

    log_fpath = os.path.join(output_dirpath, 'upper_bound_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    # Core step: find uniquely covered (non-repeat) reference regions.
    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath,
        tmp_dir,
        log_fpath,
        binary_fpath,
        insert_size,
        uncovered_fpath,
        use_long_reads=long_reads)
    if unique_covered_regions is None:
        logger.error(
            '  Failed to create Upper Bound Assembly, see log for details: ' +
            log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        # Scaffolding path: use long reads or mate pairs to join regions.
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath,
                              bam_fpath, tmp_dir, log_fpath, join_reads)
        uncovered_regions = parse_bed(
            uncovered_fpath) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom],
                                                 joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom],
                                               region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq),
                                          ref_coords_to_output,
                                          repeats_regions[chrom],
                                          uncovered_regions[chrom])
    else:
        # Paired-end-only path: emit each sufficiently long unique region
        # as its own contig.
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info('  ' + 'Theoretical Upper Bound Assembly is saved to ' +
                result_fpath)
    logger.notice(
        '(on reusing *this* Upper Bound Assembly in the *future* evaluations on *the same* dataset)\n'
        '\tThe next time, you can simply provide this file as an additional assembly (you could also rename it to UpperBound.fasta for the clarity). '
        'In this case, you do not need to specify --upper-bound-assembly and provide files with reads (--pe1/pe2, etc).\n'
        '\t\tOR\n'
        '\tYou can copy ' + result_fpath + ' to ' +
        ref_prepared_optimal_assembly + '. '
        'The next time you evaluate assemblies with --upper-bound-assembly option and against the same reference ('
        + original_ref_fpath + ') and '
        'the same reads (or if you specify the insert size of the paired-end reads explicitly with --est-insert-size '
        + str(insert_size) + '), '
        'QUAST will reuse this Upper Bound Assembly.\n')

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
コード例 #11
0
def download_blastdb(logger=logger, only_clean=False):
    """Download the SILVA 16S rRNA database and build a BLAST database.

    Downloads the gzipped SILVA FASTA (if needed), unpacks it, replaces
    spaces with underscores in record lines, and runs makeblastdb. With
    only_clean=True the whole download directory is removed instead.

    Returns True on success, False on any download/build failure.
    """
    global blastdb_dirpath
    blastdb_dirpath = get_dir_for_download('silva',
                                           'Silva',
                                           [silva_downloaded_fname + '.nsq'],
                                           logger,
                                           only_clean=only_clean)
    if not blastdb_dirpath:
        return False

    if only_clean:
        if os.path.isdir(blastdb_dirpath):
            logger.info('Removing ' + blastdb_dirpath)
            shutil.rmtree(blastdb_dirpath)
        return True

    global db_fpath
    db_fpath = join(blastdb_dirpath, silva_downloaded_fname)
    # A sufficiently large .nsq file means the BLAST DB is already built.
    if os.path.isfile(db_fpath +
                      '.nsq') and os.path.getsize(db_fpath +
                                                  '.nsq') >= db_nsq_fsize:
        return True
    log_fpath = os.path.join(blastdb_dirpath, 'blastdb.log')
    db_gz_fpath = os.path.join(blastdb_dirpath, silva_fname + '.gz')
    silva_fpath = os.path.join(blastdb_dirpath, silva_fname)

    logger.info()

    if os.path.isfile(db_gz_fpath):
        logger.info(
            'SILVA 16S ribosomal RNA gene database has already been downloaded.'
        )
    else:
        logger.info('Downloading SILVA 16S ribosomal RNA gene database...')
        if not os.path.isdir(blastdb_dirpath):
            os.makedirs(blastdb_dirpath)
        # NOTE(review): FancyURLopener lives in urllib.request on Python 3;
        # presumably 'urllib' here is a project-level compatibility alias —
        # confirm against the module's imports.
        silva_download = urllib.FancyURLopener()
        silva_remote_fpath = silva_db_url + silva_fname + '.gz'
        try:
            # Download to a '.download' temp name, then move atomically so a
            # partial download is never mistaken for a complete one.
            silva_download.retrieve(silva_remote_fpath,
                                    db_gz_fpath + '.download', show_progress)
        except Exception:
            logger.error(
                'Failed downloading SILVA 16S rRNA gene database (%s)! The search for reference genomes cannot be performed. '
                'Try to download it manually in %s and restart your command.' %
                (silva_remote_fpath, blastdb_dirpath))
            return False
        shutil.move(db_gz_fpath + '.download', db_gz_fpath)

    logger.info('Processing downloaded file. Logging to %s...' % log_fpath)
    if not os.path.isfile(silva_fpath):
        logger.info('Unpacking and replacing " " with "_"...')

        unpacked_fpath = silva_fpath + ".unpacked"
        cmd = "gunzip -c %s" % db_gz_fpath
        qutils.call_subprocess(shlex.split(cmd),
                               stdout=open(unpacked_fpath, 'w'),
                               stderr=open(log_fpath, 'a'),
                               logger=logger)

        # Spaces in FASTA headers would break BLAST identifiers, so rewrite
        # the file with underscores before indexing.
        substituted_fpath = silva_fpath + ".substituted"
        with open(unpacked_fpath) as in_file:
            with open(substituted_fpath, 'w') as out_file:
                for line in in_file:
                    out_file.write(line.replace(' ', '_'))
        os.remove(unpacked_fpath)
        shutil.move(substituted_fpath, silva_fpath)

    logger.info('Making BLAST database...')
    cmd = get_blast_fpath('makeblastdb') + (' -in %s -dbtype nucl -out %s' %
                                            (silva_fpath, db_fpath))
    qutils.call_subprocess(shlex.split(cmd),
                           stdout=open(log_fpath, 'a'),
                           stderr=open(log_fpath, 'a'),
                           logger=logger)
    # Re-check size to verify makeblastdb actually produced a usable DB.
    if not os.path.exists(db_fpath +
                          '.nsq') or os.path.getsize(db_fpath +
                                                     '.nsq') < db_nsq_fsize:
        logger.error('Failed to make BLAST database ("' + blastdb_dirpath +
                     '"). See details in log. Try to make it manually: %s' %
                     cmd)
        return False
    elif not qconfig.debug:
        # Keep only the built DB; remove the intermediate files.
        os.remove(db_gz_fpath)
        os.remove(silva_fpath)
    return True
コード例 #12
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique k-mer completeness/correctness analysis via KMC.

    For each assembly: computes the fraction of the reference's unique
    k-mers present in the assembly (completeness), then uses downsampled
    k-mer markers to classify contigs as correct/misassembled and to count
    translocations and relocations. Results cached from previous runs are
    reused. Reports go into the per-assembly `reporting` objects and stat
    files under output_dir.
    """
    logger.print_timestamp()
    kmer_len = qconfig.unique_kmer_len
    logger.main_info('Running analysis based on unique ' + str(kmer_len) +
                     '-mers...')

    # Phase 0: reuse cached per-assembly stat files where valid.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath,
                                      contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if len(stats_content) < 1:
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(
                reporting.Fields.KMER_COMPLETENESS,
                '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 7:
                # Cached stat file layout: completeness, correct len,
                # misassembled len, undefined len, total len,
                # translocations, relocations (one value per line).
                corr_len = int(stats_content[1].strip().split(': ')[-1])
                mis_len = int(stats_content[2].strip().split(': ')[-1])
                undef_len = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                translocations = int(stats_content[5].strip().split(': ')[-1])
                relocations = int(stats_content[6].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_CORR_LENGTH,
                                 '%.2f' % (corr_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_MIS_LENGTH,
                                 '%.2f' % (mis_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_UNDEF_LENGTH,
                                 '%.2f' % (undef_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_TRANSLOCATIONS,
                                 translocations)
                report.add_field(reporting.Fields.KMER_RELOCATIONS,
                                 relocations)
                report.add_field(reporting.Fields.KMER_MISASSEMBLIES,
                                 translocations + relocations)
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if len(contigs_fpaths) == 0:
        save_kmers(output_dir)
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    # Phase 1: obtain the KMC binaries and minimap.
    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC',
                                       ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc',
                                           kmc_dirpath,
                                           'KMC',
                                           platform_specific=True,
                                           is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools',
                                             kmc_dirpath,
                                             'KMC',
                                             platform_specific=True,
                                             is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(
            kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning('  Sorry, can\'t run KMC, skipping...')
        return None

    logger.info('  Running KMC on reference...')
    if not isdir(output_dir):
        os.makedirs(output_dir)
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, kmer_len,
                                    log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath,
                                 err_fpath)
    if not unique_kmers:
        logger.warning('KMC failed, check ' + log_fpath + ' and ' + err_fpath +
                       '. Skipping...')
        return

    # Phase 2: completeness — share of reference unique k-mers found in
    # each assembly.
    logger.info('  Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info('    ' + qutils.index_to_str(id) + assembly_label)

        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, kmer_len,
                                    log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(
            tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath,
            err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath,
                                      log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS,
                         '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    # Phase 3: correctness — place downsampled reference k-mers on each
    # assembly and look for order/distance inconsistencies.
    logger.info('  Analyzing assemblies correctness...')
    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    logger.info('    Downsampling k-mers...')
    ref_kmers, downsampled_kmers_fpath = downsample_kmers(
        tmp_dirpath, ref_fpath, ref_kmc_out_fpath, kmer_len, log_fpath,
        err_fpath)
    for id, (contigs_fpath,
             kmc_db_fpath) in enumerate(zip(contigs_fpaths, kmc_out_fpaths)):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info('    ' + qutils.index_to_str(id) + assembly_label)

        report = reporting.get(contigs_fpath)
        corr_len = None
        mis_len = None
        undef_len = None
        translocations, relocations = None, None
        total_len = 0
        contig_lens = dict()
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)

        if len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            # Too many reference contigs: marker placement would be
            # unreliable, so only completeness is reported.
            logger.warning(
                'Reference is too fragmented. Scaffolding accuracy will not be assessed.'
            )
        else:
            corr_len = 0
            mis_len = 0
            kmers_by_contig, kmers_pos_by_contig = align_kmers(
                tmp_dirpath, contigs_fpath, downsampled_kmers_fpath, err_fpath,
                qconfig.max_threads)
            is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
            cyclic_ref_lens = report.get_field(
                reporting.Fields.REFLEN) if is_cyclic else None
            translocations = 0
            relocations = 0
            with open(
                    join(
                        tmp_dirpath,
                        qutils.label_from_fpath_for_fname(contigs_fpath) +
                        '.misjoins.txt'), 'w') as out:
                for contig in kmers_by_contig.keys():
                    # Pass 1: compress consecutive consistent k-mer hits
                    # (contig/reference distance ratio within 5%) into
                    # marker points.
                    contig_markers = []
                    prev_pos, prev_ref_pos, prev_chrom, marker = None, None, None, None
                    for pos, kmer in sorted(zip(kmers_pos_by_contig[contig],
                                                kmers_by_contig[contig]),
                                            key=lambda x: x[0]):
                        ref_chrom, ref_pos = ref_kmers[kmer]
                        if prev_pos and prev_chrom:
                            if prev_chrom == ref_chrom and abs(
                                    abs(pos - prev_pos) /
                                    abs(ref_pos - prev_ref_pos) - 1) <= 0.05:
                                marker = (pos, ref_pos, ref_chrom)
                            elif marker:
                                contig_markers.append(marker)
                                pos, ref_pos, ref_chrom, marker = None, None, None, None
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if marker:
                        contig_markers.append(marker)
                    # Pass 2: walk adjacent markers; a chromosome change is
                    # a translocation, an excessive distance inconsistency
                    # is a relocation.
                    prev_pos, prev_ref_pos, prev_chrom = None, None, None
                    is_misassembled = False
                    for marker in contig_markers:
                        pos, ref_pos, ref_chrom = marker
                        if prev_pos and prev_chrom:
                            if ref_chrom != prev_chrom:
                                translocations += 1
                                out.write(
                                    'Translocation in %s: %s %d | %s %d\n' %
                                    (contig, prev_chrom, prev_pos, ref_chrom,
                                     pos))
                                is_misassembled = True
                            elif _get_dist_inconstistency(
                                    pos, prev_pos, ref_pos, prev_ref_pos,
                                    cyclic_ref_lens) > EXT_RELOCATION_SIZE:
                                relocations += 1
                                out.write(
                                    'Relocation in %s: %d (%d) | %d (%d)\n' %
                                    (contig, prev_pos, prev_ref_pos, pos,
                                     ref_pos))
                                is_misassembled = True
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if is_misassembled:
                        mis_len += contig_lens[contig]
                    elif len(contig_markers) > 0:
                        corr_len += contig_lens[contig]
            undef_len = total_len - corr_len - mis_len
            report.add_field(reporting.Fields.KMER_CORR_LENGTH,
                             '%.2f' % (corr_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_MIS_LENGTH,
                             '%.2f' % (mis_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_UNDEF_LENGTH,
                             '%.2f' % (undef_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_TRANSLOCATIONS,
                             translocations)
            report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
            report.add_field(reporting.Fields.KMER_MISASSEMBLIES,
                             translocations + relocations)

        create_kmc_stats_file(
            output_dir, contigs_fpath, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS), corr_len,
            mis_len, undef_len, total_len, translocations, relocations)
    save_kmers(output_dir)
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
コード例 #13
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run k-mer-based assembly analysis (completeness and scaffolding accuracy)
    with the external KMC tool, storing metrics into each assembly's report.

    Parameters:
        output_dir: directory for logs, per-assembly ``.stat`` files and temp data.
        ref_fpath: path to the reference FASTA file.
        contigs_fpaths: list of assembly FASTA file paths to analyze.
        logger: project logger instance.

    Returns None (or ``None`` early when KMC cannot run on this platform);
    all results are recorded via ``reporting`` fields and stats files.
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    # Reuse cached results: parse existing <label>.stat files and skip
    # re-running KMC for assemblies that already have them.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            # NOTE(review): str.split('\n') never returns an empty list, so this
            # guard can never trigger; presumably it was meant to skip empty files.
            if len(stats_content) < 1:
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            # First stats line holds the k-mer completeness ("<key>: <value>").
            report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 5:
                # Lines 2-5 hold the scaffold-to-chromosome mapping lengths.
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    # Only analyze assemblies that did not have cached results.
    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    # Fetch KMC binaries and build minimap; bail out if anything is missing.
    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning('  Sorry, can\'t run KMC, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    # Truncate logs from any previous run.
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    # Count unique reference k-mers; they form the completeness denominator.
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    # Completeness: percentage of reference unique k-mers present in each assembly.
    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    # Accuracy: marker k-mers are those shared by the reference and ALL assemblies.
    logger.info('Analyzing assemblies accuracy...')
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    # Fraction of marker k-mers kept when downsampling (keeps runtime manageable).
    kmer_fraction = 0.001

    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    ref_kmc_dbs = []

    if len(ref_contigs) <= MAX_REF_CONTIGS_NUM:
        # Build one downsampled marker k-mer database per reference sequence.
        shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, ref_fpath, shared_kmc_db, log_fpath, err_fpath, kmer_fraction=kmer_fraction)
        for name, seq in read_fasta(ref_fpath):
            seq_kmc_db = seq_to_kmc_db(tmp_dirpath, log_fpath, err_fpath, seq=seq, name=name, is_ref=True,
                                                     intersect_with=shared_downsampled_kmc_db)
            ref_kmc_dbs.append((name, seq_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        # Dump each sufficiently long contig into its own FASTA file and
        # collect the paths in a list file consumed by filter_contigs below.
        list_files_fpath = join(tmp_dirpath, label + '_files.txt')
        with open(list_files_fpath, 'w') as list_files:
            for name, seq in read_fasta(contigs_fpath):
                total_len += len(seq)
                contig_lens[name] = len(seq)
                if len(seq) >= MIN_CONTIGS_LEN:
                    long_contigs.append(len(seq))
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    with open(tmp_contig_fpath, 'w') as out_f:
                        out_f.write('>%s\n' % name)
                        out_f.write('%s\n' % seq)
                    list_files.write(tmp_contig_fpath + '\n')

        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            # Keep only contigs carrying at least MIN_MARKERS marker k-mers.
            filtered_fpath = join(tmp_dirpath, label + '.filtered.fasta')
            filter_contigs(list_files_fpath, filtered_fpath, shared_kmc_db, log_fpath, err_fpath, min_kmers=MIN_MARKERS)
            filtered_list_files_fpath = join(tmp_dirpath, label + '_files.filtered.txt')
            with open(filtered_list_files_fpath, 'w') as list_files:
                for name, _ in read_fasta(filtered_fpath):
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    list_files.write(tmp_contig_fpath + '\n')
            # Record which reference chromosomes each contig carries markers from.
            for ref_name, ref_kmc_db in ref_kmc_dbs:
                tmp_filtered_fpath = join(tmp_dirpath, ref_name + '.filtered.fasta')
                filter_contigs(filtered_list_files_fpath, tmp_filtered_fpath, ref_kmc_db, log_fpath, err_fpath, min_kmers=MIN_MISJOIN_MARKERS)
                if exists(tmp_filtered_fpath):
                    for name, _ in read_fasta(tmp_filtered_fpath):
                        contig_markers[name].append(ref_name)
            # Markers from exactly one chromosome => consistent scaffold;
            # markers from several chromosomes => potential misjoin.
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        # Persist results so later runs can reuse them (see cache check above).
        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                             report.get_field(reporting.Fields.KMER_COMPLETENESS),
                             len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
# Code example #14
# File: optimal_assembly.py  Project: nwespe/quast
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Simulate a theoretically optimal assembly of the reference genome.

    Parameters:
        ref_fpath: path to the (possibly preprocessed) reference FASTA.
        original_ref_fpath: path to the user-supplied reference FASTA; a
            reusable result may be stored next to it for future runs.
        output_dirpath: directory where the result FASTA and logs are written.

    Returns the path to the generated (or reused) optimal-assembly FASTA,
    or None when the assembly cannot be produced (unsupported platform,
    missing Red binary, or failed region detection).
    """
    logger.print_timestamp()
    logger.main_info("Simulating Optimal Assembly...")

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath),
                              qconfig.reads_stats_dirname)
    # If reads (or alignments) are available, align them against the
    # reference to obtain coverage and the list of uncovered regions.
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath,
            reads_analyzer_dir,
            using_reads='all',
            calculate_coverage=True)
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    # Result filename encodes the reference name and the insert size.
    ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename,
                                     long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    # A pre-generated assembly may live next to the original reference.
    original_ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath),
        prepared_optimal_assembly_basename)

    # Reuse an existing result (from this run's output dir or from beside
    # the original reference) instead of regenerating it.
    if os.path.isfile(result_fpath) or os.path.isfile(
            ref_prepared_optimal_assembly):
        already_done_fpath = result_fpath if os.path.isfile(
            result_fpath) else ref_prepared_optimal_assembly
        logger.notice(
            '  Will reuse already generated Optimal Assembly with insert size %d (%s)'
            % (insert_size, already_done_fpath))
        return already_done_fpath

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            '  Sorry, can\'t create Optimal Assembly on this platform, skipping...'
        )
        return None

    # Download the Red repeat-detection binary; required for region analysis.
    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red',
                                          red_dirpath,
                                          'red',
                                          platform_specific=True,
                                          is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning('  Sorry, can\'t create Optimal Assembly, skipping...')
        return None

    log_fpath = os.path.join(output_dirpath, 'optimal_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    # Start from a clean temporary directory.
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    # Determine uniquely covered regions (contig candidates) and repeats.
    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size,
        uncovered_fpath)
    if unique_covered_regions is None:
        logger.error(
            '  Failed to create Optimal Assembly, see log for details: ' +
            log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        # With long reads or mate pairs, attempt to scaffold the unique
        # regions using read-derived joins.
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath,
                              bam_fpath, tmp_dir, log_fpath, join_reads)
        # Uncovered regions matter only for mate-pair scaffolding.
        uncovered_regions = parse_uncovered_fpath(
            uncovered_fpath, ref_fpath, return_covered_regions=False
        ) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom],
                                                 joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom],
                                               region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq),
                                          ref_coords_to_output,
                                          repeats_regions[chrom],
                                          uncovered_regions[chrom])
    else:
        # No joining reads: emit each sufficiently long unique region as a
        # separate contig named "<chrom>_<index>".
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info('  ' + 'Theoretically optimal Assembly saved to ' +
                result_fpath)
    logger.notice(
        'You can copy it to ' + ref_prepared_optimal_assembly +
        ' and QUAST will reuse it in further runs against the same reference ('
        + original_ref_fpath + ')')

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
# Code example #15
def download_blastdb(logger=logger, only_clean=False):
    """Download the SILVA 16S rRNA database and build a BLAST nucleotide DB.

    Sets module globals ``blastdb_dirpath`` and ``db_fpath`` as side effects.

    Parameters:
        logger: project logger instance (defaults to the module logger).
        only_clean: when True, remove the database directory and return.

    Returns True on success (or successful cleanup), False on any failure.
    """
    global blastdb_dirpath
    blastdb_dirpath = get_dir_for_download('silva', 'Silva', [silva_downloaded_fname + '.nsq'], logger, only_clean=only_clean)
    if not blastdb_dirpath:
        return False

    if only_clean:
        # Cleanup mode: delete the database directory and stop.
        if os.path.isdir(blastdb_dirpath):
            logger.info('Removing ' + blastdb_dirpath)
            shutil.rmtree(blastdb_dirpath)
        return True

    global db_fpath
    db_fpath = join(blastdb_dirpath, silva_downloaded_fname)
    # Reuse an already-built BLAST db if its .nsq file looks complete.
    if os.path.isfile(db_fpath + '.nsq') and os.path.getsize(db_fpath + '.nsq') >= db_nsq_fsize:
        return True
    log_fpath = os.path.join(blastdb_dirpath, 'blastdb.log')
    db_gz_fpath = os.path.join(blastdb_dirpath, silva_fname + '.gz')
    silva_fpath = os.path.join(blastdb_dirpath, silva_fname)

    logger.info()

    if os.path.isfile(db_gz_fpath):
        logger.info('SILVA 16S ribosomal RNA gene database has already been downloaded.')
    else:
        logger.info('Downloading SILVA 16S ribosomal RNA gene database...')
        if not os.path.isdir(blastdb_dirpath):
            os.makedirs(blastdb_dirpath)
        silva_download = urllib.FancyURLopener()
        silva_remote_fpath = silva_db_url + silva_fname + '.gz'
        try:
            # Download to a '.download' temp name so a partial file is never
            # mistaken for a finished download on a later run.
            silva_download.retrieve(silva_remote_fpath, db_gz_fpath + '.download', show_progress)
        except Exception:
            logger.error(
                'Failed downloading SILVA 16S rRNA gene database (%s)! The search for reference genomes cannot be performed. '
                'Try to download it manually in %s and restart your command.' % (silva_remote_fpath, blastdb_dirpath))
            return False
        shutil.move(db_gz_fpath + '.download', db_gz_fpath)

    logger.info('Processing downloaded file. Logging to %s...' % log_fpath)
    if not os.path.isfile(silva_fpath):
        logger.info('Unpacking and replacing " " with "_"...')

        unpacked_fpath = silva_fpath + ".unpacked"
        cmd = "gunzip -c %s" % db_gz_fpath
        # Use context managers so the subprocess output/log handles are
        # closed deterministically (the original leaked these file objects).
        with open(unpacked_fpath, 'w') as out_handle, open(log_fpath, 'a') as err_handle:
            qutils.call_subprocess(shlex.split(cmd), stdout=out_handle, stderr=err_handle, logger=logger)

        # BLAST headers must not contain spaces; replace them with underscores.
        substituted_fpath = silva_fpath + ".substituted"
        with open(unpacked_fpath) as in_file:
            with open(substituted_fpath, 'w') as out_file:
                for line in in_file:
                    out_file.write(line.replace(' ', '_'))
        os.remove(unpacked_fpath)
        shutil.move(substituted_fpath, silva_fpath)

    logger.info('Making BLAST database...')
    cmd = get_blast_fpath('makeblastdb') + (' -in %s -dbtype nucl -out %s' % (silva_fpath, db_fpath))
    with open(log_fpath, 'a') as log_handle:
        qutils.call_subprocess(shlex.split(cmd), stdout=log_handle, stderr=log_handle, logger=logger)
    # Validate the built database by the size of its .nsq component.
    if not os.path.exists(db_fpath + '.nsq') or os.path.getsize(db_fpath + '.nsq') < db_nsq_fsize:
        logger.error('Failed to make BLAST database ("' + blastdb_dirpath +
                     '"). See details in log. Try to make it manually: %s' % cmd)
        return False
    elif not qconfig.debug:
        # Keep only the final BLAST db; drop the intermediate FASTA files.
        os.remove(db_gz_fpath)
        os.remove(silva_fpath)
    return True