コード例 #1
0
def create_successful_check(fpath, contigs_fpath, ref_fpath):
    """Write a marker file describing a successfully finished run.

    The file at ``fpath`` is overwritten with three lines: the md5 checksum
    of ``contigs_fpath``, the md5 checksum of ``ref_fpath`` and the time at
    which this function was called.

    :param fpath: path of the marker file to (over)write
    :param contigs_fpath: path of the assembly file to checksum
    :param ref_fpath: path of the reference file to checksum
    """
    # The context manager guarantees the handle is closed even if md5() or
    # one of the writes raises.
    with open(fpath, 'w') as successful_check_file:
        successful_check_file.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
        successful_check_file.write("Reference md5 checksum: %s\n" % md5(ref_fpath))
        finished_at = datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')
        successful_check_file.write("Successfully finished on " + finished_at + '\n')
コード例 #2
0
ファイル: align_contigs.py プロジェクト: ablab/quast
def create_successful_check(fpath, contigs_fpath, ref_fpath):
    """Write a marker file describing a successfully finished run.

    The file at ``fpath`` is overwritten with three lines: the md5 checksum
    of ``contigs_fpath``, the md5 checksum of ``ref_fpath`` and the time at
    which this function was called.

    :param fpath: path of the marker file to (over)write
    :param contigs_fpath: path of the assembly file to checksum
    :param ref_fpath: path of the reference file to checksum
    """
    # The context manager guarantees the handle is closed even if md5() or
    # one of the writes raises.
    with open(fpath, 'w') as successful_check_file:
        successful_check_file.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
        successful_check_file.write("Reference md5 checksum: %s\n" % md5(ref_fpath))
        finished_at = datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')
        successful_check_file.write("Successfully finished on " + finished_at + '\n')
コード例 #3
0
def check_successful_check(fpath, contigs_fpath, ref_fpath):
    """Tell whether the marker file at ``fpath`` matches the given inputs.

    The last whitespace-separated token of the first line must equal the md5
    checksum of ``contigs_fpath`` and the last token of the second line must
    equal the md5 checksum of ``ref_fpath``.

    :param fpath: path of an existing marker file
    :param contigs_fpath: path of the assembly file to checksum
    :param ref_fpath: path of the reference file to checksum
    :return: True when both checksums match, False otherwise (including when
        the file has fewer than two lines or one of the lines is blank)
    """
    # Read everything up front so the handle is closed before checksumming.
    with open(fpath) as check_file:
        lines = check_file.read().split('\n')
    if len(lines) < 2:
        return False
    for line, checked_fpath in zip(lines, (contigs_fpath, ref_fpath)):
        fields = line.split()
        # A blank line has no checksum token; treat it as a mismatch rather
        # than letting fields[-1] raise IndexError.
        if not fields or fields[-1] != str(md5(checked_fpath)):
            return False
    return True
コード例 #4
0
ファイル: align_contigs.py プロジェクト: ablab/quast
def check_successful_check(fpath, contigs_fpath, ref_fpath):
    """Tell whether the marker file at ``fpath`` matches the given inputs.

    The last whitespace-separated token of the first line must equal the md5
    checksum of ``contigs_fpath`` and the last token of the second line must
    equal the md5 checksum of ``ref_fpath``.

    :param fpath: path of an existing marker file
    :param contigs_fpath: path of the assembly file to checksum
    :param ref_fpath: path of the reference file to checksum
    :return: True when both checksums match, False otherwise (including when
        the file has fewer than two lines or one of the lines is blank)
    """
    # Read everything up front so the handle is closed before checksumming.
    with open(fpath) as check_file:
        lines = check_file.read().split('\n')
    if len(lines) < 2:
        return False
    for line, checked_fpath in zip(lines, (contigs_fpath, ref_fpath)):
        fields = line.split()
        # A blank line has no checksum token; treat it as a mismatch rather
        # than letting fields[-1] raise IndexError.
        if not fields or fields[-1] != str(md5(checked_fpath)):
            return False
    return True
コード例 #5
0
def check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
    """Tell whether the ``.sf`` marker file for an assembly is up to date.

    The marker file is ``<output_dir>/<label>.sf`` where the label is derived
    from ``contigs_fpath``. The last whitespace-separated token of its first
    line must equal the md5 checksum of ``contigs_fpath`` and the last token
    of its second line must equal the md5 checksum of ``ref_fpath``.

    :param output_dir: directory holding the marker file
    :param contigs_fpath: path of the assembly file
    :param contigs_fpaths: not used by this function; kept in the signature
        for callers
    :param ref_fpath: path of the reference file
    :return: True when the marker exists and both checksums match, False
        otherwise (including a marker with fewer than two lines or a blank
        checksum line)
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    kmc_check_fpath = join(output_dir, label + '.sf')
    if not exists(kmc_check_fpath):
        return False
    # Read everything up front so the handle is closed before checksumming.
    with open(kmc_check_fpath) as check_file:
        lines = check_file.read().split('\n')
    if len(lines) < 2:
        return False
    for line, checked_fpath in zip(lines, (contigs_fpath, ref_fpath)):
        fields = line.split()
        # A blank line has no checksum token; treat it as a mismatch rather
        # than letting fields[-1] raise IndexError.
        if not fields or fields[-1] != str(md5(checked_fpath)):
            return False
    return True
コード例 #6
0
def parallel_blast(contigs_fpath, label, corrected_dirpath, err_fpath,
                   blast_res_fpath, blast_check_fpath, blast_threads):
    """Run ``blastn`` for one assembly and write its check file.

    If ``contigs_fpath`` has a compressed-file extension it is first unpacked
    into ``corrected_dirpath`` and the unpacked copy is used as the query.
    BLAST standard output is written to the per-label results file, standard
    error is appended to ``err_fpath``, and afterwards a check file holding
    the assembly path and its md5 checksum is written.

    :param contigs_fpath: path of the assembly (possibly compressed)
    :param label: assembly label, used for logging and output file names
    :param corrected_dirpath: directory receiving the unpacked query file
    :param err_fpath: file to which BLAST stderr is appended
    :param blast_res_fpath: base path of the BLAST results file
    :param blast_check_fpath: base path of the BLAST check file
    :param blast_threads: value passed to ``-num_threads``
    """
    logger.info('  ' + 'processing ' + label)
    blast_query_fpath = contigs_fpath
    compress_ext = ('.gz', '.gzip', '.bz2', '.bzip2', '.zip')
    if contigs_fpath.endswith(compress_ext):
        logger.info('  ' + 'unpacking ' + label)
        unpacked_fpath = os.path.join(
            corrected_dirpath,
            os.path.basename(contigs_fpath) + '.unpacked')
        with _get_fasta_file_handler(contigs_fpath) as f_in:
            with open(unpacked_fpath, 'w') as f_out:
                for line in f_in:
                    f_out.write(line)
        blast_query_fpath = unpacked_fpath
    res_fpath = get_blast_output_fpath(blast_res_fpath, label)
    check_fpath = get_blast_output_fpath(blast_check_fpath, label)
    # Build the argument vector directly: formatting a command string and
    # re-splitting it breaks on paths that contain whitespace or quotes.
    cmd = [get_blast_fpath('blastn'),
           '-query', blast_query_fpath,
           '-db', db_fpath,
           '-outfmt', '7',
           '-num_threads', str(blast_threads)]
    # Close both output handles as soon as the subprocess call returns.
    with open(res_fpath, 'w') as res_file:
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(cmd,
                                   stdout=res_file,
                                   stderr=err_file,
                                   logger=logger)
    logger.info('  ' + 'BLAST results for %s are saved to %s...' %
                (label, res_fpath))
    with open(check_fpath, 'w') as check_file:
        check_file.write('Assembly: %s md5 checksum: %s\n' %
                         (contigs_fpath, md5(contigs_fpath)))
コード例 #7
0
def do(assemblies, labels, downloaded_dirpath, corrected_dirpath, ref_txt_fpath=None):
    """Collect reference genomes for the given assemblies.

    When ``ref_txt_fpath`` is given, the species list is parsed from that
    file. Otherwise the assemblies selected by ``check_blast`` are passed to
    ``process_blast`` and the resulting species are ordered by descending
    score. The species are then handed to ``search_references``.

    :param assemblies: assembly objects exposing an ``fpath`` attribute
    :param labels: labels corresponding to ``assemblies``
    :param downloaded_dirpath: directory holding BLAST output and references
    :param corrected_dirpath: directory passed on to ``process_blast``
    :param ref_txt_fpath: optional path of a file listing references
    :return: sorted list of reference paths returned by ``search_references``
    """
    logger.print_timestamp()
    err_fpath = os.path.join(downloaded_dirpath, 'blast.err')
    blast_check_fpath = os.path.join(downloaded_dirpath, 'blast.check')
    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')
    files_md5 = dict((assembly.fpath, md5(assembly.fpath)) for assembly in assemblies)
    assemblies_fpaths = dict((assembly.fpath, assembly) for assembly in assemblies)
    blast_assemblies, downloaded_organisms, not_founded_organisms = \
        check_blast(blast_check_fpath, blast_res_fpath, files_md5, assemblies_fpaths, assemblies, labels)

    species_list = []
    replacement_list = None
    if ref_txt_fpath:
        species_list = parse_refs_list(ref_txt_fpath)
        species_by_assembly = None
    else:
        species_scores, species_by_assembly, replacement_dict = process_blast(blast_assemblies, downloaded_dirpath,
                                                                              corrected_dirpath, labels, blast_check_fpath, err_fpath)
        if species_scores:
            species_scores = sorted(species_scores, reverse=True)
            species_list = [species for (species, query_id, score) in species_scores]
            replacement_list = [replacement_dict[query_id] for (species, query_id, score) in species_scores]

    # Join each file name with the directory os.walk found it in; joining
    # with the root directory yields wrong paths for files in subdirectories.
    downloaded_ref_fpaths = [os.path.join(dirpath, fname)
                             for (dirpath, dirnames, fnames) in os.walk(downloaded_dirpath)
                             for fname in fnames if qutils.check_is_fasta_file(fname)]

    ref_fpaths = search_references(species_list, assemblies, labels, downloaded_dirpath, not_founded_organisms, downloaded_ref_fpaths,
                 blast_check_fpath, err_fpath, species_by_assembly, replacement_list)

    if not ref_fpaths:
        logger.main_info('Reference genomes are not found.')
    # The BLAST error log is kept only in debug mode.
    if not qconfig.debug and os.path.exists(err_fpath):
        os.remove(err_fpath)
    ref_fpaths.sort()
    return ref_fpaths
コード例 #8
0
def check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
    """Tell whether the ``.sf`` marker file for an assembly is up to date.

    The marker file is ``<output_dir>/<label>.sf`` where the label is derived
    from ``contigs_fpath``. Its first line must end with the md5 checksum of
    ``contigs_fpath`` and its second line with the md5 checksum of
    ``ref_fpath``. If the third line carries a non-empty comma-separated list
    after its last ``': '``, that list must contain the same entries as
    ``contigs_fpaths`` (order is ignored).

    :param output_dir: directory holding the marker file
    :param contigs_fpath: path of the assembly file
    :param contigs_fpaths: paths of all assemblies expected in the marker
    :param ref_fpath: path of the reference file
    :return: True when the marker exists and every check passes, False
        otherwise (including a marker with fewer than three lines or a blank
        checksum line)
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    kmc_check_fpath = join(output_dir, label + '.sf')
    if not exists(kmc_check_fpath):
        return False
    # Read everything up front so the handle is closed before checksumming.
    with open(kmc_check_fpath) as check_file:
        lines = check_file.read().split('\n')
    if len(lines) < 3:
        return False
    for line, checked_fpath in zip(lines, (contigs_fpath, ref_fpath)):
        fields = line.split()
        # A blank line has no checksum token; treat it as a mismatch rather
        # than letting fields[-1] raise IndexError.
        if not fields or fields[-1] != str(md5(checked_fpath)):
            return False
    used_assemblies = lines[2].strip().split(': ')[-1]
    # An empty list on the third line is accepted without comparison.
    if used_assemblies and sorted(used_assemblies.split(',')) != sorted(contigs_fpaths):
        return False
    return True
コード例 #9
0
def create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath, completeness,
                         len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len):
    """Write the ``.sf`` marker file and the ``.stat`` file for an assembly.

    Both files are created in ``output_dir`` and named after the label
    derived from ``contigs_fpath``. The marker file holds the md5 checksums
    of the assembly and the reference plus the comma-joined
    ``contigs_fpaths``. The stat file holds the completeness and, when either
    ``len_map_to_one_chrom`` or ``len_map_to_multi_chrom`` is truthy, the
    four length values.
    """
    name = qutils.label_from_fpath_for_fname(contigs_fpath)
    marker_fpath = join(output_dir, name + '.sf')
    stats_fpath = join(output_dir, name + '.stat')

    with open(marker_fpath, 'w') as marker:
        marker.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
        marker.write("Reference md5 checksum: %s\n" % md5(ref_fpath))
        marker.write("Used assemblies: %s\n" % ','.join(contigs_fpaths))

    with open(stats_fpath, 'w') as out:
        out.write("Completeness: %s\n" % completeness)
        if not (len_map_to_one_chrom or len_map_to_multi_chrom):
            # No chromosome assignment data: only completeness is reported.
            return
        length_rows = (
            ("Length assigned to one chromosome: %d\n", len_map_to_one_chrom),
            ("Length assigned to multi chromosomes: %d\n", len_map_to_multi_chrom),
            ("Length assigned to none chromosome: %d\n", len_map_to_none_chrom),
            ("Total length: %d\n", total_len),
        )
        for template, value in length_rows:
            out.write(template % value)
コード例 #10
0
def do(assemblies, labels, downloaded_dirpath, corrected_dirpath, ref_txt_fpath=None):
    """Collect reference genomes for the given assemblies.

    When ``ref_txt_fpath`` is given, the organism list is parsed from that
    file. Otherwise the assemblies selected by ``check_blast`` are passed to
    ``process_blast`` and the resulting organisms are ordered by descending
    score. The organisms are then handed to ``process_refs``.

    :param assemblies: assembly objects exposing an ``fpath`` attribute
    :param labels: labels corresponding to ``assemblies``
    :param downloaded_dirpath: directory holding BLAST output and references
    :param corrected_dirpath: directory passed on to ``process_blast``
    :param ref_txt_fpath: optional path of a file listing references
    :return: sorted list of reference paths returned by ``process_refs``
    """
    logger.print_timestamp()
    err_fpath = os.path.join(downloaded_dirpath, 'blast.err')
    blast_check_fpath = os.path.join(downloaded_dirpath, 'blast.check')
    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')
    files_md5 = dict((assembly.fpath, md5(assembly.fpath)) for assembly in assemblies)
    assemblies_fpaths = dict((assembly.fpath, assembly) for assembly in assemblies)
    blast_assemblies, downloaded_organisms, not_founded_organisms = \
        check_blast(blast_check_fpath, blast_res_fpath, files_md5, assemblies_fpaths, assemblies, labels)
    organisms = []

    if ref_txt_fpath:
        organisms = parse_refs_list(ref_txt_fpath)
        organisms_assemblies = None
    else:
        scores_organisms, organisms_assemblies = process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath,
                                                               labels, blast_check_fpath, err_fpath)
        if scores_organisms:
            scores_organisms = sorted(scores_organisms, reverse=True)
            organisms = [organism for (score, organism) in scores_organisms]

    # Join each file name with the directory os.walk found it in; joining
    # with the root directory yields wrong paths for files in subdirectories.
    downloaded_ref_fpaths = [os.path.join(dirpath, fname)
                             for (dirpath, dirnames, fnames) in os.walk(downloaded_dirpath)
                             for fname in fnames if qutils.check_is_fasta_file(fname)]

    ref_fpaths = process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms, downloaded_ref_fpaths,
                 blast_check_fpath, err_fpath, organisms_assemblies)

    if not ref_fpaths:
        logger.main_info('Reference genomes are not found.')
    # The BLAST error log is kept only in debug mode.
    if not qconfig.debug and os.path.exists(err_fpath):
        os.remove(err_fpath)
    ref_fpaths.sort()
    return ref_fpaths
コード例 #11
0
def create_kmc_stats_file(output_dir, contigs_fpath, ref_fpath, completeness,
                          corr_len, mis_len, undef_len, total_len, translocations, relocations):
    """Write the ``.sf`` marker file and the ``.stat`` file for an assembly.

    Both files are created in ``output_dir`` and named after the label
    derived from ``contigs_fpath``. The marker file holds the md5 checksums
    of the assembly and the reference. The stat file holds the completeness
    and, when either ``corr_len`` or ``mis_len`` is truthy, the k-mer-based
    lengths, the total length and the translocation/relocation counts.
    """
    name = qutils.label_from_fpath_for_fname(contigs_fpath)
    marker_fpath = join(output_dir, name + '.sf')
    stats_fpath = join(output_dir, name + '.stat')

    with open(marker_fpath, 'w') as marker:
        marker.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
        marker.write("Reference md5 checksum: %s\n" % md5(ref_fpath))

    with open(stats_fpath, 'w') as out:
        out.write("Completeness: %s\n" % completeness)
        if not (corr_len or mis_len):
            # No k-mer-based length data: only completeness is reported.
            return
        stat_rows = (
            ("K-mer-based correct length: %d\n", corr_len),
            ("K-mer-based misjoined length: %d\n", mis_len),
            ("K-mer-based undefined length: %d\n", undef_len),
            ("Total length: %d\n", total_len),
            ("# translocations: %d\n", translocations),
            ("# 100 kbp relocations: %d\n", relocations),
        )
        for template, value in stat_rows:
            out.write(template % value)
コード例 #12
0
def search_references(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms, downloaded_ref_fpaths,
                 blast_check_fpath, err_fpath, organisms_assemblies=None, replacement_list=None):
    """Try to obtain a reference for every organism and update check files.

    Each organism is passed to ``process_ref``. When that yields no reference
    and ``replacement_list`` is given, the candidates in
    ``replacement_list[idx]`` that are not already in ``organisms`` are tried
    in order until one yields a reference. Afterwards the BLAST check file of
    every assembly is rewritten with its first line followed by the lists of
    downloaded and not-found organisms.

    :param organisms: organism names to look up
    :param assemblies: assembly objects exposing an ``fpath`` attribute
    :param labels: labels corresponding to ``assemblies``
    :param downloaded_dirpath: directory passed to ``process_ref``
    :param not_founded_organisms: collection of organisms known to be missing;
        passed to ``process_ref`` and written to the check files
    :param downloaded_ref_fpaths: previously downloaded reference paths; only
        its emptiness is inspected here
    :param blast_check_fpath: base path of the per-assembly check files
    :param err_fpath: BLAST error log, removed on early exit unless debugging
    :param organisms_assemblies: optional mapping from label to the organisms
        relevant to that assembly, used to filter what is written per file
    :param replacement_list: optional per-organism lists of fallback names,
        indexed like ``organisms``
    :return: the ``ref_fpaths`` list handed to ``process_ref`` (empty when
        ``organisms`` is empty)
    """
    ref_fpaths = []
    downloaded_organisms = []

    total_downloaded = 0
    total_scored_left = len(organisms)
    if total_scored_left == 0:
        if not qconfig.debug and os.path.exists(err_fpath):
            os.remove(err_fpath)
        return ref_fpaths

    # organisms is non-empty here. downloaded_organisms is still empty at
    # this point, so it cannot contribute to the maximum.
    max_organism_name_len = max(len(organism) for organism in organisms)

    logger.print_timestamp()
    logger.main_info('Trying to download found references from NCBI. Totally ' + str(total_scored_left) + ' organisms to try.')
    if len(downloaded_ref_fpaths) > 0:
        logger.main_info('MetaQUAST will attempt to use previously downloaded references...')

    for idx, organism in enumerate(organisms):
        ref_fpath, total_downloaded, total_scored_left = process_ref(ref_fpaths, organism, downloaded_dirpath, max_organism_name_len,
                                                                      downloaded_organisms, not_founded_organisms, total_downloaded, total_scored_left)
        if not ref_fpath and replacement_list:
            for next_match in replacement_list[idx]:
                if next_match not in organisms:
                    logger.main_info('  ' + organism.replace('+', ' ') + ' was not found in NCBI database, trying to download the next best match')
                    # The returned "left" counter is deliberately discarded so
                    # that fallback attempts do not change the running count.
                    ref_fpath, total_downloaded, _ = process_ref(ref_fpaths, next_match, downloaded_dirpath,
                                                                 max_organism_name_len, downloaded_organisms, not_founded_organisms,
                                                                 total_downloaded, total_scored_left + 1)
                    # The next log message names the candidate that just failed.
                    organism = next_match
                    if ref_fpath:
                        break

    for assembly, label in zip(assemblies, labels):
        check_fpath = get_blast_output_fpath(blast_check_fpath, label)
        if os.path.exists(check_fpath):
            with open(check_fpath) as check_file:
                # Keep only the first line. Slicing at find('\n') would drop
                # the last character when the file has no newline (find == -1).
                text = check_file.read().split('\n', 1)[0]
        else:
            # NOTE(review): unlike the branch above, this header keeps its
            # trailing newline, so a blank line precedes '---' - confirm the
            # reader of this file tolerates that.
            text = 'Assembly: %s md5 checksum: %s\n' % (assembly.fpath, md5(assembly.fpath))
        with open(check_fpath, 'w') as check_file:
            check_file.write(text)
            check_file.write('\n---\n')
            if organisms_assemblies:
                cur_downloaded_organisms = [organism for organism in downloaded_organisms
                                            if organism in organisms_assemblies[label]]
                cur_not_founded_organisms = [organism for organism in not_founded_organisms
                                             if organism in organisms_assemblies[label]]
            else:
                cur_downloaded_organisms = list(downloaded_organisms)
                cur_not_founded_organisms = list(not_founded_organisms)
            check_file.write('Downloaded: %s\n' % ','.join(cur_downloaded_organisms))
            check_file.write('Not_founded: %s\n' % ','.join(cur_not_founded_organisms))
    return ref_fpaths
コード例 #13
0
def process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms, downloaded_ref_fpaths,
                 blast_check_fpath, err_fpath, organisms_assemblies=None):
    """Obtain a reference for every organism and update the check files.

    For each organism the expected file ``<corrected name>.fasta`` in
    ``downloaded_dirpath`` is reused when it exists. Otherwise it is
    downloaded via ``download_refs`` unless the organism is already listed in
    ``not_founded_organisms``. Organisms for which no reference is obtained
    are added to ``not_founded_organisms``. Afterwards the BLAST check file
    of every assembly is rewritten with its first line followed by the lists
    of downloaded and not-found organisms.

    :param organisms: organism names to look up
    :param assemblies: assembly objects exposing an ``fpath`` attribute
    :param labels: labels corresponding to ``assemblies``
    :param downloaded_dirpath: directory holding the reference files
    :param not_founded_organisms: collection supporting ``in`` and ``add``;
        mutated in place
    :param downloaded_ref_fpaths: previously downloaded reference paths; only
        its emptiness is inspected here
    :param blast_check_fpath: base path of the per-assembly check files
    :param err_fpath: BLAST error log, removed on early exit unless debugging
    :param organisms_assemblies: optional mapping from label to the organisms
        relevant to that assembly, used to filter what is written per file
    :return: list of reference file paths (empty when ``organisms`` is empty)
    """
    ref_fpaths = []
    downloaded_organisms = []

    total_downloaded = 0
    total_scored_left = len(organisms)
    if total_scored_left == 0:
        if not qconfig.debug and os.path.exists(err_fpath):
            os.remove(err_fpath)
        return ref_fpaths

    # organisms is non-empty here. downloaded_organisms is still empty at
    # this point, so it cannot contribute to the maximum.
    max_organism_name_len = max(len(organism) for organism in organisms)

    logger.print_timestamp()
    logger.main_info('Trying to download found references from NCBI. '
                'Totally ' + str(total_scored_left) + ' organisms to try.')
    if len(downloaded_ref_fpaths) > 0:
        logger.main_info('MetaQUAST will attempt to use previously downloaded references...')

    for organism in organisms:
        ref_fpath = os.path.join(downloaded_dirpath, correct_name(organism) + '.fasta')
        # Padding that aligns the '|' column across log lines.
        spaces = (max_organism_name_len - len(organism)) * ' '
        display_name = organism.replace('+', ' ')
        new_ref_fpath = None
        was_downloaded = os.path.exists(ref_fpath)
        if was_downloaded:
            new_ref_fpath = ref_fpath
        elif organism not in not_founded_organisms:
            new_ref_fpath = download_refs(organism, ref_fpath)

        # Every organism counts as processed, whether or not it was found.
        total_scored_left -= 1
        if not new_ref_fpath:
            logger.main_info("  %s%s | not found in the NCBI database" % (display_name, spaces))
            not_founded_organisms.add(organism)
            continue

        total_downloaded += 1
        if was_downloaded:
            logger.main_info("  %s%s | was downloaded previously (total %d, %d more to go)" %
                        (display_name, spaces, total_downloaded, total_scored_left))
            # A reused file may already be listed; avoid duplicates.
            if new_ref_fpath not in ref_fpaths:
                ref_fpaths.append(new_ref_fpath)
        else:
            logger.main_info("  %s%s | successfully downloaded (total %d, %d more to go)" %
                    (display_name, spaces, total_downloaded, total_scored_left))
            ref_fpaths.append(new_ref_fpath)
        downloaded_organisms.append(organism)

    for assembly, label in zip(assemblies, labels):
        check_fpath = get_blast_output_fpath(blast_check_fpath, label)
        if os.path.exists(check_fpath):
            with open(check_fpath) as check_file:
                # Keep only the first line. Slicing at find('\n') would drop
                # the last character when the file has no newline (find == -1).
                text = check_file.read().split('\n', 1)[0]
        else:
            # NOTE(review): unlike the branch above, this header keeps its
            # trailing newline, so a blank line precedes '---' - confirm the
            # reader of this file tolerates that.
            text = 'Assembly: %s md5 checksum: %s\n' % (assembly.fpath, md5(assembly.fpath))
        with open(check_fpath, 'w') as check_file:
            check_file.write(text)
            check_file.write('\n---\n')
            if organisms_assemblies:
                cur_downloaded_organisms = [organism for organism in downloaded_organisms
                                            if organism in organisms_assemblies[label]]
                cur_not_founded_organisms = [organism for organism in not_founded_organisms
                                             if organism in organisms_assemblies[label]]
            else:
                cur_downloaded_organisms = list(downloaded_organisms)
                cur_not_founded_organisms = list(not_founded_organisms)
            check_file.write('Downloaded: %s\n' % ','.join(cur_downloaded_organisms))
            check_file.write('Not_founded: %s\n' % ','.join(cur_not_founded_organisms))
    return ref_fpaths