Example 1
def parse_alignments(contigs_fpaths, contig_report_fpath_pattern):
    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        if contig_report_fpath_pattern:
            report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(
                contigs_fpath)
            aligned_blocks, misassembled_id_to_structure = parse_aligner_contig_report(
                report_fpath)
            if aligned_blocks is None:
                continue

            aligned_blocks = check_misassembled_blocks(
                aligned_blocks,
                misassembled_id_to_structure,
                filter_local=True)
            lists_of_aligned_blocks.append(aligned_blocks)

    if lists_of_aligned_blocks:
        max_contigs = max([
            len(aligned_blocks) for aligned_blocks in lists_of_aligned_blocks
        ])
        return get_assemblies(contigs_fpaths,
                              lists_of_aligned_blocks).assemblies, max_contigs
    else:
        return None, None
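
The report path above is built by printf-style substitution of a filename-safe label into a pattern. A minimal, self-contained sketch of that lookup (the pattern string and labels below are invented for illustration):

import os

# Hypothetical pattern and labels, only to illustrate the '%' substitution used above.
contig_report_fpath_pattern = 'contigs_reports/contigs_report_%s.stdout'
labels = ['assembly_1', 'assembly_2']

for label in labels:
    report_fpath = contig_report_fpath_pattern % label
    if not os.path.isfile(report_fpath):
        continue  # mirrors the 'continue' above: assemblies without a report are skipped
    print('would parse', report_fpath)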
Example 2
def draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath):
    total_len = dict()
    contigs_dict = dict()

    contigs_with_coverage = [contigs_fpath for contigs_fpath in contigs_fpaths if coverage_dict[contigs_fpath]]
    for contigs_fpath in contigs_fpaths:
        total_len[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.TOTALLEN)
        contigs_dict[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.CONTIGS)
    cov_values = [coverage_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]
    num_contigs = [contigs_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]

    common_coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage(cov_values, num_contigs)
    histogram_title = 'Coverage histogram (bin size: ' + str(bin_size) + 'x)'
    plotter.coverage_histogram(contigs_with_coverage, common_coverage_values, output_dirpath + '/coverage_histogram',
                               histogram_title, bin_size=bin_size, max_cov=max_cov, low_threshold=low_threshold, high_threshold=high_threshold)
    for contigs_fpath in contigs_with_coverage:
        coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage([coverage_dict[contigs_fpath]],
                                                                                             [contigs_dict[contigs_fpath]])
        label = qutils.label_from_fpath(contigs_fpath)
        corr_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        histogram_title = label + ' coverage histogram (bin size: ' + str(bin_size) + 'x)'
        histogram_fpath = os.path.join(output_dirpath, corr_label + '_coverage_histogram')
        plotter.coverage_histogram([contigs_fpath], coverage_values, histogram_fpath,
                                   histogram_title, draw_bars=True, bin_size=bin_size, max_cov=max_cov,
                                   low_threshold=low_threshold, high_threshold=high_threshold)
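
binning_coverage itself is not shown in these excerpts; as a rough, hypothetical sketch of the general idea, per-base coverage values can be bucketed into fixed-width bins like this (all values invented):

from collections import Counter

def bin_coverage(values, bin_size):
    # Count how many positions fall into each coverage bin of width bin_size.
    return Counter((v // bin_size) * bin_size for v in values)

coverage = [3, 5, 6, 12, 14, 30, 31, 33]  # invented per-base coverage values
print(sorted(bin_coverage(coverage, bin_size=10).items()))  # [(0, 3), (10, 2), (30, 3)]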
Example 3
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function,
                  prokaryote, num_threads):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_genemark.stderr')

    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath, num_threads)

    if not genes:
        unique_count = None
        count = None  # [None] * len(gene_lengths)
    else:
        tool_name = "genemark"
        out_gff_fpath = os.path.join(out_dirpath, corr_assembly_label + '_' + tool_name + '_genes.gff' + ('.gz' if not qconfig.no_gzip else ''))
        add_genes_to_gff(genes, out_gff_fpath, prokaryote)
        if OUTPUT_FASTA:
            out_fasta_fpath = os.path.join(out_dirpath, corr_assembly_label + '_' + tool_name + '_genes.fasta')
            add_genes_to_fasta(genes, out_fasta_fpath)

        count = [sum([gene.end - gene.start > x for gene in genes]) for x in gene_lengths]
        gene_ids = [gene.seq if gene.seq else gene.name for gene in genes]
        unique_count = len(set(gene_ids))
        total_count = len(genes)

        logger.info('  ' + qutils.index_to_str(index) + '  Genes = ' + str(unique_count) + ' unique, ' + str(total_count) + ' total')
        logger.info('  ' + qutils.index_to_str(index) + '  Predicted genes (GFF): ' + out_gff_fpath)

    return genes, unique_count, count
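
The per-threshold gene counts above use a nested comprehension; a toy, self-contained version of that counting (gene coordinates and thresholds are invented):

from collections import namedtuple

Gene = namedtuple('Gene', 'start end')
genes = [Gene(0, 250), Gene(100, 1600), Gene(0, 4000)]
gene_lengths = [300, 1500, 3000]  # length thresholds

count = [sum(gene.end - gene.start > x for gene in genes) for x in gene_lengths]
print(count)  # [2, 1, 1]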
Example 4
def draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath):
    total_len = dict()
    contigs_dict = dict()

    contigs_with_coverage = [contigs_fpath for contigs_fpath in contigs_fpaths if coverage_dict[contigs_fpath]]
    for contigs_fpath in contigs_fpaths:
        total_len[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.TOTALLEN)
        contigs_dict[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.CONTIGS)
    cov_values = [coverage_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]
    num_contigs = [contigs_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]

    common_coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage(cov_values, num_contigs)
    histogram_title = 'Coverage histogram (bin size: ' + str(bin_size) + 'x)'
    plotter.coverage_histogram(contigs_with_coverage, common_coverage_values, output_dirpath + '/coverage_histogram',
                               histogram_title, bin_size=bin_size, max_cov=max_cov, low_threshold=low_threshold, high_threshold=high_threshold)
    for contigs_fpath in contigs_with_coverage:
        coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage([coverage_dict[contigs_fpath]],
                                                                                             [contigs_dict[contigs_fpath]])
        label = qutils.label_from_fpath(contigs_fpath)
        corr_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        histogram_title = label + ' coverage histogram (bin size: ' + str(bin_size) + 'x)'
        histogram_fpath = os.path.join(output_dirpath, corr_label + '_coverage_histogram')
        plotter.coverage_histogram([contigs_fpath], coverage_values, histogram_fpath,
                                   histogram_title, draw_bars=True, bin_size=bin_size, max_cov=max_cov,
                                   low_threshold=low_threshold, high_threshold=high_threshold)
Example 5
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stderr')
    logger.info('  ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')
    log_out_f = open(log_out_fpath, 'w')
    log_err_f = open(log_err_fpath, 'w')

    return_code = qutils.call_subprocess(
        ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath), reference,
         contigs_fpath, tmp_dir, str(qconfig.min_contig)],
        stdout=log_out_f,
        stderr=log_err_f,
        indent='  ' + qutils.index_to_str(i),
        only_if_debug=False)
    if return_code != 0:
        logger.info('  ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info('  ' + qutils.index_to_str(i) + 'Done.')

    log_out_f.close()
    log_err_f.close()

    return return_code
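
qutils.call_subprocess is a QUAST helper; a rough stand-in using only the standard library, with the command and log paths invented, might look like this:

import subprocess

def call_with_logs(cmd, log_out_fpath, log_err_fpath):
    # Run a command, redirect stdout/stderr to log files, and return the exit code.
    with open(log_out_fpath, 'w') as out_f, open(log_err_fpath, 'w') as err_f:
        return subprocess.call(cmd, stdout=out_f, stderr=err_f)

return_code = call_with_logs(['echo', 'gage would run here'], 'gage.stdout', 'gage.stderr')
print('Done.' if return_code == 0 else 'Failed.')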
Example 6
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stderr')
    logger.info('  ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')
    log_out_f = open(log_out_fpath, 'w')
    log_err_f = open(log_err_fpath, 'w')

    return_code = qutils.call_subprocess(
        ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath), reference,
         contigs_fpath, tmp_dir, str(qconfig.min_contig)],
        stdout=log_out_f,
        stderr=log_err_f,
        indent='  ' + qutils.index_to_str(i),
        only_if_debug=False)
    if return_code != 0:
        logger.info('  ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info('  ' + qutils.index_to_str(i) + 'Done.')

    log_out_f.close()
    log_err_f.close()

    return return_code
Example 7
def create_mismatches_plot(assembly, window_size, ref_len, root_dir, output_dir):
    assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
    nucmer_dirpath = join(root_dir, '..', 'contigs_reports')
    nucmer_fpath = join(create_nucmer_output_dir(nucmer_dirpath), assembly_label)
    _, _, _, _, used_snps_fpath = get_nucmer_aux_out_fpaths(nucmer_fpath)
    if not exists(used_snps_fpath):
        return None

    mismatches_fpath = join(output_dir, assembly_label + '.mismatches.txt')
    mismatch_density_by_chrom = defaultdict(lambda : [0] * (ref_len // window_size + 1))
    for line in open_gzipsafe(used_snps_fpath):
        chrom, contig, ref_pos, ref_nucl, ctg_nucl, ctg_pos = line.split('\t')
        if ref_nucl != '.' and ctg_nucl != '.':
            mismatch_density_by_chrom[chrom][int(ref_pos) // window_size] += 1
    with open(mismatches_fpath, 'w') as out_f:
        for chrom, density_list in mismatch_density_by_chrom.items():
            start, end = 0, 0
            for i, density in enumerate(density_list):
                if density == 0:
                    end = (i + 1) * window_size
                else:
                    if end:
                        out_f.write('\t'.join([chrom, str(start), str(end), '0']) + '\n')
                    out_f.write('\t'.join([chrom, str(i * window_size), str(((i + 1) * window_size)), str(density)]) + '\n')
                    start = (i + 1) * window_size
                    end = None
            if end:
                out_f.write('\t'.join([chrom, str(start), str(end), '0']) + '\n')
    return mismatches_fpath
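
The mismatch densities above are accumulated per chromosome in fixed-size windows via a defaultdict; a toy illustration with invented positions:

from collections import defaultdict

ref_len, window_size = 1000, 100
density = defaultdict(lambda: [0] * (ref_len // window_size + 1))

for chrom, ref_pos in [('chr1', 5), ('chr1', 8), ('chr1', 250), ('chr2', 999)]:
    density[chrom][ref_pos // window_size] += 1

print(density['chr1'])  # [2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
print(density['chr2'])  # [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]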
Example 8
def get_assemblies_data(contigs_fpaths, icarus_dirpath, stdout_pattern, nx_marks):
    assemblies_n50 = defaultdict(dict)
    assemblies_data = ''
    assemblies_data += 'var assemblies_links = {};\n'
    assemblies_data += 'var assemblies_len = {};\n'
    assemblies_data += 'var assemblies_contigs = {};\n'
    assemblies_data += 'var assemblies_misassemblies = {};\n'
    assemblies_data += 'var assemblies_n50 = {};\n'
    assemblies_contig_size_data = ''
    for contigs_fpath in contigs_fpaths:
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)
        l = report.get_field(reporting.Fields.TOTALLEN)
        contigs = report.get_field(reporting.Fields.CONTIGS)
        n50 = report.get_field(reporting.Fields.N50)
        if stdout_pattern:
            contig_stdout_fpath = stdout_pattern % qutils.label_from_fpath_for_fname(contigs_fpath) + '.stdout'
            contig_stdout_fpath = qutils.relpath(contig_stdout_fpath, icarus_dirpath)
            assemblies_data += 'assemblies_links["' + assembly_label + '"] = "' + contig_stdout_fpath + '";\n'
        assemblies_contig_size_data += 'assemblies_len["' + assembly_label + '"] = ' + str(l) + ';\n'
        assemblies_contig_size_data += 'assemblies_contigs["' + assembly_label + '"] = ' + str(contigs) + ';\n'
        assemblies_contig_size_data += 'assemblies_n50["' + assembly_label + '"] = "' + str(n50) + '";\n'
        for nx in nx_marks:
            assemblies_n50[assembly_label][nx] = report.get_field(nx)
    return assemblies_data, assemblies_contig_size_data, assemblies_n50
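
The strings built above are literal JavaScript assignments that are later embedded in the Icarus HTML; a tiny illustration with invented labels and lengths:

assemblies_contig_size_data = ''
for assembly_label, total_len in [('asm_A', 4641652), ('asm_B', 4500000)]:
    assemblies_contig_size_data += 'assemblies_len["' + assembly_label + '"] = ' + str(total_len) + ';\n'
print(assemblies_contig_size_data)
# assemblies_len["asm_A"] = 4641652;
# assemblies_len["asm_B"] = 4500000;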
Example 9
def calculate_ave_read_support(combined_output_dirpath, assemblies):
    unique_contigs_fpath = os.path.join(combined_output_dirpath, 'contigs_reports', qconfig.unique_contigs_fname_pattern)
    for assembly in assemblies:
        aligned_contigs_by_ref = dict()
        assembly_label = qutils.label_from_fpath(assembly.fpath)
        corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
        with open(unique_contigs_fpath % corr_assembly_label) as in_f:
            for line in in_f:
                ref_name, contig_len, contig_cov = line.strip().split('\t')
                aligned_contigs_by_ref.setdefault(ref_name, []).append((float(contig_len), float(contig_cov)))
        for ref_name, contigs in aligned_contigs_by_ref.items():
            ref_cov = sum(contig_cov * aligned_len for (aligned_len, contig_cov) in contigs)
            ref_cov /= sum(aligned_len for (aligned_len, contig_cov) in contigs)
            corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
            ref_contigs_fpath = os.path.join(
                        os.path.dirname(assembly.fpath), corr_assembly_label + '_to_' + ref_name + '.fasta')
            qconfig.assembly_labels_by_fpath[ref_contigs_fpath] = assembly_label
            report = reporting.get(ref_contigs_fpath, ref_name=ref_name)
            report.add_field(reporting.Fields.AVE_READ_SUPPORT, '%.2f' % ref_cov)
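
The average read support above is a length-weighted mean of per-contig coverage; with invented numbers:

contigs = [(1000.0, 30.0), (500.0, 10.0)]  # (aligned_len, contig_cov) pairs
ref_cov = sum(contig_cov * aligned_len for (aligned_len, contig_cov) in contigs)
ref_cov /= sum(aligned_len for (aligned_len, contig_cov) in contigs)
print('%.2f' % ref_cov)  # 23.33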
Example 10
def check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    kmc_check_fpath = join(output_dir, label + '.sf')
    if not exists(kmc_check_fpath):
        return False
    successful_check_content = open(kmc_check_fpath).read().split('\n')
    if len(successful_check_content) < 2:
        return False
    if successful_check_content[0].strip().split()[-1] != str(md5(contigs_fpath)):
        return False
    if successful_check_content[1].strip().split()[-1] != str(md5(ref_fpath)):
        return False
    return True
Example 11
def create_mismatches_plot(assembly, window_size, ref_len, root_dir,
                           output_dir):
    assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
    aligner_dirpath = join(root_dir, '..',
                           qconfig.detailed_contigs_reports_dirname)
    coords_basename = join(create_minimap_output_dir(aligner_dirpath),
                           assembly_label)
    _, coords_filtered_fpath, _, _ = get_aux_out_fpaths(coords_basename)
    if not exists(coords_filtered_fpath) or not qconfig.show_snps:
        return None

    mismatches_fpath = join(output_dir, assembly_label + '.mismatches.txt')
    mismatch_density_by_chrom = defaultdict(lambda: [0] *
                                            (ref_len // window_size + 1))
    with open(coords_filtered_fpath) as coords_file:
        for line in coords_file:
            s1 = int(line.split('|')[0].split()[0])
            chrom = line.split()[11].strip()
            cigar = line.split()[-1].strip()
            ref_pos = s1
            for op in parse_cs_tag(cigar):
                n_bases = len(op) - 1
                if op.startswith('*'):
                    mismatch_density_by_chrom[chrom][int(ref_pos) //
                                                     window_size] += 1
                    ref_pos += 1
                elif not op.startswith('+'):
                    ref_pos += n_bases
    with open(mismatches_fpath, 'w') as out_f:
        for chrom, density_list in mismatch_density_by_chrom.items():
            start, end = 0, 0
            for i, density in enumerate(density_list):
                if density == 0:
                    end = (i + 1) * window_size
                else:
                    if end:
                        out_f.write(
                            '\t'.join([chrom, str(start),
                                       str(end), '0']) + '\n')
                    out_f.write('\t'.join([
                        chrom,
                        str(i * window_size),
                        str(((i + 1) * window_size)),
                        str(density)
                    ]) + '\n')
                    start = (i + 1) * window_size
                    end = None
            if end:
                out_f.write('\t'.join([chrom, str(start),
                                       str(end), '0']) + '\n')
    return mismatches_fpath
Example 12
def is_license_valid(out_dirpath, fasta_fpaths):
    # checking the installation
    err_fpath = os.path.join(out_dirpath, qutils.label_from_fpath_for_fname(fasta_fpaths[0]) + '_genemark.stderr')
    if os.path.isfile(err_fpath):
        with open(err_fpath) as err_f:
            for line in err_f:
                if line.find('license period has ended') != -1:
                    logger.main_info()
                    logger.warning('License period for GeneMark has ended! \n'
                                   'To update license, please visit http://exon.gatech.edu/GeneMark/license_download.cgi page and fill in the form.\n'
                                   'You should choose GeneMarkS tool and your operating system (note that GeneMark is free for non-commercial use).\n'
                                   'Download the license key and replace your ~/.gm_key with the updated version. After that you can restart QUAST.\n')
                    return False
    return True
Example 13
def is_license_valid(out_dirpath, fasta_fpaths):
    # checking the installation
    err_fpath = os.path.join(out_dirpath, qutils.label_from_fpath_for_fname(fasta_fpaths[0]) + '_genemark.stderr')
    if os.path.isfile(err_fpath):
        with open(err_fpath) as err_f:
            for line in err_f:
                if line.find('license period has ended') != -1:
                    logger.main_info()
                    logger.warning('License period for GeneMark has ended! \n'
                                   'To update license, please visit http://exon.gatech.edu/GeneMark/license_download.cgi page and fill in the form.\n'
                                   'You should choose GeneMarkS tool and your operating system (note that GeneMark is free for non-commercial use).\n'
                                   'Download the license key and replace your ~/.gm_key with the updated version. After that you can restart QUAST.\n')
                    return False
    return True
Example 14
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    assembly_label = qutils.label_from_fpath(asm.fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(asm.fpath)
    logger.info('  ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = corr_assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    alignments_fpath = alignments_fpath_template % corr_assembly_label
    if os.path.exists(alignments_fpath):
        with open(alignments_fpath) as f:
            for line in f:
                values = line.split()
                if values[0] in contigs_analyzer.ref_labels_by_chromosomes.keys():
                    ref_name = contigs_analyzer.ref_labels_by_chromosomes[values[0]]
                    ref_contigs_names = values[1:]
                    ref_contigs_fpath = os.path.join(
                        corrected_dirpath, corr_assembly_label + '_to_' + ref_name + '.fasta')
                    if ref_name not in aligned_contigs_for_each_ref:
                        aligned_contigs_for_each_ref[ref_name] = []

                    for (cont_name, seq) in contigs_seq:
                        if cont_name not in contigs:
                            contigs[cont_name] = seq

                        if cont_name in ref_contigs_names and cont_name not in aligned_contigs_for_each_ref[ref_name]:
                            # Collect the names of all aligned contigs so that unaligned ones can be extracted later
                            aligned_contig_names.add(cont_name)
                            aligned_contigs_for_each_ref[ref_name].append(cont_name)
                            fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                    ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                    if ref_asm.name not in added_ref_asm:
                        if ref_name in assemblies_by_ref:
                            assemblies_by_ref[ref_name].append(ref_asm)
                            added_ref_asm.append(ref_asm.name)
        if qconfig.space_efficient:
            os.remove(alignments_fpath)

    # Extract contigs that were not aligned anywhere
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath, [(name, contigs[name]) for name in not_aligned_contigs_names])

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
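
The unaligned contigs above are obtained by a plain set difference; a toy illustration with invented contig names:

contigs = {'c1': 'ACGT', 'c2': 'GGGG', 'c3': 'TTTT'}
aligned_contig_names = {'c1', 'c3'}
not_aligned_contigs_names = set(contigs) - aligned_contig_names
print(sorted(not_aligned_contigs_names))  # ['c2']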
Example 15
def create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath, completeness,
                         len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len):
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    kmc_check_fpath = join(output_dir, label + '.sf')
    kmc_stats_fpath = join(output_dir, label + '.stat')
    with open(kmc_check_fpath, 'w') as check_f:
        check_f.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
        check_f.write("Reference md5 checksum: %s\n" % md5(ref_fpath))
        check_f.write("Used assemblies: %s\n" % ','.join(contigs_fpaths))
    with open(kmc_stats_fpath, 'w') as stats_f:
        stats_f.write("Completeness: %s\n" % completeness)
        if len_map_to_one_chrom or len_map_to_multi_chrom:
            stats_f.write("Length assigned to one chromosome: %d\n" % len_map_to_one_chrom)
            stats_f.write("Length assigned to multi chromosomes: %d\n" % len_map_to_multi_chrom)
            stats_f.write("Length assigned to none chromosome: %d\n" % len_map_to_none_chrom)
            stats_f.write("Total length: %d\n" % total_len)
Example 16
def check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    kmc_check_fpath = join(output_dir, label + '.sf')
    if not exists(kmc_check_fpath):
        return False
    successful_check_content = open(kmc_check_fpath).read().split('\n')
    if len(successful_check_content) < 3:
        return False
    if successful_check_content[0].strip().split()[-1] != str(md5(contigs_fpath)):
        return False
    if successful_check_content[1].strip().split()[-1] != str(md5(ref_fpath)):
        return False
    used_assemblies = successful_check_content[2].strip().split(': ')[-1]
    if used_assemblies and sorted(used_assemblies.split(',')) != sorted(contigs_fpaths):
        return False
    return True
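
md5() above is a QUAST helper; a minimal stand-in that hashes a file's contents with hashlib, plus a write-then-verify round trip mirroring the .sf handshake (file names and contents are invented):

import hashlib

def file_md5(fpath):
    # Minimal stand-in for the md5() helper used above: hash a file's contents.
    h = hashlib.md5()
    with open(fpath, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    return h.hexdigest()

with open('assembly.fasta', 'w') as f:
    f.write('>c1\nACGT\n')
with open('assembly.sf', 'w') as check_f:
    check_f.write('Assembly md5 checksum: %s\n' % file_md5('assembly.fasta'))
first_line = open('assembly.sf').read().split('\n')[0]
print(first_line.strip().split()[-1] == file_md5('assembly.fasta'))  # True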
Example 17
def check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    kmc_check_fpath = join(output_dir, label + '.sf')
    if not exists(kmc_check_fpath):
        return False
    successful_check_content = open(kmc_check_fpath).read().split('\n')
    if len(successful_check_content) < 3:
        return False
    if not successful_check_content[0].strip().endswith(str(getsize(contigs_fpath))):
        return False
    if not successful_check_content[1].strip().endswith(str(getsize(ref_fpath))):
        return False
    used_assemblies = successful_check_content[2].strip().split(': ')[-1]
    if used_assemblies and sorted(used_assemblies.split(',')) != sorted(contigs_fpaths):
        return False
    return True
Example 18
def create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath, completeness,
                         len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len):
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    kmc_check_fpath = join(output_dir, label + '.sf')
    kmc_stats_fpath = join(output_dir, label + '.stat')
    with open(kmc_check_fpath, 'w') as check_f:
        check_f.write("Assembly file size in bytes: %d\n" % getsize(contigs_fpath))
        check_f.write("Reference file size in bytes: %d\n" % getsize(ref_fpath))
        check_f.write("Used assemblies: %s\n" % ','.join(contigs_fpaths))
    with open(kmc_stats_fpath, 'w') as stats_f:
        stats_f.write("Completeness: %s\n" % completeness)
        if len_map_to_one_chrom or len_map_to_multi_chrom:
            stats_f.write("Length assigned to one chromosome: %d\n" % len_map_to_one_chrom)
            stats_f.write("Length assigned to multi chromosomes: %d\n" % len_map_to_multi_chrom)
            stats_f.write("Length assigned to none chromosome: %d\n" % len_map_to_none_chrom)
            stats_f.write("Total length: %d\n" % total_len)
Example 19
def parse_alignments(contigs_fpaths, contig_report_fpath_pattern):
    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        if contig_report_fpath_pattern:
            report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(contigs_fpath)
            aligned_blocks, misassembled_id_to_structure = parse_nucmer_contig_report(report_fpath)
            if aligned_blocks is None:
                continue

            aligned_blocks = check_misassembled_blocks(aligned_blocks, misassembled_id_to_structure, filter_local=True)
            lists_of_aligned_blocks.append(aligned_blocks)

    if lists_of_aligned_blocks:
        max_contigs = max([len(aligned_blocks) for aligned_blocks in lists_of_aligned_blocks])
        return get_assemblies(contigs_fpaths, lists_of_aligned_blocks).assemblies, max_contigs
    else:
        return None, None
Example 20
def create_kmc_stats_file(output_dir, contigs_fpath, ref_fpath, completeness,
                          corr_len, mis_len, undef_len, total_len, translocations, relocations):
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    kmc_check_fpath = join(output_dir, label + '.sf')
    kmc_stats_fpath = join(output_dir, label + '.stat')
    with open(kmc_check_fpath, 'w') as check_f:
        check_f.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
        check_f.write("Reference md5 checksum: %s\n" % md5(ref_fpath))
    with open(kmc_stats_fpath, 'w') as stats_f:
        stats_f.write("Completeness: %s\n" % completeness)
        if corr_len or mis_len:
            stats_f.write("K-mer-based correct length: %d\n" % corr_len)
            stats_f.write("K-mer-based misjoined length: %d\n" % mis_len)
            stats_f.write("K-mer-based undefined length: %d\n" % undef_len)
            stats_f.write("Total length: %d\n" % total_len)
            stats_f.write("# translocations: %d\n" % translocations)
            stats_f.write("# 100 kbp relocations: %d\n" % relocations)
Example 21
def create_jf_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                         completeness, len_map_to_one_chrom,
                         len_map_to_multi_chrom, len_map_to_none_chrom):
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    jf_check_fpath = join(output_dir, label + '.sf')
    jf_stats_fpath = join(output_dir, label + '.stat')
    with open(jf_check_fpath, 'w') as check_f:
        check_f.write("Assembly file size in bytes: %d\n" %
                      getsize(contigs_fpath))
        check_f.write("Reference file size in bytes: %d\n" %
                      getsize(ref_fpath))
        check_f.write("Used assemblies: %s\n" % ','.join(contigs_fpaths))
    with open(jf_stats_fpath, 'w') as stats_f:
        stats_f.write("Completeness: %s\n" % completeness)
        stats_f.write("Length assigned to one chromosome: %d\n" %
                      len_map_to_one_chrom)
        stats_f.write("Length assigned to multi chromosomes: %d\n" %
                      len_map_to_multi_chrom)
        stats_f.write("Length assigned to none chromosome: %d\n" %
                      len_map_to_none_chrom)
Example 22
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    out_fpath = os.path.join(out_dirpath, corr_assembly_label + '_glimmer')
    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_glimmer.stderr')

    #out_gff_path, out_fasta_path, unique, total, cnt = glimmerHMM(tool_dir,
    #    fasta_path, out_path, gene_lengths, err_path)

    out_gff_path, genes, unique, total, cnt = glimmerHMM(tool_dirpath,
        contigs_fpath, out_fpath, gene_lengths, err_fpath, tmp_dirpath, index)

    if out_gff_path:
        logger.info('  ' + qutils.index_to_str(index) + '  Genes = ' + str(unique) + ' unique, ' + str(total) + ' total')
        logger.info('  ' + qutils.index_to_str(index) + '  Predicted genes (GFF): ' + out_gff_path)

    return genes, unique, cnt
Example 23
def create_mismatches_plot(assembly, window_size, ref_len, root_dir, output_dir):
    assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
    aligner_dirpath = join(root_dir, '..', 'contigs_reports')
    coords_basename = join(create_minimap_output_dir(aligner_dirpath), assembly_label)
    _, coords_filtered_fpath, _, _ = get_aux_out_fpaths(coords_basename)
    if not exists(coords_filtered_fpath) or not qconfig.show_snps:
        return None

    mismatches_fpath = join(output_dir, assembly_label + '.mismatches.txt')
    mismatch_density_by_chrom = defaultdict(lambda : [0] * (ref_len // window_size + 1))
    with open(coords_filtered_fpath) as coords_file:
        for line in coords_file:
            s1 = int(line.split('|')[0].split()[0])
            chrom = line.split()[11].strip()
            cigar = line.split()[-1].strip()
            ref_pos = s1
            for op in parse_cs_tag(cigar):
                n_bases = len(op) - 1
                if op.startswith('*'):
                    mismatch_density_by_chrom[chrom][int(ref_pos) // window_size] += 1
                    ref_pos += 1
                elif not op.startswith('+'):
                    ref_pos += n_bases
    with open(mismatches_fpath, 'w') as out_f:
        for chrom, density_list in mismatch_density_by_chrom.items():
            start, end = 0, 0
            for i, density in enumerate(density_list):
                if density == 0:
                    end = (i + 1) * window_size
                else:
                    if end:
                        out_f.write('\t'.join([chrom, str(start), str(end), '0']) + '\n')
                    out_f.write('\t'.join([chrom, str(i * window_size), str(((i + 1) * window_size)), str(density)]) + '\n')
                    start = (i + 1) * window_size
                    end = None
            if end:
                out_f.write('\t'.join([chrom, str(start), str(end), '0']) + '\n')
    return mismatches_fpath
Example 24
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    logger.print_timestamp()
    kmer_len = qconfig.unique_kmer_len
    logger.main_info('Running analysis based on unique ' + str(kmer_len) +
                     '-mers...')

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath,
                                      contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if len(stats_content) < 1:
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(
                reporting.Fields.KMER_COMPLETENESS,
                '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 7:
                corr_len = int(stats_content[1].strip().split(': ')[-1])
                mis_len = int(stats_content[2].strip().split(': ')[-1])
                undef_len = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                translocations = int(stats_content[5].strip().split(': ')[-1])
                relocations = int(stats_content[6].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_CORR_LENGTH,
                                 '%.2f' % (corr_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_MIS_LENGTH,
                                 '%.2f' % (mis_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_UNDEF_LENGTH,
                                 '%.2f' % (undef_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_TRANSLOCATIONS,
                                 translocations)
                report.add_field(reporting.Fields.KMER_RELOCATIONS,
                                 relocations)
                report.add_field(reporting.Fields.KMER_MISASSEMBLIES,
                                 translocations + relocations)
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if len(contigs_fpaths) == 0:
        save_kmers(output_dir)
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC',
                                       ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc',
                                           kmc_dirpath,
                                           'KMC',
                                           platform_specific=True,
                                           is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools',
                                             kmc_dirpath,
                                             'KMC',
                                             platform_specific=True,
                                             is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(
            kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning('  Sorry, can\'t run KMC, skipping...')
        return None

    logger.info('  Running KMC on reference...')
    if not isdir(output_dir):
        os.makedirs(output_dir)
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, kmer_len,
                                    log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath,
                                 err_fpath)
    if not unique_kmers:
        logger.warning('KMC failed, check ' + log_fpath + ' and ' + err_fpath +
                       '. Skipping...')
        return

    logger.info('  Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info('    ' + qutils.index_to_str(id) + assembly_label)

        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, kmer_len,
                                    log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(
            tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath,
            err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath,
                                      log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS,
                         '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('  Analyzing assemblies correctness...')
    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    logger.info('    Downsampling k-mers...')
    ref_kmers, downsampled_kmers_fpath = downsample_kmers(
        tmp_dirpath, ref_fpath, ref_kmc_out_fpath, kmer_len, log_fpath,
        err_fpath)
    for id, (contigs_fpath,
             kmc_db_fpath) in enumerate(zip(contigs_fpaths, kmc_out_fpaths)):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info('    ' + qutils.index_to_str(id) + assembly_label)

        report = reporting.get(contigs_fpath)
        corr_len = None
        mis_len = None
        undef_len = None
        translocations, relocations = None, None
        total_len = 0
        contig_lens = dict()
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)

        if len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning(
                'Reference is too fragmented. Scaffolding accuracy will not be assessed.'
            )
        else:
            corr_len = 0
            mis_len = 0
            kmers_by_contig, kmers_pos_by_contig = align_kmers(
                tmp_dirpath, contigs_fpath, downsampled_kmers_fpath, err_fpath,
                qconfig.max_threads)
            is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
            cyclic_ref_lens = report.get_field(
                reporting.Fields.REFLEN) if is_cyclic else None
            translocations = 0
            relocations = 0
            with open(
                    join(
                        tmp_dirpath,
                        qutils.label_from_fpath_for_fname(contigs_fpath) +
                        '.misjoins.txt'), 'w') as out:
                for contig in kmers_by_contig.keys():
                    contig_markers = []
                    prev_pos, prev_ref_pos, prev_chrom, marker = None, None, None, None
                    for pos, kmer in sorted(zip(kmers_pos_by_contig[contig],
                                                kmers_by_contig[contig]),
                                            key=lambda x: x[0]):
                        ref_chrom, ref_pos = ref_kmers[kmer]
                        if prev_pos and prev_chrom:
                            if prev_chrom == ref_chrom and abs(
                                    abs(pos - prev_pos) /
                                    abs(ref_pos - prev_ref_pos) - 1) <= 0.05:
                                marker = (pos, ref_pos, ref_chrom)
                            elif marker:
                                contig_markers.append(marker)
                                pos, ref_pos, ref_chrom, marker = None, None, None, None
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if marker:
                        contig_markers.append(marker)
                    prev_pos, prev_ref_pos, prev_chrom = None, None, None
                    is_misassembled = False
                    for marker in contig_markers:
                        pos, ref_pos, ref_chrom = marker
                        if prev_pos and prev_chrom:
                            if ref_chrom != prev_chrom:
                                translocations += 1
                                out.write(
                                    'Translocation in %s: %s %d | %s %d\n' %
                                    (contig, prev_chrom, prev_pos, ref_chrom,
                                     pos))
                                is_misassembled = True
                            elif _get_dist_inconstistency(
                                    pos, prev_pos, ref_pos, prev_ref_pos,
                                    cyclic_ref_lens) > EXT_RELOCATION_SIZE:
                                relocations += 1
                                out.write(
                                    'Relocation in %s: %d (%d) | %d (%d)\n' %
                                    (contig, prev_pos, prev_ref_pos, pos,
                                     ref_pos))
                                is_misassembled = True
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if is_misassembled:
                        mis_len += contig_lens[contig]
                    elif len(contig_markers) > 0:
                        corr_len += contig_lens[contig]
            undef_len = total_len - corr_len - mis_len
            report.add_field(reporting.Fields.KMER_CORR_LENGTH,
                             '%.2f' % (corr_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_MIS_LENGTH,
                             '%.2f' % (mis_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_UNDEF_LENGTH,
                             '%.2f' % (undef_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_TRANSLOCATIONS,
                             translocations)
            report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
            report.add_field(reporting.Fields.KMER_MISASSEMBLIES,
                             translocations + relocations)

        create_kmc_stats_file(
            output_dir, contigs_fpath, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS), corr_len,
            mis_len, undef_len, total_len, translocations, relocations)
    save_kmers(output_dir)
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
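
The marker walk above classifies consecutive k-mer anchors as translocations (chromosome changes) or relocations (large distance inconsistencies). A toy version with invented markers and threshold; note that _get_dist_inconstistency in the original also handles circular references, while this sketch uses a plain absolute difference:

EXT_RELOCATION_SIZE = 100000
markers = [(10, 500, 'chr1'), (1010, 1500, 'chr1'), (2010, 900000, 'chr1'), (3010, 5000, 'chr2')]

translocations = relocations = 0
prev_pos, prev_ref_pos, prev_chrom = None, None, None
for pos, ref_pos, ref_chrom in markers:
    if prev_pos is not None:
        if ref_chrom != prev_chrom:
            translocations += 1
        elif abs((ref_pos - prev_ref_pos) - (pos - prev_pos)) > EXT_RELOCATION_SIZE:
            relocations += 1
    prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom

print(translocations, relocations)  # 1 1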
Example 25
def do(contigs_fpaths, output_dir, logger):
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus',
                          augustus_dirpath, [join('bin', 'augustus')],
                          logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger,
                                is_prokaryote=qconfig.prokaryote,
                                is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    config_fpath = make_config(output_dir, tmp_dir, busco_threads,
                               clade_dirpath, augustus_dirpath)
    logger.info('Logs and results will be saved under ' + output_dir + '...')

    os.environ['BUSCO_CONFIG_FILE'] = config_fpath
    os.environ['AUGUSTUS_CONFIG_PATH'] = copy_augustus_contigs(
        augustus_dirpath, tmp_dir)
    if not os.environ['AUGUSTUS_CONFIG_PATH']:
        logger.error(
            'Augustus configs not found, failed to run BUSCO without them.')
    busco_args = [[
        contigs_fpath,
        qutils.label_from_fpath_for_fname(contigs_fpath)
    ] for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco_main_handler, busco_args,
                                  qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error(
            'Failed running BUSCO for all the assemblies. See log files in ' +
            output_dir + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)

        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' %
                                  (float(part_buscos) * 100.0 / total_buscos)))
            shutil.copy(summary_fpaths[i], output_dir)
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath +
                         '. See the log for detailed information.')
    if not qconfig.debug:
        cleanup(output_dir)
    logger.info('Done.')
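
The BUSCO short summary is parsed with simple substring checks; a toy parse over invented summary lines of the same shape:

summary = [
    '    120\tComplete BUSCOs (C)',
    '    10\tFragmented BUSCOs (F)',
    '    148\tTotal BUSCO groups searched',
]
total_buscos = part_buscos = complete_buscos = 0
for line in summary:
    if 'Complete BUSCOs' in line:
        complete_buscos = int(line.split()[0])
    elif 'Fragmented' in line:
        part_buscos = int(line.split()[0])
    elif 'Total' in line:
        total_buscos = int(line.split()[0])
print('%.2f' % (complete_buscos * 100.0 / total_buscos))  # 81.08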
Example 26
def process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                        reference_chromosomes, ns_by_chromosomes, containers):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = defaultdict(int)
    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    coords_base_fpath = os.path.join(coords_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        coords_fpath = coords_base_fpath
    else:
        coords_fpath = coords_base_fpath + '.filtered'

    if not os.path.isfile(coords_fpath):
        logger.error('File with alignment coords (' + coords_fpath + ') not found! Try to restart QUAST.',
            indent='  ')
        return None, None

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    sorted_contig_tuples = sorted(enumerate(contig_tuples), key=lambda x: len(x[1][1]), reverse=True)
    sorted_contigs_names = []
    contigs_order = []
    for idx, (name, _) in sorted_contig_tuples:
        sorted_contigs_names.append(name)
        contigs_order.append(idx)

    features_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {} # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(containers)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []
    with open(coords_fpath) as coordfile:
        for line in coordfile:
            s1 = int(line.split('|')[0].split()[0])
            e1 = int(line.split('|')[0].split()[1])
            s2 = int(line.split('|')[1].split()[0])
            e2 = int(line.split('|')[1].split()[1])
            contig_name = line.split()[12].strip()
            chr_name = line.split()[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + coords_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                return None, None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1,
                                                                               contig=contig_name, start_in_contig=s2, end_in_contig=e2))
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1

    for chr_name in genome_mapping.keys():
        for i in ns_by_chromosomes[chr_name]:
            genome_mapping[chr_name][i] = 0
        ref_lengths[chr_name] = sum(genome_mapping[chr_name])

    if qconfig.space_efficient and coords_fpath.endswith('.filtered'):
        os.remove(coords_fpath)

    # counting genome coverage and number of gaps
    gaps_count = 0
    if qconfig.analyze_gaps:
        gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt') if not qconfig.space_efficient else '/dev/null'
        with open(gaps_fpath, 'w') as gaps_file:
            for chr_name, chr_len in reference_chromosomes.items():
                gaps_file.write(chr_name + '\n')
                cur_gap_size = 0
                for i in range(1, chr_len + 1):
                    if genome_mapping[chr_name][i] == 1 or i in ns_by_chromosomes[chr_name]:
                        if cur_gap_size >= qconfig.min_gap_size:
                            gaps_count += 1
                            gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                        cur_gap_size = 0
                    else:
                        cur_gap_size += 1
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["gaps_count"] = gaps_count
    results[reporting.Fields.GENES + "_full"] = None
    results[reporting.Fields.GENES + "_partial"] = None
    results[reporting.Fields.OPERONS + "_full"] = None
    results[reporting.Fields.OPERONS + "_partial"] = None

    # finding genes and operons
    for container in containers:
        if not container.region_list:
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_genomic_features_' + container.kind.lower() + '.txt')
        found_file = open(found_fpath, 'w')
        found_file.write('%s\t\t%s\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type', 'Contig'))
        found_file.write('=' * 50 + '\n')

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            gene_blocks = []
            if region.id is None:
                region.id = '# ' + str(region.number + 1)
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if cur_block.seqname != region.seqname:
                        continue
                    if region.end <= cur_block.start or cur_block.end <= region.start:
                        continue
                    elif cur_block.start <= region.start and region.end <= cur_block.end:
                        if found_list[i] == 2:  # already found as partial gene
                            total_partial -= 1
                        found_list[i] = 1
                        total_full += 1
                        contig_info = cur_block.format_gene_info(region)
                        found_file.write('%s\t\t%d\t%d\tcomplete\t%s\n' % (region.id, region.start, region.end, contig_info))
                        if container.kind == 'operon':
                            operons_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig
                        else:
                            features_in_contigs[contig_id] += 1

                        cur_feature_is_found = True
                        break
                    elif min(region.end, cur_block.end) - max(region.start, cur_block.start) >= qconfig.min_gene_overlap:
                        if found_list[i] == 0:
                            found_list[i] = 2
                            total_partial += 1
                        gene_blocks.append(cur_block)
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break
            # adding info about partially found genes/operons
            if found_list[i] == 2:  # partial gene/operon
                contig_info = ','.join([block.format_gene_info(region) for block in sorted(gene_blocks, key=lambda block: block.start)])
                found_file.write('%s\t\t%d\t%d\tpartial\t%s\n' % (region.id, region.start, region.end, contig_info))

        if container.kind == 'operon':
            results[reporting.Fields.OPERONS + "_full"] = total_full
            results[reporting.Fields.OPERONS + "_partial"] = total_partial
        else:
            if results[reporting.Fields.GENES + "_full"] is None:
                results[reporting.Fields.GENES + "_full"] = 0
                results[reporting.Fields.GENES + "_partial"] = 0
            results[reporting.Fields.GENES + "_full"] += total_full
            results[reporting.Fields.GENES + "_partial"] += total_partial
        found_file.close()

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    unsorted_features_in_contigs = [features_in_contigs[idx] for idx in contigs_order]
    unsorted_operons_in_contigs = [operons_in_contigs[idx] for idx in contigs_order]

    return ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs)
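
Gap counting above scans a 1-based coverage mask and counts uncovered stretches of at least min_gap_size; a toy, self-contained version with invented alignment intervals:

chr_len, min_gap_size = 20, 3
genome_mapping = [0] * (chr_len + 1)          # 1-based positions, index 0 unused
for s1, e1 in [(1, 5), (10, 12), (16, 20)]:   # invented alignment intervals
    for i in range(s1, e1 + 1):
        genome_mapping[i] = 1

gaps_count, cur_gap_size = 0, 0
for i in range(1, chr_len + 1):
    if genome_mapping[i] == 1:
        if cur_gap_size >= min_gap_size:
            gaps_count += 1
        cur_gap_size = 0
    else:
        cur_gap_size += 1
if cur_gap_size >= min_gap_size:
    gaps_count += 1
print(gaps_count)  # 2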
Example 27
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath, cov_fpath=None,  physical_cov_fpath=None,
       stdout_pattern=None, find_similar=True, features=None, json_output_dir=None, genes_by_labels=None):
    make_output_dir(output_dirpath)

    lists_of_aligned_blocks = []
    contigs_by_assemblies = OrderedDict()
    structures_by_labels = {}
    ambiguity_alignments_by_labels = {}

    total_genome_size = 0
    reference_chromosomes = OrderedDict()
    contig_names_by_refs = None
    assemblies = None
    chr_names = []
    features_data = None

    plot_fpath = None
    max_small_chromosomes = 10

    if ref_fpath:
        for name, seq in fastaparser.read_fasta(ref_fpath):
            chr_name = name.split()[0]
            chr_names.append(chr_name)
            chr_len = len(seq)
            total_genome_size += chr_len
            reference_chromosomes[chr_name] = chr_len
        virtual_genome_shift = 100
        sorted_ref_names = sorted(reference_chromosomes, key=reference_chromosomes.get, reverse=True)
        sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)
        cumulative_ref_lengths = [0]
        if ref_labels_by_chromosomes:
            contig_names_by_refs = ref_labels_by_chromosomes
        elif sum(reference_chromosomes.values()) > qconfig.MAX_SIZE_FOR_COMB_PLOT:
            contig_names_by_refs = dict()
            if len(chr_names) > max_small_chromosomes:
                summary_len = 0
                num_parts = 1
                html_name = qconfig.alignment_viewer_part_name + str(num_parts)
                for chr_name, chr_len in reference_chromosomes.items():
                    summary_len += chr_len
                    contig_names_by_refs[chr_name] = html_name
                    if summary_len >= qconfig.MAX_SIZE_FOR_COMB_PLOT:
                        summary_len = 0
                        num_parts += 1
                        html_name = qconfig.alignment_viewer_part_name + str(num_parts)
            else:
                for chr_name in chr_names:
                    contig_names_by_refs[chr_name] = chr_name

        for i, chr in enumerate(chr_names):
            chr_length = reference_chromosomes[chr]
            len_to_append = cumulative_ref_lengths[-1] + chr_length
            if contig_names_by_refs:
                if i < len(chr_names) - 1 and contig_names_by_refs[chr] != contig_names_by_refs[chr_names[i + 1]]:
                    len_to_append = 0
            cumulative_ref_lengths.append(len_to_append)
        virtual_genome_size = sum(reference_chromosomes.values()) + virtual_genome_shift * (len(reference_chromosomes.values()) - 1)

    for contigs_fpath in contigs_fpaths:
        label = qconfig.assembly_labels_by_fpath[contigs_fpath]
        if not contig_report_fpath_pattern:
            contigs = parse_contigs_fpath(contigs_fpath)
        else:
            report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(contigs_fpath)
            aligned_blocks, misassembled_id_to_structure, contigs, ambiguity_alignments = parse_nucmer_contig_report(report_fpath,
                                                                        list(reference_chromosomes.keys()), cumulative_ref_lengths)
            if not contigs:
                contigs = parse_contigs_fpath(contigs_fpath)
            if aligned_blocks is None:
                return None
            for block in aligned_blocks:
                block.label = label
            aligned_blocks = check_misassembled_blocks(aligned_blocks, misassembled_id_to_structure)
            lists_of_aligned_blocks.append(aligned_blocks)
            structures_by_labels[label] = misassembled_id_to_structure
            if qconfig.ambiguity_usage == 'all':
                ambiguity_alignments_by_labels[label] = ambiguity_alignments
        contigs_by_assemblies[label] = contigs

    if contigs_fpaths and ref_fpath and features:
        features_data = parse_features_data(features, cumulative_ref_lengths, chr_names)
    if contigs_fpaths and qconfig.gene_finding:
        parse_genes_data(contigs_by_assemblies, genes_by_labels)
    if reference_chromosomes and lists_of_aligned_blocks:
        assemblies = get_assemblies(contigs_fpaths, virtual_genome_size, lists_of_aligned_blocks, find_similar)
        if qconfig.draw_svg:
            plot_fpath = draw_alignment_plot(assemblies, virtual_genome_size, output_dirpath, sorted_ref_names, sorted_ref_lengths, virtual_genome_shift)
    if (assemblies or contigs_by_assemblies) and qconfig.create_icarus_html:
        icarus_html_fpath = js_data_gen(assemblies, contigs_fpaths, reference_chromosomes,
                    output_dirpath, structures_by_labels, contig_names_by_refs=contig_names_by_refs, ref_fpath=ref_fpath, stdout_pattern=stdout_pattern,
                    ambiguity_alignments_by_labels=ambiguity_alignments_by_labels, contigs_by_assemblies=contigs_by_assemblies,
                    features_data=features_data, cov_fpath=cov_fpath, physical_cov_fpath=physical_cov_fpath, json_output_dir=json_output_dir)
    else:
        icarus_html_fpath = None

    return icarus_html_fpath, plot_fpath
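
The cumulative_ref_lengths logic above places all reference chromosomes on one virtual axis so that alignments can be drawn on a single combined plot. A simplified, illustrative sketch of that offset idea (build_offsets is a hypothetical helper, and it ignores the per-part reset done in the real code):

VIRTUAL_GENOME_SHIFT = 100  # gap inserted between chromosomes, as in the code above

def build_offsets(reference_chromosomes):
    """reference_chromosomes: dict of chromosome name -> length (insertion order kept)."""
    offsets = {}
    cumulative = 0
    for name, length in reference_chromosomes.items():
        offsets[name] = cumulative
        cumulative += length + VIRTUAL_GENOME_SHIFT
    return offsets

offsets = build_offsets({'chr1': 5000, 'chr2': 3000})
print(offsets['chr2'] + 250)  # position 250 on chr2 maps to 5350 on the virtual axis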
Esempio n. 28
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                         'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                         ' to the reference (non-zero exit code). ' +
                         ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ': ' + coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    coords_file = open(coords_fpath)
    coords_filtered_file = open(coords_filtered_fpath, 'w')
    coords_filtered_file.write(coords_file.readline())
    coords_filtered_file.write(coords_file.readline())
    for line in coords_file:
        if line.strip() == '':
            break
        assert line[0] != '='
        # Parse each remaining nucmer coords line and group the mappings by contig name
        mapping = Mapping.from_line(line)
        aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n') # TODO: move up
    ref_lens = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        ref_lens[name] = len(seq)
        log_out_f.write('\tLoaded [%s]\n' % name)

    # Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        prev_line = None
        for line in open_gzipsafe(show_snps_fpath):
            #print "$line";
            line = line.split()
            if not line[0].isdigit():
                continue
            if prev_line and line == prev_line:
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(line[0])  # Kolya: Python doesn't convert between int and str automatically
            loc = int(line[3])  # Kolya: same as above

            # if (! exists $line[11]) { die "Malformed line in SNP file.  Please check that show-snps has completed succesfully.\n$line\n[$line[9]][$line[10]][$line[11]]\n"; }
            if pos in snps.setdefault(ref, {}).setdefault(ctg, {}):
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos].append(SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            else:
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos] = [SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])]
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in ref_lens.items():
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)

    # if qconfig.large_genome:
    #     log_out_f.write('Analyzing large blocks...\n')
    #     large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null'
    #     ca_large_output = CAOutput(stdout_f=log_out_f, misassembly_f=open(large_misassembly_fpath, 'w'),
    #                                coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'), icarus_out_f=open('/dev/null', 'w'))
    #     min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD
    #     result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null',
    #                                   aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0])
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(contig)[0][0]
                                contig_cov = len_cov_pattern.findall(contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
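
In the combined-reference branch above, contig names following the SPAdes-style '_length_<L>_cov_<C>' convention are parsed with a regular expression, and a contig counts as unique when more than 90% of its length is aligned to one reference. A hedged sketch of just that check (is_unique_enough is an illustrative helper, not QUAST code):

import re

LEN_COV_PATTERN = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')

def is_unique_enough(contig_name, aligned_len, min_fraction=0.9):
    match = LEN_COV_PATTERN.search(contig_name)
    if not match:
        return None  # name does not follow the length/coverage convention
    contig_len, contig_cov = float(match.group(1)), float(match.group(2))
    return aligned_len / contig_len > min_fraction, contig_cov

print(is_unique_enough('NODE_0_length_6088_cov_12.5', aligned_len=6000))  # (True, 12.5)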
Esempio n. 29
0
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = {}
    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
            indent='  ')
        return None

    coordfile = open(nucmer_fpath, 'r')
    for line in coordfile:
        if line.startswith('='):
            break

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names) # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {} # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(genes_container.region_list) or len(operons_container.region_list)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []
    for line in coordfile:
        if line.strip() == '':
            break
        s1 = int(line.split('|')[0].split()[0])
        e1 = int(line.split('|')[0].split()[1])
        s2 = int(line.split('|')[1].split()[0])
        e2 = int(line.split('|')[1].split()[1])
        contig_name = line.split()[12].strip()
        chr_name = line.split()[11].strip()

        if chr_name not in genome_mapping:
            logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                         "differ from the names in the reference. Try to remove the file and restart QUAST.")
            return None

        if gene_searching_enabled:
            aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1))
        if s2 == 0 and e2 == 0:  # special case: circular genome, the contig starts at the end of a chromosome and wraps around to its beginning
            for i in range(s1, len(genome_mapping[chr_name])):
                genome_mapping[chr_name][i] = 1
            for i in range(1, e1 + 1):
                genome_mapping[chr_name][i] = 1
        else: #if s1 <= e1:
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1
    coordfile.close()
    if qconfig.space_efficient and nucmer_fpath.endswith('.filtered'):
        os.remove(nucmer_fpath)

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt') if not qconfig.space_efficient else '/dev/null'
    gaps_file = open(gaps_fpath, 'w')
    for chr_name, chr_len in reference_chromosomes.items():
        gaps_file.write(chr_name + '\n')
        cur_gap_size = 0
        aligned_len = 0
        for i in range(1, chr_len + 1):
            if genome_mapping[chr_name][i] == 1:
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                aligned_len += 1
                covered_bp += 1
                cur_gap_size = 0
            else:
                cur_gap_size += 1
        ref_lengths[chr_name] = aligned_len
        if cur_gap_size >= qconfig.min_gap_size:
            gaps_count += 1
            gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')
    gaps_file.close()

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
        (genes_container,
         genes_in_contigs,
         reporting.Fields.GENES,
         '_genes.txt'),

        (operons_container,
         operons_in_contigs,
         reporting.Fields.OPERONS,
         '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + suffix)
        found_file = open(found_fpath, 'w')
        found_file.write('%s\t\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type'))
        found_file.write('=========================================\n')

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if container.chr_names_dict[region.seqname] != cur_block.seqname:
                        continue

                    # computing circular genomes
                    if cur_block.start > cur_block.end:
                        blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                  AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                    else:
                        blocks = [cur_block]

                    for block in blocks:
                        if region.end <= block.start or block.end <= region.start:
                            continue
                        elif block.start <= region.start and region.end <= block.end:
                            if found_list[i] == 2:  # already found as partial gene
                                total_partial -= 1
                            found_list[i] = 1
                            total_full += 1
                            region_id = str(region.id)
                            if region_id == 'None':
                                region_id = '# ' + str(region.number + 1)
                            found_file.write('%s\t\t%d\t%d\tcomplete\n' % (region_id, region.start, region.end))
                            feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                            cur_feature_is_found = True
                            break
                        elif found_list[i] == 0 and min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                            found_list[i] = 2
                            total_partial += 1
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break
            # adding info about partially found genes/operons
            if found_list[i] == 2:  # partial gene/operon
                region_id = str(region.id)
                if region_id == 'None':
                    region_id = '# ' + str(region.number + 1)
                found_file.write('%s\t\t%d\t%d\tpartial\n' % (region_id, region.start, region.end))

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial
        found_file.close()

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')

    return ref_lengths, (results, genes_in_contigs, operons_in_contigs)
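
The gap counting above walks a 1-based 0/1 coverage array per chromosome and reports maximal uncovered runs of at least qconfig.min_gap_size bases. A standalone sketch of the same walk (find_gaps and MIN_GAP_SIZE are illustrative names):

MIN_GAP_SIZE = 50  # assumed threshold, analogous to qconfig.min_gap_size

def find_gaps(mapping, chr_len):
    covered_bp, gaps, cur_gap = 0, [], 0
    for i in range(1, chr_len + 1):
        if mapping[i]:
            if cur_gap >= MIN_GAP_SIZE:
                gaps.append((i - cur_gap, i - 1))
            covered_bp += 1
            cur_gap = 0
        else:
            cur_gap += 1
    if cur_gap >= MIN_GAP_SIZE:  # a gap running to the end of the chromosome
        gaps.append((chr_len - cur_gap + 1, chr_len))
    return covered_bp, gaps

mapping = [0] * 201           # index 0 unused, positions 1..200
for i in range(1, 101):
    mapping[i] = 1            # first 100 bp covered, last 100 bp form one gap
print(find_gaps(mapping, 200))  # (100, [(101, 200)])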
Esempio n. 30
0
def do(contigs_fpaths, output_dir, logger):
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus',
                          augustus_dirpath, [join('bin', 'augustus')],
                          logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger,
                                is_prokaryote=qconfig.prokaryote,
                                is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    config_fpath = make_config(output_dir, tmp_dir, busco_threads,
                               clade_dirpath, augustus_dirpath)
    logger.info('Logs and results will be saved under ' + output_dir + '...')

    os.environ['BUSCO_CONFIG_FILE'] = config_fpath
    os.environ['AUGUSTUS_CONFIG_PATH'] = copy_augustus_configs(
        augustus_dirpath, tmp_dir)
    if not os.environ['AUGUSTUS_CONFIG_PATH']:
        logger.error(
            'Augustus configs not found, failed to run BUSCO without them.')
    busco_args = [[
        contigs_fpath,
        qutils.label_from_fpath_for_fname(contigs_fpath)
    ] for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco_main_handler, busco_args,
                                  qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error(
            'Failed running BUSCO for all the assemblies. See log files in ' +
            output_dir + ' for information '
            '(rerun with --debug to keep all intermediate files).')
        return

    # saving results
    zero_output_for_all = True
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)

        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' %
                                  (float(part_buscos) * 100.0 / total_buscos)))
            if complete_buscos + part_buscos > 0:
                zero_output_for_all = False
            shutil.copy(summary_fpaths[i], output_dir)
        else:
            logger.error(
                'Failed running BUSCO for ' + contigs_fpath +
                '. See the log for detailed information'
                ' (rerun with --debug to keep all intermediate files).')
    if zero_output_for_all:
        logger.warning(
            'BUSCO did not fail explicitly but found nothing for all assemblies! '
            'Possible reasons and workarounds:\n'
            '  1. The provided assemblies are so small that they do not contain even a single partial BUSCO gene. This is unlikely but possible; if so, there is nothing to worry about.\n'
            '  2. An incorrect lineage database was used. To run with the fungi DB use --fungus, with the eukaryota DB use --eukaryote; otherwise BUSCO uses the bacteria DB.\n'
            '  3. Problem with BUSCO dependencies, most likely Augustus. Check that the binaries in '
            + augustus_dirpath + '/bin/ are working properly.\n'
            '     If something is wrong with Augustus, you may try to install it yourself (https://github.com/Gaius-Augustus/Augustus) and add "augustus" binary to PATH.\n'
            '  4. Some other problem with BUSCO. Check the logs (you may need to rerun QUAST with --debug to see all intermediate files).\n'
            '     If you cannot solve the problem yourself, post an issue at https://github.com/ablab/quast/issues or write to [email protected]'
        )
    if not qconfig.debug:
        cleanup(output_dir)
    logger.info('Done.')
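
The result-saving loop above extracts counts from BUSCO's short-summary file by matching the keywords 'Complete BUSCOs', 'Fragmented', and 'Total'. A hedged sketch of that parsing step, with made-up sample lines:

def parse_busco_summary(lines):
    complete = fragmented = total = 0
    for line in lines:
        if 'Complete BUSCOs' in line:
            complete = int(line.split()[0])
        elif 'Fragmented' in line:
            fragmented = int(line.split()[0])
        elif 'Total' in line:
            total = int(line.split()[0])
    if total == 0:
        return None
    return 100.0 * complete / total, 100.0 * fragmented / total

sample = ['  120 Complete BUSCOs (C)', '  10  Fragmented BUSCOs (F)',
          '  148 Total BUSCO groups searched']
print(['%.2f' % v for v in parse_busco_summary(sample)])  # ['81.08', '6.76']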
Esempio n. 31
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique 101-mers...')
    addsitedir(jellyfish_python_dirpath)
    try:
        compile_jellyfish(logger)
        import jellyfish
        try:
            import imp
            imp.reload(jellyfish)
        except:
            reload(jellyfish)
        jellyfish.MerDNA.k(KMERS_LEN)
    except:
        logger.warning('Failed unique 101-mers analysis.')
        return

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_jf_successful_check(output_dir, contigs_fpath, contigs_fpaths,
                                     ref_fpath):
            jf_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(jf_stats_fpath).read().split('\n')
            if len(stats_content) < 4:
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(
                reporting.Fields.KMER_COMPLETENESS,
                '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                '%.2f' % float(stats_content[1].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                '%.2f' % float(stats_content[2].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                '%.2f' % float(stats_content[3].strip().split(': ')[-1]))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    logger.info('Running Jellyfish on reference...')
    jf_out_fpath = join(output_dir, basename(ref_fpath) + '.jf')
    qutils.call_subprocess([
        jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
        str(getsize(ref_fpath)), '-o', jf_out_fpath, '-t',
        str(qconfig.max_threads), ref_fpath
    ])
    ref_kmers = jellyfish.ReadMerFile(jf_out_fpath)
    os.remove(jf_out_fpath)

    logger.info('Running Jellyfish on assemblies...')
    contigs_kmers = []
    for contigs_fpath in contigs_fpaths:
        jf_out_fpath = join(output_dir, basename(contigs_fpath) + '.jf')
        qutils.call_subprocess([
            jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
            str(getsize(contigs_fpath)), '-o', jf_out_fpath, '-t',
            str(qconfig.max_threads), contigs_fpath
        ])
        contigs_kmers.append(jellyfish.QueryMerFile(jf_out_fpath))
        os.remove(jf_out_fpath)

    logger.info('Analyzing completeness and accuracy of assemblies...')
    unique_kmers = 0
    matched_kmers = defaultdict(int)
    shared_kmers = set()
    kmer_i = 0
    for kmer, count in ref_kmers:
        unique_kmers += 1
        matches = 0
        for idx in range(len(contigs_fpaths)):
            if contigs_kmers[idx][kmer]:
                matched_kmers[idx] += 1
                matches += 1
        if matches == len(contigs_fpaths):
            if kmer_i % 100 == 0:
                shared_kmers.add(str(kmer))
            kmer_i += 1

    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        completeness = matched_kmers[idx] * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS,
                         '%.2f' % completeness)

    shared_kmers_by_chrom = dict()
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    for name, seq in ref_contigs.items():
        seq_kmers = jellyfish.string_mers(seq)
        for kmer in seq_kmers:
            if str(kmer) in shared_kmers:
                shared_kmers_by_chrom[str(kmer)] = name

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = 0
        len_map_to_multi_chrom = 0
        total_len = 0

        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            seq_kmers = jellyfish.string_mers(seq)
            chrom_markers = []
            for kmer in seq_kmers:
                kmer_str = str(kmer)
                if kmer_str in shared_kmers_by_chrom:
                    chrom = shared_kmers_by_chrom[kmer_str]
                    chrom_markers.append(chrom)
            if len(chrom_markers) < MIN_MARKERS:
                continue
            if len(set(chrom_markers)) == 1:
                len_map_to_one_chrom += len(seq)
            else:
                len_map_to_multi_chrom += len(seq)

        len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                         '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                         '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                         '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_jf_stats_file(
            output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS),
            len_map_to_one_chrom, len_map_to_multi_chrom,
            len_map_to_none_chrom)

    logger.info('Done.')
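
The final classification above assigns each contig to one chromosome, multiple chromosomes, or none, based on how many shared 101-mer markers it carries and how many distinct chromosomes they point to. A simplified sketch of that decision (classify_by_markers is an illustrative helper; MIN_MARKERS mirrors the constant used above, but its value here is assumed):

MIN_MARKERS = 10  # assumed value for illustration only

def classify_by_markers(contig_len, chrom_markers):
    """chrom_markers: list of chromosome names hit by the contig's shared k-mers."""
    if len(chrom_markers) < MIN_MARKERS:
        return 'none', contig_len
    if len(set(chrom_markers)) == 1:
        return 'one', contig_len
    return 'multi', contig_len

print(classify_by_markers(5000, ['chr1'] * 20))             # ('one', 5000)
print(classify_by_markers(5000, ['chr1'] * 12 + ['chr2']))  # ('multi', 5000)
print(classify_by_markers(5000, ['chr1'] * 3))              # ('none', 5000)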
Esempio n. 32
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads=1):
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads,
                           log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                         'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                         ' to the reference (non-zero exit code). ' +
                         ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ': ' + coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n') # TODO: move up
    ref_features = {}

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {'SNPs': total_indels_info.mismatches, 'indels_list': total_indels_info.indels_list, 'total_aligned_bases': total_aligned_bases}
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(contig)[0][0]
                                contig_cov = len_cov_pattern.findall(contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
Esempio n. 33
0
def do(contigs_fpaths, output_dir, logger):
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    set_augustus_dir(augustus_dirpath)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote, is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    log_fpath = join(output_dir, 'busco.log')
    logger.info('Logging to ' + log_fpath + '...')
    busco_args = [(['-i', contigs_fpath, '-o', qutils.label_from_fpath_for_fname(contigs_fpath), '-l', clade_dirpath,
                    '-m', 'genome', '-f', '-z', '-c', str(busco_threads), '-t', tmp_dir,
                    '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' + join(augustus_dirpath, 'config') + '\'' ], output_dir)
                    for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See ' + log_fpath + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)

        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(reporting.Fields.BUSCO_COMPLETE, ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART, ('%.2f' % (float(part_buscos) * 100.0 / total_buscos)))
        else:
            logger.error(
                'Failed running BUSCO for ' + contigs_fpath + '. See ' + log_fpath + ' for information.')
    logger.info('Done.')
Esempio n. 34
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if len(stats_content) < 1:
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning('  Sorry, can\'t run KMC, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    kmer_fraction = 0.001

    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    ref_kmc_dbs = []

    if len(ref_contigs) <= MAX_REF_CONTIGS_NUM:
        shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, ref_fpath, shared_kmc_db, log_fpath, err_fpath, kmer_fraction=kmer_fraction)
        for name, seq in read_fasta(ref_fpath):
            seq_kmc_db = seq_to_kmc_db(tmp_dirpath, log_fpath, err_fpath, seq=seq, name=name, is_ref=True,
                                                     intersect_with=shared_downsampled_kmc_db)
            ref_kmc_dbs.append((name, seq_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        list_files_fpath = join(tmp_dirpath, label + '_files.txt')
        with open(list_files_fpath, 'w') as list_files:
            for name, seq in read_fasta(contigs_fpath):
                total_len += len(seq)
                contig_lens[name] = len(seq)
                if len(seq) >= MIN_CONTIGS_LEN:
                    long_contigs.append(len(seq))
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    with open(tmp_contig_fpath, 'w') as out_f:
                        out_f.write('>%s\n' % name)
                        out_f.write('%s\n' % seq)
                    list_files.write(tmp_contig_fpath + '\n')

        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            filtered_fpath = join(tmp_dirpath, label + '.filtered.fasta')
            filter_contigs(list_files_fpath, filtered_fpath, shared_kmc_db, log_fpath, err_fpath, min_kmers=MIN_MARKERS)
            filtered_list_files_fpath = join(tmp_dirpath, label + '_files.filtered.txt')
            with open(filtered_list_files_fpath, 'w') as list_files:
                for name, _ in read_fasta(filtered_fpath):
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    list_files.write(tmp_contig_fpath + '\n')
            for ref_name, ref_kmc_db in ref_kmc_dbs:
                tmp_filtered_fpath = join(tmp_dirpath, ref_name + '.filtered.fasta')
                filter_contigs(filtered_list_files_fpath, tmp_filtered_fpath, ref_kmc_db, log_fpath, err_fpath, min_kmers=MIN_MISJOIN_MARKERS)
                if exists(tmp_filtered_fpath):
                    for name, _ in read_fasta(tmp_filtered_fpath):
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                             report.get_field(reporting.Fields.KMER_COMPLETENESS),
                             len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
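
When cached results exist, the loop above re-reads a '<label>.stat' file whose lines hold 'key: value' pairs: k-mer completeness first, then the three length classes and the total length. A hedged sketch of reading that layout back (the key names in the sample are placeholders; only the line order and the ': ' separator mirror the code above):

def parse_kmc_stats(text):
    lines = [l for l in text.split('\n') if l.strip()]
    completeness = float(lines[0].strip().split(': ')[-1])
    if len(lines) < 5:
        return completeness, None  # only completeness was cached
    one_len, multi_len, none_len, total_len = (int(l.strip().split(': ')[-1]) for l in lines[1:5])
    return completeness, (one_len * 100.0 / total_len,
                          multi_len * 100.0 / total_len,
                          none_len * 100.0 / total_len)

sample = ('completeness: 97.53\none chromosome: 850000\nmulti chromosome: 50000\n'
          'none chromosome: 100000\ntotal length: 1000000')
print(parse_kmc_stats(sample))  # (97.53, (85.0, 5.0, 10.0))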
Esempio n. 35
0
def do(ref_fpath, contigs_fpaths, output_dirpath):
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    # suffixes for files with report tables in plain text and tab separated formats
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(gage_dirpath, 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs', 'Indels < 5bp',
               'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG, reporting.Fields.GAGE_MAXCONTIG, 
                            reporting.Fields.GAGE_N50, reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES, reporting.Fields.GAGE_MISSINGASMBLYBASES, 
                            reporting.Fields.GAGE_MISSINGASMBLYCONTIGS, reporting.Fields.GAGE_DUPREFBASES, 
                            reporting.Fields.GAGE_COMPRESSEDREFBASES, reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY, 
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS, reporting.Fields.GAGE_LONGINDELS, 
                            reporting.Fields.GAGE_INVERSIONS, reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION, 
                            reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE, reporting.Fields.GAGE_MINCORCONTIG, 
                            reporting.Fields.GAGE_MAXCORCOTING, reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    if not compile_aligner(logger) or (not all_required_java_classes_exist(gage_dirpath) and not compile_gage()):
        logger.error('GAGE module was not installed properly, so it is disabled and you cannot use --gage.')
        return

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    if 0 not in return_codes:
        logger.error('Error occurred while GAGE was processing assemblies.'
                     ' See GAGE error logs for details: %s' % os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(
            gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        cur_metric_id = 0
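        # GAGE prints its metrics in a fixed order, so one sequential pass over the stdout log
        # is enough: advance to the next expected label each time the current one is matched.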
        for line in logfile_out:
            if metrics[cur_metric_id] in line:
                if metrics[cur_metric_id].startswith('N50'):
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(metrics[cur_metric_id] + ':')[1].strip())
                else:
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(':')[1].strip())
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break
        logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
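The per-assembly loop above works because GAGE prints its metrics in a fixed order, so the stdout log can be consumed in a single sequential scan. A minimal standalone sketch of that pattern follows (parse_ordered_metrics is a hypothetical helper, not part of QUAST, and it simplifies the special handling the real code applies to 'N50' lines):

def parse_ordered_metrics(log_fpath, labels):
    # Scan a plain-text log for `labels` in the given order and collect 'Label: value' pairs.
    values = {}
    idx = 0
    with open(log_fpath) as log:
        for line in log:
            if idx == len(labels):
                break
            if labels[idx] in line and ':' in line:
                # take everything after the first colon as the metric value
                values[labels[idx]] = line.split(':', 1)[1].strip()
                idx += 1
    return values

Called with the `metrics` list above, the returned dictionary could then be copied field by field into the report, which is essentially what the loop over `logfile_out` does.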
Example no. 36
0
def do(contigs_fpaths, output_dir, logger):
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus',
                          augustus_dirpath, [join('bin', 'augustus')],
                          logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    set_augustus_dir(augustus_dirpath)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    log_fpath = join(output_dir, 'busco.log')
    logger.info('Logging to ' + log_fpath + '...')
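    # One BUSCO argument list per assembly: genome mode, output name derived from the assembly
    # file name, the downloaded lineage database, a per-run thread count, and the Augustus
    # config directory passed via --augustus_parameters.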
    busco_args = [([
        '-i', contigs_fpath, '-o',
        qutils.label_from_fpath_for_fname(contigs_fpath), '-l', clade_dirpath,
        '-m', 'genome', '-f', '-z', '-c',
        str(busco_threads), '-t', tmp_dir,
        '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' +
        join(augustus_dirpath, 'config') + '\''
    ], output_dir) for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See ' +
                     log_fpath + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)

        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' %
                                  (float(part_buscos) * 100.0 / total_buscos)))
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath +
                         '. See ' + log_fpath + ' for information.')
    logger.info('Done.')
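The result-parsing loop only needs three counts from each BUSCO short-summary file. A self-contained sketch of that step, assuming the same 'Complete BUSCOs' / 'Fragmented' / 'Total' line markers matched above (busco_percentages is a hypothetical helper, not part of QUAST):

def busco_percentages(summary_fpath):
    # Return (complete %, fragmented %) from a BUSCO short-summary file, or None if no totals were found.
    complete = fragmented = total = 0
    with open(summary_fpath) as f:
        for line in f:
            if 'Complete BUSCOs' in line:
                complete = int(line.split()[0])
            elif 'Fragmented' in line:
                fragmented = int(line.split()[0])
            elif 'Total' in line:
                total = int(line.split()[0])
    if total == 0:
        return None
    return 100.0 * complete / total, 100.0 * fragmented / total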
Example no. 37
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if not stats_content or not stats_content[0].strip():
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath):
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

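    # Downsample the shared k-mer set before the per-chromosome analysis; a coarser fraction
    # is used for references larger than 500 MiB.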
    kmer_fraction = 100 if getsize(ref_fpath) < 500 * 1024 ** 2 else 1000

    shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, shared_kmc_db, log_fpath, err_fpath, kmer_fraction=kmer_fraction)

    shared_kmers_by_chrom = dict()
    shared_kmers_fpath = join(tmp_dirpath, 'shared_kmers.txt')
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    with open(shared_kmers_fpath, 'w') as out_f:
        for name, seq in ref_contigs.items():
            seq_kmers = get_string_kmers(tmp_dirpath, log_fpath, err_fpath, seq=seq, intersect_with=shared_downsampled_kmc_db)
            for kmer_i, kmer in enumerate(seq_kmers):
                shared_kmers_by_chrom[str(kmer)] = name
                out_f.write('>' + str(kmer_i) + '\n')
                out_f.write(kmer + '\n')

    shared_kmc_db = count_kmers(tmp_dirpath, shared_kmers_fpath, log_fpath, err_fpath)
    ref_kmc_dbs = []
    for ref_name, ref_seq in ref_contigs.items():
        ref_contig_fpath = join(tmp_dirpath, ref_name + '.fa')
        if not is_non_empty_file(ref_contig_fpath):
            with open(ref_contig_fpath, 'w') as out_f:
                out_f.write(ref_seq)
        ref_kmc_db = count_kmers(tmp_dirpath, ref_contig_fpath, log_fpath, err_fpath)
        ref_shared_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, shared_kmc_db], log_fpath, err_fpath)
        ref_kmc_dbs.append((ref_name, ref_shared_kmc_db))

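    # For every assembly, classify each sufficiently long contig by the reference chromosomes
    # that share its marker k-mers: exactly one chromosome, several, or none.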
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)
            if len(seq) >= MIN_CONTIGS_LEN:
                long_contigs.append(len(seq))

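        # Scaffolding accuracy is assessed only when neither the assembly nor the reference is
        # too fragmented; otherwise only the k-mer completeness field is reported.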
        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_kmc_dbs) > MAX_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            for name, seq in read_fasta(contigs_fpath):
                if len(seq) < MIN_CONTIGS_LEN:
                    continue

                tmp_contig_fpath = join(tmp_dirpath, name + '.fa')
                with open(tmp_contig_fpath, 'w') as out_tmp_f:
                    out_tmp_f.write(seq)
                contig_kmc_db = count_kmers(tmp_dirpath, tmp_contig_fpath, log_fpath, err_fpath)
                intersect_all_ref_kmc_db = intersect_kmers(tmp_dirpath, [contig_kmc_db, shared_kmc_db], log_fpath, err_fpath)
                kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_all_ref_kmc_db, log_fpath, err_fpath)
                if kmers_cnt < MIN_MARKERS:
                    continue
                for ref_name, ref_kmc_db in ref_kmc_dbs:
                    intersect_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, intersect_all_ref_kmc_db], log_fpath, err_fpath)
                    kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_kmc_db, log_fpath, err_fpath)
                    if kmers_cnt:
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                             report.get_field(reporting.Fields.KMER_COMPLETENESS),
                             len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
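The scaffolding-accuracy fields reduce to a length split over the contig-to-chromosome marker assignments computed above. A minimal sketch of that final step (split_lengths_by_markers is a hypothetical helper, independent of KMC; contigs absent from contig_markers, such as short contigs or contigs with too few marker k-mers, fall into the 'none' bucket):

def split_lengths_by_markers(contig_lens, contig_markers):
    # contig_lens: {contig name: length}; contig_markers: {contig name: [chromosomes sharing its markers]}
    one_chrom = multi_chrom = 0
    total = sum(contig_lens.values())
    for name, chroms in contig_markers.items():
        if len(chroms) == 1:
            one_chrom += contig_lens[name]
        else:
            multi_chrom += contig_lens[name]
    none_chrom = total - one_chrom - multi_chrom
    return one_chrom, multi_chrom, none_chrom, total

Dividing each bucket by the total length and formatting with '%.2f' reproduces the three KMER_SCAFFOLDS_* percentages added to the report.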