Beispiel #1
0
def save_features_in_contigs(output_dirpath, contigs_fpaths, feature_name, features_in_contigs, ref_features_num):
    return save(join(output_dirpath, feature_name + in_contigs_suffix_fn), {
        'filenames': [qutils.label_from_fpath(label) for label in contigs_fpaths],
        feature_name + '_in_contigs': dict((qutils.label_from_fpath(contigs_fpath), feature_amounts)
                                           for (contigs_fpath, feature_amounts) in features_in_contigs.items()),
        'ref_' + feature_name + '_number': ref_features_num,
    })
Beispiel #2
0
def save_features_in_contigs(output_dirpath, contigs_fpaths, feature_name,
                             features_in_contigs, ref_features_num):
    return save(
        join(output_dirpath, feature_name + in_contigs_suffix_fn), {
            'filenames':
            [qutils.label_from_fpath(label) for label in contigs_fpaths],
            feature_name + '_in_contigs':
            dict((qutils.label_from_fpath(contigs_fpath), feature_amounts)
                 for (contigs_fpath,
                      feature_amounts) in features_in_contigs.items()),
            'ref_' + feature_name + '_number':
            ref_features_num,
        })
def contigs_GC_content_plot(contigs_fpath, GC_distributions, plot_fpath):
    if not can_draw_plots or qconfig.no_gc:
        return
    title = label_from_fpath(contigs_fpath) + ' GC content'
    logger.info('  Drawing ' + title + ' plot...')

    plots = []
    color, ls = get_color_and_ls(contigs_fpath)
    x_vals, y_vals = GC_distributions

    for GC_x, GC_y in zip(x_vals, y_vals):
        plots.append(Bar(GC_x, GC_y, color, width=5))

    legend_list = [label_from_fpath(contigs_fpath)]
    create_plot(plot_fpath, title, plots, legend_list, x_label='GC (%)', y_label='# contigs', x_limit=[0, 100])
Beispiel #4
0
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function,
                  prokaryote, num_threads):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_genemark.stderr')

    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath, num_threads)

    if not genes:
        unique_count = None
        count = None  # [None] * len(gene_lengths)
    else:
        tool_name = "genemark"
        out_gff_fpath = os.path.join(out_dirpath, corr_assembly_label + '_' + tool_name + '_genes.gff' + ('.gz' if not qconfig.no_gzip else ''))
        add_genes_to_gff(genes, out_gff_fpath, prokaryote)
        if OUTPUT_FASTA:
            out_fasta_fpath = os.path.join(out_dirpath, corr_assembly_label + '_' + tool_name + '_genes.fasta')
            add_genes_to_fasta(genes, out_fasta_fpath)

        count = [sum([gene.end - gene.start > x for gene in genes]) for x in gene_lengths]
        gene_ids = [gene.seq if gene.seq else gene.name for gene in genes]
        unique_count = len(set(gene_ids))
        total_count = len(genes)

        logger.info('  ' + qutils.index_to_str(index) + '  Genes = ' + str(unique_count) + ' unique, ' + str(total_count) + ' total')
        logger.info('  ' + qutils.index_to_str(index) + '  Predicted genes (GFF): ' + out_gff_fpath)

    return genes, unique_count, count
Beispiel #5
0
def save_colors_and_ls(fpaths, labels=None):
    if not labels:
        labels = [qutils.label_from_fpath(fpath) for fpath in fpaths]
    if not dict_color_and_ls:
        color_id = 0
        for i, fpath in enumerate(fpaths):
            ls = primary_line_style
            label = labels[i]
            # contigs and scaffolds should be equally colored but scaffolds should be dashed
            if fpath and fpath in qconfig.dict_of_broken_scaffolds:
                color = dict_color_and_ls[qutils.label_from_fpath(qconfig.dict_of_broken_scaffolds[fpath])][0]
                ls = secondary_line_style
            else:
                 color = colors[color_id % len(colors)]
                 color_id += 1
            dict_color_and_ls[label] = (color, ls)
Beispiel #6
0
def get_color_and_ls(fpath, label=None):
    if not label:
        label = qutils.label_from_fpath(fpath)
    """
    Returns tuple: color, line style
    """
    return dict_color_and_ls[label]
Beispiel #7
0
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stderr')
    logger.info('  ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')
    log_out_f = open(log_out_fpath, 'w')
    log_err_f = open(log_err_fpath, 'w')

    return_code = qutils.call_subprocess(
        ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath), reference,
         contigs_fpath, tmp_dir, str(qconfig.min_contig)],
        stdout=log_out_f,
        stderr=log_err_f,
        indent='  ' + qutils.index_to_str(i),
        only_if_debug=False)
    if return_code != 0:
        logger.info('  ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info('  ' + qutils.index_to_str(i) + 'Done.')

    log_out_f.close()
    log_err_f.close()

    return return_code
Beispiel #8
0
def get(assembly_fpath, ref_name=None):
    if not ref_name and qconfig.reference:
        ref_name = qutils.name_from_fpath(qconfig.reference)
    if assembly_fpath not in assembly_fpaths:
        assembly_fpaths.append(assembly_fpath)
    return reports.setdefault((os.path.abspath(assembly_fpath), ref_name),
                              Report(qutils.label_from_fpath(assembly_fpath)))
Beispiel #9
0
def calculate_ave_read_support(combined_output_dirpath, assemblies):
    unique_contigs_fpath = os.path.join(combined_output_dirpath,
                                        'contigs_reports',
                                        qconfig.unique_contigs_fname_pattern)
    for assembly in assemblies:
        aligned_contigs_by_ref = dict()
        assembly_label = qutils.label_from_fpath(assembly.fpath)
        corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
        with open(unique_contigs_fpath % corr_assembly_label) as in_f:
            for line in in_f:
                ref_name, contig_len, contig_cov = line.strip().split('\t')
                aligned_contigs_by_ref.setdefault(ref_name, []).append(
                    (float(contig_len), float(contig_cov)))
        for ref_name, contigs in aligned_contigs_by_ref.items():
            ref_cov = sum(contig_cov * aligned_len
                          for (aligned_len, contig_cov) in contigs)
            ref_cov /= sum(aligned_len
                           for (aligned_len, contig_cov) in contigs)
            corr_assembly_label = qutils.label_from_fpath_for_fname(
                assembly.fpath)
            ref_contigs_fpath = os.path.join(
                os.path.dirname(assembly.fpath),
                corr_assembly_label + '_to_' + ref_name + '.fasta')
            qconfig.assembly_labels_by_fpath[
                ref_contigs_fpath] = assembly_label
            report = reporting.get(ref_contigs_fpath, ref_name=ref_name)
            report.add_field(reporting.Fields.AVE_READ_SUPPORT,
                             '%.2f' % ref_cov)
def GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions, plot_fpath):
    if not can_draw_plots or qconfig.no_gc:
        return
    title = 'GC content'
    logger.info('  Drawing ' + title + ' plot...')

    plots = []

    all_fpaths = contigs_fpaths
    if ref_fpath:
        all_fpaths = contigs_fpaths + [ref_fpath]

    for i, (GC_distribution_x, GC_distribution_y) in enumerate(list_of_GC_distributions):
        # for log scale
        for id2, v in enumerate(GC_distribution_y):
            if v == 0:
                GC_distribution_y[id2] = 0.1

        # add to plot
        if ref_fpath and (i == len(all_fpaths) - 1):
            color = reference_color
            ls = reference_ls
        else:
            color, ls = get_color_and_ls(all_fpaths[i])

        plots.append(Plot(GC_distribution_x, GC_distribution_y, color, ls))

    legend_list = [label_from_fpath(fpath) for fpath in contigs_fpaths]
    if ref_fpath:
        legend_list += ['Reference']
    create_plot(plot_fpath, title, plots, legend_list, x_label='GC (%)', y_label='# windows', x_limit=[0, 100])
Beispiel #11
0
def save_colors_and_ls(fpaths, labels=None):
    if not labels:
        labels = [qutils.label_from_fpath(fpath) for fpath in fpaths]
    if not dict_color_and_ls:
        color_id = 0
        for i, fpath in enumerate(fpaths):
            ls = primary_line_style
            label = labels[i]
            # contigs and scaffolds should be equally colored but scaffolds should be dashed
            if fpath and fpath in qconfig.dict_of_broken_scaffolds:
                color = dict_color_and_ls[qutils.label_from_fpath(qconfig.dict_of_broken_scaffolds[fpath])][0]
                ls = secondary_line_style
            else:
                 color = colors[color_id % len(colors)]
                 color_id += 1
            dict_color_and_ls[label] = (color, ls)
Beispiel #12
0
def draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath):
    total_len = dict()
    contigs_dict = dict()

    contigs_with_coverage = [contigs_fpath for contigs_fpath in contigs_fpaths if coverage_dict[contigs_fpath]]
    for contigs_fpath in contigs_fpaths:
        total_len[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.TOTALLEN)
        contigs_dict[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.CONTIGS)
    cov_values = [coverage_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]
    num_contigs = [contigs_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]

    common_coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage(cov_values, num_contigs)
    histogram_title = 'Coverage histogram (bin size: ' + str(bin_size) + 'x)'
    plotter.coverage_histogram(contigs_with_coverage, common_coverage_values, output_dirpath + '/coverage_histogram',
                               histogram_title, bin_size=bin_size, max_cov=max_cov, low_threshold=low_threshold, high_threshold=high_threshold)
    for contigs_fpath in contigs_with_coverage:
        coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage([coverage_dict[contigs_fpath]],
                                                                                             [contigs_dict[contigs_fpath]])
        label = qutils.label_from_fpath(contigs_fpath)
        corr_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        histogram_title = label + ' coverage histogram (bin size: ' + str(bin_size) + 'x)'
        histogram_fpath = os.path.join(output_dirpath, corr_label + '_coverage_histogram')
        plotter.coverage_histogram([contigs_fpath], coverage_values, histogram_fpath,
                                   histogram_title, draw_bars=True, bin_size=bin_size, max_cov=max_cov,
                                   low_threshold=low_threshold, high_threshold=high_threshold)
Beispiel #13
0
def save_coord(output_dirpath, coord_x, coord_y, name_coord, contigs_fpaths):
    coord_fn = name_coord + suffix_fn
    return save(join(output_dirpath, coord_fn), {
        'coord_x': coord_x,
        'coord_y': coord_y,
        'filenames': [qutils.label_from_fpath(label) for label in contigs_fpaths]
    })
def Nx_plot(results_dir, reduce_points, contigs_fpaths, lists_of_lengths, plot_fpath, title='Nx', reference_lengths=None):
    if can_draw_plots:
        logger.info('  Drawing ' + title + ' plot...')

    plots = []
    json_vals_x = []  # coordinates for Nx-like plots in HTML-report
    json_vals_y = []

    for id, (contigs_fpath, lengths) in enumerate(zip(contigs_fpaths, lists_of_lengths)):
        if not lengths:
            json_vals_x.append([])
            json_vals_y.append([])
            continue
        lengths.sort(reverse=True)
        vals_x = [0.0]
        vals_y = [lengths[0]]
        # calculate values for the plot
        vals_Nx = [0.0]
        vals_l = [lengths[0]]
        lcur = 0
        # if Nx-plot then we just use sum of contigs lengths, else use reference_length
        lsum = sum(lengths)
        if reference_lengths:
            lsum = reference_lengths[id]
        min_difference = 0
        if reduce_points:
            min_difference = qconfig.min_difference
        for l in lengths:
            lcur += l
            x = lcur * 100.0 / lsum
            if can_draw_plots:
                vals_Nx.append(vals_Nx[-1] + 1e-10) # eps
                vals_l.append(l)
                vals_Nx.append(x)
                vals_l.append(l)
            if vals_y[-1] - l > min_difference or len(vals_x) == 1:
                vals_x.append(vals_x[-1] + 1e-10) # eps
                vals_y.append(l)
                vals_x.append(x)
                vals_y.append(l)
            # add to plot
        json_vals_x.append(vals_x)
        json_vals_y.append(vals_y)
        if can_draw_plots:
            vals_Nx.append(vals_Nx[-1] + 1e-10) # eps
            vals_l.append(0.0)
            vals_x.append(vals_x[-1] + 1e-10) # eps
            vals_y.append(0.0)
            color, ls = get_color_and_ls(contigs_fpath)
            plots.append(Plot(vals_Nx, vals_l, color, ls))

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_coord(results_dir, json_vals_x, json_vals_y, 'coord' + title, contigs_fpaths)

    if not can_draw_plots:
        return

    legend_list = [label_from_fpath(fpath) for fpath in contigs_fpaths]
    create_plot(plot_fpath, title, plots, legend_list, x_label='x', y_label='Contig length', x_limit=[0, 100])
Beispiel #15
0
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath,
                  tool_dirpath, tmp_dirpath):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    out_fpath = os.path.join(out_dirpath, corr_assembly_label + '_glimmer')
    err_fpath = os.path.join(out_dirpath,
                             corr_assembly_label + '_glimmer.stderr')

    #out_gff_path, out_fasta_path, unique, total, cnt = glimmerHMM(tool_dir,
    #    fasta_path, out_path, gene_lengths, err_path)

    out_gff_path, genes, unique, total, full_genes, partial_genes = glimmerHMM(
        tool_dirpath, contigs_fpath, out_fpath, gene_lengths, err_fpath,
        tmp_dirpath, index)

    if out_gff_path:
        logger.info('  ' + qutils.index_to_str(index) + '  Genes = ' +
                    str(unique) + ' unique, ' + str(total) + ' total')
        logger.info('  ' + qutils.index_to_str(index) +
                    '  Predicted genes (GFF): ' + out_gff_path)

    return genes, unique, full_genes, partial_genes
Beispiel #16
0
def save_contigs_lengths(output_dirpath, contigs_fpaths, lists_of_lengths):
    lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]

    return save(join(output_dirpath, contigs_lengths_fn), {
        'filenames': [qutils.label_from_fpath(label) for label in contigs_fpaths],
        'lists_of_lengths': lists_of_lengths
    })
Beispiel #17
0
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function,
                  prokaryote, num_threads):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_genemark.stderr')

    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath, num_threads)

    if not genes:
        unique_count = None
        count = None  # [None] * len(gene_lengths)
    else:
        tool_name = "genemark"
        out_gff_fpath = os.path.join(out_dirpath, corr_assembly_label + '_' + tool_name + '_genes.gff' + ('.gz' if not qconfig.no_gzip else ''))
        add_genes_to_gff(genes, out_gff_fpath, prokaryote)
        if OUTPUT_FASTA:
            out_fasta_fpath = os.path.join(out_dirpath, corr_assembly_label + '_' + tool_name + '_genes.fasta')
            add_genes_to_fasta(genes, out_fasta_fpath)

        count = [sum([gene.end - gene.start > x for gene in genes]) for x in gene_lengths]
        gene_ids = [gene.seq if gene.seq else gene.name for gene in genes]
        unique_count = len(set(gene_ids))
        total_count = len(genes)

        logger.info('  ' + qutils.index_to_str(index) + '  Genes = ' + str(unique_count) + ' unique, ' + str(total_count) + ' total')
        logger.info('  ' + qutils.index_to_str(index) + '  Predicted genes (GFF): ' + out_gff_fpath)

    return genes, unique_count, count
Beispiel #18
0
def save_GC_info(output_dirpath, contigs_fpaths, list_of_GC_distributions, reference_index):
    return save(join(output_dirpath, gc_fn), {
        'filenames': [qutils.label_from_fpath(label) for label in  contigs_fpaths],
        'reference_index': reference_index,
        'list_of_GC_distributions': list_of_GC_distributions,
        'lists_of_gc_info': None,
    })
Beispiel #19
0
def get_assemblies_data(contigs_fpaths, icarus_dirpath, stdout_pattern, nx_marks):
    assemblies_n50 = defaultdict(dict)
    assemblies_data = ''
    assemblies_data += 'var assemblies_links = {};\n'
    assemblies_data += 'var assemblies_len = {};\n'
    assemblies_data += 'var assemblies_contigs = {};\n'
    assemblies_data += 'var assemblies_misassemblies = {};\n'
    assemblies_data += 'var assemblies_n50 = {};\n'
    assemblies_contig_size_data = ''
    for contigs_fpath in contigs_fpaths:
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)
        l = report.get_field(reporting.Fields.TOTALLEN)
        contigs = report.get_field(reporting.Fields.CONTIGS)
        n50 = report.get_field(reporting.Fields.N50)
        if stdout_pattern:
            contig_stdout_fpath = stdout_pattern % qutils.label_from_fpath_for_fname(contigs_fpath) + '.stdout'
            contig_stdout_fpath = qutils.relpath(contig_stdout_fpath, icarus_dirpath)
            assemblies_data += 'assemblies_links["' + assembly_label + '"] = "' + contig_stdout_fpath + '";\n'
        assemblies_contig_size_data += 'assemblies_len["' + assembly_label + '"] = ' + str(l) + ';\n'
        assemblies_contig_size_data += 'assemblies_contigs["' + assembly_label + '"] = ' + str(contigs) + ';\n'
        assemblies_contig_size_data += 'assemblies_n50["' + assembly_label + '"] = "' + str(n50) + '";\n'
        for nx in nx_marks:
            assemblies_n50[assembly_label][nx] = report.get_field(nx)
    return assemblies_data, assemblies_contig_size_data, assemblies_n50
Beispiel #20
0
def draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath):
    total_len = dict()
    contigs_dict = dict()

    contigs_with_coverage = [contigs_fpath for contigs_fpath in contigs_fpaths if coverage_dict[contigs_fpath]]
    for contigs_fpath in contigs_fpaths:
        total_len[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.TOTALLEN)
        contigs_dict[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.CONTIGS)
    cov_values = [coverage_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]
    num_contigs = [contigs_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]

    common_coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage(cov_values, num_contigs)
    histogram_title = 'Coverage histogram (bin size: ' + str(bin_size) + 'x)'
    plotter.coverage_histogram(contigs_with_coverage, common_coverage_values, output_dirpath + '/coverage_histogram',
                               histogram_title, bin_size=bin_size, max_cov=max_cov, low_threshold=low_threshold, high_threshold=high_threshold)
    for contigs_fpath in contigs_with_coverage:
        coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage([coverage_dict[contigs_fpath]],
                                                                                             [contigs_dict[contigs_fpath]])
        label = qutils.label_from_fpath(contigs_fpath)
        corr_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        histogram_title = label + ' coverage histogram (bin size: ' + str(bin_size) + 'x)'
        histogram_fpath = os.path.join(output_dirpath, corr_label + '_coverage_histogram')
        plotter.coverage_histogram([contigs_fpath], coverage_values, histogram_fpath,
                                   histogram_title, draw_bars=True, bin_size=bin_size, max_cov=max_cov,
                                   low_threshold=low_threshold, high_threshold=high_threshold)
Beispiel #21
0
def get_color_and_ls(fpath, label=None):
    if not label:
        label = qutils.label_from_fpath(fpath)
    """
    Returns tuple: color, line style
    """
    return dict_color_and_ls[label]
Beispiel #22
0
def save_colors(results_dirpath,
                contigs_fpaths,
                dict_colors,
                meta=False):  # coordinates for Nx, NAx, NGx, NGAX
    if meta:
        html_fpath = os.path.join(results_dirpath, report_fname)
        with open(html_fpath) as f_html:
            html_text = f_html.read()
        html_text = re.sub('{{ ' + 'colors' + ' }}', 'standard_colors',
                           html_text)
        html_text = re.sub('{{ ' + 'broken_scaffolds' + ' }}', '[]', html_text)
        with open(html_fpath, 'w') as f_html:
            f_html.write(html_text)
    else:
        contig_labels = [
            qutils.label_from_fpath(contigs_fpath)
            for contigs_fpath in contigs_fpaths
        ]
        colors_and_ls = [
            dict_colors[contig_label] for contig_label in contig_labels
        ]
        colors = [color_and_ls[0] for color_and_ls in colors_and_ls]
        colors_for_html = [
            html_colors[plotter_data.colors.index(color)] for color in colors
        ]
        save_record(results_dirpath, 'colors', colors_for_html)
        broken_contig_names = [
            label for i, label in enumerate(contig_labels)
            if colors_and_ls[i][1] == secondary_line_style
        ]
        save_record(results_dirpath, 'broken_scaffolds', broken_contig_names)
Beispiel #23
0
def save_total_report(output_dirpath, min_contig, ref_fpath):
    from quast_libs import reporting
    asm_names = [
        qutils.label_from_fpath(this) for this in reporting.assembly_fpaths
    ]
    report = reporting.table(reporting.Fields.grouped_order)
    subreports = []
    ref_names = []
    if qconfig.is_combined_ref and ref_labels_by_chromosomes:
        ref_names = sorted(
            list(set([ref for ref in ref_labels_by_chromosomes.values()])))
        subreports = [
            reporting.table(reporting.Fields.grouped_order, ref_name=ref_name)
            for ref_name in ref_names
        ]
    t = datetime.datetime.now()

    return save(
        join(output_dirpath, total_report_fname), {
            'date': t.strftime('%d %B %Y, %A, %H:%M:%S'),
            'assembliesNames': asm_names,
            'referenceName':
            qutils.name_from_fpath(ref_fpath) if ref_fpath else '',
            'order': [i for i, _ in enumerate(asm_names)],
            'report': report,
            'subreferences': ref_names,
            'subreports': subreports,
            'minContig': min_contig
        })
def genes_operons_plot(reference_value, contigs_fpaths, files_feature_in_contigs, plot_fpath, title):
    if not can_draw_plots:
        return

    logger.info('  Drawing ' + title + ' cumulative plot...')

    plots = []
    max_x = 0

    for contigs_fpath in contigs_fpaths:
        # calculate values for the plot
        feature_in_contigs = files_feature_in_contigs[contigs_fpath]

        x_vals = list(range(len(feature_in_contigs) + 1))
        y_vals = [0]
        total_full = 0
        for feature_amount in feature_in_contigs:
            total_full += feature_amount
            y_vals.append(total_full)

        if len(x_vals) > 0:
            max_x = max(x_vals[-1], max_x)

        color, ls = get_color_and_ls(contigs_fpath)
        plots.append(Plot(x_vals, y_vals, color, ls))

    if reference_value:
        plots.append(Plot([0, max_x], [reference_value, reference_value], reference_color, reference_ls))

    title = 'Cumulative # complete ' + title
    legend_list = [label_from_fpath(fpath) for fpath in contigs_fpaths]
    if reference_value:
        legend_list += ['Reference']
    create_plot(plot_fpath, title, plots, legend_list, x_label='Contig index', y_label=title)
Beispiel #25
0
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath,
                               alignments_fpath_template):
    assembly_label = qutils.label_from_fpath(asm.fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(asm.fpath)
    logger.info('  ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = corr_assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    alignments_fpath = alignments_fpath_template % corr_assembly_label
    if os.path.exists(alignments_fpath):
        with open(alignments_fpath) as f:
            for line in f:
                values = line.split()
                if values[
                        0] in contigs_analyzer.ref_labels_by_chromosomes.keys(
                        ):
                    ref_name = contigs_analyzer.ref_labels_by_chromosomes[
                        values[0]]
                    ref_contigs_names = values[1:]
                    ref_contigs_fpath = os.path.join(
                        corrected_dirpath,
                        corr_assembly_label + '_to_' + ref_name + '.fasta')
                    if ref_name not in aligned_contigs_for_each_ref:
                        aligned_contigs_for_each_ref[ref_name] = []

                    for (cont_name, seq) in contigs_seq:
                        if not cont_name in contigs:
                            contigs[cont_name] = seq

                        if cont_name in ref_contigs_names and cont_name not in aligned_contigs_for_each_ref[
                                ref_name]:
                            # Collecting all aligned contigs names in order to further extract not aligned
                            aligned_contig_names.add(cont_name)
                            aligned_contigs_for_each_ref[ref_name].append(
                                cont_name)
                            fastaparser.write_fasta(ref_contigs_fpath,
                                                    [(cont_name, seq)], 'a')

                    ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                    if ref_asm.name not in added_ref_asm:
                        if ref_name in assemblies_by_ref:
                            assemblies_by_ref[ref_name].append(ref_asm)
                            added_ref_asm.append(ref_asm.name)
        if qconfig.space_efficient:
            os.remove(alignments_fpath)

    # Extraction not aligned contigs
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath,
                            [(name, contigs[name])
                             for name in not_aligned_contigs_names])

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
Beispiel #26
0
def save_assembly_lengths(output_dirpath, contigs_fpaths, assemblies_lengths):
    return save(
        join(output_dirpath, assemblies_lengths_fn), {
            'filenames':
            [qutils.label_from_fpath(label) for label in contigs_fpaths],
            'assemblies_lengths':
            assemblies_lengths
        })
Beispiel #27
0
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger):
    ref_misassemblies = [result['istranslocations_by_refs'] if result else [] for result in results]
    potential_misassemblies_by_refs = [result['potential_misassemblies_by_refs'] if result else [] for result in results]
    all_refs = sorted(list(set([ref for ref in ref_labels_by_chromosomes.values()])))
    misassemblies_by_refs_rows = []
    row = {'metricName': 'References', 'values': all_refs}
    misassemblies_by_refs_rows.append(row)
    if ref_misassemblies:
        for i, fpath in enumerate(contigs_fpaths):
            row = {'metricName': qutils.label_from_fpath(fpath), 'values': []}
            misassemblies_by_refs_rows.append(row)
            if ref_misassemblies[i]:
                assembly_name = qutils.name_from_fpath(fpath)
                all_rows = []
                row = {'metricName': 'References', 'values': [ref_num + 1 for ref_num in range(len(all_refs))]}
                all_rows.append(row)
                for k in all_refs:
                    row = {'metricName': k, 'values': []}
                    for ref in all_refs:
                        if ref == k or ref not in ref_misassemblies[i]:
                            row['values'].append(None)
                        else:
                            row['values'].append(ref_misassemblies[i][ref][k])
                    misassemblies_by_refs_rows[-1]['values'].append(max(0, sum([r for r in row['values'] if r]) +
                                                                        potential_misassemblies_by_refs[i][k]))
                    all_rows.append(row)
                misassembly_by_ref_fpath = os.path.join(output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
                with open(misassembly_by_ref_fpath, 'w') as misassembly_by_ref_file:
                    misassembly_by_ref_file.write('Number of interspecies translocations by references: \n')
                print_file(all_rows, misassembly_by_ref_fpath, append_to_existing_file=True)

                with open(misassembly_by_ref_fpath, 'a') as misassembly_by_ref_file:
                    misassembly_by_ref_file.write('References:\n')
                    for ref_num, ref in enumerate(all_refs):
                        misassembly_by_ref_file.write(str(ref_num + 1) + ' - ' + ref + '\n')
                logger.info('  Information about interspecies translocations by references for %s is saved to %s' %
                            (assembly_name, misassembly_by_ref_fpath))
    misassemblies = []
    if qconfig.draw_plots:
        from quast_libs import plotter

        aligned_contigs_labels = []
        for row in misassemblies_by_refs_rows[1:]:
            if row['values']:
                aligned_contigs_labels.append(row['metricName'])
            else:
                misassemblies_by_refs_rows.remove(row)
        for i in range(len(all_refs)):
            cur_results = []
            for row in misassemblies_by_refs_rows[1:]:
                if row['values']:
                    cur_results.append(row['values'][i])
            misassemblies.append(cur_results)
        is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies.' + qconfig.plot_extension)
        plotter.draw_meta_summary_plot('', output_dir, aligned_contigs_labels, all_refs, misassemblies_by_refs_rows,
                                       misassemblies, is_translocations_plot_fpath,
                                       title='Intergenomic misassemblies (found and supposed)', reverse=False,
                                       yaxis_title=None, print_all_refs=True)
Beispiel #28
0
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(index, contigs_fpath, gene_lengths,
                                   out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [
            predict_genes(index, contigs_fpath, gene_lengths, out_dirpath,
                          tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths)
        ]

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = [
                '%s + %s part' % (full_cnt, partial_cnt)
                for full_cnt, partial_cnt in zip(full_genes, partial_genes)
            ]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            logger.error(
                'Glimmer failed running Glimmer for %s. ' +
                ('Run with the --debug option'
                 ' to see the command line.' if not qconfig.debug else '') %
                label)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
Beispiel #29
0
def get_color_and_ls(fpath, label=None):
    from quast_libs import qutils
    if not label:
        label = qutils.label_from_fpath(fpath)
    if not dict_color_and_ls:
        return None, None
    """
    Returns tuple: color, line style
    """
    return dict_color_and_ls[label]
Beispiel #30
0
def get_color_and_ls(fpath, label=None):
    from quast_libs import qutils
    if not label:
        label = qutils.label_from_fpath(fpath)
    if not dict_color_and_ls:
        return None, None
    """
    Returns tuple: color, line style
    """
    return dict_color_and_ls[label]
Beispiel #31
0
def save_coord(output_dirpath, coord_x, coord_y, name_coord, contigs_fpaths):
    coord_fn = name_coord + suffix_fn
    return save(
        join(output_dirpath, coord_fn), {
            'coord_x':
            coord_x,
            'coord_y':
            coord_y,
            'filenames':
            [qutils.label_from_fpath(label) for label in contigs_fpaths]
        })
Beispiel #32
0
def save_contigs_lengths(output_dirpath, contigs_fpaths, lists_of_lengths):
    lists_of_lengths = [
        sorted(list, reverse=True) for list in lists_of_lengths
    ]

    return save(
        join(output_dirpath, contigs_lengths_fn), {
            'filenames':
            [qutils.label_from_fpath(label) for label in contigs_fpaths],
            'lists_of_lengths':
            lists_of_lengths
        })
Beispiel #33
0
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    from quast_libs import reporting

    ref_reads_stats = None
    if ref_fpath:
        ref_name = qutils.name_from_fpath(ref_fpath)
        stats_fpath = join(output_dir, ref_name + '.stat')
        if isfile(stats_fpath):
            ref_reads_stats = parse_reads_stats(stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info('  BWA: nothing aligned for reference.')

    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')
        if ref_reads_stats:
            report.add_field(reporting.Fields.REF_MAPPED_READS, ref_reads_stats['mapped'])
            report.add_field(reporting.Fields.REF_MAPPED_READS_PCNT, ref_reads_stats['mapped_pcnt'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS, ref_reads_stats['paired'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS_PCNT, ref_reads_stats['paired_pcnt'])
            report.add_field(reporting.Fields.REF_SINGLETONS, ref_reads_stats['singletons'])
            report.add_field(reporting.Fields.REF_SINGLETONS_PCNT, ref_reads_stats['singletons_pcnt'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS, ref_reads_stats['misjoint'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS_PCNT, ref_reads_stats['misjoint_pcnt'])
            report.add_field(reporting.Fields.REF_DEPTH, ref_reads_stats['depth'])
            if ref_reads_stats['coverage_thresholds'] and len(ref_reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
                report.add_field(reporting.Fields.REF_COVERAGE__FOR_THRESHOLDS,
                                [ref_reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
                report.add_field(reporting.Fields.REF_COVERAGE_1X_THRESHOLD, ref_reads_stats['coverage_thresholds'][0])
        if not isfile(stats_fpath):
            continue
        reads_stats = parse_reads_stats(stats_fpath)
        report.add_field(reporting.Fields.TOTAL_READS, reads_stats['total'])
        report.add_field(reporting.Fields.LEFT_READS, reads_stats['left'])
        report.add_field(reporting.Fields.RIGHT_READS, reads_stats['right'])
        report.add_field(reporting.Fields.MAPPED_READS, reads_stats['mapped'])
        report.add_field(reporting.Fields.MAPPED_READS_PCNT, reads_stats['mapped_pcnt'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS, reads_stats['paired'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS_PCNT, reads_stats['paired_pcnt'])
        if int(reads_stats['mapped']) == 0:
            logger.info('  ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        report.add_field(reporting.Fields.SINGLETONS, reads_stats['singletons'])
        report.add_field(reporting.Fields.SINGLETONS_PCNT, reads_stats['singletons_pcnt'])
        report.add_field(reporting.Fields.MISJOINT_READS, reads_stats['misjoint'])
        report.add_field(reporting.Fields.MISJOINT_READS_PCNT, reads_stats['misjoint_pcnt'])
        report.add_field(reporting.Fields.DEPTH, reads_stats['depth'])
        if reads_stats['coverage_thresholds'] and len(reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
            report.add_field(reporting.Fields.COVERAGE__FOR_THRESHOLDS,
                            [reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
            report.add_field(reporting.Fields.COVERAGE_1X_THRESHOLD, reads_stats['coverage_thresholds'][0])
Beispiel #34
0
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    assembly_label = qutils.label_from_fpath(asm.fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(asm.fpath)
    logger.info('  ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = corr_assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    alignments_fpath = alignments_fpath_template % corr_assembly_label
    if os.path.exists(alignments_fpath):
        with open(alignments_fpath) as f:
            for line in f:
                values = line.split()
                if values[0] in contigs_analyzer.ref_labels_by_chromosomes.keys():
                    ref_name = contigs_analyzer.ref_labels_by_chromosomes[values[0]]
                    ref_contigs_names = values[1:]
                    ref_contigs_fpath = os.path.join(
                        corrected_dirpath, corr_assembly_label + '_to_' + ref_name + '.fasta')
                    if ref_name not in aligned_contigs_for_each_ref:
                        aligned_contigs_for_each_ref[ref_name] = []

                    for (cont_name, seq) in contigs_seq:
                        if not cont_name in contigs:
                            contigs[cont_name] = seq

                        if cont_name in ref_contigs_names and cont_name not in aligned_contigs_for_each_ref[ref_name]:
                            # Collecting all aligned contigs names in order to further extract not aligned
                            aligned_contig_names.add(cont_name)
                            aligned_contigs_for_each_ref[ref_name].append(cont_name)
                            fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                    ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                    if ref_asm.name not in added_ref_asm:
                        if ref_name in assemblies_by_ref:
                            assemblies_by_ref[ref_name].append(ref_asm)
                            added_ref_asm.append(ref_asm.name)
        if qconfig.space_efficient:
            os.remove(alignments_fpath)

    # Extraction not aligned contigs
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath, [(name, contigs[name]) for name in not_aligned_contigs_names])

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
Beispiel #35
0
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
            index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
                   for index, contigs_fpath in enumerate(contigs_fpaths)]

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = ['%s + %s part' % (full_cnt, partial_cnt) for full_cnt, partial_cnt in zip(full_genes, partial_genes)]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            logger.error(
                'Glimmer failed running Glimmer for %s. ' + ('Run with the --debug option'
                ' to see the command line.' if not qconfig.debug else '') % label)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
Beispiel #36
0
def correct_assemblies(contigs_fpaths, output_dirpath, labels):
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
    # we need correction but do not need min-contig filtration
    min_contig = qconfig.min_contig
    qconfig.min_contig = 0
    corrected_contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting=None)
    qconfig.min_contig = min_contig
    assemblies = [Assembly(fpath, qutils.label_from_fpath(fpath)) for fpath in old_contigs_fpaths]
    corrected_labels = [asm.label for asm in assemblies]

    if qconfig.draw_plots or qconfig.html_report:
        corr_fpaths = [asm.fpath for asm in assemblies]
        corr_labels = [asm.label for asm in assemblies]
        plotter_data.save_colors_and_ls(corr_fpaths, labels=corr_labels)
    return assemblies, corrected_labels
Beispiel #37
0
def save_total_report(output_dirpath, min_contig, ref_fpath):
    from quast_libs import reporting
    asm_names = [qutils.label_from_fpath(this) for this in reporting.assembly_fpaths]
    report = reporting.table(reporting.Fields.grouped_order)
    t = datetime.datetime.now()

    return save(join(output_dirpath, total_report_fname), {
        'date': t.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': asm_names,
        'referenceName': qutils.name_from_fpath(ref_fpath) if ref_fpath else '',
        'order': [i for i, _ in enumerate(asm_names)],
        'report': report,
        'minContig': min_contig,
        'assembliesWithNs': qconfig.potential_scaffolds_assemblies if qconfig.potential_scaffolds_assemblies else None
    })
Beispiel #38
0
def correct_assemblies(contigs_fpaths, output_dirpath, labels):
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
    # we need correction but do not need min-contig filtration
    min_contig = qconfig.min_contig
    qconfig.min_contig = 0
    corrected_contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting=None)
    qconfig.min_contig = min_contig
    assemblies = [Assembly(fpath, qutils.label_from_fpath(fpath)) for fpath in old_contigs_fpaths]
    corrected_labels = [asm.label for asm in assemblies]

    if qconfig.draw_plots or qconfig.html_report:
        corr_fpaths = [asm.fpath for asm in assemblies]
        corr_labels = [asm.label for asm in assemblies]
        plotter_data.save_colors_and_ls(corr_fpaths, labels=corr_labels)
    return assemblies, corrected_labels
Beispiel #39
0
def save_GC_info(output_dirpath, contigs_fpaths, list_of_GC_distributions,
                 list_of_GC_contigs_distributions, reference_index):
    return save(
        join(output_dirpath, gc_fn), {
            'filenames':
            [qutils.label_from_fpath(label) for label in contigs_fpaths],
            'reference_index':
            reference_index,
            'list_of_GC_distributions':
            list_of_GC_distributions,
            'list_of_GC_contigs_distributions':
            list_of_GC_contigs_distributions,
            'lists_of_gc_info':
            None,
        })
Beispiel #40
0
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    parallel_args = [(index, contigs_fpath, gene_lengths, out_dirpath,
                      tool_dirpath, tool_exec_fpath, tmp_dirpath)
                     for index, contigs_fpath in enumerate(contigs_fpaths)]
    genes_list, unique, full_genes, partial_genes = run_parallel(
        predict_genes, parallel_args, n_jobs)

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label] = genes_list[i]
        if unique[i] is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE,
                             unique[i])
        if full_genes[i] is not None:
            genes = [
                '%s + %s part' % (full_cnt, partial_cnt) for full_cnt,
                partial_cnt in zip(full_genes[i], partial_genes[i])
            ]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique[i] is None and full_genes[i] is None:
            logger.error('Failed running Glimmer for %s. ' % label + (
                'Run with the --debug option'
                ' to see the command line.' if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
Beispiel #41
0
def save_colors(results_dirpath, contigs_fpaths, dict_colors, meta=False):  # coordinates for Nx, NAx, NGx, NGAX
    if meta:
        html_fpath = os.path.join(results_dirpath, report_fname)
        with open(html_fpath) as f_html:
            html_text = f_html.read()
        html_text = re.sub('{{ ' + 'colors' + ' }}', 'standard_colors', html_text)
        html_text = re.sub('{{ ' + 'broken_scaffolds' + ' }}', '[]', html_text)
        with open(html_fpath, 'w') as f_html:
            f_html.write(html_text)
    else:
        contig_labels = [qutils.label_from_fpath(contigs_fpath) for contigs_fpath in contigs_fpaths]
        colors_and_ls = [dict_colors[contig_label] for contig_label in contig_labels]
        colors = [color_and_ls[0] for color_and_ls in colors_and_ls]
        colors_for_html = [html_colors[plotter_data.colors.index(color)] for color in colors]
        save_record(results_dirpath, 'colors', colors_for_html)
        broken_contig_names = [label for i, label in enumerate(contig_labels) if colors_and_ls[i][1] == secondary_line_style]
        save_record(results_dirpath, 'broken_scaffolds', broken_contig_names)
Beispiel #42
0
def cumulative_plot(reference, contigs_fpaths, lists_of_lengths, plot_fpath,
                    title):
    if not can_draw_plots:
        return

    logger.info('  Drawing cumulative plot...')

    plots = []
    max_x = 0

    for (contigs_fpath, lengths) in zip(contigs_fpaths, lists_of_lengths):
        y_vals = [0]
        for l in sorted(lengths, reverse=True):
            y_vals.append(y_vals[-1] + l)
        x_vals = list(range(0, len(y_vals)))
        if x_vals:
            max_x = max(x_vals[-1], max_x)
        color, ls = get_color_and_ls(contigs_fpath)
        plots.append(Plot(x_vals, y_vals, color, ls))

    if reference:
        y_vals = [0]
        for l in sorted(
                fastaparser.get_chr_lengths_from_fastafile(reference).values(),
                reverse=True):
            y_vals.append(y_vals[-1] + l)
        x_vals = list(range(0, len(y_vals)))
        # extend reference curve to the max X-axis point
        reference_length = y_vals[-1]
        max_x = max(max_x, x_vals[-1])
        y_vals.append(reference_length)
        x_vals.append(max_x)
        plots.append(Plot(x_vals, y_vals, reference_color, reference_ls))

    legend_list = [label_from_fpath(fpath) for fpath in contigs_fpaths]
    if reference:
        legend_list += ['Reference']

    create_plot(plot_fpath,
                title,
                plots,
                legend_list,
                x_label='Contig index',
                y_label='Cumulative length',
                x_limit=[0, max_x])
def histogram(contigs_fpaths, values, plot_fpath, title='', yaxis_title='', bottom_value=None,
              top_value=None):
    if not can_draw_plots:
        return
    if len(contigs_fpaths) < 2:  #
        logger.info('  Skipping drawing ' + title + ' histogram... (less than 2 columns histogram makes no sense)')
        return

    logger.info('  Drawing ' + title + ' histogram...')

    plots = []
    min_value = sorted(values)[0]
    max_value = sorted(values, reverse=True)[0]
    exponent = None
    if max_value == min_value:
        if max_value > 0:
            exponent = math.pow(10, math.floor(math.log(max_value, 10)))
        else:
            exponent = 1
    else:
        exponent = math.pow(10, math.floor(math.log(max_value - min_value, 10)))

    if not bottom_value:
        bottom_value = (math.floor(min_value / exponent) - 5) * exponent
    if not top_value:
        top_value = (math.ceil(max_value / exponent) + 1) * exponent

    #bars' params
    width = 0.3
    interval = width // 3
    start_pos = interval // 2

    for i, (contigs_fpath, val) in enumerate(zip(contigs_fpaths, values)):
        color, ls = get_color_and_ls(contigs_fpath)
        if ls == primary_line_style:
            hatch = ''
        else:
            hatch = 'x'
        plots.append(Bar(start_pos + (width + interval) * i, val, color, width=width, hatch=hatch))

    legend_list = [label_from_fpath(fpath) for fpath in contigs_fpaths]
    create_plot(plot_fpath, title, plots, legend_list, x_label='', y_label=yaxis_title, is_histogram=True,
                x_limit=[0, start_pos + width * len(contigs_fpaths) + interval * (len(contigs_fpaths) - 1)],
                y_limit=[max(bottom_value, 0), top_value])
Beispiel #44
0
def calculate_ave_read_support(combined_output_dirpath, assemblies):
    unique_contigs_fpath = os.path.join(combined_output_dirpath, 'contigs_reports', qconfig.unique_contigs_fname_pattern)
    for assembly in assemblies:
        aligned_contigs_by_ref = dict()
        assembly_label = qutils.label_from_fpath(assembly.fpath)
        corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
        with open(unique_contigs_fpath % corr_assembly_label) as in_f:
            for line in in_f:
                ref_name, contig_len, contig_cov = line.strip().split('\t')
                aligned_contigs_by_ref.setdefault(ref_name, []).append((float(contig_len), float(contig_cov)))
        for ref_name, contigs in aligned_contigs_by_ref.items():
            ref_cov = sum(contig_cov * aligned_len for (aligned_len, contig_cov) in contigs)
            ref_cov /= sum(aligned_len for (aligned_len, contig_cov) in contigs)
            corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
            ref_contigs_fpath = os.path.join(
                        os.path.dirname(assembly.fpath), corr_assembly_label + '_to_' + ref_name + '.fasta')
            qconfig.assembly_labels_by_fpath[ref_contigs_fpath] = assembly_label
            report = reporting.get(ref_contigs_fpath, ref_name=ref_name)
            report.add_field(reporting.Fields.AVE_READ_SUPPORT, '%.2f' % ref_cov)
Beispiel #45
0
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    out_fpath = os.path.join(out_dirpath, corr_assembly_label + '_glimmer')
    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_glimmer.stderr')

    #out_gff_path, out_fasta_path, unique, total, cnt = glimmerHMM(tool_dir,
    #    fasta_path, out_path, gene_lengths, err_path)

    out_gff_path, genes, unique, total, cnt = glimmerHMM(tool_dirpath,
        contigs_fpath, out_fpath, gene_lengths, err_fpath, tmp_dirpath, index)

    if out_gff_path:
        logger.info('  ' + qutils.index_to_str(index) + '  Genes = ' + str(unique) + ' unique, ' + str(total) + ' total')
        logger.info('  ' + qutils.index_to_str(index) + '  Predicted genes (GFF): ' + out_gff_path)

    return genes, unique, cnt
Beispiel #46
0
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    parallel_args = [(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tool_exec_fpath, tmp_dirpath)
                     for index, contigs_fpath in enumerate(contigs_fpaths)]
    genes_list, unique, full_genes, partial_genes = run_parallel(predict_genes, parallel_args, n_jobs)

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label] = genes_list[i]
        if unique[i] is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique[i])
        if full_genes[i] is not None:
            genes = ['%s + %s part' % (full_cnt, partial_cnt) for full_cnt, partial_cnt in zip(full_genes[i], partial_genes[i])]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique[i] is None and full_genes[i] is None:
            logger.error(
                'Failed running Glimmer for %s. ' % label + ('Run with the --debug option'
                ' to see the command line.' if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
Beispiel #47
0
def save_total_report(output_dirpath, min_contig, ref_fpath):
    from quast_libs import reporting
    asm_names = [qutils.label_from_fpath(this) for this in reporting.assembly_fpaths]
    report = reporting.table(reporting.Fields.grouped_order)
    subreports = []
    ref_names = []
    if qconfig.is_combined_ref and ref_labels_by_chromosomes:
        ref_names = sorted(list(set([ref for ref in ref_labels_by_chromosomes.values()])))
        subreports = [reporting.table(reporting.Fields.grouped_order, ref_name=ref_name) for ref_name in ref_names]
    t = datetime.datetime.now()

    return save(join(output_dirpath, total_report_fname), {
        'date': t.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': asm_names,
        'referenceName': qutils.name_from_fpath(ref_fpath) if ref_fpath else '',
        'order': [i for i, _ in enumerate(asm_names)],
        'report': report,
        'subreferences': ref_names,
        'subreports': subreports,
        'minContig': min_contig
    })
Beispiel #48
0
def save_colors(results_dirpath, contigs_fpaths, dict_colors, meta=False):  # coordinates for Nx, NAx, NGx, NGAX
    from quast_libs import plotter

    if meta:
        html_fpath = os.path.join(results_dirpath, report_fname)
        with open(html_fpath) as f_html:
            html_text = f_html.read()
        html_text = re.sub("{{ " + "colors" + " }}", "standard_colors", html_text)
        html_text = re.sub("{{ " + "broken_scaffolds" + " }}", "[]", html_text)
        with open(html_fpath, "w") as f_html:
            f_html.write(html_text)
    else:
        contig_labels = [qutils.label_from_fpath(contigs_fpath) for contigs_fpath in contigs_fpaths]
        colors_and_ls = [dict_colors[contig_label] for contig_label in contig_labels]
        colors = [color_and_ls[0] for color_and_ls in colors_and_ls]
        colors_for_html = [html_colors[plotter.colors.index(color)] for color in colors]
        json_fpath = json_saver.save_colors(results_dirpath, colors_for_html)
        append(results_dirpath, json_fpath, "colors")
        broken_contig_names = [
            label for i, label in enumerate(contig_labels) if colors_and_ls[i][1] == secondary_line_style
        ]
        json_fpath = json_saver.save_broken_scaffolds(results_dirpath, broken_contig_names)
        append(results_dirpath, json_fpath, "broken_scaffolds")
Beispiel #49
0
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference,
             tmp_dir):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath,
                                 'gage_' + corr_assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath,
                                 'gage_' + corr_assembly_label + '.stderr')
    logger.info('  ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')
    log_out_f = open(log_out_fpath, 'w')
    log_err_f = open(log_err_fpath, 'w')

    return_code = qutils.call_subprocess([
        'sh', gage_tool_path,
        abspath(ca_utils.misc.contig_aligner_dirpath), reference,
        contigs_fpath, tmp_dir,
        str(qconfig.min_contig)
    ],
                                         stdout=log_out_f,
                                         stderr=log_err_f,
                                         indent='  ' + qutils.index_to_str(i),
                                         only_if_debug=False)
    if return_code != 0:
        logger.info('  ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info('  ' + qutils.index_to_str(i) + 'Done.')

    log_out_f.close()
    log_err_f.close()

    return return_code
Beispiel #50
0
def save_colors(results_dirpath,
                contigs_fpaths,
                dict_colors,
                meta=False):  # coordinates for Nx, NAx, NGx, NGAX
    from quast_libs import plotter
    if meta:
        html_fpath = os.path.join(results_dirpath, report_fname)
        with open(html_fpath) as f_html:
            html_text = f_html.read()
        html_text = re.sub('{{ ' + 'colors' + ' }}', 'standard_colors',
                           html_text)
        with open(html_fpath, 'w') as f_html:
            f_html.write(html_text)
    else:
        colors_and_ls = [
            dict_colors[qutils.label_from_fpath(contigs_fpath)]
            for contigs_fpath in contigs_fpaths
        ]
        colors = [color_and_ls[0] for color_and_ls in colors_and_ls]
        colors_for_html = [
            html_colors[plotter.colors.index(color)] for color in colors
        ]
        json_fpath = json_saver.save_colors(results_dirpath, colors_for_html)
        append(results_dirpath, json_fpath, 'colors')
Beispiel #51
0
def save_result(result, report, fname, ref_fpath):
    region_misassemblies = result['region_misassemblies']
    misassemblies_by_ref = result['misassemblies_by_ref']
    region_struct_variations = result['region_struct_variations']
    misassemblies_matched_sv = result['misassemblies_matched_sv']
    misassembled_contigs = result['misassembled_contigs']
    misassembled_bases = result['misassembled_bases']
    misassembly_internal_overlap = result['misassembly_internal_overlap']
    unaligned = result['unaligned']
    partially_unaligned = result['partially_unaligned']
    partially_unaligned_bases = result['partially_unaligned_bases']
    fully_unaligned_bases = result['fully_unaligned_bases']
    ambiguous_contigs = result['ambiguous_contigs']
    ambiguous_contigs_extra_bases = result['ambiguous_contigs_extra_bases']
    SNPs = result['SNPs']
    indels_list = result['indels_list']
    total_aligned_bases = result['total_aligned_bases']
    half_unaligned_with_misassembly = result['half_unaligned_with_misassembly']

    report.add_field(reporting.Fields.MISLOCAL, region_misassemblies.count(Misassembly.LOCAL))
    report.add_field(reporting.Fields.MISASSEMBL, region_misassemblies.count(Misassembly.RELOCATION) +
                     region_misassemblies.count(Misassembly.INVERSION) + region_misassemblies.count(Misassembly.TRANSLOCATION) +
                     region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MISCONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MISCONTIGSBASES, misassembled_bases)
    report.add_field(reporting.Fields.MISINTERNALOVERLAP, misassembly_internal_overlap)
    if qconfig.bed:
        report.add_field(reporting.Fields.STRUCT_VARIATIONS, misassemblies_matched_sv)
    report.add_field(reporting.Fields.UNALIGNED, '%d + %d part' % (unaligned, partially_unaligned))
    report.add_field(reporting.Fields.UNALIGNEDBASES, (fully_unaligned_bases + partially_unaligned_bases))
    report.add_field(reporting.Fields.AMBIGUOUS, ambiguous_contigs)
    report.add_field(reporting.Fields.AMBIGUOUSEXTRABASES, ambiguous_contigs_extra_bases)
    report.add_field(reporting.Fields.MISMATCHES, SNPs)
    # different types of indels:
    if indels_list is not None:
        report.add_field(reporting.Fields.INDELS, len(indels_list))
        report.add_field(reporting.Fields.INDELSBASES, sum(indels_list))
        report.add_field(reporting.Fields.MIS_SHORT_INDELS, len([i for i in indels_list if i <= qconfig.SHORT_INDEL_THRESHOLD]))
        report.add_field(reporting.Fields.MIS_LONG_INDELS, len([i for i in indels_list if i > qconfig.SHORT_INDEL_THRESHOLD]))

    if total_aligned_bases:
        report.add_field(reporting.Fields.SUBSERROR, "%.2f" % (float(SNPs) * 100000.0 / float(total_aligned_bases)))
        report.add_field(reporting.Fields.INDELSERROR, "%.2f" % (float(report.get_field(reporting.Fields.INDELS))
                                                                 * 100000.0 / float(total_aligned_bases)))

    # for misassemblies report:
    report.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, region_misassemblies.count(Misassembly.RELOCATION) +
                     region_misassemblies.count(Misassembly.INVERSION) + region_misassemblies.count(Misassembly.TRANSLOCATION) +
                     region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MIS_RELOCATION, region_misassemblies.count(Misassembly.RELOCATION))
    report.add_field(reporting.Fields.MIS_TRANSLOCATION, region_misassemblies.count(Misassembly.TRANSLOCATION))
    report.add_field(reporting.Fields.MIS_INVERTION, region_misassemblies.count(Misassembly.INVERSION))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_CONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_BASES, misassembled_bases)
    report.add_field(reporting.Fields.MIS_LOCAL, region_misassemblies.count(Misassembly.LOCAL))
    if qconfig.is_combined_ref:
        report.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
        report.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, region_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        all_references = sorted(list(set([ref for ref in ref_labels_by_chromosomes.values()])))
        for ref_name in all_references:
            subreport = reporting.get(fname, ref_name=ref_name)
            ref_misassemblies = misassemblies_by_ref[ref_name]
            subreport.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, ref_misassemblies.count(Misassembly.RELOCATION) +
                                ref_misassemblies.count(Misassembly.INVERSION) + ref_misassemblies.count(Misassembly.TRANSLOCATION) +
                                ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_RELOCATION, ref_misassemblies.count(Misassembly.RELOCATION))
            subreport.add_field(reporting.Fields.MIS_TRANSLOCATION, ref_misassemblies.count(Misassembly.TRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_INVERTION, ref_misassemblies.count(Misassembly.INVERSION))
            subreport.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_LOCAL, ref_misassemblies.count(Misassembly.LOCAL))
            subreport.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
            subreport.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
            if fname not in qconfig.dict_of_broken_scaffolds:
                subreport.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP, ref_misassemblies.count(Misassembly.SCAFFOLD_GAP))
            if qconfig.check_for_fragmented_ref:
                subreport.add_field(reporting.Fields.MIS_FRAGMENTED, ref_misassemblies.count(Misassembly.FRAGMENTED))
    elif intergenomic_misassemblies_by_asm:
        label = qutils.label_from_fpath(fname)
        ref_name = qutils.name_from_fpath(ref_fpath)
        ref_misassemblies = intergenomic_misassemblies_by_asm[label][ref_name]
        report.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        report.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
    if fname not in qconfig.dict_of_broken_scaffolds:
        report.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP, region_misassemblies.count(Misassembly.SCAFFOLD_GAP))
    if qconfig.check_for_fragmented_ref:
        report.add_field(reporting.Fields.MIS_FRAGMENTED, region_misassemblies.count(Misassembly.FRAGMENTED))
    # for unaligned report:
    report.add_field(reporting.Fields.UNALIGNED_FULL_CNTGS, unaligned)
    report.add_field(reporting.Fields.UNALIGNED_FULL_LENGTH, fully_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_PART_CNTGS, partially_unaligned)
    report.add_field(reporting.Fields.UNALIGNED_PART_LENGTH, partially_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_MISASSEMBLED_CTGS, half_unaligned_with_misassembly)
    return report
Beispiel #52
0
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger):
    istranslocations_by_asm = [result['istranslocations_by_refs'] if result else None for result in results]
    misassemblies_by_asm = [result['misassemblies_by_ref'] if result else None for result in results]
    all_refs = []
    for ref in ref_labels_by_chromosomes.values():
        if ref not in all_refs:
            all_refs.append(ref)
    if not qconfig.use_input_ref_order:
        all_refs.sort()
    misassemblies_by_refs_rows = []
    row = {'metricName': 'References', 'values': all_refs}
    misassemblies_by_refs_rows.append(row)
    if not istranslocations_by_asm:
        return
    for i, fpath in enumerate(contigs_fpaths):
        label = qutils.label_from_fpath(fpath)
        row = {'metricName': label, 'values': []}
        misassemblies_by_refs_rows.append(row)
        istranslocations_by_ref = istranslocations_by_asm[i]
        intergenomic_misassemblies_by_asm[label] = defaultdict(list)
        for ref in all_refs:
            intergenomic_misassemblies_by_asm[label][ref] = misassemblies_by_asm[i][ref] if misassemblies_by_asm[i] else []
        if istranslocations_by_ref:
            assembly_name = qutils.name_from_fpath(fpath)
            all_rows = []
            row = {'metricName': 'References', 'values': [ref_num + 1 for ref_num in range(len(all_refs))]}
            all_rows.append(row)
            for ref in all_refs:
                row = {'metricName': ref, 'values': []}
                for second_ref in all_refs:
                    if ref == second_ref or second_ref not in istranslocations_by_ref:
                        row['values'].append(None)
                    else:
                        row['values'].append(istranslocations_by_ref[ref][second_ref])
                possible_misassemblies = 0
                misassemblies_by_ref = misassemblies_by_asm[i]
                if misassemblies_by_ref:
                    possible_misassemblies = misassemblies_by_ref[ref].count(Misassembly.POSSIBLE_MISASSEMBLIES)
                istranslocations = max(0, sum([r for r in row['values'] if r]))
                misassemblies_by_refs_rows[-1]['values'].append(istranslocations + possible_misassemblies)
                all_rows.append(row)
            misassembly_by_ref_fpath = os.path.join(output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
            with open(misassembly_by_ref_fpath, 'w') as misassembly_by_ref_file:
                misassembly_by_ref_file.write('Number of interspecies translocations by references: \n')
            print_file(all_rows, misassembly_by_ref_fpath, append_to_existing_file=True)

            with open(misassembly_by_ref_fpath, 'a') as misassembly_by_ref_file:
                misassembly_by_ref_file.write('References:\n')
                for ref_num, ref in enumerate(all_refs):
                    misassembly_by_ref_file.write(str(ref_num + 1) + ' - ' + ref + '\n')
            logger.info('  Information about interspecies translocations by references for %s is saved to %s' %
                        (assembly_name, misassembly_by_ref_fpath))
    misassemblies = []
    if qconfig.draw_plots:
        from quast_libs import plotter

        aligned_contigs_labels = []
        for row in misassemblies_by_refs_rows[1:]:
            if row['values']:
                aligned_contigs_labels.append(row['metricName'])
            else:
                misassemblies_by_refs_rows.remove(row)
        for i in range(len(all_refs)):
            cur_results = []
            for row in misassemblies_by_refs_rows[1:]:
                if row['values']:
                    cur_results.append(row['values'][i])
            misassemblies.append(cur_results)
        is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies')
        plotter.draw_meta_summary_plot('', output_dir, aligned_contigs_labels, all_refs,
                                       misassemblies, is_translocations_plot_fpath,
                                       title='Intergenomic misassemblies (found and supposed)', reverse=False,
                                       yaxis_title=None, print_all_refs=True, logger=logger)
Beispiel #53
0
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    if ref_fpath:
        reference_lengths = sorted(
            fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(),
            reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(
            ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) + ', length = ' +
                    str(reference_length) + ', num fragments = ' +
                    str(reference_fragments) + ', GC % = ' + '%.2f' %
                    reference_GC if reference_GC is not None else 'undefined')
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(
                '  Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).'
            )
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info('  Estimated reference length = ' + str(reference_length))

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for id, contigs_fpath in enumerate(contigs_fpaths):
        coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(id) + assembly_label)
        # lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(
                    seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            if cov_pattern.findall(name):
                cov = int(float(cov_pattern.findall(name)[0]))
                if len(coverage_dict[contigs_fpath]) <= cov:
                    coverage_dict[contigs_fpath] += [0] * (
                        cov - len(coverage_dict[contigs_fpath]) + 1)
                coverage_dict[contigs_fpath][cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [
        sorted(list, reverse=True) for list in lists_of_lengths
    ]
    num_contigs = max(
        [len(list_of_length) for list_of_length in lists_of_lengths])
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        import math
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [[
            sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)])
            for i in range(1, max_points)
            if (i * multiplicator) < len(list_of_length)
        ] for list_of_length in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [
                sum(reference_lengths[(
                    (i - 1) * multiplicator):(i * multiplicator)]) if
                (i * multiplicator) < len(reference_lengths) else sum(
                    reference_lengths[((i - 1) * multiplicator):])
                for i in range(1, max_points)
            ] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        for num_list in range(len(corr_lists_of_lengths)):
            last_index = len(corr_lists_of_lengths[num_list])
            corr_lists_of_lengths[num_list].append(
                sum(lists_of_lengths[num_list][last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [
            sorted(list, reverse=True) for list in lists_of_lengths
        ]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths,
                                        corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(
            contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)
        logger.info('    ' + qutils.index_to_str(id) +
                    qutils.label_from_fpath(contigs_fpath) + \
                    ', N50 = ' + str(n50) + \
                    ', L50 = ' + str(l50) + \
                    ', Total length = ' + str(total_length) + \
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') + \
                    ', # N\'s per 100 kbp = ' + ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) if total_length != 0 else 'undefined')

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(
                    reporting.Fields.GC,
                    ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(
                reporting.Fields.UNCALLED_PERCENT,
                ('%.2f' %
                 (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS,
                             reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(
                    reporting.Fields.REFGC,
                    ('%.2f' %
                     reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil(
        (largest_contig / 1000) / 600)  # divide on height of plot

    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths,
                                list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions,
                                reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points,
                    contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points,
                        contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))])

    if qconfig.draw_plots:
        ########################################################################import plotter
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'),
                                'Cumulative length')
        if not qconfig.no_gc:
            ########################################################################
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths,
                                    list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(
                    contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(
                    contigs_fpath, GC_distribution,
                    join(
                        output_dirpath,
                        qutils.label_from_fpath(contigs_fpath) +
                        '_GC_content_plot'))

        if any(coverage_dict[contigs_fpath]
               for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths,
                                     output_dirpath)

    logger.main_info('Done.')
Beispiel #54
0
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = {}
    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
            indent='  ')
        return None

    coordfile = open(nucmer_fpath, 'r')
    for line in coordfile:
        if line.startswith('='):
            break

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names) # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {} # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(genes_container.region_list) or len(operons_container.region_list)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []
    for line in coordfile:
        if line.strip() == '':
            break
        s1 = int(line.split('|')[0].split()[0])
        e1 = int(line.split('|')[0].split()[1])
        s2 = int(line.split('|')[1].split()[0])
        e2 = int(line.split('|')[1].split()[1])
        contig_name = line.split()[12].strip()
        chr_name = line.split()[11].strip()

        if chr_name not in genome_mapping:
            logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                         "differ from the names in the reference. Try to remove the file and restart QUAST.")
            return None

        if gene_searching_enabled:
            aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1))
        if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
            for i in range(s1, len(genome_mapping[chr_name])):
                genome_mapping[chr_name][i] = 1
            for i in range(1, e1 + 1):
                genome_mapping[chr_name][i] = 1
        else: #if s1 <= e1:
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1
    coordfile.close()
    if qconfig.space_efficient and nucmer_fpath.endswith('.filtered'):
        os.remove(nucmer_fpath)

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt') if not qconfig.space_efficient else '/dev/null'
    gaps_file = open(gaps_fpath, 'w')
    for chr_name, chr_len in reference_chromosomes.items():
        gaps_file.write(chr_name + '\n')
        cur_gap_size = 0
        aligned_len = 0
        for i in range(1, chr_len + 1):
            if genome_mapping[chr_name][i] == 1:
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                aligned_len += 1
                covered_bp += 1
                cur_gap_size = 0
            else:
                cur_gap_size += 1
        ref_lengths[chr_name] = aligned_len
        if cur_gap_size >= qconfig.min_gap_size:
            gaps_count += 1
            gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')
    gaps_file.close()

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
        (genes_container,
         genes_in_contigs,
         reporting.Fields.GENES,
         '_genes.txt'),

        (operons_container,
         operons_in_contigs,
         reporting.Fields.OPERONS,
         '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + suffix)
        found_file = open(found_fpath, 'w')
        found_file.write('%s\t\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type'))
        found_file.write('=========================================\n')

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if container.chr_names_dict[region.seqname] != cur_block.seqname:
                        continue

                    # computing circular genomes
                    if cur_block.start > cur_block.end:
                        blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                  AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                    else:
                        blocks = [cur_block]

                    for block in blocks:
                        if region.end <= block.start or block.end <= region.start:
                            continue
                        elif block.start <= region.start and region.end <= block.end:
                            if found_list[i] == 2:  # already found as partial gene
                                total_partial -= 1
                            found_list[i] = 1
                            total_full += 1
                            region_id = str(region.id)
                            if region_id == 'None':
                                region_id = '# ' + str(region.number + 1)
                            found_file.write('%s\t\t%d\t%d\tcomplete\n' % (region_id, region.start, region.end))
                            feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                            cur_feature_is_found = True
                            break
                        elif found_list[i] == 0 and min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                            found_list[i] = 2
                            total_partial += 1
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break
            # adding info about partially found genes/operons
            if found_list[i] == 2:  # partial gene/operon
                region_id = str(region.id)
                if region_id == 'None':
                    region_id = '# ' + str(region.number + 1)
                found_file.write('%s\t\t%d\t%d\tpartial\n' % (region_id, region.start, region.end))

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial
        found_file.close()

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')

    return ref_lengths, (results, genes_in_contigs, operons_in_contigs)
Beispiel #55
0
def process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                        reference_chromosomes, ns_by_chromosomes, containers):
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = defaultdict(int)
    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    coords_base_fpath = os.path.join(coords_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        coords_fpath = coords_base_fpath
    else:
        coords_fpath = coords_base_fpath + '.filtered'

    if not os.path.isfile(coords_fpath):
        logger.error('File with alignment coords (' + coords_fpath + ') not found! Try to restart QUAST.',
            indent='  ')
        return None, None

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    sorted_contig_tuples = sorted(enumerate(contig_tuples), key=lambda x: len(x[1][1]), reverse=True)
    sorted_contigs_names = []
    contigs_order = []
    for idx, (name, _) in sorted_contig_tuples:
        sorted_contigs_names.append(name)
        contigs_order.append(idx)

    features_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {} # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(containers)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []
    with open(coords_fpath) as coordfile:
        for line in coordfile:
            s1 = int(line.split('|')[0].split()[0])
            e1 = int(line.split('|')[0].split()[1])
            s2 = int(line.split('|')[1].split()[0])
            e2 = int(line.split('|')[1].split()[1])
            contig_name = line.split()[12].strip()
            chr_name = line.split()[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + coords_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                return None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1,
                                                                               contig=contig_name, start_in_contig=s2, end_in_contig=e2))
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1

    for chr_name in genome_mapping.keys():
        for i in ns_by_chromosomes[chr_name]:
            genome_mapping[chr_name][i] = 0
        ref_lengths[chr_name] = sum(genome_mapping[chr_name])

    if qconfig.space_efficient and coords_fpath.endswith('.filtered'):
        os.remove(coords_fpath)

    # counting genome coverage and gaps number
    gaps_count = 0
    if qconfig.analyze_gaps:
        gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt') if not qconfig.space_efficient else '/dev/null'
        with open(gaps_fpath, 'w') as gaps_file:
            for chr_name, chr_len in reference_chromosomes.items():
                gaps_file.write(chr_name + '\n')
                cur_gap_size = 0
                for i in range(1, chr_len + 1):
                    if genome_mapping[chr_name][i] == 1 or i in ns_by_chromosomes[chr_name]:
                        if cur_gap_size >= qconfig.min_gap_size:
                            gaps_count += 1
                            gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                        cur_gap_size = 0
                    else:
                        cur_gap_size += 1
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["gaps_count"] = gaps_count
    results[reporting.Fields.GENES + "_full"] = None
    results[reporting.Fields.GENES + "_partial"] = None
    results[reporting.Fields.OPERONS + "_full"] = None
    results[reporting.Fields.OPERONS + "_partial"] = None

    # finding genes and operons
    for container in containers:
        if not container.region_list:
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_genomic_features_' + container.kind.lower() + '.txt')
        found_file = open(found_fpath, 'w')
        found_file.write('%s\t\t%s\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type', 'Contig'))
        found_file.write('=' * 50 + '\n')

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            gene_blocks = []
            if region.id is None:
                region.id = '# ' + str(region.number + 1)
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if cur_block.seqname != region.seqname:
                        continue
                    if region.end <= cur_block.start or cur_block.end <= region.start:
                        continue
                    elif cur_block.start <= region.start and region.end <= cur_block.end:
                        if found_list[i] == 2:  # already found as partial gene
                            total_partial -= 1
                        found_list[i] = 1
                        total_full += 1
                        contig_info = cur_block.format_gene_info(region)
                        found_file.write('%s\t\t%d\t%d\tcomplete\t%s\n' % (region.id, region.start, region.end, contig_info))
                        if container.kind == 'operon':
                            operons_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig
                        else:
                            features_in_contigs[contig_id] += 1

                        cur_feature_is_found = True
                        break
                    elif min(region.end, cur_block.end) - max(region.start, cur_block.start) >= qconfig.min_gene_overlap:
                        if found_list[i] == 0:
                            found_list[i] = 2
                            total_partial += 1
                        gene_blocks.append(cur_block)
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break
            # adding info about partially found genes/operons
            if found_list[i] == 2:  # partial gene/operon
                contig_info = ','.join([block.format_gene_info(region) for block in sorted(gene_blocks, key=lambda block: block.start)])
                found_file.write('%s\t\t%d\t%d\tpartial\t%s\n' % (region.id, region.start, region.end, contig_info))

        if container.kind == 'operon':
            results[reporting.Fields.OPERONS + "_full"] = total_full
            results[reporting.Fields.OPERONS + "_partial"] = total_partial
        else:
            if results[reporting.Fields.GENES + "_full"] is None:
                results[reporting.Fields.GENES + "_full"] = 0
                results[reporting.Fields.GENES + "_partial"] = 0
            results[reporting.Fields.GENES + "_full"] += total_full
            results[reporting.Fields.GENES + "_partial"] += total_partial
        found_file.close()

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    unsorted_features_in_contigs = [features_in_contigs[idx] for idx in contigs_order]
    unsorted_operons_in_contigs = [operons_in_contigs[idx] for idx in contigs_order]

    return ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs)
Beispiel #56
0
def main(args):
    check_dirpath(qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n.' +
                  'Please, put QUAST in a different directory, then try again.\n', exit_code=3)

    if not args:
        qconfig.usage(stream=sys.stderr)
        sys.exit(1)

    try:
        import imp
        imp.reload(qconfig)
        imp.reload(qutils)
    except:
        reload(qconfig)
        reload(qutils)

    try:
        locale.setlocale(locale.LC_ALL, 'en_US.utf8')
    except Exception:
        try:
            locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
        except Exception:
            logger.warning('Python locale settings can\'t be changed')
    quast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, quast_path + args)
    output_dirpath, ref_fpath, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
    logger.main_info()
    logger.print_params()

    ########################################################################
    from quast_libs import reporting
    reports = reporting.reports
    try:
        import imp
        imp.reload(reporting)
    except:
        reload(reporting)
    reporting.reports = reports
    reporting.assembly_fpaths = []
    from quast_libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    qconfig.set_max_threads(logger)
    check_reads_fpaths(logger)
    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        original_ref_fpath = ref_fpath
        ref_fpath = qutils.correct_reference(ref_fpath, corrected_dirpath)
        if qconfig.ideal_assembly:
            ideal_assembly_fpath = ideal_assembly.do(ref_fpath, original_ref_fpath,
                                                     os.path.join(output_dirpath, qconfig.ideal_assembly_basename))
            if ideal_assembly_fpath is not None:
                contigs_fpaths.insert(0, ideal_assembly_fpath)
                labels.insert(0, 'IDEAL ASSEMBLY')
                labels = qutils.process_labels(contigs_fpaths, labels)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')

    contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    cov_fpath = qconfig.cov_fpath
    physical_cov_fpath = qconfig.phys_cov_fpath
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_sam or qconfig.sam_fpaths or qconfig.bam_fpaths:
        bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do(ref_fpath, contigs_fpaths,
                                                                     os.path.join(output_dirpath, qconfig.reads_stats_dirname),
                                                                     external_logger=logger)
        qconfig.bed = bed_fpath

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
              "Please, provide different files or decrease --min-contig threshold.",
              fake_if_nested_run=True)
        return 4

    if qconfig.used_colors and qconfig.used_ls:
        for i, label in enumerate(labels):
            plotter_data.dict_color_and_ls[label] = (qconfig.used_colors[i], qconfig.used_ls[i])

    qconfig.assemblies_fpaths = contigs_fpaths

    # Where all pdfs will be saved
    all_pdf_fpath = None
    if qconfig.draw_plots and plotter.can_draw_plots:
        all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)

    if qconfig.json_output_dirpath:
        from quast_libs.html_saver import json_saver
        if json_saver.simplejson_error:
            qconfig.json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from quast_libs import basic_stats
    icarus_gc_fpath, circos_gc_fpath = basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'), output_dirpath)

    if qconfig.large_genome and ref_fpath:
        unique_kmers.do(os.path.join(output_dirpath, 'basic_stats'), ref_fpath, contigs_fpaths, logger)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    icarus_html_fpath = None
    circos_png_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from quast_libs import contigs_analyzer
        is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, is_cyclic, os.path.join(output_dirpath, 'contigs_reports'),
            old_contigs_fpaths, qconfig.bed)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    features_containers = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from quast_libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from quast_libs import genome_analyzer
        features_containers = genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath,
            qconfig.genes, qconfig.operons, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    genes_by_labels = None
    if qconfig.gene_finding:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from quast_libs import glimmer
            genes_by_labels = glimmer.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'))
        if not qconfig.glimmer or qconfig.test:
            ########################################################################
            ### GeneMark
            ########################################################################
            from quast_libs import genemark
            genes_by_labels = genemark.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'),
                        qconfig.prokaryote, qconfig.metagenemark)
    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")

    if qconfig.rna_gene_finding:
        run_barrnap.do(contigs_fpaths, os.path.join(output_dirpath, 'predicted_genes'), logger)

    if qconfig.run_busco and not qconfig.is_combined_ref:
        if qconfig.platform_name == 'macosx':
            logger.main_info("")
            logger.warning("BUSCO can be run on Linux only")
        elif sys.version[0:3] == '2.5':
            logger.main_info("")
            logger.warning("BUSCO does not support Python versions older than 2.6.")
        else:
            from quast_libs import run_busco
            run_busco.do(contigs_fpaths, os.path.join(output_dirpath, qconfig.busco_dirname), logger)
    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots or qconfig.create_icarus_html:
        logger.print_timestamp()
        logger.main_info('Creating large visual summaries...')
        logger.main_info('This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath:
                report_for_icarus_fpath_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.icarus_report_fname_pattern)
                stdout_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.contig_report_fname_pattern)
            else:
                report_for_icarus_fpath_pattern = None
                stdout_pattern = None
            draw_alignment_plots = qconfig.draw_svg or qconfig.create_icarus_html
            draw_circos_plot = qconfig.draw_plots and ref_fpath and len(aligned_contigs_fpaths) and not qconfig.space_efficient
            number_of_steps = sum([int(bool(value)) for value in [draw_alignment_plots, draw_circos_plot, all_pdf_fpath]])
            if draw_alignment_plots:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info('  1 of %d: Creating Icarus viewers...' % number_of_steps)
                from quast_libs import icarus
                icarus_html_fpath, contig_alignment_plot_fpath = icarus.do(
                    contigs_fpaths, report_for_icarus_fpath_pattern, output_dirpath, ref_fpath,
                    stdout_pattern=stdout_pattern, features=features_containers,
                    cov_fpath=cov_fpath, physical_cov_fpath=physical_cov_fpath, gc_fpath=icarus_gc_fpath,
                    json_output_dir=qconfig.json_output_dirpath, genes_by_labels=genes_by_labels)

            if draw_circos_plot:
                logger.main_info('  %d of %d: Creating Circos plots...' % (2 if draw_alignment_plots else 1, number_of_steps))
                from quast_libs import circos
                circos_png_fpath, circos_legend_fpath = circos.do(ref_fpath, contigs_fpaths, report_for_icarus_fpath_pattern, circos_gc_fpath,
                                                                  features_containers, cov_fpath, os.path.join(output_dirpath, 'circos'), logger)

            if all_pdf_fpath:
                # full report in PDF format: all tables and plots
                logger.main_info('  %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_fpath)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            if all_pdf_fpath and os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info('  Text versions of total report are saved to ' + reports_fpaths)
    logger.main_info('  Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter_data.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if all_pdf_fpath and os.path.isfile(all_pdf_fpath):
        logger.main_info('  PDF version (tables and plots) is saved to ' + all_pdf_fpath)

    if circos_png_fpath:
        logger.main_info('  Circos plot is saved to %s (the annotation is in %s). Circos configuration file is saved to %s' %
                         (circos_png_fpath, circos_legend_fpath, circos_png_fpath.replace('.png', '.conf')))

    if icarus_html_fpath:
        logger.main_info('  Icarus (contig browser) is saved to %s' % icarus_html_fpath)

    if qconfig.draw_svg and contig_alignment_plot_fpath:
        logger.main_info('  Contig alignment plot is saved to %s' % contig_alignment_plot_fpath)

    cleanup(corrected_dirpath)
    return logger.finish_up(check_test=qconfig.test)
Beispiel #57
0
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning('  Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
    else:
        successful = install_genemark(os.path.join(qconfig.LIBS_LOCATION, 'genemark', qconfig.platform_name))
        if not successful:
            return

        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
            index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths))

        genes_by_labels = dict()
        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            label = qutils.label_from_fpath(fasta_path)
            genes_by_labels[label], unique_count, count = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
            if count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES, count)
            if unique_count is None and count is None:
                logger.error('  ' + qutils.index_to_str(i) +
                     'Failed predicting genes in ' + label + '. ' +
                     ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                         if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000 else ''))

        if not qconfig.debug:
            for dirpath in glob.iglob(tmp_dirpath + '*'):
                if os.path.isdir(dirpath):
                    shutil.rmtree(dirpath)

        logger.main_info('Done.')
        return genes_by_labels
Beispiel #58
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                         'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                         ' to the reference (non-zero exit code). ' +
                         ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    coords_file = open(coords_fpath)
    coords_filtered_file = open(coords_filtered_fpath, 'w')
    coords_filtered_file.write(coords_file.readline())
    coords_filtered_file.write(coords_file.readline())
    for line in coords_file:
        if line.strip() == '':
            break
        assert line[0] != '='
        #Clear leading spaces from nucmer output
        #Store nucmer lines in an array
        mapping = Mapping.from_line(line)
        aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n') # TODO: move up
    ref_lens = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        ref_lens[name] = len(seq)
        log_out_f.write('\tLoaded [%s]\n' % name)

    #Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        prev_line = None
        for line in open_gzipsafe(show_snps_fpath):
            #print "$line";
            line = line.split()
            if not line[0].isdigit():
                continue
            if prev_line and line == prev_line:
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(line[0]) # Kolya: python don't convert int<->str types automatically
            loc = int(line[3]) # Kolya: same as above

            # if (! exists $line[11]) { die "Malformed line in SNP file.  Please check that show-snps has completed succesfully.\n$line\n[$line[9]][$line[10]][$line[11]]\n"; }
            if pos in snps.setdefault(ref, {}).setdefault(ctg, {}):
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos].append(SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            else:
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos] = [SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])]
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in ref_lens.items():
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)

    # if qconfig.large_genome:
    #     log_out_f.write('Analyzing large blocks...\n')
    #     large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null'
    #     ca_large_output = CAOutput(stdout_f=log_out_f, misassembly_f=open(large_misassembly_fpath, 'w'),
    #                                coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'), icarus_out_f=open('/dev/null', 'w'))
    #     min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD
    #     result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null',
    #                                   aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0])
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(contig)[0][0]
                                contig_cov = len_cov_pattern.findall(contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
Beispiel #59
0
def main(args):
    check_dirpath(qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '.\n' +
                  'Please, put QUAST in a different directory, then try again.\n', exit_code=3)

    if not args:
        qconfig.usage(stream=sys.stderr)
        sys.exit(1)

    metaquast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, metaquast_path + args)
    output_dirpath, ref_fpaths, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    html_report = qconfig.html_report
    test_mode = qconfig.test

    # Directories
    output_dirpath, _, _ = qutils.set_up_output_dir(
        output_dirpath, None, not output_dirpath,
        save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    qconfig.set_max_threads(logger)
    qutils.logger = logger

    ########################################################################

    from quast_libs import reporting
    try:
        import imp
        imp.reload(reporting)
    except:
        reload(reporting)
    from quast_libs import plotter

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')

        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            correct_meta_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    qconfig.no_check_meta = True
    assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.")
        return 4

    # Running QUAST(s)
    if qconfig.gene_finding:
        quast_py_args += ['--mgm']
    if qconfig.min_IDY is None: # special case: user not specified min-IDY, so we need to use MetaQUAST default value
        quast_py_args += ['--min-identity', str(qconfig.META_MIN_IDY)]

    downloaded_refs = False

    # SEARCHING REFERENCES
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice("Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled")
        else:
            if qconfig.references_txt:
                logger.main_info("List of references was provided, starting to download reference genomes from NCBI...")
            else:
                logger.main_info("No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                        "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath, corrected_dirpath, qconfig.references_txt)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not qconfig.references_txt:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=True)
            elif test_mode and not ref_fpaths:
                logger.error('Failed to download or setup SILVA 16S rRNA database for working without '
                             'references on metagenome datasets!', to_stderr=True, exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice('No references are provided, starting regular QUAST with MetaGeneMark gene finder')
        assemblies = [Assembly(fpath, qutils.label_from_fpath(fpath)) for fpath in contigs_fpaths]
        _start_quast_main(quast_py_args, assemblies=assemblies, output_dirpath=output_dirpath, run_regular_quast=True)
        exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)
    qconfig.reference = combined_ref_fpath

    if qconfig.bed:
        quast_py_args += ['--sv-bed']
        quast_py_args += [qconfig.bed]

    quast_py_args += ['--combined-ref']
    if qconfig.draw_plots or qconfig.html_report:
        if plotter_data.dict_color_and_ls:
            colors_and_ls = [plotter_data.dict_color_and_ls[asm.label] for asm in assemblies]
            quast_py_args += ['--colors']
            quast_py_args += [','.join([style[0] for style in colors_and_ls])]
            quast_py_args += ['--ls']
            quast_py_args += [','.join([style[1] for style in colors_and_ls])]
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    if qconfig.html_report:
        from quast_libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None
    if qconfig.unique_mapping:
        ambiguity_opts = []
    else:
        ambiguity_opts = ["--ambiguity-usage", 'all']
    return_code, total_num_notifications = \
        _start_quast_main(quast_py_args + ambiguity_opts,
        labels=labels,
        assemblies=assemblies,
        reference_fpath=combined_ref_fpath,
        output_dirpath=combined_output_dirpath,
        num_notifications_tuple=total_num_notifications,
        is_combined_ref=True)

    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name, 'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    if not os.path.exists(genome_info_fpath):
        logger.main_info('')
        if not downloaded_refs:
            msg = 'Try to restart MetaQUAST with another references.'
        else:
            msg = 'Try to use option --max-ref-number to change maximum number of references (per each assembly) to download.'
        logger.main_info('Failed aligning the contigs for all the references. ' + msg)
        logger.main_info('')
        cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if downloaded_refs and return_code == 0:
        logger.main_info()
        logger.main_info('Excluding downloaded references with low genome fraction from further analysis..')
        corr_ref_fpaths = get_downloaded_refs_with_alignments(genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = OrderedDict()
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names = \
                correct_meta_references(corr_ref_fpaths, corrected_dirpath)
            assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications = \
                _start_quast_main(quast_py_args + ambiguity_opts,
                labels=labels,
                assemblies=assemblies,
                reference_fpath=combined_ref_fpath,
                output_dirpath=combined_output_dirpath,
                num_notifications_tuple=total_num_notifications,
                is_combined_ref=True)
            if json_texts is not None:
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info('All downloaded references have genome fraction more than 10%. Nothing was excluded.')
        else:
            logger.main_info('All downloaded references have low genome fraction. Nothing was excluded for now.')

    if return_code != 0:
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if qconfig.calculate_read_support:
        calculate_ave_read_support(combined_output_dirpath, assemblies)

    prepare_regular_quast_args(quast_py_args, combined_output_dirpath)
    logger.main_info()
    logger.main_info('Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, 'contigs_reports', 'alignments_%s.tsv'), labels)

    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    if not qconfig.memory_efficient and \
                    len(assemblies_by_reference) > len(assemblies) and len(assemblies) < qconfig.max_threads:
        logger.main_info()
        logger.main_info('Run QUAST on different references in parallel..')
        threads_per_ref = max(1, qconfig.max_threads // len(assemblies_by_reference))
        quast_py_args += ['--memory-efficient']
        quast_py_args += ['-t', str(threads_per_ref)]

        num_notifications = (0, 0, 0)
        parallel_run_args = [(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, num_notifications, True)
                             for ref_fpath, ref_assemblies in assemblies_by_reference]
        ref_names, ref_json_texts, ref_notifications = \
            run_parallel(_run_quast_per_ref, parallel_run_args, qconfig.max_threads, filter_results=True)
        per_ref_num_notifications = list(map(sum, zip(*ref_notifications)))
        total_num_notifications = list(map(sum, zip(total_num_notifications, per_ref_num_notifications)))
        if json_texts is not None:
            json_texts.extend(ref_json_texts)
        quast_py_args.remove('--memory-efficient')
        quast_py_args = remove_from_quast_py_args(quast_py_args, '-t', str(threads_per_ref))
    else:
        ref_names = []
        for ref_fpath, ref_assemblies in assemblies_by_reference:
            ref_name, json_text, total_num_notifications = \
                _run_quast_per_ref(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, total_num_notifications)
            if not ref_name:
                continue
            ref_names.append(ref_name)
            if json_texts is not None:
                json_texts.append(json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '... (logging to ' +
                        os.path.join(output_dirpath, qconfig.not_aligned_name, qconfig.LOGGER_DEFAULT_NAME + '.log)'))

        return_code, total_num_notifications = _start_quast_main(quast_py_args + ['-t', str(qconfig.max_threads)],
            assemblies=not_aligned_assemblies,
            output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
            num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error('Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    if ref_names:
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from quast_libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(output_dirpath)
        else:
            html_summary_report_fpath = None
        from quast_libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembly_metrics = [reporting.Fields.MIS_RELOCATION, reporting.Fields.MIS_TRANSLOCATION, reporting.Fields.MIS_INVERTION,
                              reporting.Fields.MIS_ISTRANSLOCATIONS]
        if no_unaligned_contigs:
            full_ref_names = [qutils.name_from_fpath(ref_fpath) for ref_fpath in corrected_ref_fpaths]
        else:
            full_ref_names = [qutils.name_from_fpath(ref_fpath) for ref_fpath in corrected_ref_fpaths] + [qconfig.not_aligned_name]
        create_meta_summary.do(html_summary_report_fpath, summary_output_dirpath, combined_output_dirpath,
                               output_dirpath_per_ref, metrics_for_plots, misassembly_metrics, full_ref_names)
        if html_report and json_texts:
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter_data.dict_color_and_ls, meta=True)
            if qconfig.create_icarus_html:
                icarus_html_fpath = html_saver.create_meta_icarus(output_dirpath, ref_names)
                logger.main_info('  Icarus (contig browser) is saved to %s' % icarus_html_fpath)
            html_saver.create_meta_report(output_dirpath, json_texts)

    cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)