def save_features_in_contigs(output_dirpath, contigs_fpaths, feature_name, features_in_contigs, ref_features_num):
    """Serialize per-assembly feature counts (genes/operons) for the HTML report.

    Writes a JSON-like record keyed by assembly label and returns whatever
    the module-level save() helper returns.
    """
    labels = [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths]
    # counts keyed by assembly label rather than by file path
    counts_by_label = {qutils.label_from_fpath(fpath): amounts
                       for fpath, amounts in features_in_contigs.items()}
    return save(join(output_dirpath, feature_name + in_contigs_suffix_fn), {
        'filenames': labels,
        feature_name + '_in_contigs': counts_by_label,
        'ref_' + feature_name + '_number': ref_features_num,
    })
def save_features_in_contigs(output_dirpath, contigs_fpaths, feature_name, features_in_contigs, ref_features_num):
    """Persist feature-in-contigs statistics for one feature type.

    The payload maps each assembly label to its feature counts and records
    the total number of features in the reference.
    """
    out_fpath = join(output_dirpath, feature_name + in_contigs_suffix_fn)
    payload = {
        'filenames': [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths],
        feature_name + '_in_contigs': {qutils.label_from_fpath(fpath): amounts
                                       for fpath, amounts in features_in_contigs.items()},
        'ref_' + feature_name + '_number': ref_features_num,
    }
    return save(out_fpath, payload)
def contigs_GC_content_plot(contigs_fpath, GC_distributions, plot_fpath):
    """Draw a per-assembly GC-content histogram (one Bar per GC bin)."""
    if not can_draw_plots or qconfig.no_gc:
        return
    label = label_from_fpath(contigs_fpath)
    title = label + ' GC content'
    logger.info(' Drawing ' + title + ' plot...')
    color, ls = get_color_and_ls(contigs_fpath)
    bin_centers, bin_counts = GC_distributions
    # one bar per GC percentage bin; width 5 matches the binning step
    bars = [Bar(gc_x, gc_y, color, width=5) for gc_x, gc_y in zip(bin_centers, bin_counts)]
    create_plot(plot_fpath, title, bars, [label],
                x_label='GC (%)', y_label='# contigs', x_limit=[0, 100])
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function, prokaryote, num_threads):
    """Run a GeneMark-family predictor on one assembly and save GFF/FASTA output.

    Returns (genes, unique_count, count) where count[i] is the number of
    genes longer than gene_lengths[i]; unique_count and count are None when
    prediction produced no genes.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)
    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_genemark.stderr')
    # gmhmm_p_function is the tool-specific runner (prokaryote vs eukaryote variant)
    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath, num_threads)
    if not genes:
        unique_count = None
        count = None  # [None] * len(gene_lengths)
    else:
        tool_name = "genemark"
        out_gff_fpath = os.path.join(out_dirpath, corr_assembly_label + '_' + tool_name + '_genes.gff' + ('.gz' if not qconfig.no_gzip else ''))
        add_genes_to_gff(genes, out_gff_fpath, prokaryote)
        if OUTPUT_FASTA:
            out_fasta_fpath = os.path.join(out_dirpath, corr_assembly_label + '_' + tool_name + '_genes.fasta')
            add_genes_to_fasta(genes, out_fasta_fpath)
        # count[i] = number of predicted genes longer than threshold gene_lengths[i]
        count = [sum([gene.end - gene.start > x for gene in genes]) for x in gene_lengths]
        # de-duplicate by sequence when available, otherwise by name
        gene_ids = [gene.seq if gene.seq else gene.name for gene in genes]
        unique_count = len(set(gene_ids))
        total_count = len(genes)
        logger.info(' ' + qutils.index_to_str(index) + '  Genes = ' + str(unique_count) + ' unique, ' + str(total_count) + ' total')
        logger.info(' ' + qutils.index_to_str(index) + '  Predicted genes (GFF): ' + out_gff_fpath)
    return genes, unique_count, count
def save_colors_and_ls(fpaths, labels=None):
    """Populate the module-level color/line-style palette for the given assemblies.

    Idempotent: does nothing if dict_color_and_ls has already been filled.
    """
    if not labels:
        labels = [qutils.label_from_fpath(fpath) for fpath in fpaths]
    if not dict_color_and_ls:
        color_id = 0
        for i, fpath in enumerate(fpaths):
            ls = primary_line_style
            label = labels[i]
            # contigs and scaffolds should be equally colored but scaffolds should be dashed
            if fpath and fpath in qconfig.dict_of_broken_scaffolds:
                # reuse the color of the original (unbroken) scaffold assembly
                color = dict_color_and_ls[qutils.label_from_fpath(qconfig.dict_of_broken_scaffolds[fpath])][0]
                ls = secondary_line_style
            else:
                # cycle through the fixed palette
                color = colors[color_id % len(colors)]
                color_id += 1
            dict_color_and_ls[label] = (color, ls)
def get_color_and_ls(fpath, label=None):
    """Return the (color, line style) tuple registered for an assembly.

    Fix: the original docstring was placed after executable code, so it was a
    no-op expression statement and never registered as the function's __doc__.
    """
    if not label:
        label = qutils.label_from_fpath(fpath)
    return dict_color_and_ls[label]
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE evaluation shell script on one assembly.

    Stdout/stderr are captured into per-assembly log files in
    gage_results_dirpath. Returns the subprocess return code.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(i) + assembly_label + '...')
    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stderr')
    logger.info(' ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')
    # fix: use context managers so both log handles are closed even if
    # call_subprocess raises (the original leaked them on an exception)
    with open(log_out_fpath, 'w') as log_out_f, open(log_err_fpath, 'w') as log_err_f:
        return_code = qutils.call_subprocess(
            ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath),
             reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)],
            stdout=log_out_f, stderr=log_err_f,
            indent=' ' + qutils.index_to_str(i), only_if_debug=False)
    if return_code != 0:
        logger.info(' ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info(' ' + qutils.index_to_str(i) + 'Done.')
    return return_code
def get(assembly_fpath, ref_name=None):
    """Fetch (or lazily create) the Report object for an assembly/reference pair."""
    if not ref_name and qconfig.reference:
        ref_name = qutils.name_from_fpath(qconfig.reference)
    if assembly_fpath not in assembly_fpaths:
        assembly_fpaths.append(assembly_fpath)
    # reports are keyed by (absolute path, reference name)
    key = (os.path.abspath(assembly_fpath), ref_name)
    return reports.setdefault(key, Report(qutils.label_from_fpath(assembly_fpath)))
def calculate_ave_read_support(combined_output_dirpath, assemblies):
    """Compute length-weighted average read coverage per (assembly, reference)
    and store it in the corresponding report as AVE_READ_SUPPORT.
    """
    unique_contigs_fpath = os.path.join(combined_output_dirpath, 'contigs_reports', qconfig.unique_contigs_fname_pattern)
    for assembly in assemblies:
        aligned_contigs_by_ref = dict()
        assembly_label = qutils.label_from_fpath(assembly.fpath)
        corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
        # the per-assembly file has one tab-separated line per aligned contig:
        # reference name, contig length, contig coverage
        with open(unique_contigs_fpath % corr_assembly_label) as in_f:
            for line in in_f:
                ref_name, contig_len, contig_cov = line.strip().split('\t')
                aligned_contigs_by_ref.setdefault(ref_name, []).append((float(contig_len), float(contig_cov)))
        for ref_name, contigs in aligned_contigs_by_ref.items():
            # length-weighted mean coverage over the contigs aligned to this reference
            ref_cov = sum(contig_cov * aligned_len for (aligned_len, contig_cov) in contigs)
            ref_cov /= sum(aligned_len for (aligned_len, contig_cov) in contigs)
            corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
            ref_contigs_fpath = os.path.join(os.path.dirname(assembly.fpath), corr_assembly_label + '_to_' + ref_name + '.fasta')
            qconfig.assembly_labels_by_fpath[ref_contigs_fpath] = assembly_label
            report = reporting.get(ref_contigs_fpath, ref_name=ref_name)
            report.add_field(reporting.Fields.AVE_READ_SUPPORT, '%.2f' % ref_cov)
def GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions, plot_fpath):
    """Draw the combined GC-content plot for all assemblies (plus reference).

    NOTE: list_of_GC_distributions is assumed to be ordered like
    contigs_fpaths with the reference distribution last when ref_fpath is
    given — the i-th distribution is colored by all_fpaths[i].
    """
    if not can_draw_plots or qconfig.no_gc:
        return
    title = 'GC content'
    logger.info(' Drawing ' + title + ' plot...')
    plots = []
    all_fpaths = contigs_fpaths
    if ref_fpath:
        all_fpaths = contigs_fpaths + [ref_fpath]
    for i, (GC_distribution_x, GC_distribution_y) in enumerate(list_of_GC_distributions):
        # for log scale: zero counts are mutated in place to a small epsilon
        for id2, v in enumerate(GC_distribution_y):
            if v == 0:
                GC_distribution_y[id2] = 0.1
        # add to plot; the last distribution belongs to the reference
        if ref_fpath and (i == len(all_fpaths) - 1):
            color = reference_color
            ls = reference_ls
        else:
            color, ls = get_color_and_ls(all_fpaths[i])
        plots.append(Plot(GC_distribution_x, GC_distribution_y, color, ls))
    legend_list = [label_from_fpath(fpath) for fpath in contigs_fpaths]
    if ref_fpath:
        legend_list += ['Reference']
    create_plot(plot_fpath, title, plots, legend_list, x_label='GC (%)', y_label='# windows', x_limit=[0, 100])
def draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath):
    """Draw one combined coverage histogram plus a per-assembly histogram
    for every assembly that has coverage data.
    """
    total_len = dict()
    contigs_dict = dict()
    # assemblies without coverage data are skipped entirely
    contigs_with_coverage = [contigs_fpath for contigs_fpath in contigs_fpaths if coverage_dict[contigs_fpath]]
    for contigs_fpath in contigs_fpaths:
        total_len[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.TOTALLEN)
        contigs_dict[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.CONTIGS)
    cov_values = [coverage_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]
    num_contigs = [contigs_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]
    # common binning across all assemblies for the combined histogram
    common_coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage(cov_values, num_contigs)
    histogram_title = 'Coverage histogram (bin size: ' + str(bin_size) + 'x)'
    plotter.coverage_histogram(contigs_with_coverage, common_coverage_values, output_dirpath + '/coverage_histogram',
                               histogram_title, bin_size=bin_size, max_cov=max_cov,
                               low_threshold=low_threshold, high_threshold=high_threshold)
    for contigs_fpath in contigs_with_coverage:
        # per-assembly histogram: re-bin using only this assembly's data
        coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage([coverage_dict[contigs_fpath]], [contigs_dict[contigs_fpath]])
        label = qutils.label_from_fpath(contigs_fpath)
        corr_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        histogram_title = label + ' coverage histogram (bin size: ' + str(bin_size) + 'x)'
        histogram_fpath = os.path.join(output_dirpath, corr_label + '_coverage_histogram')
        plotter.coverage_histogram([contigs_fpath], coverage_values, histogram_fpath,
                                   histogram_title, draw_bars=True, bin_size=bin_size, max_cov=max_cov,
                                   low_threshold=low_threshold, high_threshold=high_threshold)
def save_coord(output_dirpath, coord_x, coord_y, name_coord, contigs_fpaths):
    """Persist plot coordinates plus assembly labels for the HTML report."""
    payload = {
        'coord_x': coord_x,
        'coord_y': coord_y,
        'filenames': [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths],
    }
    return save(join(output_dirpath, name_coord + suffix_fn), payload)
def Nx_plot(results_dir, reduce_points, contigs_fpaths, lists_of_lengths, plot_fpath, title='Nx', reference_lengths=None):
    """Draw an Nx-style plot (Nx/NAx/NGx/NGAx) and save its coordinates
    for the HTML report.

    Two parallel coordinate sets are built per assembly: (vals_Nx, vals_l)
    for the image plot (full resolution) and (vals_x, vals_y) for the HTML
    report (optionally thinned when reduce_points is set).
    """
    if can_draw_plots:
        logger.info(' Drawing ' + title + ' plot...')
        plots = []
    json_vals_x = []  # coordinates for Nx-like plots in HTML-report
    json_vals_y = []
    for id, (contigs_fpath, lengths) in enumerate(zip(contigs_fpaths, lists_of_lengths)):
        if not lengths:
            json_vals_x.append([])
            json_vals_y.append([])
            continue
        lengths.sort(reverse=True)
        vals_x = [0.0]
        vals_y = [lengths[0]]
        # calculate values for the plot
        vals_Nx = [0.0]
        vals_l = [lengths[0]]
        lcur = 0
        # if Nx-plot then we just use sum of contigs lengths, else use reference_length
        lsum = sum(lengths)
        if reference_lengths:
            lsum = reference_lengths[id]
        min_difference = 0
        if reduce_points:
            min_difference = qconfig.min_difference
        for l in lengths:
            lcur += l
            x = lcur * 100.0 / lsum
            if can_draw_plots:
                # a tiny eps x-step produces the step-function shape
                vals_Nx.append(vals_Nx[-1] + 1e-10)  # eps
                vals_l.append(l)
                vals_Nx.append(x)
                vals_l.append(l)
            # HTML coordinates are only appended when the drop is noticeable
            if vals_y[-1] - l > min_difference or len(vals_x) == 1:
                vals_x.append(vals_x[-1] + 1e-10)  # eps
                vals_y.append(l)
                vals_x.append(x)
                vals_y.append(l)
        # add to plot
        json_vals_x.append(vals_x)
        json_vals_y.append(vals_y)
        if can_draw_plots:
            # close the curve down to zero length at the right edge
            vals_Nx.append(vals_Nx[-1] + 1e-10)  # eps
            vals_l.append(0.0)
            vals_x.append(vals_x[-1] + 1e-10)  # eps
            vals_y.append(0.0)
            color, ls = get_color_and_ls(contigs_fpath)
            plots.append(Plot(vals_Nx, vals_l, color, ls))
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_coord(results_dir, json_vals_x, json_vals_y, 'coord' + title, contigs_fpaths)
    if not can_draw_plots:
        return
    legend_list = [label_from_fpath(fpath) for fpath in contigs_fpaths]
    create_plot(plot_fpath, title, plots, legend_list, x_label='x', y_label='Contig length', x_limit=[0, 100])
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath):
    """Run GlimmerHMM gene prediction on one assembly.

    Returns (genes, unique, full_genes, partial_genes) as produced by the
    glimmerHMM helper.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)
    out_fpath = os.path.join(out_dirpath, corr_assembly_label + '_glimmer')
    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_glimmer.stderr')
    out_gff_path, genes, unique, total, full_genes, partial_genes = glimmerHMM(
        tool_dirpath, contigs_fpath, out_fpath, gene_lengths, err_fpath, tmp_dirpath, index)
    # out_gff_path is falsy when GlimmerHMM failed for this assembly
    if out_gff_path:
        logger.info(' ' + qutils.index_to_str(index) + '  Genes = ' + str(unique) + ' unique, ' + str(total) + ' total')
        logger.info(' ' + qutils.index_to_str(index) + '  Predicted genes (GFF): ' + out_gff_path)
    return genes, unique, full_genes, partial_genes
def save_contigs_lengths(output_dirpath, contigs_fpaths, lists_of_lengths):
    """Persist per-assembly contig lengths (sorted descending) for the report.

    Fix: the original comprehension variable shadowed the builtin `list`.
    """
    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    return save(join(output_dirpath, contigs_lengths_fn), {
        'filenames': [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths],
        'lists_of_lengths': lists_of_lengths
    })
def save_GC_info(output_dirpath, contigs_fpaths, list_of_GC_distributions, reference_index):
    """Persist GC-distribution data for all assemblies (HTML report input)."""
    payload = {
        'filenames': [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths],
        'reference_index': reference_index,
        'list_of_GC_distributions': list_of_GC_distributions,
        'lists_of_gc_info': None,
    }
    return save(join(output_dirpath, gc_fn), payload)
def get_assemblies_data(contigs_fpaths, icarus_dirpath, stdout_pattern, nx_marks):
    """Build the JavaScript snippets describing per-assembly stats for Icarus.

    Returns (assemblies_data, assemblies_contig_size_data, assemblies_n50)
    where the first two are JS source strings and the last maps
    assembly label -> {Nx mark -> value}.
    """
    assemblies_n50 = defaultdict(dict)
    assemblies_data = ''
    assemblies_data += 'var assemblies_links = {};\n'
    assemblies_data += 'var assemblies_len = {};\n'
    assemblies_data += 'var assemblies_contigs = {};\n'
    assemblies_data += 'var assemblies_misassemblies = {};\n'
    assemblies_data += 'var assemblies_n50 = {};\n'
    assemblies_contig_size_data = ''
    for contigs_fpath in contigs_fpaths:
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)
        l = report.get_field(reporting.Fields.TOTALLEN)
        contigs = report.get_field(reporting.Fields.CONTIGS)
        n50 = report.get_field(reporting.Fields.N50)
        if stdout_pattern:
            # link each assembly to its aligner stdout log, relative to the Icarus dir
            contig_stdout_fpath = stdout_pattern % qutils.label_from_fpath_for_fname(contigs_fpath) + '.stdout'
            contig_stdout_fpath = qutils.relpath(contig_stdout_fpath, icarus_dirpath)
            assemblies_data += 'assemblies_links["' + assembly_label + '"] = "' + contig_stdout_fpath + '";\n'
        assemblies_contig_size_data += 'assemblies_len["' + assembly_label + '"] = ' + str(l) + ';\n'
        assemblies_contig_size_data += 'assemblies_contigs["' + assembly_label + '"] = ' + str(contigs) + ';\n'
        assemblies_contig_size_data += 'assemblies_n50["' + assembly_label + '"] = "' + str(n50) + '";\n'
        for nx in nx_marks:
            assemblies_n50[assembly_label][nx] = report.get_field(nx)
    return assemblies_data, assemblies_contig_size_data, assemblies_n50
def save_colors(results_dirpath, contigs_fpaths, dict_colors, meta=False):
    """Record assembly colors (and dashed 'broken scaffold' labels) for the
    HTML report; for meta reports just substitute the template placeholders.
    """
    if meta:
        html_fpath = os.path.join(results_dirpath, report_fname)
        with open(html_fpath) as f_html:
            html_text = f_html.read()
        html_text = re.sub('{{ ' + 'colors' + ' }}', 'standard_colors', html_text)
        html_text = re.sub('{{ ' + 'broken_scaffolds' + ' }}', '[]', html_text)
        with open(html_fpath, 'w') as f_html:
            f_html.write(html_text)
        return
    contig_labels = [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths]
    colors_and_ls = [dict_colors[lbl] for lbl in contig_labels]
    # map plotter palette colors to their HTML equivalents
    colors_for_html = [html_colors[plotter_data.colors.index(color)]
                       for color, _ in colors_and_ls]
    save_record(results_dirpath, 'colors', colors_for_html)
    # dashed line style marks assemblies derived from broken scaffolds
    broken_contig_names = [lbl for lbl, (_, ls) in zip(contig_labels, colors_and_ls)
                           if ls == secondary_line_style]
    save_record(results_dirpath, 'broken_scaffolds', broken_contig_names)
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Persist the full grouped report (plus per-reference subreports for a
    combined reference run) as the total-report record."""
    from quast_libs import reporting
    asm_names = [qutils.label_from_fpath(p) for p in reporting.assembly_fpaths]
    report = reporting.table(reporting.Fields.grouped_order)
    subreports = []
    ref_names = []
    if qconfig.is_combined_ref and ref_labels_by_chromosomes:
        # one subreport per reference genome in the combined reference
        ref_names = sorted(set(ref_labels_by_chromosomes.values()))
        subreports = [reporting.table(reporting.Fields.grouped_order, ref_name=name)
                      for name in ref_names]
    now = datetime.datetime.now()
    payload = {
        'date': now.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': asm_names,
        'referenceName': qutils.name_from_fpath(ref_fpath) if ref_fpath else '',
        'order': [i for i, _ in enumerate(asm_names)],
        'report': report,
        'subreferences': ref_names,
        'subreports': subreports,
        'minContig': min_contig
    }
    return save(join(output_dirpath, total_report_fname), payload)
def genes_operons_plot(reference_value, contigs_fpaths, files_feature_in_contigs, plot_fpath, title):
    """Draw the cumulative genes/operons plot across contigs for each assembly."""
    if not can_draw_plots:
        return
    logger.info(' Drawing ' + title + ' cumulative plot...')
    plots = []
    max_x = 0
    for contigs_fpath in contigs_fpaths:
        # running total of features found up to each contig index
        feature_in_contigs = files_feature_in_contigs[contigs_fpath]
        x_vals = list(range(len(feature_in_contigs) + 1))
        running = 0
        y_vals = [0]
        for amount in feature_in_contigs:
            running += amount
            y_vals.append(running)
        if len(x_vals) > 0:
            max_x = max(x_vals[-1], max_x)
        color, ls = get_color_and_ls(contigs_fpath)
        plots.append(Plot(x_vals, y_vals, color, ls))
    if reference_value:
        # horizontal line marking the reference's feature count
        plots.append(Plot([0, max_x], [reference_value, reference_value], reference_color, reference_ls))
    title = 'Cumulative # complete ' + title
    legend_list = [label_from_fpath(fpath) for fpath in contigs_fpaths]
    if reference_value:
        legend_list += ['Reference']
    create_plot(plot_fpath, title, plots, legend_list, x_label='Contig index', y_label=title)
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    """Split one assembly's contigs into per-reference FASTA files using the
    precomputed alignments, and collect contigs aligned nowhere.

    Returns (assemblies_by_ref, not_aligned_asm) where not_aligned_asm is an
    Assembly of the contigs that aligned to no reference.
    """
    assembly_label = qutils.label_from_fpath(asm.fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(asm.fpath)
    logger.info(' ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = corr_assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    alignments_fpath = alignments_fpath_template % corr_assembly_label
    if os.path.exists(alignments_fpath):
        with open(alignments_fpath) as f:
            # each line: chromosome name followed by names of contigs aligned to it
            for line in f:
                values = line.split()
                if values[0] in contigs_analyzer.ref_labels_by_chromosomes.keys():
                    ref_name = contigs_analyzer.ref_labels_by_chromosomes[values[0]]
                    ref_contigs_names = values[1:]
                    ref_contigs_fpath = os.path.join(
                        corrected_dirpath, corr_assembly_label + '_to_' + ref_name + '.fasta')
                    if ref_name not in aligned_contigs_for_each_ref:
                        aligned_contigs_for_each_ref[ref_name] = []
                    for (cont_name, seq) in contigs_seq:
                        if not cont_name in contigs:
                            contigs[cont_name] = seq
                        if cont_name in ref_contigs_names and cont_name not in aligned_contigs_for_each_ref[ref_name]:
                            # Collecting all aligned contigs names in order to further extract not aligned
                            aligned_contig_names.add(cont_name)
                            aligned_contigs_for_each_ref[ref_name].append(cont_name)
                            # append mode: the per-reference FASTA accumulates contigs
                            fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')
                    ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                    if ref_asm.name not in added_ref_asm:
                        if ref_name in assemblies_by_ref:
                            assemblies_by_ref[ref_name].append(ref_asm)
                            added_ref_asm.append(ref_asm.name)
        if qconfig.space_efficient:
            os.remove(alignments_fpath)
    # Extraction not aligned contigs
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath, [(name, contigs[name]) for name in not_aligned_contigs_names])
    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
def save_assembly_lengths(output_dirpath, contigs_fpaths, assemblies_lengths):
    """Persist total assembly lengths keyed by assembly label order."""
    payload = {
        'filenames': [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths],
        'assemblies_lengths': assemblies_lengths,
    }
    return save(join(output_dirpath, assemblies_lengths_fn), payload)
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger):
    """Write per-assembly interspecies-translocation tables (one .info file
    per assembly) and, when plotting is enabled, draw the summary plot of
    intergenomic misassemblies across references.
    """
    ref_misassemblies = [result['istranslocations_by_refs'] if result else [] for result in results]
    potential_misassemblies_by_refs = [result['potential_misassemblies_by_refs'] if result else [] for result in results]
    all_refs = sorted(list(set([ref for ref in ref_labels_by_chromosomes.values()])))
    misassemblies_by_refs_rows = []
    row = {'metricName': 'References', 'values': all_refs}
    misassemblies_by_refs_rows.append(row)
    if ref_misassemblies:
        for i, fpath in enumerate(contigs_fpaths):
            row = {'metricName': qutils.label_from_fpath(fpath), 'values': []}
            misassemblies_by_refs_rows.append(row)
            if ref_misassemblies[i]:
                assembly_name = qutils.name_from_fpath(fpath)
                all_rows = []
                # header row enumerates references by 1-based index
                row = {'metricName': 'References', 'values': [ref_num + 1 for ref_num in range(len(all_refs))]}
                all_rows.append(row)
                for k in all_refs:
                    # matrix row: translocations between reference k and every other reference
                    row = {'metricName': k, 'values': []}
                    for ref in all_refs:
                        if ref == k or ref not in ref_misassemblies[i]:
                            row['values'].append(None)
                        else:
                            row['values'].append(ref_misassemblies[i][ref][k])
                    # per-assembly total for reference k = found + supposed (clamped at 0)
                    misassemblies_by_refs_rows[-1]['values'].append(max(0, sum([r for r in row['values'] if r]) + potential_misassemblies_by_refs[i][k]))
                    all_rows.append(row)
                misassembly_by_ref_fpath = os.path.join(output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
                with open(misassembly_by_ref_fpath, 'w') as misassembly_by_ref_file:
                    misassembly_by_ref_file.write('Number of interspecies translocations by references: \n')
                print_file(all_rows, misassembly_by_ref_fpath, append_to_existing_file=True)
                with open(misassembly_by_ref_fpath, 'a') as misassembly_by_ref_file:
                    misassembly_by_ref_file.write('References:\n')
                    for ref_num, ref in enumerate(all_refs):
                        misassembly_by_ref_file.write(str(ref_num + 1) + ' - ' + ref + '\n')
                logger.info(' Information about interspecies translocations by references for %s is saved to %s' % (assembly_name, misassembly_by_ref_fpath))
    misassemblies = []
    if qconfig.draw_plots:
        from quast_libs import plotter
        aligned_contigs_labels = []
        # note: iterating over a slice (copy), so removing from the original list is safe
        for row in misassemblies_by_refs_rows[1:]:
            if row['values']:
                aligned_contigs_labels.append(row['metricName'])
            else:
                misassemblies_by_refs_rows.remove(row)
        for i in range(len(all_refs)):
            cur_results = []
            for row in misassemblies_by_refs_rows[1:]:
                if row['values']:
                    cur_results.append(row['values'][i])
            misassemblies.append(cur_results)
        is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies.' + qconfig.plot_extension)
        plotter.draw_meta_summary_plot('', output_dir, aligned_contigs_labels, all_refs,
                                       misassemblies_by_refs_rows, misassemblies,
                                       is_translocations_plot_fpath,
                                       title='Intergenomic misassemblies (found and supposed)',
                                       reverse=False, yaxis_title=None, print_all_refs=True)
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Run GlimmerHMM over all assemblies (optionally in parallel) and record
    predicted-gene statistics in each assembly's report.

    Returns a dict mapping assembly label -> predicted genes, or None if the
    tool could not be compiled.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')
    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return
    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
                   for index, contigs_fpath in enumerate(contigs_fpaths)]
    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = ['%s + %s part' % (full_cnt, partial_cnt)
                     for full_cnt, partial_cnt in zip(full_genes, partial_genes)]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            # fix: '%' binds tighter than '+', so in the original the label was
            # applied to the parenthesized suffix and the '%s' placeholder in
            # the main message was never filled
            logger.error('Glimmer failed running Glimmer for %s. ' % label +
                         ('Run with the --debug option to see the command line.'
                          if not qconfig.debug else ''))
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.main_info('Done.')
    return genes_by_labels
def get_color_and_ls(fpath, label=None):
    """Return tuple: (color, line style) for the assembly, or (None, None)
    when the palette has not been initialized yet.

    Fix: the original docstring sat after executable code, so it never
    registered as the function's __doc__.
    """
    from quast_libs import qutils
    if not label:
        label = qutils.label_from_fpath(fpath)
    if not dict_color_and_ls:
        return None, None
    return dict_color_and_ls[label]
def save_coord(output_dirpath, coord_x, coord_y, name_coord, contigs_fpaths):
    """Save x/y plot coordinates together with assembly labels."""
    coord_fn = name_coord + suffix_fn
    labels = [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths]
    return save(join(output_dirpath, coord_fn),
                {'coord_x': coord_x, 'coord_y': coord_y, 'filenames': labels})
def save_contigs_lengths(output_dirpath, contigs_fpaths, lists_of_lengths):
    """Persist per-assembly contig lengths, each list sorted descending.

    Fix: the original comprehension variable shadowed the builtin `list`.
    """
    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    return save(
        join(output_dirpath, contigs_lengths_fn), {
            'filenames': [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths],
            'lists_of_lengths': lists_of_lengths
        })
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    """Copy BWA read-alignment statistics from the per-assembly (and optional
    reference) .stat files into the corresponding report fields.
    """
    from quast_libs import reporting
    ref_reads_stats = None
    if ref_fpath:
        ref_name = qutils.name_from_fpath(ref_fpath)
        stats_fpath = join(output_dir, ref_name + '.stat')
        if isfile(stats_fpath):
            ref_reads_stats = parse_reads_stats(stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info(' BWA: nothing aligned for reference.')
    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')
        if ref_reads_stats:
            # reference-level stats are duplicated into every assembly's report
            report.add_field(reporting.Fields.REF_MAPPED_READS, ref_reads_stats['mapped'])
            report.add_field(reporting.Fields.REF_MAPPED_READS_PCNT, ref_reads_stats['mapped_pcnt'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS, ref_reads_stats['paired'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS_PCNT, ref_reads_stats['paired_pcnt'])
            report.add_field(reporting.Fields.REF_SINGLETONS, ref_reads_stats['singletons'])
            report.add_field(reporting.Fields.REF_SINGLETONS_PCNT, ref_reads_stats['singletons_pcnt'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS, ref_reads_stats['misjoint'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS_PCNT, ref_reads_stats['misjoint_pcnt'])
            report.add_field(reporting.Fields.REF_DEPTH, ref_reads_stats['depth'])
            # only record coverage thresholds when the parsed list matches the configured one
            if ref_reads_stats['coverage_thresholds'] and len(ref_reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
                report.add_field(reporting.Fields.REF_COVERAGE__FOR_THRESHOLDS,
                                 [ref_reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
                report.add_field(reporting.Fields.REF_COVERAGE_1X_THRESHOLD, ref_reads_stats['coverage_thresholds'][0])
        if not isfile(stats_fpath):
            continue
        reads_stats = parse_reads_stats(stats_fpath)
        report.add_field(reporting.Fields.TOTAL_READS, reads_stats['total'])
        report.add_field(reporting.Fields.LEFT_READS, reads_stats['left'])
        report.add_field(reporting.Fields.RIGHT_READS, reads_stats['right'])
        report.add_field(reporting.Fields.MAPPED_READS, reads_stats['mapped'])
        report.add_field(reporting.Fields.MAPPED_READS_PCNT, reads_stats['mapped_pcnt'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS, reads_stats['paired'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS_PCNT, reads_stats['paired_pcnt'])
        if int(reads_stats['mapped']) == 0:
            logger.info('  ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        report.add_field(reporting.Fields.SINGLETONS, reads_stats['singletons'])
        report.add_field(reporting.Fields.SINGLETONS_PCNT, reads_stats['singletons_pcnt'])
        report.add_field(reporting.Fields.MISJOINT_READS, reads_stats['misjoint'])
        report.add_field(reporting.Fields.MISJOINT_READS_PCNT, reads_stats['misjoint_pcnt'])
        report.add_field(reporting.Fields.DEPTH, reads_stats['depth'])
        if reads_stats['coverage_thresholds'] and len(reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
            report.add_field(reporting.Fields.COVERAGE__FOR_THRESHOLDS,
                             [reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
            report.add_field(reporting.Fields.COVERAGE_1X_THRESHOLD, reads_stats['coverage_thresholds'][0])
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    """Partition one assembly's contigs by the reference they align to.

    Uses the alignment file produced earlier (alignments_fpath_template %
    corrected label) to write one FASTA per reference and one FASTA of
    contigs aligned nowhere. Returns (assemblies_by_ref, not_aligned_asm).
    """
    assembly_label = qutils.label_from_fpath(asm.fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(asm.fpath)
    logger.info('  ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = corr_assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    alignments_fpath = alignments_fpath_template % corr_assembly_label
    if os.path.exists(alignments_fpath):
        with open(alignments_fpath) as f:
            # each line: chromosome name followed by names of contigs aligned to it
            for line in f:
                values = line.split()
                if values[0] in contigs_analyzer.ref_labels_by_chromosomes.keys():
                    ref_name = contigs_analyzer.ref_labels_by_chromosomes[values[0]]
                    ref_contigs_names = values[1:]
                    ref_contigs_fpath = os.path.join(
                        corrected_dirpath, corr_assembly_label + '_to_' + ref_name + '.fasta')
                    if ref_name not in aligned_contigs_for_each_ref:
                        aligned_contigs_for_each_ref[ref_name] = []
                    for (cont_name, seq) in contigs_seq:
                        if not cont_name in contigs:
                            contigs[cont_name] = seq
                        if cont_name in ref_contigs_names and cont_name not in aligned_contigs_for_each_ref[ref_name]:
                            # Collecting all aligned contigs names in order to further extract not aligned
                            aligned_contig_names.add(cont_name)
                            aligned_contigs_for_each_ref[ref_name].append(cont_name)
                            # append mode: per-reference FASTA accumulates across lines
                            fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')
                    ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                    if ref_asm.name not in added_ref_asm:
                        if ref_name in assemblies_by_ref:
                            assemblies_by_ref[ref_name].append(ref_asm)
                            added_ref_asm.append(ref_asm.name)
        if qconfig.space_efficient:
            os.remove(alignments_fpath)
    # Extraction not aligned contigs
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath, [(name, contigs[name])
                                                for name in not_aligned_contigs_names])
    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Run GlimmerHMM on all assemblies and add gene-prediction fields
    to each assembly's report.

    Returns a dict mapping assembly label -> predicted genes, or None if the
    tool could not be compiled.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')
    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return
    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
            index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
                   for index, contigs_fpath in enumerate(contigs_fpaths)]
    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = ['%s + %s part' % (full_cnt, partial_cnt)
                     for full_cnt, partial_cnt in zip(full_genes, partial_genes)]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            # fix: '%' binds tighter than '+', so in the original the label was
            # applied to the parenthesized suffix and the '%s' placeholder in
            # the main message was never filled
            logger.error('Glimmer failed running Glimmer for %s. ' % label +
                         ('Run with the --debug option to see the command line.'
                          if not qconfig.debug else ''))
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.main_info('Done.')
    return genes_by_labels
def correct_assemblies(contigs_fpaths, output_dirpath, labels):
    """Correct the input contig files (without min-contig filtering) and wrap
    the originals as Assembly objects.

    Returns (assemblies, corrected_labels).
    """
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
    # correction is needed, but the min-contig filtration is not:
    # temporarily zero the threshold and restore it afterwards
    saved_min_contig = qconfig.min_contig
    qconfig.min_contig = 0
    _, old_contigs_fpaths = qutils.correct_contigs(
        contigs_fpaths, corrected_dirpath, labels, reporting=None)
    qconfig.min_contig = saved_min_contig
    assemblies = []
    for fpath in old_contigs_fpaths:
        assemblies.append(Assembly(fpath, qutils.label_from_fpath(fpath)))
    corrected_labels = [asm.label for asm in assemblies]
    if qconfig.draw_plots or qconfig.html_report:
        # register colors/line styles for plotting under the corrected labels
        plotter_data.save_colors_and_ls([asm.fpath for asm in assemblies],
                                        labels=[asm.label for asm in assemblies])
    return assemblies, corrected_labels
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Persist the whole-run report table (plus run metadata) for the HTML report.

    Returns whatever save() returns for the written total-report file.
    """
    from quast_libs import reporting
    asm_names = []
    for fpath in reporting.assembly_fpaths:
        asm_names.append(qutils.label_from_fpath(fpath))
    report = reporting.table(reporting.Fields.grouped_order)
    now = datetime.datetime.now()
    payload = {
        'date': now.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': asm_names,
        'referenceName': qutils.name_from_fpath(ref_fpath) if ref_fpath else '',
        'order': list(range(len(asm_names))),
        'report': report,
        'minContig': min_contig,
        # assemblies that look like scaffolds (contain N runs) get a warning in the report
        'assembliesWithNs': qconfig.potential_scaffolds_assemblies if qconfig.potential_scaffolds_assemblies else None,
    }
    return save(join(output_dirpath, total_report_fname), payload)
def save_GC_info(output_dirpath, contigs_fpaths, list_of_GC_distributions,
                 list_of_GC_contigs_distributions, reference_index):
    """Persist GC-distribution data for the HTML report.

    reference_index marks which entry of list_of_GC_distributions belongs to
    the reference (or None when there is no reference).
    """
    asm_labels = [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths]
    payload = {
        'filenames': asm_labels,
        'reference_index': reference_index,
        'list_of_GC_distributions': list_of_GC_distributions,
        'list_of_GC_contigs_distributions': list_of_GC_contigs_distributions,
        'lists_of_gc_info': None,
    }
    return save(join(output_dirpath, gc_fn), payload)
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Run GlimmerHMM gene prediction (via run_parallel) and fill the reports.

    Parameters:
        contigs_fpaths: assembly FASTA paths.
        gene_lengths: length thresholds for the per-threshold gene counts.
        out_dirpath: output directory (a 'tmp' subdir is created inside it).

    Returns a dict mapping assembly label -> predicted genes, or None if the
    Glimmer binary could not be compiled.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')
    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return
    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    parallel_args = [(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tool_exec_fpath, tmp_dirpath)
                     for index, contigs_fpath in enumerate(contigs_fpaths)]
    # run_parallel is unpacked here into four parallel lists, one entry per
    # assembly: genes, unique counts, full counts, partial counts — TODO confirm its contract
    genes_list, unique, full_genes, partial_genes = run_parallel(predict_genes, parallel_args, n_jobs)
    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label] = genes_list[i]
        if unique[i] is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique[i])
        if full_genes[i] is not None:
            # one "<full> + <partial> part" entry per length threshold
            genes = ['%s + %s part' % (full_cnt, partial_cnt)
                     for full_cnt, partial_cnt in zip(full_genes[i], partial_genes[i])]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique[i] is None and full_genes[i] is None:
            logger.error('Failed running Glimmer for %s. ' % label + (
                'Run with the --debug option'
                ' to see the command line.' if not qconfig.debug else ''))
    if not qconfig.debug:
        # tmp dir is only kept for inspection in debug runs
        shutil.rmtree(tmp_dirpath)
    logger.main_info('Done.')
    return genes_by_labels
def save_colors(results_dirpath, contigs_fpaths, dict_colors, meta=False):  # coordinates for Nx, NAx, NGx, NGAX
    """Record per-assembly plot colors (and broken-scaffold labels) for the HTML report.

    In meta mode, the HTML template placeholders are patched in place instead.
    """
    if meta:
        html_fpath = os.path.join(results_dirpath, report_fname)
        with open(html_fpath) as f_html:
            page = f_html.read()
        page = re.sub('{{ ' + 'colors' + ' }}', 'standard_colors', page)
        page = re.sub('{{ ' + 'broken_scaffolds' + ' }}', '[]', page)
        with open(html_fpath, 'w') as f_html:
            f_html.write(page)
        return
    contig_labels = [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths]
    color_ls_pairs = [dict_colors[lbl] for lbl in contig_labels]
    # map each matplotlib color to its HTML counterpart by position
    colors_for_html = [html_colors[plotter_data.colors.index(pair[0])] for pair in color_ls_pairs]
    save_record(results_dirpath, 'colors', colors_for_html)
    # assemblies drawn with the secondary line style are broken scaffolds
    broken_contig_names = [lbl for idx, lbl in enumerate(contig_labels)
                           if color_ls_pairs[idx][1] == secondary_line_style]
    save_record(results_dirpath, 'broken_scaffolds', broken_contig_names)
def cumulative_plot(reference, contigs_fpaths, lists_of_lengths, plot_fpath, title):
    """Draw cumulative-length curves (longest contig first) for each assembly,
    plus a reference curve extended to the right edge of the plot."""
    if not can_draw_plots:
        return
    logger.info(' Drawing cumulative plot...')
    plots = []
    max_x = 0
    for contigs_fpath, lengths in zip(contigs_fpaths, lists_of_lengths):
        cumulative = [0]
        for contig_len in sorted(lengths, reverse=True):
            cumulative.append(cumulative[-1] + contig_len)
        xs = list(range(len(cumulative)))
        if xs:
            max_x = max(xs[-1], max_x)
        color, ls = get_color_and_ls(contigs_fpath)
        plots.append(Plot(xs, cumulative, color, ls))
    if reference:
        chr_lengths = fastaparser.get_chr_lengths_from_fastafile(reference).values()
        cumulative = [0]
        for chr_len in sorted(chr_lengths, reverse=True):
            cumulative.append(cumulative[-1] + chr_len)
        xs = list(range(len(cumulative)))
        # extend reference curve to the max X-axis point
        reference_length = cumulative[-1]
        max_x = max(max_x, xs[-1])
        cumulative.append(reference_length)
        xs.append(max_x)
        plots.append(Plot(xs, cumulative, reference_color, reference_ls))
    legend_list = [label_from_fpath(fpath) for fpath in contigs_fpaths]
    if reference:
        legend_list += ['Reference']
    create_plot(plot_fpath, title, plots, legend_list,
                x_label='Contig index', y_label='Cumulative length', x_limit=[0, max_x])
def histogram(contigs_fpaths, values, plot_fpath, title='', yaxis_title='',
              bottom_value=None, top_value=None):
    """Draw a one-bar-per-assembly histogram of `values`.

    Parameters:
        contigs_fpaths: assembly paths (used for colors and legend labels).
        values: one numeric value per assembly.
        plot_fpath: output path for the plot.
        bottom_value/top_value: explicit y-axis limits; derived from the data
            when falsy.

    Skipped when plotting is disabled or with fewer than two assemblies
    (a single-column histogram makes no sense).
    """
    if not can_draw_plots:
        return
    if len(contigs_fpaths) < 2:  # less than 2 columns histogram makes no sense
        return
    logger.info(' Drawing ' + title + ' histogram...')
    plots = []
    min_value = min(values)
    max_value = max(values)
    # power-of-ten step used to round the auto-computed y-axis limits
    if max_value == min_value:
        if max_value > 0:
            exponent = math.pow(10, math.floor(math.log(max_value, 10)))
        else:
            exponent = 1
    else:
        exponent = math.pow(10, math.floor(math.log(max_value - min_value, 10)))
    if not bottom_value:
        bottom_value = (math.floor(min_value / exponent) - 5) * exponent
    if not top_value:
        top_value = (math.ceil(max_value / exponent) + 1) * exponent
    #bars' params
    width = 0.3
    # BUG FIX: floor division ('//') on these floats always yielded 0.0
    # (0.3 // 3 == 0.0), collapsing the gap between bars; true division is intended.
    interval = width / 3
    start_pos = interval / 2
    for i, (contigs_fpath, val) in enumerate(zip(contigs_fpaths, values)):
        color, ls = get_color_and_ls(contigs_fpath)
        # secondary line style marks broken-scaffold assemblies -> hatched bar
        hatch = '' if ls == primary_line_style else 'x'
        plots.append(Bar(start_pos + (width + interval) * i, val, color, width=width, hatch=hatch))
    legend_list = [label_from_fpath(fpath) for fpath in contigs_fpaths]
    create_plot(plot_fpath, title, plots, legend_list, x_label='', y_label=yaxis_title,
                is_histogram=True,
                x_limit=[0, start_pos + width * len(contigs_fpaths) + interval * (len(contigs_fpaths) - 1)],
                y_limit=[max(bottom_value, 0), top_value])
def calculate_ave_read_support(combined_output_dirpath, assemblies):
    """Compute the length-weighted average read support per (assembly, reference)
    pair from the unique-contigs coverage files and store it in the reports.

    Each input file line is expected to be tab-separated:
    <ref_name> <contig_len> <contig_cov>.
    """
    unique_contigs_fpath = os.path.join(combined_output_dirpath, 'contigs_reports',
                                        qconfig.unique_contigs_fname_pattern)
    for assembly in assemblies:
        aligned_contigs_by_ref = dict()
        assembly_label = qutils.label_from_fpath(assembly.fpath)
        corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
        with open(unique_contigs_fpath % corr_assembly_label) as in_f:
            for line in in_f:
                ref_name, contig_len, contig_cov = line.strip().split('\t')
                aligned_contigs_by_ref.setdefault(ref_name, []).append(
                    (float(contig_len), float(contig_cov)))
        for ref_name, contigs in aligned_contigs_by_ref.items():
            # length-weighted mean coverage over all contigs aligned to this reference
            ref_cov = sum(contig_cov * aligned_len for (aligned_len, contig_cov) in contigs)
            ref_cov /= sum(aligned_len for (aligned_len, contig_cov) in contigs)
            # FIX: corr_assembly_label was redundantly recomputed here on every
            # iteration; it is invariant per assembly (computed above).
            ref_contigs_fpath = os.path.join(
                os.path.dirname(assembly.fpath), corr_assembly_label + '_to_' + ref_name + '.fasta')
            qconfig.assembly_labels_by_fpath[ref_contigs_fpath] = assembly_label
            report = reporting.get(ref_contigs_fpath, ref_name=ref_name)
            report.add_field(reporting.Fields.AVE_READ_SUPPORT, '%.2f' % ref_cov)
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath):
    """Run GlimmerHMM on one assembly and log the outcome.

    Returns (genes, unique_count, counts_per_length_threshold).
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)
    base_fpath = os.path.join(out_dirpath, corr_assembly_label + '_glimmer')
    err_fpath = base_fpath + '.stderr'
    out_gff_path, genes, unique, total, cnt = glimmerHMM(
        tool_dirpath, contigs_fpath, base_fpath, gene_lengths, err_fpath, tmp_dirpath, index)
    if out_gff_path:
        logger.info(' ' + qutils.index_to_str(index) + ' Genes = ' + str(unique) +
                    ' unique, ' + str(total) + ' total')
        logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + out_gff_path)
    return genes, unique, cnt
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Predict genes with GlimmerHMM for every assembly and record the results
    in the per-assembly reports.

    Returns a dict mapping assembly label -> predicted genes, or None if the
    Glimmer binary could not be compiled.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')
    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return
    for dirpath in (out_dirpath, tmp_dirpath):
        if not os.path.isdir(dirpath):
            os.makedirs(dirpath)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    parallel_args = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        parallel_args.append((index, contigs_fpath, gene_lengths, out_dirpath,
                              tool_dirpath, tool_exec_fpath, tmp_dirpath))
    genes_list, unique, full_genes, partial_genes = run_parallel(predict_genes, parallel_args, n_jobs)
    genes_by_labels = dict()
    # store per-assembly outcomes in the reports
    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label] = genes_list[idx]
        uniq_cnt = unique[idx]
        full_cnts = full_genes[idx]
        if uniq_cnt is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, uniq_cnt)
        if full_cnts is not None:
            per_threshold = []
            for full_cnt, partial_cnt in zip(full_cnts, partial_genes[idx]):
                per_threshold.append('%s + %s part' % (full_cnt, partial_cnt))
            report.add_field(reporting.Fields.PREDICTED_GENES, per_threshold)
        if uniq_cnt is None and full_cnts is None:
            logger.error('Failed running Glimmer for %s. ' % label +
                         ('Run with the --debug option'
                          ' to see the command line.' if not qconfig.debug else ''))
    if not qconfig.debug:
        # tmp dir is only kept for inspection in debug runs
        shutil.rmtree(tmp_dirpath)
    logger.main_info('Done.')
    return genes_by_labels
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Persist the combined report table plus per-reference subreports
    (the latter only in combined-reference mode)."""
    from quast_libs import reporting
    asm_names = [qutils.label_from_fpath(fpath) for fpath in reporting.assembly_fpaths]
    report = reporting.table(reporting.Fields.grouped_order)
    subreports = []
    ref_names = []
    if qconfig.is_combined_ref and ref_labels_by_chromosomes:
        # one subreport per distinct sub-reference, in sorted order
        ref_names = sorted(set(ref_labels_by_chromosomes.values()))
        for ref_name in ref_names:
            subreports.append(reporting.table(reporting.Fields.grouped_order, ref_name=ref_name))
    now = datetime.datetime.now()
    return save(join(output_dirpath, total_report_fname), {
        'date': now.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': asm_names,
        'referenceName': qutils.name_from_fpath(ref_fpath) if ref_fpath else '',
        'order': list(range(len(asm_names))),
        'report': report,
        'subreferences': ref_names,
        'subreports': subreports,
        'minContig': min_contig
    })
def save_colors(results_dirpath, contigs_fpaths, dict_colors, meta=False):  # coordinates for Nx, NAx, NGx, NGAX
    """Store per-assembly plot colors and broken-scaffold labels via json_saver;
    in meta mode patch the HTML template placeholders instead."""
    from quast_libs import plotter
    if meta:
        html_fpath = os.path.join(results_dirpath, report_fname)
        with open(html_fpath) as f_html:
            page = f_html.read()
        page = re.sub("{{ " + "colors" + " }}", "standard_colors", page)
        page = re.sub("{{ " + "broken_scaffolds" + " }}", "[]", page)
        with open(html_fpath, "w") as f_html:
            f_html.write(page)
        return
    contig_labels = [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths]
    color_ls_pairs = [dict_colors[lbl] for lbl in contig_labels]
    # map each matplotlib color to its HTML counterpart by position
    colors_for_html = [html_colors[plotter.colors.index(pair[0])] for pair in color_ls_pairs]
    json_fpath = json_saver.save_colors(results_dirpath, colors_for_html)
    append(results_dirpath, json_fpath, "colors")
    # assemblies drawn with the secondary line style are broken scaffolds
    broken_contig_names = [lbl for idx, lbl in enumerate(contig_labels)
                           if color_ls_pairs[idx][1] == secondary_line_style]
    json_fpath = json_saver.save_broken_scaffolds(results_dirpath, broken_contig_names)
    append(results_dirpath, json_fpath, "broken_scaffolds")
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE evaluation shell script on one assembly.

    stdout/stderr are captured into per-assembly log files in
    gage_results_dirpath. Returns the subprocess return code.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(i) + assembly_label + '...')
    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stderr')
    logger.info(' ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')
    # FIX: use context managers so the log files are closed even if the
    # subprocess call raises (previously they leaked on exception).
    with open(log_out_fpath, 'w') as log_out_f, open(log_err_fpath, 'w') as log_err_f:
        return_code = qutils.call_subprocess(
            ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath),
             reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)],
            stdout=log_out_f, stderr=log_err_f,
            indent=' ' + qutils.index_to_str(i), only_if_debug=False)
    if return_code != 0:
        logger.info(' ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info(' ' + qutils.index_to_str(i) + 'Done.')
    return return_code
def save_colors(results_dirpath, contigs_fpaths, dict_colors, meta=False):  # coordinates for Nx, NAx, NGx, NGAX
    """Store per-assembly plot colors via json_saver; in meta mode patch the
    HTML template's colors placeholder instead."""
    from quast_libs import plotter
    if meta:
        html_fpath = os.path.join(results_dirpath, report_fname)
        with open(html_fpath) as f_html:
            page = f_html.read()
        page = re.sub('{{ ' + 'colors' + ' }}', 'standard_colors', page)
        with open(html_fpath, 'w') as f_html:
            f_html.write(page)
        return
    color_ls_pairs = []
    for contigs_fpath in contigs_fpaths:
        color_ls_pairs.append(dict_colors[qutils.label_from_fpath(contigs_fpath)])
    # map each matplotlib color to its HTML counterpart by position
    colors_for_html = []
    for pair in color_ls_pairs:
        colors_for_html.append(html_colors[plotter.colors.index(pair[0])])
    json_fpath = json_saver.save_colors(results_dirpath, colors_for_html)
    append(results_dirpath, json_fpath, 'colors')
def save_result(result, report, fname, ref_fpath):
    """Copy contig-analysis results into the report (and per-reference subreports).

    Parameters:
        result: dict produced by the contigs analyzer (misassembly counts,
            unaligned stats, SNP/indel stats, ...).
        report: the reporting object for this assembly.
        fname: assembly file path (used as report key and broken-scaffold lookup).
        ref_fpath: reference path (used only in the non-combined-ref branch).

    Returns the (mutated) report.
    """
    region_misassemblies = result['region_misassemblies']
    misassemblies_by_ref = result['misassemblies_by_ref']
    # NOTE(review): region_struct_variations is extracted but never used below
    region_struct_variations = result['region_struct_variations']
    misassemblies_matched_sv = result['misassemblies_matched_sv']
    misassembled_contigs = result['misassembled_contigs']
    misassembled_bases = result['misassembled_bases']
    misassembly_internal_overlap = result['misassembly_internal_overlap']
    unaligned = result['unaligned']
    partially_unaligned = result['partially_unaligned']
    partially_unaligned_bases = result['partially_unaligned_bases']
    fully_unaligned_bases = result['fully_unaligned_bases']
    ambiguous_contigs = result['ambiguous_contigs']
    ambiguous_contigs_extra_bases = result['ambiguous_contigs_extra_bases']
    SNPs = result['SNPs']
    indels_list = result['indels_list']
    total_aligned_bases = result['total_aligned_bases']
    half_unaligned_with_misassembly = result['half_unaligned_with_misassembly']
    report.add_field(reporting.Fields.MISLOCAL, region_misassemblies.count(Misassembly.LOCAL))
    # extensive misassemblies = relocations + inversions + (interspecies) translocations
    report.add_field(reporting.Fields.MISASSEMBL,
                     region_misassemblies.count(Misassembly.RELOCATION) +
                     region_misassemblies.count(Misassembly.INVERSION) +
                     region_misassemblies.count(Misassembly.TRANSLOCATION) +
                     region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MISCONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MISCONTIGSBASES, misassembled_bases)
    report.add_field(reporting.Fields.MISINTERNALOVERLAP, misassembly_internal_overlap)
    if qconfig.bed:
        report.add_field(reporting.Fields.STRUCT_VARIATIONS, misassemblies_matched_sv)
    report.add_field(reporting.Fields.UNALIGNED, '%d + %d part' % (unaligned, partially_unaligned))
    report.add_field(reporting.Fields.UNALIGNEDBASES, (fully_unaligned_bases + partially_unaligned_bases))
    report.add_field(reporting.Fields.AMBIGUOUS, ambiguous_contigs)
    report.add_field(reporting.Fields.AMBIGUOUSEXTRABASES, ambiguous_contigs_extra_bases)
    report.add_field(reporting.Fields.MISMATCHES, SNPs)
    # different types of indels:
    if indels_list is not None:
        report.add_field(reporting.Fields.INDELS, len(indels_list))
        report.add_field(reporting.Fields.INDELSBASES, sum(indels_list))
        report.add_field(reporting.Fields.MIS_SHORT_INDELS,
                         len([i for i in indels_list if i <= qconfig.SHORT_INDEL_THRESHOLD]))
        report.add_field(reporting.Fields.MIS_LONG_INDELS,
                         len([i for i in indels_list if i > qconfig.SHORT_INDEL_THRESHOLD]))
    if total_aligned_bases:
        # substitution / indel error rates per 100 kbp of aligned sequence
        report.add_field(reporting.Fields.SUBSERROR,
                         "%.2f" % (float(SNPs) * 100000.0 / float(total_aligned_bases)))
        report.add_field(reporting.Fields.INDELSERROR,
                         "%.2f" % (float(report.get_field(reporting.Fields.INDELS)) * 100000.0 / float(total_aligned_bases)))
    # for misassemblies report:
    report.add_field(reporting.Fields.MIS_ALL_EXTENSIVE,
                     region_misassemblies.count(Misassembly.RELOCATION) +
                     region_misassemblies.count(Misassembly.INVERSION) +
                     region_misassemblies.count(Misassembly.TRANSLOCATION) +
                     region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MIS_RELOCATION, region_misassemblies.count(Misassembly.RELOCATION))
    report.add_field(reporting.Fields.MIS_TRANSLOCATION, region_misassemblies.count(Misassembly.TRANSLOCATION))
    report.add_field(reporting.Fields.MIS_INVERTION, region_misassemblies.count(Misassembly.INVERSION))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_CONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_BASES, misassembled_bases)
    report.add_field(reporting.Fields.MIS_LOCAL, region_misassemblies.count(Misassembly.LOCAL))
    if qconfig.is_combined_ref:
        # combined-reference run: also fill one subreport per sub-reference
        report.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS,
                         region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS,
                         region_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
        report.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES,
                         region_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        all_references = sorted(list(set([ref for ref in ref_labels_by_chromosomes.values()])))
        for ref_name in all_references:
            subreport = reporting.get(fname, ref_name=ref_name)
            ref_misassemblies = misassemblies_by_ref[ref_name]
            subreport.add_field(reporting.Fields.MIS_ALL_EXTENSIVE,
                                ref_misassemblies.count(Misassembly.RELOCATION) +
                                ref_misassemblies.count(Misassembly.INVERSION) +
                                ref_misassemblies.count(Misassembly.TRANSLOCATION) +
                                ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_RELOCATION, ref_misassemblies.count(Misassembly.RELOCATION))
            subreport.add_field(reporting.Fields.MIS_TRANSLOCATION, ref_misassemblies.count(Misassembly.TRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_INVERTION, ref_misassemblies.count(Misassembly.INVERSION))
            subreport.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS,
                                ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_LOCAL, ref_misassemblies.count(Misassembly.LOCAL))
            subreport.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES,
                                ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
            subreport.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS,
                                ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
            if fname not in qconfig.dict_of_broken_scaffolds:
                # scaffold-gap misassemblies only make sense for non-broken inputs
                subreport.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP,
                                    ref_misassemblies.count(Misassembly.SCAFFOLD_GAP))
            if qconfig.check_for_fragmented_ref:
                subreport.add_field(reporting.Fields.MIS_FRAGMENTED,
                                    ref_misassemblies.count(Misassembly.FRAGMENTED))
    elif intergenomic_misassemblies_by_asm:
        # single-reference run inside a meta workflow: take the counts stored
        # for this (assembly label, reference name) pair
        label = qutils.label_from_fpath(fname)
        ref_name = qutils.name_from_fpath(ref_fpath)
        ref_misassemblies = intergenomic_misassemblies_by_asm[label][ref_name]
        report.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS,
                         ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES,
                         ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        report.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS,
                         ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
    if fname not in qconfig.dict_of_broken_scaffolds:
        report.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP,
                         region_misassemblies.count(Misassembly.SCAFFOLD_GAP))
    if qconfig.check_for_fragmented_ref:
        report.add_field(reporting.Fields.MIS_FRAGMENTED,
                         region_misassemblies.count(Misassembly.FRAGMENTED))
    # for unaligned report:
    report.add_field(reporting.Fields.UNALIGNED_FULL_CNTGS, unaligned)
    report.add_field(reporting.Fields.UNALIGNED_FULL_LENGTH, fully_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_PART_CNTGS, partially_unaligned)
    report.add_field(reporting.Fields.UNALIGNED_PART_LENGTH, partially_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_MISASSEMBLED_CTGS, half_unaligned_with_misassembly)
    return report
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger):
    """Summarize interspecies translocations per assembly and per reference pair.

    Writes one 'interspecies_translocations_by_refs_<asm>.info' file per
    assembly, fills the module-level intergenomic_misassemblies_by_asm cache,
    and (optionally) draws the intergenomic-misassemblies summary plot.

    Parameters:
        results: per-assembly contigs-analyzer result dicts (entries may be None).
        contigs_fpaths: assembly paths, aligned index-wise with `results`.
        ref_labels_by_chromosomes: chromosome name -> reference label mapping.
    """
    istranslocations_by_asm = [result['istranslocations_by_refs'] if result else None for result in results]
    misassemblies_by_asm = [result['misassemblies_by_ref'] if result else None for result in results]
    # collect reference labels preserving first-seen order, then sort unless
    # the user asked to keep the input order
    all_refs = []
    for ref in ref_labels_by_chromosomes.values():
        if ref not in all_refs:
            all_refs.append(ref)
    if not qconfig.use_input_ref_order:
        all_refs.sort()
    misassemblies_by_refs_rows = []
    row = {'metricName': 'References', 'values': all_refs}
    misassemblies_by_refs_rows.append(row)
    if not istranslocations_by_asm:
        return
    for i, fpath in enumerate(contigs_fpaths):
        label = qutils.label_from_fpath(fpath)
        row = {'metricName': label, 'values': []}
        misassemblies_by_refs_rows.append(row)
        istranslocations_by_ref = istranslocations_by_asm[i]
        intergenomic_misassemblies_by_asm[label] = defaultdict(list)
        for ref in all_refs:
            intergenomic_misassemblies_by_asm[label][ref] = \
                misassemblies_by_asm[i][ref] if misassemblies_by_asm[i] else []
        if istranslocations_by_ref:
            assembly_name = qutils.name_from_fpath(fpath)
            # build a ref x ref matrix of translocation counts (None on the
            # diagonal and for missing pairs)
            all_rows = []
            row = {'metricName': 'References', 'values': [ref_num + 1 for ref_num in range(len(all_refs))]}
            all_rows.append(row)
            for ref in all_refs:
                row = {'metricName': ref, 'values': []}
                for second_ref in all_refs:
                    if ref == second_ref or second_ref not in istranslocations_by_ref:
                        row['values'].append(None)
                    else:
                        row['values'].append(istranslocations_by_ref[ref][second_ref])
                possible_misassemblies = 0
                misassemblies_by_ref = misassemblies_by_asm[i]
                if misassemblies_by_ref:
                    possible_misassemblies = misassemblies_by_ref[ref].count(Misassembly.POSSIBLE_MISASSEMBLIES)
                istranslocations = max(0, sum([r for r in row['values'] if r]))
                misassemblies_by_refs_rows[-1]['values'].append(istranslocations + possible_misassemblies)
                all_rows.append(row)
            misassembly_by_ref_fpath = os.path.join(
                output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
            with open(misassembly_by_ref_fpath, 'w') as misassembly_by_ref_file:
                misassembly_by_ref_file.write('Number of interspecies translocations by references: \n')
            # print_file appends the matrix after the header written above
            print_file(all_rows, misassembly_by_ref_fpath, append_to_existing_file=True)
            with open(misassembly_by_ref_fpath, 'a') as misassembly_by_ref_file:
                misassembly_by_ref_file.write('References:\n')
                for ref_num, ref in enumerate(all_refs):
                    misassembly_by_ref_file.write(str(ref_num + 1) + ' - ' + ref + '\n')
            logger.info(' Information about interspecies translocations by references for %s is saved to %s' %
                        (assembly_name, misassembly_by_ref_fpath))
    misassemblies = []
    if qconfig.draw_plots:
        from quast_libs import plotter
        # keep only assemblies that actually have values; note: iterating a
        # slice copy ([1:]) makes the remove() from the original list safe
        aligned_contigs_labels = []
        for row in misassemblies_by_refs_rows[1:]:
            if row['values']:
                aligned_contigs_labels.append(row['metricName'])
            else:
                misassemblies_by_refs_rows.remove(row)
        # transpose: one list of per-assembly values per reference
        for i in range(len(all_refs)):
            cur_results = []
            for row in misassemblies_by_refs_rows[1:]:
                if row['values']:
                    cur_results.append(row['values'][i])
            misassemblies.append(cur_results)
        is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies')
        plotter.draw_meta_summary_plot('', output_dir, aligned_contigs_labels, all_refs,
                                       misassemblies, is_translocations_plot_fpath,
                                       title='Intergenomic misassemblies (found and supposed)',
                                       reverse=False, yaxis_title=None, print_all_refs=True, logger=logger)
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic assembly statistics (N50/L50, GC, lengths, N's) and draw plots.

    Parameters:
        ref_fpath: reference FASTA path (may be falsy).
        contigs_fpaths: assembly FASTA paths.
        output_dirpath: directory for plot files.
        results_dir: directory for the HTML-report data files.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")
    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)
    reference_length = None
    reference_lengths = []
    reference_fragments = None
    if ref_fpath:
        reference_lengths = sorted(
            fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath)
        logger.info(' Reference genome:')
        # NOTE(review): the trailing ternary spans the whole concatenation, so
        # the entire line becomes 'undefined' when reference_GC is None
        logger.info(' ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) + ', GC % = ' +
                    '%.2f' % reference_GC if reference_GC is not None else 'undefined')
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                           ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        # no reference file, but the user supplied an estimated genome size
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info(' Estimated reference length = ' + str(reference_length))
    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    # contig names like '..._cov_123.4' carry SPAdes-style coverage info
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for id, contigs_fpath in enumerate(contigs_fpaths):
        coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info(' ' + qutils.index_to_str(id) + assembly_label)
        # lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            # flag assemblies that look like scaffolds (long N runs) once per assembly
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            if cov_pattern.findall(name):
                # histogram of total bases per integer coverage value
                cov = int(float(cov_pattern.findall(name)[0]))
                if len(coverage_dict[contigs_fpath]) <= cov:
                    coverage_dict[contigs_fpath] += [0] * (cov - len(coverage_dict[contigs_fpath]) + 1)
                coverage_dict[contigs_fpath][cov] += len(seq)
        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)
    lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]
    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])
    # downsample length lists for plotting when there are too many contigs:
    # each plotted point aggregates `multiplicator` consecutive contigs
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        import math
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [[sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)])
                                  for i in range(1, max_points)
                                  if (i * multiplicator) < len(list_of_length)]
                                 for list_of_length in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
                                 if (i * multiplicator) < len(reference_lengths)
                                 else sum(reference_lengths[((i - 1) * multiplicator):])
                                 for i in range(1, max_points)] + \
                                [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        # append the tail (contigs beyond the last full bucket) to each list
        for num_list in range(len(corr_lists_of_lengths)):
            last_index = len(corr_lists_of_lengths[num_list])
            corr_lists_of_lengths[num_list].append(sum(lists_of_lengths[num_list][last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]
    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)
    ########################################################################
    logger.info(' Calculating N50 and L50...')
    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)
        # NOTE(review): the final ternary spans the whole concatenation, so the
        # line collapses to 'undefined' when total_length == 0
        logger.info(' ' + qutils.index_to_str(id) + qutils.label_from_fpath(contigs_fpath) + \
                    ', N50 = ' + str(n50) + \
                    ', L50 = ' + str(l50) + \
                    ', Total length = ' + str(total_length) + \
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') + \
                    ', # N\'s per 100 kbp = ' + ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) if total_length != 0 else 'undefined')
        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            # guarded by lengths_list so total_length > 0 (safe division below)
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT,
                             ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC,
                                 ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))
    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        # the reference distribution is appended last; remember its position
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)
    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions, reference_index)
    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))])
    if qconfig.draw_plots:
        ########################################################################import plotter
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
        if not qconfig.no_gc:
            ########################################################################
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(contigs_fpath, GC_distribution,
                                                join(output_dirpath,
                                                     qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))
    if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
        draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)
    logger.main_info('Done.')
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    # Analyze one assembly against the reference using Nucmer coords:
    # computes per-chromosome aligned lengths, genome gaps, and how many
    # genes/operons are found fully or partially in the assembly.
    #
    # Parameters:
    #   contigs_fpath         -- path to the assembly FASTA file
    #   index                 -- ordinal of this assembly (used only for log prefixes)
    #   nucmer_path_dirpath   -- directory containing '<label>.coords[.filtered]' files
    #   genome_stats_dirpath  -- output directory for '_gaps.txt' / '_genes.txt' / '_operons.txt'
    #   reference_chromosomes -- dict: chromosome name -> chromosome length
    #   genes_container, operons_container -- feature containers with .region_list
    #
    # Returns:
    #   (ref_lengths, (results, genes_in_contigs, operons_in_contigs)) on success,
    #   or None if the coords file is missing or its chromosome names do not
    #   match the reference (callers must check for None).
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = {}  # chromosome name -> number of reference bases covered by alignments
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)
    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, corr_assembly_label + '.coords')
    # The '.filtered' variant excludes ambiguous alignments; used unless the
    # user explicitly asked for all alignments.
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')
        return None

    coordfile = open(nucmer_fpath, 'r')
    # Skip the header: everything up to (and including) the '=====' separator line.
    for line in coordfile:
        if line.startswith('='):
            break

    # EXAMPLE of the coords format parsed below:
    # [S1] [E1]        | [S2] [E2]   | [LEN 1] [LEN 2] | [% IDY] | [TAGS]
    #=====================================================================================
    # 338980 339138    | 2298 2134   | 159 165         | 79.76   | gi|48994873|gb|U00096.2|  NODE_0_length_6088
    # 374145 374355    | 2306 2097   | 211 210         | 85.45   | gi|48994873|gb|U00096.2|  NODE_0_length_6088

    # genome_mapping[chr][pos] is 1 if reference position pos (1-based) is
    # covered by at least one alignment; index 0 is unused padding.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(genes_container.region_list) or len(operons_container.region_list)
    # Keeping all aligned blocks in memory conflicts with --memory-efficient mode.
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []
    # Parse alignment rows until the first blank line.
    for line in coordfile:
        if line.strip() == '':
            break
        s1 = int(line.split('|')[0].split()[0])  # alignment start in reference
        e1 = int(line.split('|')[0].split()[1])  # alignment end in reference
        s2 = int(line.split('|')[1].split()[0])  # alignment start in contig
        e2 = int(line.split('|')[1].split()[1])  # alignment end in contig
        contig_name = line.split()[12].strip()
        chr_name = line.split()[11].strip()

        if chr_name not in genome_mapping:
            logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                         "differ from the names in the reference. Try to remove the file and restart QUAST.")
            return None
        if gene_searching_enabled:
            aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1))
        if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
            # Mark both the tail of the chromosome (from s1) and its head (up to e1).
            for i in range(s1, len(genome_mapping[chr_name])):
                genome_mapping[chr_name][i] = 1
            for i in range(1, e1 + 1):
                genome_mapping[chr_name][i] = 1
        else: #if s1 <= e1:
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1
    coordfile.close()
    if qconfig.space_efficient and nucmer_fpath.endswith('.filtered'):
        os.remove(nucmer_fpath)

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt') if not qconfig.space_efficient else '/dev/null'
    gaps_file = open(gaps_fpath, 'w')
    for chr_name, chr_len in reference_chromosomes.items():
        gaps_file.write(chr_name + '\n')
        cur_gap_size = 0
        aligned_len = 0
        for i in range(1, chr_len + 1):
            if genome_mapping[chr_name][i] == 1:
                # Covered position: flush any sufficiently long gap that just ended.
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                aligned_len += 1
                covered_bp += 1
                cur_gap_size = 0
            else:
                cur_gap_size += 1
        ref_lengths[chr_name] = aligned_len
        # A gap running to the end of the chromosome is flushed here.
        if cur_gap_size >= qconfig.min_gap_size:
            gaps_count += 1
            gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')
    gaps_file.close()

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
            (genes_container, genes_in_contigs, reporting.Fields.GENES, '_genes.txt'),
            (operons_container, operons_in_contigs, reporting.Fields.OPERONS, '_operons.txt')]:
        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + suffix)
        found_file = open(found_fpath, 'w')
        found_file.write('%s\t\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type'))
        found_file.write('=========================================\n')

        # found_list[i] encodes the status of the i-th region:
        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if container.chr_names_dict[region.seqname] != cur_block.seqname:
                        continue

                    # computing circular genomes: a wrap-around block is split
                    # into its tail and head halves before interval checks.
                    if cur_block.start > cur_block.end:
                        blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                  AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                    else:
                        blocks = [cur_block]

                    for block in blocks:
                        if region.end <= block.start or block.end <= region.start:
                            continue  # no overlap at all
                        elif block.start <= region.start and region.end <= block.end:
                            # Region fully inside the block: count as complete.
                            if found_list[i] == 2:  # already found as partial gene
                                total_partial -= 1
                            found_list[i] = 1
                            total_full += 1
                            region_id = str(region.id)
                            if region_id == 'None':
                                region_id = '# ' + str(region.number + 1)
                            found_file.write('%s\t\t%d\t%d\tcomplete\n' % (region_id, region.start, region.end))
                            feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                            cur_feature_is_found = True
                            break
                        elif found_list[i] == 0 and min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                            # Partial overlap above the threshold: mark once as partial.
                            found_list[i] = 2
                            total_partial += 1
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break
            # adding info about partially found genes/operons
            if found_list[i] == 2:  # partial gene/operon
                region_id = str(region.id)
                if region_id == 'None':
                    region_id = '# ' + str(region.number + 1)
                found_file.write('%s\t\t%d\t%d\tpartial\n' % (region_id, region.start, region.end))

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial
        found_file.close()

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    return ref_lengths, (results, genes_in_contigs, operons_in_contigs)
def process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                        reference_chromosomes, ns_by_chromosomes, containers):
    # Analyze one assembly against the reference using an alignment coords file:
    # computes per-chromosome covered lengths (excluding reference N positions),
    # genome gaps, and full/partial genomic-feature counts per container.
    #
    # Parameters:
    #   contigs_fpath         -- path to the assembly FASTA file
    #   index                 -- ordinal of this assembly (used only for log prefixes)
    #   coords_dirpath        -- directory containing '<label>.coords[.filtered]' files
    #   genome_stats_dirpath  -- output directory for '_gaps.txt' and feature reports
    #   reference_chromosomes -- dict: chromosome name -> chromosome length
    #   ns_by_chromosomes     -- dict: chromosome name -> positions of N bases
    #   containers            -- iterable of feature containers (genes/operons/etc.)
    #
    # Returns:
    #   (ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs,
    #                  unsorted_operons_in_contigs, operons_in_contigs)),
    #   or (None, None) on error (missing coords file or mismatching chromosome
    #   names) -- both error paths now return the same two-element shape.
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = defaultdict(int)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    coords_base_fpath = os.path.join(coords_dirpath, corr_assembly_label + '.coords')
    # The '.filtered' variant excludes ambiguous alignments; used unless the
    # user explicitly asked for all alignments.
    if qconfig.use_all_alignments:
        coords_fpath = coords_base_fpath
    else:
        coords_fpath = coords_base_fpath + '.filtered'

    if not os.path.isfile(coords_fpath):
        logger.error('File with alignment coords (' + coords_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')
        return None, None

    # EXAMPLE of the coords format parsed below:
    # [S1] [E1]        | [S2] [E2]   | [LEN 1] [LEN 2] | [% IDY] | [TAGS]
    #=====================================================================================
    # 338980 339138    | 2298 2134   | 159 165         | 79.76   | gi|48994873|gb|U00096.2|  NODE_0_length_6088
    # 374145 374355    | 2306 2097   | 211 210         | 85.45   | gi|48994873|gb|U00096.2|  NODE_0_length_6088

    # genome_mapping[chr][pos] is 1 if reference position pos (1-based) is
    # covered by at least one alignment; index 0 is unused padding.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    # Sort contigs by length (descending) but remember their original indices
    # so counts can be reported in the input order as well.
    sorted_contig_tuples = sorted(enumerate(contig_tuples), key=lambda x: len(x[1][1]), reverse=True)
    sorted_contigs_names = []
    contigs_order = []
    for idx, (name, _) in sorted_contig_tuples:
        sorted_contigs_names.append(name)
        contigs_order.append(idx)

    features_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(containers)
    # Keeping all aligned blocks in memory conflicts with --memory-efficient mode.
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []
    with open(coords_fpath) as coordfile:
        for line in coordfile:
            s1 = int(line.split('|')[0].split()[0])  # alignment start in reference
            e1 = int(line.split('|')[0].split()[1])  # alignment end in reference
            s2 = int(line.split('|')[1].split()[0])  # alignment start in contig
            e2 = int(line.split('|')[1].split()[1])  # alignment end in contig
            contig_name = line.split()[12].strip()
            chr_name = line.split()[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + coords_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                # BUGFIX: was 'return None' -- inconsistent with the
                # 'return None, None' above; callers unpacking two values
                # would raise TypeError on this path.
                return None, None
            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(
                    AlignedBlock(seqname=chr_name, start=s1, end=e1,
                                 contig=contig_name, start_in_contig=s2, end_in_contig=e2))
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1

    # Reference N positions do not count as genuinely covered.
    for chr_name in genome_mapping.keys():
        for i in ns_by_chromosomes[chr_name]:
            genome_mapping[chr_name][i] = 0
        ref_lengths[chr_name] = sum(genome_mapping[chr_name])

    if qconfig.space_efficient and coords_fpath.endswith('.filtered'):
        os.remove(coords_fpath)

    # counting genome coverage and gaps number
    gaps_count = 0
    if qconfig.analyze_gaps:
        gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt') if not qconfig.space_efficient else '/dev/null'
        with open(gaps_fpath, 'w') as gaps_file:
            for chr_name, chr_len in reference_chromosomes.items():
                gaps_file.write(chr_name + '\n')
                cur_gap_size = 0
                for i in range(1, chr_len + 1):
                    # N positions are not treated as gaps here even though they
                    # were zeroed out of genome_mapping above.
                    if genome_mapping[chr_name][i] == 1 or i in ns_by_chromosomes[chr_name]:
                        if cur_gap_size >= qconfig.min_gap_size:
                            gaps_count += 1
                            gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                        cur_gap_size = 0
                    else:
                        cur_gap_size += 1
                # A gap running to the end of the chromosome is flushed here.
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["gaps_count"] = gaps_count
    results[reporting.Fields.GENES + "_full"] = None
    results[reporting.Fields.GENES + "_partial"] = None
    results[reporting.Fields.OPERONS + "_full"] = None
    results[reporting.Fields.OPERONS + "_partial"] = None

    # finding genes and operons
    for container in containers:
        if not container.region_list:
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label +
                                   '_genomic_features_' + container.kind.lower() + '.txt')
        found_file = open(found_fpath, 'w')
        found_file.write('%s\t\t%s\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type', 'Contig'))
        found_file.write('=' * 50 + '\n')

        # found_list[i] encodes the status of the i-th region:
        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            gene_blocks = []
            if region.id is None:
                region.id = '# ' + str(region.number + 1)
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if cur_block.seqname != region.seqname:
                        continue
                    if region.end <= cur_block.start or cur_block.end <= region.start:
                        continue  # no overlap at all
                    elif cur_block.start <= region.start and region.end <= cur_block.end:
                        # Region fully inside the block: count as complete.
                        if found_list[i] == 2:  # already found as partial gene
                            total_partial -= 1
                        found_list[i] = 1
                        total_full += 1
                        contig_info = cur_block.format_gene_info(region)
                        found_file.write('%s\t\t%d\t%d\tcomplete\t%s\n' % (region.id, region.start, region.end, contig_info))
                        if container.kind == 'operon':
                            operons_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig
                        else:
                            features_in_contigs[contig_id] += 1
                        cur_feature_is_found = True
                        break
                    elif min(region.end, cur_block.end) - max(region.start, cur_block.start) >= qconfig.min_gene_overlap:
                        # Partial overlap above the threshold: mark once as
                        # partial but keep collecting all overlapping blocks.
                        if found_list[i] == 0:
                            found_list[i] = 2
                            total_partial += 1
                        gene_blocks.append(cur_block)
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break
            # adding info about partially found genes/operons
            if found_list[i] == 2:  # partial gene/operon
                contig_info = ','.join([block.format_gene_info(region)
                                        for block in sorted(gene_blocks, key=lambda block: block.start)])
                found_file.write('%s\t\t%d\t%d\tpartial\t%s\n' % (region.id, region.start, region.end, contig_info))

        if container.kind == 'operon':
            results[reporting.Fields.OPERONS + "_full"] = total_full
            results[reporting.Fields.OPERONS + "_partial"] = total_partial
        else:
            # All non-operon containers accumulate into the GENES fields.
            if results[reporting.Fields.GENES + "_full"] is None:
                results[reporting.Fields.GENES + "_full"] = 0
                results[reporting.Fields.GENES + "_partial"] = 0
            results[reporting.Fields.GENES + "_full"] += total_full
            results[reporting.Fields.GENES + "_partial"] += total_partial
        found_file.close()

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')

    # Re-order counts back to the original (input) contig order alongside the
    # sorted-order lists.
    unsorted_features_in_contigs = [features_in_contigs[idx] for idx in contigs_order]
    unsorted_operons_in_contigs = [operons_in_contigs[idx] for idx in contigs_order]
    return ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs,
                         unsorted_operons_in_contigs, operons_in_contigs)
def main(args):
    # QUAST entry point: parses options, corrects reference/contigs, runs all
    # analysis modules (basic stats, contig alignment, genome analysis, gene
    # prediction, BUSCO), then produces reports and large visual summaries.
    #
    # Parameters:
    #   args -- command-line argument list (without the program name)
    # Returns:
    #   4 if no valid contig file was provided; otherwise the value of
    #   logger.finish_up().
    check_dirpath(qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n.' +
                  'Please, put QUAST in a different directory, then try again.\n', exit_code=3)

    if not args:
        qconfig.usage(stream=sys.stderr)
        sys.exit(1)

    # Reload config/utils so repeated in-process invocations start clean
    # (imp.reload on py3, builtin reload on py2).
    try:
        import imp
        imp.reload(qconfig)
        imp.reload(qutils)
    except:
        reload(qconfig)
        reload(qutils)

    try:
        locale.setlocale(locale.LC_ALL, 'en_US.utf8')
    except Exception:
        try:
            locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
        except Exception:
            logger.warning('Python locale settings can\'t be changed')

    quast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, quast_path + args)
    output_dirpath, ref_fpath, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
    logger.main_info()
    logger.print_params()

    ########################################################################
    # Reload 'reporting' but preserve the accumulated reports across reloads.
    from quast_libs import reporting
    reports = reporting.reports
    try:
        import imp
        imp.reload(reporting)
    except:
        reload(reporting)
    reporting.reports = reports
    reporting.assembly_fpaths = []
    from quast_libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    qconfig.set_max_threads(logger)
    check_reads_fpaths(logger)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        original_ref_fpath = ref_fpath
        ref_fpath = qutils.correct_reference(ref_fpath, corrected_dirpath)
        if qconfig.ideal_assembly:
            ideal_assembly_fpath = ideal_assembly.do(ref_fpath, original_ref_fpath,
                                                     os.path.join(output_dirpath, qconfig.ideal_assembly_basename))
            if ideal_assembly_fpath is not None:
                # Prepend the synthetic "ideal" assembly so it is compared
                # alongside the real ones.
                contigs_fpaths.insert(0, ideal_assembly_fpath)
                labels.insert(0, 'IDEAL ASSEMBLY')
                labels = qutils.process_labels(contigs_fpaths, labels)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')

    contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    cov_fpath = qconfig.cov_fpath
    physical_cov_fpath = qconfig.phys_cov_fpath
    # BUGFIX: the condition previously tested 'qconfig.reference_sam' twice;
    # the second occurrence should be 'qconfig.reference_bam' so that a
    # reference BAM alone also triggers reads analysis.
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam or qconfig.sam_fpaths or qconfig.bam_fpaths:
        bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do(
            ref_fpath, contigs_fpaths, os.path.join(output_dirpath, qconfig.reads_stats_dirname),
            external_logger=logger)
        qconfig.bed = bed_fpath

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.",
                     fake_if_nested_run=True)
        return 4

    if qconfig.used_colors and qconfig.used_ls:
        for i, label in enumerate(labels):
            plotter_data.dict_color_and_ls[label] = (qconfig.used_colors[i], qconfig.used_ls[i])

    qconfig.assemblies_fpaths = contigs_fpaths

    # Where all pdfs will be saved
    all_pdf_fpath = None
    if qconfig.draw_plots and plotter.can_draw_plots:
        all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)

    if qconfig.json_output_dirpath:
        from quast_libs.html_saver import json_saver
        if json_saver.simplejson_error:
            qconfig.json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from quast_libs import basic_stats
    icarus_gc_fpath, circos_gc_fpath = basic_stats.do(
        ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'), output_dirpath)

    if qconfig.large_genome and ref_fpath:
        unique_kmers.do(os.path.join(output_dirpath, 'basic_stats'), ref_fpath, contigs_fpaths, logger)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    icarus_html_fpath = None
    circos_png_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from quast_libs import contigs_analyzer
        is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, is_cyclic, os.path.join(output_dirpath, 'contigs_reports'),
            old_contigs_fpaths, qconfig.bed)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    features_containers = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from quast_libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from quast_libs import genome_analyzer
        features_containers = genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath,
            qconfig.genes, qconfig.operons, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    genes_by_labels = None
    if qconfig.gene_finding:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from quast_libs import glimmer
            genes_by_labels = glimmer.do(contigs_fpaths, qconfig.genes_lengths,
                                         os.path.join(output_dirpath, 'predicted_genes'))
        if not qconfig.glimmer or qconfig.test:
            ########################################################################
            ### GeneMark
            ########################################################################
            from quast_libs import genemark
            genes_by_labels = genemark.do(contigs_fpaths, qconfig.genes_lengths,
                                          os.path.join(output_dirpath, 'predicted_genes'),
                                          qconfig.prokaryote, qconfig.metagenemark)
    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")

    if qconfig.rna_gene_finding:
        run_barrnap.do(contigs_fpaths, os.path.join(output_dirpath, 'predicted_genes'), logger)

    if qconfig.run_busco and not qconfig.is_combined_ref:
        if qconfig.platform_name == 'macosx':
            logger.main_info("")
            logger.warning("BUSCO can be run on Linux only")
        elif sys.version[0:3] == '2.5':
            logger.main_info("")
            logger.warning("BUSCO does not support Python versions older than 2.6.")
        else:
            from quast_libs import run_busco
            run_busco.do(contigs_fpaths, os.path.join(output_dirpath, qconfig.busco_dirname), logger)

    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots or qconfig.create_icarus_html:
        logger.print_timestamp()
        logger.main_info('Creating large visual summaries...')
        logger.main_info('This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath:
                report_for_icarus_fpath_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.icarus_report_fname_pattern)
                stdout_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.contig_report_fname_pattern)
            else:
                report_for_icarus_fpath_pattern = None
                stdout_pattern = None
            draw_alignment_plots = qconfig.draw_svg or qconfig.create_icarus_html
            draw_circos_plot = qconfig.draw_plots and ref_fpath and len(aligned_contigs_fpaths) and not qconfig.space_efficient
            number_of_steps = sum([int(bool(value)) for value in [draw_alignment_plots, draw_circos_plot, all_pdf_fpath]])
            if draw_alignment_plots:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info(' 1 of %d: Creating Icarus viewers...' % number_of_steps)
                from quast_libs import icarus
                icarus_html_fpath, contig_alignment_plot_fpath = icarus.do(
                    contigs_fpaths, report_for_icarus_fpath_pattern, output_dirpath, ref_fpath,
                    stdout_pattern=stdout_pattern, features=features_containers, cov_fpath=cov_fpath,
                    physical_cov_fpath=physical_cov_fpath, gc_fpath=icarus_gc_fpath,
                    json_output_dir=qconfig.json_output_dirpath, genes_by_labels=genes_by_labels)

            if draw_circos_plot:
                logger.main_info(' %d of %d: Creating Circos plots...' % (2 if draw_alignment_plots else 1, number_of_steps))
                from quast_libs import circos
                circos_png_fpath, circos_legend_fpath = circos.do(
                    ref_fpath, contigs_fpaths, report_for_icarus_fpath_pattern, circos_gc_fpath,
                    features_containers, cov_fpath, os.path.join(output_dirpath, 'circos'), logger)

            if all_pdf_fpath:
                # full report in PDF format: all tables and plots
                logger.main_info(' %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_fpath)
            logger.main_info('Done')
        except KeyboardInterrupt:
            # A half-written PDF is useless: remove it if the user skipped.
            logger.main_info('..step skipped!')
            if all_pdf_fpath and os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info(' Text versions of total report are saved to ' + reports_fpaths)
    logger.main_info(' Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter_data.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if all_pdf_fpath and os.path.isfile(all_pdf_fpath):
        logger.main_info(' PDF version (tables and plots) is saved to ' + all_pdf_fpath)

    if circos_png_fpath:
        logger.main_info(' Circos plot is saved to %s (the annotation is in %s). Circos configuration file is saved to %s' %
                         (circos_png_fpath, circos_legend_fpath, circos_png_fpath.replace('.png', '.conf')))

    if icarus_html_fpath:
        logger.main_info(' Icarus (contig browser) is saved to %s' % icarus_html_fpath)

    if qconfig.draw_svg and contig_alignment_plot_fpath:
        logger.main_info(' Contig alignment plot is saved to %s' % contig_alignment_plot_fpath)

    cleanup(corrected_dirpath)
    return logger.finish_up(check_test=qconfig.test)
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    # Run gene prediction with the appropriate GeneMark flavor on each
    # assembly in parallel, save per-assembly gene counts into the reports,
    # and return the predicted genes grouped by assembly label.
    #
    # Parameters:
    #   fasta_fpaths -- assembly FASTA files to predict genes in
    #   gene_lengths -- length thresholds for the "# predicted genes >= x" counts
    #   out_dirpath  -- output directory for GFF/FASTA results and tmp files
    #   prokaryote   -- True selects GeneMarkS; False selects GeneMark-ES
    #   meta         -- True selects MetaGeneMark (takes precedence over prokaryote)
    #
    # Returns:
    #   dict: assembly label -> predicted genes, or None (implicitly) when the
    #   tool cannot run (license limitation, unsupported platform, or failed
    #   installation).
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    # Choose the GeneMark flavor: meta takes precedence, then prokaryote.
    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
    else:
        successful = install_genemark(os.path.join(qconfig.LIBS_LOCATION, 'genemark', qconfig.platform_name))
        if not successful:
            return

        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        # One joblib worker per assembly (capped by max_threads); remaining
        # threads are divided among the workers for the tool itself.
        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
            index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath,
            gmhmm_p_function, prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths))

        genes_by_labels = dict()
        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            label = qutils.label_from_fpath(fasta_path)
            genes_by_labels[label], unique_count, count = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
            if count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES, count)
            if unique_count is None and count is None:
                # Both None means prediction failed for this assembly; hint at
                # the likely cause for small files under GeneMark-ES.
                logger.error(' ' + qutils.index_to_str(i) + 'Failed predicting genes in ' + label + '. ' +
                             ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                              if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000 else ''))

        if not qconfig.debug:
            # Remove per-run tmp directories (tmp_dirpath plus any suffixed
            # variants matched by the glob).
            for dirpath in glob.iglob(tmp_dirpath + '*'):
                if os.path.isdir(dirpath):
                    shutil.rmtree(dirpath)

        logger.main_info('Done.')
        return genes_by_labels
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath, old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1): nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath) assembly_label = qutils.label_from_fpath(contigs_fpath) corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath) nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label) logger.info(' ' + qutils.index_to_str(index) + assembly_label) if not qconfig.space_efficient: log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout') log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr') icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label) misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info') unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info') else: log_out_fpath = '/dev/null' log_err_fpath = '/dev/null' icarus_out_fpath = '/dev/null' misassembly_fpath = '/dev/null' unaligned_info_fpath = '/dev/null' icarus_out_f = open(icarus_out_fpath, 'w') icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group'] icarus_out_f.write('\t'.join(icarus_header_cols) + '\n') misassembly_f = open(misassembly_fpath, 'w') if not qconfig.space_efficient: logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath + ' and ' + os.path.basename(log_err_fpath) + '...') else: logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.') coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \ get_nucmer_aux_out_fpaths(nucmer_fpath) nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index, parallel_by_chr, threads, log_out_fpath, log_err_fpath) if 
nucmer_status != NucmerStatus.OK: with open(log_err_fpath, 'a') as log_err_f: if nucmer_status == NucmerStatus.ERROR: logger.error(' ' + qutils.index_to_str(index) + 'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) + ' to the reference (non-zero exit code). ' + ('Run with the --debug flag to see additional information.' if not qconfig.debug else '')) elif nucmer_status == NucmerStatus.FAILED: log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n') logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.') elif nucmer_status == NucmerStatus.NOT_ALIGNED: log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n') logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.') clean_tmp_files(nucmer_fpath) return nucmer_status, {}, [], [], [] log_out_f = open(log_out_fpath, 'a') # Loading the alignment files log_out_f.write('Parsing coords...\n') aligns = {} coords_file = open(coords_fpath) coords_filtered_file = open(coords_filtered_fpath, 'w') coords_filtered_file.write(coords_file.readline()) coords_filtered_file.write(coords_file.readline()) for line in coords_file: if line.strip() == '': break assert line[0] != '=' #Clear leading spaces from nucmer output #Store nucmer lines in an array mapping = Mapping.from_line(line) aligns.setdefault(mapping.contig, []).append(mapping) # Loading the reference sequences log_out_f.write('Loading reference...\n') # TODO: move up ref_lens = {} ref_features = {} for name, seq in fastaparser.read_fasta(ref_fpath): name = name.split()[0] # no spaces in reference header ref_lens[name] = len(seq) log_out_f.write('\tLoaded [%s]\n' % name) #Loading the SNP calls if qconfig.show_snps: log_out_f.write('Loading SNPs...\n') used_snps_file = None snps = {} if qconfig.show_snps: prev_line = None for line in open_gzipsafe(show_snps_fpath): 
#print "$line"; line = line.split() if not line[0].isdigit(): continue if prev_line and line == prev_line: continue ref = line[10] ctg = line[11] pos = int(line[0]) # Kolya: python don't convert int<->str types automatically loc = int(line[3]) # Kolya: same as above # if (! exists $line[11]) { die "Malformed line in SNP file. Please check that show-snps has completed succesfully.\n$line\n[$line[9]][$line[10]][$line[11]]\n"; } if pos in snps.setdefault(ref, {}).setdefault(ctg, {}): snps.setdefault(ref, {}).setdefault(ctg, {})[pos].append(SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])) else: snps.setdefault(ref, {}).setdefault(ctg, {})[pos] = [SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])] prev_line = line used_snps_file = open_gzipsafe(used_snps_fpath, 'w') # Loading the regions (if any) regions = {} total_reg_len = 0 total_regions = 0 # # TODO: gff # log_out_f.write('Loading regions...\n') # log_out_f.write('\tNo regions given, using whole reference.\n') for name, seq_len in ref_lens.items(): regions.setdefault(name, []).append([1, seq_len]) total_regions += 1 total_reg_len += seq_len log_out_f.write('\tTotal Regions: %d\n' % total_regions) log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len) ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file, used_snps_f=used_snps_file, icarus_out_f=icarus_out_f) log_out_f.write('Analyzing contigs...\n') result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\ analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic) # if qconfig.large_genome: # log_out_f.write('Analyzing large blocks...\n') # large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null' # ca_large_output = CAOutput(stdout_f=log_out_f, 
misassembly_f=open(large_misassembly_fpath, 'w'), # coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'), icarus_out_f=open('/dev/null', 'w')) # min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold # qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD # result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null', # aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0]) # qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold log_out_f.write('Analyzing coverage...\n') if qconfig.show_snps: log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n') result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info)) result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result) if not qconfig.space_efficient: ## outputting misassembled contigs to separate file fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath) if name in misassembled_contigs.keys()] fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta) if qconfig.is_combined_ref: alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv') unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label) logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath)) used_contigs = set() with open(unique_contigs_fpath, 'w') as unique_contigs_f: with open(alignment_tsv_fpath, 'w') as alignment_tsv_f: for chr_name, aligns in ref_aligns.items(): alignment_tsv_f.write(chr_name) contigs = set([align.contig for align in aligns]) for contig in contigs: alignment_tsv_f.write('\t' + contig) if qconfig.is_combined_ref: ref_name 
= ref_labels_by_chromosomes[chr_name] align_by_contigs = defaultdict(int) for align in aligns: align_by_contigs[align.contig] += align.len2 for contig, aligned_len in align_by_contigs.items(): if contig in used_contigs: continue used_contigs.add(contig) len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)') if len_cov_pattern.findall(contig): contig_len = len_cov_pattern.findall(contig)[0][0] contig_cov = len_cov_pattern.findall(contig)[0][1] if aligned_len / float(contig_len) > 0.9: unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n') alignment_tsv_f.write('\n') close_handlers(ca_output) logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.') logger.debug('') clean_tmp_files(nucmer_fpath) if not qconfig.no_gzip: compress_nucmer_output(logger, nucmer_fpath) if not ref_aligns: return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs else: return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def main(args):
    """MetaQUAST entry point: evaluate metagenome assemblies against one or
    more references.

    High-level flow (all stages log via the module-level ``logger``):
      1. parse options, set up output/corrected directories;
      2. correct the provided references and assemblies (or, if no references
         were given, search SILVA 16S rRNA / download from NCBI);
      3. run quast.py once against the combined reference;
      4. optionally filter out downloaded references with low genome fraction
         and re-run on the corrected combined reference;
      5. partition contigs per reference and run quast.py per reference
         (in parallel when resources allow), plus once for unaligned contigs;
      6. summarize all runs (text/HTML/Icarus reports).

    :param args: command-line arguments (without the program name).
    :return: an exit-code-like int (4 on bad input) or the result of
        ``logger.finish_up(...)``.
    """
    # Refuse to run from a problematic QUAST home directory (e.g. bad path);
    # check_dirpath exits with code 3 on failure.
    check_dirpath(qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '.\n' +
                  'Please, put QUAST in a different directory, then try again.\n', exit_code=3)

    if not args:
        qconfig.usage(stream=sys.stderr)
        sys.exit(1)

    # parse_options mutates global qconfig and returns the args to forward to
    # the underlying quast.py runs plus the assembly file paths.
    metaquast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, metaquast_path + args)
    output_dirpath, ref_fpaths, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    html_report = qconfig.html_report
    test_mode = qconfig.test

    # Directories
    output_dirpath, _, _ = qutils.set_up_output_dir(
        output_dirpath, None, not output_dirpath,
        save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    qconfig.set_max_threads(logger)
    qutils.logger = logger

    ########################################################################

    from quast_libs import reporting
    # Reset the (module-global) reporting state in case of repeated runs in
    # one interpreter; the bare except is the Python 2 fallback where
    # reload() is a builtin and the imp module path may fail.
    try:
        import imp
        imp.reload(reporting)
    except:
        reload(reporting)
    from quast_libs import plotter

    # Start from a clean "corrected input" directory on every run.
    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')

        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            correct_meta_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    qconfig.no_check_meta = True
    assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.")
        return 4

    # Running QUAST(s)
    if qconfig.gene_finding:
        quast_py_args += ['--mgm']
    if qconfig.min_IDY is None:  # special case: user not specified min-IDY, so we need to use MetaQUAST default value
        quast_py_args += ['--min-identity', str(qconfig.META_MIN_IDY)]
    downloaded_refs = False

    # SEARCHING REFERENCES
    # No references supplied: either download the ones listed in
    # --references-txt, or search SILVA 16S rRNA and download from NCBI
    # (unless --max-ref-number is 0, which disables the search entirely).
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice("Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled")
        else:
            if qconfig.references_txt:
                logger.main_info("List of references was provided, starting to download reference genomes from NCBI...")
            else:
                logger.main_info("No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                                 "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath, corrected_dirpath, qconfig.references_txt)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not qconfig.references_txt:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=True)
            elif test_mode and not ref_fpaths:
                logger.error('Failed to download or setup SILVA 16S rRNA database for working without '
                             'references on metagenome datasets!', to_stderr=True, exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice('No references are provided, starting regular QUAST with MetaGeneMark gene finder')
        assemblies = [Assembly(fpath, qutils.label_from_fpath(fpath)) for fpath in contigs_fpaths]
        _start_quast_main(quast_py_args, assemblies=assemblies, output_dirpath=output_dirpath, run_regular_quast=True)
        exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)
    qconfig.reference = combined_ref_fpath

    if qconfig.bed:
        quast_py_args += ['--sv-bed']
        quast_py_args += [qconfig.bed]

    quast_py_args += ['--combined-ref']
    if qconfig.draw_plots or qconfig.html_report:
        # Propagate the per-assembly color/line-style mapping to the child
        # quast.py run so plots are consistent across all runs.
        if plotter_data.dict_color_and_ls:
            colors_and_ls = [plotter_data.dict_color_and_ls[asm.label] for asm in assemblies]
            quast_py_args += ['--colors']
            quast_py_args += [','.join([style[0] for style in colors_and_ls])]
            quast_py_args += ['--ls']
            quast_py_args += [','.join([style[1] for style in colors_and_ls])]
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    # Running totals of (notices, warnings, non-fatal errors) over all child runs.
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    if qconfig.html_report:
        from quast_libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None  # sentinel: HTML report disabled, collect nothing

    if qconfig.unique_mapping:
        ambiguity_opts = []
    else:
        ambiguity_opts = ["--ambiguity-usage", 'all']
    return_code, total_num_notifications = \
        _start_quast_main(quast_py_args + ambiguity_opts,
                          labels=labels,
                          assemblies=assemblies,
                          reference_fpath=combined_ref_fpath,
                          output_dirpath=combined_output_dirpath,
                          num_notifications_tuple=total_num_notifications,
                          is_combined_ref=True)

    if json_texts is not None:
        # json_saver.json_text holds the output of the run just finished
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    # Missing genome_info.txt means nothing aligned to any reference.
    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name, 'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    if not os.path.exists(genome_info_fpath):
        logger.main_info('')
        if not downloaded_refs:
            msg = 'Try to restart MetaQUAST with another references.'
        else:
            msg = 'Try to use option --max-ref-number to change maximum number of references (per each assembly) to download.'
        logger.main_info('Failed aligning the contigs for all the references. ' + msg)
        logger.main_info('')
        cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if downloaded_refs and return_code == 0:
        # Drop auto-downloaded references that attracted too few alignments,
        # then redo the combined-reference run on the filtered set.
        logger.main_info()
        logger.main_info('Excluding downloaded references with low genome fraction from further analysis..')
        corr_ref_fpaths = get_downloaded_refs_with_alignments(genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = OrderedDict()
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names = \
                correct_meta_references(corr_ref_fpaths, corrected_dirpath)
            assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications = \
                _start_quast_main(quast_py_args + ambiguity_opts,
                                  labels=labels,
                                  assemblies=assemblies,
                                  reference_fpath=combined_ref_fpath,
                                  output_dirpath=combined_output_dirpath,
                                  num_notifications_tuple=total_num_notifications,
                                  is_combined_ref=True)
            if json_texts is not None:
                # the re-run supersedes the first combined-reference result
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info('All downloaded references have genome fraction more than 10%. Nothing was excluded.')
        else:
            logger.main_info('All downloaded references have low genome fraction. Nothing was excluded for now.')

    if return_code != 0:
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if qconfig.calculate_read_support:
        calculate_ave_read_support(combined_output_dirpath, assemblies)

    prepare_regular_quast_args(quast_py_args, combined_output_dirpath)
    logger.main_info()
    logger.main_info('Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, 'contigs_reports', 'alignments_%s.tsv'), labels)

    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    # Parallelize across references only when it pays off: more references
    # than assemblies and spare threads available (and not memory-efficient mode).
    if not qconfig.memory_efficient and \
            len(assemblies_by_reference) > len(assemblies) and len(assemblies) < qconfig.max_threads:
        logger.main_info()
        logger.main_info('Run QUAST on different references in parallel..')
        threads_per_ref = max(1, qconfig.max_threads // len(assemblies_by_reference))
        quast_py_args += ['--memory-efficient']
        quast_py_args += ['-t', str(threads_per_ref)]

        num_notifications = (0, 0, 0)
        parallel_run_args = [(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, num_notifications, True)
                             for ref_fpath, ref_assemblies in assemblies_by_reference]
        ref_names, ref_json_texts, ref_notifications = \
            run_parallel(_run_quast_per_ref, parallel_run_args, qconfig.max_threads, filter_results=True)
        # Sum notification counters element-wise across all per-reference runs.
        per_ref_num_notifications = list(map(sum, zip(*ref_notifications)))
        total_num_notifications = list(map(sum, zip(total_num_notifications, per_ref_num_notifications)))
        if json_texts is not None:
            json_texts.extend(ref_json_texts)
        # Undo the temporary per-reference flags before the next child run.
        quast_py_args.remove('--memory-efficient')
        quast_py_args = remove_from_quast_py_args(quast_py_args, '-t', str(threads_per_ref))
    else:
        # Sequential per-reference runs.
        ref_names = []
        for ref_fpath, ref_assemblies in assemblies_by_reference:
            ref_name, json_text, total_num_notifications = \
                _run_quast_per_ref(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, total_num_notifications)
            if not ref_name:
                continue
            ref_names.append(ref_name)
            if json_texts is not None:
                json_texts.append(json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '... (logging to ' +
                         os.path.join(output_dirpath, qconfig.not_aligned_name, qconfig.LOGGER_DEFAULT_NAME + '.log)'))

        return_code, total_num_notifications = _start_quast_main(quast_py_args + ['-t', str(qconfig.max_threads)],
                                                                 assemblies=not_aligned_assemblies,
                                                                 output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
                                                                 num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error('Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    # SUMMARIZING: aggregate the per-reference runs into the meta summary
    # (plus HTML/Icarus reports when enabled).
    if ref_names:
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from quast_libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(output_dirpath)
        else:
            html_summary_report_fpath = None
        from quast_libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembly_metrics = [reporting.Fields.MIS_RELOCATION, reporting.Fields.MIS_TRANSLOCATION, reporting.Fields.MIS_INVERTION,
                               reporting.Fields.MIS_ISTRANSLOCATIONS]
        # Include the pseudo-reference bin for unaligned contigs only if such
        # contigs actually exist.
        if no_unaligned_contigs:
            full_ref_names = [qutils.name_from_fpath(ref_fpath) for ref_fpath in corrected_ref_fpaths]
        else:
            full_ref_names = [qutils.name_from_fpath(ref_fpath) for ref_fpath in corrected_ref_fpaths] + [qconfig.not_aligned_name]
        create_meta_summary.do(html_summary_report_fpath, summary_output_dirpath, combined_output_dirpath,
                               output_dirpath_per_ref, metrics_for_plots, misassembly_metrics, full_ref_names)
        if html_report and json_texts:
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter_data.dict_color_and_ls, meta=True)
            if qconfig.create_icarus_html:
                icarus_html_fpath = html_saver.create_meta_icarus(output_dirpath, ref_names)
                logger.main_info(' Icarus (contig browser) is saved to %s' % icarus_html_fpath)
            html_saver.create_meta_report(output_dirpath, json_texts)

    cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)