def create_plot(plot_fpath, title, plots, legend_list=None, x_label=None, y_label=None, vertical_legend=False,
                is_histogram=False, x_limit=None, y_limit=None, x_ticks=None, vertical_ticks=False,
                add_to_report=True, logger=logger):
    """Draw the given `plots` on a single matplotlib figure and save it to disk.

    plot_fpath      -- output path WITHOUT extension (qconfig.plot_extension is appended)
    plots           -- objects exposing .get_max_y() and .plot() (see callers)
    legend_list     -- optional legend labels; drawn via add_legend when given
    y_limit         -- explicit y range; defaults to [0, max(5, ceil(max_y * 1.1))]
    add_to_report   -- when True the figure is queued in pdf_plots_figures for the PDF report

    NOTE: `logger=logger` binds the module-level logger at definition time.
    NOTE(review): `font`, `n_columns`, `with_title`, `can_draw_plots`, `save_plot`
    and `pdf_plots_figures` are not defined in this function -- presumably
    module-level globals of this plotter module; verify.
    """
    figure = plt.gcf()
    plt.rc('font', **font)
    max_y = 0
    ax = set_ax(vertical_legend)
    # First pass: draw every plot and track the overall y maximum for axis limits.
    for plot in plots:
        max_y = max(max_y, plot.get_max_y())
        plot.plot()
    if legend_list:
        add_legend(ax, legend_list, n_columns=n_columns, vertical_legend=vertical_legend)
    add_labels(x_label, y_label, max_y, ax, is_histogram=is_histogram)
    if x_limit:
        plt.xlim(x_limit)
    # Default y limit: at least 5, or 10% headroom above the tallest plot.
    y_limit = y_limit or [0, max(5, int(math.ceil(max_y * 1.1)))]
    plt.ylim(y_limit)
    if x_ticks:
        plt.xticks(range(len(x_ticks)), x_ticks, size='small', rotation='vertical' if vertical_ticks else None)
    # Headless / no-drawing mode: discard the figure without saving.
    if not can_draw_plots:
        plt.close()
        return
    if with_title:
        plt.title(title)
    plot_fpath += '.' + qconfig.plot_extension
    if qconfig.is_combined_ref:
        # matplotlib needs to be run in parallel for combined reference to prevent fail in parallel runs per reference
        run_parallel(save_plot, [(plot_fpath,)], 2)
    else:
        save_plot(plot_fpath)
    logger.info(' saved to ' + plot_fpath)
    if add_to_report:
        pdf_plots_figures.append(figure)
    plt.close('all')
def do(contigs_fpaths, output_dir, logger):
    """Run Barrnap on each assembly to predict ribosomal RNA genes.

    Writes one <label>.rna.gff per assembly into `output_dir` and stores
    "total + partial" rRNA gene counts in each assembly's report.
    """
    logger.print_timestamp()
    logger.info('Running Barrnap...')
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)  # split threads between parallel jobs
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    log_fpath = join(output_dir, 'barrnap.log')
    logger.info('Logging to ' + log_fpath + '...')
    kingdom = 'bac' if qconfig.prokaryote else 'euk'
    gff_fpaths = [join(output_dir, qutils.label_from_fpath_for_fname(contigs_fpath) + '.rna.gff')
                  for contigs_fpath in contigs_fpaths]
    barrnap_args = [(contigs_fpath, gff_fpath, log_fpath, threads, kingdom)
                    for contigs_fpath, gff_fpath in zip(contigs_fpaths, gff_fpaths)]
    run_parallel(run, barrnap_args, qconfig.max_threads)

    # NOTE(review): gff_fpaths entries are always non-empty strings, so this only
    # guards against an empty assembly list -- presumably the intent was to check
    # that at least one GFF file was actually produced; verify against `run`.
    if not any(fpath for fpath in gff_fpaths):
        logger.info('Failed predicting the location of ribosomal RNA genes.')
        return

    # saving results
    for index, (contigs_fpath, gff_fpath) in enumerate(zip(contigs_fpaths, gff_fpaths)):
        # Fix: check that Barrnap actually produced the GFF *before* opening it.
        # The original called open() first, so a missing file raised IOError and
        # this error branch was unreachable. Also close the handle (with-block).
        if not os.path.isfile(gff_fpath):
            logger.error('Failed running Barrnap for ' + contigs_fpath + '. See ' + log_fpath + ' for information.')
            continue
        with open(gff_fpath) as gff_file:
            genes = parse_gff(gff_file, 'rrna')
        report = reporting.get(contigs_fpath)
        # Genes whose 'product' attribute mentions 'partial' are counted separately.
        part_count = len([gene for gene in genes
                          if 'product' in gene.attributes and 'partial' in gene.attributes['product']])
        total_count = len(genes)
        report.add_field(reporting.Fields.RNA_GENES, '%s + %s part' % (total_count - part_count, part_count))
        logger.info(' ' + qutils.index_to_str(index) + ' Ribosomal RNA genes = ' + str(total_count))
        logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + gff_fpath)
    logger.info('Done.')
def search_sv_with_gridss(main_ref_fpath, bam_fpath, meta_ref_fpaths, output_dirpath, err_fpath):
    """Detect structural variations with GRIDSS.

    Returns the path to the resulting BED file, or None when the required
    external tools (Java 1.8+, R) are unavailable.
    """
    logger.info(' Searching structural variations with GRIDSS...')
    final_bed_fpath = join(output_dirpath, qutils.name_from_fpath(main_ref_fpath) + '_' + qconfig.sv_bed_fname)

    # Reuse a previously computed result if present.
    if isfile(final_bed_fpath):
        logger.info(' Using existing file: ' + final_bed_fpath)
        return final_bed_fpath

    # GRIDSS requires both a recent Java and R.
    if not (get_path_to_program('java') and check_java_version(1.8)):
        logger.warning('Java 1.8 or later is required to run GRIDSS. Please install it and rerun QUAST.')
        return None
    if not get_path_to_program('Rscript'):
        logger.warning('R is required to run GRIDSS. Please install it and rerun QUAST.')
        return None

    if meta_ref_fpaths:
        # One job per reference, splitting the available threads between jobs.
        num_jobs = min(len(meta_ref_fpaths), qconfig.max_threads)
        threads_per_job = max(1, qconfig.max_threads // num_jobs)
        job_args = [(ref_fpath, output_dirpath, err_fpath, threads_per_job) for ref_fpath in meta_ref_fpaths]
        bed_fpaths = run_parallel(process_one_ref, job_args, num_jobs, filter_results=True)
        if bed_fpaths:
            qutils.cat_files(bed_fpaths, final_bed_fpath)
    else:
        process_one_ref(main_ref_fpath, output_dirpath, err_fpath, qconfig.max_threads,
                        bam_fpath=bam_fpath, bed_fpath=final_bed_fpath)
    logger.info(' Saving to: ' + final_bed_fpath)
    return final_bed_fpath
def partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template, labels):
    """Split every assembly's contigs among the references they align to.

    Returns (assemblies_by_ref, not_aligned_assemblies) where assemblies_by_ref
    is a list of (ref_fpath, assemblies) pairs ordered like `ref_fpaths`, with
    each reference's assemblies ordered by `labels`.
    """
    # array of assemblies for each reference
    per_ref_lists = {qutils.name_from_fpath(ref_fpath): [] for ref_fpath in ref_fpaths}
    job_count = min(qconfig.max_threads, len(assemblies))
    worker_args = [(asm, per_ref_lists, corrected_dirpath, alignments_fpath_template) for asm in assemblies]
    assemblies_dicts, not_aligned_assemblies = run_parallel(parallel_partition_contigs, worker_args, job_count)

    result = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        # Collect the (deduplicated) assemblies every worker produced for this reference.
        candidates = set()
        for asm_dict in assemblies_dicts:
            candidates.update(asm_dict[ref_name])
        # Re-impose the user-specified label order: first match per label wins.
        ordered = []
        for label in labels:
            for assembly in candidates:
                if assembly.label == label:
                    ordered.append(assembly)
                    break
        result.append((ref_fpath, ordered))
    return result, not_aligned_assemblies
def fill_all_pdf_file(all_pdf_fpath):
    """Write all collected table/plot figures into a single PDF, then reset the buffers."""
    if not can_draw_plots or not all_pdf_fpath:
        return

    # moving main report in the beginning
    global pdf_tables_figures
    global pdf_plots_figures
    if pdf_tables_figures:
        pdf_tables_figures = pdf_tables_figures[-1:] + pdf_tables_figures[:-1]

    if qconfig.is_combined_ref:
        # NOTE(review): run via run_parallel, presumably to isolate matplotlib
        # in combined-reference mode -- see the analogous branch in create_plot.
        run_parallel(save_to_pdf, [(all_pdf_fpath, )], 2)
    else:
        save_to_pdf(all_pdf_fpath)

    pdf_tables_figures = []
    pdf_plots_figures = []
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Predict genes in each assembly with GlimmerHMM.

    Stores predicted-gene counts in each assembly's report and returns a dict
    mapping assembly label -> list of predicted genes. Returns None when the
    Glimmer binary could not be compiled.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')
    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return
    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    parallel_args = [(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tool_exec_fpath, tmp_dirpath)
                     for index, contigs_fpath in enumerate(contigs_fpaths)]
    # run_parallel returns per-assembly lists, index-aligned with contigs_fpaths.
    genes_list, unique, full_genes, partial_genes = run_parallel(predict_genes, parallel_args, n_jobs)
    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label] = genes_list[i]
        if unique[i] is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique[i])
        if full_genes[i] is not None:
            # One "full + partial part" string per gene-length threshold.
            genes = ['%s + %s part' % (full_cnt, partial_cnt)
                     for full_cnt, partial_cnt in zip(full_genes[i], partial_genes[i])]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        # Both results missing means Glimmer failed for this assembly.
        if unique[i] is None and full_genes[i] is None:
            logger.error('Failed running Glimmer for %s. ' % label + ('Run with the --debug option'
                         ' to see the command line.' if not qconfig.debug else ''))
    # Keep intermediate files only in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.main_info('Done.')
    return genes_by_labels
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Predict genes in each assembly with GlimmerHMM and fill the reports.

    Returns a dict mapping assembly label -> list of predicted genes, or None
    when the Glimmer binary could not be compiled.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    glimmer_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    scratch_dirpath = os.path.join(out_dirpath, 'tmp')
    glimmer_exec = compile_glimmer(logger)
    if not glimmer_exec:
        return
    for dirpath in (out_dirpath, scratch_dirpath):
        if not os.path.isdir(dirpath):
            os.makedirs(dirpath)

    job_count = min(len(contigs_fpaths), qconfig.max_threads)
    job_args = []
    for idx, contigs_fpath in enumerate(contigs_fpaths):
        job_args.append((idx, contigs_fpath, gene_lengths, out_dirpath,
                         glimmer_dirpath, glimmer_exec, scratch_dirpath))
    genes_list, unique, full_genes, partial_genes = run_parallel(predict_genes, job_args, job_count)

    genes_by_labels = {}
    # saving results
    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label] = genes_list[idx]
        if unique[idx] is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique[idx])
        if full_genes[idx] is not None:
            counts = ['%s + %s part' % pair for pair in zip(full_genes[idx], partial_genes[idx])]
            report.add_field(reporting.Fields.PREDICTED_GENES, counts)
        if unique[idx] is None and full_genes[idx] is None:
            hint = '' if qconfig.debug else 'Run with the --debug option to see the command line.'
            logger.error('Failed running Glimmer for %s. ' % label + hint)

    if not qconfig.debug:
        shutil.rmtree(scratch_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, features_dict, operons_fpaths,
       detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Run the Genome analyzer.

    Loads genomic features/operons, computes per-assembly coverage statistics,
    writes a text summary to genome_stats_dirpath/genome_info.txt, fills the
    per-assembly reports, and optionally produces HTML data and plots.
    Returns the list of loaded FeatureContainer objects (None-equivalent early
    return if all assemblies failed).
    """
    coords_dirpath = os.path.join(detailed_contigs_reports_dirpath, qconfig.minimap_output_dirname)
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        coords_dirpath = os.path.join(coords_dirpath, 'raw')
    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')
    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)
    genome_size, reference_chromosomes, ns_by_chromosomes = fastaparser.get_genome_stats(ref_fpath)
    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = os.path.join(genome_stats_dirpath, 'genome_info.txt')
    res_file = open(result_fpath, 'w')

    # One FeatureContainer per feature type (plus one for operons, below).
    containers = []
    for feature, feature_fpath in features_dict.items():
        containers.append(FeatureContainer([feature_fpath], feature))
    if not features_dict:
        logger.notice('No file with genomic features were provided. '
                      'Use the --features option if you want to specify it.\n', indent=' ')
    if operons_fpaths:
        containers.append(FeatureContainer(operons_fpaths, 'operon'))
    else:
        logger.notice('No file with operons were provided. '
                      'Use the -O option if you want to specify it.', indent=' ')
    # Load regions from each container's files and log/write the counts.
    for container in containers:
        if not container.fpaths:
            continue
        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)
        if len(container.region_list) == 0:
            logger.warning('No genomic features of type "' + container.kind + '" were loaded.', indent=' ')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + 'None' + '\n')
        else:
            logger.info(' Loaded ' + str(len(container.region_list)) + ' genomic features of type "'
                        + container.kind + '"')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: '
                           + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list,
                                                              list(reference_chromosomes.keys()))

    # Record reference-wide feature/operon totals in every assembly's report.
    ref_genes_num, ref_operons_num = None, None
    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        genomic_features = 0
        for container in containers:
            if container.kind == 'operon':
                ref_operons_num = len(container.region_list)
                report.add_field(reporting.Fields.REF_OPERONS, len(container.region_list))
            else:
                genomic_features += len(container.region_list)
        if genomic_features:
            ref_genes_num = genomic_features
            report.add_field(reporting.Fields.REF_GENES, genomic_features)

    # for cumulative plots:
    files_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_unsorted_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}
    files_unsorted_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    parallel_run_args = [(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                          reference_chromosomes, ns_by_chromosomes, containers)
                         for index, contigs_fpath in enumerate(aligned_contigs_fpaths)]
    ref_lengths, results_genes_operons_tuples = run_parallel(process_single_file, parallel_run_args, n_jobs,
                                                             filter_results=True)
    # Assemblies filtered out by run_parallel count as non-fatal errors.
    num_nf_errors += len(aligned_contigs_fpaths) - len(ref_lengths)
    logger._num_nf_errors = num_nf_errors
    if not ref_lengths:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    # NOTE(review): `ref_lengths_by_contigs` is not defined in this function --
    # presumably a module-level dict; verify it is (re)initialized before each run.
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, '
                       + 'total length without N\'s: ' + str(chr_len - len(ns_by_chromosomes[chr_name]))
                       + ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('=' * 120 + '\n')

    # Per-assembly results row + report fields.
    for contigs_fpath, (results, unsorted_features_in_contigs, features_in_contigs,
                        unsorted_operons_in_contigs, operons_in_contigs)\
            in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_features_in_contigs[contigs_fpath] = features_in_contigs
        files_unsorted_features_in_contigs[contigs_fpath] = unsorted_features_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        files_unsorted_operons_in_contigs[contigs_fpath] = unsorted_operons_in_contigs
        full_found_genes.append(sum(features_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
                       % (assembly_name[:24], report.get_field(reporting.Fields.MAPPEDGENOME),
                          report.get_field(reporting.Fields.DUPLICATION_RATIO), gaps_count))
        genome_mapped.append(float(report.get_field(reporting.Fields.MAPPEDGENOME)))
        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                    (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if ref_genes_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'features',
                                                files_features_in_contigs, ref_genes_num)
        if ref_operons_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        from quast_libs.ca_utils.misc import contigs_aligned_lengths
        if ref_genes_num:
            plotter.genes_operons_plot(ref_genes_num, aligned_contigs_fpaths, files_features_in_contigs,
                                       genome_stats_dirpath + '/features_cumulative_plot', 'genomic features')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths,
                             files_unsorted_features_in_contigs,
                             genome_stats_dirpath + '/features_frcurve_plot', 'genomic features')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_features_histogram',
                              '# complete genomic features')
        if ref_operons_num:
            plotter.genes_operons_plot(ref_operons_num, aligned_contigs_fpaths, files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths,
                             files_unsorted_operons_in_contigs,
                             genome_stats_dirpath + '/operons_frcurve_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)
    logger.main_info('Done.')
    return containers
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Run the Contig analyzer: align every assembly to `reference` and collect stats.

    Returns (aligner_statuses, aligned_lengths_per_fpath):
      aligner_statuses          -- dict contigs_fpath -> AlignerStatus
      aligned_lengths_per_fpath -- dict contigs_fpath -> aligned lengths
    Returns (all-FAILED dict, None) when the aligner fails to compile.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [AlignerStatus.FAILED] * len(contigs_fpaths))), None

    num_nf_errors = logger._num_nf_errors
    create_minimap_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)
    # NOTE(review): Parallel/delayed appear unused in this function; the imports
    # are kept in case their side effects are relied on -- verify before removing.
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    genome_size, reference_chromosomes, ns_by_chromosomes = get_genome_stats(reference, skip_ns=True)
    # In memory-efficient mode each job may use all the threads.
    threads = qconfig.max_threads if qconfig.memory_efficient else threads
    args = [(is_cyclic, i, contigs_fpath, output_dir, reference, reference_chromosomes, ns_by_chromosomes,
             old_contigs_fpath, bed_fpath, threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))]
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = run_parallel(
        align_and_analyze, args, n_jobs)
    reports = []

    aligner_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))
    # Hoisted: the original rebuilt list(aligner_statuses.values()) four times below.
    status_values = list(aligner_statuses.values())

    if AlignerStatus.OK in status_values:
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == AlignerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference, genome_size))
        elif statuses[index] == AlignerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if AlignerStatus.OK in status_values:
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i])
                                            for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths,
                             misassemblies_in_contigs, join(output_dir, 'misassemblies_frcurve_plot'),
                             'misassemblies')

    oks = status_values.count(AlignerStatus.OK)
    not_aligned = status_values.count(AlignerStatus.NOT_ALIGNED)
    failed = status_values.count(AlignerStatus.FAILED)
    errors = status_values.count(AlignerStatus.ERROR)
    problems = not_aligned + failed + errors
    # Fix: renamed `all` -> `n_assemblies`; the original shadowed the builtin.
    n_assemblies = len(aligner_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == n_assemblies:
        logger.main_info('Done.')
    if oks < n_assemblies and problems < n_assemblies:
        logger.main_info('Done for ' + str(n_assemblies - problems) + ' out of ' + str(n_assemblies) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == n_assemblies:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return aligner_statuses, aligned_lengths_per_fpath
def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO on each assembly to assess gene-set completeness.

    Requires Augustus and the BLAST binaries; stores the percentages of
    complete and fragmented BUSCOs in each assembly's report.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')
    compilation_success = True
    # Download/compile Augustus; any failure disables the whole analysis.
    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger):
        compilation_success = False
    if compilation_success and not download_blast_binaries(logger=logger, filenames=blast_filenames):
        compilation_success = False
    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return
    set_augustus_dir(augustus_dirpath)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)
    # Lineage database selection depends on the organism flags.
    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote, is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return
    log_fpath = join(output_dir, 'busco.log')
    logger.info('Logging to ' + log_fpath + '...')
    # One BUSCO command line per assembly; busco.main is invoked in parallel.
    busco_args = [(['-i', contigs_fpath, '-o', qutils.label_from_fpath_for_fname(contigs_fpath),
                    '-l', clade_dirpath, '-m', 'genome', '-f', '-z', '-c', str(busco_threads), '-t', tmp_dir,
                    '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' + join(augustus_dirpath, 'config') + '\''],
                   output_dir)
                  for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. '
                     'See ' + log_fpath + ' for information.')
        return
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            # Parse counts from the BUSCO short-summary text file.
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(reporting.Fields.BUSCO_COMPLETE,
                                 ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' % (float(part_buscos) * 100.0 / total_buscos)))
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath + '. See ' + log_fpath + ' for information.')
    logger.info('Done.')
def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO (config-file driven variant) on each assembly.

    Stores the percentages of complete and fragmented BUSCOs in each
    assembly's report and copies the BUSCO summaries into `output_dir`.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')
    compilation_success = True
    # Download/compile Augustus; any failure disables the whole analysis.
    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger):
        compilation_success = False
    if compilation_success and not download_blast_binaries(logger=logger, filenames=blast_filenames):
        compilation_success = False
    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)
    # Lineage database selection depends on the organism flags.
    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote, is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    # BUSCO is configured through environment variables + a generated config file.
    config_fpath = make_config(output_dir, tmp_dir, busco_threads, clade_dirpath, augustus_dirpath)
    logger.info('Logs and results will be saved under ' + output_dir + '...')
    os.environ['BUSCO_CONFIG_FILE'] = config_fpath
    # NOTE(review): if copy_augustus_configs returns None, this assignment raises
    # TypeError before the error message below can be logged -- verify.
    os.environ['AUGUSTUS_CONFIG_PATH'] = copy_augustus_configs(augustus_dirpath, tmp_dir)
    if not os.environ['AUGUSTUS_CONFIG_PATH']:
        logger.error('Augustus configs not found, failed to run BUSCO without them.')
    busco_args = [[contigs_fpath, qutils.label_from_fpath_for_fname(contigs_fpath)]
                  for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco_main_handler, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. '
                     'See log files in ' + output_dir + ' for information '
                     '(rerun with --debug to keep all intermediate files).')
        return

    # saving results
    zero_output_for_all = True
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            # Parse counts from the BUSCO short-summary text file.
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(reporting.Fields.BUSCO_COMPLETE,
                                 ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' % (float(part_buscos) * 100.0 / total_buscos)))
            if complete_buscos + part_buscos > 0:
                zero_output_for_all = False
            shutil.copy(summary_fpaths[i], output_dir)
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath + '. See the log for detailed information'
                         ' (rerun with --debug to keep all intermediate files).')
    # All assemblies produced empty results: point the user at likely causes.
    if zero_output_for_all:
        logger.warning('BUSCO did not fail explicitly but found nothing for all assemblies! '
                       'Possible reasons and workarounds:\n'
                       ' 1. Provided assemblies are so small that they do not contain even a single partial BUSCO gene. Not likely but may happen -- nothing to worry then.\n'
                       ' 2. Incorrect lineage database was used. To run with fungi DB use --fungus, to run with eukaryota DB use --eukaryote, otherwise BUSCO uses bacteria DB.\n'
                       ' 3. Problem with BUSCO dependencies, most likely Augustus. Check that the binaries in '
                       + augustus_dirpath + '/bin/ are working properly.\n'
                       ' If something is wrong with Augustus, you may try to install it yourself (https://github.com/Gaius-Augustus/Augustus) and add "augustus" binary to PATH.\n'
                       ' 4. Some other problem with BUSCO. '
                       'Check the logs (you may need to rerun QUAST with --debug to see all intermediate files).\n'
                       ' If you cannot solve the problem yourself, post an issue at https://github.com/ablab/quast/issues or write to [email protected]')
    if not qconfig.debug:
        cleanup(output_dir)
    logger.info('Done.')
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, features_dict, operons_fpaths,
       detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Run the Genome analyzer (duplicate of the earlier definition in this file).

    Loads genomic features/operons, computes per-assembly coverage statistics,
    writes a text summary to genome_stats_dirpath/genome_info.txt, fills the
    per-assembly reports, and optionally produces HTML data and plots.
    Returns the list of loaded FeatureContainer objects.
    """
    coords_dirpath = os.path.join(detailed_contigs_reports_dirpath, qconfig.minimap_output_dirname)
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        coords_dirpath = os.path.join(coords_dirpath, 'raw')
    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')
    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)
    genome_size, reference_chromosomes, ns_by_chromosomes = fastaparser.get_genome_stats(ref_fpath)
    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = os.path.join(genome_stats_dirpath, 'genome_info.txt')
    res_file = open(result_fpath, 'w')

    # One FeatureContainer per feature type (plus one for operons, below).
    containers = []
    for feature, feature_fpath in features_dict.items():
        containers.append(FeatureContainer([feature_fpath], feature))
    if not features_dict:
        logger.notice('No file with genomic features were provided. '
                      'Use the --features option if you want to specify it.\n', indent=' ')
    if operons_fpaths:
        containers.append(FeatureContainer(operons_fpaths, 'operon'))
    else:
        logger.notice('No file with operons were provided. '
                      'Use the -O option if you want to specify it.', indent=' ')
    # Load regions from each container's files and log/write the counts.
    for container in containers:
        if not container.fpaths:
            continue
        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)
        if len(container.region_list) == 0:
            logger.warning('No genomic features of type "' + container.kind + '" were loaded.', indent=' ')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + 'None' + '\n')
        else:
            logger.info(' Loaded ' + str(len(container.region_list)) + ' genomic features of type "'
                        + container.kind + '"')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: '
                           + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list,
                                                              list(reference_chromosomes.keys()))

    # Record reference-wide feature/operon totals in every assembly's report.
    ref_genes_num, ref_operons_num = None, None
    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        genomic_features = 0
        for container in containers:
            if container.kind == 'operon':
                ref_operons_num = len(container.region_list)
                report.add_field(reporting.Fields.REF_OPERONS, len(container.region_list))
            else:
                genomic_features += len(container.region_list)
        if genomic_features:
            ref_genes_num = genomic_features
            report.add_field(reporting.Fields.REF_GENES, genomic_features)

    # for cumulative plots:
    files_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_unsorted_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}
    files_unsorted_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    parallel_run_args = [(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                          reference_chromosomes, ns_by_chromosomes, containers)
                         for index, contigs_fpath in enumerate(aligned_contigs_fpaths)]
    ref_lengths, results_genes_operons_tuples = run_parallel(process_single_file, parallel_run_args, n_jobs,
                                                             filter_results=True)
    # Assemblies filtered out by run_parallel count as non-fatal errors.
    num_nf_errors += len(aligned_contigs_fpaths) - len(ref_lengths)
    logger._num_nf_errors = num_nf_errors
    if not ref_lengths:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    # NOTE(review): `ref_lengths_by_contigs` is not defined in this function --
    # presumably a module-level dict; verify it is (re)initialized before each run.
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, '
                       + 'total length without N\'s: ' + str(chr_len - len(ns_by_chromosomes[chr_name]))
                       + ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('=' * 120 + '\n')

    # Per-assembly results row + report fields.
    for contigs_fpath, (results, unsorted_features_in_contigs, features_in_contigs,
                        unsorted_operons_in_contigs, operons_in_contigs)\
            in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_features_in_contigs[contigs_fpath] = features_in_contigs
        files_unsorted_features_in_contigs[contigs_fpath] = unsorted_features_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        files_unsorted_operons_in_contigs[contigs_fpath] = unsorted_operons_in_contigs
        full_found_genes.append(sum(features_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
                       % (assembly_name[:24], report.get_field(reporting.Fields.MAPPEDGENOME),
                          report.get_field(reporting.Fields.DUPLICATION_RATIO), gaps_count))
        genome_mapped.append(float(report.get_field(reporting.Fields.MAPPEDGENOME)))
        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                    (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if ref_genes_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'features',
                                                files_features_in_contigs, ref_genes_num)
        if ref_operons_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        from quast_libs.ca_utils.misc import contigs_aligned_lengths
        if ref_genes_num:
            plotter.genes_operons_plot(ref_genes_num, aligned_contigs_fpaths, files_features_in_contigs,
                                       genome_stats_dirpath + '/features_cumulative_plot', 'genomic features')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths,
                             files_unsorted_features_in_contigs,
                             genome_stats_dirpath + '/features_frcurve_plot', 'genomic features')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_features_histogram',
                              '# complete genomic features')
        if ref_operons_num:
            plotter.genes_operons_plot(ref_operons_num, aligned_contigs_fpaths, files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths,
                             files_unsorted_operons_in_contigs,
                             genome_stats_dirpath + '/operons_frcurve_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)
    logger.main_info('Done.')
    return containers
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Run the appropriate GeneMark flavour on each assembly and record predicted genes.

    Tool selection: MetaGeneMark when *meta*, GeneMarkS when *prokaryote*,
    GeneMark-ES otherwise. Results (counts of full/partial genes, unique genes)
    are written into the per-assembly reporting objects.

    Returns a dict mapping assembly label -> list of predicted genes, or None
    when the tool cannot be run (license limits, unsupported platform, missing
    license key, or invalid license after the run).
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning(
            "GeneMark tool can't be started because of license limitations!")
        return

    # Pick the GeneMark variant and its per-file prediction function.
    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname,
                                qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(
            ' Sorry, can\'t use %s on this platform, skipping gene prediction.'
            % tool_name)
    elif not install_genemark():
        logger.warning(
            ' Can\'t copy the license key to ~/.gm_key, skipping gene prediction.'
        )
    else:
        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        # Split available threads between parallel per-assembly jobs.
        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        parallel_run_args = [
            (index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath,
             tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths)
        ]
        # Each result list is aligned with fasta_fpaths by index.
        genes_list, unique_count, full_genes, partial_genes = run_parallel(
            predict_genes, parallel_run_args, n_jobs)
        # NOTE(review): returns None here (not genes_by_labels) when the
        # license turns out to be invalid — callers presumably handle None.
        if not is_license_valid(out_dirpath, fasta_fpaths):
            return

        genes_by_labels = dict()
        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            label = qutils.label_from_fpath(fasta_path)
            genes_by_labels[label] = genes_list[i]
            if unique_count[i] is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE,
                                 unique_count[i])
            if full_genes[i] is not None:
                # One "<full> + <partial> part" entry per gene-length threshold.
                genes = [
                    '%s + %s part' % (full_cnt, partial_cnt) for full_cnt,
                    partial_cnt in zip(full_genes[i], partial_genes[i])
                ]
                report.add_field(reporting.Fields.PREDICTED_GENES, genes)
            if unique_count[i] is None and full_genes[i] is None:
                # GeneMark-ES is known to fail on small inputs; suggest a fix.
                logger.error(
                    ' ' + qutils.index_to_str(i) +
                    'Failed predicting genes in ' + label + '. ' +
                    ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                     if tool_name == 'GeneMark-ES'
                     and os.path.getsize(fasta_path) < 2000000 else ''))

        # Remove temporary working directories unless debugging is on.
        if not qconfig.debug:
            for dirpath in glob.iglob(tmp_dirpath + '*'):
                if os.path.isdir(dirpath):
                    shutil.rmtree(dirpath)

        logger.main_info('Done.')
        return genes_by_labels
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths,
                         ref_labels, temp_output_dir, output_dir, log_path,
                         err_fpath):
    """Align reads to assemblies (and reference), then derive SV/coverage files.

    Aligns reads to every assembly in *contigs_fpaths* and, when
    *main_ref_fpath* is given, to the reference as well; then searches
    structural variations (BED) and computes reads/physical coverage for
    Icarus. Existing non-empty output files are reused instead of recomputed.

    Side effects: mutates qconfig (sam_fpaths, bam_fpaths, reference_sam,
    reference_bam, possibly no_sv) and the module-level ref_sam_fpaths.

    Returns (bed_fpath, cov_fpath, physical_cov_fpath); each element may be
    None when the corresponding artifact was not produced.
    """
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)

        # User-supplied paths take precedence over default output locations.
        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        if qconfig.no_sv:
            logger.info(' Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info(' Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            # SV search needs paired-end reads unless a reference SAM/BAM was given.
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info(' Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True
        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info(' Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            # Coverage files are only needed for the Icarus HTML viewer.
            logger.info(' Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        # Nothing left to produce -> alignment of the reference is optional.
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    # One job per assembly plus one for the reference; split threads among jobs.
    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir,
                            log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index)
                           for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        # The reference job is appended LAST; its results are read via [-1] below.
        parallel_align_args.append(
            (main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
             max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam,
             None, required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)
    if not main_ref_fpath:
        return None, None, None

    # Reference-job outputs (the job appended last above).
    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info(' Failed searching structural variations.')
        return None, None, None
    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
    if is_non_empty_file(sam_sorted_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        # Drop unmapped reads, sort, then convert back to SAM for SV search.
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(
            temp_output_dir, main_ref_fpath, ref_name, bam_fpath,
            bam_sorted_fpath, log_path, err_fpath, correct_chr_names,
            cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        # Parse the SAM header: collect @SQ sequence lengths and keep all
        # header lines for re-use in per-reference split files.
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            # In meta mode the combined-reference SAM is split per reference.
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info(' Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None
                else:
                    # Write a per-reference header: leading line, the @SQ lines
                    # belonging to this reference, and the trailing header line.
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    # File handle stays open; presumably closed downstream by
                    # search_trivial_deletions — TODO confirm.
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True

        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels, seq_lengths, need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            # GRIDSS is best-effort: any failure falls back to trivial deletions only.
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths, temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                pass
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    # Final status reporting; empty-but-existing BED means "no SVs found".
    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO (CLI-args variant) on each assembly and report completeness.

    Downloads/compiles Augustus and the BLAST binaries, fetches the lineage
    database, runs busco.main in parallel (one job per assembly), then parses
    each short-summary file and stores the percentages of complete and
    fragmented BUSCOs in the per-assembly report. Returns None.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    # All prerequisites must be in place, otherwise skip the whole analysis.
    compilation_success = True
    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath,
                          [join('bin', 'augustus')], logger=logger):
        compilation_success = False
    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False
    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return
    set_augustus_dir(augustus_dirpath)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    # Split available threads between parallel per-assembly BUSCO runs.
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    # Lineage database matching the organism type.
    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    log_fpath = join(output_dir, 'busco.log')
    logger.info('Logging to ' + log_fpath + '...')

    # One (argv, output_dir) pair per assembly, passed to busco.main.
    busco_args = [([
        '-i', contigs_fpath, '-o',
        qutils.label_from_fpath_for_fname(contigs_fpath), '-l', clade_dirpath,
        '-m', 'genome', '-f', '-z', '-c',
        str(busco_threads), '-t', tmp_dir,
        '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' +
        join(augustus_dirpath, 'config') + '\''
    ], output_dir) for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See ' +
                     log_fpath + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            # Pull the counts out of BUSCO's short-summary text file.
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            # Report percentages; skip when the summary had no usable totals.
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' %
                     (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' %
                                  (float(part_buscos) * 100.0 / total_buscos)))
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath +
                         '. See ' + log_fpath + ' for information.')
    logger.info('Done.')
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Predict genes in each assembly with the GeneMark flavour matching the run mode.

    MetaGeneMark for metagenomes, GeneMarkS for prokaryotes, GeneMark-ES
    otherwise. Per-assembly gene counts are written to the reporting objects.
    Returns a dict mapping assembly label to its predicted genes, or None when
    the tool cannot be used (license limits, platform, key, invalid license).
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    # Choose the tool variant and its per-file prediction routine.
    if meta:
        tool_name, tool_dirname, gmhmm_p_function = 'MetaGeneMark', 'genemark', gmhmm_p_metagenomic
    elif prokaryote:
        tool_name, tool_dirname, gmhmm_p_function = 'GeneMarkS', 'genemark', gmhmm_p_everyGC
    else:
        tool_name, tool_dirname, gmhmm_p_function = 'GeneMark-ES', 'genemark-es', gm_es

    logger.main_info('Running %s...' % tool_name)

    # Guard clauses: bail out early when the tool cannot be set up.
    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
        return
    if not install_genemark():
        logger.warning(' Can\'t copy the license key to ~/.gm_key, skipping gene prediction.')
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # Divide the thread budget among parallel per-assembly jobs.
    n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
    num_threads = max(1, qconfig.max_threads // n_jobs)
    job_args = []
    for job_index, job_fpath in enumerate(fasta_fpaths):
        job_args.append((job_index, job_fpath, gene_lengths, out_dirpath,
                         tool_dirpath, tmp_dirpath, gmhmm_p_function,
                         prokaryote, num_threads))
    genes_list, unique_count, full_genes, partial_genes = run_parallel(predict_genes, job_args, n_jobs)
    if not is_license_valid(out_dirpath, fasta_fpaths):
        return

    # Collect per-assembly results into the reports (lists are index-aligned
    # with fasta_fpaths).
    genes_by_labels = dict()
    for idx, fasta_path in enumerate(fasta_fpaths):
        label = qutils.label_from_fpath(fasta_path)
        report = reporting.get(fasta_path)
        genes_by_labels[label] = genes_list[idx]
        if unique_count[idx] is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count[idx])
        if full_genes[idx] is not None:
            pairs = zip(full_genes[idx], partial_genes[idx])
            report.add_field(reporting.Fields.PREDICTED_GENES,
                             ['%s + %s part' % (full_cnt, partial_cnt) for full_cnt, partial_cnt in pairs])
        if unique_count[idx] is None and full_genes[idx] is None:
            # GeneMark-ES is known to choke on small inputs; add a hint then.
            hint = ''
            if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000:
                hint = 'File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
            logger.error(' ' + qutils.index_to_str(idx) + 'Failed predicting genes in ' + label + '. ' + hint)

    # Drop temporary working directories unless debugging.
    if not qconfig.debug:
        for tmp_entry in glob.iglob(tmp_dirpath + '*'):
            if os.path.isdir(tmp_entry):
                shutil.rmtree(tmp_entry)

    logger.main_info('Done.')
    return genes_by_labels
def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO (config-file variant) on each assembly and report completeness.

    Downloads/compiles Augustus and the BLAST binaries, fetches the lineage
    database, generates a BUSCO config file, runs busco_main_handler in
    parallel (one job per assembly), then parses each short-summary file and
    stores the percentages of complete and fragmented BUSCOs in the
    per-assembly report. Returns None.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    # All prerequisites must be in place, otherwise skip the whole analysis.
    compilation_success = True
    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath,
                          [join('bin', 'augustus')], logger=logger):
        compilation_success = False
    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False
    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    # Split available threads between parallel per-assembly BUSCO runs.
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)
    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote,
                                is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    config_fpath = make_config(output_dir, tmp_dir, busco_threads,
                               clade_dirpath, augustus_dirpath)
    logger.info('Logs and results will be saved under ' + output_dir + '...')
    os.environ['BUSCO_CONFIG_FILE'] = config_fpath

    # BUGFIX: validate the Augustus config path BEFORE writing it into
    # os.environ. Previously the helper's return value was assigned directly;
    # a None return raised TypeError on assignment (os.environ values must be
    # strings), so the error branch below was unreachable — and even when it
    # logged, BUSCO was still run without valid configs. Now we abort cleanly.
    augustus_config_path = copy_augustus_contigs(augustus_dirpath, tmp_dir)
    if not augustus_config_path:
        logger.error(
            'Augustus configs not found, failed to run BUSCO without them.')
        return
    os.environ['AUGUSTUS_CONFIG_PATH'] = augustus_config_path

    # One (contigs, label) pair per assembly, passed to busco_main_handler.
    busco_args = [[
        contigs_fpath, qutils.label_from_fpath_for_fname(contigs_fpath)
    ] for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco_main_handler, busco_args,
                                  qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. '
                     'See log files in ' + output_dir + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            # Pull the counts out of BUSCO's short-summary text file.
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            # Report percentages; skip when the summary had no usable totals.
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' %
                     (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' %
                                  (float(part_buscos) * 100.0 / total_buscos)))
            shutil.copy(summary_fpaths[i], output_dir)
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath +
                         '. See the log for detailed information.')

    if not qconfig.debug:
        cleanup(output_dir)
    logger.info('Done.')
def main(args):
    """MetaQUAST driver: evaluate assemblies against one or many references.

    Pipeline: parse options -> correct references/assemblies -> (optionally
    search SILVA/NCBI for references) -> run quast.py on the combined
    reference -> optionally re-run after filtering low-fraction downloaded
    references -> run quast.py per reference and for unaligned contigs ->
    summarize across references. Returns logger.finish_up(...) on normal
    completion, 4 when no correct contigs were provided, or exits early.
    """
    check_dirpath(qconfig.QUAST_HOME,
                  'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '.\n' +
                  'Please, put QUAST in a different directory, then try again.\n',
                  exit_code=3)

    if not args:
        qconfig.usage(stream=sys.stderr)
        sys.exit(1)

    metaquast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, metaquast_path + args)
    output_dirpath, ref_fpaths, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    html_report = qconfig.html_report
    test_mode = qconfig.test

    # Directories
    output_dirpath, _, _ = qutils.set_up_output_dir(
        output_dirpath, None, not output_dirpath,
        save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    qconfig.set_max_threads(logger)
    qutils.logger = logger

    ########################################################################

    from quast_libs import reporting
    # Reload reporting so per-run module state is fresh (imp on Py3, builtin
    # reload on Py2).
    try:
        import imp
        imp.reload(reporting)
    except:
        reload(reporting)
    from quast_libs import plotter

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')

        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            correct_meta_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    qconfig.no_check_meta = True
    assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.")
        return 4

    # Running QUAST(s)
    if qconfig.gene_finding:
        quast_py_args += ['--mgm']
    if qconfig.min_IDY is None:  # special case: user not specified min-IDY, so we need to use MetaQUAST default value
        quast_py_args += ['--min-identity', str(qconfig.META_MIN_IDY)]

    downloaded_refs = False

    # SEARCHING REFERENCES
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice("Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled")
        else:
            if qconfig.references_txt:
                logger.main_info("List of references was provided, starting to download reference genomes from NCBI...")
            else:
                logger.main_info("No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                                 "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath, corrected_dirpath, qconfig.references_txt)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not qconfig.references_txt:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=True)
            elif test_mode and not ref_fpaths:
                logger.error('Failed to download or setup SILVA 16S rRNA database for working without '
                             'references on metagenome datasets!',
                             to_stderr=True, exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice('No references are provided, starting regular QUAST with MetaGeneMark gene finder')
        assemblies = [Assembly(fpath, qutils.label_from_fpath(fpath)) for fpath in contigs_fpaths]
        _start_quast_main(quast_py_args, assemblies=assemblies,
                          output_dirpath=output_dirpath, run_regular_quast=True)
        exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)
    qconfig.reference = combined_ref_fpath

    if qconfig.bed:
        quast_py_args += ['--sv-bed']
        quast_py_args += [qconfig.bed]

    quast_py_args += ['--combined-ref']
    if qconfig.draw_plots or qconfig.html_report:
        # Propagate the color/line-style assignment to the child quast run.
        if plotter_data.dict_color_and_ls:
            colors_and_ls = [plotter_data.dict_color_and_ls[asm.label] for asm in assemblies]
            quast_py_args += ['--colors']
            quast_py_args += [','.join([style[0] for style in colors_and_ls])]
            quast_py_args += ['--ls']
            quast_py_args += [','.join([style[1] for style in colors_and_ls])]
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    # Running tally of (notices, warnings, non-fatal errors) across child runs.
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    if qconfig.html_report:
        from quast_libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None
    if qconfig.unique_mapping:
        ambiguity_opts = []
    else:
        ambiguity_opts = ["--ambiguity-usage", 'all']
    return_code, total_num_notifications = \
        _start_quast_main(quast_py_args + ambiguity_opts,
                          labels=labels,
                          assemblies=assemblies,
                          reference_fpath=combined_ref_fpath,
                          output_dirpath=combined_output_dirpath,
                          num_notifications_tuple=total_num_notifications,
                          is_combined_ref=True)

    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    # Missing genome_info.txt means nothing aligned to any reference.
    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name, 'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    if not os.path.exists(genome_info_fpath):
        logger.main_info('')
        if not downloaded_refs:
            msg = 'Try to restart MetaQUAST with another references.'
        else:
            msg = 'Try to use option --max-ref-number to change maximum number of references (per each assembly) to download.'
        logger.main_info('Failed aligning the contigs for all the references. ' + msg)
        logger.main_info('')
        cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if downloaded_refs and return_code == 0:
        # Re-run the combined-reference analysis after dropping downloaded
        # references with too low genome fraction.
        logger.main_info()
        logger.main_info('Excluding downloaded references with low genome fraction from further analysis..')
        corr_ref_fpaths = get_downloaded_refs_with_alignments(genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = OrderedDict()
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names = \
                correct_meta_references(corr_ref_fpaths, corrected_dirpath)
            assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications = \
                _start_quast_main(quast_py_args + ambiguity_opts,
                                  labels=labels,
                                  assemblies=assemblies,
                                  reference_fpath=combined_ref_fpath,
                                  output_dirpath=combined_output_dirpath,
                                  num_notifications_tuple=total_num_notifications,
                                  is_combined_ref=True)
            if json_texts is not None:
                # Replace the superseded first combined-run JSON with the re-run's.
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info('All downloaded references have genome fraction more than 10%. Nothing was excluded.')
        else:
            logger.main_info('All downloaded references have low genome fraction. Nothing was excluded for now.')

    if return_code != 0:
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if qconfig.calculate_read_support:
        calculate_ave_read_support(combined_output_dirpath, assemblies)

    prepare_regular_quast_args(quast_py_args, combined_output_dirpath)
    logger.main_info()
    logger.main_info('Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, 'contigs_reports', 'alignments_%s.tsv'), labels)

    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    if not qconfig.memory_efficient and \
            len(assemblies_by_reference) > len(assemblies) and len(assemblies) < qconfig.max_threads:
        # Many references, few assemblies: run the per-reference quast
        # instances in parallel, each in memory-efficient mode.
        logger.main_info()
        logger.main_info('Run QUAST on different references in parallel..')
        threads_per_ref = max(1, qconfig.max_threads // len(assemblies_by_reference))
        quast_py_args += ['--memory-efficient']
        quast_py_args += ['-t', str(threads_per_ref)]

        num_notifications = (0, 0, 0)
        parallel_run_args = [(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, num_notifications, True)
                             for ref_fpath, ref_assemblies in assemblies_by_reference]
        ref_names, ref_json_texts, ref_notifications = \
            run_parallel(_run_quast_per_ref, parallel_run_args, qconfig.max_threads, filter_results=True)
        per_ref_num_notifications = list(map(sum, zip(*ref_notifications)))
        total_num_notifications = list(map(sum, zip(total_num_notifications, per_ref_num_notifications)))
        if json_texts is not None:
            json_texts.extend(ref_json_texts)
        # Undo the temporary argument additions for the later runs.
        quast_py_args.remove('--memory-efficient')
        quast_py_args = remove_from_quast_py_args(quast_py_args, '-t', str(threads_per_ref))
    else:
        # Sequential per-reference runs.
        ref_names = []
        for ref_fpath, ref_assemblies in assemblies_by_reference:
            ref_name, json_text, total_num_notifications = \
                _run_quast_per_ref(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, total_num_notifications)
            if not ref_name:
                continue
            ref_names.append(ref_name)
            if json_texts is not None:
                json_texts.append(json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '... (logging to ' +
                         os.path.join(output_dirpath, qconfig.not_aligned_name, qconfig.LOGGER_DEFAULT_NAME + '.log)'))

        return_code, total_num_notifications = _start_quast_main(
            quast_py_args + ['-t', str(qconfig.max_threads)],
            assemblies=not_aligned_assemblies,
            output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
            num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error('Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    if ref_names:
        # Summarize the per-reference runs into cross-reference tables/plots.
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from quast_libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(output_dirpath)
        else:
            html_summary_report_fpath = None

        from quast_libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembly_metrics = [reporting.Fields.MIS_RELOCATION, reporting.Fields.MIS_TRANSLOCATION,
                               reporting.Fields.MIS_INVERTION, reporting.Fields.MIS_ISTRANSLOCATIONS]
        if no_unaligned_contigs:
            full_ref_names = [qutils.name_from_fpath(ref_fpath) for ref_fpath in corrected_ref_fpaths]
        else:
            # Append the pseudo-reference bin for unaligned contigs.
            full_ref_names = [qutils.name_from_fpath(ref_fpath) for ref_fpath in corrected_ref_fpaths] + [qconfig.not_aligned_name]
        create_meta_summary.do(html_summary_report_fpath, summary_output_dirpath, combined_output_dirpath,
                               output_dirpath_per_ref, metrics_for_plots, misassembly_metrics, full_ref_names)
        if html_report and json_texts:
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter_data.dict_color_and_ls, meta=True)
            if qconfig.create_icarus_html:
                icarus_html_fpath = html_saver.create_meta_icarus(output_dirpath, ref_names)
                logger.main_info(' Icarus (contig browser) is saved to %s' % icarus_html_fpath)
            html_saver.create_meta_report(output_dirpath, json_texts)

    cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN for the given assemblies against the 16S rRNA database and
    select candidate reference species from the hits.

    Parameters:
        blast_assemblies -- assemblies still needing a BLAST run (objects with .fpath/.label)
        downloaded_dirpath -- directory for BLAST results ('blast.res' files)
        corrected_dirpath -- directory with corrected assembly files
        labels -- all assembly labels (results are parsed for every label,
                  even those whose BLAST run was cached earlier)
        blast_check_fpath, err_fpath -- bookkeeping/log file paths passed to parallel_blast

    Returns a 3-tuple:
        species_scores      -- list of (seqname, query_id, bit_score) selected across
                               all assemblies, skipping seqnames already picked for
                               another assembly
        species_by_assembly -- dict: label -> list of selected seqnames
        replacement_dict    -- dict: query_id -> alternative seqnames (secondary hits
                               kept as possible replacements)
    Returns (None, None, None) if binaries/DB are unavailable or nothing passed the filters.
    """
    if not download_blast_binaries(filenames=blast_filenames):
        return None, None, None

    if qconfig.custom_blast_db_fpath:
        global db_fpath
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            # A directory was given: locate the <dbname>.nsq file inside it and
            # strip the extension to get the database prefix makeblastdb produced.
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)
    elif not download_blastdb():
        return None, None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        parallel_run_args = [(assembly.fpath, assembly.label, corrected_dirpath,
                              err_fpath, blast_res_fpath, blast_check_fpath, blast_threads)
                             for assembly in blast_assemblies]
        run_parallel(parallel_blast, parallel_run_args, n_jobs, filter_results=True)
        logger.main_info()

    species_scores = []
    species_by_assembly = dict()
    max_entries = 4  # keep at most this many reference candidates per query contig
    replacement_dict = defaultdict(list)
    for label in labels:
        assembly_scores = []
        assembly_species = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                # Column indices are discovered from the '# Fields:' comment line of
                # BLAST tabular output with comments (outfmt 7); defaults match the
                # standard column order if a name is missing.
                query_id_col, subj_id_col, idy_col, len_col, score_col = None, None, None, None, None
                for line in res_file:
                    fs = line.split()
                    if line.startswith('#'):
                        refs_for_query = 0
                        # Fields: query id, subject id, % identity, alignment length, mismatches,
                        # gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                        if 'Fields' in line:
                            fs = line.strip().split('Fields: ')[-1].split(', ')
                            query_id_col = fs.index('query id') if 'query id' in fs else 0
                            subj_id_col = fs.index('subject id') if 'subject id' in fs else 1
                            idy_col = fs.index('% identity') if '% identity' in fs else 2
                            len_col = fs.index('alignment length') if 'alignment length' in fs else 3
                            score_col = fs.index('bit score') if 'bit score' in fs else 11
                    # FIX: guard score_col against None (no '# Fields' header seen yet).
                    # Previously `len(fs) > None` raised TypeError on Python 3 for
                    # truncated/headerless BLAST output; now such lines are skipped.
                    elif score_col is not None and refs_for_query < max_entries and len(fs) > score_col:
                        query_id = fs[query_id_col]
                        organism_id = fs[subj_id_col]
                        idy = float(fs[idy_col])
                        length = int(fs[len_col])
                        score = float(fs[score_col])
                        if idy >= qconfig.identity_threshold and length >= qconfig.min_length and score >= qconfig.min_bitscore:  # and (not scores or min(scores) - score < max_identity_difference):
                            seqname, taxons = parse_organism_id(organism_id)
                            if not seqname:
                                continue
                            species_name = get_species_name(seqname)
                            # Skip unidentifiable/environmental entries.
                            if species_name and 'uncultured' not in seqname and 'gut_metagenome' not in species_name:
                                if refs_for_query == 0:
                                    # Best hit for this query: record a new species, or
                                    # replace the stored hit of the same species if this
                                    # one scores higher.
                                    if species_name not in assembly_species:
                                        assembly_scores.append((seqname, query_id, score))
                                        if taxons:
                                            taxons_for_krona[correct_name(seqname)] = taxons
                                        assembly_species.append(species_name)
                                        refs_for_query += 1
                                    else:
                                        seq_scores = [(query_name, seq_query_id, seq_score)
                                                      for query_name, seq_query_id, seq_score in assembly_scores
                                                      if get_species_name(query_name) == species_name]
                                        if seq_scores and score > seq_scores[0][2]:
                                            assembly_scores.remove(seq_scores[0])
                                            assembly_scores.append((seqname, query_id, score))
                                            if taxons:
                                                taxons_for_krona[correct_name(seqname)] = taxons
                                            refs_for_query += 1
                                else:
                                    # Secondary hit: remember it as a potential replacement.
                                    if seqname not in replacement_dict[query_id]:
                                        replacement_dict[query_id].append(seqname)
                                        refs_for_query += 1
        # NOTE(review): tuples sort lexicographically, so this orders by seqname
        # (descending), not by score — preserved as-is; confirm intent upstream.
        assembly_scores = sorted(assembly_scores, reverse=True)
        assembly_scores = assembly_scores[:qconfig.max_references]
        for seqname, query_id, score in assembly_scores:
            if not species_by_assembly or not any(seqname in species_list for species_list in species_by_assembly.values()):
                species_scores.append((seqname, query_id, score))
        species_by_assembly[label] = [seqname for seqname, query_id, score in assembly_scores]
    if not species_scores:
        return None, None, None
    return species_scores, species_by_assembly, replacement_dict
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Align all assemblies to the reference and collect alignment statistics.

    Parameters:
        reference -- reference genome fpath
        contigs_fpaths -- corrected assembly fpaths (analysis inputs / report keys)
        is_cyclic -- whether the reference is circular
        output_dir -- directory for aligner output (created if missing)
        old_contigs_fpaths -- original (pre-correction) assembly fpaths, parallel to contigs_fpaths
        bed_fpath -- optional BED file with structural variants

    Returns (aligner_statuses, aligned_lengths_per_fpath):
        aligner_statuses -- dict: contigs fpath -> AlignerStatus
        aligned_lengths_per_fpath -- dict: contigs fpath -> aligned lengths,
                                     or None if the aligner failed to compile.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [AlignerStatus.FAILED] * len(contigs_fpaths))), None

    num_nf_errors = logger._num_nf_errors
    create_minimap_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)
    genome_size, reference_chromosomes, ns_by_chromosomes = get_genome_stats(reference, skip_ns=True)
    # In memory-efficient mode assemblies run sequentially, so each run gets all threads.
    threads = qconfig.max_threads if qconfig.memory_efficient else threads
    args = [(is_cyclic, i, contigs_fpath, output_dir, reference, reference_chromosomes, ns_by_chromosomes,
             old_contigs_fpath, bed_fpath, threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))]
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = \
        run_parallel(align_and_analyze, args, n_jobs)
    reports = []

    aligner_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    if AlignerStatus.OK in aligner_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == AlignerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference, genome_size))
        elif statuses[index] == AlignerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if AlignerStatus.OK in aligner_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i]) for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths,
                             misassemblies_in_contigs, join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    oks = list(aligner_statuses.values()).count(AlignerStatus.OK)
    not_aligned = list(aligner_statuses.values()).count(AlignerStatus.NOT_ALIGNED)
    failed = list(aligner_statuses.values()).count(AlignerStatus.FAILED)
    errors = list(aligner_statuses.values()).count(AlignerStatus.ERROR)
    problems = not_aligned + failed + errors
    # FIX: renamed local 'all' -> 'total_count'; the original shadowed the builtin all().
    total_count = len(aligner_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == total_count:
        logger.main_info('Done.')
    if oks < total_count and problems < total_count:
        logger.main_info('Done for ' + str(total_count - problems) + ' out of ' + str(total_count) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == total_count:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return aligner_statuses, aligned_lengths_per_fpath
def main(args):
    """MetaQUAST entry point: parse options, correct references/assemblies,
    run quast.py for the combined reference, per reference, and for unaligned
    contigs, then summarize the results.

    Returns the logger's finish-up status (or 4 when no correct contigs were found).
    Exits the process early (exit(0)) when running regular QUAST without references.
    """
    check_dirpath(qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '.\n' +
                  'Please, put QUAST in a different directory, then try again.\n', exit_code=3)

    if not args:
        qconfig.usage(stream=sys.stderr)
        sys.exit(1)

    metaquast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, metaquast_path + args)
    output_dirpath, ref_fpaths, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    html_report = qconfig.html_report
    test_mode = qconfig.test

    # Directories
    output_dirpath, _, _ = qutils.set_up_output_dir(
        output_dirpath, None, not output_dirpath,
        save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    qconfig.set_max_threads(logger)
    qutils.logger = logger

    ########################################################################

    from quast_libs import reporting
    # Reset module-level report state between runs; reload() moved in Python 3
    # (importlib.reload), fall back to the Python 2 builtin.
    try:
        import importlib
        importlib.reload(reporting)
    except (ImportError, AttributeError):
        reload(reporting)
    from quast_libs import plotter

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')
        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            correct_meta_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    qconfig.no_check_meta = True
    assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.")
        return 4

    # Running QUAST(s)
    if qconfig.gene_finding:
        quast_py_args += ['--mgm']
    if qconfig.min_IDY is None:  # special case: user not specified min-IDY, so we need to use MetaQUAST default value
        quast_py_args += ['--min-identity', str(qconfig.META_MIN_IDY)]

    if qconfig.reuse_combined_alignments:
        reuse_combined_alignments = True
    else:
        reuse_combined_alignments = False

    downloaded_refs = False

    # SEARCHING REFERENCES
    # When no references are given, try to find/download them (SILVA 16S + NCBI),
    # unless the search is explicitly disabled via --max-ref-number 0.
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice("Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled")
        else:
            if qconfig.references_txt:
                logger.main_info("List of references was provided, starting to download reference genomes from NCBI...")
            else:
                logger.main_info("No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                                 "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath, corrected_dirpath, qconfig.references_txt)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not qconfig.references_txt:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=True)
            elif test_mode and not ref_fpaths:
                logger.error('Failed to download or setup SILVA 16S rRNA database for working without '
                             'references on metagenome datasets!', to_stderr=True, exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice('No references are provided, starting regular QUAST with MetaGeneMark gene finder')
        assemblies = [Assembly(fpath, qutils.label_from_fpath(fpath)) for fpath in contigs_fpaths]
        _start_quast_main(quast_py_args, assemblies=assemblies, output_dirpath=output_dirpath, run_regular_quast=True)
        exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)
    qconfig.reference = combined_ref_fpath

    if qconfig.bed:
        quast_py_args += ['--sv-bed']
        quast_py_args += [qconfig.bed]

    quast_py_args += ['--combined-ref']
    if qconfig.draw_plots or qconfig.html_report:
        # Propagate the already-assigned assembly colors/line styles to sub-runs.
        if plotter_data.dict_color_and_ls:
            colors_and_ls = [plotter_data.dict_color_and_ls[asm.label] for asm in assemblies]
            quast_py_args += ['--colors']
            quast_py_args += [','.join([style[0] for style in colors_and_ls])]
            quast_py_args += ['--ls']
            quast_py_args += [','.join([style[1] for style in colors_and_ls])]
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    if qconfig.html_report:
        from quast_libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None
    if qconfig.unique_mapping:
        ambiguity_opts = []
    else:
        ambiguity_opts = ["--ambiguity-usage", 'all']
    return_code, total_num_notifications = \
        _start_quast_main(quast_py_args + ambiguity_opts,
                          labels=labels,
                          assemblies=assemblies,
                          reference_fpath=combined_ref_fpath,
                          output_dirpath=combined_output_dirpath,
                          num_notifications_tuple=total_num_notifications,
                          is_combined_ref=True)

    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name, 'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    # Missing genome_info.txt means nothing aligned to any reference: bail out early.
    if not os.path.exists(genome_info_fpath):
        logger.main_info('')
        if not downloaded_refs:
            msg = 'Try to restart MetaQUAST with another references.'
        else:
            msg = 'Try to use option --max-ref-number to change maximum number of references (per each assembly) to download.'
        logger.main_info('Failed aligning the contigs for all the references. ' + msg)
        logger.main_info('')
        cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if downloaded_refs and return_code == 0:
        # Drop downloaded references with low genome fraction and, if anything was
        # filtered, rebuild the combined reference and re-run quast.py on it.
        logger.main_info()
        logger.main_info('Excluding downloaded references with low genome fraction from further analysis..')
        corr_ref_fpaths = get_downloaded_refs_with_alignments(genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = OrderedDict()
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names = \
                correct_meta_references(corr_ref_fpaths, corrected_dirpath)
            assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications = \
                _start_quast_main(quast_py_args + ambiguity_opts,
                                  labels=labels,
                                  assemblies=assemblies,
                                  reference_fpath=combined_ref_fpath,
                                  output_dirpath=combined_output_dirpath,
                                  num_notifications_tuple=total_num_notifications,
                                  is_combined_ref=True)
            if json_texts is not None:
                # Replace the stale combined-reference JSON with the corrected run's.
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info('All downloaded references have genome fraction more than 10%. Nothing was excluded.')
        else:
            logger.main_info('All downloaded references have low genome fraction. Nothing was excluded for now.')

    if return_code != 0:
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if qconfig.calculate_read_support:
        calculate_ave_read_support(combined_output_dirpath, assemblies)

    prepare_regular_quast_args(quast_py_args, combined_output_dirpath, reuse_combined_alignments)
    logger.main_info()
    logger.main_info('Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, qconfig.detailed_contigs_reports_dirname, 'alignments_%s.tsv'), labels)

    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    # Per-reference runs: parallel (memory-efficient sub-runs) when there are more
    # references than assemblies and spare threads; otherwise sequential.
    if not qconfig.memory_efficient and \
            len(assemblies_by_reference) > len(assemblies) and len(assemblies) < qconfig.max_threads:
        logger.main_info()
        logger.main_info('Run QUAST on different references in parallel..')
        threads_per_ref = max(1, qconfig.max_threads // len(assemblies_by_reference))
        quast_py_args += ['--memory-efficient']
        quast_py_args += ['-t', str(threads_per_ref)]

        num_notifications = (0, 0, 0)
        parallel_run_args = [(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, num_notifications, True)
                             for ref_fpath, ref_assemblies in assemblies_by_reference]
        ref_names, ref_json_texts, ref_notifications = \
            run_parallel(_run_quast_per_ref, parallel_run_args, qconfig.max_threads, filter_results=True)
        # Sum notification counts elementwise across all per-reference runs.
        per_ref_num_notifications = list(map(sum, zip(*ref_notifications)))
        total_num_notifications = list(map(sum, zip(total_num_notifications, per_ref_num_notifications)))
        if json_texts is not None:
            json_texts.extend(ref_json_texts)
        # Undo the temporary flags added for the parallel per-reference runs.
        quast_py_args.remove('--memory-efficient')
        quast_py_args = remove_from_quast_py_args(quast_py_args, '-t', str(threads_per_ref))
    else:
        ref_names = []
        for ref_fpath, ref_assemblies in assemblies_by_reference:
            ref_name, json_text, total_num_notifications = \
                _run_quast_per_ref(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, total_num_notifications)
            if not ref_name:
                continue
            ref_names.append(ref_name)
            if json_texts is not None:
                json_texts.append(json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '... (logging to ' +
                         os.path.join(output_dirpath, qconfig.not_aligned_name, qconfig.LOGGER_DEFAULT_NAME + '.log)'))

        return_code, total_num_notifications = _start_quast_main(quast_py_args + ['-t', str(qconfig.max_threads)],
                                                                 assemblies=not_aligned_assemblies,
                                                                 output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
                                                                 num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error('Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    if ref_names:
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from quast_libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(output_dirpath)
        else:
            html_summary_report_fpath = None
        from quast_libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembly_metrics = [reporting.Fields.MIS_RELOCATION, reporting.Fields.MIS_TRANSLOCATION,
                               reporting.Fields.MIS_INVERTION, reporting.Fields.MIS_ISTRANSLOCATIONS]
        if no_unaligned_contigs:
            full_ref_names = [qutils.name_from_fpath(ref_fpath) for ref_fpath in corrected_ref_fpaths]
        else:
            # Include the pseudo-reference bucket for contigs aligned nowhere.
            full_ref_names = [qutils.name_from_fpath(ref_fpath) for ref_fpath in corrected_ref_fpaths] + [qconfig.not_aligned_name]
        create_meta_summary.do(html_summary_report_fpath, summary_output_dirpath, combined_output_dirpath,
                               output_dirpath_per_ref, metrics_for_plots, misassembly_metrics, full_ref_names)
        if html_report and json_texts:
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter_data.dict_color_and_ls, meta=True)
            if qconfig.create_icarus_html:
                icarus_html_fpath = html_saver.create_meta_icarus(output_dirpath, ref_names)
                logger.main_info(' Icarus (contig browser) is saved to %s' % icarus_html_fpath)
            html_saver.create_meta_report(output_dirpath, json_texts)

    cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)