def parse_alignments(contigs_fpaths, contig_report_fpath_pattern):
    """Collect aligned blocks for every assembly from its contig report.

    contig_report_fpath_pattern is a '%s'-style path pattern filled with the
    per-assembly file label (presumably always truthy when assemblies are
    given — otherwise report_fpath would be unbound; TODO confirm callers).

    Returns (assemblies, max_contigs) on success, or (None, None) when no
    report could be parsed.
    """
    block_lists = []
    for fpath in contigs_fpaths:
        if contig_report_fpath_pattern:
            report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(fpath)
        blocks, misassembled_id_to_structure = parse_aligner_contig_report(report_fpath)
        if blocks is None:
            continue
        # Annotate/filter blocks involved in misassemblies (local ones filtered).
        block_lists.append(check_misassembled_blocks(blocks, misassembled_id_to_structure, filter_local=True))
    if not block_lists:
        return None, None
    longest = max(len(blocks) for blocks in block_lists)
    return get_assemblies(contigs_fpaths, block_lists).assemblies, longest
def draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath):
    """Draw the combined coverage histogram plus one histogram per assembly.

    coverage_dict maps assembly path -> coverage values (falsy when coverage
    is unavailable for that assembly); such assemblies are skipped.
    """
    total_len = {}
    contigs_dict = {}
    with_coverage = [fpath for fpath in contigs_fpaths if coverage_dict[fpath]]
    for fpath in contigs_fpaths:
        report = reporting.get(fpath)
        total_len[fpath] = report.get_field(reporting.Fields.TOTALLEN)
        contigs_dict[fpath] = report.get_field(reporting.Fields.CONTIGS)

    # One shared binning across all assemblies for the combined plot.
    cov_values = [coverage_dict[fpath] for fpath in with_coverage]
    num_contigs = [contigs_dict[fpath] for fpath in with_coverage]
    common_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage(cov_values, num_contigs)
    plotter.coverage_histogram(
        with_coverage, common_values, output_dirpath + '/coverage_histogram',
        'Coverage histogram (bin size: ' + str(bin_size) + 'x)',
        bin_size=bin_size, max_cov=max_cov,
        low_threshold=low_threshold, high_threshold=high_threshold)

    # Per-assembly histograms are re-binned individually.
    for fpath in with_coverage:
        values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage(
            [coverage_dict[fpath]], [contigs_dict[fpath]])
        label = qutils.label_from_fpath(fpath)
        corr_label = qutils.label_from_fpath_for_fname(fpath)
        plotter.coverage_histogram(
            [fpath], values,
            os.path.join(output_dirpath, corr_label + '_coverage_histogram'),
            label + ' coverage histogram (bin size: ' + str(bin_size) + 'x)',
            draw_bars=True, bin_size=bin_size, max_cov=max_cov,
            low_threshold=low_threshold, high_threshold=high_threshold)
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath,
                  gmhmm_p_function, prokaryote, num_threads):
    """Predict genes in one assembly via GeneMark (gmhmm_p_function).

    Returns (genes, unique_count, count) where count[i] is the number of
    predicted genes longer than gene_lengths[i]; unique_count and count are
    None when prediction produced nothing.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)
    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_genemark.stderr')

    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath, num_threads)
    if not genes:
        return genes, None, None  # [None] * len(gene_lengths)

    tool_name = "genemark"
    gff_suffix = '_' + tool_name + '_genes.gff' + ('.gz' if not qconfig.no_gzip else '')
    out_gff_fpath = os.path.join(out_dirpath, corr_assembly_label + gff_suffix)
    add_genes_to_gff(genes, out_gff_fpath, prokaryote)
    if OUTPUT_FASTA:
        add_genes_to_fasta(genes, os.path.join(out_dirpath, corr_assembly_label + '_' + tool_name + '_genes.fasta'))

    # Per-threshold counts of genes strictly longer than each cutoff.
    count = [sum(1 for gene in genes if gene.end - gene.start > threshold) for threshold in gene_lengths]
    gene_ids = [gene.seq if gene.seq else gene.name for gene in genes]
    unique_count = len(set(gene_ids))
    total_count = len(genes)

    logger.info(' ' + qutils.index_to_str(index) + ' Genes = ' + str(unique_count) + ' unique, ' + str(total_count) + ' total')
    logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + out_gff_fpath)
    return genes, unique_count, count
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE evaluation script on one assembly and return its exit code.

    stdout/stderr of the tool are redirected to gage_<label>.stdout/.stderr
    under gage_results_dirpath.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    indent = ' ' + qutils.index_to_str(i)
    logger.info(indent + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stderr')
    logger.info(indent + 'Logging to files ' + os.path.basename(log_out_fpath) +
                ' and ' + os.path.basename(log_err_fpath) + '...')

    log_out_f = open(log_out_fpath, 'w')
    log_err_f = open(log_err_fpath, 'w')
    return_code = qutils.call_subprocess(
        ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath),
         reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)],
        stdout=log_out_f, stderr=log_err_f,
        indent=indent, only_if_debug=False)
    logger.info(indent + ('Failed.' if return_code != 0 else 'Done.'))
    log_out_f.close()
    log_err_f.close()
    return return_code
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE shell script for one assembly and return its exit code."""
    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool; its output goes into per-assembly log files
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stderr')
    logger.info(' ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' + os.path.basename(log_err_fpath) + '...')

    with open(log_out_fpath, 'w') as out_f, open(log_err_fpath, 'w') as err_f:
        return_code = qutils.call_subprocess(
            ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath),
             reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)],
            stdout=out_f, stderr=err_f,
            indent=' ' + qutils.index_to_str(i), only_if_debug=False)
        if return_code != 0:
            logger.info(' ' + qutils.index_to_str(i) + 'Failed.')
        else:
            logger.info(' ' + qutils.index_to_str(i) + 'Done.')
    return return_code
def create_mismatches_plot(assembly, window_size, ref_len, root_dir, output_dir):
    """Write a per-window mismatch-density track for one assembly.

    Reads the used-SNPs file produced by the nucmer stage, counts
    substitutions per window of window_size bases on each reference
    chromosome, and writes a tab-separated track (chrom, start, end, density)
    where runs of mismatch-free windows are collapsed into single '0' rows.

    Returns the path of the written file, or None when no SNP data exists.
    """
    assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
    nucmer_dirpath = join(root_dir, '..', 'contigs_reports')
    nucmer_fpath = join(create_nucmer_output_dir(nucmer_dirpath), assembly_label)
    _, _, _, _, used_snps_fpath = get_nucmer_aux_out_fpaths(nucmer_fpath)
    if not exists(used_snps_fpath):
        return None

    mismatches_fpath = join(output_dir, assembly_label + '.mismatches.txt')
    mismatch_density_by_chrom = defaultdict(lambda: [0] * (ref_len // window_size + 1))
    for line in open_gzipsafe(used_snps_fpath):
        chrom, contig, ref_pos, ref_nucl, ctg_nucl, ctg_pos = line.split('\t')
        if ref_nucl != '.' and ctg_nucl != '.':  # substitutions only; '.' marks an indel
            mismatch_density_by_chrom[chrom][int(ref_pos) // window_size] += 1
    with open(mismatches_fpath, 'w') as out_f:
        for chrom, density_list in mismatch_density_by_chrom.items():
            start, end = 0, 0
            for i, density in enumerate(density_list):
                if density == 0:
                    end = (i + 1) * window_size
                else:
                    if end:
                        out_f.write('\t'.join([chrom, str(start), str(end), '0']) + '\n')
                    out_f.write('\t'.join([chrom, str(i * window_size), str(((i + 1) * window_size)), str(density)]) + '\n')
                    start = (i + 1) * window_size
                    end = None
            # BUG FIX: guard the trailing zero-density interval. Previously this
            # write was unconditional and emitted a bogus "...\tNone\t0" row
            # whenever the last window contained mismatches (end is None).
            # The sibling minimap-based variants of this function already guard it.
            if end:
                out_f.write('\t'.join([chrom, str(start), str(end), '0']) + '\n')
    return mismatches_fpath
def get_assemblies_data(contigs_fpaths, icarus_dirpath, stdout_pattern, nx_marks):
    """Build the JavaScript snippets describing each assembly for Icarus.

    Returns (assemblies_data, assemblies_contig_size_data, assemblies_n50):
    JS declarations plus per-assembly link entries, per-assembly size/contig
    count/N50 entries, and a mapping label -> {nx_mark: value}.
    """
    assemblies_n50 = defaultdict(dict)
    data_lines = ['var assemblies_links = {};\n',
                  'var assemblies_len = {};\n',
                  'var assemblies_contigs = {};\n',
                  'var assemblies_misassemblies = {};\n',
                  'var assemblies_n50 = {};\n']
    size_lines = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)
        total_len = report.get_field(reporting.Fields.TOTALLEN)
        num_contigs = report.get_field(reporting.Fields.CONTIGS)
        n50 = report.get_field(reporting.Fields.N50)
        if stdout_pattern:
            # Link each assembly to its contig-analyzer stdout, relative to the Icarus dir.
            stdout_fpath = stdout_pattern % qutils.label_from_fpath_for_fname(contigs_fpath) + '.stdout'
            stdout_fpath = qutils.relpath(stdout_fpath, icarus_dirpath)
            data_lines.append('assemblies_links["' + label + '"] = "' + stdout_fpath + '";\n')
        size_lines.append('assemblies_len["' + label + '"] = ' + str(total_len) + ';\n')
        size_lines.append('assemblies_contigs["' + label + '"] = ' + str(num_contigs) + ';\n')
        size_lines.append('assemblies_n50["' + label + '"] = "' + str(n50) + '";\n')
        for nx in nx_marks:
            assemblies_n50[label][nx] = report.get_field(nx)
    return ''.join(data_lines), ''.join(size_lines), assemblies_n50
def calculate_ave_read_support(combined_output_dirpath, assemblies):
    """Compute a length-weighted average read support per reference.

    Parses each assembly's unique-contigs file (ref_name, contig_len,
    contig_cov per line) and stores the weighted average coverage into the
    AVE_READ_SUPPORT field of the per-reference report.
    """
    unique_contigs_fpath = os.path.join(combined_output_dirpath, 'contigs_reports',
                                        qconfig.unique_contigs_fname_pattern)
    for assembly in assemblies:
        assembly_label = qutils.label_from_fpath(assembly.fpath)
        corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
        contigs_by_ref = {}
        with open(unique_contigs_fpath % corr_assembly_label) as in_f:
            for line in in_f:
                ref_name, contig_len, contig_cov = line.strip().split('\t')
                contigs_by_ref.setdefault(ref_name, []).append((float(contig_len), float(contig_cov)))
        for ref_name, contigs in contigs_by_ref.items():
            # Weighted mean: coverage weighted by aligned length.
            weighted_cov = sum(cov * length for (length, cov) in contigs)
            total_aligned = sum(length for (length, cov) in contigs)
            ref_cov = weighted_cov / total_aligned
            ref_contigs_fpath = os.path.join(os.path.dirname(assembly.fpath),
                                             corr_assembly_label + '_to_' + ref_name + '.fasta')
            qconfig.assembly_labels_by_fpath[ref_contigs_fpath] = assembly_label
            report = reporting.get(ref_contigs_fpath, ref_name=ref_name)
            report.add_field(reporting.Fields.AVE_READ_SUPPORT, '%.2f' % ref_cov)
def check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
    """Return True if a cached KMC run for this assembly is still valid.

    Validity is judged by comparing the md5 checksums of the assembly and
    the reference recorded in the '.sf' fingerprint file against the current
    files.
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    check_fpath = join(output_dir, label + '.sf')
    if not exists(check_fpath):
        return False
    lines = open(check_fpath).read().split('\n')
    if len(lines) < 2:
        return False
    # Both recorded checksums must match the current input files.
    return (lines[0].strip().split()[-1] == str(md5(contigs_fpath))
            and lines[1].strip().split()[-1] == str(md5(ref_fpath)))
def create_mismatches_plot(assembly, window_size, ref_len, root_dir, output_dir):
    """Write a per-window mismatch-density track for one assembly.

    Walks the filtered minimap coords file, counts substitutions (from the
    alignment's cs tag) per window of window_size bases on each reference
    chromosome, and writes a tab-separated track (chrom, start, end, density)
    where runs of mismatch-free windows are collapsed into single '0' rows.

    Returns the path of the written file, or None when coords are missing or
    SNP reporting is disabled.
    """
    assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
    aligner_dirpath = join(root_dir, '..', qconfig.detailed_contigs_reports_dirname)
    coords_basename = join(create_minimap_output_dir(aligner_dirpath), assembly_label)
    _, coords_filtered_fpath, _, _ = get_aux_out_fpaths(coords_basename)
    if not exists(coords_filtered_fpath) or not qconfig.show_snps:
        return None
    mismatches_fpath = join(output_dir, assembly_label + '.mismatches.txt')
    # One density counter per window per reference chromosome.
    mismatch_density_by_chrom = defaultdict(lambda: [0] * (ref_len // window_size + 1))
    with open(coords_filtered_fpath) as coords_file:
        for line in coords_file:
            s1 = int(line.split('|')[0].split()[0])  # reference start of this alignment
            chrom = line.split()[11].strip()
            cigar = line.split()[-1].strip()
            ref_pos = s1
            # Walk the cs-tag ops: '*' = substitution (advance ref by 1),
            # '+' = insertion in contig (no ref advance), anything else
            # advances along the reference by the op length.
            for op in parse_cs_tag(cigar):
                n_bases = len(op) - 1
                if op.startswith('*'):
                    mismatch_density_by_chrom[chrom][int(ref_pos) // window_size] += 1
                    ref_pos += 1
                elif not op.startswith('+'):
                    ref_pos += n_bases
    with open(mismatches_fpath, 'w') as out_f:
        for chrom, density_list in mismatch_density_by_chrom.items():
            # Collapse runs of zero-density windows into single intervals.
            start, end = 0, 0
            for i, density in enumerate(density_list):
                if density == 0:
                    end = (i + 1) * window_size
                else:
                    if end:
                        out_f.write('\t'.join([chrom, str(start), str(end), '0']) + '\n')
                    out_f.write('\t'.join([chrom, str(i * window_size), str(((i + 1) * window_size)), str(density)]) + '\n')
                    start = (i + 1) * window_size
                    end = None
            # Flush the trailing zero-density interval, if any.
            if end:
                out_f.write('\t'.join([chrom, str(start), str(end), '0']) + '\n')
    return mismatches_fpath
def is_license_valid(out_dirpath, fasta_fpaths):
    """Return False if GeneMark's stderr reports an expired license.

    Only the stderr of the first assembly is inspected (checking the
    installation once is enough); a missing stderr file counts as valid.
    """
    # checking the installation
    err_fpath = os.path.join(out_dirpath,
                             qutils.label_from_fpath_for_fname(fasta_fpaths[0]) + '_genemark.stderr')
    if not os.path.isfile(err_fpath):
        return True
    with open(err_fpath) as err_f:
        for line in err_f:
            if 'license period has ended' in line:
                logger.main_info()
                logger.warning('License period for GeneMark has ended! \n'
                               'To update license, please visit http://exon.gatech.edu/GeneMark/license_download.cgi page and fill in the form.\n'
                               'You should choose GeneMarkS tool and your operating system (note that GeneMark is free for non-commercial use).\n'
                               'Download the license key and replace your ~/.gm_key with the updated version. After that you can restart QUAST.\n')
                return False
    return True
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    """Split one assembly's contigs into per-reference FASTA files.

    Uses the precomputed alignments file (one line per reference chromosome:
    chromosome name followed by the names of contigs aligned to it) to append
    each aligned contig to <label>_to_<ref>.fasta and registers the resulting
    per-reference Assembly objects in assemblies_by_ref. Contigs aligned
    nowhere are written to <label>_not_aligned_anywhere.fasta.

    Returns (assemblies_by_ref, not_aligned_asm).
    """
    assembly_label = qutils.label_from_fpath(asm.fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(asm.fpath)
    logger.info(' ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = corr_assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    # Full contig sequences of this assembly, read once and reused per reference.
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    alignments_fpath = alignments_fpath_template % corr_assembly_label
    if os.path.exists(alignments_fpath):
        with open(alignments_fpath) as f:
            for line in f:
                values = line.split()
                if values[0] in contigs_analyzer.ref_labels_by_chromosomes.keys():
                    # Map the chromosome name to its reference label.
                    ref_name = contigs_analyzer.ref_labels_by_chromosomes[values[0]]
                    ref_contigs_names = values[1:]
                    ref_contigs_fpath = os.path.join(
                        corrected_dirpath, corr_assembly_label + '_to_' + ref_name + '.fasta')
                    if ref_name not in aligned_contigs_for_each_ref:
                        aligned_contigs_for_each_ref[ref_name] = []
                    for (cont_name, seq) in contigs_seq:
                        if not cont_name in contigs:
                            contigs[cont_name] = seq
                        if cont_name in ref_contigs_names and cont_name not in aligned_contigs_for_each_ref[ref_name]:
                            # Collecting all aligned contigs names in order to further extract not aligned
                            aligned_contig_names.add(cont_name)
                            aligned_contigs_for_each_ref[ref_name].append(cont_name)
                            fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')
                    ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                    if ref_asm.name not in added_ref_asm:
                        if ref_name in assemblies_by_ref:
                            assemblies_by_ref[ref_name].append(ref_asm)
                            added_ref_asm.append(ref_asm.name)
        if qconfig.space_efficient:
            os.remove(alignments_fpath)
    # Extraction not aligned contigs
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath, [(name, contigs[name])
                                                for name in not_aligned_contigs_names])
    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
def create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath, completeness,
                          len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len):
    """Persist KMC results: a '.sf' fingerprint file and a '.stat' stats file.

    The fingerprint (input checksums + assembly set) is read back later by
    check_kmc_successful_check() to decide whether cached results are valid.
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    check_fpath = join(output_dir, label + '.sf')
    stats_fpath = join(output_dir, label + '.stat')
    with open(check_fpath, 'w') as check_f:
        check_f.writelines(["Assembly md5 checksum: %s\n" % md5(contigs_fpath),
                            "Reference md5 checksum: %s\n" % md5(ref_fpath),
                            "Used assemblies: %s\n" % ','.join(contigs_fpaths)])
    with open(stats_fpath, 'w') as stats_f:
        stats_f.write("Completeness: %s\n" % completeness)
        # Chromosome-assignment stats are only written when they were computed.
        if len_map_to_one_chrom or len_map_to_multi_chrom:
            stats_f.write("Length assigned to one chromosome: %d\n" % len_map_to_one_chrom)
            stats_f.write("Length assigned to multi chromosomes: %d\n" % len_map_to_multi_chrom)
            stats_f.write("Length assigned to none chromosome: %d\n" % len_map_to_none_chrom)
            stats_f.write("Total length: %d\n" % total_len)
def check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
    """Return True if a cached KMC run for this assembly is still valid.

    Checks md5 checksums of the assembly and reference plus the recorded set
    of assemblies against the '.sf' fingerprint file.
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    check_fpath = join(output_dir, label + '.sf')
    if not exists(check_fpath):
        return False
    content = open(check_fpath).read().split('\n')
    if len(content) < 3:
        return False
    if content[0].strip().split()[-1] != str(md5(contigs_fpath)):
        return False
    if content[1].strip().split()[-1] != str(md5(ref_fpath)):
        return False
    # The recorded assembly set must match the one being evaluated now.
    used = content[2].strip().split(': ')[-1]
    if used and sorted(used.split(',')) != sorted(contigs_fpaths):
        return False
    return True
def check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
    """Return True if a cached KMC run for this assembly is still valid.

    This variant fingerprints inputs by file size (cheaper than checksums)
    and also compares the recorded assembly set.
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    check_fpath = join(output_dir, label + '.sf')
    if not exists(check_fpath):
        return False
    content = open(check_fpath).read().split('\n')
    if len(content) < 3:
        return False
    # First two lines end with the recorded file sizes of assembly and reference.
    for line, fpath in zip(content[:2], (contigs_fpath, ref_fpath)):
        if not line.strip().endswith(str(getsize(fpath))):
            return False
    used = content[2].strip().split(': ')[-1]
    if used and sorted(used.split(',')) != sorted(contigs_fpaths):
        return False
    return True
def create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath, completeness,
                          len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len):
    """Persist KMC results: a '.sf' fingerprint file and a '.stat' stats file.

    The fingerprint records input file sizes and the assembly set; it is read
    back by check_kmc_successful_check() to validate cached results.
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    check_fpath = join(output_dir, label + '.sf')
    stats_fpath = join(output_dir, label + '.stat')
    with open(check_fpath, 'w') as check_f:
        check_f.writelines(["Assembly file size in bytes: %d\n" % getsize(contigs_fpath),
                            "Reference file size in bytes: %d\n" % getsize(ref_fpath),
                            "Used assemblies: %s\n" % ','.join(contigs_fpaths)])
    with open(stats_fpath, 'w') as stats_f:
        stats_f.write("Completeness: %s\n" % completeness)
        # Chromosome-assignment stats are only written when they were computed.
        if len_map_to_one_chrom or len_map_to_multi_chrom:
            stats_f.write("Length assigned to one chromosome: %d\n" % len_map_to_one_chrom)
            stats_f.write("Length assigned to multi chromosomes: %d\n" % len_map_to_multi_chrom)
            stats_f.write("Length assigned to none chromosome: %d\n" % len_map_to_none_chrom)
            stats_f.write("Total length: %d\n" % total_len)
def parse_alignments(contigs_fpaths, contig_report_fpath_pattern):
    """Collect aligned blocks for every assembly from its nucmer contig report.

    Returns (assemblies, max_contigs) on success, or (None, None) when no
    report could be parsed.
    """
    all_blocks = []
    for contigs_fpath in contigs_fpaths:
        if contig_report_fpath_pattern:
            report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(contigs_fpath)
        blocks, structures = parse_nucmer_contig_report(report_fpath)
        if blocks is None:
            continue
        # Annotate/filter blocks involved in misassemblies (local ones filtered).
        all_blocks.append(check_misassembled_blocks(blocks, structures, filter_local=True))
    if not all_blocks:
        return None, None
    return (get_assemblies(contigs_fpaths, all_blocks).assemblies,
            max(len(blocks) for blocks in all_blocks))
def create_kmc_stats_file(output_dir, contigs_fpath, ref_fpath, completeness,
                          corr_len, mis_len, undef_len, total_len, translocations, relocations):
    """Persist k-mer analysis results: '.sf' fingerprint and '.stat' stats.

    Correctness stats (lengths and misjoin counts) are only written when the
    scaffolding-accuracy assessment actually ran.
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    check_fpath = join(output_dir, label + '.sf')
    stats_fpath = join(output_dir, label + '.stat')
    with open(check_fpath, 'w') as check_f:
        check_f.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
        check_f.write("Reference md5 checksum: %s\n" % md5(ref_fpath))
    stats_lines = ["Completeness: %s\n" % completeness]
    if corr_len or mis_len:
        stats_lines.append("K-mer-based correct length: %d\n" % corr_len)
        stats_lines.append("K-mer-based misjoined length: %d\n" % mis_len)
        stats_lines.append("K-mer-based undefined length: %d\n" % undef_len)
        stats_lines.append("Total length: %d\n" % total_len)
        stats_lines.append("# translocations: %d\n" % translocations)
        stats_lines.append("# 100 kbp relocations: %d\n" % relocations)
    with open(stats_fpath, 'w') as stats_f:
        stats_f.writelines(stats_lines)
def create_jf_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath, completeness,
                         len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom):
    """Persist Jellyfish results: '.sf' fingerprint and '.stat' stats files.

    The fingerprint records input file sizes and the assembly set so a later
    run can detect whether these cached results are still valid.
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    check_fpath = join(output_dir, label + '.sf')
    stats_fpath = join(output_dir, label + '.stat')
    with open(check_fpath, 'w') as check_f:
        check_f.writelines(["Assembly file size in bytes: %d\n" % getsize(contigs_fpath),
                            "Reference file size in bytes: %d\n" % getsize(ref_fpath),
                            "Used assemblies: %s\n" % ','.join(contigs_fpaths)])
    with open(stats_fpath, 'w') as stats_f:
        stats_f.writelines(["Completeness: %s\n" % completeness,
                            "Length assigned to one chromosome: %d\n" % len_map_to_one_chrom,
                            "Length assigned to multi chromosomes: %d\n" % len_map_to_multi_chrom,
                            "Length assigned to none chromosome: %d\n" % len_map_to_none_chrom])
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath):
    """Run GlimmerHMM gene prediction for one assembly.

    Returns (genes, unique, cnt) where cnt holds per-threshold gene counts
    as produced by glimmerHMM for each cutoff in gene_lengths.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    out_fpath = os.path.join(out_dirpath, corr_assembly_label + '_glimmer')
    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_glimmer.stderr')
    out_gff_path, genes, unique, total, cnt = glimmerHMM(
        tool_dirpath, contigs_fpath, out_fpath, gene_lengths, err_fpath, tmp_dirpath, index)

    # Only report stats when the run produced a GFF file.
    if out_gff_path:
        logger.info(' ' + qutils.index_to_str(index) + ' Genes = ' + str(unique) + ' unique, ' + str(total) + ' total')
        logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + out_gff_path)
    return genes, unique, cnt
def create_mismatches_plot(assembly, window_size, ref_len, root_dir, output_dir):
    """Write a per-window mismatch-density track for one assembly.

    Counts substitutions (from each alignment's cs tag in the filtered
    minimap coords file) per window of window_size bases per reference
    chromosome, then writes a tab-separated track (chrom, start, end,
    density) with mismatch-free runs collapsed into single '0' rows.

    Returns the written file path, or None when coords are missing or SNP
    reporting is disabled.
    """
    assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
    aligner_dirpath = join(root_dir, '..', 'contigs_reports')
    coords_basename = join(create_minimap_output_dir(aligner_dirpath), assembly_label)
    _, coords_filtered_fpath, _, _ = get_aux_out_fpaths(coords_basename)
    if not exists(coords_filtered_fpath) or not qconfig.show_snps:
        return None

    mismatches_fpath = join(output_dir, assembly_label + '.mismatches.txt')
    density_by_chrom = defaultdict(lambda: [0] * (ref_len // window_size + 1))
    with open(coords_filtered_fpath) as coords_file:
        for line in coords_file:
            ref_pos = int(line.split('|')[0].split()[0])  # alignment start on the reference
            chrom = line.split()[11].strip()
            cigar = line.split()[-1].strip()
            # '*' = substitution, '+' = insertion (no ref advance),
            # anything else advances the reference by the op length.
            for op in parse_cs_tag(cigar):
                if op.startswith('*'):
                    density_by_chrom[chrom][int(ref_pos) // window_size] += 1
                    ref_pos += 1
                elif not op.startswith('+'):
                    ref_pos += len(op) - 1

    with open(mismatches_fpath, 'w') as out_f:
        for chrom, density_list in density_by_chrom.items():
            start, end = 0, 0
            for i, density in enumerate(density_list):
                window_end = (i + 1) * window_size
                if density == 0:
                    end = window_end
                    continue
                if end:
                    out_f.write('\t'.join([chrom, str(start), str(end), '0']) + '\n')
                out_f.write('\t'.join([chrom, str(i * window_size), str(window_end), str(density)]) + '\n')
                start = window_end
                end = None
            if end:
                out_f.write('\t'.join([chrom, str(start), str(end), '0']) + '\n')
    return mismatches_fpath
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique k-mer based completeness/correctness analysis.

    For each assembly: reuse cached results when the saved '.sf' fingerprint
    still matches; otherwise run KMC to count k-mers, compute completeness
    against the reference k-mer set, and — if the reference is not too
    fragmented — detect k-mer-based translocations/relocations. Results are
    stored in the global reports and per-assembly '.stat' files.

    NOTE(review): block nesting reconstructed from a whitespace-flattened
    source — verify against the original layout.
    """
    logger.print_timestamp()
    kmer_len = qconfig.unique_kmer_len
    logger.main_info('Running analysis based on unique ' + str(kmer_len) + '-mers...')

    # Reuse previously computed results when the fingerprint check passes.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if len(stats_content) < 1:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(
                reporting.Fields.KMER_COMPLETENESS,
                '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 7:
                # Correctness stats are only present when the scaffolding
                # accuracy assessment actually ran (see below).
                corr_len = int(stats_content[1].strip().split(': ')[-1])
                mis_len = int(stats_content[2].strip().split(': ')[-1])
                undef_len = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                translocations = int(stats_content[5].strip().split(': ')[-1])
                relocations = int(stats_content[6].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_CORR_LENGTH, '%.2f' % (corr_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_MIS_LENGTH, '%.2f' % (mis_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_UNDEF_LENGTH, '%.2f' % (undef_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_TRANSLOCATIONS, translocations)
                report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
                report.add_field(reporting.Fields.KMER_MISASSEMBLIES, translocations + relocations)
            checked_assemblies.append(contigs_fpath)

    # Keep only assemblies that still need to be analyzed.
    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        save_kmers(output_dir)
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t run KMC on this platform, skipping...')
        return None

    # Fetch/compile the external KMC binaries.
    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning(' Sorry, can\'t run KMC, skipping...')
        return None

    logger.info(' Running KMC on reference...')
    if not isdir(output_dir):
        os.makedirs(output_dir)
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, kmer_len, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        logger.warning('KMC failed, check ' + log_fpath + ' and ' + err_fpath + '. Skipping...')
        return

    # Completeness: share of reference k-mers found in each assembly.
    logger.info(' Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info(' ' + qutils.index_to_str(id) + assembly_label)
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, kmer_len, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    # Correctness: order of downsampled reference k-mers along each contig.
    logger.info(' Analyzing assemblies correctness...')
    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    logger.info(' Downsampling k-mers...')
    ref_kmers, downsampled_kmers_fpath = downsample_kmers(tmp_dirpath, ref_fpath, ref_kmc_out_fpath, kmer_len, log_fpath, err_fpath)
    for id, (contigs_fpath, kmc_db_fpath) in enumerate(zip(contigs_fpaths, kmc_out_fpaths)):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info(' ' + qutils.index_to_str(id) + assembly_label)
        report = reporting.get(contigs_fpath)
        corr_len = None
        mis_len = None
        undef_len = None
        translocations, relocations = None, None
        total_len = 0
        contig_lens = dict()
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)

        if len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            corr_len = 0
            mis_len = 0
            kmers_by_contig, kmers_pos_by_contig = align_kmers(
                tmp_dirpath, contigs_fpath, downsampled_kmers_fpath, err_fpath, qconfig.max_threads)
            # Treat the reference as circular for prokaryotes without fragmentation check.
            is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
            cyclic_ref_lens = report.get_field(reporting.Fields.REFLEN) if is_cyclic else None
            translocations = 0
            relocations = 0
            with open(join(tmp_dirpath, qutils.label_from_fpath_for_fname(contigs_fpath) + '.misjoins.txt'), 'w') as out:
                for contig in kmers_by_contig.keys():
                    # Chain consecutive k-mers whose contig/reference spacing
                    # agrees within 5% into 'markers'.
                    contig_markers = []
                    prev_pos, prev_ref_pos, prev_chrom, marker = None, None, None, None
                    for pos, kmer in sorted(zip(kmers_pos_by_contig[contig], kmers_by_contig[contig]), key=lambda x: x[0]):
                        ref_chrom, ref_pos = ref_kmers[kmer]
                        if prev_pos and prev_chrom:
                            if prev_chrom == ref_chrom and abs(abs(pos - prev_pos) / abs(ref_pos - prev_ref_pos) - 1) <= 0.05:
                                marker = (pos, ref_pos, ref_chrom)
                            elif marker:
                                contig_markers.append(marker)
                                pos, ref_pos, ref_chrom, marker = None, None, None, None
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if marker:
                        contig_markers.append(marker)
                    # Compare successive markers to spot translocations/relocations.
                    prev_pos, prev_ref_pos, prev_chrom = None, None, None
                    is_misassembled = False
                    for marker in contig_markers:
                        pos, ref_pos, ref_chrom = marker
                        if prev_pos and prev_chrom:
                            if ref_chrom != prev_chrom:
                                translocations += 1
                                out.write('Translocation in %s: %s %d | %s %d\n' %
                                          (contig, prev_chrom, prev_pos, ref_chrom, pos))
                                is_misassembled = True
                            elif _get_dist_inconstistency(pos, prev_pos, ref_pos, prev_ref_pos, cyclic_ref_lens) > EXT_RELOCATION_SIZE:
                                relocations += 1
                                out.write('Relocation in %s: %d (%d) | %d (%d)\n' %
                                          (contig, prev_pos, prev_ref_pos, pos, ref_pos))
                                is_misassembled = True
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if is_misassembled:
                        mis_len += contig_lens[contig]
                    elif len(contig_markers) > 0:
                        corr_len += contig_lens[contig]
            undef_len = total_len - corr_len - mis_len
            report.add_field(reporting.Fields.KMER_CORR_LENGTH, '%.2f' % (corr_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_MIS_LENGTH, '%.2f' % (mis_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_UNDEF_LENGTH, '%.2f' % (undef_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_TRANSLOCATIONS, translocations)
            report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
            report.add_field(reporting.Fields.KMER_MISASSEMBLIES, translocations + relocations)

        create_kmc_stats_file(output_dir, contigs_fpath, ref_fpath,
                              report.get_field(reporting.Fields.KMER_COMPLETENESS),
                              corr_len, mis_len, undef_len, total_len, translocations, relocations)
    save_kmers(output_dir)
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO on each assembly and store completeness stats in the reports.

    Downloads/compiles Augustus and the BLAST binaries, fetches the lineage
    database, runs BUSCO once per assembly in parallel, then parses each
    short-summary file into the BUSCO_COMPLETE / BUSCO_PART report fields.

    Parameters:
        contigs_fpaths: list of assembly FASTA paths.
        output_dir: directory for BUSCO configs, logs and copied summaries.
        logger: project logger instance.

    Returns None; results are recorded via ``reporting`` side effects.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')
    compilation_success = True
    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger):
        compilation_success = False
    if compilation_success and not download_blast_binaries(logger=logger, filenames=blast_filenames):
        compilation_success = False
    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    # Split the global thread budget across per-assembly BUSCO jobs.
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote, is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    config_fpath = make_config(output_dir, tmp_dir, busco_threads, clade_dirpath, augustus_dirpath)
    logger.info('Logs and results will be saved under ' + output_dir + '...')

    os.environ['BUSCO_CONFIG_FILE'] = config_fpath
    # FIX: was copy_augustus_contigs — the sibling BUSCO runner in this file
    # calls copy_augustus_configs, and the error message below as well as the
    # AUGUSTUS_CONFIG_PATH variable refer to Augustus *configs*.
    # FIX: assign to a local first — os.environ values must be strings, so the
    # original unconditional assignment crashed with TypeError on None before
    # the failure check could run; also bail out instead of falling through,
    # since BUSCO cannot run without the configs (as the message states).
    augustus_config_path = copy_augustus_configs(augustus_dirpath, tmp_dir)
    if not augustus_config_path:
        logger.error('Augustus configs not found, failed to run BUSCO without them.')
        return
    os.environ['AUGUSTUS_CONFIG_PATH'] = augustus_config_path

    busco_args = [[contigs_fpath, qutils.label_from_fpath_for_fname(contigs_fpath)]
                  for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco_main_handler, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See log files in ' + output_dir +
                     ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            # BUSCO short-summary lines look like "\t<count>\tComplete BUSCOs (C)".
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(reporting.Fields.BUSCO_COMPLETE,
                                 ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' % (float(part_buscos) * 100.0 / total_buscos)))
            shutil.copy(summary_fpaths[i], output_dir)
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath + '. See the log for detailed information.')
    if not qconfig.debug:
        cleanup(output_dir)
    logger.info('Done.')
def process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                        reference_chromosomes, ns_by_chromosomes, containers):
    """Compute per-assembly genome coverage, gap and genomic-feature stats.

    Reads the (possibly filtered) alignment coords file for the assembly,
    builds a per-base coverage mask of the reference, counts gaps, and matches
    each feature (gene/operon) region from ``containers`` against aligned
    blocks as complete / partial / missing.

    Parameters:
        contigs_fpath: assembly FASTA path.
        index: assembly index (for log prefixes only).
        coords_dirpath: directory with <label>.coords[.filtered] files.
        genome_stats_dirpath: output directory for gaps/feature text files.
        reference_chromosomes: dict chromosome name -> length.
        ns_by_chromosomes: dict chromosome name -> positions of N bases.
        containers: feature containers, each with .region_list and .kind.

    Returns (ref_lengths, (results, unsorted_features_in_contigs,
    features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs)),
    or (None, None) on failure.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = defaultdict(int)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    coords_base_fpath = os.path.join(coords_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        coords_fpath = coords_base_fpath
    else:
        coords_fpath = coords_base_fpath + '.filtered'

    if not os.path.isfile(coords_fpath):
        logger.error('File with alignment coords (' + coords_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')
        return None, None

    # EXAMPLE:
    # [S1]      [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #   338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #   374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        # index 0 unused; positions are 1-based as in the coords file
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    sorted_contig_tuples = sorted(enumerate(contig_tuples), key=lambda x: len(x[1][1]), reverse=True)
    sorted_contigs_names = []
    contigs_order = []  # contigs_order[j] = original index of the j-th longest contig
    for idx, (name, _) in sorted_contig_tuples:
        sorted_contigs_names.append(name)
        contigs_order.append(idx)

    features_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(containers)  # truthy iff any feature container was supplied
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []
    with open(coords_fpath) as coordfile:
        for line in coordfile:
            s1 = int(line.split('|')[0].split()[0])
            e1 = int(line.split('|')[0].split()[1])
            s2 = int(line.split('|')[1].split()[0])
            e2 = int(line.split('|')[1].split()[1])
            contig_name = line.split()[12].strip()
            chr_name = line.split()[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + coords_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                # FIX: was `return None` — the other failure path above and the
                # normal exit both yield a 2-tuple, so a caller unpacking two
                # values crashed here; return (None, None) for consistency.
                return None, None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(
                    AlignedBlock(seqname=chr_name, start=s1, end=e1,
                                 contig=contig_name, start_in_contig=s2, end_in_contig=e2))
            # mark reference bases covered by this alignment
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1

    # N runs in the reference do not count as covered bases
    for chr_name in genome_mapping.keys():
        for i in ns_by_chromosomes[chr_name]:
            genome_mapping[chr_name][i] = 0
        ref_lengths[chr_name] = sum(genome_mapping[chr_name])

    if qconfig.space_efficient and coords_fpath.endswith('.filtered'):
        os.remove(coords_fpath)

    # counting genome coverage and gaps number
    gaps_count = 0
    if qconfig.analyze_gaps:
        gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt') \
            if not qconfig.space_efficient else '/dev/null'
        with open(gaps_fpath, 'w') as gaps_file:
            for chr_name, chr_len in reference_chromosomes.items():
                gaps_file.write(chr_name + '\n')
                cur_gap_size = 0
                for i in range(1, chr_len + 1):
                    # N positions are treated as covered for gap purposes
                    if genome_mapping[chr_name][i] == 1 or i in ns_by_chromosomes[chr_name]:
                        if cur_gap_size >= qconfig.min_gap_size:
                            gaps_count += 1
                            gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                        cur_gap_size = 0
                    else:
                        cur_gap_size += 1
                # flush a gap that runs to the end of the chromosome
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["gaps_count"] = gaps_count
    results[reporting.Fields.GENES + "_full"] = None
    results[reporting.Fields.GENES + "_partial"] = None
    results[reporting.Fields.OPERONS + "_full"] = None
    results[reporting.Fields.OPERONS + "_partial"] = None

    # finding genes and operons
    for container in containers:
        if not container.region_list:
            continue
        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label +
                                   '_genomic_features_' + container.kind.lower() + '.txt')
        found_file = open(found_fpath, 'w')
        found_file.write('%s\t\t%s\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type', 'Contig'))
        found_file.write('=' * 50 + '\n')

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            gene_blocks = []
            if region.id is None:
                region.id = '# ' + str(region.number + 1)
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if cur_block.seqname != region.seqname:
                        continue
                    if region.end <= cur_block.start or cur_block.end <= region.start:
                        continue  # no overlap at all
                    elif cur_block.start <= region.start and region.end <= cur_block.end:
                        if found_list[i] == 2:  # already found as partial gene
                            total_partial -= 1
                        found_list[i] = 1
                        total_full += 1
                        contig_info = cur_block.format_gene_info(region)
                        found_file.write('%s\t\t%d\t%d\tcomplete\t%s\n' %
                                         (region.id, region.start, region.end, contig_info))
                        if container.kind == 'operon':
                            operons_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig
                        else:
                            features_in_contigs[contig_id] += 1
                        cur_feature_is_found = True
                        break
                    elif min(region.end, cur_block.end) - max(region.start, cur_block.start) >= qconfig.min_gene_overlap:
                        if found_list[i] == 0:
                            found_list[i] = 2
                            total_partial += 1
                        gene_blocks.append(cur_block)
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break
            # adding info about partially found genes/operons
            if found_list[i] == 2:  # partial gene/operon
                contig_info = ','.join([block.format_gene_info(region)
                                        for block in sorted(gene_blocks, key=lambda block: block.start)])
                found_file.write('%s\t\t%d\t%d\tpartial\t%s\n' %
                                 (region.id, region.start, region.end, contig_info))

        if container.kind == 'operon':
            results[reporting.Fields.OPERONS + "_full"] = total_full
            results[reporting.Fields.OPERONS + "_partial"] = total_partial
        else:
            # several non-operon containers accumulate into the GENES fields
            if results[reporting.Fields.GENES + "_full"] is None:
                results[reporting.Fields.GENES + "_full"] = 0
                results[reporting.Fields.GENES + "_partial"] = 0
            results[reporting.Fields.GENES + "_full"] += total_full
            results[reporting.Fields.GENES + "_partial"] += total_partial
        found_file.close()

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    # NOTE(review): this applies contigs_order as a forward permutation; if the
    # intent is to restore the original FASTA order, the inverse permutation
    # would be needed — confirm against the consumer of these lists.
    unsorted_features_in_contigs = [features_in_contigs[idx] for idx in contigs_order]
    unsorted_operons_in_contigs = [operons_in_contigs[idx] for idx in contigs_order]
    return ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs,
                         unsorted_operons_in_contigs, operons_in_contigs)
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath,
       cov_fpath=None, physical_cov_fpath=None, stdout_pattern=None, find_similar=True, features=None,
       json_output_dir=None, genes_by_labels=None):
    """Build the Icarus contig-alignment viewer (HTML) and optional SVG plot.

    Reads the reference FASTA to compute chromosome sizes and cumulative
    offsets (optionally splitting large references into viewer parts), parses
    per-assembly alignment reports, and feeds everything to ``js_data_gen``.

    Parameters:
        contigs_fpaths: assembly FASTA paths.
        contig_report_fpath_pattern: pattern for per-assembly alignment
            reports (``% label``); falsy means contigs-only mode.
        output_dirpath: Icarus output directory.
        ref_fpath: reference FASTA (may be falsy).
        cov_fpath / physical_cov_fpath: optional coverage tracks.
        stdout_pattern, find_similar, features, json_output_dir,
        genes_by_labels: passed through to the underlying builders.

    Returns (icarus_html_fpath, plot_fpath); either element may be None.
    """
    make_output_dir(output_dirpath)

    lists_of_aligned_blocks = []
    contigs_by_assemblies = OrderedDict()
    structures_by_labels = {}
    ambiguity_alignments_by_labels = {}

    total_genome_size = 0
    reference_chromosomes = OrderedDict()
    contig_names_by_refs = None
    assemblies = None
    chr_names = []
    features_data = None
    plot_fpath = None
    max_small_chromosomes = 10

    if ref_fpath:
        for name, seq in fastaparser.read_fasta(ref_fpath):
            chr_name = name.split()[0]  # header up to the first whitespace
            chr_names.append(chr_name)
            chr_len = len(seq)
            total_genome_size += chr_len
            reference_chromosomes[chr_name] = chr_len
        virtual_genome_shift = 100  # visual gap inserted between chromosomes
        sorted_ref_names = sorted(reference_chromosomes, key=reference_chromosomes.get, reverse=True)
        sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)
        cumulative_ref_lengths = [0]
        if ref_labels_by_chromosomes:
            contig_names_by_refs = ref_labels_by_chromosomes
        elif sum(reference_chromosomes.values()) > qconfig.MAX_SIZE_FOR_COMB_PLOT:
            # reference too big for a single combined plot: split into parts
            contig_names_by_refs = dict()
            if len(chr_names) > max_small_chromosomes:
                summary_len = 0
                num_parts = 1
                html_name = qconfig.alignment_viewer_part_name + str(num_parts)
                for chr_name, chr_len in reference_chromosomes.items():
                    summary_len += chr_len
                    contig_names_by_refs[chr_name] = html_name
                    if summary_len >= qconfig.MAX_SIZE_FOR_COMB_PLOT:
                        summary_len = 0
                        num_parts += 1
                        html_name = qconfig.alignment_viewer_part_name + str(num_parts)
            else:
                # few large chromosomes: one viewer page per chromosome
                for chr_name in chr_names:
                    contig_names_by_refs[chr_name] = chr_name

        for i, chr in enumerate(chr_names):
            chr_length = reference_chromosomes[chr]
            len_to_append = cumulative_ref_lengths[-1] + chr_length
            if contig_names_by_refs:
                # restart the cumulative offset at each viewer-part boundary
                if i < len(chr_names) - 1 and contig_names_by_refs[chr] != contig_names_by_refs[chr_names[i + 1]]:
                    len_to_append = 0
            cumulative_ref_lengths.append(len_to_append)
        virtual_genome_size = sum(reference_chromosomes.values()) + \
                              virtual_genome_shift * (len(reference_chromosomes.values()) - 1)

    for contigs_fpath in contigs_fpaths:
        label = qconfig.assembly_labels_by_fpath[contigs_fpath]
        if not contig_report_fpath_pattern:
            contigs = parse_contigs_fpath(contigs_fpath)
        else:
            report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(contigs_fpath)
            aligned_blocks, misassembled_id_to_structure, contigs, ambiguity_alignments = parse_nucmer_contig_report(
                report_fpath, list(reference_chromosomes.keys()), cumulative_ref_lengths)
            if not contigs:
                contigs = parse_contigs_fpath(contigs_fpath)
            if aligned_blocks is None:
                # FIX: was `return None` — the normal exit returns a 2-tuple
                # (icarus_html_fpath, plot_fpath), so a caller unpacking two
                # values crashed on this path; return (None, None) instead.
                return None, None
            for block in aligned_blocks:
                block.label = label
            aligned_blocks = check_misassembled_blocks(aligned_blocks, misassembled_id_to_structure)
            lists_of_aligned_blocks.append(aligned_blocks)
            structures_by_labels[label] = misassembled_id_to_structure
            if qconfig.ambiguity_usage == 'all':
                ambiguity_alignments_by_labels[label] = ambiguity_alignments
        contigs_by_assemblies[label] = contigs

    if contigs_fpaths and ref_fpath and features:
        features_data = parse_features_data(features, cumulative_ref_lengths, chr_names)
    if contigs_fpaths and qconfig.gene_finding:
        parse_genes_data(contigs_by_assemblies, genes_by_labels)
    if reference_chromosomes and lists_of_aligned_blocks:
        assemblies = get_assemblies(contigs_fpaths, virtual_genome_size, lists_of_aligned_blocks, find_similar)
        if qconfig.draw_svg:
            plot_fpath = draw_alignment_plot(assemblies, virtual_genome_size, output_dirpath,
                                             sorted_ref_names, sorted_ref_lengths, virtual_genome_shift)
    if (assemblies or contigs_by_assemblies) and qconfig.create_icarus_html:
        icarus_html_fpath = js_data_gen(assemblies, contigs_fpaths, reference_chromosomes,
                                        output_dirpath, structures_by_labels,
                                        contig_names_by_refs=contig_names_by_refs,
                                        ref_fpath=ref_fpath, stdout_pattern=stdout_pattern,
                                        ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
                                        contigs_by_assemblies=contigs_by_assemblies,
                                        features_data=features_data, cov_fpath=cov_fpath,
                                        physical_cov_fpath=physical_cov_fpath,
                                        json_output_dir=json_output_dir)
    else:
        icarus_html_fpath = None
    return icarus_html_fpath, plot_fpath
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference with nucmer and analyze the result.

    Runs the aligner, parses coords/SNP outputs, analyzes contigs and
    coverage, and writes per-assembly report/log files under output_dirpath.

    Parameters:
        is_cyclic: treat the reference as circular during contig analysis.
        index: assembly index (used only in log prefixes).
        contigs_fpath / old_contigs_fpath: current and original assembly paths.
        output_dirpath: directory for logs and report files.
        ref_fpath: reference FASTA.
        bed_fpath: unused in the visible body — TODO confirm at call sites.
        parallel_by_chr, threads: forwarded to align_contigs.

    Returns a 5-tuple: (NucmerStatus, result dict, aligned_lengths,
    misassemblies_in_contigs, aligned_lengths_by_contigs).
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    # In space-efficient mode every auxiliary report goes to /dev/null.
    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        # Alignment failed in one of three distinct ways; log accordingly and
        # return empty results with the failing status.
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath +
                                ':' + coords_fpath + 'doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}  # contig name -> list of Mapping objects
    coords_file = open(coords_fpath)
    coords_filtered_file = open(coords_filtered_fpath, 'w')
    # copy the two header lines of the coords file verbatim
    coords_filtered_file.write(coords_file.readline())
    coords_filtered_file.write(coords_file.readline())
    for line in coords_file:
        if line.strip() == '':
            break
        assert line[0] != '='
        #Clear leading spaces from nucmer output
        #Store nucmer lines in an array
        mapping = Mapping.from_line(line)
        aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_lens = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        ref_lens[name] = len(seq)
        log_out_f.write('\tLoaded [%s]\n' % name)

    #Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    snps = {}  # ref chrom -> contig -> ref position -> list of SNP records
    if qconfig.show_snps:
        prev_line = None
        for line in open_gzipsafe(show_snps_fpath):
            #print "$line";
            line = line.split()
            if not line[0].isdigit():
                continue  # skip header/non-data lines of show-snps output
            if prev_line and line == prev_line:
                continue  # skip exact duplicates
            ref = line[10]
            ctg = line[11]
            pos = int(line[0])  # Kolya: python don't convert int<->str types automatically
            loc = int(line[3])  # Kolya: same as above
            # if (! exists $line[11]) { die "Malformed line in SNP file. Please check that show-snps has completed succesfully.\n$line\n[$line[9]][$line[10]][$line[11]]\n"; }
            if pos in snps.setdefault(ref, {}).setdefault(ctg, {}):
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos].append(
                    SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            else:
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos] = \
                    [SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])]
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in ref_lens.items():
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    # Bundle of all output handles; closed later via close_handlers(ca_output).
    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=coords_filtered_file, used_snps_f=used_snps_file,
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs = \
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath,
                        aligns, ref_features, ref_lens, is_cyclic)

    # if qconfig.large_genome:
    #     log_out_f.write('Analyzing large blocks...\n')
    #     large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null'
    #     ca_large_output = CAOutput(stdout_f=log_out_f, misassembly_f=open(large_misassembly_fpath, 'w'),
    #                                coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'), icarus_out_f=open('/dev/null', 'w'))
    #     min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD
    #     result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null',
    #                                   aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0])
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        # Per-reference alignment table + list of contigs uniquely (>90% of
        # their length) aligned to a single reference of the combined set.
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            # SPAdes-style names embed length/coverage:
                            # ..._length_<len>_cov_<cov>
                            len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(contig)[0][0]
                                contig_cov = len_cov_pattern.findall(contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    """Compute genome coverage, gaps and gene/operon stats for one assembly.

    Older variant of the genome analyzer: reads the nucmer coords file,
    builds a per-base coverage mask (with circular-genome handling), writes
    per-chromosome gap coordinates, and matches gene/operon regions against
    aligned blocks as complete or partial.

    Parameters:
        contigs_fpath: assembly FASTA path.
        index: assembly index (for log prefixes only).
        nucmer_path_dirpath: directory with <label>.coords[.filtered] files.
        genome_stats_dirpath: output directory for gaps/genes/operons files.
        reference_chromosomes: dict chromosome name -> length.
        genes_container / operons_container: feature containers with
            .region_list and .chr_names_dict.

    Returns (ref_lengths, (results, genes_in_contigs, operons_in_contigs)),
    or None on failure.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = {}
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')
        return None

    coordfile = open(nucmer_fpath, 'r')
    # skip everything up to and including the '=====' separator line
    for line in coordfile:
        if line.startswith('='):
            break

    # EXAMPLE:
    # [S1]      [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #   338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #   374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        # index 0 unused; positions are 1-based as in the coords file
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(genes_container.region_list) or len(operons_container.region_list)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []
    # continue reading the same file object: data lines after the separator
    for line in coordfile:
        if line.strip() == '':
            break
        s1 = int(line.split('|')[0].split()[0])
        e1 = int(line.split('|')[0].split()[1])
        s2 = int(line.split('|')[1].split()[0])
        e2 = int(line.split('|')[1].split()[1])
        contig_name = line.split()[12].strip()
        chr_name = line.split()[11].strip()

        if chr_name not in genome_mapping:
            logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                         "differ from the names in the reference. Try to remove the file and restart QUAST.")
            return None

        if gene_searching_enabled:
            aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1))
        if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
            # mark both the tail and the head of the chromosome as covered
            for i in range(s1, len(genome_mapping[chr_name])):
                genome_mapping[chr_name][i] = 1
            for i in range(1, e1 + 1):
                genome_mapping[chr_name][i] = 1
        else:  #if s1 <= e1:
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1
    coordfile.close()
    if qconfig.space_efficient and nucmer_fpath.endswith('.filtered'):
        os.remove(nucmer_fpath)

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt') \
        if not qconfig.space_efficient else '/dev/null'
    gaps_file = open(gaps_fpath, 'w')
    for chr_name, chr_len in reference_chromosomes.items():
        gaps_file.write(chr_name + '\n')
        cur_gap_size = 0
        aligned_len = 0
        for i in range(1, chr_len + 1):
            if genome_mapping[chr_name][i] == 1:
                # a covered base terminates any running gap
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                aligned_len += 1
                covered_bp += 1
                cur_gap_size = 0
            else:
                cur_gap_size += 1
        ref_lengths[chr_name] = aligned_len
        # flush a gap that runs to the end of the chromosome
        if cur_gap_size >= qconfig.min_gap_size:
            gaps_count += 1
            gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')
    gaps_file.close()
    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
            (genes_container, genes_in_contigs, reporting.Fields.GENES, '_genes.txt'),
            (operons_container, operons_in_contigs, reporting.Fields.OPERONS, '_operons.txt')]:
        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + suffix)
        found_file = open(found_fpath, 'w')
        found_file.write('%s\t\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type'))
        found_file.write('=========================================\n')

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if container.chr_names_dict[region.seqname] != cur_block.seqname:
                        continue

                    # computing circular genomes
                    if cur_block.start > cur_block.end:
                        # block wraps the origin: split it into tail + head parts
                        blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                  AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                    else:
                        blocks = [cur_block]

                    for block in blocks:
                        if region.end <= block.start or block.end <= region.start:
                            continue  # no overlap at all
                        elif block.start <= region.start and region.end <= block.end:
                            if found_list[i] == 2:  # already found as partial gene
                                total_partial -= 1
                            found_list[i] = 1
                            total_full += 1
                            region_id = str(region.id)
                            if region_id == 'None':
                                region_id = '# ' + str(region.number + 1)
                            found_file.write('%s\t\t%d\t%d\tcomplete\n' % (region_id, region.start, region.end))
                            feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig
                            cur_feature_is_found = True
                            break
                        elif found_list[i] == 0 and min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                            found_list[i] = 2
                            total_partial += 1
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break
            # adding info about partially found genes/operons
            if found_list[i] == 2:  # partial gene/operon
                region_id = str(region.id)
                if region_id == 'None':
                    region_id = '# ' + str(region.number + 1)
                found_file.write('%s\t\t%d\t%d\tpartial\n' % (region_id, region.start, region.end))

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial
        found_file.close()

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    return ref_lengths, (results, genes_in_contigs, operons_in_contigs)
def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO on each assembly and record completeness in the reports.

    Newer variant of the BUSCO runner: identical pipeline to the sibling
    function in this file, plus a diagnostic warning when BUSCO succeeded
    but found zero complete/partial genes for every assembly.

    Parameters:
        contigs_fpaths: list of assembly FASTA paths.
        output_dir: directory for BUSCO configs, logs and copied summaries.
        logger: project logger instance.

    Returns None; results are recorded via ``reporting`` side effects.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')
    compilation_success = True
    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger):
        compilation_success = False
    if compilation_success and not download_blast_binaries(logger=logger, filenames=blast_filenames):
        compilation_success = False
    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    # Split the global thread budget across per-assembly BUSCO jobs.
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote, is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    config_fpath = make_config(output_dir, tmp_dir, busco_threads, clade_dirpath, augustus_dirpath)
    logger.info('Logs and results will be saved under ' + output_dir + '...')
    os.environ['BUSCO_CONFIG_FILE'] = config_fpath
    # NOTE(review): os.environ values must be strings — if copy_augustus_configs
    # returns None this assignment raises before the check below can fire, and
    # on an empty-string failure execution still continues into the BUSCO run;
    # confirm whether an early return is intended here.
    os.environ['AUGUSTUS_CONFIG_PATH'] = copy_augustus_configs(augustus_dirpath, tmp_dir)
    if not os.environ['AUGUSTUS_CONFIG_PATH']:
        logger.error('Augustus configs not found, failed to run BUSCO without them.')

    busco_args = [[contigs_fpath, qutils.label_from_fpath_for_fname(contigs_fpath)]
                  for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco_main_handler, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See log files in ' + output_dir +
                     ' for information ' '(rerun with --debug to keep all intermediate files).')
        return

    # saving results
    zero_output_for_all = True  # stays True only if no assembly had any BUSCO hit
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            # BUSCO short-summary lines look like "\t<count>\tComplete BUSCOs (C)".
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(reporting.Fields.BUSCO_COMPLETE,
                                 ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' % (float(part_buscos) * 100.0 / total_buscos)))
            if complete_buscos + part_buscos > 0:
                zero_output_for_all = False
            shutil.copy(summary_fpaths[i], output_dir)
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath + '. See the log for detailed information'
                         ' (rerun with --debug to keep all intermediate files).')
    if zero_output_for_all:
        logger.warning('BUSCO did not fail explicitly but found nothing for all assemblies! '
                       'Possible reasons and workarounds:\n'
                       ' 1. Provided assemblies are so small that they do not contain even a single partial BUSCO gene. Not likely but may happen -- nothing to worry then.\n'
                       ' 2. Incorrect lineage database was used. To run with fungi DB use --fungus, to run with eukaryota DB use --eukaryote, otherwise BUSCO uses bacteria DB.\n'
                       ' 3. Problem with BUSCO dependencies, most likely Augustus. Check that the binaries in ' + augustus_dirpath + '/bin/ are working properly.\n'
                       ' If something is wrong with Augustus, you may try to install it yourself (https://github.com/Gaius-Augustus/Augustus) and add "augustus" binary to PATH.\n'
                       ' 4. Some other problem with BUSCO. Check the logs (you may need to rerun QUAST with --debug to see all intermediate files).\n'
                       ' If you cannot solve the problem yourself, post an issue at https://github.com/ablab/quast/issues or write to [email protected]')
    if not qconfig.debug:
        cleanup(output_dir)
    logger.info('Done.')
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Assess assembly completeness/accuracy via unique reference 101-mers (Jellyfish).

    For each assembly, computes the share of the reference's unique 101-mers
    found in it (KMER_COMPLETENESS) and classifies scaffold length by whether a
    scaffold's marker k-mers all hit one reference chromosome, several, or none
    (KMER_SCAFFOLDS_ONE/MULTI/NONE_CHROM). Results go into the global reporting
    tables; per-assembly '.stat' files cache them so reruns can skip finished
    assemblies.

    :param output_dir: directory for Jellyfish databases and '.stat' cache files
    :param ref_fpath: reference FASTA path
    :param contigs_fpaths: list of assembly FASTA paths
    :param logger: QUAST logger
    :return: None (returns early if the Jellyfish bindings cannot be loaded)
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique 101-mers...')
    addsitedir(jellyfish_python_dirpath)
    try:
        compile_jellyfish(logger)
        import jellyfish
        try:
            import imp
            imp.reload(jellyfish)  # re-import the freshly compiled bindings
        except Exception:
            # Python 2 fallback: reload() is a builtin there.
            reload(jellyfish)
        jellyfish.MerDNA.k(KMERS_LEN)
    except Exception:
        # Jellyfish is optional; any compile/import problem just disables this analysis.
        # (Was a bare 'except:', which also swallowed SystemExit/KeyboardInterrupt.)
        logger.warning('Failed unique 101-mers analysis.')
        return

    # Reuse cached '.stat' results from a previous run where possible.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_jf_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            jf_stats_fpath = join(output_dir, label + '.stat')
            # 'with' closes the stats file promptly (the old open().read() leaked the handle).
            with open(jf_stats_fpath) as jf_stats_f:
                stats_content = jf_stats_f.read().split('\n')
            if len(stats_content) < 4:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS,
                             '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                             '%.2f' % float(stats_content[1].strip().split(': ')[-1]))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                             '%.2f' % float(stats_content[2].strip().split(': ')[-1]))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                             '%.2f' % float(stats_content[3].strip().split(': ')[-1]))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    logger.info('Running Jellyfish on reference...')
    jf_out_fpath = join(output_dir, basename(ref_fpath) + '.jf')
    # '-U 1' keeps only k-mers occurring once, i.e. the reference's unique 101-mers.
    qutils.call_subprocess([jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1',
                            '-s', str(getsize(ref_fpath)), '-o', jf_out_fpath,
                            '-t', str(qconfig.max_threads), ref_fpath])
    ref_kmers = jellyfish.ReadMerFile(jf_out_fpath)
    os.remove(jf_out_fpath)

    logger.info('Running Jellyfish on assemblies...')
    contigs_kmers = []
    for contigs_fpath in contigs_fpaths:
        jf_out_fpath = join(output_dir, basename(contigs_fpath) + '.jf')
        qutils.call_subprocess([jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1',
                                '-s', str(getsize(contigs_fpath)), '-o', jf_out_fpath,
                                '-t', str(qconfig.max_threads), contigs_fpath])
        contigs_kmers.append(jellyfish.QueryMerFile(jf_out_fpath))
        os.remove(jf_out_fpath)

    logger.info('Analyzing completeness and accuracy of assemblies...')
    unique_kmers = 0
    matched_kmers = defaultdict(int)
    shared_kmers = set()
    kmer_i = 0
    for kmer, count in ref_kmers:
        unique_kmers += 1
        matches = 0
        for idx in range(len(contigs_fpaths)):
            if contigs_kmers[idx][kmer]:
                matched_kmers[idx] += 1
                matches += 1
        if matches == len(contigs_fpaths):
            # Keep every 100th k-mer present in all assemblies as a marker.
            if kmer_i % 100 == 0:
                shared_kmers.add(str(kmer))
            kmer_i += 1

    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        completeness = matched_kmers[idx] * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)

    # Map each retained marker k-mer to the reference chromosome it occurs on.
    shared_kmers_by_chrom = dict()
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    for name, seq in ref_contigs.items():
        seq_kmers = jellyfish.string_mers(seq)
        for kmer in seq_kmers:
            if str(kmer) in shared_kmers:
                shared_kmers_by_chrom[str(kmer)] = name

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = 0
        len_map_to_multi_chrom = 0
        total_len = 0
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            seq_kmers = jellyfish.string_mers(seq)
            chrom_markers = []
            for kmer in seq_kmers:
                kmer_str = str(kmer)
                if kmer_str in shared_kmers_by_chrom:
                    chrom = shared_kmers_by_chrom[kmer_str]
                    chrom_markers.append(chrom)
            # Too few markers -> inconclusive; the scaffold counts as "none".
            if len(chrom_markers) < MIN_MARKERS:
                continue
            if len(set(chrom_markers)) == 1:
                len_map_to_one_chrom += len(seq)
            else:
                len_map_to_multi_chrom += len(seq)

        len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                         '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                         '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                         '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
        create_jf_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                             report.get_field(reporting.Fields.KMER_COMPLETENESS),
                             len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom)

    logger.info('Done.')
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads=1):
    """Align one assembly to the reference with minimap and analyze the alignments.

    Runs the aligner, parses the coords output, analyzes contigs (misassemblies,
    unaligned contigs) and coverage/SNPs, writes the per-assembly report files
    (stdout/stderr logs, Icarus report, misassembly info, unaligned info), and
    returns (status, result_dict, aligned_lengths, misassemblies_in_contigs,
    aligned_lengths_by_contigs).

    :param is_cyclic: whether the reference chromosomes are circular (passed to analyze_contigs)
    :param index: assembly index, used only for log-message prefixes
    :param contigs_fpath: (possibly corrected) assembly FASTA path
    :param output_dirpath: directory for per-assembly report files
    :param ref_fpath: reference FASTA path
    :param reference_chromosomes: dict of chromosome name -> length
    :param ns_by_chromosomes: Ns positions per chromosome (passed to analyze_coverage)
    :param old_contigs_fpath: original (pre-correction) assembly path, passed to the aligner
    :param bed_fpath: BED file path; unused in this function body — presumably kept
        for interface compatibility with callers (TODO confirm)
    :param threads: number of aligner threads
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    # In space-efficient mode all auxiliary reports are discarded via /dev/null.
    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    # These handles stay open across the whole analysis; they are bundled into
    # CAOutput below and closed by close_handlers() at the end.
    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath,
                           index, threads, log_out_fpath, log_err_fpath)
    # Any non-OK status is reported and returned immediately with empty results.
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                         'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                         ' to the reference (non-zero exit code). ' +
                         ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}  # contig name -> list of Mapping objects
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up

    ref_features = {}

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    # Each chromosome becomes one whole-length region [1, seq_len].
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'), icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {'SNPs': total_indels_info.mismatches, 'indels_list': total_indels_info.indels_list, 'total_aligned_bases': total_aligned_bases}
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        # Combined-reference mode: record per-chromosome alignment lists and
        # collect contigs that align almost fully (>90%) to a single reference.
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            # SPAdes-style contig names embed length/coverage:
                            # e.g. NODE_1_length_1000_cov_15.5
                            len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(contig)[0][0]
                                contig_cov = len_cov_pattern.findall(contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO on each assembly and store completeness metrics in the report.

    Downloads/compiles the dependencies (Augustus, BLAST binaries) and the
    BUSCO lineage database, runs BUSCO for all assemblies in parallel, then
    parses each short-summary file to fill the BUSCO_COMPLETE / BUSCO_PART
    report fields (percentages of complete / fragmented BUSCO genes).
    Returns None; failures are reported through the logger.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')
    compilation_success = True

    # Augustus is required by BUSCO: a missing download or failed build
    # disables the whole analysis.
    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return
    set_augustus_dir(augustus_dirpath)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    # Split available threads across simultaneous per-assembly BUSCO jobs.
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote, is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    log_fpath = join(output_dir, 'busco.log')
    logger.info('Logging to ' + log_fpath + '...')

    # One BUSCO command line per assembly; all runs happen in parallel.
    busco_args = [(['-i', contigs_fpath, '-o', qutils.label_from_fpath_for_fname(contigs_fpath),
                    '-l', clade_dirpath, '-m', 'genome', '-f', '-z',
                    '-c', str(busco_threads), '-t', tmp_dir,
                    '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' + join(augustus_dirpath, 'config') + '\''],
                   output_dir)
                  for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See ' + log_fpath + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)

        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            # Parse BUSCO's short-summary file: the count is the first token
            # on lines containing these labels.
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(reporting.Fields.BUSCO_COMPLETE, ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART, ('%.2f' % (float(part_buscos) * 100.0 / total_buscos)))
        else:
            logger.error(
                'Failed running BUSCO for ' + contigs_fpath + '. See ' + log_fpath + ' for information.')
    logger.info('Done.')
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Assess assembly completeness/accuracy via unique reference k-mers (KMC).

    For each assembly, computes the share of the reference's unique
    KMERS_LEN-mers found in it (KMER_COMPLETENESS) and, when both the assembly
    and the reference are contiguous enough, classifies scaffold length by
    whether its marker k-mers hit one reference chromosome, several, or none
    (KMER_SCAFFOLDS_*). Results go into the global reporting tables and are
    cached in per-assembly '.stat' files so finished assemblies are skipped on
    reruns.

    :param output_dir: directory for KMC logs and '.stat' cache files
    :param ref_fpath: reference FASTA path
    :param contigs_fpaths: list of assembly FASTA paths
    :param logger: QUAST logger
    :return: None (None also when KMC is unavailable on this platform)
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    # Reuse cached '.stat' results from a previous run where possible.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            # 'with' closes the stats file promptly (the old open().read() leaked the handle).
            with open(kmc_stats_fpath) as kmc_stats_f:
                stats_content = kmc_stats_f.read().split('\n')
            if len(stats_content) < 1:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS,
                             '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            # Scaffolding-accuracy lines are optional: present only when the
            # accuracy assessment was not skipped on the original run.
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                                 '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                                 '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                                 '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t run KMC on this platform, skipping...')
        return None

    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning(' Sorry, can\'t run KMC, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()  # truncate logs left from a previous run
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    # Markers are k-mers present in the reference AND in all assemblies.
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    kmer_fraction = 0.001  # downsampling rate for marker k-mers

    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    ref_kmc_dbs = []
    # Per-chromosome marker databases are built only for reasonably contiguous references.
    if len(ref_contigs) <= MAX_REF_CONTIGS_NUM:
        shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, ref_fpath, shared_kmc_db,
                                                     log_fpath, err_fpath, kmer_fraction=kmer_fraction)
        for name, seq in read_fasta(ref_fpath):
            seq_kmc_db = seq_to_kmc_db(tmp_dirpath, log_fpath, err_fpath, seq=seq, name=name, is_ref=True,
                                       intersect_with=shared_downsampled_kmc_db)
            ref_kmc_dbs.append((name, seq_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        # None means "accuracy not assessed" (assembly or reference too fragmented).
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        list_files_fpath = join(tmp_dirpath, label + '_files.txt')
        # Dump each sufficiently long contig to its own FASTA and list the paths
        # for the downstream filtering steps.
        with open(list_files_fpath, 'w') as list_files:
            for name, seq in read_fasta(contigs_fpath):
                total_len += len(seq)
                contig_lens[name] = len(seq)
                if len(seq) >= MIN_CONTIGS_LEN:
                    long_contigs.append(len(seq))
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    with open(tmp_contig_fpath, 'w') as out_f:
                        out_f.write('>%s\n' % name)
                        out_f.write('%s\n' % seq)
                    list_files.write(tmp_contig_fpath + '\n')

        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            filtered_fpath = join(tmp_dirpath, label + '.filtered.fasta')
            filter_contigs(list_files_fpath, filtered_fpath, shared_kmc_db, log_fpath, err_fpath,
                           min_kmers=MIN_MARKERS)
            filtered_list_files_fpath = join(tmp_dirpath, label + '_files.filtered.txt')
            with open(filtered_list_files_fpath, 'w') as list_files:
                for name, _ in read_fasta(filtered_fpath):
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    list_files.write(tmp_contig_fpath + '\n')
            # Collect, per contig, the reference chromosomes its marker k-mers hit.
            for ref_name, ref_kmc_db in ref_kmc_dbs:
                tmp_filtered_fpath = join(tmp_dirpath, ref_name + '.filtered.fasta')
                filter_contigs(filtered_list_files_fpath, tmp_filtered_fpath, ref_kmc_db, log_fpath, err_fpath,
                               min_kmers=MIN_MISJOIN_MARKERS)
                if exists(tmp_filtered_fpath):
                    for name, _ in read_fasta(tmp_filtered_fpath):
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom

        # Guard against the skipped-assessment case: the values are None then,
        # and formatting them as percentages would raise a TypeError.
        if len_map_to_one_chrom is not None:
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                             '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                             '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                             '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                              report.get_field(reporting.Fields.KMER_COMPLETENESS),
                              len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
def do(ref_fpath, contigs_fpaths, output_dirpath):
    """Run GAGE's getCorrectnessStats.sh on each assembly and collect its metrics.

    Compiles the aligner and GAGE's Java classes, runs GAGE for all assemblies
    in parallel via joblib, then scans each 'gage_<label>.stdout' log for the
    known metric labels (in order) and stores the values under the matching
    reporting fields. Finally writes the GAGE report via reporting.save_gage().

    :param ref_fpath: reference FASTA path
    :param contigs_fpaths: list of assembly FASTA paths
    :param output_dirpath: QUAST output directory (a 'gage' subdir is created)
    :return: None; errors are reported through the logger
    """
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    # suffixes for files with report tables in plain text and tab separated formats
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(gage_dirpath, 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    # Metric labels as they appear, in order, in GAGE's stdout. Some labels
    # ('Total units', 'Min', 'Max', 'N50') repeat because GAGE prints two
    # sections (raw and corrected assembly stats); the parser below walks this
    # list strictly in order, so duplicates map to distinct reporting fields.
    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs',
               'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
                            reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
                            reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYBASES, reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
                            reporting.Fields.GAGE_DUPREFBASES, reporting.Fields.GAGE_COMPRESSEDREFBASES,
                            reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS,
                            reporting.Fields.GAGE_LONGINDELS, reporting.Fields.GAGE_INVERSIONS,
                            reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
                            reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE,
                            reporting.Fields.GAGE_MINCORCONTIG, reporting.Fields.GAGE_MAXCORCOTING,
                            reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    if not compile_aligner(logger) or (not all_required_java_classes_exist(gage_dirpath) and not compile_gage()):
        logger.error('GAGE module was not installed properly, so it is disabled and you cannot use --gage.')
        return

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(
        delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    if 0 not in return_codes:
        logger.error('Error occurred while GAGE was processing assemblies.'
                     ' See GAGE error logs for details: %s' %
                     os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(
            gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
        # 'with' guarantees the log is closed on every path, including the
        # early 'break' (the old explicit open()/close() pair did not).
        with open(log_out_fpath, 'r') as logfile_out:
            cur_metric_id = 0
            for line in logfile_out:
                if metrics[cur_metric_id] in line:
                    # N50 lines need a label-specific split: a plain ':' split
                    # would cut at the wrong colon.
                    if metrics[cur_metric_id].startswith('N50'):
                        report.add_field(metrics_in_reporting[cur_metric_id],
                                         line.split(metrics[cur_metric_id] + ':')[1].strip())
                    else:
                        report.add_field(metrics_in_reporting[cur_metric_id],
                                         line.split(':')[1].strip())
                    cur_metric_id += 1
                    if cur_metric_id == len(metrics):
                        break

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def do(contigs_fpaths, output_dir, logger):
    """Run BUSCO on each assembly and store completeness metrics in the report.

    Downloads/compiles the dependencies (Augustus, BLAST binaries) and the
    BUSCO lineage database, runs BUSCO for all assemblies in parallel, then
    parses each short-summary file to fill the BUSCO_COMPLETE / BUSCO_PART
    report fields (percentages of complete / fragmented BUSCO genes).
    Returns None; failures are reported through the logger.
    """
    logger.print_timestamp()
    logger.info('Running BUSCO...')
    compilation_success = True

    # Augustus is required by BUSCO: a missing download or failed build
    # disables the whole analysis.
    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger):
        compilation_success = False
    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return
    set_augustus_dir(augustus_dirpath)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    # Split available threads across simultaneous per-assembly BUSCO jobs.
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    log_fpath = join(output_dir, 'busco.log')
    logger.info('Logging to ' + log_fpath + '...')

    # One BUSCO command line per assembly; all runs happen in parallel.
    busco_args = [([
        '-i', contigs_fpath, '-o', qutils.label_from_fpath_for_fname(contigs_fpath),
        '-l', clade_dirpath, '-m', 'genome', '-f', '-z',
        '-c', str(busco_threads), '-t', tmp_dir,
        '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' + join(augustus_dirpath, 'config') + '\''
    ], output_dir) for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See ' + log_fpath + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)

        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            # Parse BUSCO's short-summary file: the count is the first token
            # on lines containing these labels.
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' % (float(part_buscos) * 100.0 / total_buscos)))
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath + '. See ' + log_fpath + ' for information.')
    logger.info('Done.')
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Assess assembly completeness and scaffolding accuracy via unique k-mers.

    For each assembly, computes the fraction of the reference's unique
    KMERS_LEN-mers it contains (KMER_COMPLETENESS), and — for assemblies that
    are not too fragmented — classifies contig length by whether a contig's
    marker k-mers map to one, several, or no reference chromosome.
    Results are written into the per-assembly report and cached via
    create_kmc_stats_file so later runs can reuse them.

    Arguments:
        output_dir: directory for KMC logs, stats files, and a 'tmp' subdir.
        ref_fpath: reference genome FASTA path.
        contigs_fpaths: list of assembly FASTA paths.
        logger: project logger.
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    # Reuse cached results where a previous successful run left a .stat file.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            # NOTE(review): file handle is never closed, and split('\n') always
            # yields at least one element, so the `< 1` guard can never fire.
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if len(stats_content) < 1:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            # Line 0 of the stats file carries completeness as 'key: value'.
            report.add_field(reporting.Fields.KMER_COMPLETENESS,
                             '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            # Lines 1-4 (optional) carry the scaffolding-accuracy breakdown.
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                                 '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                                 '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                                 '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    # Only assemblies without cached results remain to be processed.
    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath):
        logger.warning(' Sorry, can\'t run KMC on this platform, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    # Truncate any logs left over from a previous run.
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()
    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    # count_kmers/get_kmers_cnt presumably wrap the external KMC binaries;
    # an empty/failed reference count aborts the whole analysis.
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    logger.info('Analyzing assemblies completeness...')
    # Completeness = % of reference unique k-mers found in the assembly.
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    # Marker k-mers = k-mers shared by the reference and ALL assemblies.
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    # Downsample markers more aggressively for large references (>500 MiB).
    kmer_fraction = 100 if getsize(ref_fpath) < 500 * 1024 ** 2 else 1000
    shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, shared_kmc_db, log_fpath, err_fpath, kmer_fraction=kmer_fraction)
    # NOTE(review): shared_kmers_by_chrom is filled below but never read again
    # in this function — looks like dead state; confirm before removing.
    shared_kmers_by_chrom = dict()
    shared_kmers_fpath = join(tmp_dirpath, 'shared_kmers.txt')
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    # Dump the downsampled marker k-mers to FASTA so KMC can re-count them.
    with open(shared_kmers_fpath, 'w') as out_f:
        for name, seq in ref_contigs.items():
            seq_kmers = get_string_kmers(tmp_dirpath, log_fpath, err_fpath, seq=seq, intersect_with=shared_downsampled_kmc_db)
            for kmer_i, kmer in enumerate(seq_kmers):
                shared_kmers_by_chrom[str(kmer)] = name
                out_f.write('>' + str(kmer_i) + '\n')
                out_f.write(kmer + '\n')
    shared_kmc_db = count_kmers(tmp_dirpath, shared_kmers_fpath, log_fpath, err_fpath)

    # Build one marker-kmer database per reference chromosome/contig.
    ref_kmc_dbs = []
    for ref_name, ref_seq in ref_contigs.items():
        ref_contig_fpath = join(tmp_dirpath, ref_name + '.fa')
        if not is_non_empty_file(ref_contig_fpath):
            with open(ref_contig_fpath, 'w') as out_f:
                out_f.write(ref_seq)
        ref_kmc_db = count_kmers(tmp_dirpath, ref_contig_fpath, log_fpath, err_fpath)
        ref_shared_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, shared_kmc_db], log_fpath, err_fpath)
        ref_kmc_dbs.append((ref_name, ref_shared_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        # None means "accuracy not assessed" (too fragmented); the values are
        # passed through to create_kmc_stats_file either way.
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)
            if len(seq) >= MIN_CONTIGS_LEN:
                long_contigs.append(len(seq))

        # Skip accuracy assessment when either side is too fragmented for the
        # per-contig marker intersections to be meaningful (or affordable).
        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_kmc_dbs) > MAX_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            for name, seq in read_fasta(contigs_fpath):
                if len(seq) < MIN_CONTIGS_LEN:
                    continue
                tmp_contig_fpath = join(tmp_dirpath, name + '.fa')
                with open(tmp_contig_fpath, 'w') as out_tmp_f:
                    out_tmp_f.write(seq)
                contig_kmc_db = count_kmers(tmp_dirpath, tmp_contig_fpath, log_fpath, err_fpath)
                intersect_all_ref_kmc_db = intersect_kmers(tmp_dirpath, [contig_kmc_db, shared_kmc_db], log_fpath, err_fpath)
                kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_all_ref_kmc_db, log_fpath, err_fpath)
                # Too few marker k-mers -> contig gives no reliable signal.
                if kmers_cnt < MIN_MARKERS:
                    continue
                # Record every reference chromosome this contig shares markers with.
                for ref_name, ref_kmc_db in ref_kmc_dbs:
                    intersect_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, intersect_all_ref_kmc_db], log_fpath, err_fpath)
                    kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_kmc_db, log_fpath, err_fpath)
                    if kmers_cnt:
                        contig_markers[name].append(ref_name)
            # Classify contig length: exactly one chromosome vs. several;
            # everything unclassified falls into the "none" bucket.
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                             '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                             '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                             '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
        # Cache results so future runs can take the fast path at the top.
        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                              report.get_field(reporting.Fields.KMER_COMPLETENESS),
                              len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')