def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_path,
                 cov_fpath, physical_cov_fpath, correct_chr_names):
    """Build the read-coverage and physical-coverage files for a reference.

    Every intermediate (sorted BAM, raw coverage file) and every final output
    is reused when it already exists and is non-empty.

    Returns:
        The tuple ``(cov_fpath, physical_cov_fpath)`` exactly as passed in.
    """
    raw_cov_fpath = cov_fpath + '_raw'
    chr_len_fpath = get_chr_len_fpath(ref_fpath, correct_chr_names)
    if not is_non_empty_file(cov_fpath):
        logger.info(' Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                # Context managers guarantee the log handles are closed once
                # the subprocess call has returned.
                with open(log_path, 'a') as log_f:
                    with open(err_path, 'a') as err_f:
                        qutils.call_subprocess(
                            [sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads),
                             '-o', bam_sorted_fpath, bam_fpath],
                            stdout=log_f, stderr=err_f, logger=logger)
            with open(raw_cov_fpath, 'w') as raw_cov_f:
                with open(err_path, 'a') as err_f:
                    qutils.call_subprocess(
                        [bedtools_fpath('bedtools'), 'genomecov', '-bga', '-ibam', bam_sorted_fpath,
                         '-g', chr_len_fpath],
                        stdout=raw_cov_f, stderr=err_f, logger=logger)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names)
    if not is_non_empty_file(physical_cov_fpath):
        raw_cov_fpath = get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path,
                                              err_path, physical_cov_fpath, chr_len_fpath)
        # get_physical_coverage returns None when bamToBed is unavailable;
        # there is no raw file to post-process in that case.
        if raw_cov_fpath is not None:
            proceed_cov_file(raw_cov_fpath, physical_cov_fpath, correct_chr_names)
    return cov_fpath, physical_cov_fpath
def parallel_blast(contigs_fpath, label, corrected_dirpath, err_fpath, blast_res_fpath, blast_check_fpath,
                   blast_threads):
    """Run blastn for one assembly and record a checksum marker file.

    Compressed assemblies (by file extension) are first unpacked into
    ``corrected_dirpath``; the unpacked copy is used as the BLAST query.
    The marker file stores the md5 of the original ``contigs_fpath``.
    """
    logger.info(' ' + 'processing ' + label)
    blast_query_fpath = contigs_fpath
    compress_ext = ('.gz', '.gzip', '.bz2', '.bzip2', '.zip')
    if contigs_fpath.endswith(compress_ext):
        logger.info(' ' + 'unpacking ' + label)
        unpacked_fpath = os.path.join(
            corrected_dirpath, os.path.basename(contigs_fpath) + '.unpacked')
        with _get_fasta_file_handler(contigs_fpath) as f_in:
            with open(unpacked_fpath, 'w') as f_out:
                for line in f_in:
                    f_out.write(line)
        blast_query_fpath = unpacked_fpath
    res_fpath = get_blast_output_fpath(blast_res_fpath, label)
    check_fpath = get_blast_output_fpath(blast_check_fpath, label)
    # Build the argument list directly so that paths containing spaces or
    # quotes are passed through intact (string + shlex.split would split them).
    cmd = [get_blast_fpath('blastn'), '-query', blast_query_fpath, '-db', db_fpath,
           '-outfmt', '7', '-num_threads', str(blast_threads)]
    with open(res_fpath, 'w') as res_f:
        with open(err_fpath, 'a') as err_f:
            qutils.call_subprocess(cmd, stdout=res_f, stderr=err_f, logger=logger)
    logger.info(' ' + 'BLAST results for %s are saved to %s...' % (label, res_fpath))
    with open(check_fpath, 'w') as check_file:
        check_file.write('Assembly: %s md5 checksum: %s\n' % (contigs_fpath, md5(contigs_fpath)))
def merge_bed(repeats_fpath, uncovered_fpath, insert_size, output_dirpath, err_path):
    """Combine repeat and uncovered regions into one sorted, merged BED file.

    Repeat records are kept only when ``end - start`` (columns 3 and 2) is at
    least ``insert_size``; uncovered records are copied unchanged. Either
    input may be missing.

    Returns:
        Path of the merged BED file produced by ``bedtools merge``.
    """
    combined_bed_fpath = join(output_dirpath, 'skipped_regions.bed')
    with open(combined_bed_fpath, 'w') as out:
        if exists(repeats_fpath):
            with open(repeats_fpath) as in_f:
                for line in in_f:
                    fields = line.split('\t')
                    repeat_len = int(fields[2]) - int(fields[1])
                    if repeat_len >= insert_size:
                        out.write(line)
        if exists(uncovered_fpath):
            with open(uncovered_fpath) as in_f:
                for line in in_f:
                    out.write(line)

    sorted_bed_fpath = add_suffix(combined_bed_fpath, 'sorted')
    with open(sorted_bed_fpath, 'w') as sorted_f:
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', combined_bed_fpath],
                                   stdout=sorted_f, stderr=err_f, logger=logger)

    merged_bed_fpath = add_suffix(combined_bed_fpath, 'merged')
    with open(merged_bed_fpath, 'w') as merged_f:
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess([bedtools_fpath('bedtools'), 'merge', '-i', sorted_bed_fpath],
                                   stdout=merged_f, stderr=err_f, logger=logger)
    return merged_bed_fpath
def get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path, err_path, cov_fpath,
                          chr_len_fpath):
    """Compute the raw physical (fragment-level) coverage file.

    Pipeline: filter the BAM, sort it by read name, convert to BEDPE, reduce
    each pair to one interval (columns 1, 2 and 6 of the BEDPE record), sort
    the intervals and run ``bedtools genomecov``. The whole pipeline is
    skipped when the raw coverage file already exists and is non-empty.

    Returns:
        Path ``cov_fpath + '_raw'``, or ``None`` when the ``bamToBed``
        executable is not found.
    """
    if not os.path.exists(bedtools_fpath('bamToBed')):
        logger.info(' Failed calculating physical coverage...')
        return None

    raw_cov_fpath = cov_fpath + '_raw'
    if not is_non_empty_file(raw_cov_fpath):
        logger.info(' Calculating physical coverage...')
        ## keep properly mapped, unique, and non-duplicate read pairs only
        bam_filtered_fpath = os.path.join(output_dirpath, ref_name + '.filtered.bam')
        with open(bam_filtered_fpath, 'w') as out_f:
            with open(err_path, 'a') as err_f:
                qutils.call_subprocess(
                    [sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                     '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath],
                    stdout=out_f, stderr=err_f, logger=logger)
        ## sort by read names
        bam_filtered_sorted_fpath = os.path.join(output_dirpath, ref_name + '.filtered.sorted.bam')
        with open(log_path, 'a') as log_f:
            with open(err_path, 'a') as err_f:
                qutils.call_subprocess(
                    [sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads),
                     '-o', bam_filtered_sorted_fpath, '-n', bam_filtered_fpath],
                    stdout=log_f, stderr=err_f, logger=logger)
        bedpe_fpath = os.path.join(output_dirpath, ref_name + '.bedpe')
        with open(bedpe_fpath, 'w') as out_f:
            with open(err_path, 'a') as err_f:
                qutils.call_subprocess(
                    [bedtools_fpath('bamToBed'), '-i', bam_filtered_sorted_fpath, '-bedpe'],
                    stdout=out_f, stderr=err_f, logger=logger)
        raw_bed_fpath = os.path.join(output_dirpath, ref_name + '.bed')
        with open(bedpe_fpath, 'r') as bedpe:
            with open(raw_bed_fpath, 'w') as bed_file:
                for line in bedpe:
                    fs = line.split()
                    bed_file.write('\t'.join([fs[0], fs[1], fs[5] + '\n']))
        sorted_bed_fpath = os.path.join(output_dirpath, ref_name + '.sorted.bed')
        with open(sorted_bed_fpath, 'w') as out_f:
            with open(err_path, 'a') as err_f:
                qutils.call_subprocess(
                    [bedtools_fpath('bedtools'), 'sort', '-i', raw_bed_fpath],
                    stdout=out_f, stderr=err_f, logger=logger)
        with open(raw_cov_fpath, 'w') as out_f:
            with open(err_path, 'a') as err_f:
                qutils.call_subprocess(
                    [bedtools_fpath('bedtools'), 'genomecov', '-bga', '-i', sorted_bed_fpath,
                     '-g', chr_len_fpath],
                    stdout=out_f, stderr=err_f, logger=logger)
    return raw_cov_fpath
def draw_mummer_plot(logger, nucmer_fpath, delta_fpath, index, log_out_f, log_err_f):
    """Render an HTML MUMmer plot for one alignment.

    Runs ``mummerplot``; on success, and when both the generated ``.gp``
    script and the gnuplot executable exist, runs gnuplot and post-processes
    the resulting ``.html`` with ``_embed_css_and_scripts``. The final page
    is written two directory levels above ``nucmer_fpath``. A notice is
    logged when the final file does not exist.
    """
    import os  # local import: only needed for the portable null device

    output_dirpath = dirname(dirname(nucmer_fpath))
    mummer_plot_fpath = join(output_dirpath, basename(nucmer_fpath) + '_mummerplot.html')
    return_code = qutils.call_subprocess(
        [bin_fpath('mummerplot'), '--html', '--layout', '-p', nucmer_fpath, delta_fpath],
        stdout=log_out_f,
        stderr=log_err_f,
        indent=' ' + qutils.index_to_str(index))
    if return_code == 0:
        plot_script_fpath = nucmer_fpath + '.gp'
        temp_plot_fpath = nucmer_fpath + '.html'
        if isfile(plot_script_fpath) and isfile(gnuplot_exec_fpath()):
            # gnuplot's stdout is discarded; os.devnull avoids hard-coding
            # '/dev/null' and the handle is closed afterwards.
            with open(os.devnull, 'w') as devnull:
                qutils.call_subprocess(
                    [gnuplot_exec_fpath(), plot_script_fpath],
                    stdout=devnull, stderr=log_err_f,
                    indent=' ' + qutils.index_to_str(index))
            if isfile(temp_plot_fpath):
                with open(temp_plot_fpath) as template_file:
                    html = template_file.read()
                html = _embed_css_and_scripts(html)
                with open(mummer_plot_fpath, 'w') as f_html:
                    f_html.write(html)
                logger.info(' ' + qutils.index_to_str(index) + 'MUMmer plot saved to ' + mummer_plot_fpath)

    if not isfile(mummer_plot_fpath):
        logger.notice(qutils.index_to_str(index) + ' MUMmer plot cannot be created.\n')
def bwa_index(ref_fpath, err_path, logger):
    """Build the BWA index for ``ref_fpath`` unless ``<ref>.bwt`` already exists.

    Both stdout and stderr of ``bwa index`` are appended to ``err_path``.
    """
    cmd = [bwa_fpath('bwa'), 'index', '-p', ref_fpath, ref_fpath]
    if getsize(ref_fpath) > 2 * 1024**3:  # if reference size bigger than 2GB
        cmd += ['-a', 'bwtsw']
    if not is_non_empty_file(ref_fpath + '.bwt'):
        # One shared append handle for both streams, closed when the call returns.
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess(cmd, stdout=err_f, stderr=err_f, logger=logger)
def sort_bam(bam_fpath, sorted_bam_fpath, err_path, logger, threads=None, sort_rule=None):
    """Sort a BAM file with ``sambamba sort``.

    Args:
        threads: thread count; falls back to ``qconfig.max_threads`` when falsy.
        sort_rule: optional extra command-line flag appended to the command
            (callers in this file pass ``'-n'``).

    The memory limit is a quarter of the value returned by
    ``get_total_memory()``, clamped to the range 2..100 and suffixed with
    ``GB``. Temporary files go to the directory of ``sorted_bam_fpath``.
    """
    if not threads:
        threads = qconfig.max_threads
    mem = '%dGB' % min(100, max(2, get_total_memory() // 4))
    cmd = [sambamba_fpath('sambamba'), 'sort', '-t', str(threads), '--tmpdir', dirname(sorted_bam_fpath),
           '-m', mem, '-o', sorted_bam_fpath, bam_fpath]
    if sort_rule:
        cmd += [sort_rule]
    with open(err_path, 'a') as err_f:
        qutils.call_subprocess(cmd, stderr=err_f, logger=logger)
def check_repeats_instances(coords_fpath, repeats_fpath, use_long_reads=False):
    """Filter candidate repeats by how well their other alignments cover them.

    Reads tab-separated alignment records from ``coords_fpath`` and the
    candidate repeats from ``repeats_fpath`` (whitespace-separated: name,
    start, end). A repeat is written to the ``filtered`` file when the merged
    alignment intervals recorded for it (all but the first one) cover at
    least 90% of ``end - start``. The filtered file is then sorted with the
    external ``sort`` command.

    Returns:
        Tuple ``(sorted_repeats_fpath, repeats_regions)`` where
        ``repeats_regions`` maps a sequence name to a list of
        ``(start, end)`` tuples.
    """
    # query name -> list of (align_start, align_end) on the query
    query_instances = defaultdict(list)
    with open(coords_fpath) as f:
        for line in f:
            # NOTE(review): column layout looks like minimap2 PAF -- confirm.
            fs = line.split('\t')
            contig, align_start, align_end, strand, ref_name, ref_start = \
                fs[0], fs[2], fs[3], fs[4], fs[5], fs[7]
            align_start, align_end, ref_start = map(int, (align_start, align_end, ref_start))
            # presumably converts 0-based starts to 1-based -- TODO confirm
            align_start += 1
            ref_start += 1
            matched_bases, bases_in_mapping = map(int, (fs[9], fs[10]))
            # keep only alignments with more matched bases than the configured insert size
            if matched_bases > qconfig.optimal_assembly_insert_size:
                query_instances[contig].append((align_start, align_end))
    repeats_regions = defaultdict(list)
    filtered_repeats_fpath = add_suffix(repeats_fpath, 'filtered')
    with open(filtered_repeats_fpath, 'w') as out_f:
        with open(repeats_fpath) as f:
            for line in f:
                fs = line.split()
                # alignment query names are expected in the "name:start-end" form
                query_id = '%s:%s-%s' % (fs[0], fs[1], fs[2])
                if query_id in query_instances and len(query_instances[query_id]) > 1:
                    # The first recorded alignment is skipped (presumably the
                    # repeat mapping onto its own locus -- TODO confirm);
                    # the rest are de-duplicated and sorted.
                    mapped_repeats = sorted(list(set(query_instances[query_id][1:])))
                    # merge overlapping intervals in a single sweep
                    merged_intervals = []
                    i_start, i_end = mapped_repeats[0]
                    merged_interval = (i_start, i_end)
                    for s, e in mapped_repeats[1:]:
                        if s <= merged_interval[1]:
                            merged_interval = (merged_interval[0], max(merged_interval[1], e))
                        else:
                            merged_intervals.append(merged_interval)
                            merged_interval = (s, e)
                    merged_intervals.append(merged_interval)
                    aligned_bases = sum([end - start + 1 for start, end in merged_intervals])
                    # require the merged alignments to cover at least 90% of the repeat
                    if aligned_bases >= (int(fs[2]) - int(fs[1])) * 0.9:
                        if use_long_reads and len(mapped_repeats) > 1:
                            solid_repeats = []
                            full_repeat_pos = int(fs[1])
                            # process intervals from the largest end coordinate down
                            mapped_repeats.sort(key=lambda x: (x[1], x[1] - x[0]), reverse=True)
                            cur_repeat_start, cur_repeat_end = mapped_repeats[0]
                            for repeat_start, repeat_end in mapped_repeats[1:]:
                                # fuse intervals when one contains the other within
                                # a REPEAT_CONF_INTERVAL tolerance on both sides
                                if (cur_repeat_start >= repeat_start - REPEAT_CONF_INTERVAL and cur_repeat_end <= repeat_end + REPEAT_CONF_INTERVAL) or \
                                        (repeat_start >= cur_repeat_start - REPEAT_CONF_INTERVAL and repeat_end <= cur_repeat_end + REPEAT_CONF_INTERVAL):
                                    cur_repeat_start, cur_repeat_end = min(repeat_start, cur_repeat_start), max(repeat_end, cur_repeat_end)
                                else:
                                    solid_repeats.append((cur_repeat_start, cur_repeat_end))
                                    cur_repeat_start, cur_repeat_end = repeat_start, repeat_end
                            solid_repeats.append((cur_repeat_start, cur_repeat_end))
                            # interval coordinates are offset by the repeat's own start
                            for repeat in solid_repeats:
                                out_f.write('\t'.join((fs[0], str(repeat[0] + full_repeat_pos), str(repeat[1] + full_repeat_pos))) + '\n')
                                repeats_regions[fs[0]].append((repeat[0] + full_repeat_pos, repeat[1] + full_repeat_pos))
                        else:
                            out_f.write(line)
                            repeats_regions[fs[0]].append((int(fs[1]), int(fs[2])))
    sorted_repeats_fpath = add_suffix(repeats_fpath, 'sorted')
    # NOTE(review): the stdout handle opened here is never explicitly closed.
    qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', filtered_repeats_fpath], stdout=open(sorted_repeats_fpath, 'w'), logger=logger)
    return sorted_repeats_fpath, repeats_regions
def sambamba_view(in_fpath, out_fpath, max_threads, err_fpath, logger, filter_rule=None):
    """Run ``sambamba view`` (with header) from ``in_fpath`` into ``out_fpath``.

    ``-S`` is added when the input name ends with ``.sam``; ``-f bam`` when
    the output name ends with ``.bam``; ``-F <filter_rule>`` when a filter is
    given. stderr is appended to ``err_fpath``.
    """
    cmd = [sambamba_fpath('sambamba'), 'view', '-t', str(max_threads), '-h']
    if in_fpath.endswith('.sam'):
        cmd += ['-S']
    if out_fpath.endswith('.bam'):
        cmd += ['-f', 'bam']
    if filter_rule:
        cmd += ['-F', filter_rule]
    cmd.append(in_fpath)
    with open(out_fpath, 'w') as out_f:
        with open(err_fpath, 'a') as err_f:
            qutils.call_subprocess(cmd, stdout=out_f, stderr=err_f, logger=logger)
def run(contigs_fpath, gff_fpath, log_fpath, threads, kingdom):
    """Run barrnap on ``contigs_fpath``, writing its stdout to ``gff_fpath``.

    Does nothing when ``gff_fpath`` already exists and is non-empty.
    stderr is appended to ``log_fpath``.
    """
    barrnap_fpath = join(qconfig.LIBS_LOCATION, 'barrnap', 'bin', 'barrnap')
    if is_non_empty_file(gff_fpath):
        return
    with open(gff_fpath, 'w') as gff_f:
        with open(log_fpath, 'a') as log_f:
            call_subprocess(
                [barrnap_fpath, '--quiet', '-k', kingdom, '--threads', str(threads), contigs_fpath],
                stdout=gff_f, stderr=log_f)
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath):
    """Merge per-library alignments into one BAM and export it as SAM.

    For each non-empty SAM in ``tmp_sam_fpaths`` the sibling BAM (same path
    with ``.sam`` replaced by ``.bam``) is sorted unless a sorted copy already
    exists. The sorted BAMs are merged into ``bam_fpath``, which is then
    converted to ``sam_fpath``.

    Returns:
        ``sam_fpath``.
    """
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if is_non_empty_file(tmp_sam_fpath):
            tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
            tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
            if not is_non_empty_file(tmp_bam_sorted_fpath):
                sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
            tmp_bam_fpaths.append(tmp_bam_sorted_fpath)
    with open(err_fpath, 'a') as err_f:
        qutils.call_subprocess(
            [sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), bam_fpath] + tmp_bam_fpaths,
            stderr=err_f, logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return sam_fpath
def get_correct_names_for_chroms(output_dirpath, ref_fpath, sam_fpath, err_path, reads_fpaths):
    """Map sequence names in a SAM header to the names used in the reference.

    The SAM header is extracted with ``sambamba view -H`` and its ``@SQ``
    records are compared pairwise, in iteration order, with the reference
    sequences: the corrected SAM name must be a prefix of the reference name
    and the lengths must be equal.

    Returns:
        Dict ``{sam_name: ref_name}``, or ``None`` when the number, lengths
        or names of the sequences do not match. A mismatch is logged as a
        warning when ``reads_fpaths`` is truthy, otherwise as an error.
    """
    correct_chr_names = dict()
    ref_chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    sam_chr_lengths = dict()
    sam_header_fpath = os.path.join(output_dirpath, os.path.basename(sam_fpath) + '.header')
    with open(sam_header_fpath, 'w') as header_f:
        # NOTE: 'w' mode is kept from the original, so err_path is truncated here.
        with open(err_path, 'w') as err_f:
            qutils.call_subprocess(
                [sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                stdout=header_f, stderr=err_f, logger=logger)
    # Raw strings: '\S' and '\d' are not valid escapes in a normal literal.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(ref_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        for ref_chr, sam_chr in zip(ref_chr_lengths.keys(), sam_chr_lengths.keys()):
            same_length = sam_chr_lengths[sam_chr] == ref_chr_lengths[ref_chr]
            if correct_name(sam_chr) == ref_chr[:len(sam_chr)] and same_length:
                correct_chr_names[sam_chr] = ref_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break
    if inconsistency:
        if reads_fpaths:
            logger.warning(
                inconsistency + ' in reference and SAM file do not match. ' +
                'QUAST will try to realign reads to the reference genome.')
        else:
            logger.error(
                inconsistency + ' in reference and SAM file do not match. ' +
                'Use SAM file obtained by aligning reads to the reference genome.')
        return None
    return correct_chr_names
def align_kmers(output_dir, ref_fpath, kmers_fpath, log_err_fpath, max_threads):
    """Align k-mers to the reference with minimap2 and collect their positions.

    The aligner output is written to ``<output_dir>/kmers.coords``. Lines
    with fewer than 10 tab-separated fields are skipped; for the rest,
    field 1 is parsed as an integer k-mer id, field 3 as the target name and
    field 4 as an integer position.

    Returns:
        Tuple ``(kmers_by_chrom, kmers_pos_by_chrom)`` of dicts mapping a
        target name to parallel lists of k-mer ids and positions.
    """
    out_fpath = join(output_dir, 'kmers.coords')
    cmdline = [minimap_fpath(), '-ax', 'sr', '-s202', '--frag=no', '-t', str(max_threads),
               ref_fpath, kmers_fpath]
    with open(out_fpath, 'w') as out_f:
        with open(log_err_fpath, 'a') as err_f:
            qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f, indent=' ')
    kmers_pos_by_chrom = defaultdict(list)
    kmers_by_chrom = defaultdict(list)
    with open(out_fpath) as f:
        for line in f:
            fs = line.split('\t')
            if len(fs) < 10:
                continue
            contig, chrom, pos = fs[0], fs[2], fs[3]
            kmers_pos_by_chrom[chrom].append(int(pos))
            kmers_by_chrom[chrom].append(int(contig))
    return kmers_by_chrom, kmers_pos_by_chrom
def connect_with_matepairs(bam_fpath, output_dirpath, err_fpath):
    """Collect the intervals spanned by properly paired reads.

    The BAM is filtered, sorted by read name, converted to BED via
    ``bam_to_bed`` and parsed.

    Returns:
        Dict mapping the first BED column to a list of ``(start, end)``
        integer tuples taken from columns 2 and 3.
    """
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    with open(bam_filtered_fpath, 'w') as out_f:
        with open(err_fpath, 'a') as err_f:
            qutils.call_subprocess(
                [sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                 '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath],
                stdout=out_f, stderr=err_f, logger=logger)
    ## sort by read names
    bam_filtered_sorted_fpath = add_suffix(bam_filtered_fpath, 'sorted')
    sort_bam(bam_filtered_fpath, bam_filtered_sorted_fpath, err_fpath, logger, sort_rule='-n')
    bed_fpath = bam_to_bed(output_dirpath, 'matepairs', bam_filtered_sorted_fpath, err_fpath, logger,
                           bedpe=True, only_intervals=True)
    matepair_regions = defaultdict(list)
    with open(bed_fpath) as bed:
        for l in bed:
            fs = l.split()
            matepair_regions[fs[0]].append((int(fs[1]), int(fs[2])))
    return matepair_regions
def do(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, gc_fpath, features_containers, cov_fpath,
       output_dir, logger):
    """Create the Circos configuration and, if Circos is installed, the plot.

    Returns:
        Tuple ``(circos_png_fpath, circos_legend_fpath)`` on success, or
        ``(None, None)`` when Circos is not installed, exits with a non-zero
        code, or does not produce a non-empty image.
    """
    if not exists(output_dir):
        os.makedirs(output_dir)
    conf_fpath, circos_legend_fpath = create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern,
                                                  output_dir, gc_fpath, features_containers, cov_fpath,
                                                  logger)
    circos_exec = get_path_to_program('circos')
    if not circos_exec:
        logger.warning(
            'Circos is not installed!\n'
            'If you want to create Circos plots, install Circos as described at http://circos.ca/tutorials/lessons/configuration/distribution_and_installation '
            'and run the following command:\n\tcircos -conf ' + conf_fpath + '\n'
            'The plot legend is saved to ' + circos_legend_fpath + '\n')
        return None, None

    cmdline = [circos_exec, '-conf', conf_fpath]
    log_fpath = join(output_dir, 'circos.log')
    err_fpath = join(output_dir, 'circos.err')
    circos_png_fpath = join(output_dir, circos_png_fname)
    with open(log_fpath, 'w') as log_f:
        with open(err_fpath, 'w') as err_f:
            return_code = qutils.call_subprocess(cmdline, stdout=log_f, stderr=err_f)
    if return_code == 0 and is_non_empty_file(circos_png_fpath):
        return circos_png_fpath, circos_legend_fpath
    else:
        logger.warning(' Circos diagram was not created. See ' + log_fpath + ' and ' + err_fpath +
                       ' for details')
        return None, None
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE shell script for one assembly and log the outcome.

    stdout and stderr of the script go to ``gage_<label>.stdout`` and
    ``gage_<label>.stderr`` inside ``gage_results_dirpath``.

    Returns:
        The return code reported by ``qutils.call_subprocess``.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stderr')
    logger.info(' ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')
    # Context managers close both logs even if the call raises.
    with open(log_out_fpath, 'w') as log_out_f:
        with open(log_err_fpath, 'w') as log_err_f:
            return_code = qutils.call_subprocess(
                ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath), reference,
                 contigs_fpath, tmp_dir, str(qconfig.min_contig)],
                stdout=log_out_f,
                stderr=log_err_f,
                indent=' ' + qutils.index_to_str(i),
                only_if_debug=False)
            if return_code != 0:
                logger.info(' ' + qutils.index_to_str(i) + 'Failed.')
            else:
                logger.info(' ' + qutils.index_to_str(i) + 'Done.')
    return return_code
def compile_gage(only_clean=False):
    """Compile (or clean) the GAGE Java classes.

    Args:
        only_clean: when true, only remove the existing ``.class`` files.

    Returns:
        ``True`` after cleaning or a successful compilation; ``False`` when
        ``javac`` is not found, any compilation returns non-zero, or the
        required classes are still missing afterwards.
    """
    if only_clean:
        for required_name in required_java_fnames:
            fpath = os.path.join(gage_dirpath, required_name + '.class')
            if os.path.isfile(fpath):
                os.remove(fpath)
        return True

    javac_path = get_path_to_program('javac')
    if javac_path is None:
        logger.error('Java compiler not found (javac)! '
                     'Please install it or compile GAGE java classes manually (' + gage_dirpath + '/*.java)!')
        return False

    cur_dir = os.getcwd()
    os.chdir(gage_dirpath)
    try:
        # making
        logger.main_info('Compiling JAVA classes (details are in ' + os.path.join(gage_dirpath, 'make.log') +
                         ' and make.err)')
        # The logs are opened once for all classes: reopening them with 'w'
        # per class would keep only the output of the last compilation.
        with open(os.path.join(gage_dirpath, 'make.log'), 'w') as log_f:
            with open(os.path.join(gage_dirpath, 'make.err'), 'w') as err_f:
                return_codes = [
                    qutils.call_subprocess(
                        ['javac', os.path.join(gage_dirpath, java_fname + '.java')],
                        stdout=log_f, stderr=err_f)
                    for java_fname in required_java_fnames]
    finally:
        # restore the working directory even if a compilation call raises
        os.chdir(cur_dir)

    if any(return_code != 0 for return_code in return_codes) or not all_required_java_classes_exist(gage_dirpath):
        logger.error('Error occurred during compilation of java classes (' + gage_dirpath + '/*.java)! '
                     'Try to compile it manually. ' + ('You can restart Quast with the --debug flag '
                     'to see the command line.' if not qconfig.debug else ''))
        return False
    return True
def run_minimap_agb(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):
    """Run minimap2 with the ``asm20`` preset, writing alignments to ``out_fpath``.

    ``--mask-level`` is ``1`` when ``qconfig.min_IDY`` is below 95 and
    ``0.9`` otherwise. stderr is appended to ``log_err_fpath``.

    Returns:
        The return code reported by ``qutils.call_subprocess``.
    """
    # run minimap2 for AGB
    mask_level = '1' if qconfig.min_IDY < 95 else '0.9'
    cmdline = [minimap_fpath(), '-cx', 'asm20', '--mask-level', mask_level, '-N', '100',
               '--score-N', '0', '-E', '1,0', '-f', '200', '--cs', '-t', str(max_threads),
               ref_fpath, contigs_fpath]
    with open(out_fpath, 'w') as out_f:
        with open(log_err_fpath, 'a') as err_f:
            return_code = qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f,
                                                 indent=' ' + qutils.index_to_str(index))
    return return_code
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run GeneMark-ES (``gmes_petap.pl``) and parse the predicted genes.

    The working directory is ``tmp_dirpath`` with the FASTA name appended
    directly (no separator is inserted here); it is created when missing.
    Both stdout and stderr of the tool are written to ``err_fpath``.

    Returns:
        List of genes parsed from every file whose name ends with ``gtf``
        found while walking the working directory, or ``None`` when the tool
        returns a non-zero code.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    with open(err_fpath, 'w') as err_file:
        tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)
        return_code = qutils.call_subprocess(
            ['perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads),
             '--sequence', fasta_fpath, '--out', tmp_dirpath] +
            (['--fungus'] if qconfig.is_fungus else []),
            stdout=err_file, stderr=err_file, indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        return None
    genes = []
    fnames = [fname for (path, dirs, files) in os.walk(tmp_dirpath) for fname in files]
    for fname in fnames:
        if fname.endswith('gtf'):
            # NOTE(review): names come from a recursive walk but are joined
            # with the top-level directory -- confirm nested files cannot occur.
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
def run_minimap(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):
    """Align contigs to the reference with minimap2.

    Delegates to ``run_minimap_agv`` when ``qconfig.is_agv_mode`` is set.
    Otherwise the preset is ``asm5`` for ``min_IDY >= 95`` on a non-combined
    reference and ``asm10`` in all other cases; the extra tuning options are
    omitted when ``qconfig.large_genome`` is set.

    Returns:
        The return code reported by ``qutils.call_subprocess``.
    """
    if qconfig.is_agv_mode:
        return run_minimap_agv(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads)

    preset = 'asm5' if qconfig.min_IDY >= 95 and not qconfig.is_combined_ref else 'asm10'
    # -s -- min CIGAR score, -z -- affects how often to stop alignment extension, -B -- mismatch penalty
    # -O -- gap penalty, -r -- max gap size
    mask_level = '1' if qconfig.is_combined_ref else '0.9'
    num_alignments = '100' if qconfig.is_combined_ref else '50'
    additional_options = ['-B5', '-O4,16', '--no-long-join', '-r', str(qconfig.MAX_INDEL_LENGTH),
                          '-N', num_alignments, '-s', str(qconfig.min_alignment), '-z', '200']
    cmdline = [minimap_fpath(), '-c', '-x', preset] + \
              (additional_options if not qconfig.large_genome else []) + \
              ['--mask-level', mask_level, '--min-occ', '200', '-g', '2500', '--score-N', '2', '--cs',
               '-t', str(max_threads), ref_fpath, contigs_fpath]
    with open(out_fpath, 'w') as out_f:
        with open(log_err_fpath, 'a') as err_f:
            return_code = qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f,
                                                 indent=' ' + qutils.index_to_str(index))
    return return_code
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run ``gmsn.pl`` followed by ``gmhmmp`` and parse the predicted genes.

    A fresh temporary directory is created inside ``tmp_dirpath``; it is
    removed after the ``gmhmmp`` step unless ``qconfig.debug`` is set.
    ``err_fpath`` is truncated for the first step and appended to for the
    second. ``num_threads`` is accepted but not used by this function.

    Returns:
        List of parsed genes (empty when ``gmhmm_p`` reports failure), or
        ``None`` when ``gmsn.pl`` returns a non-zero code.
    """
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)

    tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
    fasta_name = qutils.name_from_fpath(fasta_fpath)
    # 'with' closes the log before it is reopened in append mode below.
    with open(err_fpath, 'w') as err_file:
        return_code = qutils.call_subprocess(
            ['perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out', tmp_dirpath, fasta_fpath],
            stdout=err_file,
            stderr=err_file,
            indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        return None

    genes = []
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
    sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
    out_fpath = sub_fasta_fpath + '.gmhmm'
    heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
    with open(err_fpath, 'a') as err_file:
        ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath, out_fpath, err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    return genes
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Predict genes by running ``gmsn.pl`` and then ``gmhmmp``.

    Work happens in a new temporary directory under ``tmp_dirpath``, which
    is deleted after the second step unless ``qconfig.debug`` is set. The
    first step truncates ``err_fpath``; the second appends to it.
    ``num_threads`` is part of the signature but is not used here.

    Returns:
        List of parsed genes (empty when ``gmhmm_p`` reports failure), or
        ``None`` when ``gmsn.pl`` returns a non-zero code.
    """
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)

    tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
    fasta_name = qutils.name_from_fpath(fasta_fpath)
    # The handle used to stay open while the same file was reopened below.
    with open(err_fpath, 'w') as err_file:
        return_code = qutils.call_subprocess(
            ['perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out', tmp_dirpath, fasta_fpath],
            stdout=err_file,
            stderr=err_file,
            indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        return None

    genes = []
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
    sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
    out_fpath = sub_fasta_fpath + '.gmhmm'
    heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
    with open(err_fpath, 'a') as err_file:
        ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath, out_fpath, err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    return genes
def compile_glimmer(logger, only_clean=False):
    """Build (or clean) the bundled GlimmerHMM executable.

    Args:
        only_clean: when true, only remove the existing executable.

    Returns:
        ``True`` after cleaning; the executable path when it exists or was
        built successfully; ``None`` when ``make`` fails or does not produce
        the executable.
    """
    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')

    if only_clean:
        if os.path.isfile(tool_exec_fpath):
            os.remove(tool_exec_fpath)
        return True

    if not os.path.isfile(tool_exec_fpath):
        # making
        logger.main_info("Compiling GlimmerHMM...")
        with open(os.path.join(tool_src_dirpath, 'make.log'), 'w') as log_f:
            with open(os.path.join(tool_src_dirpath, 'make.err'), 'w') as err_f:
                return_code = qutils.call_subprocess(
                    ['make', '-C', tool_src_dirpath],
                    stdout=log_f, stderr=err_f, indent=' ')
        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error(
                "Failed to compile GlimmerHMM (" + tool_src_dirpath +
                ")!\nTry to compile it manually or do not use --gene-finding "
                "option with --eukaryote.\nUse --debug option to see the command lines.")
            return None
    return tool_exec_fpath
def compile_gage(only_clean=False):
    """Compile (or clean) the GAGE Java classes.

    Args:
        only_clean: when true, only remove the existing ``.class`` files.

    Returns:
        ``True`` after cleaning. In every other case this variant returns
        ``None`` (after a successful compilation as well as on failure);
        failures are reported through ``logger.error``.
    """
    if only_clean:
        for required_name in required_java_fnames:
            fpath = os.path.join(gage_dirpath, required_name + '.class')
            if os.path.isfile(fpath):
                os.remove(fpath)
        return True

    javac_path = get_path_to_program('javac')
    if javac_path is None:
        logger.error('Java compiler not found (javac)! '
                     'Please install it or compile GAGE java classes manually (' + gage_dirpath + '/*.java)!')
        return

    cur_dir = os.getcwd()
    os.chdir(gage_dirpath)
    try:
        # making
        logger.main_info('Compiling JAVA classes (details are in ' + os.path.join(gage_dirpath, 'make.log') +
                         ' and make.err)')
        # Open the logs once: reopening with 'w' for each class would discard
        # the output of every compilation except the last.
        with open(os.path.join(gage_dirpath, 'make.log'), 'w') as log_f:
            with open(os.path.join(gage_dirpath, 'make.err'), 'w') as err_f:
                return_codes = [
                    qutils.call_subprocess(
                        ['javac', os.path.join(gage_dirpath, java_fname + '.java')],
                        stdout=log_f, stderr=err_f)
                    for java_fname in required_java_fnames]
    finally:
        # restore the working directory even if a compilation call raises
        os.chdir(cur_dir)

    if any(return_code != 0 for return_code in return_codes) or not all_required_java_classes_exist(gage_dirpath):
        logger.error('Error occurred during compilation of java classes (' + gage_dirpath + '/*.java)! '
                     'Try to compile it manually. ' + ('You can restart Quast with the --debug flag '
                     'to see the command line.' if not qconfig.debug else ''))
        return
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE shell script for one assembly and log the outcome.

    The file-name-safe assembly label is used both in log messages and in
    the names of the ``gage_<label>.stdout`` / ``gage_<label>.stderr`` files
    created in ``gage_results_dirpath``.

    Returns:
        The return code reported by ``qutils.call_subprocess``.
    """
    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stderr')
    logger.info(' ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')
    # Context managers close both logs even if the call raises.
    with open(log_out_fpath, 'w') as log_out_f:
        with open(log_err_fpath, 'w') as log_err_f:
            return_code = qutils.call_subprocess(
                ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath), reference,
                 contigs_fpath, tmp_dir, str(qconfig.min_contig)],
                stdout=log_out_f,
                stderr=log_err_f,
                indent=' ' + qutils.index_to_str(i),
                only_if_debug=False)
            if return_code != 0:
                logger.info(' ' + qutils.index_to_str(i) + 'Failed.')
            else:
                logger.info(' ' + qutils.index_to_str(i) + 'Done.')
    return return_code
def get_unique_covered_regions(ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath,
                               use_long_reads=False):
    """Find repeats in the reference and derive the unique covered regions.

    Steps: run the repeat masking tool (skipped when ``<ref_name>.rpt``
    already exists and is non-empty), keep repeats whose length is at least
    ``insert_size``, extract their sequences, align them back to the
    reference with minimap2, filter them with ``check_repeats_instances`` and
    pass the result to ``remove_repeat_regions``.

    Returns:
        Tuple ``(unique_covered_regions, repeats_regions)``, or
        ``(None, None)`` when the masking tool fails or its output is missing.
    """
    red_genome_dir = os.path.join(tmp_dir, 'tmp_red')
    if isdir(red_genome_dir):
        shutil.rmtree(red_genome_dir)
    os.makedirs(red_genome_dir)

    ref_name = qutils.name_from_fpath(ref_fpath)
    ref_symlink = os.path.join(red_genome_dir, ref_name + '.fa')  ## Red recognizes only *.fa files
    if os.path.islink(ref_symlink):
        os.remove(ref_symlink)
    os.symlink(ref_fpath, ref_symlink)

    logger.info(' ' + 'Running repeat masking tool...')
    repeats_fpath = os.path.join(tmp_dir, ref_name + '.rpt')
    if is_non_empty_file(repeats_fpath):
        return_code = 0
        logger.info(' ' + 'Using existing file ' + repeats_fpath + '...')
    else:
        # One shared handle for both streams: two independent 'w' handles on
        # the same file keep separate offsets and overwrite each other.
        with open(log_fpath, 'w') as log_f:
            return_code = qutils.call_subprocess(
                [binary_fpath, '-gnm', red_genome_dir, '-rpt', tmp_dir, '-frm', '2', '-min', '5'],
                stdout=log_f, stderr=log_f, indent=' ')
    if return_code == 0 and repeats_fpath and exists(repeats_fpath):
        long_repeats_fpath = os.path.join(tmp_dir, qutils.name_from_fpath(ref_fpath) + '.long.rpt')
        with open(long_repeats_fpath, 'w') as out:
            with open(repeats_fpath) as in_f:
                for line in in_f:
                    fields = line.split('\t')
                    repeat_len = int(fields[2]) - int(fields[1])
                    if repeat_len >= insert_size:
                        # the first character of each record is dropped
                        # (presumably a leading '>' marker -- TODO confirm)
                        out.write(line[1:])

        repeats_fasta_fpath = os.path.join(tmp_dir, qutils.name_from_fpath(ref_fpath) + '.fasta')
        coords_fpath = os.path.join(tmp_dir, qutils.name_from_fpath(ref_fpath) + '.rpt.coords.txt')
        if not is_non_empty_file(coords_fpath):
            # remove any existing FASTA index next to the reference before getfasta runs
            fasta_index_fpath = ref_fpath + '.fai'
            if exists(fasta_index_fpath):
                os.remove(fasta_index_fpath)
            # NOTE: 'w' mode is kept from the original, so the log is truncated here.
            with open(log_fpath, 'w') as log_f:
                qutils.call_subprocess(
                    [bedtools_fpath('bedtools'), 'getfasta', '-fi', ref_fpath, '-bed', long_repeats_fpath,
                     '-fo', repeats_fasta_fpath],
                    stderr=log_f, indent=' ')
            cmdline = [minimap_fpath(), '-c', '-x', 'asm10', '-N', '50', '--mask-level', '1',
                       '--no-long-join', '-r', '100', '-t', str(qconfig.max_threads), '-z', '200',
                       ref_fpath, repeats_fasta_fpath]
            with open(coords_fpath, 'w') as coords_f:
                with open(log_fpath, 'a') as log_f:
                    qutils.call_subprocess(cmdline, stdout=coords_f, stderr=log_f)
        filtered_repeats_fpath, repeats_regions = check_repeats_instances(coords_fpath, long_repeats_fpath,
                                                                          use_long_reads)
        unique_covered_regions = remove_repeat_regions(ref_fpath, filtered_repeats_fpath, uncovered_fpath)
        return unique_covered_regions, repeats_regions
    return None, None
def sort_bam(bam_fpath, sorted_bam_fpath, err_path, logger, threads=None, sort_rule=None):
    """Sort a BAM file with ``sambamba sort``.

    Args:
        threads: thread count; falls back to ``qconfig.max_threads`` when falsy.
        sort_rule: optional extra command-line flag appended to the command.

    The memory limit is the value returned by ``get_free_memory()`` clamped
    to the range 2..100 and suffixed with ``GB``. Temporary files go to the
    directory of ``sorted_bam_fpath``.
    """
    if not threads:
        threads = qconfig.max_threads
    mem = '%dGB' % min(100, max(2, get_free_memory()))
    cmd = [sambamba_fpath('sambamba'), 'sort', '-t', str(threads), '--tmpdir', dirname(sorted_bam_fpath),
           '-m', mem, '-o', sorted_bam_fpath, bam_fpath]
    if sort_rule:
        cmd += [sort_rule]
    with open(err_path, 'a') as err_f:
        qutils.call_subprocess(cmd, stderr=err_f, logger=logger)
def calculate_genome_cov(in_fpath, out_fpath, chr_len_fpath, err_fpath, logger, print_all_positions=True):
    """Run ``bedtools genomecov`` and write its stdout to ``out_fpath``.

    The input is passed with ``-ibam`` when its name ends with ``.bam`` and
    with ``-i`` otherwise. ``-bga`` is added when ``print_all_positions`` is
    true. stderr is appended to ``err_fpath``.
    """
    cmd = [bedtools_fpath('bedtools'), 'genomecov',
           '-ibam' if in_fpath.endswith('.bam') else '-i', in_fpath, '-g', chr_len_fpath]
    if print_all_positions:
        cmd += ['-bga']
    with open(out_fpath, 'w') as out_f:
        with open(err_fpath, 'a') as err_f:
            qutils.call_subprocess(cmd, stdout=out_f, stderr=err_f, logger=logger)
def run(contig_path, tmp_path):
    """Invoke ``tool_exec`` on one contig file and return the call's return code.

    Both output streams of the tool are appended to ``err_path``.
    ``tool_exec``, ``trained_dir``, ``err_path`` and ``index`` are read from
    the enclosing scope.
    """
    with open(err_path, 'a') as log_stream:
        command = [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path]
        log_indent = ' ' + qutils.index_to_str(index) + ' '
        exit_status = qutils.call_subprocess(command, stdout=log_stream, stderr=log_stream,
                                             indent=log_indent)
    return exit_status
def run(contig_path, tmp_path):
    """Invoke ``tool_exec_fpath`` on one contig file and return the call's return code.

    Both output streams of the tool are appended to ``err_path``.
    ``tool_exec_fpath``, ``trained_dir``, ``err_path`` and ``index`` are
    read from the enclosing scope.
    """
    with open(err_path, 'a') as log_stream:
        command = [tool_exec_fpath, contig_path, '-d', trained_dir, '-g', '-o', tmp_path]
        log_indent = ' ' + qutils.index_to_str(index) + ' '
        exit_status = qutils.call_subprocess(command, stdout=log_stream, stderr=log_stream,
                                             indent=log_indent)
    return exit_status
def parallel_blast(contigs_fpath, label, blast_res_fpath, err_fpath, blast_check_fpath, blast_threads):
    """Run blastn for one assembly and record a size marker file.

    Results go to ``blast_res_fpath + '_' + label``; the marker file
    ``blast_check_fpath + '_' + label`` stores the assembly path and its
    size in bytes.
    """
    # Build the argument list directly so that paths containing spaces or
    # quotes are passed through intact (string + shlex.split would split them).
    cmd = [get_blast_fpath('blastn'), '-query', contigs_fpath, '-db', db_fpath,
           '-outfmt', '7', '-num_threads', str(blast_threads)]
    res_fpath = blast_res_fpath + '_' + label
    check_fpath = blast_check_fpath + '_' + label
    logger.info(' ' + 'processing ' + label)
    with open(res_fpath, 'w') as res_f:
        with open(err_fpath, 'a') as err_f:
            qutils.call_subprocess(cmd, stdout=res_f, stderr=err_f, logger=logger)
    logger.info(' ' + 'BLAST results for %s are saved to %s...' % (label, res_fpath))
    with open(check_fpath, 'w') as check_file:
        check_file.write('Assembly: %s size: %d\n' % (contigs_fpath, os.path.getsize(contigs_fpath)))
    return
def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_path,
                 cov_fpath, physical_cov_fpath, correct_chr_names):
    """Build the read-coverage and physical-coverage files for a reference.

    Every intermediate (sorted BAM, raw coverage file) and every final output
    is reused when it already exists and is non-empty.

    Returns:
        The tuple ``(cov_fpath, physical_cov_fpath)`` exactly as passed in.
    """
    raw_cov_fpath = cov_fpath + '_raw'
    chr_len_fpath = get_chr_len_fpath(ref_fpath, correct_chr_names)
    if not is_non_empty_file(cov_fpath):
        logger.info(' Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                # Context managers guarantee the log handles are closed once
                # the subprocess call has returned.
                with open(log_path, 'a') as log_f:
                    with open(err_path, 'a') as err_f:
                        qutils.call_subprocess(
                            [sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads),
                             '-o', bam_sorted_fpath, bam_fpath],
                            stdout=log_f, stderr=err_f, logger=logger)
            with open(raw_cov_fpath, 'w') as raw_cov_f:
                with open(err_path, 'a') as err_f:
                    qutils.call_subprocess(
                        [bedtools_fpath('bedtools'), 'genomecov', '-bga', '-ibam', bam_sorted_fpath,
                         '-g', chr_len_fpath],
                        stdout=raw_cov_f, stderr=err_f, logger=logger)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names)
    if not is_non_empty_file(physical_cov_fpath):
        raw_cov_fpath = get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path,
                                              err_path, physical_cov_fpath, chr_len_fpath)
        # get_physical_coverage returns None when bamToBed is unavailable;
        # there is no raw file to post-process in that case.
        if raw_cov_fpath is not None:
            proceed_cov_file(raw_cov_fpath, physical_cov_fpath, correct_chr_names)
    return cov_fpath, physical_cov_fpath
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, output_dir, max_threads, err_fpath):
    """Merge per-library alignments, remove duplicates and export as SAM.

    Each non-empty SAM is converted to BAM and sorted (skipped when the
    sorted BAM already exists). The sorted BAMs are merged, duplicates are
    removed with ``sambamba markdup -r`` into ``bam_fpath``, and that BAM is
    converted to ``sam_fpath``.

    Returns:
        Path of the merged (pre-deduplication) BAM file.
    """
    merged_bam_fpath = add_suffix(bam_fpath, 'merged')
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if is_non_empty_file(tmp_sam_fpath):
            tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
            tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
            if not is_non_empty_file(tmp_bam_sorted_fpath):
                sambamba_view(tmp_sam_fpath, tmp_bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
                sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
            tmp_bam_fpaths.append(tmp_bam_sorted_fpath)
    with open(err_fpath, 'a') as err_f:
        qutils.call_subprocess(
            [sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), merged_bam_fpath] + tmp_bam_fpaths,
            stderr=err_f, logger=logger)
    with open(err_fpath, 'a') as err_f:
        qutils.call_subprocess(
            [sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads), '--tmpdir', output_dir,
             merged_bam_fpath, bam_fpath],
            stderr=err_f, logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return merged_bam_fpath
def run_nucmer(prefix, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, max_threads):
    """Align contigs to the reference with nucmer.

    Args:
        prefix: output prefix passed to nucmer as ``-p``.
        ref_fpath: reference FASTA.
        contigs_fpath: contigs FASTA.
        log_out_fpath: file that receives (appended) nucmer stdout.
        log_err_fpath: file that receives (appended) nucmer stderr.
        index: assembly index, rendered into the log indent via ``qutils.index_to_str``.
        max_threads: value passed to nucmer as ``-t``.

    Returns:
        The return code reported by ``qutils.call_subprocess``.
    """
    nucmer_cmdline = [bin_fpath('nucmer'), '-c', str(qconfig.min_cluster), '-l', str(qconfig.min_cluster),
                      '--maxmatch', '-p', prefix, '-t', str(max_threads)]
    env = os.environ.copy()
    nucmer_cmdline += [ref_fpath, contigs_fpath]
    # Log files are closed once nucmer has finished instead of being leaked.
    with open(log_out_fpath, 'a') as log_out_file:
        with open(log_err_fpath, 'a') as log_err_file:
            return_code = qutils.call_subprocess(nucmer_cmdline, stdout=log_out_file, stderr=log_err_file,
                                                 indent=' ' + qutils.index_to_str(index), env=env)
    return return_code
def compile_gnuplot(logger, only_clean=False):
    """Build the bundled gnuplot, or remove its executable.

    Args:
        logger: logger used for progress and failure notices.
        only_clean: when true, only delete an existing executable.

    Returns:
        ``True`` after a clean-up request; the executable path when gnuplot is
        available (already built or built now); ``None`` when a previous or
        the current compilation failed.
    """
    tool_dirpath = join(qconfig.LIBS_LOCATION, 'gnuplot')
    tool_exec_fpath = gnuplot_exec_fpath()

    if only_clean:
        if isfile(tool_exec_fpath):
            os.remove(tool_exec_fpath)
        return True

    if not isfile(tool_exec_fpath):
        failed_compilation_flag = join(tool_dirpath, 'make.failed')
        if check_prev_compilation_failed('gnuplot', failed_compilation_flag, just_notice=True, logger=logger):
            return None
        logger.main_info("Compiling gnuplot...")
        build_steps = [
            ['./configure', '--with-qt=no', '--disable-wxwidgets', '--prefix=' + tool_dirpath],
            ['make'],
        ]
        prev_dir = os.getcwd()
        os.chdir(tool_dirpath)
        try:
            return_code = 0
            for step_cmdline in build_steps:
                # Each step rewrites make.log/make.err, exactly as before;
                # the handles are now closed after the step.
                with open(join(tool_dirpath, 'make.log'), 'w') as log_file:
                    with open(join(tool_dirpath, 'make.err'), 'w') as err_file:
                        return_code = qutils.call_subprocess(step_cmdline, stdout=log_file, stderr=err_file,
                                                             indent=' ')
                if return_code != 0:
                    break  # 'make' is only attempted after a successful configure
        finally:
            # Always return to the caller's working directory, even on error.
            os.chdir(prev_dir)
        if return_code != 0 or not isfile(tool_exec_fpath):
            write_failed_compilation_flag('gnuplot', tool_dirpath, failed_compilation_flag, just_notice=True,
                                          logger=logger)
            return None
    return tool_exec_fpath
def run_aligner(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align every read library to the reference and collect the SAM paths.

    Single-file libraries of type ``'pacbio'``/``'nanopore'`` are aligned with
    minimap (``map-pb``/``map-ont`` presets); everything else goes through
    ``bwa mem`` (with ``-p`` for single-file ``'pe'`` libraries). Existing
    non-empty SAM/BAM outputs are reused. For ``'pe'`` libraries the BAM is
    deduplicated and the insert size is estimated.

    Args:
        read_fpaths: iterable of libraries; each is a path string or a
            ``(read1, read2)`` pair.
        ref_fpath: reference FASTA.
        sam_fpath: base SAM path; per-library outputs are derived from it with
            the suffix ``reads_type + <1-based index>``.
        out_sam_fpaths: list that is extended in place with the per-library SAM paths.
        output_dir: working directory (markdup tmpdir, insert-size calculation).
        err_fpath: file that receives (appended) stderr of the tools.
        max_threads: thread count for the aligners and sambamba.
        reads_type: library type string, e.g. ``'pe'``, ``'pacbio'``, ``'nanopore'``.

    Side effects:
        When at least one insert size is below ``qconfig.optimal_assembly_max_IS``,
        sets ``qconfig.optimal_assembly_insert_size`` to the largest such value
        and writes it to ``<output_dir>/../<ref_name>.is.txt``.
    """
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            if reads_type == 'pacbio' or reads_type == 'nanopore':
                if reads_type == 'pacbio':
                    preset = ' -ax map-pb '
                else:
                    preset = ' -ax map-ont '
                cmdline = minimap_fpath() + ' -t ' + str(max_threads) + preset + ref_fpath + ' ' + reads
            else:
                cmdline = bwa_cmd + (' -p ' if reads_type == 'pe' else ' ') + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmdline = bwa_cmd + ' ' + ref_fpath + ' ' + read1 + ' ' + read2

        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        bam_fpath = output_fpath.replace('.sam', '.bam')
        if not is_non_empty_file(output_fpath):
            with open(output_fpath, 'w') as sam_file:
                with open(err_fpath, 'a') as err_file:
                    qutils.call_subprocess(shlex.split(cmdline), stdout=sam_file, stderr=err_file, logger=logger)

        # The original tested this same condition twice in a row; once is enough.
        if not is_non_empty_file(bam_fpath):
            sambamba_view(output_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            if reads_type == 'pe':
                bam_dedup_fpath = add_suffix(bam_fpath, 'dedup')
                with open(err_fpath, 'a') as err_file:
                    qutils.call_subprocess(
                        [sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads),
                         '--tmpdir', output_dir, bam_fpath, bam_dedup_fpath],
                        stderr=err_file, logger=logger)
                if exists(bam_dedup_fpath):
                    # Replace the BAM with its deduplicated version.
                    shutil.move(bam_dedup_fpath, bam_fpath)

        if reads_type == 'pe':
            insert_size, _ = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            if insert_size < qconfig.optimal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.optimal_assembly_insert_size = max(insert_sizes)
        ref_name = qutils.name_from_fpath(ref_fpath)
        insert_size_fpath = join(output_dir, '..', ref_name + '.is.txt')
        with open(insert_size_fpath, 'w') as out:
            out.write(str(qconfig.optimal_assembly_insert_size))
def get_correct_names_for_chroms(output_dirpath, fasta_fpath, sam_fpath, err_path, reads_fpaths, logger,
                                 is_reference=False):
    """Map sequence names of a SAM header to the names used in a FASTA file.

    The SAM header is extracted with ``sambamba view -H`` into
    ``<dirname(output_dirpath)>/<basename(sam_fpath)>.header`` (an existing
    header file is reused when the SAM itself is gone). ``@SQ`` records are
    then compared pairwise, in iteration order, with the FASTA sequences.

    Args:
        output_dirpath: directory whose parent receives the header file.
        fasta_fpath: FASTA file the SAM is expected to correspond to.
        sam_fpath: SAM file to inspect.
        err_path: file that receives (appended) stderr of sambamba.
        reads_fpaths: if truthy, a mismatch is only a warning (reads can be
            realigned); otherwise it is logged as an error.
        logger: logger used for the mismatch messages.
        is_reference: selects the wording of the mismatch message.

    Returns:
        A dict ``{sam_name: fasta_name}``, or ``None`` when neither the SAM nor
        its header file exists, or when the sequence count, lengths or names
        do not match.

    Raises:
        IndexError: if an ``@SQ`` line lacks an ``SN:`` or ``LN:`` field.
    """
    correct_chr_names = dict()
    fasta_chr_lengths = get_chr_lengths_from_fastafile(fasta_fpath)
    sam_chr_lengths = dict()
    sam_header_fpath = join(dirname(output_dirpath), basename(sam_fpath) + '.header')
    if not isfile(sam_fpath) and not isfile(sam_header_fpath):
        return None
    if isfile(sam_fpath):
        with open(sam_header_fpath, 'w') as header_file:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                                       stdout=header_file, stderr=err_file, logger=logger)

    # Raw strings: '\S' and '\d' are not valid escapes in ordinary literals.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'
    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(fasta_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        # NOTE(review): pairing relies on both mappings iterating in file order - confirm.
        for fasta_chr, sam_chr in zip(fasta_chr_lengths.keys(), sam_chr_lengths.keys()):
            same_length = sam_chr_lengths[sam_chr] == fasta_chr_lengths[fasta_chr]
            if correct_name(sam_chr) == fasta_chr[:len(sam_chr)] and same_length:
                correct_chr_names[sam_chr] = fasta_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break

    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath +
                           ' do not match. ' + 'QUAST will try to realign reads to ' +
                           ('the reference genome' if is_reference else fasta_fpath))
        else:
            logger.error(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath +
                         ' do not match. ' + 'Use SAM file obtained by aligning reads to ' +
                         ('the reference genome' if is_reference else fasta_fpath))
        return None
    return correct_chr_names
def run_bwa(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align every read library to the reference with ``bwa mem``.

    Args:
        read_fpaths: iterable of libraries; each is a path string or a
            ``(read1, read2)`` pair. A single file gets ``-p`` unless
            ``reads_type`` is ``'single'``.
        ref_fpath: reference FASTA.
        sam_fpath: base SAM path; per-library outputs are derived from it with
            the suffix ``reads_type + <1-based index>``.
        out_sam_fpaths: list that is extended in place with the per-library SAM paths.
        output_dir: directory handed to ``calculate_insert_size``.
        err_fpath: file that receives (appended) bwa stderr.
        max_threads: thread count passed to bwa.
        reads_type: library type string.

    Side effects:
        For ``'paired_end'`` libraries, insert sizes below
        ``qconfig.ideal_assembly_max_IS`` are collected and the largest one is
        stored in ``qconfig.ideal_assembly_insert_size``.
    """
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            cmd = bwa_cmd + (' -p ' if reads_type != 'single' else ' ') + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmd = bwa_cmd + ' ' + ref_fpath + ' ' + read1 + ' ' + read2
        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        if not is_non_empty_file(output_fpath):
            # Close the redirect targets once bwa has finished.
            with open(output_fpath, 'w') as sam_file:
                with open(err_fpath, 'a') as err_file:
                    qutils.call_subprocess(shlex.split(cmd), stdout=sam_file, stderr=err_file, logger=logger)
        if reads_type == 'paired_end':
            insert_size = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            if insert_size < qconfig.ideal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.ideal_assembly_insert_size = max(insert_sizes)
def calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath):
    """Compute the LAP score of an assembly (or the reference) unless it already exists.

    Runs ``calc_prob.py`` to write ``<filename>.prob`` into ``output_dirpath``
    and then ``sum_prob.py`` to write ``<filename>.lap.out`` into the parent
    of ``output_dirpath``. Does nothing when reads or the SAM file are missing.

    Args:
        reads_fpaths: list of read files, joined with commas for ``-i``.
        sam_fpath: SAM file passed as ``-s``.
        index: assembly index, or ``None`` for the reference (affects log text only).
        index_str: printable prefix used in log messages for assemblies.
        output_dirpath: directory of the intermediate ``.prob`` file.
        fpath: assembly FASTA passed as ``-a``.
        filename: base name of the produced files.
        err_fpath: file that receives (appended) stderr of both scripts.
    """
    if not reads_fpaths or not sam_fpath:
        return

    lap_out_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.lap.out')
    if not is_non_empty_file(lap_out_fpath):
        if index is not None:
            logger.info(' ' + index_str + 'Running LAP...')
        else:
            logger.info(' Running LAP for reference...')
        prob_out_fpath = get_safe_fpath(output_dirpath, filename + '.prob')
        with open(err_fpath, 'a') as err_file:
            # Each output handle is closed before the next step starts.
            with open(prob_out_fpath, 'w') as prob_file:
                qutils.call_subprocess(
                    [lap_fpath('calc_prob.py'), '-a', fpath, '-i', ','.join(reads_fpaths), '-q', '-s', sam_fpath],
                    stdout=prob_file, stderr=err_file)
            with open(lap_out_fpath, 'w') as lap_file:
                qutils.call_subprocess([lap_fpath('sum_prob.py'), '-i', prob_out_fpath],
                                       stdout=lap_file, stderr=err_file)
    else:
        if index is not None:
            logger.info(' ' + index_str + 'Using existing file with LAP score...')
        else:
            logger.info(' Using existing file with LAP score for reference...')
def gmhmm_p(tool_exec, fasta_fpath, heu_fpath, out_fpath, err_file, index):
    """Run GeneMark.hmm on a FASTA file with the given heuristic model.

    Equivalent command line:
        prompt> gmhmmp -d -a -p 0 -m <heu_fpath> -o <out_fpath> <fasta_fpath>

    Args:
        tool_exec: path of the gmhmmp executable.
        fasta_fpath: sequences to annotate.
        heu_fpath: heuristic model file passed as ``-m``.
        out_fpath: prediction file passed as ``-o``.
        err_file: open file object receiving both stdout and stderr of the tool.
        index: assembly index, rendered into the log indent.

    Returns:
        ``True`` only if the tool exited with code 0 and ``out_fpath`` exists.
    """
    cmdline = [tool_exec, '-d', '-a', '-p', '0', '-m', heu_fpath, '-o', out_fpath, fasta_fpath]
    log_indent = ' ' + qutils.index_to_str(index)
    exit_status = qutils.call_subprocess(cmdline, stdout=err_file, stderr=err_file, indent=log_indent)
    if exit_status != 0:
        return False
    return os.path.isfile(out_fpath)
def parallel_blast(contigs_fpath, label, corrected_dirpath, err_fpath, blast_res_fpath, blast_check_fpath,
                   blast_threads):
    """Run blastn for one assembly and record a check file for the result.

    Compressed inputs (``.gz``, ``.gzip``, ``.bz2``, ``.bzip2``, ``.zip``) are
    first unpacked to ``<corrected_dirpath>/<basename>.unpacked``. The check
    file stores the path and byte size of the original contigs file.

    Args:
        contigs_fpath: assembly FASTA, possibly compressed.
        label: assembly label used in log messages and output file names.
        corrected_dirpath: directory for the unpacked copy.
        err_fpath: file that receives (appended) blastn stderr.
        blast_res_fpath: base path for the BLAST result file.
        blast_check_fpath: base path for the check file.
        blast_threads: value passed as ``-num_threads``.

    Note:
        The database path comes from the module-level name ``db_fpath``.
    """
    logger.info(' ' + 'processing ' + label)
    blast_query_fpath = contigs_fpath
    compress_ext = ['.gz', '.gzip', '.bz2', '.bzip2', '.zip']
    if any(contigs_fpath.endswith(ext) for ext in compress_ext):
        logger.info(' ' + 'unpacking ' + label)
        unpacked_fpath = os.path.join(corrected_dirpath, os.path.basename(contigs_fpath) + '.unpacked')
        with _get_fasta_file_handler(contigs_fpath) as f_in:
            with open(unpacked_fpath, 'w') as f_out:
                for l in f_in:
                    f_out.write(l)
        blast_query_fpath = unpacked_fpath

    res_fpath = get_blast_output_fpath(blast_res_fpath, label)
    check_fpath = get_blast_output_fpath(blast_check_fpath, label)
    cmd = get_blast_fpath('blastn') + (' -query %s -db %s -outfmt 7 -num_threads %s' % (
        blast_query_fpath, db_fpath, blast_threads))
    # Close the redirect targets once blastn has finished.
    with open(res_fpath, 'w') as res_file:
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(shlex.split(cmd), stdout=res_file, stderr=err_file, logger=logger)
    logger.info(' ' + 'BLAST results for %s are saved to %s...' % (label, res_fpath))
    with open(check_fpath, 'w') as check_file:
        # A single string is written; write() states that directly (same output).
        check_file.write('Assembly: %s size: %d\n' % (contigs_fpath, os.path.getsize(contigs_fpath)))
def run_nucmer(prefix, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, emem_threads=1):
    """Align contigs to the reference with nucmer.

    Args:
        prefix: output prefix passed to nucmer as ``-p``.
        ref_fpath: reference FASTA.
        contigs_fpath: contigs FASTA.
        log_out_fpath: file that receives (appended) nucmer stdout.
        log_err_fpath: file that receives (appended) nucmer stderr.
        index: assembly index, rendered into the log indent via ``qutils.index_to_str``.
        emem_threads: value for ``-t``; only added when ``is_emem_aligner()`` is true.

    Returns:
        The return code reported by ``qutils.call_subprocess``.
    """
    # additional GAGE params of Nucmer: '-l', '30', '-banded'
    nucmer_cmdline = [bin_fpath('nucmer'), '-c', str(qconfig.min_cluster), '-l', str(qconfig.min_cluster),
                      '--maxmatch', '-p', prefix]
    if is_emem_aligner():
        nucmer_cmdline += ['-t', str(emem_threads)]
    nucmer_cmdline += [ref_fpath, contigs_fpath]
    # Log files are closed once nucmer has finished instead of being leaked.
    with open(log_out_fpath, 'a') as log_out_file:
        with open(log_err_fpath, 'a') as log_err_file:
            return_code = qutils.call_subprocess(nucmer_cmdline, stdout=log_out_file, stderr=log_err_file,
                                                 indent=' ' + qutils.index_to_str(index))
    return return_code
def bam_to_bed(output_dirpath, name, bam_fpath, err_path, logger, bedpe=False):
    """Convert a BAM file to a coordinate-sorted BED file, reusing existing outputs.

    Args:
        output_dirpath: directory for ``<name>.bed``, ``<name>.bedpe`` and
            ``<name>.sorted.bed``.
        name: base name of the produced files.
        bam_fpath: input BAM.
        err_path: file that receives (appended) stderr of the tools.
        logger: logger forwarded to ``qutils.call_subprocess``.
        bedpe: when true, ``bamToBed -bedpe`` is used and each pair is reduced
            to one interval (columns 1, 2 and 6 of the BEDPE record).

    Returns:
        Path of the sorted BED file (sorted by name, then numerically by start).
    """
    raw_bed_fpath = join(output_dirpath, name + '.bed')
    if bedpe:
        bedpe_fpath = join(output_dirpath, name + '.bedpe')
        # The original repeated this exact test twice joined by 'and'.
        if not is_non_empty_file(bedpe_fpath):
            with open(bedpe_fpath, 'w') as bedpe_out:
                with open(err_path, 'a') as err_file:
                    qutils.call_subprocess([bedtools_fpath('bamToBed'), '-i', bam_fpath, '-bedpe'],
                                           stdout=bedpe_out, stderr=err_file, logger=logger)
        # Derive the raw BED from the BEDPE; the handle name no longer shadows
        # the 'bedpe' flag.
        with open(bedpe_fpath, 'r') as bedpe_in:
            with open(raw_bed_fpath, 'w') as bed_file:
                for line in bedpe_in:
                    fs = line.split()
                    start, end = fs[1], fs[5]
                    bed_file.write('\t'.join([fs[0], start, end + '\n']))
    else:
        if not is_non_empty_file(raw_bed_fpath):
            with open(raw_bed_fpath, 'w') as bed_out:
                with open(err_path, 'a') as err_file:
                    qutils.call_subprocess([bedtools_fpath('bamToBed'), '-i', bam_fpath],
                                           stdout=bed_out, stderr=err_file, logger=logger)

    sorted_bed_fpath = join(output_dirpath, name + '.sorted.bed')
    if not is_non_empty_file(sorted_bed_fpath):
        with open(sorted_bed_fpath, 'w') as sorted_out:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', raw_bed_fpath],
                                       stdout=sorted_out, stderr=err_file, logger=logger)
    return sorted_bed_fpath
def get_correct_names_for_chroms(output_dirpath, ref_fpath, sam_fpath, err_path, reads_fpaths):
    """Map sequence names of a SAM header to the names used in the reference.

    The SAM header is extracted with ``sambamba view -H`` into
    ``<output_dirpath>/<basename(sam_fpath)>.header``; its ``@SQ`` records are
    compared pairwise, in iteration order, with the reference sequences.

    Args:
        output_dirpath: directory that receives the header file.
        ref_fpath: reference FASTA.
        sam_fpath: SAM file to inspect.
        err_path: file that receives (appended) stderr of sambamba.
        reads_fpaths: if truthy, a mismatch is only a warning (reads can be
            realigned); otherwise it is logged as an error.

    Returns:
        A dict ``{sam_name: reference_name}``, or ``None`` when the sequence
        count, lengths or names do not match.

    Raises:
        IndexError: if an ``@SQ`` line lacks an ``SN:`` or ``LN:`` field.
    """
    correct_chr_names = dict()
    ref_chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    sam_chr_lengths = dict()
    sam_header_fpath = os.path.join(output_dirpath, os.path.basename(sam_fpath) + '.header')
    with open(sam_header_fpath, 'w') as header_file:
        # Append to the error log: the original opened it with 'w', which wiped
        # whatever earlier steps had logged there.
        with open(err_path, 'a') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                                   stdout=header_file, stderr=err_file, logger=logger)

    # Raw strings: '\S' and '\d' are not valid escapes in ordinary literals.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'
    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(ref_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        # NOTE(review): pairing relies on both mappings iterating in file order - confirm.
        for ref_chr, sam_chr in zip(ref_chr_lengths.keys(), sam_chr_lengths.keys()):
            same_length = sam_chr_lengths[sam_chr] == ref_chr_lengths[ref_chr]
            if correct_name(sam_chr) == ref_chr[:len(sam_chr)] and same_length:
                correct_chr_names[sam_chr] = ref_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break

    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in reference and SAM file do not match. ' +
                           'QUAST will try to realign reads to the reference genome.')
        else:
            logger.error(inconsistency + ' in reference and SAM file do not match. ' +
                         'Use SAM file obtained by aligning reads to the reference genome.')
        return None
    return correct_chr_names
def run_nucmer(prefix, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, emem_threads=1):
    """Align contigs to the reference with nucmer.

    Args:
        prefix: output prefix passed to nucmer as ``-p``.
        ref_fpath: reference FASTA.
        contigs_fpath: contigs FASTA.
        log_out_fpath: file that receives (appended) nucmer stdout.
        log_err_fpath: file that receives (appended) nucmer stderr.
        index: assembly index, rendered into the log indent via ``qutils.index_to_str``.
        emem_threads: value for ``-t``; only added when ``is_emem_aligner()`` is true.

    Returns:
        The return code reported by ``qutils.call_subprocess``.
    """
    # additional GAGE params of Nucmer: '-l', '30', '-banded'
    nucmer_cmdline = [bin_fpath('nucmer'), '-c', str(qconfig.min_cluster), '-l', str(qconfig.min_cluster),
                      '--maxmatch', '-p', prefix]
    if is_emem_aligner():
        nucmer_cmdline += ['-t', str(emem_threads)]
        # Point nucmer at an installed E-MEM binary when one is found.
        installed_emem_fpath = get_installed_emem()
        if installed_emem_fpath:
            nucmer_cmdline += ['--emem', installed_emem_fpath]
    nucmer_cmdline += [ref_fpath, contigs_fpath]
    # Log files are closed once nucmer has finished instead of being leaked.
    with open(log_out_fpath, 'a') as log_out_file:
        with open(log_err_fpath, 'a') as log_err_file:
            return_code = qutils.call_subprocess(nucmer_cmdline, stdout=log_out_file, stderr=log_err_file,
                                                 indent=' ' + qutils.index_to_str(index))
    return return_code
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run GeneMark-ES (``gmes_petap.pl --ES``) and parse the predicted genes.

    Args:
        tool_dirpath: directory containing ``gmes_petap.pl``.
        fasta_fpath: sequences to annotate.
        err_fpath: file that is overwritten with stdout and stderr of the tool.
        index: assembly index, rendered into the log indent.
        tmp_dirpath: prefix of the working directory; the FASTA base name is
            appended to it directly (no path separator is inserted).
        num_threads: value passed as ``--cores``.

    Returns:
        A list of genes collected by ``parse_gtf_out`` from every file whose
        name ends with ``gtf``, or ``None`` if the tool exited with a non-zero code.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # The log handle used to stay open for the rest of the process.
    with open(err_fpath, 'w') as err_file:
        return_code = qutils.call_subprocess(
            ['perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads),
             '--sequence', fasta_fpath, '--out', tmp_dirpath],
            stdout=err_file, stderr=err_file, indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        return None

    genes = []
    fnames = [fname for (path, dirs, files) in os.walk(tmp_dirpath) for fname in files]
    for fname in fnames:
        if fname.endswith('gtf'):
            # NOTE(review): names found in subdirectories are still joined
            # against tmp_dirpath itself (kept as in the original) - confirm intent.
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
def do(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, gc_fpath, features_containers, cov_fpath,
       output_dir, logger):
    """Generate the Circos configuration and, if Circos is installed, draw the plot.

    Args:
        ref_fpath, contigs_fpaths, contig_report_fpath_pattern, gc_fpath,
        features_containers, cov_fpath: forwarded unchanged to ``create_conf``.
        output_dir: directory for the configuration, logs and plot; created if missing.
        logger: logger used for warnings.

    Returns:
        ``(circos_png_fpath, circos_legend_fpath)`` on success, or
        ``(None, None)`` when Circos is not installed, exits with a non-zero
        code, or does not produce a non-empty PNG.
    """
    if not exists(output_dir):
        os.makedirs(output_dir)
    conf_fpath, circos_legend_fpath = create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern,
                                                  output_dir, gc_fpath, features_containers, cov_fpath, logger)
    circos_exec = get_path_to_program('circos')
    if not circos_exec:
        logger.warning('Circos is not installed!\n'
                       'If you want to create Circos plots, install Circos as described at http://circos.ca/tutorials/lessons/configuration/distribution_and_installation '
                       'and run the following command:\n circos -conf ' + conf_fpath + '.\n '
                       'The plot annotation is saved to ' + circos_legend_fpath)
        return None, None

    cmdline = [circos_exec, '-conf', conf_fpath]
    log_fpath = join(output_dir, 'circos.log')
    err_fpath = join(output_dir, 'circos.err')
    circos_png_fpath = join(output_dir, circos_png_fname)
    # Both logs are overwritten on each run and closed once Circos has finished.
    with open(log_fpath, 'w') as log_file:
        with open(err_fpath, 'w') as err_file:
            return_code = qutils.call_subprocess(cmdline, stdout=log_file, stderr=err_file)

    if return_code == 0 and is_non_empty_file(circos_png_fpath):
        return circos_png_fpath, circos_legend_fpath
    logger.warning(' Circos diagram was not created. See ' + log_fpath + ' and ' + err_fpath + ' for details')
    return None, None
def run_minimap(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):
    """Align contigs to the reference with minimap and write the result to ``out_fpath``.

    In AGB mode the call is delegated to ``run_minimap_agb``. Otherwise the
    preset follows ``qconfig.min_IDY`` (``asm20`` below 90, ``asm10`` below 95,
    else ``asm5``), and the extra tuning options are dropped when
    ``qconfig.large_genome`` is set.

    Args:
        out_fpath: file that is overwritten with minimap stdout.
        ref_fpath: reference FASTA.
        contigs_fpath: contigs FASTA.
        log_err_fpath: file that receives (appended) minimap stderr.
        index: assembly index, rendered into the log indent.
        max_threads: value passed as ``-t``.

    Returns:
        The return code of the aligner run.
    """
    if qconfig.is_agb_mode:
        return run_minimap_agb(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads)

    if qconfig.min_IDY < 90:
        preset = 'asm20'
    elif qconfig.min_IDY < 95:
        preset = 'asm10'
    else:
        preset = 'asm5'
    # -s -- min CIGAR score, -z -- affects how often to stop alignment extension, -B -- mismatch penalty
    # -O -- gap penalty, -r -- max gap size
    mask_level = '1' if qconfig.is_combined_ref else '0.9'
    num_alignments = '100' if qconfig.is_combined_ref else '50'
    additional_options = ['-B5', '-O4,16', '--no-long-join', '-r', str(qconfig.MAX_INDEL_LENGTH),
                          '-N', num_alignments, '-s', str(qconfig.min_alignment), '-z', '200']
    cmdline = [minimap_fpath(), '-c', '-x', preset] + (additional_options if not qconfig.large_genome else []) + \
              ['--mask-level', mask_level, '--min-occ', '200', '-g', '2500', '--score-N', '2', '--cs',
               '-t', str(max_threads), ref_fpath, contigs_fpath]
    # Close the redirect targets once the aligner has finished.
    with open(out_fpath, 'w') as out_file:
        with open(log_err_fpath, 'a') as err_file:
            return_code = qutils.call_subprocess(cmdline, stdout=out_file, stderr=err_file,
                                                 indent=' ' + qutils.index_to_str(index))
    return return_code
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Call structural variations for one reference with GRIDSS and write them as BED.

    Steps, each skipped when its output already exists: filter and sort the
    alignments, index the sorted BAM, run ``gridss.CallVariants`` to produce a
    VCF, convert the VCF to BEDPE and reformat it into ``bed_fpath``.

    Args:
        cur_ref_fpath: reference FASTA.
        output_dirpath: working directory for derived files.
        err_fpath: file that receives (appended) stderr of the tools.
        max_threads: thread count for sorting and GRIDSS workers.
        bam_fpath: optional BAM; when omitted, SAM/BAM paths are derived from
            the reference name inside ``output_dirpath``.
        bed_fpath: optional destination; defaults to ``<output_dirpath>/<ref_name>.bed``.

    Returns:
        ``bed_fpath``. The file is only written when GRIDSS produced a non-empty VCF.
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                                   stderr=err_file, logger=logger)
    create_fai_file(cur_ref_fpath)

    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # Start from a clean GRIDSS working directory.
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        # The bwa directory is appended to PATH for the GRIDSS run.
        env = os.environ.copy()
        env["PATH"] += os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        gridss_cmdline = ['java', '-ea', '-Xmx' + str(max_mem) + 'g', '-Dsamjdk.create_index=true',
                          '-Dsamjdk.use_async_io_read_samtools=true',
                          '-Dsamjdk.use_async_io_write_samtools=true',
                          '-Dsamjdk.use_async_io_write_tribble=true',
                          '-cp', get_gridss_fpath(), 'gridss.CallVariants',
                          'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                          'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
                          'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                          'WORKER_THREADS=' + str(max_threads),
                          'WORKING_DIR=' + vcf_output_dirpath]
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(gridss_cmdline, stderr=err_file, logger=logger, env=env)

    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                    'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath,
                                    'R=' + cur_ref_fpath, 'INCLUDE_HEADER=TRUE'],
                                   stderr=err_file, logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
def compile_glimmer(logger, only_clean=False):
    """Build the bundled GlimmerHMM, or remove its executable.

    Args:
        logger: logger used for progress and error messages.
        only_clean: when true, only delete an existing executable.

    Returns:
        ``True`` after a clean-up request; the executable path when GlimmerHMM
        is available (already built or built now); ``None`` when compilation failed.
    """
    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')

    if only_clean:
        if os.path.isfile(tool_exec_fpath):
            os.remove(tool_exec_fpath)
        return True

    if not os.path.isfile(tool_exec_fpath):
        logger.main_info("Compiling GlimmerHMM...")
        # Build logs live next to the sources and are closed after 'make' ends.
        with open(os.path.join(tool_src_dirpath, 'make.log'), 'w') as log_file:
            with open(os.path.join(tool_src_dirpath, 'make.err'), 'w') as err_file:
                return_code = qutils.call_subprocess(['make', '-C', tool_src_dirpath],
                                                     stdout=log_file, stderr=err_file, indent=' ')
        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error("Failed to compile GlimmerHMM (" + tool_src_dirpath +
                         ")!\nTry to compile it manually or do not use --gene-finding "
                         "option with --eukaryote.\nUse --debug option to see the command lines.")
            return None
    return tool_exec_fpath
def align_ideal_assembly(ref_fpath, assembly_fpath, output_dir, log_fpath, err_fpath):
    """Align an assembly to the reference with bwa and compute its coverage.

    Produces, inside ``output_dir`` and named after the assembly file: a SAM,
    a BAM, a BAM restricted to mapped records, a sorted BAM, and finally the
    coverage outputs written by ``get_coverage``. BAM and sorted BAM are reused
    when they already exist as non-empty files.

    Args:
        ref_fpath: reference FASTA (indexed with ``bwa_index`` when needed).
        assembly_fpath: assembly FASTA to align.
        output_dir: directory for all derived files.
        log_fpath: log file forwarded to ``get_coverage``.
        err_fpath: file that receives (appended) stderr of the tools.

    Returns:
        Path of the "uncovered" file derived from the coverage file name.
    """
    sam_fpath = join(output_dir, basename(assembly_fpath) + '.sam')
    bam_fpath = sam_fpath.replace('.sam', '.bam')
    bam_mapped_fpath = add_suffix(bam_fpath, 'mapped')
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')

    if not is_non_empty_file(bam_fpath):
        bwa_index(ref_fpath, err_fpath, logger)
        with open(err_fpath, 'a') as err_file:
            # Each output handle is closed before the next tool reads the file.
            with open(sam_fpath, 'w') as sam_file:
                qutils.call_subprocess(
                    [bwa_fpath('bwa'), 'mem', '-t', str(qconfig.max_threads), ref_fpath, assembly_fpath],
                    stdout=sam_file, stderr=err_file, logger=logger)
            with open(bam_fpath, 'w') as bam_file:
                qutils.call_subprocess(
                    [sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                     '-S', sam_fpath],
                    stdout=bam_file, stderr=err_file, logger=logger)

    if not is_non_empty_file(bam_sorted_fpath):
        with open(bam_mapped_fpath, 'w') as bam_mapped_file:
            with open(err_fpath, 'a') as err_file:
                qutils.call_subprocess(
                    [sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                     '-F', 'not unmapped', bam_fpath],
                    stdout=bam_mapped_file, stderr=err_file, logger=logger)
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)

    cov_fpath = join(output_dir, basename(assembly_fpath) + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    ref_name = qutils.name_from_fpath(ref_fpath)
    correct_chr_names = get_correct_names_for_chroms(output_dir, ref_fpath, sam_fpath, err_fpath,
                                                     assembly_fpath, logger)
    get_coverage(output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_fpath, err_fpath,
                 correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return uncovered_fpath
def bam_to_bed(output_dirpath, name, bam_fpath, err_path, logger, bedpe=False, only_intervals=False):
    """Convert a BAM file to a coordinate-sorted BED file.

    Args:
        output_dirpath: directory for ``<name>.bed``, ``<name>.bedpe`` and
            ``<name>.sorted.bed``.
        name: base name of the produced files.
        bam_fpath: input BAM.
        err_path: file that receives (appended) stderr of the tools.
        logger: logger forwarded to ``qutils.call_subprocess``.
        bedpe: when true, ``bamToBed -bedpe`` is used and each pair is reduced
            to a single interval on the sequence named in column 1.
        only_intervals: with ``bedpe``, take columns 3 and 5 as start/end
            instead of columns 2 and 6.

    Returns:
        Path of the sorted BED file (sorted by name, then numerically by start).
    """
    raw_bed_fpath = join(output_dirpath, name + '.bed')
    if bedpe:
        bedpe_fpath = join(output_dirpath, name + '.bedpe')
        with open(bedpe_fpath, 'w') as bedpe_out:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess([bedtools_fpath('bamToBed'), '-i', bam_fpath, '-bedpe'],
                                       stdout=bedpe_out, stderr=err_file, logger=logger)
        # The handle name no longer shadows the 'bedpe' flag.
        with open(bedpe_fpath, 'r') as bedpe_in:
            with open(raw_bed_fpath, 'w') as bed_file:
                for line in bedpe_in:
                    fs = line.split()
                    if only_intervals:
                        start, end = fs[2], fs[4]
                    else:
                        start, end = fs[1], fs[5]
                    bed_file.write('\t'.join([fs[0], start, end + '\n']))
    else:
        with open(raw_bed_fpath, 'w') as bed_out:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess([bedtools_fpath('bamToBed'), '-i', bam_fpath],
                                       stdout=bed_out, stderr=err_file, logger=logger)

    sorted_bed_fpath = join(output_dirpath, name + '.sorted.bed')
    with open(sorted_bed_fpath, 'w') as sorted_out:
        with open(err_path, 'a') as err_file:
            qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', raw_bed_fpath],
                                   stdout=sorted_out, stderr=err_file, logger=logger)
    return sorted_bed_fpath
def create_krona_charts(taxons_for_krona, meta_log, results_dirpath, json_texts):
    """Write Krona input tables from the report JSONs and render the HTML charts.

    ``json_texts[0]`` supplies the list of assembly names; every following
    entry is one per-reference report from which the total aligned length of
    each assembly is taken. One chart is drawn per assembly, plus a summary
    chart when more than one assembly chart succeeded.

    Args:
        taxons_for_krona: dict mapping a reference name to its taxonomy string;
            references missing from it are written without taxonomy.
        meta_log: logger used for progress and warnings.
        results_dirpath: directory that receives the ``qconfig.krona_dirname`` folder.
        json_texts: list of JSON strings as described above.

    Side effects:
        Creates ``krona.log``/``krona.err``, removes the intermediate text
        tables unless ``qconfig.debug`` is set, and passes the chart paths
        (relative to ``results_dirpath``) to ``save_krona_paths``.
    """
    meta_log.info(' Drawing interactive Krona plots...')
    krona_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'kronatools')
    krona_res_dirpath = os.path.join(results_dirpath, qconfig.krona_dirname)
    try:
        import json
    except ImportError:
        try:
            import simplejson as json
        except ImportError:
            meta_log.warning('Can\'t draw Krona charts - please install python-simplejson')
            return
    if not os.path.isdir(krona_res_dirpath):
        os.mkdir(krona_res_dirpath)

    json_data = json.loads(json_texts[0])
    assemblies = json_data['assembliesNames']
    krona_txt_ext = '_taxonomy.txt'
    krona_common_fpath = os.path.join(krona_res_dirpath, 'overall' + krona_txt_ext)
    # The summary table is now closed even if parsing a report raises.
    with open(krona_common_fpath, 'w') as krona_common_file:
        for name in assemblies:
            # Create/truncate the per-assembly table; rows are appended below.
            with open(os.path.join(krona_res_dirpath, name + krona_txt_ext), 'w'):
                pass
        for json_text in json_texts[1:]:
            json_data = json.loads(json_text)
            ref_name = json_data['referenceName']
            if not ref_name:
                continue
            # Take the first TOTAL_ALIGNED_LEN metric found in the report.
            lengths = []
            report = json_data['report']
            for section in report:
                if lengths:
                    break
                for metric in section[1]:
                    if metric['metricName'] == reporting.Fields.TOTAL_ALIGNED_LEN:
                        lengths = metric['values']
                        break
            if not lengths:
                continue
            if None in lengths:
                lengths = [length if length is not None else 0 for length in lengths]
            cur_assemblies = json_data['assembliesNames']
            for index, name in enumerate(cur_assemblies):
                krona_fpath = os.path.join(krona_res_dirpath, name + krona_txt_ext)
                with open(krona_fpath, 'a') as f_krona:
                    if ref_name in taxons_for_krona:
                        f_krona.write(str(lengths[index]) + '\t' + taxons_for_krona[ref_name] + '\n')
                    else:
                        f_krona.write(str(lengths[index]) + '\n')
            if ref_name in taxons_for_krona:
                krona_common_file.write(str(sum(lengths)) + '\t' + taxons_for_krona[ref_name] + '\n')
            else:
                krona_common_file.write(str(sum(lengths)) + '\n')

    krona_fpaths = []
    krona_log_fpath = os.path.join(krona_res_dirpath, 'krona.log')
    krona_err_fpath = os.path.join(krona_res_dirpath, 'krona.err')
    # Start with empty logs; every Krona run appends to them.
    open(krona_log_fpath, 'w').close()
    open(krona_err_fpath, 'w').close()
    for name in assemblies:
        krona_fpath = os.path.join(krona_res_dirpath, name + '_taxonomy_chart.html')
        krona_txt_fpath = os.path.join(krona_res_dirpath, name + krona_txt_ext)
        with open(krona_log_fpath, 'a') as krona_log_file:
            with open(krona_err_fpath, 'a') as krona_err_file:
                return_code = qutils.call_subprocess(
                    ['perl', '-I', krona_dirpath + '/lib', krona_dirpath + '/scripts/ImportText.pl',
                     krona_txt_fpath, '-o', krona_fpath],
                    stdout=krona_log_file, stderr=krona_err_file)
        if return_code != 0:
            meta_log.warning('Error occurred while Krona was processing assembly ' + name +
                             '. See Krona error log for details: %s' % krona_err_fpath)
        else:
            krona_fpaths.append(os.path.join(qconfig.krona_dirname, name + '_taxonomy_chart.html'))
            meta_log.main_info(' Krona chart for ' + name + ' is saved to ' + krona_fpath)
        if not qconfig.debug:
            os.remove(krona_txt_fpath)

    if len(krona_fpaths) > 1:
        name = 'summary'
        krona_fpath = os.path.join(krona_res_dirpath, name + '_taxonomy_chart.html')
        with open(krona_log_fpath, 'a') as krona_log_file:
            with open(krona_err_fpath, 'a') as krona_err_file:
                return_code = qutils.call_subprocess(
                    ['perl', '-I', krona_dirpath + '/lib', krona_dirpath + '/scripts/ImportText.pl',
                     krona_common_fpath, '-o', krona_fpath],
                    stdout=krona_log_file, stderr=krona_err_file)
        if return_code != 0:
            meta_log.warning('Error occurred while Krona was building summary chart. '
                             'See Krona error log for details: %s' % krona_err_fpath)
        else:
            meta_log.main_info(' Summary Krona chart is saved to ' + krona_fpath)
            krona_fpaths.append(os.path.join(qconfig.krona_dirname, name + '_taxonomy_chart.html'))  # extra fpath!
    if not qconfig.debug:
        os.remove(krona_common_fpath)
    save_krona_paths(results_dirpath, krona_fpaths, assemblies)
def bwa_index(ref_fpath, err_path, logger):
    """Build the BWA index of a reference unless ``<ref_fpath>.bwt`` already exists.

    Args:
        ref_fpath: reference FASTA; also used as the index prefix (``-p``).
        err_path: file that receives (appended) both stdout and stderr of bwa.
        logger: logger forwarded to ``qutils.call_subprocess``.
    """
    cmd = [bwa_fpath('bwa'), 'index', '-p', ref_fpath]
    if getsize(ref_fpath) > 2 * 1024 ** 3:  # if reference size bigger than 2GB
        cmd += ['-a', 'bwtsw']
    # Options go before the positional FASTA argument so that they are parsed
    # even where option parsing stops at the first non-option argument.
    cmd.append(ref_fpath)
    if not is_non_empty_file(ref_fpath + '.bwt'):
        with open(err_path, 'a') as err_file:
            qutils.call_subprocess(cmd, stdout=err_file, stderr=err_file, logger=logger)
def calculate_genome_cov(in_fpath, out_fpath, chr_len_fpath, err_fpath, logger, print_all_positions=True):
    """Run ``bedtools genomecov`` and store its output in ``out_fpath``.

    Args:
        in_fpath: input intervals; passed with ``-ibam`` when the name ends
            with ``.bam``, otherwise with ``-i``.
        out_fpath: file that is overwritten with the tool's stdout.
        chr_len_fpath: genome file passed as ``-g``.
        err_fpath: file that receives (appended) the tool's stderr.
        logger: logger forwarded to ``qutils.call_subprocess``.
        print_all_positions: when true, ``-bga`` is added to the command line.
    """
    input_flag = '-ibam' if in_fpath.endswith('.bam') else '-i'
    cmd = [bedtools_fpath('bedtools'), 'genomecov', input_flag, in_fpath, '-g', chr_len_fpath]
    if print_all_positions:
        cmd += ['-bga']
    # Close the redirect targets once bedtools has finished.
    with open(out_fpath, 'w') as out_file:
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(cmd, stdout=out_file, stderr=err_file, logger=logger)