def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           error_mode, output_progress):
    """
    Iteratively polish an assembly.

    Every one of the `num_iters` rounds aligns `read_seqs` against the
    current assembly, separates the alignment into bubbles, runs the
    polishing binary on them and writes "polished_<round>.fasta" into
    `work_dir`. The next round starts from that freshly written file.

    After the last round a tab-separated "contigs_stats.txt"
    (seq_name / length / coverage) is written into `work_dir`, using the
    contig lengths and coverage values produced by the final round.

    Progress messages are sent to logger.info when `output_progress` is
    true and to logger.debug otherwise. `error_mode` selects the entry of
    cfg.vals["err_modes"] that names the "subs_matrix" and "hopo_matrix"
    files, and is also forwarded to the alignment and bubble steps.

    The function does not return a value.
    """
    if output_progress:
        report = logger.info
    else:
        report = logger.debug

    # Both matrices are stored relative to the package root
    pkg_root = cfg.vals["pkg_root"]
    mode_params = cfg.vals["err_modes"][error_mode]
    subs_matrix = os.path.join(pkg_root, mode_params["subs_matrix"])
    hopo_matrix = os.path.join(pkg_root, mode_params["hopo_matrix"])

    current_assembly = contig_seqs
    contig_lengths = None
    for round_no in xrange(1, num_iters + 1):
        report("Polishing genome ({0}/{1})".format(round_no, num_iters))

        # Per-round intermediate and output files
        alignment_file = os.path.join(work_dir,
                                      "minimap_{0}.sam".format(round_no))
        bubbles_file = os.path.join(work_dir,
                                    "bubbles_{0}.fasta".format(round_no))
        consensus_out = os.path.join(work_dir,
                                     "consensus_{0}.fasta".format(round_no))
        polished_file = os.path.join(work_dir,
                                     "polished_{0}.fasta".format(round_no))

        report("Running minimap2")
        make_alignment(current_assembly, read_seqs, num_threads,
                       work_dir, error_mode, alignment_file)

        report("Separating alignment into bubbles")
        contigs_info = get_contigs_info(current_assembly)
        bubbles_result = make_bubbles(alignment_file, contigs_info,
                                      current_assembly, error_mode,
                                      num_threads, bubbles_file)
        coverage_stats, mean_aln_error = bubbles_result
        report("Alignment error rate: {0}".format(mean_aln_error))

        report("Correcting bubbles")
        _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix,
                        consensus_out, num_threads)
        polished_fasta, contig_lengths = _compose_sequence([consensus_out])
        fp.write_fasta_dict(polished_fasta, polished_file)

        # The polished output becomes the input of the next round
        current_assembly = polished_file

    stats_file = os.path.join(work_dir, "contigs_stats.txt")
    with open(stats_file, "w") as stats_out:
        stats_out.write("seq_name\tlength\tcoverage\n")
        for ctg_id in contig_lengths:
            row = "{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id],
                                           coverage_stats[ctg_id])
            stats_out.write(row)
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads, error_mode,
           output_progress):
    """
    High-level polisher interface.

    Runs `num_iters` polishing rounds. Each round splits the current assembly
    into chunks, aligns `read_seqs` against the chunks, separates the
    alignment into bubbles, runs the polishing binary and merges the polished
    chunks into "polished_<round>.fasta" inside `work_dir`. That file is the
    input of the next round. Per-round intermediate files (chunks, alignment,
    bubbles, consensus) are removed once the round has completed.

    If a round produces an empty bubbles file, polishing stops immediately:
    a header-only stats file and an empty polished file for that round are
    written and their paths returned.

    Parameters:
        contig_seqs     -- path of the assembly to polish
        read_seqs       -- reads, forwarded unchanged to make_alignment
        work_dir        -- directory receiving intermediate and output files
        num_iters       -- number of polishing rounds
        num_threads     -- forwarded to the alignment, bubble and polish steps
        error_mode      -- key into cfg.vals["err_modes"]; also forwarded to
                           the alignment and bubble steps
        output_progress -- when false, the module logger is disabled for the
                           duration of the call

    Returns a tuple (path of the polished assembly, path of
    "contigs_stats.txt").

    NOTE(review): with num_iters == 0 the lengths/coverage handed to
    merge_chunks are still None -- confirm callers always request at least
    one round.
    """
    # Silence the logger for this call if progress output is not wanted.
    # The previous state is restored in the `finally` clause so that an
    # exception in any step cannot leave the logger disabled for good.
    logger_state = logger.disabled
    if not output_progress:
        logger.disabled = True

    try:
        subs_matrix = os.path.join(
            cfg.vals["pkg_root"],
            cfg.vals["err_modes"][error_mode]["subs_matrix"])
        hopo_matrix = os.path.join(
            cfg.vals["pkg_root"],
            cfg.vals["err_modes"][error_mode]["hopo_matrix"])
        stats_file = os.path.join(work_dir, "contigs_stats.txt")

        prev_assembly = contig_seqs
        contig_lengths = None
        coverage_stats = None
        for i in range(num_iters):
            logger.info("Polishing genome ({0}/{1})".format(i + 1, num_iters))

            #split into 1Mb chunks to reduce RAM usage
            #slightly vary chunk size between iterations
            chunk_size = 1000000 - (i % 2) * 100000
            chunks_file = os.path.join(work_dir,
                                       "chunks_{0}.fasta".format(i + 1))
            chunks = split_into_chunks(fp.read_sequence_dict(prev_assembly),
                                       chunk_size)
            fp.write_fasta_dict(chunks, chunks_file)

            logger.info("Running minimap2")
            alignment_file = os.path.join(work_dir,
                                          "minimap_{0}.sam".format(i + 1))
            make_alignment(chunks_file, read_seqs, num_threads,
                           work_dir, error_mode, alignment_file,
                           reference_mode=True, sam_output=True)

            logger.info("Separating alignment into bubbles")
            contigs_info = get_contigs_info(chunks_file)
            bubbles_file = os.path.join(work_dir,
                                        "bubbles_{0}.fasta".format(i + 1))
            coverage_stats, mean_aln_error = \
                make_bubbles(alignment_file, contigs_info, chunks_file,
                             error_mode, num_threads, bubbles_file)

            logger.info("Alignment error rate: {0}".format(mean_aln_error))
            consensus_out = os.path.join(work_dir,
                                         "consensus_{0}.fasta".format(i + 1))
            polished_file = os.path.join(work_dir,
                                         "polished_{0}.fasta".format(i + 1))
            if os.path.getsize(bubbles_file) == 0:
                logger.info("No reads were aligned during polishing")
                with open(stats_file, "w") as f:
                    f.write("#seq_name\tlength\tcoverage\n")
                # Create (or truncate) the polished file so that the
                # returned path exists; the handle is closed right away.
                with open(polished_file, "w"):
                    pass
                return polished_file, stats_file

            logger.info("Correcting bubbles")
            _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix,
                            consensus_out, num_threads, output_progress)
            polished_fasta, polished_lengths = _compose_sequence(consensus_out)
            merged_chunks = merge_chunks(polished_fasta)
            fp.write_fasta_dict(merged_chunks, polished_file)

            #Cleanup
            os.remove(chunks_file)
            os.remove(bubbles_file)
            os.remove(consensus_out)
            os.remove(alignment_file)

            contig_lengths = polished_lengths
            prev_assembly = polished_file

        #merge information from chunks: lengths are folded with sum(),
        #coverage values with sum(values) / len(values)
        contig_lengths = merge_chunks(contig_lengths, fold_function=sum)
        coverage_stats = merge_chunks(
            coverage_stats,
            fold_function=lambda values: sum(values) / len(values))

        with open(stats_file, "w") as f:
            f.write("#seq_name\tlength\tcoverage\n")
            for ctg_id in contig_lengths:
                f.write("{0}\t{1}\t{2}\n".format(ctg_id,
                                                 contig_lengths[ctg_id],
                                                 coverage_stats[ctg_id]))

        return prev_assembly, stats_file
    finally:
        if not output_progress:
            logger.disabled = logger_state
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads, read_platform,
           read_type, output_progress):
    """
    High-level polisher interface.

    Runs `num_iters` polishing rounds. Each round obtains an alignment of the
    reads against the current assembly, separates it into bubbles, runs the
    polishing binary and writes "polished_<round>.fasta" into `work_dir`.
    That file is the input of the next round. The per-round bubbles and
    consensus files are removed once the round has completed, as is the
    alignment file when it was generated here.

    If the first entry of `read_seqs` ends with "bam", that file is used
    directly as the alignment and minimap2 is not run.

    If a round produces an empty bubbles file, polishing stops immediately:
    a header-only stats file and an empty polished file for that round are
    written and their paths returned.

    Parameters:
        contig_seqs     -- path of the assembly to polish
        read_seqs       -- non-empty sequence of input paths; element 0 is
                           inspected for a "bam" suffix
        work_dir        -- directory receiving intermediate and output files
        num_iters       -- number of polishing rounds
        num_threads     -- forwarded to the alignment, bubble and polish steps
        read_platform   -- key into cfg.vals["err_modes"]; also forwarded to
                           the alignment and bubble steps
        read_type       -- the "hopo_enabled" setting is only honoured when
                           this equals "raw"
        output_progress -- when false, the module logger is disabled for the
                           duration of the call

    Returns a tuple (path of the polished assembly, path of
    "contigs_stats.txt").

    With num_iters == 0 no lengths are collected (`contig_lengths` stays
    None) and writing the stats rows raises TypeError.
    """
    # Silence the logger for this call if progress output is not wanted.
    # The previous state is restored in the `finally` clause so that an
    # exception in any step cannot leave the logger disabled for good.
    logger_state = logger.disabled
    if not output_progress:
        logger.disabled = True

    try:
        subs_matrix = os.path.join(
            cfg.vals["pkg_root"],
            cfg.vals["err_modes"][read_platform]["subs_matrix"])
        hopo_matrix = os.path.join(
            cfg.vals["pkg_root"],
            cfg.vals["err_modes"][read_platform]["hopo_matrix"])
        use_hopo = cfg.vals["err_modes"][read_platform]["hopo_enabled"]
        use_hopo = use_hopo and (read_type == "raw")
        stats_file = os.path.join(work_dir, "contigs_stats.txt")

        bam_input = read_seqs[0].endswith("bam")

        prev_assembly = contig_seqs
        contig_lengths = None
        coverage_stats = None
        for i in range(num_iters):
            logger.info("Polishing genome (%d/%d)", i + 1, num_iters)

            if bam_input:
                # NOTE(review): the same user-supplied BAM is reused in every
                # round although prev_assembly changes after the first one --
                # confirm callers limit num_iters to 1 for BAM input.
                logger.info("Polishing with provided bam")
                alignment_file = read_seqs[0]
            else:
                logger.info("Running minimap2")
                # NOTE(review): the file is named ".bam" while sam_output=True
                # is passed -- verify against make_alignment.
                alignment_file = os.path.join(work_dir,
                                              "minimap_{0}.bam".format(i + 1))
                make_alignment(prev_assembly, read_seqs, num_threads,
                               work_dir, read_platform, alignment_file,
                               reference_mode=True, sam_output=True)

            logger.info("Separating alignment into bubbles")
            contigs_info = get_contigs_info(prev_assembly)
            bubbles_file = os.path.join(work_dir,
                                        "bubbles_{0}.fasta".format(i + 1))
            coverage_stats, mean_aln_error = \
                make_bubbles(alignment_file, contigs_info, prev_assembly,
                             read_platform, num_threads, bubbles_file)

            logger.info("Alignment error rate: %f", mean_aln_error)
            consensus_out = os.path.join(work_dir,
                                         "consensus_{0}.fasta".format(i + 1))
            polished_file = os.path.join(work_dir,
                                         "polished_{0}.fasta".format(i + 1))
            if os.path.getsize(bubbles_file) == 0:
                logger.info("No reads were aligned during polishing")
                with open(stats_file, "w") as f:
                    f.write("#seq_name\tlength\tcoverage\n")
                # Create (or truncate) the polished file so that the
                # returned path exists; the handle is closed right away.
                with open(polished_file, "w"):
                    pass
                return polished_file, stats_file

            logger.info("Correcting bubbles")
            _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix,
                            consensus_out, num_threads, output_progress,
                            use_hopo)
            polished_fasta, polished_lengths = _compose_sequence(consensus_out)
            fp.write_fasta_dict(polished_fasta, polished_file)

            #Cleanup
            os.remove(bubbles_file)
            os.remove(consensus_out)
            # A user-supplied BAM is an input, not an intermediate: keep it.
            if not bam_input:
                os.remove(alignment_file)

            contig_lengths = polished_lengths
            prev_assembly = polished_file

        with open(stats_file, "w") as f:
            f.write("#seq_name\tlength\tcoverage\n")
            for ctg_id in contig_lengths:
                f.write("{0}\t{1}\t{2}\n".format(ctg_id,
                                                 contig_lengths[ctg_id],
                                                 coverage_stats[ctg_id]))

        return prev_assembly, stats_file
    finally:
        if not output_progress:
            logger.disabled = logger_state