def get_consensus(alignment_path, contigs_path, contigs_info, num_proc, platform):
    """
    Main function
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       max_coverage=cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in range(num_proc):
        threads.append(multiprocessing.Process(target=_thread_worker,
                                               args=(aln_reader, contigs_info, platform,
                                                     results_queue, error_queue)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
            if t.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if t.exitcode != 0:
                raise Exception("One of the processes exited with code: {0}"
                                .format(t.exitcode))
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()
        raise

    if not error_queue.empty():
        raise error_queue.get()
    aln_reader.close()

    out_fasta = {}
    total_aln_errors = []
    while not results_queue.empty():
        ctg_id, ctg_seq, aln_errors = results_queue.get()
        total_aln_errors.extend(aln_errors)
        if len(ctg_seq) > 0:
            out_fasta[ctg_id] = ctg_seq

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.info("Alignment error rate: %f", mean_aln_error)

    return out_fasta
def make_bubbles(alignment_path, contigs_info, contigs_path,
                 err_mode, num_proc, bubbles_out):
    """
    The main function: takes an alignment and returns bubbles
    """
    CHUNK_SIZE = 1000000
    contigs_fasta = fp.read_sequence_dict(contigs_path)
    aln_reader = SynchronizedSamReader(alignment_path, contigs_fasta,
                                       cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    chunk_feeder = SynchonizedChunkManager(contigs_fasta, chunk_size=CHUNK_SIZE)

    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()
    bubbles_out_lock = multiprocessing.Lock()
    bubbles_out_handle = open(bubbles_out, "w")

    process_in_parallel(_thread_worker,
                        (aln_reader, chunk_feeder, contigs_info, err_mode,
                         results_queue, error_queue,
                         bubbles_out_handle, bubbles_out_lock), num_proc)

    if not error_queue.empty():
        raise error_queue.get()

    #logging
    total_bubbles = 0
    total_long_bubbles = 0
    total_long_branches = 0
    total_empty = 0
    total_aln_errors = []
    coverage_stats = defaultdict(list)

    while not results_queue.empty():
        (ctg_id, num_bubbles, num_long_bubbles,
         num_empty, num_long_branch,
         aln_errors, mean_coverage) = results_queue.get()
        total_long_bubbles += num_long_bubbles
        total_long_branches += num_long_branch
        total_empty += num_empty
        total_aln_errors.extend(aln_errors)
        total_bubbles += num_bubbles
        coverage_stats[ctg_id].append(mean_coverage)

    for ctg in coverage_stats:
        coverage_stats[ctg] = int(sum(coverage_stats[ctg]) /
                                  len(coverage_stats[ctg]))

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)

    logger.debug("Generated %d bubbles", total_bubbles)
    logger.debug("Split %d long bubbles", total_long_bubbles)
    logger.debug("Skipped %d empty bubbles", total_empty)
    logger.debug("Skipped %d bubbles with long branches", total_long_branches)

    return coverage_stats, mean_aln_error
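# A minimal usage sketch for make_bubbles (hedged: the file paths below are
# hypothetical placeholders, contigs_info is assumed to come from the assembly
# stage as elsewhere in the pipeline, and "nano" is only an illustrative
# error-mode string). The call writes bubbles to bubbles_out and returns
# per-contig coverage together with the mean alignment error:
#
#   coverage_stats, aln_error = make_bubbles("reads_to_draft.bam", contigs_info,
#                                            "draft_contigs.fasta", "nano",
#                                            num_proc=8,
#                                            bubbles_out="bubbles.fasta")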
def get_consensus(alignment_path, contigs_path, contigs_info, num_proc, platform):
    """
    Main function
    """
    CHUNK_SIZE = 1000000

    contigs_fasta = fp.read_sequence_dict(contigs_path)
    mp_manager = multiprocessing.Manager()
    aln_reader = SynchronizedSamReader(alignment_path, contigs_fasta, mp_manager,
                                       max_coverage=cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    chunk_feeder = SynchonizedChunkManager(contigs_fasta, mp_manager, CHUNK_SIZE)

    #manager = multiprocessing.Manager()
    results_queue = mp_manager.Queue()
    error_queue = mp_manager.Queue()

    process_in_parallel(_thread_worker,
                        (aln_reader, chunk_feeder, platform,
                         results_queue, error_queue), num_proc)

    if not error_queue.empty():
        raise error_queue.get()

    chunk_consensus = defaultdict(list)
    total_aln_errors = []
    while not results_queue.empty():
        ctg_id, region_start, ctg_seq, aln_errors = results_queue.get()
        total_aln_errors.extend(aln_errors)
        if len(ctg_seq) > 0:
            chunk_consensus[ctg_id].append((region_start, ctg_seq))

    out_fasta = {}
    for ctg in chunk_consensus:
        sorted_chunks = [x[1] for x in sorted(chunk_consensus[ctg],
                                              key=lambda p: p[0])]
        out_fasta[ctg] = "".join(sorted_chunks)

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.info("Alignment error rate: %f", mean_aln_error)

    return out_fasta
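# A hedged usage sketch for get_consensus: the paths are hypothetical
# placeholders, and contigs_info/platform are assumed to follow the same
# conventions as the rest of the polishing pipeline. The returned dict maps
# contig ids to consensus sequences stitched from the per-chunk results:
#
#   consensus = get_consensus("reads_to_draft.bam", "draft_contigs.fasta",
#                             contigs_info, num_proc=8, platform="nano")
#   fp.write_fasta_dict(consensus, "consensus.fasta")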
def make_bubbles(alignment_path, contigs_info, contigs_path,
                 err_mode, num_proc, bubbles_out):
    """
    The main function: takes an alignment and returns bubbles
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    bubbles_out_lock = multiprocessing.Lock()
    bubbles_out_handle = open(bubbles_out, "w")
    for _ in range(num_proc):
        threads.append(multiprocessing.Process(target=_thread_worker,
                                               args=(aln_reader, contigs_info, err_mode,
                                                     results_queue, error_queue,
                                                     bubbles_out_handle,
                                                     bubbles_out_lock)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
            if t.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if t.exitcode != 0:
                raise Exception("One of the processes exited with code: {0}"
                                .format(t.exitcode))
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()
        raise

    if not error_queue.empty():
        raise error_queue.get()
    aln_reader.close()

    total_bubbles = 0
    total_long_bubbles = 0
    total_long_branches = 0
    total_empty = 0
    total_aln_errors = []
    coverage_stats = {}

    while not results_queue.empty():
        (ctg_id, num_bubbles, num_long_bubbles,
         num_empty, num_long_branch,
         aln_errors, mean_coverage) = results_queue.get()
        total_long_bubbles += num_long_bubbles
        total_long_branches += num_long_branch
        total_empty += num_empty
        total_aln_errors.extend(aln_errors)
        total_bubbles += num_bubbles
        coverage_stats[ctg_id] = mean_coverage

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)

    logger.debug("Generated %d bubbles", total_bubbles)
    logger.debug("Split %d long bubbles", total_long_bubbles)
    logger.debug("Skipped %d empty bubbles", total_empty)
    logger.debug("Skipped %d bubbles with long branches", total_long_branches)

    return coverage_stats, mean_aln_error
def generate_polished_edges(edges_file, gfa_file, polished_contigs, work_dir,
                            error_mode, polished_stats, num_threads):
    """
    Generate polished graph edge sequences by extracting them from
    polished contigs
    """
    logger.debug("Generating polished GFA")

    edges_new_coverage = {}
    with open(polished_stats, "r") as f:
        for line in f:
            if line.startswith("#"):
                continue
            ctg, _len, coverage = line.strip().split()
            ctg_id = ctg.split("_")[1]
            edges_new_coverage[ctg_id] = int(coverage)

    alignment_file = os.path.join(work_dir, "edges_aln.bam")
    polished_dict = fp.read_sequence_dict(polished_contigs)
    make_alignment(polished_contigs, [edges_file], num_threads,
                   work_dir, error_mode, alignment_file,
                   reference_mode=True, sam_output=True)
    aln_reader = SynchronizedSamReader(alignment_file, polished_dict,
                                       multiprocessing.Manager(),
                                       cfg.vals["max_read_coverage"])
    aln_by_edge = defaultdict(list)

    #getting one best alignment for each contig
    #for ctg in polished_dict:
    #    ctg_aln = aln_reader.get_alignments(ctg)
    for aln in aln_reader.get_all_alignments():
        aln_by_edge[aln.qry_id].append(aln)
    #logger.debug("Bam parsing done")

    MIN_CONTAINMENT = 0.9
    updated_seqs = 0
    edges_dict = fp.read_sequence_dict(edges_file)
    for edge in edges_dict:
        if edge in aln_by_edge:
            aln_by_edge[edge].sort(key=lambda a: a.qry_end - a.qry_start,
                                   reverse=True)
            main_aln = aln_by_edge[edge][0]
            map_start = main_aln.trg_start
            map_end = main_aln.trg_end
            for aln in aln_by_edge[edge]:
                if aln.trg_id == main_aln.trg_id and aln.trg_sign == main_aln.trg_sign:
                    map_start = min(map_start, aln.trg_start)
                    map_end = max(map_end, aln.trg_end)

            new_seq = polished_dict[main_aln.trg_id][map_start:map_end]
            if main_aln.qry_sign == "-":
                new_seq = fp.reverse_complement(new_seq)

            #print(edge, main_aln.qry_len, len(new_seq), main_aln.qry_start, main_aln.qry_end)
            if len(new_seq) / aln.qry_len > MIN_CONTAINMENT:
                edges_dict[edge] = new_seq
                updated_seqs += 1

    #writes fasta file with polished edges
    #edges_polished = os.path.join(work_dir, "polished_edges.fasta")
    #fp.write_fasta_dict(edges_dict, edges_polished)

    #writes gfa file with polished edges
    with open(os.path.join(work_dir, "polished_edges.gfa"), "w") as gfa_polished, \
            open(gfa_file, "r") as gfa_in:
        for line in gfa_in:
            if line.startswith("S"):
                seq_id = line.split()[1]
                coverage_tag = line.split()[3]
                seq_num = seq_id.split("_")[1]
                if seq_num in edges_new_coverage:
                    #logger.info("from {0} to {1}".format(coverage_tag, edges_new_coverage[seq_num]))
                    coverage_tag = "dp:i:{0}".format(edges_new_coverage[seq_num])
                gfa_polished.write("S\t{0}\t{1}\t{2}\n"
                                   .format(seq_id, edges_dict[seq_id], coverage_tag))
            else:
                gfa_polished.write(line)

    logger.debug("%d sequences remained unpolished",
                 len(edges_dict) - updated_seqs)
    os.remove(alignment_file)
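# A hedged usage sketch for generate_polished_edges: all paths below are
# hypothetical placeholders (the stats file is assumed to be the tab-separated
# contig/length/coverage table produced by the polishing stage). The call
# aligns the original edge sequences against the polished contigs and writes
# work_dir/polished_edges.gfa with updated sequences and dp coverage tags:
#
#   generate_polished_edges("edges.fasta", "assembly_graph.gfa",
#                           "polished_contigs.fasta", work_dir="polishing",
#                           error_mode="nano", polished_stats="contigs_stats.txt",
#                           num_threads=8)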
def find_divergence(alignment_path, contigs_path, contigs_info,
                    frequency_path, positions_path, div_sum_path,
                    min_aln_rate, platform, num_proc,
                    sub_thresh, del_thresh, ins_thresh):
    """
    Main function: takes in an alignment and finds the divergent positions
    """
    if not os.path.isfile(alignment_path) or not os.path.isfile(contigs_path):
        ctg_profile = []
        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join(["Total_positions_{0}_".format(len(positions["total"])),
                                "with_thresholds_sub_{0}".format(sub_thresh),
                                "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)])
        sub_header = "".join(["Sub_positions_{0}_".format(len(positions["sub"])),
                              "with_threshold_sub_{0}".format(sub_thresh)])
        del_header = "".join(["Del_positions_{0}_".format(len(positions["del"])),
                              "with_threshold_del_{0}".format(del_thresh)])
        ins_header = "".join(["Ins_positions_{0}_".format(len(positions["ins"])),
                              "with_threshold_ins_{0}".format(ins_thresh)])
        _write_positions(positions_path, positions, total_header,
                         sub_header, del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)
        return

    contigs_fasta = fp.read_sequence_dict(contigs_path)
    aln_reader = SynchronizedSamReader(alignment_path, contigs_fasta,
                                       config.vals["max_read_coverage"])
    chunk_feeder = SynchonizedChunkManager(contigs_fasta)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    process_in_parallel(_thread_worker,
                        (aln_reader, chunk_feeder, contigs_info, platform,
                         results_queue, error_queue), num_proc)

    if not error_queue.empty():
        raise error_queue.get()

    total_aln_errors = []
    while not results_queue.empty():
        _, ctg_profile, aln_errors = results_queue.get()

        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join(["Total_positions_{0}_".format(len(positions["total"])),
                                "with_thresholds_sub_{0}".format(sub_thresh),
                                "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)])
        sub_header = "".join(["Sub_positions_{0}_".format(len(positions["sub"])),
                              "with_threshold_sub_{0}".format(sub_thresh)])
        del_header = "".join(["Del_positions_{0}_".format(len(positions["del"])),
                              "with_threshold_del_{0}".format(del_thresh)])
        ins_header = "".join(["Ins_positions_{0}_".format(len(positions["ins"])),
                              "with_threshold_ins_{0}".format(ins_thresh)])
        _write_positions(positions_path, positions, total_header,
                         sub_header, del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)
        logger.debug("Total positions: %d", len(positions["total"]))
        total_aln_errors.extend(aln_errors)

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.debug("Alignment error rate: %f", mean_aln_error)
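# A hedged usage sketch for find_divergence: the paths and threshold values
# below are illustrative placeholders only. The call writes the per-position
# frequency table, the divergent-position lists, and a windowed summary to the
# three output paths:
#
#   find_divergence("reads_to_ctg.bam", "contigs.fasta", contigs_info,
#                   "frequencies.csv", "positions.txt", "div_summary.txt",
#                   min_aln_rate=0.5, platform="nano", num_proc=8,
#                   sub_thresh=0.1, del_thresh=0.2, ins_thresh=0.3)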
def generate_polished_edges(edges_file, gfa_file, polished_contigs, work_dir,
                            error_mode, num_threads):
    """
    Generate polished graph edge sequences by extracting them from
    polished contigs
    """
    logger.debug("Generating polished GFA")

    alignment_file = os.path.join(work_dir, "edges_aln.sam")
    polished_dict = fp.read_sequence_dict(polished_contigs)
    make_alignment(polished_contigs, [edges_file], num_threads,
                   work_dir, error_mode, alignment_file,
                   reference_mode=True, sam_output=True)
    aln_reader = SynchronizedSamReader(alignment_file, polished_dict,
                                       cfg.vals["max_read_coverage"])
    aln_reader.init_reading()
    aln_by_edge = defaultdict(list)

    #getting one best alignment for each contig
    while not aln_reader.is_eof():
        _, ctg_aln = aln_reader.get_chunk()
        for aln in ctg_aln:
            aln_by_edge[aln.qry_id].append(aln)
    aln_reader.stop_reading()

    MIN_CONTAINMENT = 0.9
    updated_seqs = 0
    edges_dict = fp.read_sequence_dict(edges_file)
    for edge in edges_dict:
        if edge in aln_by_edge:
            main_aln = aln_by_edge[edge][0]
            map_start = main_aln.trg_start
            map_end = main_aln.trg_end
            for aln in aln_by_edge[edge]:
                if aln.trg_id == main_aln.trg_id and aln.trg_sign == main_aln.trg_sign:
                    map_start = min(map_start, aln.trg_start)
                    map_end = max(map_end, aln.trg_end)

            new_seq = polished_dict[main_aln.trg_id][map_start:map_end]
            if main_aln.qry_sign == "-":
                new_seq = fp.reverse_complement(new_seq)

            #print edge, main_aln.qry_len, len(new_seq), main_aln.qry_start, main_aln.qry_end
            if len(new_seq) / aln.qry_len > MIN_CONTAINMENT:
                edges_dict[edge] = new_seq
                updated_seqs += 1

    #writes fasta file with polished edges
    #edges_polished = os.path.join(work_dir, "polished_edges.fasta")
    #fp.write_fasta_dict(edges_dict, edges_polished)

    #writes gfa file with polished edges
    with open(os.path.join(work_dir, "polished_edges.gfa"), "w") as gfa_polished, \
            open(gfa_file, "r") as gfa_in:
        for line in gfa_in:
            if line.startswith("S"):
                seq_id = line.split()[1]
                coverage_tag = line.split()[3]
                gfa_polished.write("S\t{0}\t{1}\t{2}\n"
                                   .format(seq_id, edges_dict[seq_id], coverage_tag))
            else:
                gfa_polished.write(line)

    logger.debug("%d sequences remained unpolished",
                 len(edges_dict) - updated_seqs)
    os.remove(alignment_file)
def find_divergence(alignment_path, contigs_path, contigs_info,
                    frequency_path, positions_path, div_sum_path,
                    min_aln_rate, platform, num_proc,
                    sub_thresh, del_thresh, ins_thresh):
    """
    Main function: takes in an alignment and finds the divergent positions
    """
    if not os.path.isfile(alignment_path) or not os.path.isfile(contigs_path):
        ctg_profile = []
        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join(["Total_positions_{0}_".format(len(positions["total"])),
                                "with_thresholds_sub_{0}".format(sub_thresh),
                                "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)])
        sub_header = "".join(["Sub_positions_{0}_".format(len(positions["sub"])),
                              "with_threshold_sub_{0}".format(sub_thresh)])
        del_header = "".join(["Del_positions_{0}_".format(len(positions["del"])),
                              "with_threshold_del_{0}".format(del_thresh)])
        ins_header = "".join(["Ins_positions_{0}_".format(len(positions["ins"])),
                              "with_threshold_ins_{0}".format(ins_thresh)])
        _write_positions(positions_path, positions, total_header,
                         sub_header, del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)
        return

    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       config.vals["max_read_coverage"])
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in range(num_proc):
        threads.append(multiprocessing.Process(target=_thread_worker,
                                               args=(aln_reader, contigs_info,
                                                     platform, results_queue,
                                                     error_queue)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()

    if not error_queue.empty():
        raise error_queue.get()

    total_aln_errors = []
    while not results_queue.empty():
        _, ctg_profile, aln_errors = results_queue.get()

        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join(["Total_positions_{0}_".format(len(positions["total"])),
                                "with_thresholds_sub_{0}".format(sub_thresh),
                                "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)])
        sub_header = "".join(["Sub_positions_{0}_".format(len(positions["sub"])),
                              "with_threshold_sub_{0}".format(sub_thresh)])
        del_header = "".join(["Del_positions_{0}_".format(len(positions["del"])),
                              "with_threshold_del_{0}".format(del_thresh)])
        ins_header = "".join(["Ins_positions_{0}_".format(len(positions["ins"])),
                              "with_threshold_ins_{0}".format(ins_thresh)])
        _write_positions(positions_path, positions, total_header,
                         sub_header, del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)
        logger.debug("Total positions: %d", len(positions["total"]))
        total_aln_errors.extend(aln_errors)

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.debug("Alignment error rate: %f", mean_aln_error)