def _is_simple_kmer(profile, position):
    """
    Checks if the kmer with center at the given position is simple
    """
    SIMPLE_LEN = cfg.vals["simple_kmer_length"]

    extended_len = SIMPLE_LEN * 2
    nucl_str = [p.nucl for p in profile[position - extended_len // 2 :
                                        position + extended_len // 2]]

    #single nucleotide homopolymers
    for i in range(extended_len // 2 - SIMPLE_LEN // 2,
                   extended_len // 2 + SIMPLE_LEN // 2 - 1):
        if nucl_str[i] == nucl_str[i + 1]:
            return False

    #dinucleotide homopolymers
    for shift in [0, 1]:
        for i in range(SIMPLE_LEN - shift - 1):
            pos = extended_len // 2 - SIMPLE_LEN + shift + i * 2
            if (nucl_str[pos : pos + 2] == nucl_str[pos + 2 : pos + 4]):
                return False

    #trinucleotide homopolymers
    #for shift in [0, 1, 2]:
    #    for i in xrange(SIMPLE_LEN - shift - 1):
    #        pos = shift + i * 3
    #        if (nucl_str[pos : pos + 3] == nucl_str[pos + 3 : pos + 6]):
    #            #logger.debug("tri" + "".join(nucl_str))
    #            return False

    return True
def _run(args):
    """
    Runs the pipeline
    """
    logger.info("Starting Flye " + _version())
    logger.debug("Cmd: %s", " ".join(sys.argv))
    logger.debug("Python version: " + sys.version)

    if args.genome_size:
        _set_genome_size(args)

    for read_file in args.reads:
        if not os.path.exists(read_file):
            raise ResumeException("Can't open " + read_file)
        if " " in read_file:
            raise ResumeException("Path to reads contains spaces: " + read_file)

    save_file = os.path.join(args.out_dir, "params.json")
    jobs = _create_job_list(args, args.out_dir, args.log_file)

    if args.stop_after and args.stop_after not in [j.name for j in jobs]:
        raise ResumeException("Stop after: unknown stage '{0}'"
                              .format(args.stop_after))

    current_job = 0
    if args.resume or args.resume_from:
        if not os.path.exists(save_file):
            raise ResumeException("Can't find save file")

        logger.info("Resuming previous run")
        if args.resume_from:
            job_to_resume = args.resume_from
        else:
            job_to_resume = json.load(open(save_file, "r"))["stage_name"]

        can_resume = False
        for i in range(len(jobs)):
            if jobs[i].name == job_to_resume:
                jobs[i].load(save_file)
                current_job = i
                if not jobs[i - 1].completed(save_file):
                    raise ResumeException("Can't resume: stage '{0}' incomplete"
                                          .format(jobs[i - 1].name))
                can_resume = True
                break

        if not can_resume:
            raise ResumeException("Can't resume: stage '{0}' does not exist"
                                  .format(job_to_resume))

    for i in range(current_job, len(jobs)):
        jobs[i].save(save_file)
        jobs[i].run()
        if args.stop_after == jobs[i].name:
            if i + 1 < len(jobs):
                jobs[i + 1].save(save_file)
            logger.info("Pipeline stopped as requested by --stop-after")
            break
def trim_and_transpose(_self, alignments, region_start, region_end):
    """
    Transforms alignments so that they are strictly within the interval,
    and shifts the coordinates relative to this interval
    """
    MIN_ALN = 100

    trimmed_aln = []
    for aln in alignments:
        if aln.trg_start >= region_start and aln.trg_end <= region_end:
            trimmed_aln.append(copy(aln))
            continue

        #trimming from left
        new_qry_start = aln.qry_start
        new_trg_start = aln.trg_start
        left_offset = None
        for left_offset in range(len(aln.trg_seq)):
            if new_trg_start >= region_start:
                break
            if aln.trg_seq[left_offset] != "-":
                new_trg_start += 1
            if aln.qry_seq[left_offset] != "-":
                new_qry_start += 1

        #trimming from right
        new_qry_end = aln.qry_end
        new_trg_end = aln.trg_end
        right_offset = None
        for right_offset in range(len(aln.trg_seq)):
            if new_trg_end <= region_end:
                break
            if aln.trg_seq[-1 - right_offset] != "-":
                new_trg_end -= 1
            if aln.qry_seq[-1 - right_offset] != "-":
                new_qry_end -= 1

        #keep only sufficiently long trimmed alignments
        if new_trg_end - new_trg_start > MIN_ALN:
            new_qry_seq = aln.qry_seq[left_offset : len(aln.qry_seq) - right_offset]
            new_trg_seq = aln.trg_seq[left_offset : len(aln.trg_seq) - right_offset]
            trimmed_aln.append(aln._replace(qry_start=new_qry_start, qry_end=new_qry_end,
                                            trg_start=new_trg_start, trg_end=new_trg_end,
                                            qry_seq=new_qry_seq, trg_seq=new_trg_seq))

        #print("Aln trg", aln.trg_start, aln.trg_end, "qry", aln.qry_start, aln.qry_end)
        #print("Left offset", left_offset, "right offset", right_offset)
        #print("New aln", new_trg_start, new_trg_end, new_qry_start, new_qry_end)
        #print("")

    for i, aln in enumerate(trimmed_aln):
        trimmed_aln[i] = aln._replace(trg_start=aln.trg_start - region_start,
                                      trg_end=aln.trg_end - region_start,
                                      trg_len=region_end - region_start)

    #print(len(alignments), len(trimmed_aln))

    return trimmed_aln
def __init__(self, reference_fasta, multiproc_manager, chunk_size=None):
    #prepare list of chunks to read
    self.fetch_list = []
    self.chunk_size = chunk_size

    #will be shared between processes
    #self.shared_manager = multiprocessing.Manager()
    self.shared_num_jobs = multiprocessing.Value(ctypes.c_int, 0)
    self.shared_lock = multiproc_manager.Lock()
    self.shared_eof = multiprocessing.Value(ctypes.c_bool, False)

    for ctg_id in reference_fasta:
        ctg_len = len(reference_fasta[ctg_id])
        chunk_size = self.chunk_size if self.chunk_size is not None else ctg_len

        for i in range(0, max(ctg_len // chunk_size, 1)):
            reg_start = i * chunk_size
            reg_end = (i + 1) * chunk_size
            if ctg_len - reg_end < chunk_size:
                reg_end = ctg_len
            self.fetch_list.append(ContigRegion(ctg_id, reg_start, reg_end))
            #logger.debug("Region: {0} {1} {2}".format(ctg_id, reg_start, reg_end))

    if len(self.fetch_list) == 0:
        self.shared_eof.value = True
def _contig_profile(alignment, platform, genome_len):
    """
    Computes alignment profile
    """
    #max_aln_err = config.vals["err_modes"][platform]["max_aln_error"]
    aln_errors = []
    profile = [Profile() for _ in range(genome_len)]
    for aln in alignment:
        #if aln.err_rate > max_aln_err: continue
        aln_errors.append(aln.err_rate)

        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)
        #qry_seq = aln.qry_seq
        #trg_seq = aln.trg_seq

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            if trg_nuc == "-":
                trg_pos -= 1
            if trg_pos >= genome_len:
                trg_pos -= genome_len

            prof_elem = profile[trg_pos]
            if trg_nuc == "-":
                prof_elem.insertions[qry_nuc] += 1
            else:
                prof_elem.nucl = trg_nuc
                prof_elem.matches[qry_nuc] += 1

            trg_pos += 1

    return profile, aln_errors
def shift_gaps(seq_trg, seq_qry):
    """
    Shifts all ambiguous query gaps to the right
    """
    lst_trg, lst_qry = list("$" + seq_trg + "$"), list("$" + seq_qry + "$")
    is_gap = False
    gap_start = 0
    for i in range(len(lst_trg)):
        if is_gap and lst_qry[i] != "-":
            is_gap = False
            swap_left = gap_start - 1
            swap_right = i - 1

            while (swap_left > 0 and swap_right >= gap_start and
                   lst_qry[swap_left] == lst_trg[swap_right]):
                lst_qry[swap_left], lst_qry[swap_right] = \
                    lst_qry[swap_right], lst_qry[swap_left]
                swap_left -= 1
                swap_right -= 1

        if not is_gap and lst_qry[i] == "-":
            is_gap = True
            gap_start = i

    return "".join(lst_qry[1 : -1])
def _split_long_bubbles(bubbles):
    MAX_BUBBLE = cfg.vals["max_bubble_length"]
    #MAX_BUBBLE = 50
    #MAX_BRANCH = MAX_BUBBLE * 1.5
    new_bubbles = []
    long_branches = 0

    for bubble in bubbles:
        median_branch = sorted(bubble.branches, key=len)[len(bubble.branches) // 2]
        num_chunks = len(median_branch) // MAX_BUBBLE
        #if len(median_branch) > MAX_BRANCH:
        if num_chunks > 1:
            #logger.debug("Splitting: pos:{0} len:{1}".format(bubble.position, len(median_branch)))
            long_branches += 1

            for part_num in range(num_chunks):
                new_branches = []
                for b in bubble.branches:
                    chunk_len = len(b) // num_chunks
                    start = part_num * chunk_len
                    end = (part_num + 1) * chunk_len if part_num != num_chunks - 1 else len(b)
                    new_branches.append(b[start:end])

                new_bubbles.append(Bubble(bubble.contig_id, bubble.position))
                new_bubbles[-1].consensus = new_branches[0]
                new_bubbles[-1].branches = new_branches
                new_bubbles[-1].sub_position = part_num
        else:
            new_bubbles.append(bubble)

    return new_bubbles, long_branches
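#Illustrative sketch (not part of the pipeline): how the chunking arithmetic
#above behaves for a hypothetical branch of length 250 with MAX_BUBBLE = 100.
#num_chunks = 250 // 100 = 2, chunk_len = 250 // 2 = 125, and the last chunk
#absorbs the remainder, so the branch is split into [0:125] and [125:250].
def _example_split_branch(branch_len=250, max_bubble=100):
    num_chunks = branch_len // max_bubble
    chunk_len = branch_len // num_chunks
    bounds = []
    for part_num in range(num_chunks):
        start = part_num * chunk_len
        end = (part_num + 1) * chunk_len if part_num != num_chunks - 1 else branch_len
        bounds.append((start, end))
    return bounds  #[(0, 125), (125, 250)]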
def _aln_score(aln):
    #scores an alignment by how many of its windows still lack primary coverage;
    #relies on wnd_primary_cov, cov_threshold and WINDOW from the enclosing scope
    wnd_good = 0
    wnd_bad = 0
    for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW + 1):
        if wnd_primary_cov[i] < cov_threshold:
            wnd_good += 1
        else:
            wnd_bad += 1
    return wnd_good, wnd_bad
def get_consensus(alignment_path, contigs_path, contigs_info, num_proc, platform):
    """
    Main function
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       max_coverage=cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in range(num_proc):
        threads.append(multiprocessing.Process(target=_thread_worker,
                                               args=(aln_reader, contigs_info, platform,
                                                     results_queue, error_queue)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
            if t.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if t.exitcode != 0:
                raise Exception("One of the processes exited with code: {0}"
                                .format(t.exitcode))
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()
        raise

    if not error_queue.empty():
        raise error_queue.get()
    aln_reader.close()

    out_fasta = {}
    total_aln_errors = []
    while not results_queue.empty():
        ctg_id, ctg_seq, aln_errors = results_queue.get()
        total_aln_errors.extend(aln_errors)
        if len(ctg_seq) > 0:
            out_fasta[ctg_id] = ctg_seq

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.info("Alignment error rate: %f", mean_aln_error)

    return out_fasta
def write_fasta_dict(fasta_dict, filename):
    """
    Writes dictionary with fasta to file
    """
    with open(filename, "w") as f:
        for header in sorted(fasta_dict):
            f.write(">{0}\n".format(header))
            for i in range(0, len(fasta_dict[header]), 60):
                f.write(fasta_dict[header][i:i + 60] + "\n")
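#Usage sketch for write_fasta_dict (hypothetical file name and sequences):
#headers are written in sorted order and sequences are wrapped at 60 characters per line.
def _example_write_fasta(path="example_contigs.fasta"):
    write_fasta_dict({"contig_1": "ACGT" * 40, "contig_2": "TTGCA"}, path)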
def _get_partition(profile, err_mode):
    """
    Partitions genome into sub-alignments at solid regions / simple kmers
    """
    #logger.debug("Partitioning genome")
    SOLID_LEN = cfg.vals["solid_kmer_length"]
    SIMPLE_LEN = cfg.vals["simple_kmer_length"]
    MAX_BUBBLE = cfg.vals["max_bubble_length"]

    solid_flags = [False for _ in range(len(profile))]
    prof_pos = 0
    while prof_pos < len(profile) - SOLID_LEN:
        if _is_solid_kmer(profile, prof_pos, err_mode):
            for i in range(prof_pos, prof_pos + SOLID_LEN):
                solid_flags[i] = True
            prof_pos += SOLID_LEN
        else:
            prof_pos += 1

    partition = []
    prev_partition = SOLID_LEN

    long_bubbles = 0
    prof_pos = SOLID_LEN
    while prof_pos < len(profile) - SOLID_LEN:
        cur_partition = prof_pos + SIMPLE_LEN // 2
        landmark = (all(solid_flags[prof_pos : prof_pos + SIMPLE_LEN]) and
                    _is_simple_kmer(profile, cur_partition))

        if prof_pos - prev_partition > MAX_BUBBLE:
            long_bubbles += 1

        if landmark or prof_pos - prev_partition > MAX_BUBBLE:
            partition.append(cur_partition)
            prev_partition = cur_partition
            prof_pos += SOLID_LEN
        else:
            prof_pos += 1

    #logger.debug("Partitioned into {0} segments".format(len(partition) + 1))
    #logger.debug("Long bubbles: {0}".format(long_bubbles))

    return partition, long_bubbles
def unite_mapping_segments(segments):
    segments.sort(key=lambda segment: segment.start)
    united_segments = [segments[0]]

    for i in range(1, len(segments)):
        if segments[i].start <= united_segments[-1].end:
            if segments[i].end > united_segments[-1].end:
                united_segments[-1].end = segments[i].end
        else:
            united_segments.append(segments[i])

    return united_segments
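#Illustrative sketch: unite_mapping_segments merges overlapping intervals.
#_Segment below is a stand-in for the real mapping segment class; it only
#needs mutable .start and .end attributes.
class _Segment:
    def __init__(self, start, end):
        self.start = start
        self.end = end

def _example_unite_segments():
    merged = unite_mapping_segments([_Segment(10, 12), _Segment(1, 5), _Segment(3, 8)])
    #after sorting by start, (1, 5) and (3, 8) overlap and are united into (1, 8),
    #while (10, 12) stays separate, so the result spans are [(1, 8), (10, 12)]
    return [(s.start, s.end) for s in merged]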
def get_uniform_alignments(alignments, seq_len):
    """
    Leaves top alignments for each position within contig
    assuming uniform coverage distribution
    """
    def _get_median(lst):
        if not lst:
            raise ValueError("_get_median() arg is an empty sequence")
        sorted_list = sorted(lst)
        if len(lst) % 2 == 1:
            return sorted_list[len(lst) // 2]
        else:
            mid1 = sorted_list[(len(lst) // 2) - 1]
            mid2 = sorted_list[(len(lst) // 2)]
            return (mid1 + mid2) / 2

    WINDOW = 100
    MIN_COV = 10
    COV_RATE = 1.25

    #split contig into windows, get median read coverage over all windows and
    #determine the quality threshold cutoffs for each window
    wnd_primary_cov = [0 for _ in range(seq_len // WINDOW + 1)]
    wnd_aln_quality = [[] for _ in range(seq_len // WINDOW + 1)]
    wnd_qual_thresholds = [1.0 for _ in range(seq_len // WINDOW + 1)]
    for aln in alignments:
        for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW):
            if not aln.is_secondary:
                wnd_primary_cov[i] += 1
            wnd_aln_quality[i].append(aln.err_rate)

    #for each window, select top X alignments, where X is the median read coverage
    cov_threshold = max(int(COV_RATE * _get_median(wnd_primary_cov)), MIN_COV)
    for i in range(len(wnd_aln_quality)):
        if len(wnd_aln_quality[i]) > cov_threshold:
            wnd_qual_thresholds[i] = sorted(wnd_aln_quality[i])[cov_threshold]

    #for each alignment, count in how many windows it passes the threshold
    filtered_alignments = []
    total_sequence = 0
    filtered_sequence = 0
    for aln in alignments:
        good_windows = 0
        total_windows = aln.trg_end // WINDOW - aln.trg_start // WINDOW
        total_sequence += aln.trg_end - aln.trg_start

        for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW):
            if aln.err_rate <= wnd_qual_thresholds[i]:
                good_windows += 1

        if good_windows > total_windows // 2:
            filtered_alignments.append(aln)
            filtered_sequence += aln.trg_end - aln.trg_start

    #filtered_reads_rate = 1 - float(len(filtered_alignments)) / len(alignments)
    #filtered_seq_rate = 1 - float(filtered_sequence) / total_sequence
    #logger.debug("Filtered {0:7.2f}% reads, {1:7.2f}% sequence"
    #             .format(filtered_reads_rate * 100, filtered_seq_rate * 100))

    return filtered_alignments
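#Illustrative sketch of the per-window quality cutoff above (hypothetical error
#rates): if a window collected error rates [0.10, 0.02, 0.30, 0.05, 0.15] and the
#coverage threshold is 3, the cutoff becomes sorted(rates)[3] = 0.15, so alignments
#with an error rate above 0.15 count as failing that window.
def _example_window_threshold():
    window_err_rates = [0.10, 0.02, 0.30, 0.05, 0.15]
    cov_threshold = 3
    return sorted(window_err_rates)[cov_threshold]  #0.15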
def find_connected_components(graph):
    def dfs(start_vertex, connected_components_counter):
        dfs_stack = [start_vertex]
        used[start_vertex] = True
        while len(dfs_stack):
            vertex = dfs_stack.pop()
            connected_components[vertex] = connected_components_counter
            for neighbour in graph[vertex]:
                if not used[neighbour]:
                    dfs_stack.append(neighbour)
                    used[neighbour] = True

    n_vertices = len(graph)
    connected_components = [0 for _ in range(n_vertices)]
    connected_components_counter = 0
    used = [False for _ in range(n_vertices)]

    for i in range(n_vertices):
        if not used[i]:
            dfs(i, connected_components_counter)
            connected_components_counter += 1

    return connected_components, connected_components_counter
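#Illustrative sketch: the graph is an adjacency list indexed by vertex number.
#Vertices 0 and 1 are connected to each other, vertex 2 is isolated, so the
#expected result is components [0, 0, 1] and a component count of 2.
def _example_connected_components():
    graph = [[1], [0], []]
    components, n_components = find_connected_components(graph)
    return components, n_components  #([0, 0, 1], 2)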
def split_into_chunks(fasta_in, chunk_size, fasta_out):
    out_dict = {}
    for header, seq in fp.stream_sequence(fasta_in):
        #print len(seq)
        for i in range(0, max(len(seq) // chunk_size, 1)):
            chunk_hdr = "{0}$chunk_{1}".format(header, i)
            start = i * chunk_size
            end = (i + 1) * chunk_size
            if len(seq) - end < chunk_size:
                end = len(seq)

            #print(start, end)
            out_dict[chunk_hdr] = seq[start : end]

    fp.write_fasta_dict(out_dict, fasta_out)
def split_into_chunks(fasta_in, chunk_size):
    out_dict = {}
    for header, seq in iteritems(fasta_in):
        #print len(seq)
        for i in range(0, max(len(seq) // chunk_size, 1)):
            chunk_hdr = "{0}$chunk_{1}".format(header, i)
            start = i * chunk_size
            end = (i + 1) * chunk_size
            if len(seq) - end < chunk_size:
                end = len(seq)

            #print(start, end)
            out_dict[chunk_hdr] = seq[start : end]

    return out_dict
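#Illustrative sketch (hypothetical input): a 2500 bp sequence split with
#chunk_size=1000 yields two chunks, because the last chunk absorbs the
#remainder (len(seq) - end < chunk_size): "seq$chunk_0" covers [0:1000]
#and "seq$chunk_1" covers [1000:2500].
def _example_split_into_chunks():
    chunks = split_into_chunks({"seq": "A" * 2500}, 1000)
    return {hdr: len(s) for hdr, s in chunks.items()}  #{"seq$chunk_0": 1000, "seq$chunk_1": 1500}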
def _is_solid_kmer(profile, position, err_mode):
    """
    Checks if the kmer at given position is solid
    """
    MISSMATCH_RATE = cfg.vals["err_modes"][err_mode]["solid_missmatch"]
    INS_RATE = cfg.vals["err_modes"][err_mode]["solid_indel"]
    SOLID_LEN = cfg.vals["solid_kmer_length"]
    for i in range(position, position + SOLID_LEN):
        if profile[i].coverage == 0:
            return False
        local_missmatch = (profile[i].num_missmatch +
                           profile[i].num_deletions) / profile[i].coverage
        local_ins = profile[i].num_inserts / profile[i].coverage
        if local_missmatch > MISSMATCH_RATE or local_ins > INS_RATE:
            return False
    return True
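#Illustrative arithmetic for the per-position test above (hypothetical counts
#and thresholds): with coverage 20, one mismatch, one deletion and one insert,
#the local rates are (1 + 1) / 20 = 0.10 and 1 / 20 = 0.05; the position is
#"solid" only if both stay within the configured thresholds.
def _example_solid_position(coverage=20, num_missmatch=1, num_deletions=1, num_inserts=1,
                            missmatch_rate=0.2, ins_rate=0.1):
    local_missmatch = (num_missmatch + num_deletions) / coverage
    local_ins = num_inserts / coverage
    return local_missmatch <= missmatch_rate and local_ins <= ins_rate  #True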
def _contig_profile(alignment, platform):
    """
    Computes alignment profile
    """
    if not alignment:
        return []

    genome_len = alignment[0].trg_len

    aln_errors = []
    profile = [Profile() for _ in range(genome_len)]
    #max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
    for aln in alignment:
        #if aln.err_rate > max_aln_err: continue
        aln_errors.append(aln.err_rate)

        #after gap shifting it is possible that
        #two gaps are aligned against each other
        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            if trg_nuc == "-":
                trg_pos -= 1
            if trg_pos >= genome_len:
                trg_pos -= genome_len

            #total += 1
            prof_elem = profile[trg_pos]
            if trg_nuc == "-" and qry_nuc != "-":
                prof_elem.insertions[aln.qry_id] += qry_nuc
            else:
                prof_elem.nucl = trg_nuc
                prof_elem.matches[qry_nuc] += 1

            trg_pos += 1

    #print "len", genome_len, "median coverage", cov_threshold
    #print "total bases: ", total, "discarded bases: ", discarded
    #print "filtered", float(discarded) / total
    #print ""

    return profile, aln_errors
def _compute_profile(alignment, platform, genome_len):
    """
    Computes alignment profile
    """
    max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
    min_aln_len = cfg.vals["min_polish_aln_len"]
    aln_errors = []
    #filtered = 0
    profile = [ProfileInfo() for _ in range(genome_len)]
    for aln in alignment:
        if aln.err_rate > max_aln_err or len(aln.qry_seq) < min_aln_len:
            #filtered += 1
            continue

        aln_errors.append(aln.err_rate)

        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            if trg_nuc == "-":
                trg_pos -= 1
            if trg_pos >= genome_len:
                trg_pos -= genome_len

            prof_elem = profile[trg_pos]
            if trg_nuc == "-":
                prof_elem.num_inserts += 1
            else:
                prof_elem.nucl = trg_nuc
                prof_elem.coverage += 1

                if qry_nuc == "-":
                    prof_elem.num_deletions += 1
                elif trg_nuc != qry_nuc:
                    prof_elem.num_missmatch += 1

            trg_pos += 1

    #logger.debug("Filtered: {0} out of {1}".format(filtered, len(alignment)))

    return profile, aln_errors
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads, error_mode,
           output_progress):
    """
    High-level polisher interface
    """
    logger_state = logger.disabled
    if not output_progress:
        logger.disabled = True

    subs_matrix = os.path.join(cfg.vals["pkg_root"],
                               cfg.vals["err_modes"][error_mode]["subs_matrix"])
    hopo_matrix = os.path.join(cfg.vals["pkg_root"],
                               cfg.vals["err_modes"][error_mode]["hopo_matrix"])
    stats_file = os.path.join(work_dir, "contigs_stats.txt")

    prev_assembly = contig_seqs
    contig_lengths = None
    coverage_stats = None
    for i in range(num_iters):
        logger.info("Polishing genome (%d/%d)", i + 1, num_iters)

        #split into 1Mb chunks to reduce RAM usage
        #slightly vary chunk size between iterations
        CHUNK_SIZE = 1000000 - (i % 2) * 100000
        chunks_file = os.path.join(work_dir, "chunks_{0}.fasta".format(i + 1))
        chunks = split_into_chunks(fp.read_sequence_dict(prev_assembly), CHUNK_SIZE)
        fp.write_fasta_dict(chunks, chunks_file)

        ####
        logger.info("Running minimap2")
        alignment_file = os.path.join(work_dir, "minimap_{0}.sam".format(i + 1))
        make_alignment(chunks_file, read_seqs, num_threads,
                       work_dir, error_mode, alignment_file,
                       reference_mode=True, sam_output=True)

        #####
        logger.info("Separating alignment into bubbles")
        contigs_info = get_contigs_info(chunks_file)
        bubbles_file = os.path.join(work_dir, "bubbles_{0}.fasta".format(i + 1))
        coverage_stats, mean_aln_error = \
            make_bubbles(alignment_file, contigs_info, chunks_file,
                         error_mode, num_threads, bubbles_file)

        logger.info("Alignment error rate: %f", mean_aln_error)
        consensus_out = os.path.join(work_dir, "consensus_{0}.fasta".format(i + 1))
        polished_file = os.path.join(work_dir, "polished_{0}.fasta".format(i + 1))
        if os.path.getsize(bubbles_file) == 0:
            logger.info("No reads were aligned during polishing")
            if not output_progress:
                logger.disabled = logger_state
            open(stats_file, "w").write("#seq_name\tlength\tcoverage\n")
            open(polished_file, "w")
            return polished_file, stats_file

        #####
        logger.info("Correcting bubbles")
        _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix,
                        consensus_out, num_threads, output_progress)
        polished_fasta, polished_lengths = _compose_sequence(consensus_out)
        merged_chunks = merge_chunks(polished_fasta)
        fp.write_fasta_dict(merged_chunks, polished_file)

        #Cleanup
        os.remove(chunks_file)
        os.remove(bubbles_file)
        os.remove(consensus_out)
        os.remove(alignment_file)

        contig_lengths = polished_lengths
        prev_assembly = polished_file

    #merge information from chunks
    contig_lengths = merge_chunks(contig_lengths, fold_function=sum)
    coverage_stats = merge_chunks(coverage_stats,
                                  fold_function=lambda l: sum(l) // len(l))

    with open(stats_file, "w") as f:
        f.write("#seq_name\tlength\tcoverage\n")
        for ctg_id in contig_lengths:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id],
                                             coverage_stats[ctg_id]))

    if not output_progress:
        logger.disabled = logger_state

    return prev_assembly, stats_file
def _write_div_summary(div_sum_path, sum_header, positions,
                       seq_len, window_len):
    pos_list = sorted(positions["total"])
    av_div = 0.0
    if seq_len != 0:
        av_div = len(pos_list) / float(seq_len)

    position_gaps = [0 for _ in range(len(pos_list) + 1)]
    curr_pos = 0
    for i, p in enumerate(pos_list):
        position_gaps[i] = p - curr_pos
        curr_pos = p
    position_gaps[-1] = seq_len - curr_pos

    mean_position_gap = _mean(position_gaps)
    max_position_gap = max(position_gaps)

    window_len = 1000
    position_counts = [0 for _ in range(((seq_len - 1) // window_len) + 1)]
    window_divs = [0.0 for _ in range(((seq_len - 1) // window_len) + 1)]
    curr_p_i = 0
    for i in range(len(window_divs)):
        start = i * window_len
        end = (i + 1) * window_len - 1
        if i == len(window_divs) - 1:
            end = seq_len - 1

        curr_window_len = end - start + 1

        if curr_p_i < len(pos_list) and pos_list[curr_p_i] < start:
            raise PositionIOError('Problem with position indices')
        while curr_p_i < len(pos_list) and pos_list[curr_p_i] <= end:
            position_counts[i] += 1
            curr_p_i += 1

        window_divs[i] = 0.0
        if curr_window_len != 0:
            window_divs[i] = position_counts[i] / float(curr_window_len)

    mean_window_div = _mean(window_divs)
    median_window_div = _get_median(window_divs)
    min_window_div = min(window_divs)

    with open(div_sum_path, 'w') as f:
        f.write("{0}\n\n".format(sum_header))
        f.write("{0:33}\t{1}\n".format("Sequence Length:", seq_len))
        f.write("{0:33}\t{1:.4f}\n\n".format("Average Divergence:", av_div))
        f.write("{0:33}\t{1}\n".format("Total Substitution Positions:",
                                       len(positions["sub"])))
        f.write("{0:33}\t{1}\n".format("Total Deletion Positions:",
                                       len(positions["del"])))
        f.write("{0:33}\t{1}\n".format("Total Insertion Positions:",
                                       len(positions["ins"])))
        f.write("{0:33}\t{1}\n".format("Total Positions:",
                                       len(positions["total"])))
        mixed_count = (len(positions["sub"]) + len(positions["del"]) +
                       len(positions["ins"])) - len(positions["total"])
        f.write("{0:33}\t{1}\n\n".format("Mixed Positions:", mixed_count))
        f.write("{0:33}\t{1:.2f}\n".format("Mean Position Gap:", mean_position_gap))
        f.write("{0:33}\t{1}\n".format("Max Position Gap:", max_position_gap))
        f.write("{0:33}\t{1}\n".format("Window Length:", window_len))
        f.write("{0:33}\t{1:.5f}\n".format("Mean Window Divergence:", mean_window_div))
        f.write("{0:33}\t{1:.5f}\n".format("Median Window Divergence:", median_window_div))
        f.write("{0:33}\t{1:.5f}\n".format("Min Window Divergence:", min_window_div))
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           read_platform, read_type, output_progress):
    """
    High-level polisher interface
    """
    logger_state = logger.disabled
    if not output_progress:
        logger.disabled = True

    subs_matrix = os.path.join(cfg.vals["pkg_root"],
                               cfg.vals["err_modes"][read_platform]["subs_matrix"])
    hopo_matrix = os.path.join(cfg.vals["pkg_root"],
                               cfg.vals["err_modes"][read_platform]["hopo_matrix"])
    use_hopo = cfg.vals["err_modes"][read_platform]["hopo_enabled"]
    use_hopo = use_hopo and (read_type == "raw")
    stats_file = os.path.join(work_dir, "contigs_stats.txt")

    bam_input = read_seqs[0].endswith("bam")

    prev_assembly = contig_seqs
    contig_lengths = None
    coverage_stats = None
    for i in range(num_iters):
        logger.info("Polishing genome (%d/%d)", i + 1, num_iters)

        ####
        if not bam_input:
            logger.info("Running minimap2")
            alignment_file = os.path.join(work_dir, "minimap_{0}.bam".format(i + 1))
            make_alignment(prev_assembly, read_seqs, num_threads,
                           work_dir, read_platform, alignment_file,
                           reference_mode=True, sam_output=True)
        else:
            logger.info("Polishing with provided bam")
            alignment_file = read_seqs[0]

        #####
        logger.info("Separating alignment into bubbles")
        contigs_info = get_contigs_info(prev_assembly)
        bubbles_file = os.path.join(work_dir, "bubbles_{0}.fasta".format(i + 1))
        coverage_stats, mean_aln_error = \
            make_bubbles(alignment_file, contigs_info, prev_assembly,
                         read_platform, num_threads, bubbles_file)

        logger.info("Alignment error rate: %f", mean_aln_error)
        consensus_out = os.path.join(work_dir, "consensus_{0}.fasta".format(i + 1))
        polished_file = os.path.join(work_dir, "polished_{0}.fasta".format(i + 1))
        if os.path.getsize(bubbles_file) == 0:
            logger.info("No reads were aligned during polishing")
            if not output_progress:
                logger.disabled = logger_state
            open(stats_file, "w").write("#seq_name\tlength\tcoverage\n")
            open(polished_file, "w")
            return polished_file, stats_file

        #####
        logger.info("Correcting bubbles")
        _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix,
                        consensus_out, num_threads, output_progress, use_hopo)
        polished_fasta, polished_lengths = _compose_sequence(consensus_out)
        fp.write_fasta_dict(polished_fasta, polished_file)

        #Cleanup
        os.remove(bubbles_file)
        os.remove(consensus_out)
        if not bam_input:
            os.remove(alignment_file)

        contig_lengths = polished_lengths
        prev_assembly = polished_file

    with open(stats_file, "w") as f:
        f.write("#seq_name\tlength\tcoverage\n")
        for ctg_id in contig_lengths:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id],
                                             coverage_stats[ctg_id]))

    if not output_progress:
        logger.disabled = logger_state

    return prev_assembly, stats_file
def get_uniform_alignments(alignments):
    """
    Leaves top alignments for each position within contig
    assuming uniform coverage distribution
    """
    if not alignments:
        return []

    WINDOW = 100
    MIN_COV = 20
    GOOD_RATE = 0.66
    MIN_QV = 20

    def is_reliable(aln):
        return not aln.is_secondary and not aln.is_supplementary and aln.map_qv >= MIN_QV

    seq_len = alignments[0].trg_len
    ctg_id = alignments[0].trg_id

    #split contig into windows, get median read coverage over all windows and
    #determine the quality threshold cutoffs for each window
    wnd_primary_cov = [0 for _ in range(seq_len // WINDOW + 1)]
    wnd_all_cov = [0 for _ in range(seq_len // WINDOW + 1)]

    for aln in alignments:
        for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW + 1):
            if is_reliable(aln):
                wnd_primary_cov[i] += 1
            wnd_all_cov[i] += 1

    cov_threshold = max(int(get_median(wnd_primary_cov)), MIN_COV)
    orig_primary_cov = copy(wnd_primary_cov)

    selected_alignments = []
    original_sequence = 0
    primary_sequence = 0
    secondary_sequence = 0
    primary_aln = 0
    secondary_aln = 0

    def _aln_score(aln):
        wnd_good = 0
        wnd_bad = 0
        for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW + 1):
            if wnd_primary_cov[i] < cov_threshold:
                wnd_good += 1
            else:
                wnd_bad += 1
        return wnd_good, wnd_bad

    sec_aln_scores = {}
    for aln in alignments:
        original_sequence += aln.trg_end - aln.trg_start

        #always keep primary alignments, regardless of local coverage
        if is_reliable(aln):
            primary_sequence += aln.trg_end - aln.trg_start
            primary_aln += 1
            selected_alignments.append(aln)

        #if alignment is secondary, count how many windows it helps to improve
        else:
            wnd_good, wnd_bad = _aln_score(aln)
            sec_aln_scores[aln.qry_id] = (wnd_good, wnd_bad, aln)

    #now, greedily add secondary alignments, until they add useful coverage
    _score_fun = lambda x: (sec_aln_scores[x][0] - 2 * sec_aln_scores[x][1],
                            sec_aln_scores[x][2].trg_end - sec_aln_scores[x][2].trg_start)
    sorted_sec_aln = [x for x in sorted(sec_aln_scores, reverse=True, key=_score_fun)]
    for aln_id in sorted_sec_aln:
        aln = sec_aln_scores[aln_id][2]

        #recompute scores
        wnd_good, wnd_bad = _aln_score(aln)
        to_take = wnd_good / (wnd_good + wnd_bad) > GOOD_RATE
        if to_take:
            selected_alignments.append(aln)
            secondary_aln += 1
            secondary_sequence += aln.trg_end - aln.trg_start
            for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW + 1):
                wnd_primary_cov[i] += 1

        #logger.debug("\tSec score: {} {} {} {}".format(aln_id, wnd_good, wnd_bad, to_take))

    #logger.debug("Seq: {0} pri_cov: {1} all_cov: {2}".format(ctg_id, get_median(orig_primary_cov),
    #                                                         get_median(wnd_all_cov)) + "\n" +
    #             "\tOriginal seq: {0}, reads: {1}".format(original_sequence, len(alignments)) + "\n" +
    #             "\tPrimary seq: {0}, reads: {1}".format(primary_sequence, primary_aln) + "\n" +
    #             "\tSecondary seq: {0}, reads: {1}".format(secondary_sequence, secondary_aln) + "\n" +
    #             "\tSelected size: {0}, median coverage: {1}".format(len(selected_alignments),
    #                                                                 get_median(wnd_primary_cov)))

    median_cov = get_median(wnd_primary_cov)
    #mean_cov = sum(wnd_primary_cov) / (len(wnd_primary_cov) + 1)
    return selected_alignments, median_cov
def _parse_cigar(self, cigar_str, read_str, ctg_str, ctg_pos):
    #ctg_str = self.ref_fasta[ctg_name]
    trg_seq = []
    qry_seq = []
    trg_start = ctg_pos - 1
    trg_pos = ctg_pos - 1
    qry_start = 0
    qry_pos = 0

    left_hard = True
    left_soft = True
    hard_clipped_left = 0
    hard_clipped_right = 0
    soft_clipped_left = 0
    soft_clipped_right = 0
    for token in self.cigar_parser.findall(cigar_str):
        size, op = int(token[:-1]), token[-1:]
        if op == b"H":
            if left_hard:
                qry_start += size
                hard_clipped_left += size
            else:
                hard_clipped_right += size
        elif op == b"S":
            qry_pos += size
            if left_soft:
                soft_clipped_left += size
            else:
                soft_clipped_right += size
        elif op == b"M":
            qry_seq.append(read_str[qry_pos : qry_pos + size].upper())
            trg_seq.append(ctg_str[trg_pos : trg_pos + size].upper())
            qry_pos += size
            trg_pos += size
        elif op == b"I":
            qry_seq.append(read_str[qry_pos : qry_pos + size].upper())
            trg_seq.append(b"-" * size)
            qry_pos += size
        elif op == b"D":
            qry_seq.append(b"-" * size)
            trg_seq.append(ctg_str[trg_pos : trg_pos + size].upper())
            trg_pos += size
        else:
            raise AlignmentException("Unsupported CIGAR operation: " + str(op))

        left_hard = False
        if op != b"H":
            left_soft = False

    trg_seq = b"".join(trg_seq)
    qry_seq = b"".join(qry_seq)
    matches = 0
    for i in range(len(trg_seq)):
        if trg_seq[i] == qry_seq[i]:
            matches += 1
    err_rate = 1 - matches / len(trg_seq)

    trg_end = trg_pos
    qry_end = qry_pos + hard_clipped_left
    qry_len = qry_end + hard_clipped_right
    qry_start += soft_clipped_left
    qry_end -= soft_clipped_right

    return (trg_start, trg_end, len(ctg_str), trg_seq,
            qry_start, qry_end, qry_len, qry_seq, err_rate)
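#Standalone sketch of the tokenization step above. self.cigar_parser is assumed
#to be a compiled regex over bytes, roughly re.compile(b"[0-9]+[A-Za-z=]"), so
#each token is a run length followed by a one-byte operation code.
def _example_tokenize_cigar(cigar=b"5S10M2I3D20M"):
    import re
    parser = re.compile(b"[0-9]+[A-Za-z=]")
    return [(int(t[:-1]), t[-1:]) for t in parser.findall(cigar)]
    #[(5, b"S"), (10, b"M"), (2, b"I"), (3, b"D"), (20, b"M")]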
def find_divergence(alignment_path, contigs_path, contigs_info,
                    frequency_path, positions_path, div_sum_path,
                    min_aln_rate, platform, num_proc,
                    sub_thresh, del_thresh, ins_thresh):
    """
    Main function: takes in an alignment and finds the divergent positions
    """
    if not os.path.isfile(alignment_path) or not os.path.isfile(contigs_path):
        ctg_profile = []
        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join(["Total_positions_{0}_".format(len(positions["total"])),
                                "with_thresholds_sub_{0}".format(sub_thresh),
                                "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)])
        sub_header = "".join(["Sub_positions_{0}_".format(len(positions["sub"])),
                              "with_threshold_sub_{0}".format(sub_thresh)])
        del_header = "".join(["Del_positions_{0}_".format(len(positions["del"])),
                              "with_threshold_del_{0}".format(del_thresh)])
        ins_header = "".join(["Ins_positions_{0}_".format(len(positions["ins"])),
                              "with_threshold_ins_{0}".format(ins_thresh)])
        _write_positions(positions_path, positions, total_header,
                         sub_header, del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)
        return

    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       config.vals["max_read_coverage"])
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in range(num_proc):
        threads.append(multiprocessing.Process(target=_thread_worker,
                                               args=(aln_reader, contigs_info, platform,
                                                     results_queue, error_queue)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()

    if not error_queue.empty():
        raise error_queue.get()

    total_aln_errors = []
    while not results_queue.empty():
        _, ctg_profile, aln_errors = results_queue.get()

        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join(["Total_positions_{0}_".format(len(positions["total"])),
                                "with_thresholds_sub_{0}".format(sub_thresh),
                                "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)])
        sub_header = "".join(["Sub_positions_{0}_".format(len(positions["sub"])),
                              "with_threshold_sub_{0}".format(sub_thresh)])
        del_header = "".join(["Del_positions_{0}_".format(len(positions["del"])),
                              "with_threshold_del_{0}".format(del_thresh)])
        ins_header = "".join(["Ins_positions_{0}_".format(len(positions["ins"])),
                              "with_threshold_ins_{0}".format(ins_thresh)])
        _write_positions(positions_path, positions, total_header,
                         sub_header, del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)

        logger.debug("Total positions: %d", len(positions["total"]))
        total_aln_errors.extend(aln_errors)

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.debug("Alignment error rate: %f", mean_aln_error)
def make_bubbles(alignment_path, contigs_info, contigs_path,
                 err_mode, num_proc, bubbles_out):
    """
    The main function: takes an alignment and returns bubbles
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    bubbles_out_lock = multiprocessing.Lock()
    bubbles_out_handle = open(bubbles_out, "w")
    for _ in range(num_proc):
        threads.append(multiprocessing.Process(target=_thread_worker,
                                               args=(aln_reader, contigs_info, err_mode,
                                                     results_queue, error_queue,
                                                     bubbles_out_handle, bubbles_out_lock)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
            if t.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if t.exitcode != 0:
                raise Exception("One of the processes exited with code: {0}"
                                .format(t.exitcode))
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()
        raise

    if not error_queue.empty():
        raise error_queue.get()
    aln_reader.close()

    total_bubbles = 0
    total_long_bubbles = 0
    total_long_branches = 0
    total_empty = 0
    total_aln_errors = []
    coverage_stats = {}

    while not results_queue.empty():
        (ctg_id, num_bubbles, num_long_bubbles,
         num_empty, num_long_branch,
         aln_errors, mean_coverage) = results_queue.get()
        total_long_bubbles += num_long_bubbles
        total_long_branches += num_long_branch
        total_empty += num_empty
        total_aln_errors.extend(aln_errors)
        total_bubbles += num_bubbles
        coverage_stats[ctg_id] = mean_coverage

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)

    logger.debug("Generated %d bubbles", total_bubbles)
    logger.debug("Split %d long bubbles", total_long_bubbles)
    logger.debug("Skipped %d empty bubbles", total_empty)
    logger.debug("Skipped %d bubbles with long branches", total_long_branches)

    return coverage_stats, mean_aln_error
def _compute_profile(alignment, ref_sequence):
    """
    Computes alignment profile
    """
    if len(alignment) == 0:
        raise Exception("No alignments!")

    genome_len = alignment[0].trg_len

    #max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
    min_aln_len = cfg.vals["min_polish_aln_len"]
    aln_errors = []
    #filtered = 0
    profile = [ProfileInfo() for _ in range(genome_len)]

    for i in range(genome_len):
        profile[i].nucl = ref_sequence[i]

    for aln in alignment:
        #if aln.err_rate > max_aln_err or len(aln.qry_seq) < min_aln_len:
        if len(aln.qry_seq) < min_aln_len:
            #filtered += 1
            continue

        aln_errors.append(aln.err_rate)

        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            if trg_nuc == "-":
                trg_pos -= 1
            #if trg_pos >= genome_len:
            #    trg_pos -= genome_len

            prof_elem = profile[trg_pos]
            if trg_nuc == "-":
                prof_elem.insertions[aln.qry_id] += qry_nuc
                #prof_elem.num_inserts += 1
            else:
                #prof_elem.nucl = trg_nuc
                prof_elem.coverage += 1

                if qry_nuc == "-":
                    prof_elem.num_deletions += 1
                elif trg_nuc != qry_nuc:
                    prof_elem.num_missmatch += 1

            trg_pos += 1

    for i in range(genome_len):
        for ins_read, ins_str in profile[i].insertions.items():
            profile[i].propagated_ins += 1
            span = len(ins_str)
            for j in range(max(0, i - span), i):
                profile[j].propagated_ins += 1
            for j in range(i + 1, min(i + span + 1, genome_len)):
                profile[j].propagated_ins += 1

    #logger.debug("Filtered: {0} out of {1}".format(filtered, len(alignment)))

    return profile, aln_errors
def extract_unique_plasmids(trimmed_reads_mapping, trimmed_reads_path,
                            mapping_rate_threshold=0.8,
                            max_length_difference=500,
                            min_sequence_length=1000):
    trimmed_reads = set()
    for hit in read_paf(trimmed_reads_mapping):
        trimmed_reads.add(hit.query)
        trimmed_reads.add(hit.target)

    trimmed_reads = list(trimmed_reads)
    n_trimmed_reads = len(trimmed_reads)
    read2int = dict()
    int2read = dict()

    for i in range(n_trimmed_reads):
        read2int[trimmed_reads[i]] = i
        int2read[i] = trimmed_reads[i]

    similarity_graph = [[] for _ in range(n_trimmed_reads)]

    #each hit group stores alignments for each (query, target) pair
    for hit_group in read_paf_grouped(trimmed_reads_mapping):
        if hit_group[0].query == hit_group[0].target:
            continue

        query_mapping_segments = []
        target_mapping_segments = []
        for hit in hit_group:
            query_mapping_segments.append(
                unmapped.MappingSegment(hit.query_start, hit.query_end))
            target_mapping_segments.append(
                unmapped.MappingSegment(hit.target_start, hit.target_end))

        query_length = hit_group[0].query_length
        target_length = hit_group[0].target_length
        query_mapping_rate = unmapped.calc_mapping_rate(query_length,
                                                        query_mapping_segments)
        target_mapping_rate = unmapped.calc_mapping_rate(target_length,
                                                         target_mapping_segments)

        if (query_mapping_rate > mapping_rate_threshold or
                target_mapping_rate > mapping_rate_threshold):
            #abs(query_length - target_length) < max_length_difference:
            vertex1 = read2int[hit_group[0].query]
            vertex2 = read2int[hit_group[0].target]
            similarity_graph[vertex1].append(vertex2)
            similarity_graph[vertex2].append(vertex1)

    connected_components, n_components = \
        utils.find_connected_components(similarity_graph)

    groups = [[] for _ in range(n_components)]
    for i in range(len(connected_components)):
        groups[connected_components[i]].append(int2read[i])

    #for g in groups:
    #    logger.debug("Group {0}".format(len(g)))
    #    for s in g:
    #        logger.debug("\t{0}".format(seq_lengths[s]))

    groups = [group for group in groups if len(group) > 1]
    trimmed_reads_dict = fp.read_sequence_dict(trimmed_reads_path)
    unique_plasmids = dict()

    for group in groups:
        sequence = trimmed_reads_dict[group[0]]
        if len(sequence) >= min_sequence_length:
            unique_plasmids[group[0]] = sequence

    return unique_plasmids