def _is_simple_kmer(profile, position):
    """
    Returns True if the kmer centered at the given profile position is
    "simple": the window around the center contains no single-nucleotide
    runs and no dinucleotide repeats.
    """
    simple_len = cfg.vals["simple_kmer_length"]
    half_window = simple_len  # extended window is simple_len on each side
    nucls = [p.nucl for p in profile[position - half_window :
                                     position + half_window]]

    # Single-nucleotide homopolymers: any two adjacent equal bases
    # within the central simple_len stretch disqualify the kmer.
    lo = half_window - simple_len // 2
    hi = half_window + simple_len // 2 - 1
    for idx in range(lo, hi):
        if nucls[idx] == nucls[idx + 1]:
            return False

    # Dinucleotide repeats, checked at both phase offsets.
    for phase in (0, 1):
        for k in range(simple_len - phase - 1):
            start = phase + 2 * k
            if nucls[start : start + 2] == nucls[start + 2 : start + 4]:
                return False

    # Trinucleotide repeat check was present but left disabled upstream.
    return True
def shift_gaps(seq_trg, seq_qry):
    """
    Normalizes ambiguous gap placement in the query sequence.

    Whenever a run of query gaps could equivalently sit one position over
    (the query base flanking the run equals the target base under the run's
    end), the base and the gap run are swapped, repeatedly, until they no
    longer match. "$" sentinels guard both ends of the alignment.
    """
    trg = list("$" + seq_trg + "$")
    qry = list("$" + seq_qry + "$")

    in_gap = False
    run_start = 0
    for pos in range(len(trg)):
        ch = qry[pos]
        if in_gap and ch != "-":
            # Gap run just ended: bubble matching flanking bases through it.
            in_gap = False
            left = run_start - 1
            right = pos - 1
            while (left > 0 and right >= run_start and
                   qry[left] == trg[right]):
                qry[left], qry[right] = qry[right], qry[left]
                left -= 1
                right -= 1

        if not in_gap and ch == "-":
            in_gap = True
            run_start = pos

    return "".join(qry[1:-1])
def _get_partition(profile, err_mode): """ Partitions genome into sub-alignments at solid regions / simple kmers """ #logger.debug("Partitioning genome") SOLID_LEN = cfg.vals["solid_kmer_length"] SIMPLE_LEN = cfg.vals["simple_kmer_length"] MAX_BUBBLE = cfg.vals["max_bubble_length"] solid_flags = [False for _ in range(len(profile))] prof_pos = 0 while prof_pos < len(profile) - SOLID_LEN: if _is_solid_kmer(profile, prof_pos, err_mode): for i in range(prof_pos, prof_pos + SOLID_LEN): solid_flags[i] = True prof_pos += SOLID_LEN else: prof_pos += 1 partition = [] prev_partition = SOLID_LEN long_bubbles = 0 prof_pos = SOLID_LEN while prof_pos < len(profile) - SOLID_LEN: cur_partition = prof_pos + SIMPLE_LEN // 2 landmark = (all(solid_flags[prof_pos : prof_pos + SIMPLE_LEN]) and _is_simple_kmer(profile, cur_partition)) if prof_pos - prev_partition > MAX_BUBBLE: long_bubbles += 1 if landmark or prof_pos - prev_partition > MAX_BUBBLE: partition.append(cur_partition) prev_partition = cur_partition prof_pos += SOLID_LEN else: prof_pos += 1 #logger.debug("Partitioned into {0} segments".format(len(partition) + 1)) #logger.debug("Long bubbles: {0}".format(long_bubbles)) return partition, long_bubbles
def get_consensus(alignment_path, contigs_path, contigs_info, num_proc, platform):
    """
    Main function: computes consensus sequences for all contigs by
    distributing per-contig work over num_proc worker processes.

    Returns a dict of contig id -> consensus sequence (contigs that
    produced an empty sequence are omitted).
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       max_coverage=cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT:
    #ignore SIGINT while forking so children inherit the ignore handler,
    #then restore the original handler in the parent only
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in range(num_proc):
        threads.append(multiprocessing.Process(target=_thread_worker,
                                               args=(aln_reader, contigs_info,
                                                     platform, results_queue,
                                                     error_queue)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
            #-9 means the child was SIGKILLed, typically by the OOM killer
            if t.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if t.exitcode != 0:
                raise Exception("One of the processes exited with code: {0}"
                                .format(t.exitcode))
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()
        raise

    #workers push exceptions here instead of raising in the child
    if not error_queue.empty():
        raise error_queue.get()

    out_fasta = {}
    total_aln_errors = []
    while not results_queue.empty():
        ctg_id, ctg_seq, aln_errors = results_queue.get()
        total_aln_errors.extend(aln_errors)
        if len(ctg_seq) > 0:
            out_fasta[ctg_id] = ctg_seq

    #+1 in the denominator avoids division by zero when no alignments passed
    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.info("Alignment error rate: %f", mean_aln_error)

    return out_fasta
def get_uniform_alignments(alignments, seq_len):
    """
    Keeps only the top-quality alignments over each contig window,
    assuming read coverage should be roughly uniform along the contig.

    An alignment survives if its error rate passes the per-window quality
    threshold in more than half of the windows it spans.
    """
    WINDOW = 100
    MIN_COV = 10
    COV_RATE = 1.25

    def _get_median(lst):
        #plain median of a numeric sequence; raises on empty input
        if not lst:
            raise ValueError("_get_median() arg is an empty sequence")
        ordered = sorted(lst)
        mid = len(ordered) // 2
        if len(ordered) % 2:
            return ordered[mid]
        return (ordered[mid - 1] + ordered[mid]) / 2

    #split the contig into fixed windows and collect, per window,
    #the primary-alignment coverage and all alignment error rates
    num_wnd = seq_len // WINDOW + 1
    wnd_primary_cov = [0] * num_wnd
    wnd_aln_quality = [[] for _ in range(num_wnd)]
    wnd_qual_thresholds = [1.0] * num_wnd

    for aln in alignments:
        for wnd in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW):
            if not aln.is_secondary:
                wnd_primary_cov[wnd] += 1
            wnd_aln_quality[wnd].append(aln.err_rate)

    #per window, keep roughly cov_threshold best alignments, where the
    #threshold derives from the median primary coverage over all windows
    cov_threshold = max(int(COV_RATE * _get_median(wnd_primary_cov)), MIN_COV)
    for wnd, qualities in enumerate(wnd_aln_quality):
        if len(qualities) > cov_threshold:
            wnd_qual_thresholds[wnd] = sorted(qualities)[cov_threshold]

    #count, for each alignment, in how many of its windows it passes
    filtered_alignments = []
    total_sequence = 0
    filtered_sequence = 0
    for aln in alignments:
        total_sequence += aln.trg_end - aln.trg_start
        first_wnd = aln.trg_start // WINDOW
        last_wnd = aln.trg_end // WINDOW
        good_windows = 0
        for wnd in range(first_wnd, last_wnd):
            if aln.err_rate <= wnd_qual_thresholds[wnd]:
                good_windows += 1

        if good_windows > (last_wnd - first_wnd) // 2:
            filtered_alignments.append(aln)
            filtered_sequence += aln.trg_end - aln.trg_start

    return filtered_alignments
def split_into_chunks(fasta_in, chunk_size):
    """
    Splits each input sequence into chunks of at least chunk_size bp.

    Every chunk except the last is exactly chunk_size long; the last chunk
    absorbs the remainder, so it is between chunk_size and
    2 * chunk_size - 1 bp. Sequences shorter than chunk_size yield a
    single chunk with the whole sequence.

    Args:
        fasta_in: dict of header -> sequence string.
        chunk_size: target chunk length in bp (must be positive).

    Returns:
        dict mapping "<header>$chunk_<i>" -> sequence chunk.
    """
    out_dict = {}
    #py3-native .items() instead of the py2-compat iteritems() helper
    for header, seq in fasta_in.items():
        num_chunks = max(len(seq) // chunk_size, 1)
        for i in range(num_chunks):
            chunk_hdr = "{0}$chunk_{1}".format(header, i)
            start = i * chunk_size
            end = (i + 1) * chunk_size
            #the final chunk extends to the end of the sequence so that
            #no chunk shorter than chunk_size is ever emitted
            if len(seq) - end < chunk_size:
                end = len(seq)
            out_dict[chunk_hdr] = seq[start:end]
    return out_dict
def _is_solid_kmer(profile, position, err_mode):
    """
    Returns True if the kmer starting at the given position is "solid":
    every position in it is covered and has locally low
    mismatch+deletion and insertion rates.
    """
    err_cfg = cfg.vals["err_modes"][err_mode]
    max_mismatch = err_cfg["solid_missmatch"]
    max_insert = err_cfg["solid_indel"]
    kmer_len = cfg.vals["solid_kmer_length"]

    for i in range(position, position + kmer_len):
        elem = profile[i]
        # Uncovered positions can never be solid (and would divide by zero).
        if elem.coverage == 0:
            return False

        mismatch_rate = (elem.num_missmatch +
                         elem.num_deletions) / elem.coverage
        insert_rate = elem.num_inserts / elem.coverage
        if mismatch_rate > max_mismatch or insert_rate > max_insert:
            return False

    return True
def _contig_profile(alignment, platform, genome_len): """ Computes alignment profile """ #leave the best uniform alignments alignment = get_uniform_alignments(alignment, genome_len) aln_errors = [] profile = [Profile() for _ in range(genome_len)] #max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"] for aln in alignment: #if aln.err_rate > max_aln_err: continue aln_errors.append(aln.err_rate) #after gap shifting it is possible that #two gaps are aligned against each other qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq) trg_seq = shift_gaps(qry_seq, aln.trg_seq) trg_pos = aln.trg_start for trg_nuc, qry_nuc in zip(trg_seq, qry_seq): if trg_nuc == "-": trg_pos -= 1 if trg_pos >= genome_len: trg_pos -= genome_len #total += 1 prof_elem = profile[trg_pos] if trg_nuc == "-" and qry_nuc != "-": prof_elem.insertions[aln.qry_id] += qry_nuc else: prof_elem.nucl = trg_nuc prof_elem.matches[qry_nuc] += 1 trg_pos += 1 #print "len", genome_len, "median coverage", cov_threshold #print "total bases: ", total, "discarded bases: ", discarded #print "filtered", float(discarded) / total #print "" return profile, aln_errors
def _compute_profile(alignment, platform, genome_len):
    """
    Computes alignment profile: one ProfileInfo element per genome
    position with coverage, mismatch, insertion, and deletion counters.

    Alignments with too high an error rate or too short a query (per the
    platform's error-mode settings) are skipped.

    Returns (profile, aln_errors) where aln_errors lists the error rate
    of every alignment that contributed to the profile.
    """
    max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
    min_aln_len = cfg.vals["min_polish_aln_len"]
    aln_errors = []
    #filtered = 0
    profile = [ProfileInfo() for _ in range(genome_len)]
    for aln in alignment:
        if aln.err_rate > max_aln_err or len(aln.qry_seq) < min_aln_len:
            #filtered += 1
            continue

        aln_errors.append(aln.err_rate)

        #normalize ambiguous gap placement on both sequences; after gap
        #shifting two gaps may end up aligned against each other
        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            #a target gap (insertion) does not advance the target position:
            #step back so the += 1 at the bottom nets to zero
            if trg_nuc == "-":
                trg_pos -= 1
            #positions past the end wrap to the start
            #(presumably for circular sequences — alignments may overhang)
            if trg_pos >= genome_len:
                trg_pos -= genome_len

            prof_elem = profile[trg_pos]
            if trg_nuc == "-":
                prof_elem.num_inserts += 1
            else:
                prof_elem.nucl = trg_nuc
                prof_elem.coverage += 1

                if qry_nuc == "-":
                    prof_elem.num_deletions += 1
                elif trg_nuc != qry_nuc:
                    prof_elem.num_missmatch += 1

            trg_pos += 1

    #logger.debug("Filtered: {0} out of {1}".format(filtered, len(alignment)))
    return profile, aln_errors
def make_bubbles(alignment_path, contigs_info, contigs_path,
                 err_mode, num_proc, bubbles_out):
    """
    The main function: takes an alignment and returns bubbles.

    Workers write bubbles directly to bubbles_out (serialized through a
    shared lock) and report per-contig statistics via a results queue.

    Returns:
        (coverage_stats, mean_aln_error): dict of contig id -> mean
        coverage, and the mean alignment error rate over all contigs.
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT:
    #ignore SIGINT while forking so children inherit the ignore handler,
    #then restore the original handler in the parent only
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    bubbles_out_lock = multiprocessing.Lock()
    bubbles_out_handle = open(bubbles_out, "w")
    for _ in range(num_proc):
        threads.append(multiprocessing.Process(target=_thread_worker,
                                               args=(aln_reader, contigs_info,
                                                     err_mode, results_queue,
                                                     error_queue,
                                                     bubbles_out_handle,
                                                     bubbles_out_lock)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
            #-9 means the child was SIGKILLed, typically by the OOM killer
            if t.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if t.exitcode != 0:
                raise Exception("One of the processes exited with code: {0}"
                                .format(t.exitcode))
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()
        raise

    #workers push exceptions here instead of raising in the child
    if not error_queue.empty():
        raise error_queue.get()

    #aggregate per-contig counters reported by the workers
    total_bubbles = 0
    total_long_bubbles = 0
    total_long_branches = 0
    total_empty = 0
    total_aln_errors = []
    coverage_stats = {}
    while not results_queue.empty():
        (ctg_id, num_bubbles, num_long_bubbles,
         num_empty, num_long_branch,
         aln_errors, mean_coverage) = results_queue.get()
        total_long_bubbles += num_long_bubbles
        total_long_branches += num_long_branch
        total_empty += num_empty
        total_aln_errors.extend(aln_errors)
        total_bubbles += num_bubbles
        coverage_stats[ctg_id] = mean_coverage

    #+1 in the denominator avoids division by zero when no alignments passed
    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.debug("Generated %d bubbles", total_bubbles)
    logger.debug("Split %d long bubbles", total_long_bubbles)
    logger.debug("Skipped %d empty bubbles", total_empty)
    logger.debug("Skipped %d bubbles with long branches", total_long_branches)

    return coverage_stats, mean_aln_error
def get_chunk(self):
    """
    Reads and returns all alignment records for the next contig.

    Alignment file is expected to be sorted! Under self.lock, scans from
    the shared file position until the contig name changes (or EOF),
    then parses the buffered records into Alignment tuples outside the
    lock. Returns (contig_id, alignments), or (None, []) at EOF.
    """
    chunk_buffer = []
    parsed_contig = None

    with self.lock:
        self.aln_file.seek(self.position.value)
        if self.eof.value:
            return None, []

        current_contig = None
        while True:
            #remember where this record started so the next reader
            #can resume from the first line of the following contig
            self.position.value = self.aln_file.tell()
            line = self.aln_file.readline()
            if not line:
                break
            if _is_sam_header(line):
                continue

            tokens = line.strip().split()
            if len(tokens) < 11:
                continue
                #raise AlignmentException("Error reading SAM file")

            read_contig = tokens[2]
            flags = int(tokens[1])
            is_unmapped = flags & 0x4
            is_secondary = flags & 0x100
            is_supplementary = flags & 0x800    #allow supplementary

            #if is_unmapped or is_secondary: continue
            if is_unmapped:
                continue
            if is_secondary and not self.use_secondary:
                continue

            if read_contig in self.processed_contigs:
                raise AlignmentException("Alignment file is not sorted")

            if read_contig != current_contig:
                prev_contig = current_contig
                current_contig = read_contig

                if prev_contig is not None:
                    #contig changed: finish the chunk without consuming
                    #this record (position.value still points at it)
                    self.processed_contigs.add(prev_contig)
                    parsed_contig = prev_contig
                    break
                else:
                    chunk_buffer = [tokens]
            else:
                chunk_buffer.append(tokens)

        if not parsed_contig:
            self.eof.value = True
            parsed_contig = current_contig
    #end with

    sequence_length = 0
    alignments = []
    for tokens in chunk_buffer:
        read_id = tokens[0]
        read_contig = tokens[2]
        cigar_str = tokens[5]
        qry_seq = tokens[10]
        trg_seq = tokens[9]
        ctg_pos = int(tokens[3])
        flags = int(tokens[1])
        #SAM FLAG bit 0x10 = read is reverse-complemented.
        #Fixed from "flags & 0x16", which also matched bits 0x2 (properly
        #paired) and 0x4 (unmapped) and could mislabel the strand.
        is_reversed = flags & 0x10
        is_secondary = flags & 0x100

        if qry_seq == b"*":
            raise Exception("Error parsing SAM: record without read sequence")

        #fields 9/10 are assumed to carry gapped target/query sequences;
        #coordinates are derived by discounting the "-" gap characters
        trg_start = ctg_pos
        qry_start = 0
        qry_end = len(qry_seq) - qry_seq.count(b'-')
        trg_end = trg_start + len(trg_seq) - trg_seq.count(b'-')
        qry_len = len(qry_seq) - qry_seq.count(b'-')
        trg_len = len(self.ref_fasta[read_contig])

        #(trg_start, trg_end, trg_len, trg_seq,
        #qry_start, qry_end, qry_len, qry_seq, err_rate) = \
        #        self.parse_cigar(cigar_str, read_str, read_contig, ctg_pos)

        #OVERHANG = cfg.vals["read_aln_overhang"]
        #if (float(qry_end - qry_start) / qry_len > self.min_aln_rate or
        #        trg_start < OVERHANG or trg_len - trg_end < OVERHANG):
        matches = 0
        for i in range(len(trg_seq)):
            if trg_seq[i] == qry_seq[i]:
                matches += 1
        err_rate = 1 - float(matches) / len(trg_seq)

        aln = Alignment(_STR(read_id), _STR(read_contig),
                        qry_start, qry_end, "-" if is_reversed else "+",
                        qry_len, trg_start, trg_end, "+", trg_len,
                        _STR(qry_seq), _STR(trg_seq), err_rate, is_secondary)
        alignments.append(aln)

        sequence_length += qry_end - qry_start
        #In rare cases minimap2 does not output SQ tag, so need to check
        if _STR(parsed_contig) in self.seq_lengths:
            contig_length = self.seq_lengths[_STR(parsed_contig)]
            #stop early once the coverage cap for this contig is reached
            if sequence_length // contig_length > self.max_coverage:
                break

    if parsed_contig is None:
        return None, []
    return _STR(parsed_contig), alignments
def parse_cigar(self, cigar_str, read_str, ctg_name, ctg_pos):
    """
    Expands a CIGAR string into gapped target/query byte sequences.

    Args:
        cigar_str: CIGAR bytes; only H, S, M, I, D operations supported.
        read_str: read sequence bytes (as stored in the SAM record).
        ctg_name: key into self.ref_fasta for the reference sequence.
        ctg_pos: 1-based alignment start on the reference (SAM POS).

    Returns:
        (trg_start, trg_end, trg_len, trg_seq,
         qry_start, qry_end, qry_len, qry_seq, err_rate)
        with "-" gap characters inserted into both sequences, and
        coordinates adjusted for soft/hard clipping.
    """
    ctg_str = self.ref_fasta[ctg_name]
    trg_seq = []
    qry_seq = []
    trg_start = ctg_pos - 1   #convert SAM 1-based POS to 0-based
    trg_pos = ctg_pos - 1
    qry_start = 0
    qry_pos = 0

    #clips before the first real operation count as "left" clips;
    #everything after is attributed to the right end
    left_hard = True
    left_soft = True
    hard_clipped_left = 0
    hard_clipped_right = 0
    soft_clipped_left = 0
    soft_clipped_right = 0
    for token in self.cigar_parser.findall(cigar_str):
        size, op = int(token[:-1]), token[-1:]
        if op == b"H":
            #hard-clipped bases are absent from read_str entirely
            if left_hard:
                qry_start += size
                hard_clipped_left += size
            else:
                hard_clipped_right += size
        elif op == b"S":
            #soft-clipped bases are present in read_str but unaligned
            qry_pos += size
            if left_soft:
                soft_clipped_left += size
            else:
                soft_clipped_right += size
        elif op == b"M":
            #match/mismatch: advance both sequences
            qry_seq.append(read_str[qry_pos : qry_pos + size].upper())
            trg_seq.append(ctg_str[trg_pos : trg_pos + size].upper())
            qry_pos += size
            trg_pos += size
        elif op == b"I":
            #insertion to the reference: gap in the target
            qry_seq.append(read_str[qry_pos : qry_pos + size].upper())
            trg_seq.append(b"-" * size)
            qry_pos += size
        elif op == b"D":
            #deletion from the reference: gap in the query
            qry_seq.append(b"-" * size)
            trg_seq.append(ctg_str[trg_pos : trg_pos + size].upper())
            trg_pos += size
        else:
            raise AlignmentException("Unsupported CIGAR operation: " + str(op))
        left_hard = False
        if op != b"H":
            left_soft = False

    trg_seq = b"".join(trg_seq)
    qry_seq = b"".join(qry_seq)
    #identity over the gapped alignment columns
    matches = 0
    for i in range(len(trg_seq)):
        if trg_seq[i] == qry_seq[i]:
            matches += 1
    err_rate = 1 - float(matches) / len(trg_seq)

    trg_end = trg_pos
    #query coordinates are expressed in full-read space, including clips
    qry_end = qry_pos + hard_clipped_left
    qry_len = qry_end + hard_clipped_right
    qry_start += soft_clipped_left
    qry_end -= soft_clipped_right

    return (trg_start, trg_end, len(ctg_str), trg_seq,
            qry_start, qry_end, qry_len, qry_seq, err_rate)