def __init__(self, adapter1, adapter2, match_probability=RandomMatchProbability(),
             insert_max_rmp=1E-6, adapter_max_rmp=0.001, min_insert_overlap=1,
             max_insert_mismatch_frac=0.2, min_adapter_overlap=1,
             max_adapter_mismatch_frac=0.2, adapter_check_cutoff=9,
             base_probs=dict(p1=0.25, p2=0.75)):
    """
    Store the matching thresholds and build the aligner used for inserts.

    Args:
        adapter1: First adapter sequence; its length is cached.
        adapter2: Second adapter sequence; its length is cached.
        match_probability: Object stored as-is for later probability
            calculations. Note the default instance is created once, when
            the function is defined, and shared by every call.
        insert_max_rmp: Stored threshold for insert matches.
        adapter_max_rmp: Stored threshold for adapter matches.
        min_insert_overlap: Stored, and passed to MultiAligner as its
            third argument.
        max_insert_mismatch_frac: Stored as a float; the value as given
            is passed to MultiAligner as its first argument.
        min_adapter_overlap: Stored threshold for adapter overlap.
        max_adapter_mismatch_frac: Stored as a float.
        adapter_check_cutoff: Stored cutoff for the adapter check.
        base_probs: Stored mapping; the default dict is shared by every
            call that does not supply its own.
    """
    # Adapters and their cached lengths.
    self.adapter1, self.adapter2 = adapter1, adapter2
    self.adapter1_len, self.adapter2_len = len(adapter1), len(adapter2)

    # Mismatch fractions are normalized to float for later arithmetic.
    self.max_insert_mismatch_frac = float(max_insert_mismatch_frac)
    self.max_adapter_mismatch_frac = float(max_adapter_mismatch_frac)

    # Probability model and the thresholds applied to it.
    self.match_probability = match_probability
    self.base_probs = base_probs
    self.insert_max_rmp = insert_max_rmp
    self.adapter_max_rmp = adapter_max_rmp

    # Overlap limits.
    self.min_insert_overlap = min_insert_overlap
    self.min_adapter_overlap = min_adapter_overlap
    self.adapter_check_cutoff = adapter_check_cutoff

    # The aligner receives the caller's original (unconverted) mismatch
    # fraction, together with the alignment flags and the minimum overlap.
    alignment_flags = START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2
    self.aligner = MultiAligner(
        max_insert_mismatch_frac, alignment_flags, min_insert_overlap)
def test_multi_aligner_no_mismatches():
    """With a zero error rate, both error-free hits of the query are reported."""
    from atropos._align import MultiAligner

    aligner = MultiAligner(max_error_rate=0, min_overlap=3)
    hits = aligner.locate('AGAGATCAGATGACAGATC', 'GATCA')
    assert len(hits) == 2

    # Largest value of field 4 first, so the full 5-base hit precedes the
    # 4-base hit at the end of the reference.
    hits.sort(key=lambda hit: hit[4], reverse=True)

    # Expected values of fields 0..5 for each hit, in sorted order.
    expected_hits = (
        (3, 8, 0, 5, 5, 0),
        (15, 19, 0, 4, 4, 0),
    )
    for hit, expected in zip(hits, expected_hits):
        for field, value in enumerate(expected):
            assert hit[field] == value
def test_multi_aligner_with_mismatches():
    """With a 10% error rate, an exact hit and a one-mismatch hit are reported."""
    from atropos._align import MultiAligner

    aligner = MultiAligner(max_error_rate=0.1, min_overlap=10)
    hits = aligner.locate('GATATCAGATGACAGATCAGAGATCAGAT', 'GAGATCAGATGA')
    assert len(hits) == 2

    # Smallest value of field 5 first, so the error-free hit precedes the
    # hit that contains one mismatch.
    hits.sort(key=lambda hit: hit[5])

    # Expected values of fields 0..5 for each hit, in sorted order.
    expected_hits = (
        (19, 29, 0, 10, 10, 0),
        (0, 12, 0, 12, 11, 1),
    )
    for hit, expected in zip(hits, expected_hits):
        for field, value in enumerate(expected):
            assert hit[field] == value
def __init__(self, adapter1, adapter2, match_probability=RandomMatchProbability(),
             insert_max_rmp=1E-6, adapter_max_rmp=0.001, min_insert_overlap=1,
             max_insert_mismatch_frac=0.2, min_adapter_overlap=1,
             max_adapter_mismatch_frac=0.2, adapter_check_cutoff=9,
             base_probs=dict(p1=0.25, p2=0.75)):
    """
    Record the adapters and thresholds, then create the insert aligner.

    Args:
        adapter1: First adapter sequence (length cached as adapter1_len).
        adapter2: Second adapter sequence (length cached as adapter2_len).
        match_probability: Stored unchanged. The default instance is built
            once at definition time and shared between calls.
        insert_max_rmp: Stored unchanged.
        adapter_max_rmp: Stored unchanged.
        min_insert_overlap: Stored unchanged and also given to
            MultiAligner as its third argument.
        max_insert_mismatch_frac: Stored after conversion to float; the
            unconverted value is given to MultiAligner as its first
            argument.
        min_adapter_overlap: Stored unchanged.
        max_adapter_mismatch_frac: Stored after conversion to float.
        adapter_check_cutoff: Stored unchanged.
        base_probs: Stored unchanged; the default dict is shared between
            calls that rely on the default.
    """
    self.adapter1 = adapter1
    self.adapter2 = adapter2
    self.adapter1_len = len(self.adapter1)
    self.adapter2_len = len(self.adapter2)

    insert_frac = float(max_insert_mismatch_frac)
    adapter_frac = float(max_adapter_mismatch_frac)
    self.max_insert_mismatch_frac = insert_frac
    self.max_adapter_mismatch_frac = adapter_frac

    self.match_probability = match_probability
    self.insert_max_rmp = insert_max_rmp
    self.adapter_max_rmp = adapter_max_rmp
    self.min_insert_overlap = min_insert_overlap
    self.min_adapter_overlap = min_adapter_overlap
    self.adapter_check_cutoff = adapter_check_cutoff
    self.base_probs = base_probs

    # Note: the aligner is configured with the argument exactly as it was
    # passed in, not with the float-converted copy stored above.
    self.aligner = MultiAligner(
        max_insert_mismatch_frac,
        START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2,
        min_insert_overlap)
class InsertAligner(object):
    """
    Implementation of an insert matching algorithm.

    The reverse complement of read 2 is aligned against read 1 to find
    candidate insert overlaps. For each candidate, the bases that follow the
    insert in each read are compared against the corresponding adapter.

    This only works with paired-end reads with 3' adapters.
    """

    def __init__(self, adapter1, adapter2, match_probability=RandomMatchProbability(),
                 insert_max_rmp=1E-6, adapter_max_rmp=0.001, min_insert_overlap=1,
                 max_insert_mismatch_frac=0.2, min_adapter_overlap=1,
                 max_adapter_mismatch_frac=0.2, adapter_check_cutoff=9,
                 base_probs=dict(p1=0.25, p2=0.75)):
        """
        Args:
            adapter1: Adapter compared against the tail of read 1.
            adapter2: Adapter compared against the tail of read 2.
            match_probability: Callable ``(matches, size, **kwargs)`` whose
                result is compared against the ``*_max_rmp`` thresholds.
                The default instance is created once, at definition time.
            insert_max_rmp: Insert matches whose probability exceeds this
                value are discarded.
            adapter_max_rmp: Upper bound on the product of the two adapter
                probabilities (only enforced above ``adapter_check_cutoff``).
            min_insert_overlap: Minimum overlap given to the aligner.
            max_insert_mismatch_frac: Error rate given to the aligner.
            min_adapter_overlap: Minimum overhang needed before adapters
                are checked at all.
            max_adapter_mismatch_frac: Fraction of the compared adapter
                length that may mismatch.
            adapter_check_cutoff: The adapter probability test is applied
                only when the compared adapter length exceeds this value.
            base_probs: Keyword arguments forwarded to
                ``match_probability`` when scoring insert matches.
        """
        self.adapter1 = adapter1
        self.adapter1_len = len(adapter1)
        self.adapter2 = adapter2
        self.adapter2_len = len(adapter2)
        self.match_probability = match_probability
        self.insert_max_rmp = insert_max_rmp
        self.adapter_max_rmp = adapter_max_rmp
        self.min_insert_overlap = min_insert_overlap
        self.max_insert_mismatch_frac = float(max_insert_mismatch_frac)
        self.min_adapter_overlap = min_adapter_overlap
        self.max_adapter_mismatch_frac = float(max_adapter_mismatch_frac)
        self.adapter_check_cutoff = adapter_check_cutoff
        self.base_probs = base_probs
        self.aligner = MultiAligner(
            max_insert_mismatch_frac,
            START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2,
            min_insert_overlap)

    def match_insert(self, seq1, seq2):
        """
        Use cutadapt aligner for insert and adapter matching.

        Args:
            seq1: Sequence of read 1.
            seq2: Sequence of read 2.

        Returns:
            ``None`` when no acceptable insert/adapter match exists.
            Otherwise a 3-tuple ``(insert_match, adapter_match1,
            adapter_match2)``. The two adapter matches are ``None`` when
            the overhang is shorter than ``min_adapter_overlap``, meaning
            the insert was found but no adapter trimming is required.
        """
        l1 = len(seq1)
        l2 = len(seq2)
        seq_len = min(l1, l2)
        # Trim the longer read so both have the same length. Each read must
        # be truncated from itself; replacing read 2 with a slice of read 1
        # would align read 1 against its own reverse complement.
        if l1 > l2:
            seq1 = seq1[:l2]
        elif l2 > l1:
            seq2 = seq2[:l1]

        seq2_rc = reverse_complement(seq2)

        def _check_adapters(insert_match, offset, insert_match_size):
            # Validate one candidate insert match against both adapters.
            if offset < self.min_adapter_overlap:
                # The reads are mostly overlapping, to the point where
                # there's not enough overhang to do a confident adapter
                # match. We return just the insert match to signal that
                # error correction can be done even though no adapter
                # trimming is required.
                return (insert_match, None, None)

            # TODO: this is very sensitive to the exact correct choice of
            # adapter. For example, if you specify GATCGGAA... and the
            # correct adapter is AGATCGGAA..., the prefixes will not match
            # exactly and the alignment will fail. We need to use a
            # comparison that is a bit more forgiving.
            a1_match = compare_prefixes(seq1[insert_match_size:], self.adapter1)
            a2_match = compare_prefixes(seq2[insert_match_size:], self.adapter2)
            adapter_len = min(offset, self.adapter1_len, self.adapter2_len)
            max_adapter_mismatches = round(
                adapter_len * self.max_adapter_mismatch_frac)
            # Reject only when BOTH adapters exceed the mismatch budget.
            if (a1_match[5] > max_adapter_mismatches and
                    a2_match[5] > max_adapter_mismatches):
                return None

            a1_prob = self.match_probability(a1_match[4], adapter_len)
            a2_prob = self.match_probability(a2_match[4], adapter_len)
            if (adapter_len > self.adapter_check_cutoff and
                    (a1_prob * a2_prob) > self.adapter_max_rmp):
                return None

            # Adapter lengths are measured against the original read lengths.
            adapter_len1 = min(self.adapter1_len, l1 - insert_match_size)
            adapter_len2 = min(self.adapter2_len, l2 - insert_match_size)
            # Report the counts of whichever adapter has the lower probability.
            best_adapter_matches, best_adapter_mismatches = (
                a1_match if a1_prob < a2_prob else a2_match)[4:6]

            return (
                insert_match,
                Match(0, adapter_len1, insert_match_size, l1,
                      best_adapter_matches, best_adapter_mismatches),
                Match(0, adapter_len2, insert_match_size, l2,
                      best_adapter_matches, best_adapter_mismatches)
            )

        # Use an aligner that returns all matches that satisfy the overlap
        # and error rate thresholds.
        insert_matches = self.aligner.locate(seq2_rc, seq1)
        if not insert_matches:
            return None

        # Filter by random-match probability.
        candidates = []
        for insert_match in insert_matches:
            offset = min(insert_match[0], seq_len - insert_match[3])
            insert_match_size = seq_len - offset
            prob = self.match_probability(
                insert_match[4], insert_match_size, **self.base_probs)
            if prob <= self.insert_max_rmp:
                candidates.append((prob, insert_match, offset, insert_match_size))

        # Test matches in order of random-match probability and return the
        # first one that also passes the adapter check (if any).
        # TODO: compare against sorting by length (which is how SeqPurge
        # essentially does it).
        candidates.sort(key=lambda candidate: candidate[0])
        for _, insert_match, offset, insert_match_size in candidates:
            result = _check_adapters(insert_match, offset, insert_match_size)
            if result:
                return result

        return None
class InsertAligner(object):
    """
    Implementation of an insert matching algorithm.

    The reverse complement of read 2 is aligned against read 1 to find
    candidate insert overlaps. For each candidate, the bases that follow the
    insert in each read are compared against the corresponding adapter.

    This only works with paired-end reads with 3' adapters.
    """

    def __init__(self, adapter1, adapter2, match_probability=RandomMatchProbability(),
                 insert_max_rmp=1E-6, adapter_max_rmp=0.001, min_insert_overlap=1,
                 max_insert_mismatch_frac=0.2, min_adapter_overlap=1,
                 max_adapter_mismatch_frac=0.2, adapter_check_cutoff=9,
                 base_probs=dict(p1=0.25, p2=0.75)):
        """
        Args:
            adapter1: Adapter compared against the tail of read 1.
            adapter2: Adapter compared against the tail of read 2.
            match_probability: Callable ``(matches, size, **kwargs)`` whose
                result is compared against the ``*_max_rmp`` thresholds.
                The default instance is created once, at definition time.
            insert_max_rmp: Insert matches whose probability exceeds this
                value are discarded.
            adapter_max_rmp: Upper bound on the product of the two adapter
                probabilities (only enforced above ``adapter_check_cutoff``).
            min_insert_overlap: Minimum overlap given to the aligner.
            max_insert_mismatch_frac: Error rate given to the aligner.
            min_adapter_overlap: Minimum overhang needed before adapters
                are checked at all.
            max_adapter_mismatch_frac: Fraction of the compared adapter
                length that may mismatch.
            adapter_check_cutoff: The adapter probability test is applied
                only when the compared adapter length exceeds this value.
            base_probs: Keyword arguments forwarded to
                ``match_probability`` when scoring insert matches.
        """
        self.adapter1 = adapter1
        self.adapter1_len = len(adapter1)
        self.adapter2 = adapter2
        self.adapter2_len = len(adapter2)
        self.match_probability = match_probability
        self.insert_max_rmp = insert_max_rmp
        self.adapter_max_rmp = adapter_max_rmp
        self.min_insert_overlap = min_insert_overlap
        self.max_insert_mismatch_frac = float(max_insert_mismatch_frac)
        self.min_adapter_overlap = min_adapter_overlap
        self.max_adapter_mismatch_frac = float(max_adapter_mismatch_frac)
        self.adapter_check_cutoff = adapter_check_cutoff
        self.base_probs = base_probs
        self.aligner = MultiAligner(
            max_insert_mismatch_frac,
            START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2,
            min_insert_overlap)

    def match_insert(self, seq1, seq2):
        """
        Use cutadapt aligner for insert and adapter matching.

        Args:
            seq1: Sequence of read 1.
            seq2: Sequence of read 2.

        Returns:
            ``None`` when no acceptable insert/adapter match exists.
            Otherwise a 3-tuple ``(insert_match, adapter_match1,
            adapter_match2)``. The two adapter matches are ``None`` when
            the overhang is shorter than ``min_adapter_overlap``, meaning
            the insert was found but no adapter trimming is required.
        """
        l1 = len(seq1)
        l2 = len(seq2)
        seq_len = min(l1, l2)
        # Trim the longer read so both have the same length. Each read must
        # be truncated from itself; replacing read 2 with a slice of read 1
        # would align read 1 against its own reverse complement.
        if l1 > l2:
            seq1 = seq1[:l2]
        elif l2 > l1:
            seq2 = seq2[:l1]

        seq2_rc = reverse_complement(seq2)

        def _check_adapters(insert_match, offset, insert_match_size):
            # Validate one candidate insert match against both adapters.
            if offset < self.min_adapter_overlap:
                # The reads are mostly overlapping, to the point where
                # there's not enough overhang to do a confident adapter
                # match. We return just the insert match to signal that
                # error correction can be done even though no adapter
                # trimming is required.
                return (insert_match, None, None)

            # TODO: this is very sensitive to the exact correct choice of
            # adapter. For example, if you specify GATCGGAA... and the
            # correct adapter is AGATCGGAA..., the prefixes will not match
            # exactly and the alignment will fail. We need to use a
            # comparison that is a bit more forgiving.
            a1_match = compare_prefixes(seq1[insert_match_size:], self.adapter1)
            a2_match = compare_prefixes(seq2[insert_match_size:], self.adapter2)
            adapter_len = min(offset, self.adapter1_len, self.adapter2_len)
            max_adapter_mismatches = round(
                adapter_len * self.max_adapter_mismatch_frac)
            # Reject only when BOTH adapters exceed the mismatch budget.
            if (a1_match[5] > max_adapter_mismatches and
                    a2_match[5] > max_adapter_mismatches):
                return None

            a1_prob = self.match_probability(a1_match[4], adapter_len)
            a2_prob = self.match_probability(a2_match[4], adapter_len)
            if (adapter_len > self.adapter_check_cutoff and
                    (a1_prob * a2_prob) > self.adapter_max_rmp):
                return None

            # Adapter lengths are measured against the original read lengths.
            adapter_len1 = min(self.adapter1_len, l1 - insert_match_size)
            adapter_len2 = min(self.adapter2_len, l2 - insert_match_size)
            # Report the counts of whichever adapter has the lower probability.
            best_adapter_matches, best_adapter_mismatches = (
                a1_match if a1_prob < a2_prob else a2_match)[4:6]

            return (
                insert_match,
                Match(0, adapter_len1, insert_match_size, l1,
                      best_adapter_matches, best_adapter_mismatches),
                Match(0, adapter_len2, insert_match_size, l2,
                      best_adapter_matches, best_adapter_mismatches)
            )

        # Use an aligner that returns all matches that satisfy the overlap
        # and error rate thresholds.
        insert_matches = self.aligner.locate(seq2_rc, seq1)
        if not insert_matches:
            return None

        # Filter by random-match probability.
        candidates = []
        for insert_match in insert_matches:
            offset = min(insert_match[0], seq_len - insert_match[3])
            insert_match_size = seq_len - offset
            prob = self.match_probability(
                insert_match[4], insert_match_size, **self.base_probs)
            if prob <= self.insert_max_rmp:
                candidates.append((prob, insert_match, offset, insert_match_size))

        # Test matches in order of random-match probability and return the
        # first one that also passes the adapter check (if any).
        # TODO: compare against sorting by length (which is how SeqPurge
        # essentially does it).
        candidates.sort(key=lambda candidate: candidate[0])
        for _, insert_match, offset, insert_match_size in candidates:
            result = _check_adapters(insert_match, offset, insert_match_size)
            if result:
                return result

        return None