def _match(insert_match, offset, insert_match_size, prob):
    """Check the overhang past an insert match against both adapters.

    This function reads ``self``, ``seq1``, ``seq2``, ``l1`` and ``l2``
    from the enclosing scope.
    NOTE(review): ``l1``/``l2`` are presumably the lengths of
    ``seq1``/``seq2`` -- confirm against the enclosing function.

    Args:
        insert_match: The insert match; returned unchanged as the first
            element of the result tuple.
        offset: Compared against ``self.min_adapter_overlap`` and used as
            an upper bound on the adapter length that is evaluated.
        insert_match_size: Index in each read at which the adapter
            comparison starts.
        prob: Not used by this function.

    Returns:
        ``(insert_match, None, None)`` when ``offset`` is below
        ``self.min_adapter_overlap``; ``None`` when the adapter comparison
        is rejected; otherwise ``(insert_match, Match, Match)`` with one
        ``Match`` per read.
    """
    if offset < self.min_adapter_overlap:
        # The reads are mostly overlapping, to the point where there's
        # not enough overhang to do a confident adapter match. Only the
        # insert match is returned, to signal that error correction can
        # be done even though no adapter trimming is required.
        return (insert_match, None, None)

    # TODO: this is very sensitive to the exact correct choice of adapter.
    # For example, if you specify GATCGGAA... and the correct adapter is
    # AGATCGGAA..., the prefixes will not match exactly and the alignment
    # will fail. We need to use a comparison that is a bit more forgiving.
    first_cmp = compare_prefixes(seq1[insert_match_size:], self.adapter1)
    second_cmp = compare_prefixes(seq2[insert_match_size:], self.adapter2)

    overlap = min(offset, self.adapter1_len, self.adapter2_len)
    mismatch_limit = round(overlap * self.max_adapter_mismatch_frac)

    # Reject only when *both* adapters exceed the mismatch limit
    # (index 5 of a comparison result is its error count).
    if first_cmp[5] > mismatch_limit and second_cmp[5] > mismatch_limit:
        return None

    # Index 4 of a comparison result is its match count.
    first_matches = first_cmp[4]
    second_matches = second_cmp[4]
    first_prob = self.match_probability(first_matches, overlap)
    second_prob = self.match_probability(second_matches, overlap)

    # The random-match-probability filter is applied only to overlaps
    # longer than the configured cutoff.
    if (overlap > self.adapter_check_cutoff) and (
            (first_prob * second_prob) > self.adapter_max_rmp):
        return None

    read1_adapter_len = min(self.adapter1_len, l1 - insert_match_size)
    read2_adapter_len = min(self.adapter2_len, l2 - insert_match_size)

    # Both returned Match objects carry the counts of whichever adapter
    # comparison has the lower match probability value.
    if first_prob < second_prob:
        best_matches, best_mismatches = first_matches, first_cmp[5]
    else:
        best_matches, best_mismatches = second_matches, second_cmp[5]

    read1_match = Match(
        0, read1_adapter_len, insert_match_size, l1,
        best_matches, best_mismatches)
    read2_match = Match(
        0, read2_adapter_len, insert_match_size, l2,
        best_matches, best_mismatches)
    return (insert_match, read1_match, read2_match)
def compare_suffixes(s1, s2, wildcard_ref=False, wildcard_query=False):
    """
    Find out whether one string is the suffix of the other one, allowing
    mismatches. Used to find an anchored 3' adapter when no indels are
    allowed.

    Both inputs are reversed and passed to ``compare_prefixes``, so the
    suffix comparison is carried out as a prefix comparison. The length it
    reports is then converted back into positions counted from the start
    of each input.

    Args:
        s1: First sequence.
        s2: Second sequence.
        wildcard_ref: Forwarded unchanged to ``compare_prefixes``.
        wildcard_query: Forwarded unchanged to ``compare_prefixes``.

    Returns:
        A 6-tuple ``(len(s1) - length, len(s1), len(s2) - length, len(s2),
        matches, errors)`` where ``length``, ``matches`` and ``errors`` are
        taken from the ``compare_prefixes`` result.
    """
    reversed1, reversed2 = s1[::-1], s2[::-1]
    prefix_result = compare_prefixes(
        reversed1, reversed2, wildcard_ref, wildcard_query)
    _, length, _, _, matches, errors = prefix_result

    size1 = len(reversed1)
    size2 = len(reversed2)
    return (size1 - length, size1, size2 - length, size2, matches, errors)
def compare_suffixes(s1, s2, wildcard_ref=False, wildcard_query=False):
    """
    Find out whether one string is the suffix of the other one, allowing
    mismatches. Used to find an anchored 3' adapter when no indels are
    allowed.

    Both inputs are reversed and passed to ``compare_prefixes``, so the
    suffix comparison is carried out as a prefix comparison. The length it
    reports is then converted back into positions counted from the start
    of each input.

    Args:
        s1: First sequence.
        s2: Second sequence.
        wildcard_ref: Forwarded unchanged to ``compare_prefixes``.
        wildcard_query: Forwarded unchanged to ``compare_prefixes``.

    Returns:
        A 6-tuple ``(len(s1) - length, len(s1), len(s2) - length, len(s2),
        matches, errors)`` where ``length``, ``matches`` and ``errors`` are
        taken from the ``compare_prefixes`` result.
    """
    reversed1, reversed2 = s1[::-1], s2[::-1]
    prefix_result = compare_prefixes(
        reversed1, reversed2, wildcard_ref, wildcard_query)
    _, length, _, _, matches, errors = prefix_result

    size1 = len(reversed1)
    size2 = len(reversed2)
    return (size1 - length, size1, size2 - length, size2, matches, errors)
def _match(insert_match, offset, insert_match_size, prob):
    """Check the overhang past an insert match against both adapters.

    This function reads ``self``, ``seq1``, ``seq2``, ``l1`` and ``l2``
    from the enclosing scope.
    NOTE(review): ``l1``/``l2`` are presumably the lengths of
    ``seq1``/``seq2`` -- confirm against the enclosing function.

    Args:
        insert_match: The insert match; returned unchanged as the first
            element of the result tuple.
        offset: Compared against ``self.min_adapter_overlap`` and used as
            an upper bound on the adapter length that is evaluated.
        insert_match_size: Index in each read at which the adapter
            comparison starts.
        prob: Not used by this function.

    Returns:
        ``(insert_match, None, None)`` when ``offset`` is below
        ``self.min_adapter_overlap``; ``None`` when the adapter comparison
        is rejected; otherwise ``(insert_match, Match, Match)`` with one
        ``Match`` per read.
    """
    if offset < self.min_adapter_overlap:
        # The reads are mostly overlapping, to the point where there's
        # not enough overhang to do a confident adapter match. Only the
        # insert match is returned, to signal that error correction can
        # be done even though no adapter trimming is required.
        return (insert_match, None, None)

    # TODO: this is very sensitive to the exact correct choice of adapter.
    # For example, if you specify GATCGGAA... and the correct adapter is
    # AGATCGGAA..., the prefixes will not match exactly and the alignment
    # will fail. We need to use a comparison that is a bit more forgiving.
    first_cmp = compare_prefixes(seq1[insert_match_size:], self.adapter1)
    second_cmp = compare_prefixes(seq2[insert_match_size:], self.adapter2)

    overlap = min(offset, self.adapter1_len, self.adapter2_len)
    mismatch_limit = round(overlap * self.max_adapter_mismatch_frac)

    # Reject only when *both* adapters exceed the mismatch limit
    # (index 5 of a comparison result is its error count).
    if first_cmp[5] > mismatch_limit and second_cmp[5] > mismatch_limit:
        return None

    # Index 4 of a comparison result is its match count.
    first_matches = first_cmp[4]
    second_matches = second_cmp[4]
    first_prob = self.match_probability(first_matches, overlap)
    second_prob = self.match_probability(second_matches, overlap)

    # The random-match-probability filter is applied only to overlaps
    # longer than the configured cutoff.
    if (overlap > self.adapter_check_cutoff) and (
            (first_prob * second_prob) > self.adapter_max_rmp):
        return None

    read1_adapter_len = min(self.adapter1_len, l1 - insert_match_size)
    read2_adapter_len = min(self.adapter2_len, l2 - insert_match_size)

    # Both returned Match objects carry the counts of whichever adapter
    # comparison has the lower match probability value.
    if first_prob < second_prob:
        best_matches, best_mismatches = first_matches, first_cmp[5]
    else:
        best_matches, best_mismatches = second_matches, second_cmp[5]

    read1_match = Match(
        0, read1_adapter_len, insert_match_size, l1,
        best_matches, best_mismatches)
    read2_match = Match(
        0, read2_adapter_len, insert_match_size, l2,
        best_matches, best_mismatches)
    return (insert_match, read1_match, read2_match)