def find_overlap(s, t, min_overlap=1):
    """
    Report how the strings s and t overlap, if they do.

    An error-free alignment of t against s is attempted; min_overlap is
    set on the aligner before searching.

    Returns:
        None when no overlap is found.
        0 when one string is a prefix of the other.
        A positive int: the index within s at which t starts.
        A negative int: minus the index within t at which s starts.

    >>> find_overlap('ABCDE', 'CDE')
    2
    >>> find_overlap('CDE', 'ABCDEFG')
    -2
    >>> find_overlap('ABC', 'X') is None
    True
    """
    exact_aligner = Aligner(s, max_error_rate=0)
    exact_aligner.min_overlap = min_overlap
    hit = exact_aligner.locate(t)
    if hit is not None:
        # Only the two start coordinates matter; the rest is discarded.
        start_in_s, _stop_in_s, start_in_t, _stop_in_t, _matches, _errors = hit
        return start_in_s - start_in_t
    return None
def test_n_wildcard_in_ref_matches_n_wildcard_in_query_back():
    """Reference 'NNACGT' with wildcard_ref=True is located in a query that itself contains an N."""
    wildcard_aligner = Aligner(
        "NNACGT",
        max_error_rate=0,
        wildcard_ref=True,
        flags=Where.BACK.value,
    )
    expected = (0, 6, 3, 9, 6, 0)
    assert wildcard_aligner.locate("AAANTACGTAAA") == expected
def _aligner(self, mismatch):
    """
    Return the (index_a, index_b) aligner pair, creating it on first call.

    mismatch is indexed as mismatch[0] for index_a and mismatch[1] for
    index_b. Each value is divided by the index length to obtain the
    maximum error rate, and subtracted from the index length to obtain
    the aligner's min_overlap.

    The pair is cached on the instance: once ``_aligner_a`` exists, the
    stored aligners are returned unchanged and ``mismatch`` is ignored.
    """
    if hasattr(self, "_aligner_a"):
        return self._aligner_a, self._aligner_b

    rate_a = float(mismatch[0]) / len(self.index_a)
    rate_b = float(mismatch[1]) / len(self.index_b)
    self._aligner_a = Aligner(self.index_a, rate_a)
    self._aligner_b = Aligner(self.index_b, rate_b)
    self._aligner_a.min_overlap = len(self.index_a) - mismatch[0]
    self._aligner_b.min_overlap = len(self.index_b) - mismatch[1]
    return self._aligner_a, self._aligner_b
def locate(reference, query, max_error_rate, flags=SEMIGLOBAL, wildcard_ref=False,
           wildcard_query=False, min_overlap=1):
    """
    Align query against reference using a freshly built Aligner.

    All arguments except query are forwarded to the Aligner constructor;
    the return value is whatever ``Aligner.locate(query)`` returns.
    """
    one_shot = Aligner(
        reference,
        max_error_rate,
        flags,
        wildcard_ref,
        wildcard_query,
        min_overlap=min_overlap,
    )
    return one_shot.locate(query)
def test_n_wildcards_not_counted_aligner_back():
    """The N characters of a wildcard reference do not count towards its effective length."""
    ref = 'AGGNNNNNNNNNNNNNNTTC'
    assert len(ref) == 20
    back_aligner = Aligner(
        ref,
        max_error_rate=0.1,
        wildcard_ref=True,
        flags=Where.BACK.value,
        min_overlap=3,
    )
    assert back_aligner.effective_length == 6
    assert back_aligner.locate('TTC') is None

    # The first four fields of a match are:
    # adapter start, adapter stop, read start, read stop
    assert back_aligner.locate('AGG')[:4] == (0, 3, 0, 3)
    assert back_aligner.locate('AGGCCCCCCC')[:4] == (0, 10, 0, 10)
    assert back_aligner.locate('ATGCCCCCCC') is None
    assert back_aligner.locate('AGGCCCCCCCCCCCCCCATC') is None

    read = 'CCC' + ref.replace('N', 'G') + 'AAA'
    assert back_aligner.locate(read) == (0, 20, 3, 23, 20, 0)
def test_edit_environment(k, s, environment_func):
    """
    Cross-check environment_func(s, k) against three independent sources.

    The generated strings must be unique and equal to the naive
    enumeration, and every (string, distance, matches) entry must agree
    with a full-length alignment and with edit_distance().
    """
    environment = list(environment_func(s, k))
    strings, _distances, _match_counts = zip(*environment)
    expected_strings = set(naive_edit_environment(s, k))
    assert len(set(strings)) == len(strings)
    assert set(strings) == expected_strings

    if s:
        error_rate = k / len(s)
    else:
        error_rate = 0.0
    aligner = Aligner(s, max_error_rate=error_rate, flags=0, min_overlap=len(s))

    for t, dist, m in environment:
        alignment = aligner.locate(t)
        ref_start, ref_stop, query_start, query_stop, aligned_matches, aligned_errors = alignment
        assert aligned_errors == dist
        assert m == aligned_matches
        # Both strings must be covered end to end.
        assert ref_start == 0
        assert ref_stop == len(s)
        assert query_start == 0
        assert query_stop == len(t)
        assert edit_distance(s, t) == dist
        assert m <= len(s), (s, t, dist)
        assert m <= len(t), (s, t, dist)
def test(self):
    """Smoke test: locating a two-character query with flags=BACK must not raise."""
    Aligner('CTCCAGCTTAGACATATC', 0.1, flags=BACK).locate('CC')
def test_100_percent_error_rate(self):
    """Smoke test: a max_error_rate of 1.0 with flags=BACK must not raise."""
    Aligner('GCTTAGACATATC', 1.0, flags=BACK).locate('CAA')
def test_not_only_n_wildcards(self):
    """Building an Aligner from an all-N reference with wildcard_ref=True raises ValueError."""
    with pytest.raises(ValueError) as info:
        Aligner('NNNNN', 0.1, wildcard_ref=True)
    # Inspect the exception only after the context manager has captured it.
    message = info.value.args[0]
    assert "only N wildcards" in message
def test_100_percent_error_rate(self):
    """Smoke test: a max_error_rate of 1.0 with Where.BACK flags must not raise."""
    Aligner('GCTTAGACATATC', 1.0, flags=Where.BACK.value).locate('CAA')
def test(self):
    """Smoke test: locating a two-character query with Where.BACK flags must not raise."""
    Aligner('CTCCAGCTTAGACATATC', 0.1, flags=Where.BACK.value).locate('CC')
def test_find_empty_in_empty(self):
    """Locating an empty query in an empty reference yields an all-zero result."""
    empty_aligner = Aligner("", 0, flags=0, min_overlap=0)
    all_zero = (0, 0, 0, 0, 0, 0)
    assert all_zero == empty_aligner.locate("")