def test_list_of_words_one_extra(self):
    """Word-list search where the sequence contains one extra word ("the")."""
    needle = "jumped over lazy dog".split()
    haystack = "the big brown fox jumped over the lazy dog".split()
    # The only possible hit absorbs the extra "the" as a single insertion.
    hit = [
        Match(start=4, end=9, dist=1,
              matched="jumped over the lazy dog".split()),
    ]
    cases = [
        ((0, 0, 0, 0), []),
        ((1, 0, 0, 1), []),
        ((0, 1, 0, 1), hit),
        ((0, 0, 1, 1), []),
        ((1, 1, 1, 1), hit),
        ((2, 2, 2, 2), hit),
    ]
    for params, expected in cases:
        self.expectedOutcomes(
            self.search(needle, haystack, *params),
            expected,
        )
def test_missing_second_item_complex(self):
    """With all edit types allowed, every reported match for 'bde' in
    'abcdefg' must come from the known set of valid one-edit matches."""
    found = set(self.search(b('bde'), b('abcdefg'), 1, 1, 1, 1))
    allowed = [
        Match(start=1, end=5, dist=1, matched=b('bcde')),
        Match(start=2, end=5, dist=1, matched=b('cde')),
        Match(start=3, end=5, dist=1, matched=b('de')),
    ]
    self.assertTrue(found.issubset(allowed))
def test_double_first_item_two_results(self):
    """A doubled 'd' just before the match yields both the fuzzy match
    that includes the extra 'd' and the exact match after it."""
    expected = [
        Match(start=3, end=7, dist=1, matched=b('ddef')),
        Match(start=4, end=7, dist=0, matched=b('def')),
    ]
    self.assertEqual(
        self.search(b('def'), b('abcddefg'), 0, 1, 0),
        expected,
    )
def test_double_first_item_two_results(self):
    """A doubled 'd' just before the match yields both the fuzzy match
    that includes the extra 'd' and the exact match after it."""
    expected = [
        Match(start=3, end=7, dist=1, matched=b('ddef')),
        Match(start=4, end=7, dist=0, matched=b('def')),
    ]
    self.expectedOutcomes(
        self.search(b('def'), b('abcddefg'), 0, 1, 0, 1),
        expected,
    )
def test_only_deletions(self):
    """With only deletions allowed, progressively shorter prefixes of the
    common 'TEST' prefix match at increasing distances."""
    expected = [
        Match(start=0, end=4, dist=3, matched=b('TEST')),
        Match(start=1, end=4, dist=4, matched=b('EST')),
        Match(start=2, end=4, dist=5, matched=b('ST')),
    ]
    self.expectedOutcomes(
        self.search(b('TESTabc'), b('TEST123'), 0, 0, 5, None),
        expected,
    )
def test_separate(self):
    """Matches far apart from one another each get their own group."""
    matches = [
        Match(start=19, end=29, dist=1, matched='x' * 10),
        Match(start=42, end=52, dist=1, matched='x' * 10),
        Match(start=99, end=109, dist=0, matched='x' * 10),
    ]
    expected_groups = [{match} for match in matches]
    self.assertEqual(group_matches(matches), expected_groups)
def test_two_identical(self):
    """Two occurrences of the pattern produce two exact matches, whether
    adjacent or separated by an unrelated character."""
    for text, second_start in [(b('abcabc'), 3), (b('abcXabc'), 4)]:
        self.expectedOutcomes(
            self.search(b('abc'), text, max_subs=1),
            [Match(start=0, end=3, dist=0),
             Match(start=second_start, end=second_start + 3, dist=0)],
        )
def test_null_bytes(self):
    """Null bytes are handled like any other byte value, both in the
    searched text and inside the pattern itself."""
    for needle, haystack, start in [
        (b('abc'), b('xx\0abcxx'), 3),
        (b('a\0b'), b('xxa\0bcxx'), 2),
    ]:
        self.assertEqual(
            self.search(needle, haystack, 0, 0, 0, 0),
            [Match(start=start, end=start + 3, dist=0, matched=needle)],
        )
def test_separate_with_duplicate(self):
    """A duplicated match is merged into its original's group rather than
    producing an extra group."""
    matches = [
        Match(start=19, end=29, dist=1),
        Match(start=42, end=52, dist=1),
        Match(start=99, end=109, dist=0),
    ]
    with_duplicate = matches + [matches[1]]
    self.assertEqual(
        group_matches(with_duplicate),
        [{match} for match in matches],
    )
def test_missing_second_to_last_item(self):
    """'bce' vs 'abcdefg' is one deletion away; the same single match is
    reported at both allowed distances."""
    for max_l_dist in (1, 2):
        self.assertEqual(
            self.search('bce', 'abcdefg', max_l_dist=max_l_dist),
            [Match(start=1, end=5, dist=1)],
        )
def test_max_substitutions_gte_subseq_len(self):
    """When max_subs >= len(subsequence), every alignment is a match."""
    # A single-character pattern matches at every position of the text.
    for max_subs in (1, 2, 5):
        self.expectedOutcomes(
            self.search(b('b'), b('abc'), max_subs),
            [Match(0, 1, 1), Match(1, 2, 0), Match(2, 3, 1)],
        )
    # An exact self-match is still found no matter how loose the limit.
    pattern_len = len('PATTERN')
    for extra_subs in (0, 1, 7):
        self.expectedOutcomes(
            self.search(b('PATTERN'), b('PATTERN'),
                        pattern_len + extra_subs),
            [Match(0, pattern_len, 0)],
        )
def test_double_first_item(self):
    """With one substitution only the exact match appears; with two, the
    misaligned candidate over the doubled 'd' also qualifies."""
    needle, haystack = b('def'), b('abcddefg')
    self.expectedOutcomes(
        self.search(needle, haystack, max_subs=1),
        [Match(start=4, end=7, dist=0)],
    )
    self.expectedOutcomes(
        self.search(needle, haystack, max_subs=2),
        [Match(start=3, end=6, dist=2), Match(start=4, end=7, dist=0)],
    )
def test_list_of_words(self):
    """Word-list search where one word differs ("a" vs "the")."""
    needle = "over a lazy dog".split()
    haystack = "the big brown fox jumped over the lazy dog".split()
    one_sub = [Match(start=5, end=9, dist=1)]
    for max_l_dist, expected in [(0, []), (1, one_sub), (2, one_sub)]:
        self.assertEqual(
            self.search(needle, haystack, max_l_dist),
            expected,
        )
def test_protein_search1(self):
    """Protein-sequence search regression test.

    See:
    * BioPython archives from March 14th, 2014
      http://lists.open-bio.org/pipermail/biopython/2014-March/009030.html
    * https://github.com/taleinat/fuzzysearch/issues/3
    """
    text = b(''.join('''\
XXXXXXXXXXXXXXXXXXXGGGTTVTTSSAAAAAAAAAAAAAGGGTTLTTSSAAAAAAAAAAAA
AAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBGGGTTLTTSS
'''.split()))
    pattern = b("GGGTTLTTSS")
    exact_matches = [Match(start=42, end=52, dist=0),
                     Match(start=99, end=109, dist=0)]
    fuzzy_match = Match(start=19, end=29, dist=1)
    # No substitutions: only the two exact occurrences.
    self.expectedOutcomes(
        self.search(pattern, text, max_subs=0),
        exact_matches,
    )
    # One or two substitutions also pick up the GGGTTVTTSS variant.
    for max_subs in (1, 2):
        self.expectedOutcomes(
            self.search(pattern, text, max_subs=max_subs),
            [fuzzy_match] + exact_matches,
        )
def test_non_string_sequences(self):
    """list and tuple sequences are supported just like strings."""
    # (needle, haystack, search params, expected outcomes)
    cases = [
        ([1, 2, 3], [1, 2, 3], (0, 0, 0, 0), [Match(start=0, end=3, dist=0)]),
        ([1, 2, 3], [1, 2, 3], (1, 1, 1, 1), [Match(start=0, end=3, dist=0)]),
        ([1, 2, 3], [1, 2, 4], (0, 0, 0, 0), []),
        ([1, 2, 3], [1, 2, 4], (1, 1, 1, 1), [Match(start=0, end=3, dist=1)]),
        ([1, 2, 3], [1, 2, 4], (0, 0, 1, 1), [Match(start=0, end=3, dist=1)]),
    ]
    for klass in (list, tuple):
        with self.subTest(klass.__name__):
            for needle, haystack, params, expected in cases:
                self.expectedOutcomes(
                    self.search(klass(needle), klass(haystack), *params),
                    expected,
                )
def test_list_of_words_one_substituted(self):
    """Word-list search where one word is substituted ("my" vs "the")."""
    needle = "jumped over my lazy dog".split()
    haystack = "the big brown fox jumped over the lazy dog".split()
    hit = [Match(start=4, end=9, dist=1)]
    cases = [
        ((0, 0, 0, 0), []),
        ((1, 0, 0, 1), hit),
        ((0, 1, 0, 1), []),
        ((0, 0, 1, 1), []),
        # substitution = insertion + deletion; dist = 1 !!
        ((0, 1, 1, 1), hit),
        ((1, 1, 1, 1), hit),
        ((2, 2, 2, 2), hit),
    ]
    for params, expected in cases:
        self.expectedOutcomes(
            self.search(needle, haystack, *params),
            expected,
        )
def test_unicode_substring(self):
    """Exact search works on non-ASCII unicode text (Greek letters)."""
    pattern = u('\u03A3\u0393')
    text = u('\u03A0\u03A3\u0393\u0394')
    self.expectedOutcomes(
        self.search(pattern, text, max_subs=0),
        [Match(1, 3, 0)],
    )
def test_single_substitution_in_long_text(self):
    """A single substituted character is found in a ~1000-character text,
    both with deletion-only-style limits and with all edits allowed."""
    substring = b('PATTERN')
    text = b(''.join([
        x.strip() for x in '''\
FySijRLMtLLWkMnWxTbzIWuxOUbfAahWYKUlOZyhoQhfExJPOSwXxBLrlqdoUwpRW
FEtHFiepnOTbkttuagADQaUTvkvKzvqaFaMnAPfolPpmXitKLDQhAqDOJwFzdcKmk
cfVStxZGDUbrHjrDwVVRihbklyfqLJjrzGuhVGDzgSpCHXvaGPHebbcUAnAgfqqpA
uMOowtptcoQUeAbdqJAmieLDxCrOPivbSwmriQwfFCDTXbswFqClZPnSkDkCyvPCi
bmAjVGnuVsrZlPypglXlVVQKzMpQuWQynOLGDqwrAnsvYTcArkEhFpEgahWVQGOvv
CTvbYZRVqqPCDRsyWeTVgANxZIyVAtENnndbsHzpEcPUfqCBUroIGRNEIMHYIZANy
LeeVKEwihbvWZVOWPeAlmNKnhhoEPIcpDJDzPOYHSltxhSsZeeWMqtAnuSoFOIrqB
EPUFIlKkpamljHylnTIWqaESoWbYESVPEeZtlAzpInuwFaNIYUvzpJNIlPtuOjUuT
efaGnOXvQeHdaRPrdHCepPATXERNDdnkzuLHQcVWKpgHhGifBySAkWkthrzfZDHDU
HJxjpLXseKuldLRftyctGvVKyrRTUCRAakjwTSWivGdksOZabnkBoRtMstlNwXcwg
UCFLaWFxjqjasOfNeThrbubVGtyYRROYUOTMUmeSdJcBKxVXiaWDZoHyKtQRXwpVO
pEmlpdzKWkFpDtHHdImhDJIXwxzjwyNLaTgPLHmcyhJGqncCblxALMdPEDaRtGFMg
BskUxPGATTLKMFeIjgFJpudyMWlASyFSiaDWrOCgRfwjfpMYfuNQIqzvZbguWsnaq
tRaXcxavobetBbbfMDjstQLjoJLwiajVRKhFVspIdgrmTMEBbjtpMnSpTkmFcRBZZ
GUOWnesGgZeKkIQhlxlRPTtjUbbpaPlmxeiBdUKHHApgvEybUwWwXCoXFsauNiINm
AGATFdcaHzgoRpbBFhKdJkLMF'''.splitlines()
    ]))
    expected_match = Match(start=541, end=548, dist=1,
                           matched=text[541:548])
    for params in [(1, 0, 0, 1), (1, 1, 1, 1)]:
        self.assertEqual(
            self.search(substring, text, *params),
            [expected_match],
        )
def find_near_matches_levenshtein(subsequence, sequence, max_l_dist):
    """Find near-matches of the subsequence in the sequence.

    This chooses a suitable fuzzy search implementation according to the
    given parameters.

    Returns a list of fuzzysearch.Match objects describing the matching parts
    of the sequence.

    Raises ValueError if the subsequence is empty or max_l_dist is negative.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')
    if max_l_dist < 0:
        raise ValueError('Maximum Levenshtein distance must be >= 0!')

    # Distance zero means only exact matches: delegate to the fast
    # exact-search implementation.
    if max_l_dist == 0:
        return [
            Match(index, index + len(subsequence), 0)
            for index in search_exact(subsequence, sequence)
        ]

    # When each of the (max_l_dist + 1) pattern pieces is at least 3
    # characters long, the n-gram method is applicable.
    if len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_levenshtein_ngrams(subsequence, sequence,
                                                    max_l_dist)

    # Otherwise fall back to the "linear programming" method, keeping only
    # the best match out of each group of overlapping raw matches.
    raw_matches = find_near_matches_levenshtein_linear_programming(
        subsequence, sequence, max_l_dist)
    best_matches = [
        get_best_match_in_group(group)
        for group in group_matches(raw_matches)
    ]
    return sorted(best_matches)
def find_near_matches_substitutions(subsequence, sequence,
                                    max_substitutions):
    """Find near-matches of the subsequence in the sequence.

    This chooses a suitable fuzzy search implementation according to the
    given parameters.

    Returns a list of fuzzysearch.Match objects describing the matching parts
    of the sequence.
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    # Zero substitutions means only exact matches: use the fast exact search.
    if max_substitutions == 0:
        return [
            Match(index, index + len(subsequence), 0)
            for index in search_exact(subsequence, sequence)
        ]

    # When each of the (max_substitutions + 1) pattern pieces is at least 3
    # characters long, the n-gram method is applicable.
    if len(subsequence) // (max_substitutions + 1) >= 3:
        return find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions,
        )

    # Otherwise fall back to the "linear programming" method.
    return find_near_matches_substitutions_lp(
        subsequence, sequence, max_substitutions,
    )
def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                           max_substitutions):
    """Substitutions-only n-gram search.

    Uses the C "byteslike" implementation when neither input is a text
    string and both support the buffer interface; otherwise falls back to
    the pure-Python implementation.
    """
    neither_is_text = not (isinstance(subsequence, text_type) or
                           isinstance(sequence, text_type))
    if neither_is_text:
        try:
            indexes = _subs_only_fnm_ngram_byteslike(
                subsequence, sequence, max_substitutions)
        except TypeError:
            # Inputs don't support the buffer interface; use the
            # pure-Python fallback below.
            pass
        else:
            subseq_len = len(subsequence)
            matches = []
            for index in indexes:
                # Recompute the actual distance for each candidate index.
                dist = count_differences_with_maximum(
                    sequence[index:index + subseq_len],
                    subsequence,
                    max_substitutions + 1,
                )
                matches.append(Match(index, index + subseq_len, dist))
            # Keep only the best match from each group of overlapping ones.
            return [
                get_best_match_in_group(group)
                for group in group_matches(matches)
            ]

    return py_find_near_matches_substitutions_ngrams(
        subsequence, sequence, max_substitutions)
def find_near_matches_generic(subsequence, sequence, search_params):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the
    subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * and the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_l_dist = search_params.max_l_dist

    # If the limitations are so strict that only exact matches are allowed,
    # use search_exact().
    if max_l_dist == 0:
        return [
            Match(index, index + len(subsequence), 0)
            for index in search_exact(subsequence, sequence)
        ]

    # If the n-gram length would be at least 3, use the n-gram search method.
    if len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_generic_ngrams(subsequence, sequence,
                                                search_params)

    # Otherwise use the linear programming search method.
    return find_near_matches_generic_linear_programming(subsequence, sequence,
                                                        search_params)
def test_two_extra(self):
    """Two inserted '-' characters can be consumed either as insertions
    or as substitutions, depending on the allowed limits."""
    haystack = '--abc--de--'
    needle = 'abcde'
    via_insertions = Match(start=2, end=9, dist=2)
    via_substitutions = Match(start=2, end=7, dist=2)
    self.assertEqual(
        fnm_nodels_ngrams(needle, haystack, 0, 2, 2),
        [via_insertions],
    )
    self.assertEqual(
        fnm_nodels_ngrams(needle, haystack, 2, 0, 2),
        [via_substitutions],
    )
    self.assertEqual(
        fnm_nodels_ngrams(needle, haystack, 2, 2, 2),
        [via_substitutions, via_insertions],
    )
def test_cases(self):
    """Run every table-driven test case from self.test_cases_data."""
    for name, (substring, text, dist2expected) in \
            self.test_cases_data.items():
        with self.subTest(name=name):
            for max_l_dist, expected_matches in dist2expected:
                self.assertEqual(
                    self.search(substring, text, max_l_dist=max_l_dist),
                    [Match(*args) for args in expected_matches],
                )
def test_short_substring(self):
    """An exact two-character substring is located precisely."""
    needle = b('XY')
    haystack = b('abcdefXYghij')
    self.assertEqual(
        self.search(needle, haystack, 0, 0, 0, 0),
        [Match(start=6, end=8, dist=0, matched=needle)],
    )
def test_substring(self):
    """An exact substring surrounded by filler is located precisely."""
    needle = b('PATTERN')
    haystack = b('aaaaaaaaaaPATTERNaaaaaaaaa')
    self.assertEqual(
        self.search(needle, haystack, 0, 0, 0, 0),
        [Match(start=10, end=17, dist=0, matched=needle)],
    )
def _find_near_matches_substitutions_lp(subsequence, sequence,
                                        max_substitutions):
    """Yield Match objects for substitutions-only near-matches.

    Generator: yields a Match for every alignment of `subsequence` within
    `sequence` whose number of mismatching characters is at most
    `max_substitutions`.  Each yielded Match spans exactly
    len(subsequence) items of the sequence.
    """
    # simple optimization: prepare some often used things in advance
    _SUBSEQ_LEN = len(subsequence)
    _SUBSEQ_LEN_MINUS_ONE = _SUBSEQ_LEN - 1

    # prepare quick lookup of where a character appears in the subsequence
    char_indexes_in_subsequence = defaultdict(list)
    for (index, char) in enumerate(subsequence):
        char_indexes_in_subsequence[char].append(index)

    # we'll iterate over the sequence once, but the iteration is split into two
    # for loops; therefore we prepare an iterator in advance which will be used
    # in both of the loops
    sequence_enum_iter = enumerate(sequence)

    # We'll count the number of matching characters assuming various attempted
    # alignments of the subsequence to the sequence. At any point in the
    # sequence there will be N such alignments to update. We'll keep
    # these in a "circular array" (a.k.a. a ring) which we'll rotate after each
    # iteration to re-align the indexing.

    # Initialize the candidate counts by iterating over the first N-1 items in
    # the sequence. No possible matches in this step!
    candidates = deque([0], maxlen=_SUBSEQ_LEN)
    for (index, char) in islice(sequence_enum_iter, _SUBSEQ_LEN_MINUS_ONE):
        # Only alignments that have already started (idx <= index) may be
        # credited with this character match.
        for subseq_index in [
            idx for idx in char_indexes_in_subsequence[char] if idx <= index
        ]:
            candidates[subseq_index] += 1
        # Open a new candidate alignment starting at the next position.
        candidates.appendleft(0)

    # From the N-th item onwards, we'll update the candidate counts exactly as
    # above, and additionally check if the part of the sequence which began N-1
    # items before the current index was a near enough match to the given
    # sub-sequence.
    for (index, char) in sequence_enum_iter:
        for subseq_index in char_indexes_in_subsequence[char]:
            candidates[subseq_index] += 1

        # rotate the ring of candidate counts
        candidates.rotate(1)

        # fetch the count for the candidate which started N-1 items ago
        n_substitutions = _SUBSEQ_LEN - candidates[0]

        # set the count for the next index to zero
        candidates[0] = 0

        # if the candidate had few enough mismatches, yield a match
        if n_substitutions <= max_substitutions:
            yield Match(
                start=index - _SUBSEQ_LEN_MINUS_ONE,
                end=index + 1,
                dist=n_substitutions,
            )
def test_missing_second_item(self):
    """'bde' in 'abcdefg' is found only when exactly one edit of the
    appropriate type is allowed."""
    needle = b('bde')
    haystack = b('abcdefg')
    cases = [
        # (search params, expected outcomes)
        ((0, 1, 0, 1), [Match(start=1, end=5, dist=1, matched=b('bcde'))]),
        ((0, 0, 0, 0), []),
        ((1, 0, 0, 1), [Match(start=2, end=5, dist=1, matched=b('cde'))]),
        ((0, 0, 1, 1), [Match(start=3, end=5, dist=1, matched=b('de'))]),
    ]
    for params, expected in cases:
        self.assertEqual(self.search(needle, haystack, *params), expected)
def test_one_sub_one_ins(self):
    """Finding 'bceXghi' in 'abcdefghij' requires one substitution AND
    one insertion; tighter limits must yield no match."""
    haystack = 'abcdefghij'
    needle = 'bceXghi'
    for params in [(0, 0, 0), (0, 1, 2), (1, 0, 2), (1, 1, 1)]:
        self.assertEqual(fnm_nodels_ngrams(needle, haystack, *params), [])
    self.assertEqual(
        fnm_nodels_ngrams(needle, haystack, 1, 1, 2),
        [Match(start=1, end=9, dist=2)],
    )
def search(self, subsequence, sequence, max_subs):
    """Run the byteslike LP search and wrap raw indexes as Match objects,
    recomputing each match's actual substitution count."""
    subseq_len = len(subsequence)
    matches = []
    for index in fnm_subs_lp_byteslike(subsequence, sequence, max_subs):
        segment = sequence[index:index + subseq_len]
        dist = count_differences_with_maximum(segment, subsequence,
                                              max_subs + 1)
        matches.append(Match(index, index + subseq_len, dist,
                             matched=segment))
    return matches