def find_near_matches_levenshtein(subsequence, sequence, max_l_dist):
    """Find parts of `sequence` within a Levenshtein distance of `subsequence`.

    One of three search strategies is picked from the arguments:

    * ``max_l_dist == 0``: a plain exact search;
    * ``len(subsequence) // (max_l_dist + 1) >= 3``: the n-gram based
      search, find_near_matches_levenshtein_ngrams();
    * otherwise: the linear-programming search, after which overlapping
      matches are grouped and only the best match of each group is kept.

    Returns a list of Match objects describing the matching parts of the
    sequence.

    Raises ValueError if `subsequence` is empty or `max_l_dist` is negative.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')
    if max_l_dist < 0:
        raise ValueError('Maximum Levenshtein distance must be >= 0!')

    # A distance of zero leaves room for exact occurrences only.
    if max_l_dist == 0:
        subseq_len = len(subsequence)
        return [
            Match(pos, pos + subseq_len, 0)
            for pos in search_exact(subsequence, sequence)
        ]

    # The n-gram search is used only when the n-grams would be at least
    # three items long.
    if len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_levenshtein_ngrams(
            subsequence, sequence, max_l_dist)

    candidates = find_near_matches_levenshtein_linear_programming(
        subsequence, sequence, max_l_dist)
    # Overlapping candidates are collapsed to the best one of each group.
    return sorted(
        get_best_match_in_group(group)
        for group in group_matches(candidates)
    )
def _find_near_matches_generic_ngrams(subsequence, sequence, search_params): max_substitutions, max_insertions, max_deletions, max_l_dist = search_params.unpacked # optimization: prepare some often used things in advance subseq_len = len(subsequence) seq_len = len(sequence) ngram_len = subseq_len // (max_l_dist + 1) if ngram_len == 0: raise ValueError( 'the subsequence length must be greater than max_l_dist') for ngram_start in xrange(0, subseq_len - ngram_len + 1, ngram_len): ngram_end = ngram_start + ngram_len start_index = max(0, ngram_start - max_l_dist) end_index = min(seq_len, seq_len - subseq_len + ngram_end + max_l_dist) for index in search_exact(subsequence[ngram_start:ngram_end], sequence, start_index, end_index): # try to expand left and/or right according to n_ngram for match in find_near_matches_generic_linear_programming( subsequence, sequence[max(0, index - ngram_start - max_l_dist):index - ngram_start + subseq_len + max_l_dist], search_params, ): yield match._replace( start=match.start + max(0, index - ngram_start - max_l_dist), end=match.end + max(0, index - ngram_start - max_l_dist), )
def find_near_matches_substitutions(subsequence, sequence, max_substitutions):
    """Find near-matches of `subsequence` in `sequence`, substitutions only.

    The arguments are first validated with _check_arguments().  The search
    is then delegated according to the parameters:

    * ``max_substitutions == 0``: a plain exact search;
    * ``len(subsequence) // (max_substitutions + 1) >= 3``:
      find_near_matches_substitutions_ngrams();
    * otherwise: find_near_matches_substitutions_lp().

    Returns a list of Match objects for the exact case; in the other cases
    the result of the chosen search function is returned unchanged.
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    # No substitutions allowed: only exact occurrences can match.
    if max_substitutions == 0:
        subseq_len = len(subsequence)
        return [
            Match(pos, pos + subseq_len, 0)
            for pos in search_exact(subsequence, sequence)
        ]

    # The n-gram search is used only when the n-grams would be at least
    # three items long.
    if len(subsequence) // (max_substitutions + 1) >= 3:
        search_func = find_near_matches_substitutions_ngrams
    else:
        search_func = find_near_matches_substitutions_lp
    return search_func(subsequence, sequence, max_substitutions)
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
    """Find near-matches by Levenshtein distance, seeded by exact n-gram hits.

    The subsequence is cut into consecutive, non-overlapping n-grams of
    length ``len(subsequence) // (max_l_dist + 1)``.  For every exact
    occurrence of an n-gram in the sequence, the match is extended to the
    right and then to the left with _expand(); the left extension may only
    use the distance not already spent on the right.

    Overlapping matches are not all returned: matches are grouped with
    group_matches() and only the best match of each group (as chosen by
    get_best_match_in_group()) is kept.

    Returns a sorted list of Match objects.

    Raises ValueError if the n-gram length would be zero, i.e. the
    subsequence is not longer than `max_l_dist`.
    """
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError(
            'the subsequence length must be greater than max_l_dist')

    matches = []
    # `range` (not the Python-2-only `xrange`) keeps this in line with the
    # other n-gram searches; the number of n-grams is small.
    for ngram_start in range(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        ngram = subsequence[ngram_start:ngram_end]
        # The part before the n-gram is reversed so that the leftward
        # extension can be done with the same helper as the rightward one.
        subseq_before_reversed = subsequence[:ngram_start][::-1]
        subseq_after = subsequence[ngram_end:]
        # Limit the exact search to positions where the whole subsequence,
        # shifted by at most max_l_dist, could still lie inside the sequence.
        start_index = max(0, ngram_start - max_l_dist)
        end_index = min(seq_len,
                        seq_len - subseq_len + ngram_end + max_l_dist)
        for index in search_exact(ngram, sequence, start_index, end_index):
            # try to expand left and/or right according to n_ngram
            dist_right, right_expand_size = _expand(
                subseq_after,
                sequence[index + ngram_len:
                         index - ngram_start + subseq_len + max_l_dist],
                max_l_dist,
            )
            # A None distance is treated as "no acceptable extension".
            if dist_right is None:
                continue

            # Only the distance left over from the right side may be spent
            # on the left side; the window start is clamped at zero so the
            # slice never wraps around to the end of the sequence.
            remaining_dist = max_l_dist - dist_right
            dist_left, left_expand_size = _expand(
                subseq_before_reversed,
                sequence[max(0, index - ngram_start - remaining_dist):
                         index][::-1],
                remaining_dist,
            )
            if dist_left is None:
                continue
            assert dist_left + dist_right <= max_l_dist

            matches.append(
                Match(
                    start=index - left_expand_size,
                    end=index + ngram_len + right_expand_size,
                    dist=dist_left + dist_right,
                ))

    # don't return overlapping matches; instead, group overlapping matches
    # together and return the best match from each group
    match_groups = group_matches(matches)
    best_matches = [get_best_match_in_group(group) for group in match_groups]
    return sorted(best_matches)
def _find_near_matches_substitutions_ngrams(subsequence, sequence, max_substitutions): subseq_len = len(subsequence) seq_len = len(sequence) ngram_len = subseq_len // (max_substitutions + 1) if ngram_len == 0: raise ValueError( "The subsequence's length must be greater than max_substitutions!") for ngram_start in range(0, len(subsequence) - ngram_len + 1, ngram_len): ngram_end = ngram_start + ngram_len subseq_before = subsequence[:ngram_start] subseq_after = subsequence[ngram_end:] for index in search_exact( subsequence[ngram_start:ngram_end], sequence, ngram_start, seq_len - (subseq_len - ngram_end), ): n_substitutions = 0 seq_before = sequence[index - ngram_start:index] if subseq_before != seq_before: n_substitutions += count_differences_with_maximum( seq_before, subseq_before, max_substitutions - n_substitutions + 1) if n_substitutions > max_substitutions: continue seq_after = sequence[index + ngram_len:index - ngram_start + subseq_len] if subseq_after != seq_after: if n_substitutions == max_substitutions: continue n_substitutions += count_differences_with_maximum( seq_after, subseq_after, max_substitutions - n_substitutions + 1) if n_substitutions > max_substitutions: continue yield Match( start=index - ngram_start, end=index - ngram_start + subseq_len, dist=n_substitutions, )
def has_near_match_substitutions(subsequence, sequence, max_substitutions):
    """Check for a near-match of `subsequence` in `sequence`.

    Only substitutions are considered.  The arguments are first validated
    with _check_arguments(); the check is then delegated according to the
    parameters:

    * ``max_substitutions == 0``: True if search_exact() reports at least
      one occurrence, False otherwise;
    * ``len(subsequence) // (max_substitutions + 1) >= 3``:
      has_near_match_substitutions_ngrams();
    * otherwise: has_near_match_substitutions_lp().

    In the last two cases the result of the chosen function is returned
    unchanged.
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    if max_substitutions == 0:
        # Stops at the first exact occurrence, if there is one.
        return any(True for _ in search_exact(subsequence, sequence))

    # The n-gram variant is used only when the n-grams would be at least
    # three items long.
    if len(subsequence) // (max_substitutions + 1) >= 3:
        check_func = has_near_match_substitutions_ngrams
    else:
        check_func = has_near_match_substitutions_lp
    return check_func(subsequence, sequence, max_substitutions)
def choose_search_func(search_params):
    """Choose the search function best suited to `search_params`.

    `search_params` must provide `unpacked`, a 4-tuple
    ``(max_substitutions, max_insertions, max_deletions, max_l_dist)``, and
    a `max_l_dist` attribute.  The cases are tried in this order:

    1. ``max_l_dist == 0``: exact search only;
    2. no insertions and no deletions allowed:
       find_near_matches_substitutions();
    3. `max_l_dist` is not greater than any of the specific limits (a limit
       of None counts as unlimited): find_near_matches_levenshtein();
    4. otherwise: find_near_matches_generic().

    Returns a callable taking ``(subsequence, sequence, search_params)``.
    The returned callable reads its limits from the `search_params` it is
    called with, not from the ones given here.
    """
    (max_substitutions, max_insertions,
     max_deletions, max_l_dist) = search_params.unpacked

    # if the limitations are so strict that only exact matches are allowed,
    # use search_exact()
    if search_params.max_l_dist == 0:
        return lambda subsequence, sequence, search_params: [
            Match(index, index + len(subsequence), 0)
            for index in search_exact(subsequence, sequence)
        ]

    # if only substitutions are allowed, use find_near_matches_substitutions()
    # with the stricter of the total and the substitution limits
    if max_insertions == 0 and max_deletions == 0:
        return lambda subsequence, sequence, search_params: (
            find_near_matches_substitutions(
                subsequence,
                sequence,
                min(
                    limit
                    for limit in (search_params.max_l_dist,
                                  search_params.max_substitutions)
                    if limit is not None
                ),
            )
        )

    # if it is enough to just take into account the maximum Levenshtein
    # distance, use find_near_matches_levenshtein()
    unlimited = 1 << 29  # stands in for a limit of None
    strictest_specific_limit = min(
        unlimited if limit is None else limit
        for limit in (max_substitutions, max_insertions, max_deletions)
    )
    if max_l_dist <= strictest_specific_limit:
        return lambda subsequence, sequence, search_params: (
            find_near_matches_levenshtein(
                subsequence, sequence, search_params.max_l_dist)
        )

    # if none of the special cases above are met, use the most generic version
    return find_near_matches_generic
def find_near_matches_generic(subsequence, sequence, search_params):
    """Search for near-matches of subsequence in sequence.

    The nearly-matching parts of the sequence must respect the limits
    carried by `search_params` (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    The search strategy is picked from `search_params.max_l_dist`: an exact
    search when it is zero, the n-gram search when the n-gram length
    ``len(subsequence) // (max_l_dist + 1)`` is at least 3, and otherwise
    the linear-programming search, after which overlapping matches are
    grouped and only the best match of each group is kept.

    Raises ValueError if `subsequence` is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_l_dist = search_params.max_l_dist

    # if the limitations are so strict that only exact matches are allowed,
    # use search_exact()
    if max_l_dist == 0:
        subseq_len = len(subsequence)
        return [
            Match(pos, pos + subseq_len, 0)
            for pos in search_exact(subsequence, sequence)
        ]

    # if the n-gram length would be at least 3, use the n-gram search method
    if len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_generic_ngrams(
            subsequence, sequence, search_params)

    # use the linear programming search method
    candidates = find_near_matches_generic_linear_programming(
        subsequence, sequence, search_params)
    return sorted(
        get_best_match_in_group(group)
        for group in group_matches(candidates)
    )
def search(self, subsequence, sequence, start_index=0, end_index=None):
    """Return the results of search_exact() for the given range as a list.

    All arguments are passed to search_exact() unchanged; whatever it
    produces is collected into a list.
    """
    occurrences = search_exact(subsequence, sequence, start_index, end_index)
    return list(occurrences)
def find_near_matches_no_deletions_ngrams(subsequence, sequence, search_params):
    """Search for near-matches of subsequence in sequence, without deletions.

    The nearly-matching parts of the sequence may differ from the
    subsequence by:

    * at most max_substitutions substituted items
    * at most max_insertions extra items inserted
    * at most max_l_dist substitutions and insertions in total

    Deletions are never considered.  The limits are read from
    `search_params.unpacked`, a 4-tuple ``(max_substitutions,
    max_insertions, max_deletions, max_l_dist)``; the deletions limit is
    ignored, and the substitution and insertion limits are each capped at
    `max_l_dist`.

    The subsequence is cut into consecutive, non-overlapping n-grams of
    length ``len(subsequence) // (max_substitutions + max_insertions + 1)``.
    Every exact occurrence of an n-gram is extended to both sides with
    _expand().

    Returns a list of Match objects sorted by start index.

    Raises ValueError if `subsequence` is empty, or if the n-gram length
    would be zero.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_substitutions, max_insertions, _, max_l_dist = search_params.unpacked

    # No single kind of edit may exceed the total budget.
    max_substitutions = min(max_substitutions, max_l_dist)
    max_insertions = min(max_insertions, max_l_dist)

    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_substitutions + max_insertions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_subs + max_ins!"
        )

    matches = []
    # Start offsets already covered by an accepted match; n-gram hits that
    # map to one of them are skipped.
    matched_indexes = set()

    for ngram_start in range(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        ngram = subsequence[ngram_start:ngram_end]
        subseq_before = subsequence[:ngram_start]
        # Reversed so that the leftward extension can use the same helper
        # as the rightward one.
        subseq_before_reversed = subseq_before[::-1]
        subseq_after = subsequence[ngram_end:]
        # Insertions can shift the n-gram by up to max_insertions from its
        # position in the subsequence.
        start_index = max(0, ngram_start - max_insertions)
        end_index = min(
            seq_len, seq_len - (subseq_len - ngram_end) + max_insertions)

        for index in search_exact(ngram, sequence, start_index, end_index):
            # Where the match would start if nothing were inserted before
            # the n-gram.
            base_start = index - ngram_start
            if base_start in matched_indexes:
                continue

            seq_after = sequence[index + ngram_len:
                                 base_start + subseq_len + max_insertions]
            if seq_after.startswith(subseq_after):
                matches_after = [(0, 0)]
            else:
                matches_after = _expand(
                    subseq_after, seq_after,
                    max_substitutions, max_insertions, max_l_dist)
                if not matches_after:
                    continue

            # Budget left for the part before the n-gram, assuming the
            # cheapest extension found after it.
            subs_left = max_substitutions - min(m[0] for m in matches_after)
            ins_left = max_insertions - min(m[1] for m in matches_after)
            l_dist_left = max_l_dist - min(
                m[0] + m[1] for m in matches_after)

            # Clamp the window start at zero: a negative slice start counts
            # from the END of the sequence, which would leave the window
            # empty or truncated for matches at the start of the sequence.
            seq_before = sequence[max(0, base_start - ins_left):index]
            if seq_before.endswith(subseq_before):
                matches_before = [(0, 0)]
            else:
                matches_before = _expand(
                    subseq_before_reversed, seq_before[::-1],
                    subs_left, ins_left, l_dist_left,
                )

            # Combine every extension before the n-gram with every extension
            # after it, keeping the pairs that fit all three limits.
            for (subs_before, ins_before) in matches_before:
                for (subs_after, ins_after) in matches_after:
                    n_subs = subs_before + subs_after
                    n_ins = ins_before + ins_after
                    if (n_subs <= max_substitutions and
                            n_ins <= max_insertions and
                            n_subs + n_ins <= max_l_dist):
                        match_start = base_start - ins_before
                        matches.append(
                            Match(
                                start=match_start,
                                end=base_start + subseq_len + ins_after,
                                dist=n_subs + n_ins,
                            ))
                        matched_indexes.update(range(
                            match_start,
                            match_start + max_insertions + 1,
                        ))

    return sorted(matches, key=lambda match: match.start)