def distance(self, string_1, string_2, max_distance):
    """Return the edit distance between two strings, bounded by a limit.

    This method handles the cheap early exits and prepares the reusable
    scratch buffers; the metric itself is computed by ``self._distance``
    or ``self._distance_max``.

    Args:
        string_1: One of the strings to compare (may be ``None``).
        string_2: The other string to compare (may be ``None``).
        max_distance: The largest distance the caller cares about.

    Returns:
        ``-1`` when the distance exceeds ``max_distance``, ``0`` when the
        strings are equal, otherwise the value produced by the underlying
        distance routine. When either input is ``None`` the result is
        whatever ``helpers.null_distance_results`` returns.
    """
    if string_1 is None or string_2 is None:
        return helpers.null_distance_results(string_1, string_2, max_distance)

    # A non-positive limit leaves only the exact-match question.
    if max_distance <= 0:
        if string_1 == string_2:
            return 0
        return -1

    # Clamp to the largest signed 32-bit value and force an int.
    max_distance = int(min(2 ** 31 - 1, max_distance))

    # Keep the shorter string in string_1: more of the work then happens in
    # the inner loop of the main processing, which can be a little faster.
    if len(string_1) > len(string_2):
        string_1, string_2 = string_2, string_1

    # The length gap alone already exceeds the limit.
    if len(string_2) - len(string_1) > max_distance:
        return -1

    # Identify a common suffix and/or prefix that can be ignored.
    len_1, len_2, start = helpers.prefix_suffix_prep(string_1, string_2)
    if len_1 == 0:
        if len_2 <= max_distance:
            return len_2
        return -1

    # Grow both int32 scratch arrays when the cached ones are too short.
    if len_2 > len(self._base_char_1_costs):
        self._base_char_1_costs = np.zeros(len_2, dtype=np.int32)
        self._base_prev_char_1_costs = np.zeros(len_2, dtype=np.int32)

    costs = self._base_char_1_costs
    prev_costs = self._base_prev_char_1_costs

    # Use the limit-aware routine only when the limit is below len_2.
    if max_distance < len_2:
        return self._distance_max(
            string_1,
            string_2,
            len_1,
            len_2,
            start,
            max_distance,
            costs,
            prev_costs,
        )
    return self._distance(
        string_1, string_2, len_1, len_2, start, costs, prev_costs
    )
def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
    """Computes the Damerau-Levenshtein optimal string alignment distance.

    Only the cheap early exits and the scratch-buffer setup happen here; the
    actual computation is handed to ``self._distance`` or
    ``self._distance_max``.

    Args:
        string_1: One of the strings to compare.
        string_2: The other string to compare.
        max_distance: The maximum distance that is of interest.

    Returns:
        -1 if the distance is greater than ``max_distance``, 0 if the
        strings are equivalent, otherwise a positive number that grows as
        the strings become more different.
    """
    if string_1 is None or string_2 is None:
        return helpers.null_distance_results(string_1, string_2, max_distance)

    # With a non-positive limit only an exact match can succeed.
    if max_distance <= 0:
        if string_1 == string_2:
            return 0
        return -1

    # Clamp to the largest signed 32-bit value and force an int.
    max_distance = int(min(2**31 - 1, max_distance))

    # Put the shorter string in string_1 so that more time is spent spinning
    # just the inner loop during the main processing.
    if len(string_1) > len(string_2):
        string_1, string_2 = string_2, string_1

    # The length gap alone already exceeds the limit.
    if len(string_2) - len(string_1) > max_distance:
        return -1

    # Identify a common suffix and/or prefix that can be ignored.
    len_1, len_2, start = helpers.prefix_suffix_prep(string_1, string_2)
    if len_1 == 0:
        if len_2 <= max_distance:
            return len_2
        return -1

    # Replace both cached cost lists when they are shorter than len_2.
    if len_2 > len(self._base_char_1_costs):
        self._base_char_1_costs = [0] * len_2
        self._base_prev_char_1_costs = [0] * len_2

    costs = self._base_char_1_costs
    prev_costs = self._base_prev_char_1_costs

    # Use the limit-aware routine only when the limit is below len_2.
    if max_distance < len_2:
        return self._distance_max(
            string_1,
            string_2,
            len_1,
            len_2,
            start,
            max_distance,
            costs,
            prev_costs,
        )
    return self._distance(
        string_1, string_2, len_1, len_2, start, costs, prev_costs
    )
def distance(self, string_1, string_2, max_distance):
    """Return the distance between two strings, bounded by a limit.

    The distance is obtained from ``damerau_levenshtein_distance_pyx``
    (defined outside this block) and then compared against the limit.

    Args:
        string_1: One of the strings to compare (may be ``None``).
        string_2: The other string to compare (may be ``None``).
        max_distance: The largest distance the caller cares about.

    Returns:
        ``-1`` when the computed cost exceeds ``max_distance``, ``0`` when
        ``max_distance`` is non-positive and the strings are equal,
        otherwise the computed cost. When either input is ``None`` the
        result is whatever ``helpers.null_distance_results`` returns.
    """
    if string_1 is None or string_2 is None:
        return helpers.null_distance_results(string_1, string_2, max_distance)

    # With a non-positive limit only an exact match can succeed.
    if max_distance <= 0:
        if string_1 == string_2:
            return 0
        return -1

    # Clamp to the largest signed 32-bit value and force an int.
    limit = int(min(2**31 - 1, max_distance))

    cost = damerau_levenshtein_distance_pyx(string_1, string_2)
    if cost <= limit:
        return cost
    return -1
def distance(self, string_1, string_2, max_distance):
    """Compute and return the Levenshtein edit distance between two strings.

    Only the cheap early exits and the scratch-buffer setup happen here; the
    actual computation is handed to ``self._distance`` or
    ``self._distance_max``.

    Parameters
    ----------
    string_1 : str
        One of the strings to compare.
    string_2 : str
        The other string to compare.
    max_distance : int
        The maximum distance that is of interest.

    Returns
    -------
    int
        -1 if the distance is greater than ``max_distance``, 0 if the
        strings are equivalent, otherwise a positive number whose
        magnitude increases as the difference between the strings
        increases.
    """
    if string_1 is None or string_2 is None:
        return helpers.null_distance_results(string_1, string_2, max_distance)

    # With a non-positive limit only an exact match can succeed.
    if max_distance <= 0:
        if string_1 == string_2:
            return 0
        return -1

    # Clamp to the largest signed 32-bit value and force an int.
    max_distance = int(min(2**31 - 1, max_distance))

    # Put the shorter string in string_1 so that more time is spent
    # spinning just the inner loop during the main processing.
    if len(string_1) > len(string_2):
        string_1, string_2 = string_2, string_1

    # The length gap alone already exceeds the limit.
    if len(string_2) - len(string_1) > max_distance:
        return -1

    # Identify a common suffix and/or prefix that can be ignored.
    len_1, len_2, start = helpers.prefix_suffix_prep(string_1, string_2)
    if len_1 == 0:
        if len_2 <= max_distance:
            return len_2
        return -1

    # Replace the cached int32 cost array when it is shorter than len_2.
    if len_2 > len(self._base_char_1_costs):
        self._base_char_1_costs = np.zeros(len_2, dtype=np.int32)

    costs = self._base_char_1_costs

    # Use the limit-aware routine only when the limit is below len_2.
    if max_distance < len_2:
        return self._distance_max(
            string_1, string_2, len_1, len_2, start, max_distance, costs
        )
    return self._distance(string_1, string_2, len_1, len_2, start, costs)
def distance(self, string_1, string_2, max_distance):
    """Return the weighted distance between two strings, bounded by a limit.

    Both strings are passed through ``unidecode`` before being compared
    with ``weighted_levenshtein_osa`` using the weights stored in
    ``self._weights``.

    Args:
        string_1: One of the strings to compare (may be ``None``).
        string_2: The other string to compare (may be ``None``).
        max_distance: The largest distance the caller cares about.

    Returns:
        ``0`` when the inputs compare equal (including both being
        ``None``), ``-1`` when they differ and ``max_distance`` is
        non-positive or the computed cost exceeds ``max_distance``,
        otherwise the computed cost. When exactly one input is ``None``
        the result is whatever ``helpers.null_distance_results`` returns.
    """
    # Equal inputs need no further work; this also covers None == None.
    if string_1 == string_2:
        return 0
    if string_1 is None or string_2 is None:
        return helpers.null_distance_results(string_1, string_2, max_distance)

    # The inputs are known to differ here, so a non-positive limit can
    # never be satisfied.
    if max_distance <= 0:
        return -1

    # Clamp to the largest signed 32-bit value and force an int.
    max_distance = int(min(2**31 - 1, max_distance))

    # unidecode gives good results but is a bit slower.
    # TODO: evaluate str.translate with a prebuilt translation table as a
    # faster replacement for unidecode.
    # The leading ``0.0 +`` coerces a numeric result to float, as the
    # original float accumulator did.
    cost = 0.0 + weighted_levenshtein_osa(
        unidecode(string_1), unidecode(string_2), *self._weights
    )
    return cost if cost <= max_distance else -1