def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string
    into the other. Transforming a string is carried out using a sequence of
    the following operators: delete a character, insert a character, and
    substitute one character for another. Each operator costs 1.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        # The distance is documented as an int, so do not return 0.0 here.
        return 0

    ins_cost, del_cost, sub_cost = (1, 1, 1)

    len_str1 = len(string1)
    len_str2 = len(string2)

    # Transforming to or from an empty string is pure insertion or deletion.
    if len_str1 == 0:
        return len_str2 * ins_cost
    if len_str2 == 0:
        return len_str1 * del_cost

    # The `np.int` alias was removed in NumPy 1.24; the builtin `int` selects
    # the same default integer dtype on every NumPy version.
    d_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=int)

    # First column / row: cost of deleting or inserting a whole prefix.
    for i in _range(len_str1 + 1):
        d_mat[i, 0] = i * del_cost
    for j in _range(len_str2 + 1):
        d_mat[0, j] = j * ins_cost

    for i in _range(len_str1):
        for j in _range(len_str2):
            substitution = sub_cost if string1[i] != string2[j] else 0
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,
                d_mat[i, j + 1] + del_cost,
                d_mat[i, j] + substitution,
            )

    # Convert the NumPy scalar so every return path yields a builtin int.
    return int(d_mat[len_str1, len_str2])
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    The Levenshtein distance is the minimum cost of turning one string into
    the other, where the allowed operations are deleting a character,
    inserting a character, and substituting one character for another.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6

    Note:
        The computation itself is delegated to the python-levenshtein
        package.
    """
    # Validate the arguments before handing them to the library.
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    edit_distance = Levenshtein.distance(string1, string2)
    return edit_distance
def needleman_wunsch(string1, string2, gap_cost=1.0, sim_score=sim_ident):
    """
    Computes the Needleman-Wunsch measure between two strings.

    The Needleman-Wunsch generalizes the Levenshtein distance and considers
    global alignment between two strings. Specifically, it is computed by
    assigning a score to each alignment between two input strings and
    choosing the score of the best alignment, that is, the maximal score.

    An alignment between two strings is a set of correspondences between
    their characters, allowing for gaps.

    Args:
        string1,string2 (str) : Input strings
        gap_cost (float) : Cost of gap (defaults to 1.0)
        sim_score (function) : Similarity function to give a score for the
            correspondence between characters. Defaults to an identity
            function, where if two characters are same it returns 1.0 else
            returns 0.

    Returns:
        Needleman-Wunsch measure (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs
            is None.

    Examples:
        >>> needleman_wunsch('dva', 'deeva')
        1.0
        >>> needleman_wunsch('dva', 'deeve', 0.0)
        2.0
        >>> needleman_wunsch('dva', 'deeve', 1.0, sim_score=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
        1.0
        >>> needleman_wunsch('GCATGCUA', 'GATTACA', gap_cost=0.5, sim_score=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
        2.5
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    len_str1 = len(string1)
    len_str2 = len(string2)

    # The `np.float` alias was removed in NumPy 1.24; the builtin `float`
    # selects the same float64 dtype on every NumPy version.
    dist_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=float)

    # DP initialization: aligning a prefix against nothing costs one gap
    # per character.
    for i in _range(len_str1 + 1):
        dist_mat[i, 0] = -(i * gap_cost)
    for j in _range(len_str2 + 1):
        dist_mat[0, j] = -(j * gap_cost)

    # Needleman-Wunsch DP calculation
    for i in _range(1, len_str1 + 1):
        for j in _range(1, len_str2 + 1):
            match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1],
                                                       string2[j - 1])
            delete = dist_mat[i - 1, j] - gap_cost
            insert = dist_mat[i, j - 1] - gap_cost
            dist_mat[i, j] = max(match, delete, insert)

    # The bottom-right cell holds the score of the best global alignment.
    return dist_mat[len_str1, len_str2]
def get_raw_score(self, string1, string2):
    """
    Computes the bag distance between two strings.

    For two strings X and Y, the Bag distance is:
    :math:`max( |bag(string1)-bag(string2)|, |bag(string2)-bag(string1)| )`

    Args:
        string1,string2 (str): Input strings

    Returns:
        Bag distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> bd = BagDistance()
        >>> bd.get_raw_score('cat', 'hat')
        1
        >>> bd.get_raw_score('Niall', 'Neil')
        2
        >>> bd.get_raw_score('aluminum', 'Catalan')
        5
        >>> bd.get_raw_score('ATCG', 'TAGC')
        0
        >>> bd.get_raw_score('abcde', 'xyz')
        5

    References:
        * http://www.icmlc.org/icmlc2011/018_icmlc2011.pdf
    """
    # Validate arguments, then short-circuit on identical inputs.
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 0

    # When one string is empty the distance is the size of the other one.
    if not string1:
        return len(string2)
    if not string2:
        return len(string1)

    # Character multisets; Counter subtraction keeps only positive counts.
    first_bag = collections.Counter(string1)
    second_bag = collections.Counter(string2)

    only_in_first = sum((first_bag - second_bag).values())
    only_in_second = sum((second_bag - first_bag).values())

    # The larger of the two one-sided differences is the bag distance.
    return max(only_in_first, only_in_second)
def get_raw_score(self, string1, string2):
    """
    Computes the bag distance between two strings.

    For two strings X and Y, the Bag distance is:
    :math:`max( |bag(string1)-bag(string2)|, |bag(string2)-bag(string1)| )`

    Args:
        string1,string2 (str): Input strings

    Returns:
        Bag distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> bd = BagDistance()
        >>> bd.get_raw_score('cat', 'hat')
        1
        >>> bd.get_raw_score('Niall', 'Neil')
        2
        >>> bd.get_raw_score('aluminum', 'Catalan')
        5
        >>> bd.get_raw_score('ATCG', 'TAGC')
        0
        >>> bd.get_raw_score('abcde', 'xyz')
        5

    References:
        * String Matching with Metric Trees Using an Approximate Distance:
          http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf
    """
    # Validate arguments, then short-circuit on identical inputs.
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 0

    # When one string is empty the distance is the size of the other one.
    if not string1:
        return len(string2)
    if not string2:
        return len(string1)

    # Character multisets; Counter subtraction keeps only positive counts.
    first_bag = collections.Counter(string1)
    second_bag = collections.Counter(string2)

    only_in_first = sum((first_bag - second_bag).values())
    only_in_second = sum((second_bag - first_bag).values())

    # The larger of the two one-sided differences is the bag distance.
    return max(only_in_first, only_in_second)
def get_raw_score(self, string1, string2, force_ascii=True,
                  full_process=True):
    """
    Computes the Fuzzy Wuzzy token sort measure raw score between two
    strings. This score is in the range [0,100].

    Args:
        string1,string2 (str), : Input strings
        force_ascii (boolean) : Flag to remove non-ascii characters or not
        full_process (boolean) : Flag to process the string or not.
            Processing includes removing non alphanumeric characters,
            converting string to lower case and removing leading and
            trailing whitespaces.

    Returns:
        Token Sort measure raw score (int) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = TokenSort()
        >>> s.get_raw_score('great is scala', 'java is great')
        81
        >>> s.get_raw_score('Sue', 'sue')
        100
        >>> s.get_raw_score('C++ and Java', 'Java and Python')
        64

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # An empty input cannot match anything.
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # Normalise and token-sort both inputs with the same options.
    prepared = [
        self._process_string_and_sort(text, force_ascii,
                                      full_process=full_process)
        for text in (string1, string2)
    ]

    # Score the two sorted strings with the plain ratio measure.
    return Ratio().get_raw_score(prepared[0], prepared[1])
def smith_waterman(string1, string2, gap_cost=1.0, sim_score=sim_ident):
    """
    Computes the Smith-Waterman measure between two strings.

    The Smith-Waterman algorithm performs local sequence alignment; that is,
    for determining similar regions between two strings. Instead of looking
    at the total sequence, the Smith-Waterman algorithm compares segments of
    all possible lengths and optimizes the similarity measure.

    Args:
        string1,string2 (str) : Input strings
        gap_cost (float) : Cost of gap (defaults to 1.0)
        sim_score (function) : Similarity function to give a score for the
            correspondence between characters. Defaults to an identity
            function, where if two characters are same it returns 1 else
            returns 0.

    Returns:
        Smith-Waterman measure (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs
            is None.

    Examples:
        >>> smith_waterman('cat', 'hat')
        2.0
        >>> smith_waterman('dva', 'deeve', 2.2)
        1.0
        >>> smith_waterman('dva', 'deeve', 1, sim_score=lambda s1, s2 : (2 if s1 == s2 else -1))
        2.0
        >>> smith_waterman('GCATAGCU', 'GATTACA', gap_cost=1.4, sim_score=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
        6.5
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    len_str1 = len(string1)
    len_str2 = len(string2)

    # The `np.float` alias was removed in NumPy 1.24; the builtin `float`
    # selects the same float64 dtype on every NumPy version.
    dist_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=float)

    # Start from a float so the documented float return type also holds when
    # no cell ever scores above zero (e.g. empty input).
    max_value = 0.0

    # Smith Waterman DP calculations
    for i in _range(1, len_str1 + 1):
        for j in _range(1, len_str2 + 1):
            match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1],
                                                       string2[j - 1])
            delete = dist_mat[i - 1, j] - gap_cost
            insert = dist_mat[i, j - 1] - gap_cost
            # Clamping at 0 lets a local alignment restart at any cell.
            dist_mat[i, j] = max(0, match, delete, insert)
            max_value = max(max_value, dist_mat[i, j])

    return max_value
def get_raw_score(self, string1, string2, force_ascii=True,
                  full_process=True):
    """
    Computes the Fuzzy Wuzzy partial token sort measure raw score between
    two strings. This score is in the range [0,100].

    Args:
        string1,string2 (str), : Input strings
        force_ascii (boolean) : Flag to remove non-ascii characters or not
        full_process (boolean) : Flag to process the string or not.
            Processing includes removing non alphanumeric characters,
            converting string to lower case and removing leading and
            trailing whitespaces.

    Returns:
        Partial Token Sort measure raw score (int) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = PartialTokenSort()
        >>> s.get_raw_score('great is scala', 'java is great')
        81
        >>> s.get_raw_score('Sue', 'sue')
        100
        >>> s.get_raw_score('C++ and Java', 'Java and Python')
        64

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # NOTE(review): the example scores above are the same as the ones listed
    # for TokenSort -- confirm they hold for the partial variant.

    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # An empty input cannot match anything.
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # Normalise and token-sort both inputs with the same options.
    prepared = [
        self._process_string_and_sort(text, force_ascii,
                                      full_process=full_process)
        for text in (string1, string2)
    ]

    # Score the two sorted strings with the partial ratio measure.
    return PartialRatio().get_raw_score(prepared[0], prepared[1])
def get_raw_score(self, string1, string2):
    """
    Computes the Fuzzy Wuzzy ratio measure raw score between two strings.
    This score is in the range [0,100].

    Args:
        string1,string2 (str): Input strings

    Returns:
        Ratio measure raw score (int) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = Ratio()
        >>> s.get_raw_score('Robert', 'Rupert')
        67
        >>> s.get_raw_score('Sue', 'sue')
        67
        >>> s.get_raw_score('example', 'samples')
        71

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # An empty input cannot match anything.
    if utils.sim_check_for_empty(string1, string2):
        return 0

    left = utils.convert_to_unicode(string1)
    right = utils.convert_to_unicode(string2)

    # SequenceMatcher.ratio() lies in [0, 1]; scale it to a whole percentage.
    matcher = SequenceMatcher(None, left, right)
    percentage = 100 * matcher.ratio()
    return int(round(percentage))
def get_sim_score(self, string1, string2):
    """
    Computes the Fuzzy Wuzzy ratio similarity score between two strings.
    This score is in the range [0,1].

    Args:
        string1,string2 (str): Input strings

    Returns:
        Ratio measure similarity score (float) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = Ratio()
        >>> s.get_sim_score('Robert', 'Rupert')
        0.67
        >>> s.get_sim_score('Sue', 'sue')
        0.67
        >>> s.get_sim_score('example', 'samples')
        0.71

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # An empty input cannot match anything.
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # Rescale the [0, 100] raw score to [0, 1]; the 1.0 factor forces float
    # arithmetic before the division.
    return (1.0 * self.get_raw_score(string1, string2)) / 100
def get_sim_score(self, string1, string2):
    """
    Computes the Fuzzy Wuzzy partial ratio similarity score between two
    strings. This score is in the range [0,1].

    Args:
        string1,string2 (str): Input strings

    Returns:
        Partial Ratio measure similarity score (float) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = PartialRatio()
        >>> s.get_sim_score('Robert Rupert', 'Rupert')
        1.0
        >>> s.get_sim_score('Sue', 'sue')
        0.67
        >>> s.get_sim_score('example', 'samples')
        0.86

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # An empty input cannot match anything.
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # Rescale the [0, 100] raw score to [0, 1]; the 1.0 factor forces float
    # arithmetic before the division.
    return (1.0 * self.get_raw_score(string1, string2)) / 100
def get_raw_score(self, string1, string2):
    """
    Computes the editex distance between two strings.

    As described on pages 3 & 4 of
    Zobel, Justin and Philip Dart. 1996. Phonetic string matching: Lessons
    from information retrieval. In: Proceedings of the ACM-SIGIR Conference
    on Research and Development in Information Retrieval, Zurich,
    Switzerland. 166-173.
    http://goanna.cs.rmit.edu.au/~jz/fulltext/sigir96.pdf

    The local variant is based on
    Ring, Nicholas and Alexandra L. Uitdenbogerd. 2009. Finding 'Lucy in
    Disguise': The Misheard Lyric Matching Problem. In: Proceedings of the
    5th Asia Information Retrieval Symposium, Sapporo, Japan. 157-167.
    http://www.seg.rmit.edu.au/research/download.php?manuscript=404

    The match, group and mismatch costs and the local-variant switch are
    read from the instance (``self.match_cost``, ``self.group_cost``,
    ``self.mismatch_cost``, ``self.local``).

    Args:
        string1,string2 (str): Input strings

    Returns:
        Editex distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> ed = Editex()
        >>> ed.get_raw_score('cat', 'hat')
        2
        >>> ed.get_raw_score('Niall', 'Neil')
        2
        >>> ed.get_raw_score('aluminum', 'Catalan')
        12
        >>> ed.get_raw_score('ATCG', 'TAGC')
        6

    References:
        * Abydos Library - https://github.com/chrislit/abydos/blob/master/abydos/distance.py
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 0

    # convert both the strings to NFKD normalized unicode
    string1 = unicodedata.normalize('NFKD', text_type(string1.upper()))
    string2 = unicodedata.normalize('NFKD', text_type(string2.upper()))

    # convert ß to SS (for Python2)
    string1 = string1.replace('ß', 'SS')
    string2 = string2.replace('ß', 'SS')

    if len(string1) == 0:
        return len(string2) * self.mismatch_cost
    if len(string2) == 0:
        return len(string1) * self.mismatch_cost

    len1 = len(string1)
    len2 = len(string2)

    # The `np.int` alias was removed in NumPy 1.24; the builtin `int`
    # selects the same default integer dtype on every NumPy version.
    d_mat = np.zeros((len1 + 1, len2 + 1), dtype=int)

    # Pad with a leading blank so index i refers to the i-th character and
    # index i - 1 is always a valid "previous character".
    string1 = ' ' + string1
    string2 = ' ' + string2

    editex_helper = EditexHelper(self.match_cost, self.mismatch_cost,
                                 self.group_cost)

    # The global variant accumulates costs along the first column and row;
    # the local variant leaves those borders at zero.
    if not self.local:
        for i in xrange(1, len1 + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + editex_helper.d_cost(
                string1[i - 1], string1[i])
    for j in xrange(1, len2 + 1):
        d_mat[0, j] = d_mat[0, j - 1] + editex_helper.d_cost(
            string2[j - 1], string2[j])

    # NOTE(review): d_cost / r_cost come from EditexHelper; presumably the
    # deletion and replacement costs of Zobel & Dart -- confirm there.
    for i in xrange(1, len1 + 1):
        for j in xrange(1, len2 + 1):
            d_mat[i, j] = min(
                d_mat[i - 1, j] + editex_helper.d_cost(string1[i - 1],
                                                       string1[i]),
                d_mat[i, j - 1] + editex_helper.d_cost(string2[j - 1],
                                                       string2[j]),
                d_mat[i - 1, j - 1] + editex_helper.r_cost(string1[i],
                                                           string2[j]))

    # Convert the NumPy scalar to the documented builtin int.
    return int(d_mat[len1, len2])
def get_raw_score(self, string1, string2):
    """
    Computes the Soundex phonetic similarity between two strings.

    Phonetic measures such as soundex match strings based on their sound.
    These measures have been especially effective in matching names, since
    names are often spelled in different ways that sound the same. For
    example, Meyer, Meier, and Mire sound the same, as do Smith, Smithe,
    and Smythe.

    Soundex is used primarily to match surnames. It does not work as well
    for names of East Asian origins, because much of the discriminating
    power of these names resides in the vowel sounds, which the code
    ignores.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Soundex similarity score (int) is returned

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> s = Soundex()
        >>> s.get_raw_score('Robert', 'Rupert')
        1
        >>> s.get_raw_score('Sue', 's')
        1
        >>> s.get_raw_score('Gough', 'Goff')
        0
        >>> s.get_raw_score('a,,li', 'ali')
        1
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # Keep only ASCII letters and digits.
    string1 = re.sub("[^a-zA-Z0-9]", "", string1)
    string2 = re.sub("[^a-zA-Z0-9]", "", string2)

    utils.sim_check_for_zero_len(string1, string2)

    if utils.sim_check_for_exact_match(string1, string2):
        return 1

    # Consonant groups and the digit each group is replaced with:
    # (B,F,P,V)->1 (C,G,J,K,Q,S,X,Z)->2 (D,T)->3 (L)->4 (M,N)->5 (R)->6
    group_digits = (
        ('[BFPV]', '1'),
        ('[CGJKQSXZ]', '2'),
        ('[DT]', '3'),
        ('[L]', '4'),
        ('[MN]', '5'),
        ('[R]', '6'),
    )

    codes = []
    for text in (string1, string2):
        text = text.upper()
        # The first character is kept verbatim; only the tail is encoded.
        initial, tail = text[0], text[1:]
        # Drop vowels, 'y', 'w' and 'h' from the tail.
        tail = re.sub('[AEIOUYWH]', '', tail)
        for pattern, digit in group_digits:
            tail = re.sub(pattern, digit, tail)
        # A code is the first character plus at most three encoded ones.
        codes.append(initial + tail[:3])

    if codes[0] == codes[1]:
        return 1
    return 0
def get_raw_score(self, string1, string2):
    """
    Computes the Fuzzy Wuzzy partial ratio measure raw score between two
    strings. This score is in the range [0,100].

    Args:
        string1,string2 (str): Input strings

    Returns:
        Partial Ratio measure raw score (int) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = PartialRatio()
        >>> s.get_raw_score('Robert Rupert', 'Rupert')
        100
        >>> s.get_raw_score('Sue', 'sue')
        67
        >>> s.get_raw_score('example', 'samples')
        86

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # An empty input cannot match anything.
    if utils.sim_check_for_empty(string1, string2):
        return 0

    shorter = utils.convert_to_unicode(string1)
    longer = utils.convert_to_unicode(string2)

    # The windowing below assumes the first string is not the longer one.
    if len(shorter) > len(longer):
        shorter, longer = longer, shorter

    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    window_ratios = []
    for block in blocks:
        # Line the shorter string up with this block inside the longer one,
        # clamping the window start at the beginning of the longer string.
        offset = block[1] - block[0]
        start = offset if offset > 0 else 0
        window = longer[start:start + len(shorter)]

        window_ratio = SequenceMatcher(None, shorter, window).ratio()
        if window_ratio > .995:
            # Close enough to perfect: stop early.
            return 100
        window_ratios.append(window_ratio)

    return int(round(100 * max(window_ratios)))
def soundex(string1, string2):
    """
    Computes the Soundex phonetic similarity between two strings.

    Phonetic measures such as soundex match strings based on their sound.
    These measures have been especially effective in matching names, since
    names are often spelled in different ways that sound the same. For
    example, Meyer, Meier, and Mire sound the same, as do Smith, Smithe,
    and Smythe.

    Soundex is used primarily to match surnames. It does not work as well
    for names of East Asian origins, because much of the discriminating
    power of these names resides in the vowel sounds, which the code
    ignores.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Soundex similarity score (int) is returned

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> soundex('Robert', 'Rupert')
        1
        >>> soundex('Sue', 's')
        1
        >>> soundex('Gough', 'Goff')
        0
        >>> soundex('a,,li', 'ali')
        1
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 1
    utils.sim_check_for_zero_len(string1, string2)

    # Consonant groups and the digit each group is replaced with:
    # (B,F,P,V)->1 (C,G,J,K,Q,S,X,Z)->2 (D,T)->3 (L)->4 (M,N)->5 (R)->6
    group_digits = (
        ('[BFPV]', '1'),
        ('[CGJKQSXZ]', '2'),
        ('[DT]', '3'),
        ('[L]', '4'),
        ('[MN]', '5'),
        ('[R]', '6'),
    )

    codes = []
    for text in (string1, string2):
        text = text.upper()
        # The first character is kept verbatim; only the tail is encoded.
        first_letter, tail = text[0], text[1:]

        # remove occurrences of vowels, 'y', 'w' and 'h'
        tail = re.sub('[AEIOUYWH]', '', tail)
        for pattern, digit in group_digits:
            tail = re.sub(pattern, digit, tail)

        # remove all chars but digits; the pattern must be a raw string
        # because "\D" in a normal literal is an invalid escape sequence
        tail = re.sub(r"\D", "", tail)

        # A code is the first character plus at most three digits.
        codes.append(first_letter + tail[:3])

    return 1 if codes[0] == codes[1] else 0
def editex(string1, string2, match_cost=0, group_cost=1, mismatch_cost=2,
           local=False):
    """
    Computes the editex distance between two strings.

    As described on pages 3 & 4 of
    Zobel, Justin and Philip Dart. 1996. Phonetic string matching: Lessons
    from information retrieval. In: Proceedings of the ACM-SIGIR Conference
    on Research and Development in Information Retrieval, Zurich,
    Switzerland. 166-173.
    http://goanna.cs.rmit.edu.au/~jz/fulltext/sigir96.pdf

    The local variant is based on
    Ring, Nicholas and Alexandra L. Uitdenbogerd. 2009. Finding 'Lucy in
    Disguise': The Misheard Lyric Matching Problem. In: Proceedings of the
    5th Asia Information Retrieval Symposium, Sapporo, Japan. 157-167.
    http://www.seg.rmit.edu.au/research/download.php?manuscript=404

    Args:
        string1,string2 (str): Input strings
        match_cost (int): Weight to give the correct char match, default=0
        group_cost (int): Weight to give if the chars are in the same editex
            group, default=1
        mismatch_cost (int): Weight to give the incorrect char match,
            default=2
        local (boolean): Local variant on/off, default=False

    Returns:
        Editex distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> editex('cat', 'hat')
        2
        >>> editex('Niall', 'Neil')
        2
        >>> editex('aluminum', 'Catalan')
        12
        >>> editex('ATCG', 'TAGC')
        6

    References:
        * Abydos Library - https://github.com/chrislit/abydos/blob/master/abydos/distance.py
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 0

    # convert both the strings to NFKD normalized unicode
    string1 = unicodedata.normalize('NFKD', _unicode(string1.upper()))
    string2 = unicodedata.normalize('NFKD', _unicode(string2.upper()))

    # convert ß to SS (for Python2)
    string1 = string1.replace('ß', 'SS')
    string2 = string2.replace('ß', 'SS')

    # Inputs that differed only by case or normalization form are equal now.
    if string1 == string2:
        return 0
    if len(string1) == 0:
        return len(string2) * mismatch_cost
    if len(string2) == 0:
        return len(string1) * mismatch_cost

    len1 = len(string1)
    len2 = len(string2)

    # The `np.int` alias was removed in NumPy 1.24; the builtin `int`
    # selects the same default integer dtype on every NumPy version.
    d_mat = np.zeros((len1 + 1, len2 + 1), dtype=int)

    # Pad with a leading blank so index i refers to the i-th character and
    # index i - 1 is always a valid "previous character".
    string1 = ' ' + string1
    string2 = ' ' + string2

    editex_helper = utils.Editex(match_cost, mismatch_cost, group_cost)

    # The global variant accumulates costs along the first column and row;
    # the local variant leaves those borders at zero.
    if not local:
        for i in _range(1, len1 + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + editex_helper.d_cost(
                string1[i - 1], string1[i])
    for j in _range(1, len2 + 1):
        d_mat[0, j] = d_mat[0, j - 1] + editex_helper.d_cost(
            string2[j - 1], string2[j])

    # NOTE(review): d_cost / r_cost come from utils.Editex; presumably the
    # deletion and replacement costs of Zobel & Dart -- confirm there.
    for i in _range(1, len1 + 1):
        for j in _range(1, len2 + 1):
            d_mat[i, j] = min(
                d_mat[i - 1, j] + editex_helper.d_cost(string1[i - 1],
                                                       string1[i]),
                d_mat[i, j - 1] + editex_helper.d_cost(string2[j - 1],
                                                       string2[j]),
                d_mat[i - 1, j - 1] + editex_helper.r_cost(string1[i],
                                                           string2[j]))

    # Convert the NumPy scalar to the documented builtin int.
    return int(d_mat[len1, len2])
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string
    into the other. Transforming a string is carried out using a sequence of
    the following operators: delete a character, insert a character, and
    substitute one character for another. Each operator costs 1.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        # The distance is documented as an int, so do not return 0.0 here.
        return 0

    ins_cost, del_cost, sub_cost = (1, 1, 1)

    len_str1 = len(string1)
    len_str2 = len(string2)

    # Transforming to or from an empty string is pure insertion or deletion.
    if len_str1 == 0:
        return len_str2 * ins_cost
    if len_str2 == 0:
        return len_str1 * del_cost

    # The `np.int` alias was removed in NumPy 1.24; the builtin `int` selects
    # the same default integer dtype on every NumPy version.
    d_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=int)

    # First column / row: cost of deleting or inserting a whole prefix.
    for i in _range(len_str1 + 1):
        d_mat[i, 0] = i * del_cost
    for j in _range(len_str2 + 1):
        d_mat[0, j] = j * ins_cost

    for i in _range(len_str1):
        for j in _range(len_str2):
            substitution = sub_cost if string1[i] != string2[j] else 0
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,
                d_mat[i, j + 1] + del_cost,
                d_mat[i, j] + substitution,
            )

    # Convert the NumPy scalar so every return path yields a builtin int.
    return int(d_mat[len_str1, len_str2])