def correct_word(self, word):
    """Return the most likely spelling correction for *word*.

    Strategy: if the word is already in the vocabulary it is returned
    unchanged.  Otherwise near-duplicate vocabulary words are fetched
    from the simhash index, narrowed first by Levenshtein edit
    distance, then by corpus frequency, and any remaining tie is broken
    at random.
    """
    # Known words need no correction.
    if word in self.vocab_to_freq:
        return word
    sh = Simhash(word, f=self.f)
    candidates = self.simhash_index.get_near_dups(sh)
    if not candidates:
        # No near dups. Oh well. This word will go as it is.
        # Parenthesized single-argument form works on Python 2 and 3.
        print('no candidates')
        return word
    if len(candidates) == 1:
        # Only one candidate, so assume this is the correction.
        return candidates[0]
    # Keep every candidate tied for the smallest edit distance.
    lev_dist_gen = ((other_w, levenshtein(other_w, word))
                    for other_w in candidates)
    closest_words, _dists = zip(
        *all_min_or_max(lev_dist_gen, min, lambda item: item[1]))
    if len(closest_words) == 1:
        # One of the candidates had the best edit distance. Return that.
        return closest_words[0]
    # OK, there are multiple closest words. Rely on word frequency to
    # choose the right one.
    vocab_to_freq = self.vocab_to_freq
    word_freq_gen = ((other_w, vocab_to_freq[other_w])
                     for other_w in closest_words)
    most_freq_words, _freqs = zip(
        *all_min_or_max(word_freq_gen, max, lambda item: item[1]))
    # Using choice because at this point there's no other way to narrow
    # it down, unless we track higher order ngrams.
    return choice(most_freq_words)
def compute_levenshtein(args):
    """Map-friendly wrapper: unpack an (index, string, string) triple
    and return (index, Levenshtein distance between the two strings).
    """
    index, first, second = args
    distance = levenshtein(first, second)
    return index, distance
def dmetr(name1, name2):
    """Return a similarity score in [0, 1] between two names.

    The score is 1 - (Levenshtein distance / length of the longer
    name).  Returns 0.0 when both names are empty (max_len == 0) or
    when levenshtein signals failure with -1.
    """
    max_len = max(len(name1), len(name2))
    ldist = levenshtein(name1, name2)
    # Removed dead code: the original computed
    #   max_dist = int(ceil(max_len * (1.0 - thresh)))
    # but never used it; it only pulled in the free variable `thresh`.
    # Guard against division by zero and the -1 error sentinel.
    if ldist == -1 or max_len == 0:
        return 0.0
    return 1.0 - float(ldist) / max_len
def similarity_ratio(self, line: '_Line') -> float:
    """Return a similarity ratio in [0, 1] between this line's cleaned
    text and *line*'s cleaned text.

    Ratio is 1 - (Levenshtein distance / length of the longer text).
    Two empty texts are treated as identical (ratio 1.0) instead of
    raising ZeroDivisionError as the bare division would.
    """
    longest = max(len(self.cleaned_text), len(line.cleaned_text))
    if longest == 0:
        # Both texts empty: identical by definition; avoid dividing by 0.
        return 1.0
    distance = levenshtein(self.cleaned_text, line.cleaned_text)
    return 1.0 - float(distance) / longest