Esempio n. 1
0
 def correct_word(self, word):
   """Return the most plausible vocabulary spelling for ``word``.

   Words already present in ``self.vocab_to_freq`` are returned unchanged.
   Otherwise candidates are fetched from ``self.simhash_index`` and narrowed
   down in stages: smallest Levenshtein distance to ``word`` first, then
   highest frequency in ``self.vocab_to_freq``, and finally a random pick
   among whatever is still tied.

   Args:
     word: The token to correct.

   Returns:
     ``word`` itself when it is in the vocabulary or has no near
     duplicates, otherwise the selected candidate.
   """
   vocab_to_freq = self.vocab_to_freq
   if word in vocab_to_freq:
     return word

   # Look up near-duplicate candidates via the word's simhash fingerprint.
   fingerprint = Simhash(word, f=self.f)
   candidates = self.simhash_index.get_near_dups(fingerprint)

   if not candidates:
     # No near dups. Oh well. This word will go as it is.
     # Parenthesised form prints the same text on Python 2 and Python 3.
     print('no candidates')
     return word

   if len(candidates) == 1:
     # Only one candidate, so assume this is the correction.
     return candidates[0]

   # Presumably all_min_or_max returns every item tied for the min/max of the
   # key function -- verify against its definition.
   lev_dist_gen = ((other_w, levenshtein(other_w, word)) for other_w in candidates)
   closest_words, _ = zip(*all_min_or_max(lev_dist_gen, min, lambda item: item[1]))

   if len(closest_words) == 1:
     # One of the candidates had the best edit distance. Return that.
     return closest_words[0]

   # Multiple closest words: rely on word frequency to choose the right one.
   word_freq_gen = ((other_w, vocab_to_freq[other_w]) for other_w in closest_words)
   most_freq_words, _ = zip(*all_min_or_max(word_freq_gen, max, lambda item: item[1]))

   # Using choice because at this point there's no other way to narrow it
   # down, unless we track higher order ngrams.
   return choice(most_freq_words)
def compute_levenshtein(args):
    """Compute the Levenshtein distance for one tagged pair of values.

    Args:
        args: A 3-item iterable ``(tag, first, second)``.

    Returns:
        A tuple ``(tag, levenshtein(first, second))`` so the caller can match
        the result back to its input.
    """
    tag, first, second = args
    distance = levenshtein(first, second)
    return tag, distance
Esempio n. 3
0
 def dmetr(name1, name2):
     """Return a similarity score for two names based on Levenshtein distance.

     The score is ``1.0 - distance / longest`` where ``longest`` is the
     length of the longer name. ``0.0`` is returned when ``levenshtein``
     yields ``-1`` or when both names are empty.
     """
     longest = max(len(name1), len(name2))
     # NOTE(review): max_dist is computed from the outer ``thresh`` but never
     # used below -- possibly meant to be passed to levenshtein; confirm.
     max_dist = int(ceil(longest * (1.0 - thresh)))
     distance = levenshtein(name1, name2)
     if distance == -1 or longest == 0:
         return 0.0
     return 1.0 - float(distance) / longest
Esempio n. 4
0
    def similarity_ratio(self, line: '_Line') -> float:
        """Return how similar this line's cleaned text is to ``line``'s.

        The score is ``1 - distance / longest``, where ``distance`` is
        ``levenshtein`` applied to the two ``cleaned_text`` values and
        ``longest`` is the greater of their lengths.

        Args:
            line: The other line to compare against; its ``cleaned_text``
                attribute is read.

        Returns:
            The similarity score; ``1.0`` when both cleaned texts are empty.
        """
        own_text = self.cleaned_text
        other_text = line.cleaned_text
        longest = max(len(own_text), len(other_text))
        if longest == 0:
            # Two empty texts are identical; this also avoids the
            # ZeroDivisionError the unguarded formula would raise.
            return 1.0
        return 1 - float(levenshtein(own_text, other_text)) / longest