def test_similarity(self):
    """Check that metrics.similarity() agrees with the direct metric functions.

    For the word pair "night" / "nacht", calling metrics.similarity() with
    the LEVENSHTEIN or DICE method constant must give the same value as
    calling levenshtein_similarity() or dice_coefficient() directly.
    """
    first, second = "night", "nacht"

    # Levenshtein: direct call vs. dispatch through similarity().
    direct = metrics.levenshtein_similarity(first, second)
    dispatched = metrics.similarity(first, second, metrics.LEVENSHTEIN)
    self.assertEqual(direct, dispatched)

    # Dice coefficient: direct call vs. dispatch through similarity().
    direct = metrics.dice_coefficient(first, second)
    dispatched = metrics.similarity(first, second, metrics.DICE)
    self.assertEqual(direct, dispatched)
def test_similarity(self):
    """Check that metrics.similarity() agrees with the direct metric functions.

    For the word pair "night" / "nacht", calling metrics.similarity() with
    the LEVENSHTEIN or DICE method constant must give the same value as
    calling levenshtein_similarity() or dice_coefficient() directly.
    The name of the tested function is printed once both checks pass.
    """
    first, second = "night", "nacht"

    # Levenshtein: direct call vs. dispatch through similarity().
    direct = metrics.levenshtein_similarity(first, second)
    dispatched = metrics.similarity(first, second, metrics.LEVENSHTEIN)
    self.assertEqual(direct, dispatched)

    # Dice coefficient: direct call vs. dispatch through similarity().
    direct = metrics.dice_coefficient(first, second)
    dispatched = metrics.similarity(first, second, metrics.DICE)
    self.assertEqual(direct, dispatched)

    # Progress marker; only reached when neither assertion failed.
    print("pattern.metrics.similarity()")
def fuzzySearch(self, result, query):
    """Return the part of ``result`` that follows its best fuzzy match of ``query``.

    A window as long as ``query`` is slid over ``result`` and every window is
    scored against ``query`` with ``similarity``.  The earliest window with
    the highest score wins, and everything after that window is returned.

    Args:
        result: Sliceable text to search in (presumably a string -- confirm
            against callers).
        query: Text to look for; its length sets the window size.

    Returns:
        The tail of ``result`` after the best-matching window, or ``''`` when
        ``query`` is longer than ``result`` or no window scores above 0.
    """
    width = len(query)  # loop-invariant window size
    best, best_i = 0, None
    for i in range(len(result) - width + 1):
        score = similarity(result[i:i + width], query)
        # Strict comparison keeps the earliest window on ties and skips
        # windows that do not score above 0.
        if best < score:
            best, best_i = score, i
    # Identity check against None: index 0 is a legitimate match position.
    if best_i is None:
        return ''
    return result[best_i + width:]
def fuzzySearch(self, result, query):
    """Return the part of ``result`` that follows its best fuzzy match of ``query``.

    A window as long as ``query`` is slid over ``result`` and every window is
    scored against ``query`` with ``similarity``.  The earliest window with
    the highest score wins, and everything after that window is returned.

    Args:
        result: Sliceable text to search in (presumably a string -- confirm
            against callers).
        query: Text to look for; its length sets the window size.

    Returns:
        The tail of ``result`` after the best-matching window, or ``''`` when
        ``query`` is longer than ``result`` or no window scores above 0.
    """
    width = len(query)  # loop-invariant window size
    best, best_i = 0, None
    for i in range(len(result) - width + 1):
        score = similarity(result[i:i + width], query)
        # Strict comparison keeps the earliest window on ties and skips
        # windows that do not score above 0.
        if best < score:
            best, best_i = score, i
    # Identity check against None: index 0 is a legitimate match position.
    if best_i is None:
        return ''
    return result[best_i + width:]
def joinSimilar(self, t1, t2):
    """Merge two near-duplicate grams by moving weight onto the heavier one.

    Each argument is a ``(gram, weight)`` pair, where ``gram`` is a sequence
    of strings that is also used as a key into ``self._grams``.  The grams
    are joined with spaces and compared with ``similarity``; when the score
    is above 0.75 but not exactly 1, the lighter gram's entry in
    ``self._grams`` is set to 0 and its weight (as passed in) is added to
    the heavier gram's entry.  On equal weights the second gram absorbs
    the first.

    Args:
        t1: ``(gram, weight)`` pair for the first gram.
        t2: ``(gram, weight)`` pair for the second gram.

    Returns:
        None.  ``self._grams`` is updated in place when a merge happens.
    """
    gram_a, weight_a = t1
    gram_b, weight_b = t2

    # A zero weight marks a gram that was already merged away earlier.
    if weight_a == 0 or weight_b == 0:
        return

    score = similarity(' '.join(gram_a), ' '.join(gram_b))
    # Only merge close matches; a score of exactly 1 is left untouched.
    if not (score > .75 and score != 1):
        return

    if weight_a > weight_b:
        absorbed, keeper, moved = gram_b, gram_a, weight_b
    else:
        absorbed, keeper, moved = gram_a, gram_b, weight_a
    self._grams[absorbed] = 0
    self._grams[keeper] += moved
# 1) Suffix "-bie" and prefix "brie-" are almost identical and go together nicely. # 2) Semantically, "briefing" refers to a one-sided kind of meeting, # where a meeting is a form of communication that many people find mindless; # whereas "zombies" are mindless and can't communicate. # There is a vaguely humoristic connection between the two concepts. # How about: "the drill sergeant zombriefed the men" ? # To simulate (2) algorithmically, we'd need lots of learning material. # Let's see if we can simulate (1) with a few tricks. from pattern.metrics import similarity # The similarity() function computes (1 - the Levenshtein distance): # http://www.clips.ua.ac.be/pages/pattern-metrics#similarity # The higher the number (0.0-1.0), the more similar two strings are. print similarity("bie", "brie") # 0.75 # So, given a word (e.g., "briefing"), we could look for a second word # that we can glue to the left of it - if the prefix of the given word # and the suffix of the second word are sufficiently similar # (e.g., similarity >= 0.75) ... from pattern.en import lexicon # English {word: word type}-dictionary w1 = "briefing" for w2 in lexicon.keys(): if w2[0].isupper(): # Exclude proper names like "Herbie". continue if len(w2) <= 3: # Length of "zombie" > 3, OK. continue