def test_multiple_edits(self):
    """Applying edits of different types."""
    self.assertEqual(levenshtein("kitten", "sitting"), 5)
    self.assertEqual(levenshtein("car", "curse", sub_cost=1), 3)
    self.assertEqual(recursive_levenshtein("intention", "execution"), 8)
    self.assertEqual(recursive_levenshtein("ghost", "mostly"), 5)

def test_empty_target_word(self):
    """The distance from a string to the empty target."""
    # Only deletions apply here, so ins_cost and sub_cost have no effect.
    self.assertEqual(levenshtein("void", ""), 4)
    self.assertEqual(levenshtein("void", "", del_cost=2), 8)
    self.assertEqual(levenshtein("void", "", ins_cost=3), 4)
    self.assertEqual(levenshtein("void", "", sub_cost=4), 4)
    self.assertEqual(recursive_levenshtein("void", ""), 4)

def test_identical_strings(self):
    """The distance for identical strings should be 0."""
    strings = ("", "cat", "circle", "Levenshtein")
    for s in strings:
        self.assertEqual(levenshtein(s, s), 0)
        self.assertEqual(recursive_levenshtein(s, s), 0)

def test_time(self):
    """The recursive implementation is expected to be slower."""
    start = time()
    x = levenshtein("YPOEHOHRIWUBXMNHZF", "YCPOEHORIDUBXNHZF")
    elapsed = time() - start

    start_rec = time()
    y = recursive_levenshtein("YPOEHOHRIWUBXMNHZF", "YCPOEHORIDUBXNHZF")
    elapsed_rec = time() - start_rec

    self.assertEqual(x, y)
    self.assertLess(elapsed, elapsed_rec)

def test_levenshtein(self):
    """
    For certain types of errors, Levenshtein edit distance between
    words' spellings is greater than the distance between their
    Soundex indices.
    """
    sent1 = 'PLEZ CNOKE IF AN RNSR IS NOT REQID'
    toks1 = sent1.lower().split()
    sent2 = 'Please knock if an answer is not required'
    toks2 = sent2.lower().split()

    total = sum(levenshtein(*pair) for pair in zip(toks1, toks2))
    sndx_total = sum(
        levenshtein(soundex(t1), soundex(t2))
        for t1, t2 in zip(toks1, toks2))
    self.assertGreater(total, sndx_total)

    homophones = (('tail', 'tale'), ('right', 'write'),
                  ('flower', 'flour'), ('break', 'brake'),
                  ('accept', 'except'), ("you're", 'your'))
    total2 = sum(levenshtein(*pair, sub_cost=1) for pair in homophones)
    sndx_total2 = sum(
        levenshtein(soundex(w1), soundex(w2)) for w1, w2 in homophones)
    self.assertGreaterEqual(total2, sndx_total2)

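# Note on the test above: a standard American Soundex code keeps the first
# letter and encodes only the remaining consonants, so many misspellings
# collapse to the same index (for example, "plez" and "please" should both
# map to P420 under the standard algorithm, giving an index-level distance
# of 0 where the spelling-level distance is positive). The exact output
# format of this repository's soundex() is assumed here, not verified.
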
def test_substitution(self):
    """Substitution in various positions with varying cost."""
    self.assertEqual(levenshtein("late", "date"), 2)
    self.assertEqual(levenshtein("tell", "tall"), 2)
    self.assertEqual(levenshtein("pass", "past"), 2)
    self.assertEqual(recursive_levenshtein("glide", "slide"), 2)
    self.assertEqual(recursive_levenshtein("tic", "tac"), 2)
    self.assertEqual(recursive_levenshtein("quite", "quits"), 2)
    self.assertEqual(levenshtein("train", "chain"), 4)
    self.assertEqual(levenshtein("breath", "breeze"), 6)
    self.assertEqual(levenshtein("quit", "suit", sub_cost=1), 1)
    self.assertEqual(levenshtein("meet", "meat", sub_cost=2), 2)
    self.assertEqual(levenshtein("look", "loom", sub_cost=0.5), 0.5)
    self.assertEqual(levenshtein("tooth", "teeth", sub_cost=0.5), 1)

def test_insertion(self):
    """Insertion in various positions with varying cost."""
    self.assertEqual(levenshtein("cross", "across"), 1)
    self.assertEqual(levenshtein("tree", "three"), 1)
    self.assertEqual(levenshtein("cheer", "cheers"), 1)
    self.assertEqual(recursive_levenshtein("cross", "across"), 1)
    self.assertEqual(recursive_levenshtein("tree", "three"), 1)
    self.assertEqual(recursive_levenshtein("cheer", "cheers"), 1)
    self.assertEqual(levenshtein("ace", "place"), 2)
    self.assertEqual(levenshtein("L", "XXXL"), 3)
    self.assertEqual(recursive_levenshtein("ace", "place"), 2)
    self.assertEqual(recursive_levenshtein("L", "XXXL"), 3)
    self.assertEqual(levenshtein("ate", "mate", ins_cost=2), 2)
    self.assertEqual(levenshtein("met", "meet", ins_cost=3), 3)
    self.assertEqual(levenshtein("law", "lawn", ins_cost=4), 4)
    self.assertEqual(levenshtein("low", "below", ins_cost=0.5), 1)

def test_deletion(self):
    """Deletion in various positions with varying cost."""
    self.assertEqual(levenshtein("tale", "ale"), 1)
    self.assertEqual(levenshtein("plain", "plan"), 1)
    self.assertEqual(levenshtein("dreams", "dream"), 1)
    self.assertEqual(recursive_levenshtein("tale", "ale"), 1)
    self.assertEqual(recursive_levenshtein("plain", "plan"), 1)
    self.assertEqual(recursive_levenshtein("dreams", "dream"), 1)
    self.assertEqual(levenshtein("trace", "ace"), 2)
    self.assertEqual(levenshtein("oooops", "ops"), 3)
    self.assertEqual(recursive_levenshtein("trace", "ace"), 2)
    self.assertEqual(recursive_levenshtein("oooops", "ops"), 3)
    self.assertEqual(levenshtein("shot", "hot", del_cost=2), 2)
    self.assertEqual(levenshtein("heat", "hat", del_cost=3), 3)
    self.assertEqual(levenshtein("hits", "hit", del_cost=4), 4)
    self.assertEqual(levenshtein("shut", "hut", del_cost=0.5), 0.5)

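# For reference, a minimal sketch of the kind of dynamic-programming
# implementation the tests above assume: unit insertion/deletion costs and a
# default substitution cost of 2 (as implied by, e.g., levenshtein("late",
# "date") == 2). This is an illustration only, not the levenshtein /
# recursive_levenshtein actually under test, which may differ in details.
def _levenshtein_sketch(source, target, ins_cost=1, del_cost=1, sub_cost=2):
    """Edit distance between source and target with configurable costs."""
    n, m = len(source), len(target)
    # dist[i][j] = cost of turning source[:i] into target[:j]
    dist = [[0.0] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        dist[i][0] = i * del_cost
    for j in range(1, m + 1):
        dist[0][j] = j * ins_cost
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            sub = 0 if source[i - 1] == target[j - 1] else sub_cost
            dist[i][j] = min(dist[i - 1][j] + del_cost,    # deletion
                             dist[i][j - 1] + ins_cost,    # insertion
                             dist[i - 1][j - 1] + sub)     # substitution / match
    return dist[n][m]
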
def full_pair_to_json(pair,
                      # mutable default is shared across calls, so the same
                      # word always maps to the same character code
                      w2c=collections.defaultdict(itertools.count().__next__),
                      control_synonyms=True,
                      meanings_only=False,
                      include_jaccard=True,
                      stops=(spacy.lang.en.stop_words.STOP_WORDS
                             | {'d', 's', "'", 're', 've', 'll', 'm'}
                             | set(string.punctuation))):
    # unpack
    (i1, (s1, v1)), (i2, (s2, v2)) = pair

    # meaning scores
    meaning_scores = {"l2": l2(v1, v2), "cdist": cdist(v1, v2)}
    if meanings_only:
        jdict = {
            "idx": [i1, i2],
            # "sentences": [s1, s2],  # useful for debugging, but eats up a lot of space
            "meaning_scores": meaning_scores,
            "text_scores": {},
        }
        return json.dumps(jdict)

    # text scores: preprocessing
    # convert sentences to character-sequence representations
    c1, c2 = to_chr_seq(w2c, s1), to_chr_seq(w2c, s2)
    # filter stop words
    s1_f, s2_f = (w for w in s1 if w not in stops), (w for w in s2 if w not in stops)
    c1_f, c2_f = to_chr_seq(w2c, s1_f), to_chr_seq(w2c, s2_f)

    # text scores
    text_scores = {
        "lev": levenshtein(c1, c2),
        "lev_n": levenshtein_normalised(c1, c2),
        "lev_f": levenshtein(c1_f, c2_f),
        "lev_fn": levenshtein_normalised(c1_f, c2_f),
    }
    if include_jaccard:
        text_scores.update({
            "jac": jaccard(c1, c2),
            "jac_f": jaccard(c1_f, c2_f),
        })

    if control_synonyms:
        # repeat the text scores after mapping each word to a default synonym
        syn_s1 = list(map(default_synonym, s1))
        syn_s2 = list(map(default_synonym, s2))
        syn_c1, syn_c2 = to_chr_seq(w2c, syn_s1), to_chr_seq(w2c, syn_s2)
        syn_s1_f = (w for w in syn_s1 if w not in stops)
        syn_s2_f = (w for w in syn_s2 if w not in stops)
        syn_c1_f, syn_c2_f = to_chr_seq(w2c, syn_s1_f), to_chr_seq(w2c, syn_s2_f)
        text_scores.update({
            "lev_syn": levenshtein(syn_c1, syn_c2),
            "lev_syn_n": levenshtein_normalised(syn_c1, syn_c2),
            "lev_syn_f": levenshtein(syn_c1_f, syn_c2_f),
            "lev_syn_fn": levenshtein_normalised(syn_c1_f, syn_c2_f),
        })
        if include_jaccard:
            text_scores.update({
                "jac_syn": jaccard(syn_c1, syn_c2),
                "jac_syn_f": jaccard(syn_c1_f, syn_c2_f),
            })

    # return
    jdict = {
        "idx": [i1, i2],
        # "sentences": [s1, s2],  # useful for debugging, but eats up a lot of space
        "meaning_scores": meaning_scores,
        "text_scores": text_scores,
    }
    return json.dumps(jdict)

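# full_pair_to_json assumes each element of `pair` has the shape
# (index, (tokens, vector)), as implied by the unpacking on its first line,
# and that to_chr_seq encodes a token sequence as a string by mapping each
# distinct word to a unique character via the shared w2c counter, so that
# levenshtein and jaccard operate at the word level. A plausible sketch of
# such a helper (the real to_chr_seq in this codebase may differ):
def _to_chr_seq_sketch(w2c, tokens):
    """Encode an iterable of tokens as a string, one character per word."""
    return ''.join(chr(w2c[tok]) for tok in tokens)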