Esempio n. 1
0
    def test_multiple_edits(self):
        """Pairs that need a mix of edit types, for both implementations."""

        # Each row: (implementation, (source, target), cost overrides, expected).
        cases = (
            (levenshtein, ("kitten", "sitting"), {}, 5),
            (levenshtein, ("car", "curse"), {"sub_cost": 1}, 3),
            (recursive_levenshtein, ("intention", "execution"), {}, 8),
            (recursive_levenshtein, ("ghost", "mostly"), {}, 5),
        )
        for distance, words, costs, expected in cases:
            self.assertEqual(distance(*words, **costs), expected)
Esempio n. 2
0
    def test_empty_target_word(self):
        """Distance from a non-empty word to the empty string."""

        # Per the expected values below, only the deletion cost changes the
        # result; insertion and substitution costs leave it at 4.
        cost_cases = (
            ({}, 4),
            ({"del_cost": 2}, 8),
            ({"ins_cost": 3}, 4),
            ({"sub_cost": 4}, 4),
        )
        for costs, expected in cost_cases:
            self.assertEqual(levenshtein("void", "", **costs), expected)

        self.assertEqual(recursive_levenshtein("void", ""), 4)
Esempio n. 3
0
    def test_identical_strings(self):
        """Comparing a string with itself must yield a distance of 0."""

        for word in ("", "cat", "circle", "Levenshtein"):
            # Check the iterative implementation first, then the recursive one.
            for distance in (levenshtein, recursive_levenshtein):
                self.assertEqual(distance(word, word), 0)
Esempio n. 4
0
    def test_time(self):
        """The recursive implementation is expected to perform slower.

        Both implementations are run once on the same pair of strings; the
        results must agree and the iterative call must take less time than
        the recursive one.
        """

        # perf_counter is monotonic and uses the highest-resolution clock
        # available, so the measured intervals cannot be skewed by system
        # clock adjustments or collapse to zero on coarse wall clocks.
        from time import perf_counter

        source, target = "YPOEHOHRIWUBXMNHZF", "YCPOEHORIDUBXNHZF"

        start = perf_counter()
        x = levenshtein(source, target)
        elapsed = perf_counter() - start

        start_rec = perf_counter()
        y = recursive_levenshtein(source, target)
        elapsed_rec = perf_counter() - start_rec

        self.assertEqual(x, y)
        self.assertLess(elapsed, elapsed_rec)
Esempio n. 5
0
    def test_levenshtein(self):
        """
        Compare edit distance on spellings with edit distance on Soundex codes.

        For the misspelled sentence the summed spelling distance must be
        strictly greater than the summed distance between Soundex indices;
        for the homophone pairs it must be greater than or equal.
        """

        def spelling_total(pairs, **costs):
            # Sum of edit distances between the raw spellings of each pair.
            return sum(levenshtein(a, b, **costs) for a, b in pairs)

        def soundex_total(pairs):
            # Sum of edit distances between the Soundex codes of each pair.
            return sum(levenshtein(soundex(a), soundex(b)) for a, b in pairs)

        misspelled = 'PLEZ CNOKE IF AN RNSR IS NOT REQID'.lower().split()
        intended = 'Please knock if an answer is not required'.lower().split()
        aligned = list(zip(misspelled, intended))

        spelling_cost = spelling_total(aligned)
        soundex_cost = soundex_total(aligned)
        self.assertGreater(spelling_cost, soundex_cost)

        homophones = (
            ('tail', 'tale'),
            ('right', 'write'),
            ('flower', 'flour'),
            ('break', 'brake'),
            ('accept', 'except'),
            ("you're", 'your'),
        )
        homophone_spelling_cost = spelling_total(homophones, sub_cost=1)
        homophone_soundex_cost = soundex_total(homophones)
        self.assertGreaterEqual(homophone_spelling_cost,
                                homophone_soundex_cost)
Esempio n. 6
0
    def test_substitution(self):
        """Substitution in various positions with varying cost.

        Covers a single substitution at the start, middle and end of a word,
        several substitutions in one word, and explicit ``sub_cost`` values.
        The expected values for the default-cost cases correspond to a cost
        of 2 per substituted character.
        """

        # One substitution: first, middle and last character.
        self.assertEqual(levenshtein("late", "date"), 2)
        self.assertEqual(levenshtein("tell", "tall"), 2)
        self.assertEqual(levenshtein("pass", "past"), 2)

        self.assertEqual(recursive_levenshtein("glide", "slide"), 2)
        self.assertEqual(recursive_levenshtein("tic", "tac"), 2)
        self.assertEqual(recursive_levenshtein("quite", "quits"), 2)

        # Several substitutions within the same word.
        self.assertEqual(levenshtein("train", "chain"), 4)
        self.assertEqual(levenshtein("breath", "breeze"), 6)

        # The recursive implementation is checked on multi-substitution
        # pairs as well; the pairs previously used here ("ace"/"place",
        # "L"/"XXXL") only exercised insertion.
        self.assertEqual(recursive_levenshtein("train", "chain"), 4)
        self.assertEqual(recursive_levenshtein("breath", "breeze"), 6)

        # Explicit substitution costs, including a fractional one.
        self.assertEqual(levenshtein("quit", "suit", sub_cost=1), 1)
        self.assertEqual(levenshtein("meet", "meat", sub_cost=2), 2)
        self.assertEqual(levenshtein("look", "loom", sub_cost=0.5), 0.5)
        self.assertEqual(levenshtein("tooth", "teeth", sub_cost=0.5), 1)
Esempio n. 7
0
    def test_insertion(self):
        """Insertions at the start, middle and end, with default and custom cost."""

        both_implementations = (levenshtein, recursive_levenshtein)

        # A single inserted character costs 1 by default.
        one_insertion = (
            ("cross", "across"),
            ("tree", "three"),
            ("cheer", "cheers"),
        )
        for distance in both_implementations:
            for shorter, longer in one_insertion:
                self.assertEqual(distance(shorter, longer), 1)

        # Several inserted characters.
        many_insertions = (
            ("ace", "place", 2),
            ("L", "XXXL", 3),
        )
        for distance in both_implementations:
            for shorter, longer, expected in many_insertions:
                self.assertEqual(distance(shorter, longer), expected)

        # Custom insertion cost: (source, target, ins_cost, expected).
        weighted = (
            ("ate", "mate", 2, 2),
            ("met", "meet", 3, 3),
            ("law", "lawn", 4, 4),
            ("low", "below", 0.5, 1),
        )
        for shorter, longer, cost, expected in weighted:
            self.assertEqual(
                levenshtein(shorter, longer, ins_cost=cost), expected)
Esempio n. 8
0
    def test_deletion(self):
        """Deletions at the start, middle and end, with default and custom cost."""

        both_implementations = (levenshtein, recursive_levenshtein)

        # A single deleted character costs 1 by default.
        one_deletion = (
            ("tale", "ale"),
            ("plain", "plan"),
            ("dreams", "dream"),
        )
        for distance in both_implementations:
            for longer, shorter in one_deletion:
                self.assertEqual(distance(longer, shorter), 1)

        # Several deleted characters.
        many_deletions = (
            ("trace", "ace", 2),
            ("oooops", "ops", 3),
        )
        for distance in both_implementations:
            for longer, shorter, expected in many_deletions:
                self.assertEqual(distance(longer, shorter), expected)

        # Custom deletion cost: (source, target, del_cost, expected).
        weighted = (
            ("shot", "hot", 2, 2),
            ("heat", "hat", 3, 3),
            ("hits", "hit", 4, 4),
            ("shut", "hut", 0.5, 0.5),
        )
        for longer, shorter, cost, expected in weighted:
            self.assertEqual(
                levenshtein(longer, shorter, del_cost=cost), expected)
def full_pair_to_json(pair,
                      w2c=collections.defaultdict(itertools.count().__next__),
                      control_synonyms=True,
                      meanings_only=False,
                      include_jaccard=True,
                      stops=spacy.lang.en.stop_words.STOP_WORDS
                      | {'d', 's', "'", 're', 've', 'll', 'm'}
                      | set(string.punctuation)):
    """Score one pair of indexed sentences and serialise the scores as JSON.

    Args:
        pair: Two items shaped ``(index, (sentence, vector))``. The sentence
            is iterated item by item (presumably a sequence of word tokens;
            verify against the caller) and the vector is handed to ``l2``
            and ``cdist``.
        w2c: Mapping passed to ``to_chr_seq`` for every conversion. The
            default is built once at definition time, so all calls that omit
            it share the same mapping.
        control_synonyms: When true, also score the sentences after mapping
            each item through ``default_synonym``.
        meanings_only: When true, skip all text scores and return early with
            an empty ``"text_scores"`` object.
        include_jaccard: When true, add the ``jaccard`` scores next to the
            Levenshtein ones.
        stops: Items removed from the sentences for the ``*_f`` scores.

    Returns:
        A JSON string with the keys ``"idx"``, ``"meaning_scores"`` and
        ``"text_scores"``.
    """
    (idx_a, (sent_a, vec_a)), (idx_b, (sent_b, vec_b)) = pair

    # The result skeleton is shared by the early-return and the full path.
    # "sentences" is deliberately not included: handy when debugging, but it
    # takes a lot of space.
    record = {
        "idx": [idx_a, idx_b],
        "meaning_scores": {
            "l2": l2(vec_a, vec_b),
            "cdist": cdist(vec_a, vec_b),
        },
        "text_scores": {},
    }
    if meanings_only:
        return json.dumps(record)

    def encode(tokens):
        # Convert a sentence to its representation using the shared mapping.
        return to_chr_seq(w2c, tokens)

    def drop_stops(tokens):
        # Lazily yield the items that are not stop words.
        return (tok for tok in tokens if tok not in stops)

    scores = record["text_scores"]

    # Plain sentences: full representation first, then the stop-filtered one.
    seq_a, seq_b = encode(sent_a), encode(sent_b)
    kept_a, kept_b = drop_stops(sent_a), drop_stops(sent_b)
    seq_a_f, seq_b_f = encode(kept_a), encode(kept_b)

    scores["lev"] = levenshtein(seq_a, seq_b)
    scores["lev_n"] = levenshtein_normalised(seq_a, seq_b)
    scores["lev_f"] = levenshtein(seq_a_f, seq_b_f)
    scores["lev_fn"] = levenshtein_normalised(seq_a_f, seq_b_f)
    if include_jaccard:
        scores["jac"] = jaccard(seq_a, seq_b)
        scores["jac_f"] = jaccard(seq_a_f, seq_b_f)

    if control_synonyms:
        # Same pipeline, applied after replacing each item by its synonym.
        syn_a = [default_synonym(tok) for tok in sent_a]
        syn_b = [default_synonym(tok) for tok in sent_b]
        syn_seq_a, syn_seq_b = encode(syn_a), encode(syn_b)
        syn_kept_a, syn_kept_b = drop_stops(syn_a), drop_stops(syn_b)
        syn_seq_a_f, syn_seq_b_f = encode(syn_kept_a), encode(syn_kept_b)

        scores["lev_syn"] = levenshtein(syn_seq_a, syn_seq_b)
        scores["lev_syn_n"] = levenshtein_normalised(syn_seq_a, syn_seq_b)
        scores["lev_syn_f"] = levenshtein(syn_seq_a_f, syn_seq_b_f)
        scores["lev_syn_fn"] = levenshtein_normalised(syn_seq_a_f,
                                                      syn_seq_b_f)
        if include_jaccard:
            scores["jac_syn"] = jaccard(syn_seq_a, syn_seq_b)
            scores["jac_syn_f"] = jaccard(syn_seq_a_f, syn_seq_b_f)

    return json.dumps(record)