def test_basic(self):
    """Check get_levenshtein_distance against a table of known distances.

    Each case is evaluated in both argument orders, and then once more for
    every ``max_value`` from 1 through ``expected_dist + 1`` to verify that
    the result equals ``min(expected_dist, max_value)``.
    """
    # Each entry is (expected distance, first string, second string).
    test_cases = (
        (0, "", ""),
        (0, "same string", "same string"),
        (1, "a", "b"),
        (1, "12345", "123 45"),
        (1, "the fall", "the falll"),
        (2, "rolling stones", "roling stone"),
        (5, "", "12345"),
        (2, "1234", "123456"),
        (2, "2345", "123456"),
        (2, "3456", "123456"),
        (3, "abc", "xyz"),
        (3, "kitten", "sitting"),
        (3, "saturday", "sunday"),
        (9, "VERY", "different"),
    )
    for expected_dist, string_1, string_2 in test_cases:
        dist = similarity.get_levenshtein_distance(string_1, string_2)
        self.assertEqual(expected_dist, dist,
                         msg="(%s, %s)" % (string_1, string_2))
        # Same pair with the arguments swapped; the expected value is
        # unchanged.
        dist = similarity.get_levenshtein_distance(string_2, string_1)
        self.assertEqual(expected_dist, dist,
                         msg="(%s, %s)" % (string_2, string_1))
        # Test setting max_value for many different values.  ``range`` is
        # used instead of ``xrange`` so the test runs on Python 2 and 3;
        # the ranges here are tiny, so the list built on Python 2 is
        # irrelevant.
        for max_value in range(1, expected_dist + 2):
            clamped_dist = similarity.get_levenshtein_distance(
                string_1, string_2, max_value=max_value)
            # Include the inputs in the message so a failure identifies
            # the exact case and clamp value.
            self.assertEqual(
                min(expected_dist, max_value), clamped_dist,
                msg="(%s, %s, max_value=%d)" % (string_1, string_2,
                                                max_value))
Example #2
0
 def test_basic(self):
     """Verify get_levenshtein_distance on a fixed table of string pairs.

     For every pair the distance is checked in both argument orders.  The
     pair is then re-checked with each ``max_value`` from 1 through
     ``expected_dist + 1``, where the result must be
     ``min(expected_dist, max_value)``.
     """
     # Rows are (expected distance, first string, second string).
     test_cases = (
         (0, "", ""),
         (0, "same string", "same string"),
         (1, "a", "b"),
         (1, "12345", "123 45"),
         (1, "the fall", "the falll"),
         (2, "rolling stones", "roling stone"),
         (5, "", "12345"),
         (2, "1234", "123456"),
         (2, "2345", "123456"),
         (2, "3456", "123456"),
         (3, "abc", "xyz"),
         (3, "kitten", "sitting"),
         (3, "saturday", "sunday"),
         (9, "VERY", "different"),
     )
     for expected_dist, string_1, string_2 in test_cases:
         # Forward order first, then swapped; both must give the same
         # expected distance.
         for first, second in ((string_1, string_2), (string_2, string_1)):
             dist = similarity.get_levenshtein_distance(first, second)
             self.assertEqual(expected_dist,
                              dist,
                              msg="(%s, %s)" % (first, second))
         # Test setting max_value for many different values.  ``range``
         # (not the Python-2-only ``xrange``) keeps this runnable on both
         # Python 2 and Python 3.
         for max_value in range(1, expected_dist + 2):
             clamped_dist = similarity.get_levenshtein_distance(
                 string_1, string_2, max_value=max_value)
             # Name the inputs so a failure pinpoints the case and clamp.
             self.assertEqual(min(expected_dist, max_value),
                              clamped_dist,
                              msg="(%s, %s, max_value=%d)" %
                              (string_1, string_2, max_value))
Example #3
0
def suggest(name):
    """Look up the whitelist entry whose key is the closest match to *name*.

    *name* is canonicalized with ``similarity.canonicalize_string`` and then
    compared, by Levenshtein distance, against a snapshot of the keys of
    ``_global_whitelist`` (presumably already in canonical form -- confirm
    against the code that populates it).

    A key qualifies only when it is fewer than ``MAX_DIST`` edits away and
    its distance divided by the mean length of the two strings is below
    ``MAX_NORM_DIST``.  Among qualifying keys, the one with the smallest
    normalized distance is chosen; on ties the first one encountered wins.

    Returns:
        ``_global_whitelist.get(best_key)`` for the best-matching key.  When
        no key qualifies the lookup is ``_global_whitelist.get(None)``,
        which is ``None`` unless the whitelist has a ``None`` key.
    """
    canon_name = similarity.canonicalize_string(name)
    # Snapshot the keys while holding the lock so the scan below runs
    # without it.
    with _global_lock:
        canon_whitelist = list(_global_whitelist)

    # We ignore any items that are 10 or more edits away from our original
    # name, or whose length-normalized distance is 0.25 or more.
    MAX_DIST = 10
    MAX_NORM_DIST = 0.25

    best_guess = None
    # Starting at the threshold means "beats the current best" also
    # enforces "below MAX_NORM_DIST" for the first accepted candidate.
    best_dist = MAX_NORM_DIST
    for guess in canon_whitelist:
        # Mean length of the two strings.  The parentheses matter: without
        # them only len(canon_name) would be halved.
        normalizer = (len(guess) + len(canon_name)) / 2.0
        # Cap the distance computation just above the largest distance
        # that could still pass the normalized threshold.
        max_value = min(MAX_DIST, int(1 + normalizer * MAX_NORM_DIST))
        lev_dist = similarity.get_levenshtein_distance(
            canon_name, guess, max_value=max_value)
        if lev_dist >= MAX_DIST:
            continue
        # normalizer is zero only when both strings are empty, i.e. an
        # exact match; avoid dividing by zero in that case.
        normalized_lev_dist = lev_dist / normalizer if normalizer else 0.0
        if normalized_lev_dist < best_dist:
            best_guess = guess
            best_dist = normalized_lev_dist
    return _global_whitelist.get(best_guess)