def test_compare_implementations():
    """Check the pure-Python distances against python-Levenshtein.

    For every generated pair of strings, the Jaro, Jaro-Winkler and
    Levenshtein-ratio values from ``string_distances`` must be exactly
    equal to the ones returned by the ``Levenshtein`` module.

    Raises
    ------
    unittest.SkipTest
        When the module-level name ``Levenshtein`` is ``False``
        (presumably set when the optional import failed -- confirm
        against the import block of this file).
    """
    if Levenshtein is False:
        raise unittest.SkipTest

    # First strings with a randomly placed common char, then random
    # strings; both generators are asked for 50 pairs.
    for make_pairs in (_random_common_char_pairs, _random_string_pairs):
        for left, right in make_pairs(n_pairs=50):
            ours = string_distances._jaro_winkler(
                left, right, winkler=False)
            assert ours == Levenshtein.jaro(left, right)

            ours = string_distances._jaro_winkler(
                left, right, winkler=True)
            assert ours == Levenshtein.jaro_winkler(left, right)

            ours = string_distances.levenshtein_ratio(left, right)
            assert ours == Levenshtein.ratio(left, right)
def test_compare_implementations():
    """Compare our pure-Python distances with python-Levenshtein.

    Ten random string pairs are drawn; for each one the Jaro,
    Jaro-Winkler and Levenshtein-ratio results of ``string_distances``
    must equal those of the ``Levenshtein`` module exactly.

    Raises
    ------
    unittest.SkipTest
        When the module-level name ``Levenshtein`` is ``False``
        (presumably meaning the optional package is unavailable --
        confirm against the import block of this file).
    """
    if Levenshtein is False:
        raise unittest.SkipTest

    for left, right in _random_string_pairs(n_pairs=10):
        # Plain Jaro: the Winkler prefix adjustment is switched off.
        ours_jaro = string_distances._jaro_winkler(
            left, right, winkler=False)
        reference_jaro = Levenshtein.jaro(left, right)
        assert ours_jaro == reference_jaro

        # Jaro-Winkler: relies on the default of the ``winkler`` argument.
        ours_winkler = string_distances._jaro_winkler(left, right)
        reference_winkler = Levenshtein.jaro_winkler(left, right)
        assert ours_winkler == reference_winkler

        ours_ratio = string_distances.levenshtein_ratio(left, right)
        reference_ratio = Levenshtein.ratio(left, right)
        assert ours_ratio == reference_ratio
def test_similarity_encoder():
    """Check SimilarityEncoder output against direct similarity calls.

    The encoder is fitted on a single column of three strings and used
    to transform a single column of five strings. For each supported
    ``similarity`` name, the encoded matrix must equal the matrix of
    pairwise values computed with the matching ``string_distances``
    function (rows: transformed strings, columns: fitted strings).
    """
    fitted = np.array(['aa', 'aaa', 'aaab']).reshape(-1, 1)
    queried = np.array(
        [['Aa', 'aAa', 'aaa', 'aaab', ' aaa c']]).reshape(-1, 1)

    def _ngram_3(first, second):
        # The n-gram reference is evaluated with n = 3.
        return string_distances.ngram_similarity(first, second, 3)

    # (similarity name, reference function), kept in a tuple so that the
    # iteration order is fixed.
    references = (
        ('levenshtein-ratio', string_distances.levenshtein_ratio),
        ('jaro-winkler', string_distances.jaro_winkler),
        ('jaro', string_distances.jaro),
        ('ngram', _ngram_3),
    )

    for name, measure in references:
        model = similarity_encoder.SimilarityEncoder(
            similarity=name, handle_unknown='ignore')
        encoded = model.fit(fitted).transform(queried)

        # One row per transformed string, one column per fitted string.
        expected = np.array(
            [[measure(query, category)
              for category in fitted.reshape(-1)]
             for query in queried.reshape(-1)],
            dtype=float)
        assert np.array_equal(encoded, expected)
def test_identical_strings():
    """Check that a string compared with itself has a similarity of 1.

    Applies to the Jaro, Jaro-Winkler and Levenshtein-ratio measures,
    using the first element of ten random string pairs.
    """
    measures = (
        string_distances.jaro,
        string_distances.jaro_winkler,
        string_distances.levenshtein_ratio,
    )
    # Only the first string of each generated pair is needed.
    for text, _ in _random_string_pairs(n_pairs=10):
        for measure in measures:
            assert measure(text, text) == 1