Esempio n. 1
0
def test_multiple_models():
    """Run several matchers through one PolyFuzz instance and check the results.

    Verifies that every model id has a DataFrame of matches, that cluster
    accessors raise ``ValueError`` before ``group()`` is called, and that
    cluster mappings exist for every model afterwards.
    """
    # Only the first matcher is given an explicit model id.
    matchers = [
        TFIDF(n_gram_range=(3, 3), min_similarity=0, model_id="TF-IDF"),
        TFIDF(n_gram_range=(3, 6), min_similarity=0),
        EditDistance(n_jobs=1),
        EditDistance(n_jobs=1, scorer=fuzz.ratio),
        RapidFuzz(n_jobs=1),
    ]
    expected_count = len(matchers)

    poly = PolyFuzz(matchers).match(from_list, to_list)

    # Each model id must map to its own DataFrame of matches.
    for name in poly.get_ids():
        assert name in poly.get_matches()
        assert isinstance(poly.get_matches(name), pd.DataFrame)
    assert len(poly.get_matches()) == expected_count

    # Cluster accessors must fail while no grouping has been created yet.
    for accessor in (poly.get_clusters, poly.get_cluster_mappings):
        with pytest.raises(ValueError):
            accessor()

    # After grouping, every model id must have a cluster mapping.
    poly.group()
    for name in poly.get_ids():
        assert name in poly.get_cluster_mappings()
    assert len(poly.get_cluster_mappings()) == expected_count
Esempio n. 2
0
def test_grouper_same_list():
    """Match ``from_list`` against itself with TF-IDF and check the grouping.

    Grouping is run with ``link_min_similarity=0.75`` and
    ``group_all_strings=True``; the test then pins the shape of the matches
    table and the single expected cluster.
    """
    poly = PolyFuzz("TF-IDF").match(from_list, from_list)
    poly.group(link_min_similarity=0.75, group_all_strings=True)
    result = poly.get_matches()

    expected_columns = ['From', 'To', 'Similarity', 'Group']
    expected_members = ['apples', 'apple', 'appl']

    # Shape and content of the matches table.
    assert isinstance(result, pd.DataFrame)
    assert result["Similarity"].mean() > 0.3
    assert len(result) == 6
    assert list(result.columns) == expected_columns

    # A single cluster (id 1) containing all three expected strings.
    assert poly.get_clusters() == {1: expected_members}
    assert poly.get_cluster_mappings() == {
        member: 1 for member in expected_members
    }
Esempio n. 3
0
def test_grouper(method):
    """Match ``from_list`` to ``to_list`` with ``method`` and check the grouping.

    NOTE(review): ``method`` is presumably supplied by a pytest
    parametrization or fixture that is not visible in this excerpt — confirm.
    """
    poly = PolyFuzz(method).match(from_list, to_list)
    poly.group(link_min_similarity=0.75)
    result = poly.get_matches()

    expected_columns = ['From', 'To', 'Similarity', 'Group']
    expected_members = ['apples', 'apple']

    # Shape and content of the matches table.
    assert isinstance(result, pd.DataFrame)
    assert result["Similarity"].mean() > 0.3
    assert len(result) == 6
    assert list(result.columns) == expected_columns

    # A single cluster (id 1) containing both expected strings.
    assert poly.get_clusters() == {1: expected_members}
    assert poly.get_cluster_mappings() == {
        member: 1 for member in expected_members
    }