Esempio n. 1
0
def test_partial_fit():
    """Check that two ``partial_fit`` calls agree with a single ``fit``.

    One estimator is trained incrementally: first on the rows where
    ``y <= 1``, then on the rows where ``y > 1``, and the number of fitted
    per-block clusterers is checked after each call (1, then 2). A second,
    identically configured estimator is trained once on all of ``X``. The
    predictions of the first are then compared with the ``labels_`` of the
    second and must give a paired F-score of exactly 1.0.
    """
    def make_estimator():
        # Both estimators under comparison share this configuration.
        return BlockClustering(
            blocking="precomputed",
            base_estimator=MiniBatchKMeans(n_clusters=2),
        )

    low_mask = y <= 1
    high_mask = y > 1
    # The precomputed block of each sample is simply "is y <= 1".
    blocks = low_mask

    # Incremental training, one block of samples per call.
    incremental = make_estimator()
    incremental.partial_fit(X[low_mask], blocks=blocks[low_mask])
    assert_equal(len(incremental.clusterers_), 1)
    incremental.partial_fit(X[high_mask], blocks=blocks[high_mask])
    assert_equal(len(incremental.clusterers_), 2)

    # Reference: a single fit over the whole data set.
    reference = make_estimator()
    reference.fit(X, blocks=blocks)

    incremental_labels = incremental.predict(X, blocks=blocks)
    reference_labels = reference.labels_

    assert_equal(paired_f_score(incremental_labels, reference_labels), 1.0)
Esempio n. 2
0
def test_partial_fit():
    """Verify that incremental fitting matches fitting in one pass.

    The first estimator sees the data in two ``partial_fit`` calls (rows
    with ``y <= 1``, then rows with ``y > 1``); after each call the length of
    ``clusterers_`` is asserted (1, then 2). The second estimator is fitted
    on all of ``X`` at once. The test passes when the paired F-score between
    the first estimator's predictions and the second estimator's ``labels_``
    equals 1.0.
    """
    first_part = y <= 1
    second_part = y > 1
    # Precomputed block assignment: one boolean per sample.
    blocks = first_part

    def new_clusterer():
        # Fresh estimator with the configuration used throughout this test.
        return BlockClustering(
            blocking="precomputed",
            base_estimator=MiniBatchKMeans(n_clusters=2),
        )

    # Feed the data in two steps and watch the per-block clusterers grow.
    stepwise = new_clusterer()
    stepwise.partial_fit(X[first_part], blocks=blocks[first_part])
    assert_equal(len(stepwise.clusterers_), 1)
    stepwise.partial_fit(X[second_part], blocks=blocks[second_part])
    assert_equal(len(stepwise.clusterers_), 2)

    # Fit the same configuration on everything in one go.
    one_shot = new_clusterer()
    one_shot.fit(X, blocks=blocks)

    predicted = stepwise.predict(X, blocks=blocks)
    expected = one_shot.labels_

    assert_equal(paired_f_score(predicted, expected), 1.0)
Esempio n. 3
0
    # Reference labels, read from the "y" entry of `data`; they are compared
    # against the computed labels in the statistics at the end.
    # NOTE(review): `X`, `data` and `affinity` are used below but are not
    # assigned in the visible part of this function — presumably they are
    # defined earlier; verify.
    truth = data["y"]

    # NOTE(review): looks like a leftover debug print — confirm it is wanted.
    print("hello")

    # Block clustering with fixed threshold
    # A BlockClustering estimator is built with `block_last_name_first_initial`
    # as its blocking argument and a ScipyHierarchicalClustering
    # (threshold=0.5, method="complete", the given `affinity`) as its base
    # estimator, then fitted on X. The fitted `labels_` are kept for the
    # report below.
    block_clusterer = BlockClustering(
        blocking=block_last_name_first_initial,
        base_estimator=ScipyHierarchicalClustering(threshold=0.5,
                                                   affinity=affinity,
                                                   method="complete"),
        verbose=3,
        n_jobs=-1)
    block_clusterer.fit(X)
    labels = block_clusterer.labels_

    # Print clusters
    # For every distinct label, gather the distinct (name, affiliation) pairs
    # of the rows of X that received that label and print them on one line.
    for cluster in np.unique(labels):
        entries = set()

        # Each selected row of X is unpacked as a (name, affiliation) pair;
        # the set drops repeated pairs.
        for name, affiliation in X[labels == cluster]:
            entries.add((name, affiliation))

        print("Cluster #%d = %s" % (cluster, entries))
    # Separate the cluster listing from the statistics.
    print()

    # Statistics
    # Reports: the number of fitted entries in `clusterers_` (printed as the
    # number of blocks), the number of distinct reference labels, the number
    # of distinct computed labels, and the paired F-score between the two.
    print("Number of blocks =", len(block_clusterer.clusterers_))
    print("True number of clusters", len(np.unique(truth)))
    print("Number of computed clusters", len(np.unique(labels)))
    print("Paired F-score =", paired_f_score(truth, labels))
Esempio n. 4
0
    # Samples and reference labels, read from the "X" and "y" entries of
    # `data`.
    # NOTE(review): `data` and `affinity` are not assigned in the visible part
    # of this function — presumably they are defined earlier; verify.
    X = data["X"]
    truth = data["y"]

    # Block clustering with fixed threshold
    # A BlockClustering estimator is built with `block_last_name_first_initial`
    # as its blocking argument and a ScipyHierarchicalClustering
    # (threshold=0.5, method="complete", the given `affinity`) as its base
    # estimator, then fitted on X. The fitted `labels_` are kept for the
    # report below.
    block_clusterer = BlockClustering(
        blocking=block_last_name_first_initial,
        base_estimator=ScipyHierarchicalClustering(
            threshold=0.5,
            affinity=affinity,
            method="complete"),
        verbose=3,
        n_jobs=-1)
    block_clusterer.fit(X)
    labels = block_clusterer.labels_

    # Print clusters
    # For every distinct label, gather the distinct (name, affiliation) pairs
    # of the rows of X that received that label and print them on one line.
    for cluster in np.unique(labels):
        entries = set()

        # Each selected row of X is unpacked as a (name, affiliation) pair;
        # the set drops repeated pairs.
        for name, affiliation in X[labels == cluster]:
            entries.add((name, affiliation))

        print("Cluster #%d = %s" % (cluster, entries))
    # Separate the cluster listing from the statistics.
    print()

    # Statistics
    # Reports: the number of fitted entries in `clusterers_` (printed as the
    # number of blocks), the number of distinct reference labels, the number
    # of distinct computed labels, and the paired F-score between the two.
    print("Number of blocks =", len(block_clusterer.clusterers_))
    print("True number of clusters", len(np.unique(truth)))
    print("Number of computed clusters", len(np.unique(labels)))
    print("Paired F-score =", paired_f_score(truth, labels))