Code example #1
def test_v_measure_and_mutual_information(seed=36):
    # Check relation between v_measure, entropy and mutual information
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        labels_a, labels_b = (random_state.randint(0, 10, i),
                              random_state.randint(0, 10, i))
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            2.0 * mutual_info_score(labels_a, labels_b) /
                            (entropy(labels_a) + entropy(labels_b)), 0)
        avg = 'arithmetic'
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            normalized_mutual_info_score(labels_a, labels_b,
                                                         average_method=avg)
                            )
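The identity exercised here is V = 2 * MI / (H(labels_a) + H(labels_b)): with beta = 1, the V-measure equals normalized mutual information under arithmetic averaging. A minimal standalone sketch of the same check, assuming only numpy and scikit-learn (the toy labelings and the label_entropy helper are ours, not part of the test suite):

import numpy as np
from sklearn.metrics import mutual_info_score, v_measure_score


def label_entropy(labels):
    # empirical entropy of a labeling, in nats
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log(p))


labels_a = np.array([0, 0, 1, 1, 2, 2])
labels_b = np.array([0, 0, 1, 2, 2, 2])
v = v_measure_score(labels_a, labels_b)
nmi = (2.0 * mutual_info_score(labels_a, labels_b)
       / (label_entropy(labels_a) + label_entropy(labels_b)))
print(v, nmi)  # the two values agree to floating-point precision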
Code example #2
def test_int_input():
    X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]
    for dtype in [np.int32, np.int64]:
        X_int = np.array(X_list, dtype=dtype)
        X_int_csr = sp.csr_matrix(X_int)
        init_int = X_int[:2]

        fitted_models = [
            KMeans(n_clusters=2).fit(X_int),
            KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int),
            # MiniBatchKMeans is very unstable on such a small dataset,
            # hence we use many inits
            MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int),
            MiniBatchKMeans(n_clusters=2, n_init=10,
                            batch_size=2).fit(X_int_csr),
            MiniBatchKMeans(n_clusters=2,
                            batch_size=2,
                            init=init_int,
                            n_init=1).fit(X_int),
            MiniBatchKMeans(n_clusters=2,
                            batch_size=2,
                            init=init_int,
                            n_init=1).fit(X_int_csr),
        ]

        for km in fitted_models:
            assert km.cluster_centers_.dtype == np.float64

        expected_labels = [0, 1, 1, 0, 0, 1]
        scores = np.array([
            v_measure_score(expected_labels, km.labels_)
            for km in fitted_models
        ])
        assert_array_almost_equal(scores, np.ones(scores.shape[0]))
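The property under test: k-means centers are coordinate means, so even for integer input (dense or CSR) the fitted centers must be float64, and every estimator variant should recover the same two clusters. A quick hedged illustration of the dtype behaviour (toy data ours):

import numpy as np
from sklearn.cluster import KMeans

X_int = np.array([[0, 0], [10, 10], [12, 9], [-1, 1]], dtype=np.int32)
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X_int)
print(km.cluster_centers_.dtype)  # float64: means of integer points are fractional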
Code example #3
def test_mini_batch_k_means_random_init_partial_fit():
    km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42)

    # use the partial_fit API for online learning
    for X_minibatch in np.array_split(X, 10):
        km.partial_fit(X_minibatch)

    # compute the labeling on the complete dataset
    labels = km.predict(X)
    assert v_measure_score(true_labels, labels) == 1.0
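For reference, a self-contained version of the same online-learning pattern; the synthetic blobs and variable names below are ours, not the module-level X and true_labels used by the test:

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=42)
km = MiniBatchKMeans(n_clusters=3, init="random", random_state=42)
for X_minibatch in np.array_split(X, 10):
    km.partial_fit(X_minibatch)  # incremental update on one mini-batch
labels = km.predict(X)           # label the full dataset with the final centers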
Code example #4
def test_scaled_weights():
    # scaling all sample weights by a common factor
    # shouldn't change the result
    sample_weight = np.ones(n_samples)
    for estimator in [
            KMeans(n_clusters=n_clusters, random_state=42),
            MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
    ]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=0.5 * sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
Code example #5
def test_unit_weights_vs_no_weights():
    # not passing any sample weights should be equivalent
    # to all weights equal to one
    sample_weight = np.ones(n_samples)
    for estimator in [
            KMeans(n_clusters=n_clusters, random_state=42),
            MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
    ]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
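This test and the previous one probe two invariances of the weighted k-means objective: rescaling all weights by a constant leaves the minimizer unchanged, and omitting sample_weight is equivalent to unit weights. A compact sketch of the latter, on toy data of our own:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
km_default = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
km_unit = KMeans(n_clusters=3, n_init=10, random_state=0).fit(
    X, sample_weight=np.ones(X.shape[0]))
print(np.allclose(km_default.cluster_centers_, km_unit.cluster_centers_))  # True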
Code example #6
def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero
    for i in np.logspace(1, 4, 4).astype(int):
        labels_a, labels_b = (np.ones(i, dtype=int),
                              np.arange(i, dtype=int))
        assert normalized_mutual_info_score(labels_a, labels_b) == 0.0
        assert v_measure_score(labels_a, labels_b) == 0.0
        assert adjusted_mutual_info_score(labels_a, labels_b) == 0.0
        for method in ["min", "geometric", "arithmetic", "max"]:
            assert adjusted_mutual_info_score(labels_a, labels_b,
                                              average_method=method) == 0.0
            assert normalized_mutual_info_score(labels_a, labels_b,
                                                average_method=method) == 0.0
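Why the score is exactly zero: labels_a is constant, so its entropy is zero and its mutual information with any second labeling is zero; every normalization scheme must then return 0.0 rather than NaN from the 0/0. A quick check at a toy size of our choosing:

import numpy as np
from sklearn.metrics import normalized_mutual_info_score

labels_a = np.ones(10, dtype=int)    # a single cluster: zero entropy
labels_b = np.arange(10, dtype=int)  # every label distinct
print(normalized_mutual_info_score(labels_a, labels_b))  # 0.0, not NaN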
Code example #7
def _check_fitted_model(km):
    # check that the number of cluster centers and distinct labels match
    # the expectation
    centers = km.cluster_centers_
    assert centers.shape == (n_clusters, n_features)

    labels = km.labels_
    assert np.unique(labels).shape[0] == n_clusters

    # check that the label assignment is perfect (up to a permutation)
    assert v_measure_score(true_labels, labels) == 1.0
    assert km.inertia_ > 0.0

    # check error on dataset being too small
    assert_raise_message(
        ValueError, "n_samples=1 should be >= n_clusters=%d" % km.n_clusters,
        km.fit, [[0., 1.]])
Code example #8
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X,
                                                   n_clusters=n_clusters,
                                                   sample_weight=None,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert centers.shape == (n_clusters, n_features)

    assert np.unique(labels).shape[0] == n_clusters

    # check that the label assignment is perfect (up to a permutation)
    assert v_measure_score(true_labels, labels) == 1.0
    assert inertia > 0.0

    # check warning when centers are passed
    assert_warns(RuntimeWarning,
                 k_means,
                 X,
                 n_clusters=n_clusters,
                 sample_weight=None,
                 init=centers)

    # too many clusters desired
    with pytest.raises(ValueError):
        k_means(X, n_clusters=X.shape[0] + 1, sample_weight=None)

    # kmeans for algorithm='elkan' raises TypeError on sparse matrix
    assert_raise_message(TypeError, "algorithm='elkan' not supported for "
                         "sparse input X",
                         k_means,
                         X=X_csr,
                         n_clusters=2,
                         sample_weight=None,
                         algorithm="elkan")
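Outside the test harness, a minimal direct call of the k_means function looks like this (synthetic data ours):

from sklearn.cluster import k_means
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
centers, labels, inertia = k_means(X, n_clusters=3, random_state=0)
print(centers.shape, labels.shape, inertia)  # (3, 2), (100,), sum of squared distances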
Code example #9
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [
        KMeans(init="k-means++", n_clusters=n_clusters, random_state=42),
        KMeans(init="random", n_clusters=n_clusters, random_state=42),
        KMeans(init=centers.copy(), n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, batch_size=10, random_state=42)
    ]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(
            v_measure_score(est_repeated.labels_, repeated_labels), 1.0)
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_))
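The equivalence being tested: an integer sample_weight of N multiplies a sample's contribution to the inertia exactly as repeating that sample N times does, so both fits should land on the same centers. A tiny hand-checkable sketch (one-dimensional toy data ours):

import numpy as np
from sklearn.cluster import KMeans

X = np.array([[0.0], [1.0], [10.0]])
w = np.array([3, 1, 1])
km_w = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X, sample_weight=w)
km_r = KMeans(n_clusters=2, n_init=10, random_state=0).fit(np.repeat(X, w, axis=0))
print(np.sort(km_w.cluster_centers_.ravel()))  # [ 0.25 10.  ]
print(np.sort(km_r.cluster_centers_.ravel()))  # same: weighted mean (3*0 + 1)/4 = 0.25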
Code example #10
def test_beta_parameter():
    # test the beta parameter passed to
    # homogeneity_completeness_v_measure and v_measure_score
    beta_test = 0.2
    h_test = 0.67
    c_test = 0.42
    v_test = ((1 + beta_test) * h_test * c_test
              / (beta_test * h_test + c_test))

    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 1, 1, 1],
        [0, 1, 0, 1, 2, 2],
        beta=beta_test)
    assert_almost_equal(h, h_test, 2)
    assert_almost_equal(c, c_test, 2)
    assert_almost_equal(v, v_test, 2)

    v = v_measure_score(
        [0, 0, 0, 1, 1, 1],
        [0, 1, 0, 1, 2, 2],
        beta=beta_test)
    assert_almost_equal(v, v_test, 2)
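The quantity checked is the weighted harmonic mean V_beta = (1 + beta) * h * c / (beta * h + c); with beta < 1, homogeneity is weighted more strongly than completeness. Worked arithmetic for the constants above:

beta, h, c = 0.2, 0.67, 0.42
v = (1 + beta) * h * c / (beta * h + c)
print(round(v, 4))  # 0.6095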
Code example #11
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack',
                                 random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories),
                         batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time, v_measure_score(y_cocluster, y_true)))

print("MiniBatchKMeans...")
start_time = time()
y_kmeans = kmeans.fit_predict(X)
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time, v_measure_score(y_kmeans, y_true)))

feature_names = vectorizer.get_feature_names_out()
document_names = list(newsgroups.target_names[i] for i in newsgroups.target)


def bicluster_ncut(i):
    rows, cols = cocluster.get_indices(i)
    if not (np.any(rows) and np.any(cols)):
        import sys