Example #1
0
def test_sample_weight_length():
    # check that an error is raised when passing sample weights
    # with an incompatible shape
    km = KMeans(n_clusters=n_clusters, random_state=42)
    msg = r'sample_weight.shape == \(2,\), expected \(100,\)'
    with pytest.raises(ValueError, match=msg):
        km.fit(X, sample_weight=np.ones(2))
Example #2
0
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
    # check that fit.predict gives same result as fit_predict
    # There's a very small chance of failure with elkan on unstructured dataset
    # because predict method uses fast euclidean distances computation which
    # may cause small numerical instabilities.
    # NB: This test is largely redundant with respect to test_predict and
    #     test_predict_equal_labels.  This test has the added effect of
    #     testing idempotence of the fittng procesdure which appears to
    #     be where it fails on some MacOS setups.
    if sys.platform == "darwin":
        pytest.xfail(
            "Known failures on MacOS, See "
            "https://github.com/scikit-learn/scikit-learn/issues/12644")
    if not (algo == 'elkan' and constructor is sp.csr_matrix):
        rng = np.random.RandomState(seed)

        X = make_blobs(n_samples=1000,
                       n_features=10,
                       centers=10,
                       random_state=rng)[0].astype(dtype, copy=False)
        X = constructor(X)

        kmeans = KMeans(algorithm=algo,
                        n_clusters=10,
                        random_state=seed,
                        tol=tol,
                        max_iter=max_iter,
                        n_jobs=1)

        labels_1 = kmeans.fit(X).predict(X)
        labels_2 = kmeans.fit_predict(X)

        assert_array_equal(labels_1, labels_2)
Example #3
0
def test_int_input():
    X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]
    for dtype in [np.int32, np.int64]:
        X_int = np.array(X_list, dtype=dtype)
        X_int_csr = sp.csr_matrix(X_int)
        init_int = X_int[:2]

        fitted_models = [
            KMeans(n_clusters=2).fit(X_int),
            KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int),
            # mini batch kmeans is very unstable on such a small dataset hence
            # we use many inits
            MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int),
            MiniBatchKMeans(n_clusters=2, n_init=10,
                            batch_size=2).fit(X_int_csr),
            MiniBatchKMeans(n_clusters=2,
                            batch_size=2,
                            init=init_int,
                            n_init=1).fit(X_int),
            MiniBatchKMeans(n_clusters=2,
                            batch_size=2,
                            init=init_int,
                            n_init=1).fit(X_int_csr),
        ]

        for km in fitted_models:
            assert km.cluster_centers_.dtype == np.float64

        expected_labels = [0, 1, 1, 0, 0, 1]
        scores = np.array([
            v_measure_score(expected_labels, km.labels_)
            for km in fitted_models
        ])
        assert_array_almost_equal(scores, np.ones(scores.shape[0]))
Example #4
0
def test_sample_weight_missing():
    from mrex.cluster import KMeans

    clf = AdaBoostClassifier(KMeans(), algorithm="SAMME")
    assert_raises(ValueError, clf.fit, X, y_regr)

    clf = AdaBoostRegressor(KMeans())
    assert_raises(ValueError, clf.fit, X, y_regr)
Example #5
0
def test_predict_equal_labels(algo):
    km = KMeans(random_state=13,
                n_jobs=1,
                n_init=1,
                max_iter=1,
                algorithm=algo)
    km.fit(X)
    assert_array_equal(km.predict(X), km.labels_)
Example #6
0
def test_k_means_init_fitted_centers(data):
    # Get a local optimum
    centers = KMeans(n_clusters=3).fit(X).cluster_centers_

    # Fit starting from a local optimum shouldn't change the solution
    new_centers = KMeans(n_clusters=3, init=centers,
                         n_init=1).fit(X).cluster_centers_
    assert_array_almost_equal(centers, new_centers)
Example #7
0
def test_result_of_kmeans_equal_in_diff_n_jobs():
    # PR 9288
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(50, 10))

    result_1 = KMeans(n_clusters=3, random_state=0, n_jobs=1).fit(X).labels_
    result_2 = KMeans(n_clusters=3, random_state=0, n_jobs=2).fit(X).labels_
    assert_array_equal(result_1, result_2)
Example #8
0
def test_k_means_copyx():
    # Check if copy_x=False returns nearly equal X after de-centering.
    my_X = X.copy()
    km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42)
    km.fit(my_X)
    _check_fitted_model(km)

    # check if my_X is centered
    assert_array_almost_equal(my_X, X)
Example #9
0
def test_k_means_n_init():
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 2))

    # two regression tests on bad n_init argument
    # previous bug: n_init <= 0 threw non-informative TypeError (#3858)
    with pytest.raises(ValueError, match="n_init"):
        KMeans(n_init=0).fit(X)
    with pytest.raises(ValueError, match="n_init"):
        KMeans(n_init=-1).fit(X)
Example #10
0
def test_supervised_cluster_scorers():
    # Test clustering scorers against gold standard labeling.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    km = KMeans(n_clusters=3)
    km.fit(X_train)
    for name in CLUSTER_SCORERS:
        score1 = get_scorer(name)(km, X_test, y_test)
        score2 = getattr(cluster_module, name)(y_test, km.predict(X_test))
        assert_almost_equal(score1, score2)
Example #11
0
def test_transform():
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    X_new = km.transform(km.cluster_centers_)

    for c in range(n_clusters):
        assert X_new[c, c] == 0
        for c2 in range(n_clusters):
            if c != c2:
                assert X_new[c, c2] > 0
Example #12
0
def test_k_means_empty_cluster_relocated():
    # check that empty clusters are correctly relocated when using sample
    # weights (#13486)
    X = np.array([[-1], [1]])
    sample_weight = [1.9, 0.1]
    init = np.array([[-1], [10]])

    km = KMeans(n_clusters=2, init=init, n_init=1)
    km.fit(X, sample_weight=sample_weight)

    assert len(set(km.labels_)) == 2
    assert_allclose(km.cluster_centers_, [[-1], [1]])
Example #13
0
def test_k_means_init_centers():
    # This test is used to check KMeans won't mutate the user provided input
    # array silently even if input data and init centers have the same type
    X_small = np.array([[1.1, 1.1], [-7.5, -7.5], [-1.1, -1.1], [7.5, 7.5]])
    init_centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, -5.0]])
    for dtype in [np.int32, np.int64, np.float32, np.float64]:
        X_test = dtype(X_small)
        init_centers_test = dtype(init_centers)
        assert_array_equal(init_centers, init_centers_test)
        km = KMeans(init=init_centers_test, n_clusters=3, n_init=1)
        km.fit(X_test)
        assert np.may_share_memory(km.cluster_centers_, init_centers) is False
Example #14
0
def test_k_means_fortran_aligned_data():
    # Check the KMeans will work well, even if X is a fortran-aligned data.
    X = np.asfortranarray([[0, 0], [0, 1], [0, 1]])
    centers = np.array([[0, 0], [0, 1]])
    labels = np.array([0, 1, 1])
    km = KMeans(n_init=1,
                init=centers,
                precompute_distances=False,
                random_state=42,
                n_clusters=2)
    km.fit(X)
    assert_array_almost_equal(km.cluster_centers_, centers)
    assert_array_equal(km.labels_, labels)
Example #15
0
def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(n_components=n_clusters,
                               affinity="rbf",
                               random_state=random_state)
    se_knn = SpectralEmbedding(n_components=n_clusters,
                               affinity="nearest_neighbors",
                               n_neighbors=5,
                               random_state=random_state)
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2)
Example #16
0
def test_sparse_validate_centers():
    from mrex.datasets import load_iris

    iris = load_iris()
    X = iris.data

    # Get a local optimum
    centers = KMeans(n_clusters=4).fit(X).cluster_centers_

    # Test that a ValueError is raised for validate_center_shape
    classifier = KMeans(n_clusters=3, init=centers, n_init=1)

    msg = r"The shape of the initial centers \(\(4L?, 4L?\)\) " \
          "does not match the number of clusters 3"
    with pytest.raises(ValueError, match=msg):
        classifier.fit(X)
Example #17
0
def test_scoring_is_not_metric():
    assert_raises_regexp(ValueError, 'make_scorer', check_scoring,
                         LogisticRegression(), f1_score)
    assert_raises_regexp(ValueError, 'make_scorer', check_scoring,
                         LogisticRegression(), roc_auc_score)
    assert_raises_regexp(ValueError, 'make_scorer', check_scoring, Ridge(),
                         r2_score)
    assert_raises_regexp(ValueError, 'make_scorer', check_scoring, KMeans(),
                         cluster_module.adjusted_rand_score)
Example #18
0
def test_k_means_non_collapsed():
    # Check k_means with a bad initialization does not yield a singleton
    # Starting with bad centers that are quickly ignored should not
    # result in a repositioning of the centers to the center of mass that
    # would lead to collapsed centers which in turns make the clustering
    # dependent of the numerical unstabilities.
    my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
    array_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
    km = KMeans(init=array_init, n_clusters=3, random_state=42, n_init=1)
    km.fit(my_X)

    # centers must not been collapsed
    assert len(np.unique(km.labels_)) == 3

    centers = km.cluster_centers_
    assert np.linalg.norm(centers[0] - centers[1]) >= 0.1
    assert np.linalg.norm(centers[0] - centers[2]) >= 0.1
    assert np.linalg.norm(centers[1] - centers[2]) >= 0.1
Example #19
0
def test_k_means_new_centers():
    # Explore the part of the code where a new center is reassigned
    X = np.array([[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 0],
                  [0, 0, 0, 0], [0, 1, 0, 0]])
    labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0], [.2, 0, .2, .2], [+0, 0, 0, 0]])

    km = KMeans(n_clusters=3,
                init=bad_centers,
                n_init=1,
                max_iter=10,
                random_state=1)
    for this_X in (X, sp.coo_matrix(X)):
        km.fit(this_X)
        this_labels = km.labels_
        # Reorder the labels so that the first instance is in cluster 0,
        # the second in cluster 1, ...
        this_labels = np.unique(this_labels, return_index=True)[1][this_labels]
        np.testing.assert_array_equal(this_labels, labels)
Example #20
0
def test_kmeans_results(representation, algo, dtype):
    # cheks that kmeans works as intended
    array_constr = {'dense': np.array, 'sparse': sp.csr_matrix}[representation]
    X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]
    init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)

    expected_labels = [0, 0, 1, 1]
    expected_inertia = 0.1875
    expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype)
    expected_n_iter = 2

    kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo)
    kmeans.fit(X, sample_weight=sample_weight)

    assert_array_equal(kmeans.labels_, expected_labels)
    assert_almost_equal(kmeans.inertia_, expected_inertia)
    assert_array_almost_equal(kmeans.cluster_centers_, expected_centers)
    assert kmeans.n_iter_ == expected_n_iter
Example #21
0
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [
        KMeans(init="k-means++", n_clusters=n_clusters, random_state=42),
        KMeans(init="random", n_clusters=n_clusters, random_state=42),
        KMeans(init=centers.copy(), n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, batch_size=10, random_state=42)
    ]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(
            v_measure_score(est_repeated.labels_, repeated_labels), 1.0)
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_))
Example #22
0
def test_scaled_weights():
    # scaling all sample weights by a common factor
    # shouldn't change the result
    sample_weight = np.ones(n_samples)
    for estimator in [
            KMeans(n_clusters=n_clusters, random_state=42),
            MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
    ]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=0.5 * sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
Example #23
0
def test_unit_weights_vs_no_weights():
    # not passing any sample weights should be equivalent
    # to all weights equal to one
    sample_weight = np.ones(n_samples)
    for estimator in [
            KMeans(n_clusters=n_clusters, random_state=42),
            MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
    ]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
Example #24
0
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    iris = load_iris()
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([
        ('scaler', scaler_for_pipeline),
        ('Kmeans', km_for_pipeline)
    ])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
Example #25
0
def test_elkan_results(distribution):
    # check that results are identical between lloyd and elkan algorithms
    rnd = np.random.RandomState(0)
    if distribution == 'normal':
        X = rnd.normal(size=(50, 10))
    else:
        X, _ = make_blobs(random_state=rnd)

    km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan',
                      n_clusters=5,
                      random_state=0,
                      n_init=1)

    km_full.fit(X)
    km_elkan.fit(X)
    assert_array_almost_equal(km_elkan.cluster_centers_,
                              km_full.cluster_centers_)
    assert_array_equal(km_elkan.labels_, km_full.labels_)
Example #26
0
def test_n_init():
    # Check that increasing the number of init increases the quality
    n_runs = 5
    n_init_range = [1, 5, 10]
    inertia = np.zeros((len(n_init_range), n_runs))
    for i, n_init in enumerate(n_init_range):
        for j in range(n_runs):
            km = KMeans(n_clusters=n_clusters,
                        init="random",
                        n_init=n_init,
                        random_state=j).fit(X)
            inertia[i, j] = km.inertia_

    inertia = inertia.mean(axis=1)
    failure_msg = ("Inertia %r should be decreasing"
                   " when n_init is increasing.") % list(inertia)
    for i in range(len(n_init_range) - 1):
        assert inertia[i] >= inertia[i + 1], failure_msg
Example #27
0
def test_less_centers_than_unique_points():
    X = np.asarray([[0, 0], [0, 1], [1, 0], [1,
                                             0]])  # last point is duplicated

    km = KMeans(n_clusters=4).fit(X)

    # only three distinct points, so only three clusters
    # can have points assigned to them
    assert set(km.labels_) == set(range(3))

    # k_means should warn that fewer labels than cluster
    # centers have been used
    msg = ("Number of distinct clusters (3) found smaller than "
           "n_clusters (4). Possibly due to duplicate points in X.")
    assert_warns_message(ConvergenceWarning,
                         msg,
                         k_means,
                         X,
                         sample_weight=None,
                         n_clusters=4)
Example #28
0
def test_score(algo):
    # Check that fitting k-means with multiple inits gives better score
    km1 = KMeans(n_clusters=n_clusters,
                 max_iter=1,
                 random_state=42,
                 n_init=1,
                 algorithm=algo)
    s1 = km1.fit(X).score(X)
    km2 = KMeans(n_clusters=n_clusters,
                 max_iter=10,
                 random_state=42,
                 n_init=1,
                 algorithm=algo)
    s2 = km2.fit(X).score(X)
    assert s2 > s1
Example #29
0
def test_max_iter_error():
    km = KMeans(max_iter=-1)
    assert_raise_message(ValueError, 'Number of iterations should be', km.fit,
                         X)
Example #30
0
def test_full_vs_elkan():
    km1 = KMeans(algorithm='full', random_state=13).fit(X)
    km2 = KMeans(algorithm='elkan', random_state=13).fit(X)

    assert homogeneity_score(km1.predict(X), km2.predict(X)) == 1.0