def test_sample_weight_length():
    # check that an error is raised when passing sample weights
    # with an incompatible shape
    km = KMeans(n_clusters=n_clusters, random_state=42)
    msg = r'sample_weight.shape == \(2,\), expected \(100,\)'
    with pytest.raises(ValueError, match=msg):
        km.fit(X, sample_weight=np.ones(2))
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
    # check that fit(X).predict(X) gives the same result as fit_predict(X)
    #
    # There's a very small chance of failure with elkan on an unstructured
    # dataset because the predict method uses a fast euclidean distance
    # computation which may cause small numerical instabilities.
    #
    # NB: This test is largely redundant with respect to test_predict and
    # test_predict_equal_labels. This test has the added effect of
    # testing idempotence of the fitting procedure which appears to
    # be where it fails on some MacOS setups.
    if sys.platform == "darwin":
        pytest.xfail(
            "Known failures on MacOS, See "
            "https://github.com/scikit-learn/scikit-learn/issues/12644")

    if not (algo == 'elkan' and constructor is sp.csr_matrix):
        rng = np.random.RandomState(seed)

        X = make_blobs(n_samples=1000, n_features=10, centers=10,
                       random_state=rng)[0].astype(dtype, copy=False)
        X = constructor(X)

        kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                        tol=tol, max_iter=max_iter, n_jobs=1)

        labels_1 = kmeans.fit(X).predict(X)
        labels_2 = kmeans.fit_predict(X)

        assert_array_equal(labels_1, labels_2)
def test_int_input():
    X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]
    for dtype in [np.int32, np.int64]:
        X_int = np.array(X_list, dtype=dtype)
        X_int_csr = sp.csr_matrix(X_int)
        init_int = X_int[:2]

        fitted_models = [
            KMeans(n_clusters=2).fit(X_int),
            KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int),
            # mini batch kmeans is very unstable on such a small dataset hence
            # we use many inits
            MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int),
            MiniBatchKMeans(n_clusters=2, n_init=10,
                            batch_size=2).fit(X_int_csr),
            MiniBatchKMeans(n_clusters=2, batch_size=2, init=init_int,
                            n_init=1).fit(X_int),
            MiniBatchKMeans(n_clusters=2, batch_size=2, init=init_int,
                            n_init=1).fit(X_int_csr),
        ]

        for km in fitted_models:
            assert km.cluster_centers_.dtype == np.float64

        expected_labels = [0, 1, 1, 0, 0, 1]
        scores = np.array([v_measure_score(expected_labels, km.labels_)
                           for km in fitted_models])
        assert_array_almost_equal(scores, np.ones(scores.shape[0]))
def test_sample_weight_missing():
    from mrex.cluster import KMeans

    clf = AdaBoostClassifier(KMeans(), algorithm="SAMME")
    assert_raises(ValueError, clf.fit, X, y_regr)

    clf = AdaBoostRegressor(KMeans())
    assert_raises(ValueError, clf.fit, X, y_regr)
def test_predict_equal_labels(algo):
    km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1,
                algorithm=algo)
    km.fit(X)
    assert_array_equal(km.predict(X), km.labels_)
def test_k_means_init_fitted_centers(data):
    # Get a local optimum
    centers = KMeans(n_clusters=3).fit(data).cluster_centers_

    # Fitting again starting from that local optimum shouldn't change the
    # solution
    new_centers = KMeans(n_clusters=3, init=centers,
                         n_init=1).fit(data).cluster_centers_
    assert_array_almost_equal(centers, new_centers)
def test_result_of_kmeans_equal_in_diff_n_jobs():
    # PR 9288
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(50, 10))

    result_1 = KMeans(n_clusters=3, random_state=0, n_jobs=1).fit(X).labels_
    result_2 = KMeans(n_clusters=3, random_state=0, n_jobs=2).fit(X).labels_
    assert_array_equal(result_1, result_2)
def test_k_means_copyx():
    # Check that with copy_x=False, X is (nearly) restored after the
    # in-place centering done during fit.
    my_X = X.copy()
    km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42)
    km.fit(my_X)
    _check_fitted_model(km)

    # check that my_X was de-centered back to its original values
    assert_array_almost_equal(my_X, X)
def test_k_means_n_init():
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 2))

    # two regression tests on bad n_init argument
    # previous bug: n_init <= 0 threw non-informative TypeError (#3858)
    with pytest.raises(ValueError, match="n_init"):
        KMeans(n_init=0).fit(X)
    with pytest.raises(ValueError, match="n_init"):
        KMeans(n_init=-1).fit(X)
def test_supervised_cluster_scorers():
    # Test clustering scorers against gold standard labeling.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    km = KMeans(n_clusters=3)
    km.fit(X_train)
    for name in CLUSTER_SCORERS:
        score1 = get_scorer(name)(km, X_test, y_test)
        score2 = getattr(cluster_module, name)(y_test, km.predict(X_test))
        assert_almost_equal(score1, score2)
def test_transform():
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    X_new = km.transform(km.cluster_centers_)

    for c in range(n_clusters):
        assert X_new[c, c] == 0
        for c2 in range(n_clusters):
            if c != c2:
                assert X_new[c, c2] > 0
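# The assertions in test_transform rely on KMeans.transform mapping samples
# into cluster-distance space (euclidean distance to every center), so
# transforming the centers themselves yields a square matrix with a zero
# diagonal.  A minimal standalone sketch of that reference computation; the
# helper below is illustrative only and not used by the test suite.
def _center_distance_reference(centers):
    diffs = centers[:, np.newaxis, :] - centers[np.newaxis, :, :]
    # same shape as km.transform(centers), values equal up to floating point noise
    return np.sqrt((diffs ** 2).sum(axis=-1))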
def test_k_means_empty_cluster_relocated():
    # check that empty clusters are correctly relocated when using sample
    # weights (#13486)
    X = np.array([[-1], [1]])
    sample_weight = [1.9, 0.1]
    init = np.array([[-1], [10]])

    km = KMeans(n_clusters=2, init=init, n_init=1)
    km.fit(X, sample_weight=sample_weight)

    assert len(set(km.labels_)) == 2
    assert_allclose(km.cluster_centers_, [[-1], [1]])
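# Background for test_k_means_empty_cluster_relocated: both samples are closer
# to the first initial center (-1) than to the second (10), so without
# relocation the second cluster would end up empty.  The arithmetic below only
# illustrates that initial assignment; the actual relocation logic lives in the
# library.  The helper name is illustrative, not part of the test suite.
def _initial_assignment_demo():
    X_demo = np.array([[-1.], [1.]])
    init_demo = np.array([[-1.], [10.]])
    distances = np.abs(X_demo - init_demo.T)  # pairwise |x - c|
    return distances.argmin(axis=1)           # -> array([0, 0]): cluster 1 empty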
def test_k_means_init_centers():
    # This test is used to check KMeans won't mutate the user provided input
    # array silently even if input data and init centers have the same type
    X_small = np.array([[1.1, 1.1], [-7.5, -7.5], [-1.1, -1.1], [7.5, 7.5]])
    init_centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, -5.0]])
    for dtype in [np.int32, np.int64, np.float32, np.float64]:
        X_test = dtype(X_small)
        init_centers_test = dtype(init_centers)
        assert_array_equal(init_centers, init_centers_test)
        km = KMeans(init=init_centers_test, n_clusters=3, n_init=1)
        km.fit(X_test)
        assert np.may_share_memory(km.cluster_centers_,
                                   init_centers) is False
def test_k_means_fortran_aligned_data():
    # Check that KMeans works correctly even when X is Fortran-ordered.
    X = np.asfortranarray([[0, 0], [0, 1], [0, 1]])
    centers = np.array([[0, 0], [0, 1]])
    labels = np.array([0, 1, 1])
    km = KMeans(n_init=1, init=centers, precompute_distances=False,
                random_state=42, n_clusters=2)
    km.fit(X)
    assert_array_almost_equal(km.cluster_centers_, centers)
    assert_array_equal(km.labels_, labels)
def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(n_components=n_clusters,
                               affinity="rbf",
                               random_state=random_state)
    se_knn = SpectralEmbedding(n_components=n_clusters,
                               affinity="nearest_neighbors",
                               n_neighbors=5,
                               random_state=random_state)
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2)
def test_sparse_validate_centers():
    from mrex.datasets import load_iris

    iris = load_iris()
    X = iris.data

    # Get a local optimum
    centers = KMeans(n_clusters=4).fit(X).cluster_centers_

    # Test that a ValueError is raised for validate_center_shape
    classifier = KMeans(n_clusters=3, init=centers, n_init=1)

    msg = r"The shape of the initial centers \(\(4L?, 4L?\)\) " \
          "does not match the number of clusters 3"
    with pytest.raises(ValueError, match=msg):
        classifier.fit(X)
def test_scoring_is_not_metric():
    assert_raises_regexp(ValueError, 'make_scorer', check_scoring,
                         LogisticRegression(), f1_score)
    assert_raises_regexp(ValueError, 'make_scorer', check_scoring,
                         LogisticRegression(), roc_auc_score)
    assert_raises_regexp(ValueError, 'make_scorer', check_scoring,
                         Ridge(), r2_score)
    assert_raises_regexp(ValueError, 'make_scorer', check_scoring,
                         KMeans(), cluster_module.adjusted_rand_score)
def test_k_means_non_collapsed():
    # Check that k_means with a bad initialization does not yield a singleton.
    # Starting with bad centers that are quickly ignored should not
    # result in a repositioning of the centers to the center of mass that
    # would lead to collapsed centers, which in turn would make the
    # clustering dependent on numerical instabilities.
    my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
    array_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
    km = KMeans(init=array_init, n_clusters=3, random_state=42, n_init=1)
    km.fit(my_X)

    # centers must not have collapsed
    assert len(np.unique(km.labels_)) == 3

    centers = km.cluster_centers_
    assert np.linalg.norm(centers[0] - centers[1]) >= 0.1
    assert np.linalg.norm(centers[0] - centers[2]) >= 0.1
    assert np.linalg.norm(centers[1] - centers[2]) >= 0.1
def test_k_means_new_centers():
    # Explore the part of the code where a new center is reassigned
    X = np.array([[0, 0, 1, 1],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0]])
    labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0],
                            [.2, 0, .2, .2],
                            [+0, 0, 0, 0]])

    km = KMeans(n_clusters=3, init=bad_centers, n_init=1, max_iter=10,
                random_state=1)
    for this_X in (X, sp.coo_matrix(X)):
        km.fit(this_X)
        this_labels = km.labels_
        # Reorder the labels so that the first instance is in cluster 0,
        # the second in cluster 1, ...
        this_labels = np.unique(this_labels,
                                return_index=True)[1][this_labels]
        np.testing.assert_array_equal(this_labels, labels)
def test_kmeans_results(representation, algo, dtype):
    # check that KMeans produces the expected results on a small dataset with
    # known labels, centers, inertia and number of iterations
    array_constr = {'dense': np.array,
                    'sparse': sp.csr_matrix}[representation]
    X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]
    init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)

    expected_labels = [0, 0, 1, 1]
    expected_inertia = 0.1875
    expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype)
    expected_n_iter = 2

    kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo)
    kmeans.fit(X, sample_weight=sample_weight)

    assert_array_equal(kmeans.labels_, expected_labels)
    assert_almost_equal(kmeans.inertia_, expected_inertia)
    assert_array_almost_equal(kmeans.cluster_centers_, expected_centers)
    assert kmeans.n_iter_ == expected_n_iter
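# Hand-derived reference for the expected values in test_kmeans_results,
# assuming (as the comment above notes) that the sample weights are rescaled
# to sum to n_samples: each center is the weighted mean of its cluster and the
# inertia is the weighted sum of squared distances to the assigned center.
# Standalone sketch; not used by the test itself.
def _weighted_kmeans_reference():
    X_ref = np.array([[0, 0], [0.5, 0], [0.5, 1], [1, 1]])
    w = np.array([3., 1., 1., 3.])
    w *= len(w) / w.sum()                       # -> [1.5, 0.5, 0.5, 1.5]
    labels = np.array([0, 0, 1, 1])
    centers = np.array([np.average(X_ref[labels == k], axis=0,
                                   weights=w[labels == k])
                        for k in (0, 1)])       # -> [[0.125, 0], [0.875, 1]]
    inertia = np.sum(w * ((X_ref - centers[labels]) ** 2).sum(axis=1))  # 0.1875
    return centers, inertia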
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [KMeans(init="k-means++", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init="random", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init=centers.copy(), n_clusters=n_clusters,
                         random_state=42),
                  MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                                  random_state=42)]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(v_measure_score(est_repeated.labels_,
                                            repeated_labels), 1.0)
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_))
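# A minimal standalone illustration of the weight/repetition equivalence
# checked in test_weighted_vs_repeated, on a tiny 1-d dataset.  All names
# below are local to this sketch; it is not used by the tests.
def _weight_vs_repeat_demo():
    X_demo = np.array([[0.0], [0.1], [10.0], [10.2]])
    w_demo = np.array([1, 3, 2, 1])
    km_weighted = KMeans(n_clusters=2, random_state=0).fit(
        X_demo, sample_weight=w_demo)
    km_repeated = KMeans(n_clusters=2, random_state=0).fit(
        np.repeat(X_demo, w_demo, axis=0))
    # up to the ordering of the clusters, both fits should find the same centers
    return km_weighted.cluster_centers_, km_repeated.cluster_centers_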
def test_scaled_weights():
    # scaling all sample weights by a common factor
    # shouldn't change the result
    sample_weight = np.ones(n_samples)
    for estimator in [KMeans(n_clusters=n_clusters, random_state=42),
                      MiniBatchKMeans(n_clusters=n_clusters,
                                      random_state=42)]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=0.5 * sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
def test_unit_weights_vs_no_weights():
    # not passing any sample weights should be equivalent
    # to all weights equal to one
    sample_weight = np.ones(n_samples)
    for estimator in [KMeans(n_clusters=n_clusters, random_state=42),
                      MiniBatchKMeans(n_clusters=n_clusters,
                                      random_state=42)]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    iris = load_iris()
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([
        ('scaler', scaler_for_pipeline),
        ('Kmeans', km_for_pipeline)
    ])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
def test_elkan_results(distribution):
    # check that results are identical between lloyd and elkan algorithms
    rnd = np.random.RandomState(0)
    if distribution == 'normal':
        X = rnd.normal(size=(50, 10))
    else:
        X, _ = make_blobs(random_state=rnd)

    km_full = KMeans(algorithm='full', n_clusters=5,
                     random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan', n_clusters=5,
                      random_state=0, n_init=1)
    km_full.fit(X)
    km_elkan.fit(X)
    assert_array_almost_equal(km_elkan.cluster_centers_,
                              km_full.cluster_centers_)
    assert_array_equal(km_elkan.labels_, km_full.labels_)
def test_n_init():
    # Check that increasing the number of inits increases the quality
    n_runs = 5
    n_init_range = [1, 5, 10]
    inertia = np.zeros((len(n_init_range), n_runs))
    for i, n_init in enumerate(n_init_range):
        for j in range(n_runs):
            km = KMeans(n_clusters=n_clusters, init="random", n_init=n_init,
                        random_state=j).fit(X)
            inertia[i, j] = km.inertia_

    inertia = inertia.mean(axis=1)
    failure_msg = ("Inertia %r should be decreasing"
                   " when n_init is increasing.") % list(inertia)
    for i in range(len(n_init_range) - 1):
        assert inertia[i] >= inertia[i + 1], failure_msg
def test_less_centers_than_unique_points():
    X = np.asarray([[0, 0],
                    [0, 1],
                    [1, 0],
                    [1, 0]])  # last point is duplicated

    km = KMeans(n_clusters=4).fit(X)

    # only three distinct points, so only three clusters
    # can have points assigned to them
    assert set(km.labels_) == set(range(3))

    # k_means should warn that fewer labels than cluster
    # centers have been used
    msg = ("Number of distinct clusters (3) found smaller than "
           "n_clusters (4). Possibly due to duplicate points in X.")
    assert_warns_message(ConvergenceWarning, msg, k_means, X,
                         sample_weight=None, n_clusters=4)
def test_score(algo):
    # Check that allowing k-means more iterations gives a better score
    km1 = KMeans(n_clusters=n_clusters, max_iter=1, random_state=42, n_init=1,
                 algorithm=algo)
    s1 = km1.fit(X).score(X)
    km2 = KMeans(n_clusters=n_clusters, max_iter=10, random_state=42, n_init=1,
                 algorithm=algo)
    s2 = km2.fit(X).score(X)
    assert s2 > s1
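# Context for test_score: KMeans.score returns the opposite of the inertia
# (sum of squared distances to the closest center) on the given data, so a fit
# that is allowed to iterate further is expected to reach a larger, i.e. less
# negative, score.  Minimal sketch assuming the module-level X fixture; the
# helper name is illustrative only and not used by the tests.
def _score_is_negative_inertia_demo():
    km = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    # on the training data these two quantities agree up to floating point noise
    return km.score(X), -km.inertia_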
def test_max_iter_error():
    km = KMeans(max_iter=-1)
    assert_raise_message(ValueError, 'Number of iterations should be',
                         km.fit, X)
def test_full_vs_elkan():
    km1 = KMeans(algorithm='full', random_state=13).fit(X)
    km2 = KMeans(algorithm='elkan', random_state=13).fit(X)

    assert homogeneity_score(km1.predict(X), km2.predict(X)) == 1.0