def test_sample_weight_length():
    # check that an error is raised when passing sample weights
    # with an incompatible shape
    km = KMeans(n_clusters=n_clusters, random_state=42)
    msg = r'sample_weight.shape == \(2,\), expected \(100,\)'
    with pytest.raises(ValueError, match=msg):
        km.fit(X, sample_weight=np.ones(2))
def test_predict_equal_labels(algo):
    km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1,
                algorithm=algo)
    km.fit(X)
    assert_array_equal(km.predict(X), km.labels_)
def test_k_means_copyx():
    # Check that copy_x=False returns a nearly equal X after de-centering.
    my_X = X.copy()
    km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42)
    km.fit(my_X)
    _check_fitted_model(km)

    # check that my_X was restored (de-centered) to its original values
    assert_array_almost_equal(my_X, X)
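
# What "de-centering" means above: with copy_x=False, dense data is
# mean-centered in place before the Lloyd iterations (for numerical accuracy)
# and the mean is added back once fitting is done, so X is restored up to
# floating point error. A minimal standalone sketch of that round trip (just
# the arithmetic the test relies on, not the library's internal code):
def sketch_centering_roundtrip():
    rng = np.random.RandomState(0)
    A = rng.rand(5, 2)
    A_orig = A.copy()
    mean = A.mean(axis=0)
    A -= mean  # in-place centering, as done before fitting
    A += mean  # mean added back after fitting
    assert_array_almost_equal(A, A_orig)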
def test_supervised_cluster_scorers():
    # Test clustering scorers against gold standard labeling.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    km = KMeans(n_clusters=3)
    km.fit(X_train)
    for name in CLUSTER_SCORERS:
        score1 = get_scorer(name)(km, X_test, y_test)
        score2 = getattr(cluster_module, name)(y_test, km.predict(X_test))
        assert_almost_equal(score1, score2)
def test_transform():
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    X_new = km.transform(km.cluster_centers_)

    for c in range(n_clusters):
        assert X_new[c, c] == 0
        for c2 in range(n_clusters):
            if c != c2:
                assert X_new[c, c2] > 0
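
# For context: KMeans.transform maps samples into cluster-distance space,
# i.e. column j holds each sample's Euclidean distance to center j, which is
# why the diagonal above is exactly zero. A minimal sketch of that
# equivalence; the euclidean_distances import path is an assumption, mirroring
# scikit-learn's metrics module:
def sketch_transform_is_center_distances():
    from mrex.metrics.pairwise import euclidean_distances  # assumed path
    km = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    assert_array_almost_equal(km.transform(X),
                              euclidean_distances(X, km.cluster_centers_))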
def test_k_means_empty_cluster_relocated():
    # check that empty clusters are correctly relocated when using sample
    # weights (#13486)
    X = np.array([[-1], [1]])
    sample_weight = [1.9, 0.1]
    init = np.array([[-1], [10]])

    km = KMeans(n_clusters=2, init=init, n_init=1)
    km.fit(X, sample_weight=sample_weight)

    assert len(set(km.labels_)) == 2
    assert_allclose(km.cluster_centers_, [[-1], [1]])
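
# Why the relocation lands on x=1: with init (-1, 10), both samples are first
# assigned to the center at -1, leaving the second cluster empty. The empty
# cluster is then relocated toward a sample far from its center; the sketch
# below illustrates the weighted-inertia view of that choice (an assumed
# reading of the criterion for this toy case, not the library's internals):
def sketch_weighted_relocation_target():
    X = np.array([[-1.], [1.]])
    sample_weight = np.array([1.9, 0.1])
    center = np.array([-1.])
    # weighted squared distances to the only non-empty center
    contrib = sample_weight * (X[:, 0] - center[0]) ** 2  # -> [0.0, 0.4]
    assert int(np.argmax(contrib)) == 1  # the sample at x=1 is chosen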
def test_k_means_init_centers():
    # This test checks that KMeans won't silently mutate the user-provided
    # init array, even when the input data and the init centers have the
    # same type
    X_small = np.array([[1.1, 1.1], [-7.5, -7.5], [-1.1, -1.1], [7.5, 7.5]])
    init_centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, -5.0]])
    for dtype in [np.int32, np.int64, np.float32, np.float64]:
        X_test = dtype(X_small)
        init_centers_test = dtype(init_centers)
        assert_array_equal(init_centers, init_centers_test)
        km = KMeans(init=init_centers_test, n_clusters=3, n_init=1)
        km.fit(X_test)
        assert np.may_share_memory(km.cluster_centers_, init_centers) is False
def test_k_means_fortran_aligned_data():
    # Check that KMeans works well even if X is Fortran-aligned data.
    X = np.asfortranarray([[0, 0], [0, 1], [0, 1]])
    centers = np.array([[0, 0], [0, 1]])
    labels = np.array([0, 1, 1])
    km = KMeans(n_init=1, init=centers, precompute_distances=False,
                random_state=42, n_clusters=2)
    km.fit(X)
    assert_array_almost_equal(km.cluster_centers_, centers)
    assert_array_equal(km.labels_, labels)
def test_score(algo):
    # Check that fitting k-means with more iterations gives a better score
    km1 = KMeans(n_clusters=n_clusters, max_iter=1, random_state=42, n_init=1,
                 algorithm=algo)
    s1 = km1.fit(X).score(X)
    km2 = KMeans(n_clusters=n_clusters, max_iter=10, random_state=42,
                 n_init=1, algorithm=algo)
    s2 = km2.fit(X).score(X)
    assert s2 > s1
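
# For reference: KMeans.score returns the negative inertia, i.e. the opposite
# of the sum of squared distances to the closest centers, so a better fit
# means a larger (less negative) score. A minimal sketch of that
# relationship, using this module's fixtures:
def sketch_score_is_negative_inertia():
    km = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    assert_almost_equal(km.score(X), -km.inertia_)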
def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(n_components=n_clusters,
                               affinity="rbf",
                               random_state=random_state)
    se_knn = SpectralEmbedding(n_components=n_clusters,
                               affinity="nearest_neighbors",
                               n_neighbors=5,
                               random_state=random_state)
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2)
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
    # check that fit().predict() gives the same result as fit_predict()
    # There's a very small chance of failure with elkan on an unstructured
    # dataset because the predict method uses fast euclidean distances
    # computation, which may cause small numerical instabilities.
    # NB: This test is largely redundant with respect to test_predict and
    # test_predict_equal_labels. It has the added effect of testing
    # idempotence of the fitting procedure, which appears to be where it
    # fails on some MacOS setups.
    if sys.platform == "darwin":
        pytest.xfail(
            "Known failures on MacOS, See "
            "https://github.com/scikit-learn/scikit-learn/issues/12644")
    if not (algo == 'elkan' and constructor is sp.csr_matrix):
        rng = np.random.RandomState(seed)

        X = make_blobs(n_samples=1000, n_features=10, centers=10,
                       random_state=rng)[0].astype(dtype, copy=False)
        X = constructor(X)

        kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                        tol=tol, max_iter=max_iter, n_jobs=1)

        labels_1 = kmeans.fit(X).predict(X)
        labels_2 = kmeans.fit_predict(X)
        assert_array_equal(labels_1, labels_2)
def test_sparse_validate_centers():
    from mrex.datasets import load_iris

    iris = load_iris()
    X = iris.data

    # Get a local optimum
    centers = KMeans(n_clusters=4).fit(X).cluster_centers_

    # Test that a ValueError is raised for validate_center_shape
    classifier = KMeans(n_clusters=3, init=centers, n_init=1)

    msg = r"The shape of the initial centers \(\(4L?, 4L?\)\) " \
          "does not match the number of clusters 3"
    with pytest.raises(ValueError, match=msg):
        classifier.fit(X)
def test_k_means_non_collapsed():
    # Check that k_means with a bad initialization does not yield a singleton.
    # Starting with bad centers that are quickly ignored should not
    # result in a repositioning of the centers to the center of mass that
    # would lead to collapsed centers, which in turn would make the
    # clustering dependent on numerical instabilities.
    my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
    array_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
    km = KMeans(init=array_init, n_clusters=3, random_state=42, n_init=1)
    km.fit(my_X)

    # centers must not have collapsed
    assert len(np.unique(km.labels_)) == 3

    centers = km.cluster_centers_
    assert np.linalg.norm(centers[0] - centers[1]) >= 0.1
    assert np.linalg.norm(centers[0] - centers[2]) >= 0.1
    assert np.linalg.norm(centers[1] - centers[2]) >= 0.1
def test_k_means_new_centers():
    # Explore the part of the code where a new center is reassigned
    X = np.array([[0, 0, 1, 1],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0]])
    labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0],
                            [.2, 0, .2, .2],
                            [+0, 0, 0, 0]])

    km = KMeans(n_clusters=3, init=bad_centers, n_init=1, max_iter=10,
                random_state=1)
    for this_X in (X, sp.coo_matrix(X)):
        km.fit(this_X)
        this_labels = km.labels_
        # Reorder the labels so that the first instance is in cluster 0,
        # the second in cluster 1, ...
        this_labels = np.unique(this_labels,
                                return_index=True)[1][this_labels]
        np.testing.assert_array_equal(this_labels, labels)
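
# How the relabeling idiom above works: np.unique(labels, return_index=True)
# returns the sorted unique cluster ids together with the index of each id's
# first occurrence; indexing the first-occurrence array by the labels renames
# every cluster to the position where it first appeared. This canonicalizes
# the labels whenever the reference labeling numbers clusters in order of
# first appearance, as `labels` above does. A standalone sketch:
def sketch_relabeling_idiom():
    permuted = np.array([2, 0, 1, 0, 0, 1])
    first_occurrence = np.unique(permuted, return_index=True)[1]  # [1, 2, 0]
    canonical = first_occurrence[permuted]
    np.testing.assert_array_equal(canonical, [0, 1, 2, 1, 1, 2])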
def test_elkan_results(distribution):
    # check that results are identical between lloyd and elkan algorithms
    rnd = np.random.RandomState(0)
    if distribution == 'normal':
        X = rnd.normal(size=(50, 10))
    else:
        X, _ = make_blobs(random_state=rnd)

    km_full = KMeans(algorithm='full', n_clusters=5,
                     random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan', n_clusters=5,
                      random_state=0, n_init=1)

    km_full.fit(X)
    km_elkan.fit(X)
    assert_array_almost_equal(km_elkan.cluster_centers_,
                              km_full.cluster_centers_)
    assert_array_equal(km_elkan.labels_, km_full.labels_)
def test_kmeans_results(representation, algo, dtype):
    # checks that KMeans works as intended
    array_constr = {'dense': np.array,
                    'sparse': sp.csr_matrix}[representation]
    X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]
    init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)

    expected_labels = [0, 0, 1, 1]
    expected_inertia = 0.1875
    expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype)
    expected_n_iter = 2

    kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo)
    kmeans.fit(X, sample_weight=sample_weight)

    assert_array_equal(kmeans.labels_, expected_labels)
    assert_almost_equal(kmeans.inertia_, expected_inertia)
    assert_array_almost_equal(kmeans.cluster_centers_, expected_centers)
    assert kmeans.n_iter_ == expected_n_iter
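
# The rescaling noted in the inline comment above: sample weights are
# normalized so they sum to n_samples, which leaves the optimal clustering
# unchanged but keeps inertia on a comparable scale across weightings. A
# minimal sketch of the arithmetic:
def sketch_sample_weight_rescaling():
    w = np.asarray([3., 1., 1., 3.])
    scaled = w * (len(w) / w.sum())  # scale factor 4 / 8 = 0.5
    assert_allclose(scaled, [1.5, 0.5, 0.5, 1.5])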
              data=data)

# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
              name="PCA-based",
              data=data)
print(82 * '_')

# #############################################################################
# Visualize the results on PCA-reduced data

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
# #############################################################################
# Generate sample data
np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)

# #############################################################################
# Compute clustering with KMeans

k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0

# #############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
                      n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
def test_iter_attribute():
    # Regression test on bad n_iter_ value. Previous bug: n_iter_ was one off
    # its correct value (#11340).
    estimator = KMeans(algorithm="elkan", max_iter=1)
    estimator.fit(np.random.rand(10, 10))
    assert estimator.n_iter_ == 1
def test_k_means_precompute_distances_flag():
    # check that a ValueError is raised if the precompute_distances flag is
    # not supported
    km = KMeans(precompute_distances="wrong")
    with pytest.raises(ValueError):
        km.fit(X)
def test_k_means_init(data, init):
    km = KMeans(init=init, n_clusters=n_clusters, random_state=42, n_init=1)
    km.fit(data)
    _check_fitted_model(km)