def testKMeansFitPredict(self):
    # fit(X).predict(X) and fit_predict(X) must yield the same clustering.
    cases = [
        # (seed, max_iter, tol)
        (0, 2, 1e-7),    # strict non-convergence
        (1, 2, 1e-1),    # loose non-convergence
        (3, 300, 1e-7),  # strict convergence
        (4, 300, 1e-1),  # loose convergence
    ]
    for algorithm in ('full', 'elkan'):
        for seed, max_iter, tol in cases:
            state = np.random.RandomState(seed)
            X = make_blobs(n_samples=1000, n_features=10, centers=10,
                           random_state=state)[0]

            est = KMeans(algorithm=algorithm, n_clusters=10,
                         random_state=seed, tol=tol, max_iter=max_iter,
                         init='k-means++')
            via_predict = est.fit(X).predict(X)
            via_fit_predict = est.fit_predict(X)

            # With more than one thread the order in which data chunks are
            # processed is random, so the absolute label values may differ
            # between the two strategies even though the partitions agree;
            # compare the clusterings instead of the raw labels.
            self.assertAlmostEqual(
                v_measure_score(via_predict, via_fit_predict), 1)
def testKMeansInit(self):
    # Non-centered data with sparse centers: every supported init method
    # must produce a properly fitted model, on dense and CSR input.
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    n_samples = 100
    n_clusters, n_features = centers.shape
    X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)
    X_csr = sp.csr_matrix(X)

    for data in [X, X_csr]:
        for init in ['random', 'k-means++', 'k-means||', centers.copy()]:
            # Bind the chunked tensor to a fresh name: the original code
            # rebound the loop variable ``data``, so from the second
            # ``init`` iteration on it re-wrapped an already-wrapped
            # tensor instead of the raw dense/sparse input.
            chunked = mt.tensor(data, chunk_size=50)
            km = KMeans(init=init, n_clusters=n_clusters, random_state=42,
                        n_init=1, algorithm='elkan')
            km.fit(chunked)
            self._check_fitted_model(km, n_clusters, n_features, true_labels)

    # k-means|| on a trivially separable dataset must recover the two
    # obvious cluster centers (order-independent comparison).
    X = mt.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])
    kmeans = KMeans(n_clusters=2, random_state=0, n_init=1,
                    init='k-means||').fit(X)
    self.assertEqual(sorted(kmeans.cluster_centers_.fetch().tolist()),
                     sorted([[10., 2.], [1., 2.]]))
def testConsistentResultWithSklearn(self):
    # The elkan implementation must reproduce scikit-learn's result exactly
    # (centers, labels and iteration count).
    rnd = np.random.RandomState(0)
    raw, _ = make_blobs(random_state=rnd)
    X = mt.tensor(raw, chunk_size=50)

    params = dict(algorithm='elkan', n_clusters=5, random_state=0,
                  n_init=1, tol=1e-4, init='k-means++')
    mars_est = KMeans(**params)
    sk_est = SK_KMeans(**params)
    mars_est.fit(X)
    sk_est.fit(raw)

    np.testing.assert_allclose(mars_est.cluster_centers_,
                               sk_est.cluster_centers_)
    np.testing.assert_array_equal(mars_est.labels_, sk_est.labels_)
    self.assertEqual(mars_est.n_iter_, sk_est.n_iter_)
def testRelocatedClusters(self):
    # Empty clusters must be relocated as expected: the second initial
    # center is far from every point, so its cluster is empty after the
    # first iteration.
    init_centers = np.array([[0.5, 0.5], [3, 3]])

    expected_labels = [0, 0, 1, 1]
    expected_inertia = 0.25
    expected_centers = [[0.25, 0], [0.75, 1]]
    expected_n_iter = 3

    constructors = {'dense': np.array, 'sparse': sp.csr_matrix}
    for representation, array_constr in constructors.items():
        X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]])
        for algo in ('full', 'elkan'):
            kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers,
                            algorithm=algo)
            kmeans.fit(X)

            np.testing.assert_array_equal(kmeans.labels_, expected_labels)
            np.testing.assert_almost_equal(kmeans.inertia_, expected_inertia)
            np.testing.assert_array_almost_equal(kmeans.cluster_centers_,
                                                 expected_centers)
            self.assertEqual(kmeans.n_iter_, expected_n_iter)
def testKMeansFortranAlignedData(self):
    # KMeans must handle Fortran-ordered (column-major) input arrays.
    X = np.asfortranarray([[0, 0], [0, 1], [0, 1]])
    centers = np.array([[0, 0], [0, 1]])
    expected_labels = np.array([0, 1, 1])

    model = KMeans(n_init=1, init=centers, random_state=42, n_clusters=2)
    model.fit(X)

    np.testing.assert_array_almost_equal(model.cluster_centers_, centers)
    np.testing.assert_array_equal(model.labels_, expected_labels)
def testKMeansNInit(self):
    # Regression test for #3858: n_init <= 0 used to raise an
    # uninformative TypeError; it must raise a ValueError that
    # mentions "n_init".
    X = np.random.RandomState(0).normal(size=(40, 2))

    for bad_n_init in (0, -1):
        with pytest.raises(ValueError, match="n_init"):
            KMeans(n_init=bad_n_init, init='k-means++').fit(X)
def test_k_means_fortran_aligned_data(setup):
    # KMeans (elkan) must handle Fortran-ordered (column-major) input.
    X = np.asfortranarray([[0, 0], [0, 1], [0, 1]])
    centers = np.array([[0, 0], [0, 1]])
    expected_labels = np.array([0, 1, 1])

    model = KMeans(n_init=1, init=centers, random_state=42, n_clusters=2,
                   algorithm='elkan')
    model.fit(X)

    np.testing.assert_array_almost_equal(model.cluster_centers_, centers)
    np.testing.assert_array_equal(model.labels_, expected_labels)
def test_elkan_results(setup, distribution, tol):
    # lloyd ('full') and elkan must produce identical results on the
    # same data for the given tolerance.
    rnd = np.random.RandomState(0)
    if distribution == 'normal':
        X = rnd.normal(size=(5000, 10))
    else:
        X, _ = make_blobs(random_state=rnd)

    common = dict(n_clusters=5, random_state=0, n_init=1, tol=tol,
                  init='k-means++')
    km_full = KMeans(algorithm='full', **common)
    km_elkan = KMeans(algorithm='elkan', **common)
    km_full.fit(X)
    km_elkan.fit(X)

    np.testing.assert_allclose(km_elkan.cluster_centers_,
                               km_full.cluster_centers_)
    np.testing.assert_array_equal(km_elkan.labels_, km_full.labels_)
    assert km_elkan.n_iter_ == km_full.n_iter_
    assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6)
def testElkanResultsSparse(self):
    # lloyd ('full') and elkan must agree on sparse (CSR) input as well.
    for distribution in ('normal', 'blobs'):
        rnd = np.random.RandomState(0)
        if distribution == 'normal':
            X = sp.random(100, 100, density=0.1, format='csr',
                          random_state=rnd)
            X.data = rnd.randn(len(X.data))
        else:
            dense, _ = make_blobs(n_samples=100, n_features=100,
                                  random_state=rnd)
            X = sp.csr_matrix(dense)

        km_full = KMeans(algorithm='full', n_clusters=5, random_state=0,
                         n_init=1, init='k-means++')
        km_elkan = KMeans(algorithm='elkan', n_clusters=5, random_state=0,
                          n_init=1, init='k-means++')
        km_full.fit(X)
        km_elkan.fit(X)

        np.testing.assert_allclose(km_elkan.cluster_centers_,
                                   km_full.cluster_centers_)
        np.testing.assert_allclose(km_elkan.labels_, km_full.labels_)
def testFitTransform(self):
    # fit(X).transform(X) and fit_transform(X) must give the same result.
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    X = make_blobs(n_samples=100, centers=centers, cluster_std=1.,
                   random_state=42)[0]

    via_transform = KMeans(n_clusters=3, random_state=51,
                           init='k-means++').fit(X).transform(X)
    via_fit_transform = KMeans(n_clusters=3, random_state=51,
                               init='k-means++').fit_transform(X)
    np.testing.assert_array_almost_equal(via_transform, via_fit_transform)
def testLearnInLocalCluster(self, *_):
    # Run NearestNeighbors and KMeans end-to-end inside a freshly started
    # local mars cluster and compare the results against scikit-learn.
    from mars.learn.cluster import KMeans
    from mars.learn.neighbors import NearestNeighbors
    from sklearn.cluster import KMeans as SK_KMEANS
    from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors

    with new_cluster(scheduler_n_process=2, worker_n_process=3,
                     shared_memory='20M') as cluster:
        rs = np.random.RandomState(0)
        raw_X = rs.rand(10, 5)
        raw_Y = rs.rand(8, 5)
        # chunk sizes chosen so both tensors are split into multiple chunks
        X = mt.tensor(raw_X, chunk_size=7)
        Y = mt.tensor(raw_Y, chunk_size=(5, 3))
        nn = NearestNeighbors(n_neighbors=3)
        nn.fit(X)
        # executed inside the local cluster's session
        ret = nn.kneighbors(Y, session=cluster.session)

        # scikit-learn reference on the raw (non-chunked) arrays
        snn = SkNearestNeighbors(n_neighbors=3)
        snn.fit(raw_X)
        expected = snn.kneighbors(raw_Y)

        # ret is a (distances, indices) pair; both must match the reference
        result = [r.fetch() for r in ret]
        np.testing.assert_almost_equal(result[0], expected[0])
        np.testing.assert_almost_equal(result[1], expected[1])

        # KMeans on the same cluster against the scikit-learn result
        raw = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])
        X = mt.array(raw)
        kmeans = KMeans(n_clusters=2, random_state=0,
                        init='k-means++').fit(X)
        sk_km_elkan = SK_KMEANS(n_clusters=2, random_state=0,
                                init='k-means++').fit(raw)
        np.testing.assert_allclose(kmeans.cluster_centers_,
                                   sk_km_elkan.cluster_centers_)
def test_k_means_new_centers(setup):
    # Exercise the code path where an empty cluster gets a new center
    # reassigned, for both dense and sparse (COO) input.
    X = np.array([[0, 0, 1, 1],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0]])
    expected = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0],
                            [.2, 0, .2, .2],
                            [+0, 0, 0, 0]])

    km = KMeans(n_clusters=3, init=bad_centers, n_init=1, max_iter=10,
                random_state=1, algorithm='elkan')
    for this_X in (X, sp.coo_matrix(X)):
        km.fit(this_X)
        raw_labels = km.labels_.fetch()
        # Canonicalize labels so the first sample lands in cluster 0, the
        # next new cluster becomes 1, etc. — makes the comparison
        # independent of arbitrary label values.
        canonical = np.unique(raw_labels, return_index=True)[1][raw_labels]
        np.testing.assert_array_equal(canonical, expected)
def test_k_means_results(setup, representation, dtype, algo):
    # Check labels, inertia, centers and iteration count on a tiny
    # weighted dataset for the given representation/dtype/algorithm.
    constructors = {'dense': np.array, 'sparse': sp.csr_matrix}
    make = constructors[representation]

    X = make([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]
    init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)
    expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype)

    kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers,
                    algorithm=algo)
    kmeans.fit(X, sample_weight=sample_weight)

    np.testing.assert_array_equal(kmeans.labels_, [0, 0, 1, 1])
    np.testing.assert_almost_equal(kmeans.inertia_, 0.1875)
    np.testing.assert_array_almost_equal(kmeans.cluster_centers_,
                                         expected_centers)
    assert kmeans.n_iter_ == 2
def testScore(self):
    # Running more iterations from the same initialization must give a
    # strictly better score, for both algorithms.
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    n_clusters = centers.shape[0]
    X = make_blobs(n_samples=100, centers=centers, cluster_std=1.,
                   random_state=42)[0]

    for algo in ('full', 'elkan'):
        score_after_1 = KMeans(
            n_clusters=n_clusters, max_iter=1, random_state=42, n_init=1,
            algorithm=algo, init='k-means++').fit(X).score(X).fetch()
        score_after_10 = KMeans(
            n_clusters=n_clusters, max_iter=10, random_state=42, n_init=1,
            algorithm=algo, init='k-means++').fit(X).score(X).fetch()
        self.assertGreater(score_after_10, score_after_1)
def testTransform(self):
    # Transforming the fitted centers themselves must give a zero distance
    # on the diagonal and strictly positive distances elsewhere.
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    n_clusters = centers.shape[0]
    X = make_blobs(n_samples=100, centers=centers, cluster_std=1.,
                   random_state=42)[0]

    km = KMeans(n_clusters=n_clusters, init='k-means++', algorithm='elkan')
    km.fit(X)
    distances = km.transform(km.cluster_centers_).fetch()

    for row in range(n_clusters):
        for col in range(n_clusters):
            if row == col:
                assert distances[row, col] == 0
            else:
                assert distances[row, col] > 0
def test_k_means_fit_predict(setup, algo, seed, max_iter, tol):
    # fit(X).predict(X) and fit_predict(X) must produce the same clustering.
    rng = np.random.RandomState(seed)
    X = make_blobs(n_samples=1000, n_features=10, centers=10,
                   random_state=rng)[0]

    est = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                 tol=tol, max_iter=max_iter, init='k-means++')
    via_predict = est.fit(X).predict(X)
    via_fit_predict = est.fit_predict(X)

    # With several threads the chunk processing order is random, so the
    # absolute label values can differ between the two strategies even
    # though the clusterings agree; compare via v-measure instead.
    assert pytest.approx(v_measure_score(via_predict, via_fit_predict)) == 1
def testDistributedKMeans(self):
    # End-to-end KMeans against a running mars web service, compared with
    # scikit-learn; a second session smoke-tests k-means|| initialization.
    service_ep = 'http://127.0.0.1:' + self.web_port
    # on CI, guard against hangs; locally wait indefinitely (-1)
    timeout = 120 if 'CI' in os.environ else -1
    with new_session(service_ep) as sess:
        run_kwargs = {'timeout': timeout}
        rnd = np.random.RandomState(0)
        X, _ = make_blobs(random_state=rnd)
        raw = X
        # chunked tensor so the distributed execution path is exercised
        X = mt.tensor(X, chunk_size=50)

        km_elkan = KMeans(algorithm='elkan', n_clusters=5, random_state=0,
                          n_init=1, tol=1e-4, init='k-means++')
        sk_km_elkan = SK_KMEANS(algorithm='elkan', n_clusters=5,
                                random_state=0, n_init=1, tol=1e-4,
                                init='k-means++')

        km_elkan.fit(X, session=sess, run_kwargs=run_kwargs)
        sk_km_elkan.fit(raw)

        # the distributed run must match the scikit-learn reference exactly
        np.testing.assert_allclose(km_elkan.cluster_centers_,
                                   sk_km_elkan.cluster_centers_)
        np.testing.assert_array_equal(km_elkan.labels_, sk_km_elkan.labels_)
        self.assertEqual(km_elkan.n_iter_, sk_km_elkan.n_iter_)

    with new_session(service_ep) as sess2:
        run_kwargs = {'timeout': timeout}
        rnd = np.random.RandomState(0)
        X, _ = make_blobs(random_state=rnd)
        X = mt.tensor(X, chunk_size=50)

        # smoke test: scalable k-means|| init in distributed mode
        kmeans = KMeans(n_clusters=5, random_state=0, n_init=1,
                        tol=1e-4, init='k-means||')
        kmeans.fit(X, session=sess2, run_kwargs=run_kwargs)
def testKMeansInit(self):
    # Non-centered data with sparse centers: every supported init method
    # must produce a properly fitted model, on dense and CSR input.
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    n_clusters, n_features = centers.shape
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    X_csr = sp.csr_matrix(X)

    for data in (X, X_csr):
        for init in ('random', 'k-means++', 'k-means||', centers.copy()):
            km = KMeans(init=init, n_clusters=n_clusters, random_state=42,
                        n_init=1)
            km.fit(data)
            self._check_fitted_model(km, n_clusters, n_features, true_labels)
def testKMeansResults(self):
    # Check labels, inertia, centers and iteration count on a tiny
    # weighted dataset, across representations, dtypes and algorithms.
    constructors = {'dense': np.array, 'sparse': sp.csr_matrix}
    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]

    for array_constr in constructors.values():
        for dtype in (np.float32, np.float64):
            X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]],
                             dtype=dtype)
            init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)
            expected_centers = np.array([[0.125, 0], [0.875, 1]],
                                        dtype=dtype)

            for algo in ('full', 'elkan'):
                kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers,
                                algorithm=algo)
                kmeans.fit(X, sample_weight=sample_weight)

                np.testing.assert_array_equal(kmeans.labels_, [0, 0, 1, 1])
                np.testing.assert_almost_equal(kmeans.inertia_, 0.1875)
                np.testing.assert_array_almost_equal(kmeans.cluster_centers_,
                                                     expected_centers)
                self.assertEqual(kmeans.n_iter_, 2)
def testKMeansConvergence(self):
    # KMeans must stop as soon as convergence is reached when tol=0
    # instead of spinning until max_iter (regression test for #16075).
    # The original re-seeded RandomState(0) per algorithm, so both
    # algorithms see the identical dataset; hoisting preserves that.
    X = np.random.RandomState(0).normal(size=(5000, 10))
    for algorithm in ('full', 'elkan'):
        km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0,
                    n_init=1, tol=0, max_iter=300, init='k-means++').fit(X)
        self.assertLess(km.n_iter_, 300)
def testElkanResults(self):
    # lloyd ('full') and elkan must produce identical results across
    # data distributions and tolerances.
    for distribution in ('normal', 'blobs'):
        rnd = np.random.RandomState(0)
        if distribution == 'normal':
            X = rnd.normal(size=(5000, 10))
        else:
            X, _ = make_blobs(random_state=rnd)

        for tol in (1e-2, 1e-4, 1e-8):
            common = dict(n_clusters=5, random_state=0, n_init=1, tol=tol,
                          init='k-means++')
            km_full = KMeans(algorithm='full', **common)
            km_elkan = KMeans(algorithm='elkan', **common)
            km_full.fit(X)
            km_elkan.fit(X)

            np.testing.assert_allclose(km_elkan.cluster_centers_,
                                       km_full.cluster_centers_)
            np.testing.assert_array_equal(km_elkan.labels_, km_full.labels_)
            self.assertEqual(km_elkan.n_iter_, km_full.n_iter_)
            # pytest.approx defines __eq__, so assertEqual performs the
            # relative-tolerance comparison
            self.assertEqual(km_elkan.inertia_,
                             pytest.approx(km_full.inertia_, rel=1e-6))
def testKMeansExplicitInitShape(self):
    # Explicit (or callable) init arrays with the wrong number of features
    # or clusters must raise informative ValueErrors.
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 3))

    feature_msg = "does not match the number of features of the data"
    cluster_msg = "does not match the number of clusters"

    bad_inits = [
        # (init, n_clusters, expected error message)
        (X[:, :2], len(X), feature_msg),
        (lambda X_, k, random_state: X_[:, :2], len(X), feature_msg),
        (X[:2, :], 3, cluster_msg),
        (lambda X_, k, random_state: X_[:2, :], 3, cluster_msg),
    ]
    for init, n_clusters, msg in bad_inits:
        km = KMeans(n_init=1, init=init, n_clusters=n_clusters,
                    algorithm='elkan')
        with pytest.raises(ValueError, match=msg):
            km.fit(X)