def test_sparse(self):
    """K-means must produce identical centers and labels whether the
    input ds-array is stored sparse or dense."""
    svm_file = "tests/files/libsvm/2"

    # Load the very same libsvm file twice: sparse first, then dense.
    data_sparse, _ = ds.load_svmlight_file(svm_file, (10, 300), 780, True)
    data_dense, _ = ds.load_svmlight_file(svm_file, (10, 300), 780, False)

    model_sparse = KMeans(random_state=170)
    labels_sparse = model_sparse.fit_predict(data_sparse).collect()
    # Sparse centers come back as a scipy matrix; densify for comparison.
    centers_sparse = model_sparse.centers.toarray()

    model_dense = KMeans(random_state=170)
    labels_dense = model_dense.fit_predict(data_dense).collect()
    centers_dense = model_dense.centers

    self.assertTrue(np.allclose(centers_sparse, centers_dense))
    self.assertTrue(np.array_equal(labels_sparse, labels_dense))
def test_kmeans(self):
    """K-means on a Hecuba-persistent array must match K-means on a
    regular in-memory ds-array (same centers, same labels)."""
    # Reset Hecuba storage so earlier runs cannot interfere.
    config.session.execute("TRUNCATE TABLE hecuba.istorage")
    config.session.execute("DROP KEYSPACE IF EXISTS hecuba_dislib")

    points, blob_ids = make_blobs(n_samples=1500, random_state=170)
    # Keep unevenly sized clusters: 500 / 100 / 10 samples.
    uneven = np.vstack(
        (points[blob_ids == 0][:500],
         points[blob_ids == 1][:100],
         points[blob_ids == 2][:10]))
    blocks = (uneven.shape[0] // 10, uneven.shape[1])

    train_plain = ds.array(uneven, block_size=blocks)
    train_persistent = ds.array(x=uneven, block_size=blocks)
    train_persistent.make_persistent(name="hecuba_dislib.test_array")

    model_plain = KMeans(n_clusters=3, random_state=170)
    labels_plain = model_plain.fit_predict(train_plain).collect()

    model_persistent = KMeans(n_clusters=3, random_state=170)
    labels_persistent = model_persistent.fit_predict(train_persistent).collect()

    self.assertTrue(np.allclose(model_plain.centers,
                                model_persistent.centers))
    self.assertTrue(np.allclose(labels_plain, labels_persistent))
def test_fit_predict(self):
    """fit_predict must agree with scikit-learn's KMeans and with the
    known center coordinates for this fixed-seed dataset."""
    points, blob_ids = make_blobs(n_samples=1500, random_state=170)
    # Unevenly sized clusters: 500 / 100 / 10 samples.
    uneven = np.vstack(
        (points[blob_ids == 0][:500],
         points[blob_ids == 1][:100],
         points[blob_ids == 2][:10]))

    train = ds.array(uneven, block_size=(300, 2))
    model = KMeans(n_clusters=3, random_state=170)
    predicted = model.fit_predict(train).collect()

    reference = SKMeans(n_clusters=3, random_state=170)
    expected = reference.fit_predict(uneven)

    # Centers known a priori for random_state=170 on this data.
    known_centers = np.array(
        [[-8.941375656533449, -5.481371322614891],
         [-4.524023204953875, 0.06235042593214654],
         [2.332994701667008, 0.37681003933082696]])

    self.assertTrue(np.allclose(known_centers, model.centers))
    self.assertTrue(np.allclose(predicted, expected))
def _fit_and_plot(subplot, points, n_clusters, title, random_state):
    """Fit K-means on *points* (an (n, 2) ndarray) and draw the labelled
    scatter plus the fitted centers (in red) in the given pyplot subplot.

    Parameters
    ----------
    subplot : int
        Three-digit pyplot subplot code (e.g. 221).
    points : ndarray, shape=(n_samples, 2)
        2-D samples to cluster and plot.
    n_clusters : int
        Number of clusters for KMeans.
    title : str
        Subplot title.
    random_state : int
        Seed forwarded to KMeans for reproducibility.
    """
    dis_x = ds.array(points, block_size=(300, 2))
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    y_pred = kmeans.fit_predict(dis_x).collect()
    plt.subplot(subplot)
    plt.scatter(points[:, 0], points[:, 1], c=y_pred)
    centers = kmeans.centers
    plt.scatter(centers[:, 0], centers[:, 1], c="red")
    plt.title(title)


def main():
    """ Usage example copied from scikit-learn's webpage. """
    plt.figure(figsize=(12, 12))

    n_samples = 1500
    random_state = 170
    x, y = make_blobs(n_samples=n_samples, random_state=random_state)

    # Incorrect number of clusters
    _fit_and_plot(221, x, 2, "Incorrect Number of Blobs", random_state)

    # Anisotropicly distributed data
    transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
    x_aniso = np.dot(x, transformation)
    _fit_and_plot(222, x_aniso, 3, "Anisotropicly Distributed Blobs",
                  random_state)

    # Different variance
    x_varied, _ = make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
                             random_state=random_state)
    _fit_and_plot(223, x_varied, 3, "Unequal Variance", random_state)

    # Unevenly sized blobs: 500 / 100 / 10 samples per cluster.
    x_filtered = np.vstack(
        (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))
    _fit_and_plot(224, x_filtered, 3, "Unevenly Sized Blobs", random_state)

    plt.show()
def _initialize_parameters(self, x, random_state):
    """Initialization of the Gaussian mixture parameters.

    Any of weights/means/precisions supplied by the user at construction
    time is taken as-is; the remaining ones are estimated from an initial
    responsibility matrix built either from a K-means run or at random.

    Parameters
    ----------
    x : ds-array, shape=(n_samples, n_features)
        Data points.
    random_state : RandomState
        A random number generator instance.
    """
    # --- user-provided parameters take precedence -----------------------
    if self.weights_init is not None:
        # Normalize so the mixture weights sum to 1.
        self.weights_ = self.weights_init / np.sum(self.weights_init)
    if self.means_init is not None:
        self.means_ = self.means_init
    if self.precisions_init is not None:
        # Store the Cholesky factor of the precision(s); its layout
        # depends on the covariance parameterization.
        if self.covariance_type == 'full':
            self.precisions_cholesky_ = np.array(
                [linalg.cholesky(prec_init, lower=True)
                 for prec_init in self.precisions_init])
        elif self.covariance_type == 'tied':
            self.precisions_cholesky_ = linalg.cholesky(
                self.precisions_init, lower=True)
        else:
            # 'diag' / 'spherical': precisions are already element-wise.
            self.precisions_cholesky_ = self.precisions_init

    # --- estimate whatever the user did not provide ---------------------
    initialize_params = (self.weights_init is None
                         or self.means_init is None
                         or self.precisions_init is None)
    if initialize_params:
        n_components = self.n_components
        # Row-blocks of the responsibility matrix, built per block of x.
        resp_blocks = []
        if self.init_params == 'kmeans':
            if self.verbose:
                print("KMeans initialization start")
            seed = random_state.randint(0, int(1e8))
            kmeans = KMeans(n_clusters=n_components, random_state=seed,
                            verbose=self.verbose)
            y = kmeans.fit_predict(x)
            self.kmeans = kmeans
            # One-hot encode the K-means labels block by block.
            for y_part in y._iterator(axis=0):
                resp_blocks.append([_resp_subset(y_part._blocks,
                                                 n_components)])
        elif self.init_params == 'random':
            chunks = x._n_blocks[0]
            # Independent seed per row-block for reproducible randomness.
            seeds = random_state.randint(np.iinfo(np.int32).max,
                                         size=chunks)
            for i, x_row in enumerate(x._iterator(axis=0)):
                resp_blocks.append([_random_resp_subset(x_row.shape[0],
                                                        n_components,
                                                        seeds[i])])
        else:
            raise ValueError("Unimplemented initialization method '%s'"
                             % self.init_params)
        # Assemble the (n_samples, n_components) responsibility ds-array,
        # mirroring x's row blocking.
        resp = Array(blocks=resp_blocks,
                     top_left_shape=(x._top_left_shape[0], n_components),
                     reg_shape=(x._reg_shape[0], n_components),
                     shape=(x.shape[0], n_components), sparse=False)
        weights, nk, means = self._estimate_parameters(x, resp)
        if self.means_init is None:
            self.means_ = means
        if self.weights_init is None:
            self.weights_ = weights
        if self.precisions_init is None:
            cov, p_c = _estimate_covariances(x, resp, nk, self.means_,
                                             self.reg_covar,
                                             self.covariance_type,
                                             self.arity)
            self.covariances_ = cov
            self.precisions_cholesky_ = p_c
        # The responsibilities are only needed for initialization; free
        # the COMPSs-managed blocks explicitly.
        for resp_block in resp._blocks:
            compss_delete_object(resp_block)