def test_sparse(self):
    """Check that GaussianMixture labels do not depend on data storage.

    The same libsvm file is loaded once as sparse and once as dense data;
    for every covariance type, the labels returned by fit_predict() on
    both versions must be equal.
    """
    path = "tests/files/libsvm/2"
    x_sparse, _ = ds.load_svmlight_file(path, (10, 780), 780, True)
    x_dense, _ = ds.load_svmlight_file(path, (10, 780), 780, False)

    for cov_type in ('full', 'tied', 'diag', 'spherical'):
        # One estimator per covariance type, refitted on each version.
        model = GaussianMixture(n_components=4, random_state=0,
                                covariance_type=cov_type)
        from_sparse = model.fit_predict(x_sparse).collect()
        from_dense = model.fit_predict(x_dense).collect()
        self.assertTrue(np.array_equal(from_sparse, from_dense))
def test_fit_predict(self):
    """Check GaussianMixture.fit_predict() on three blobs of unequal size.

    Only the first 500, 100 and 10 points of blobs 0, 1 and 2 are kept;
    more than 99% of the 610 predicted labels must match the blob ids.
    """
    points, blob_ids = make_blobs(n_samples=1500, random_state=170)

    # Number of points kept from each blob, indexed by blob id.
    kept = (500, 100, 10)
    x_filtered = np.vstack(
        tuple(points[blob_ids == blob][:size]
              for blob, size in enumerate(kept)))
    # Expected labels: blob id repeated once per kept point (as floats).
    y_real = np.repeat([0.0, 1.0, 2.0], kept)

    ds_x = ds.array(x_filtered, block_size=(300, 2))
    model = GaussianMixture(n_components=3, random_state=170)
    pred = model.fit_predict(ds_x).collect()

    self.assertEqual(len(pred), 610)
    n_hits = np.count_nonzero(pred == y_real)
    self.assertGreater(n_hits / len(pred), 0.99)
def test_fit_predict_vs_fit_and_predict(self):
    """Tests GaussianMixture fit_predict() eq. fit() and predict().

    Two estimators with the same parameters and random_state are trained
    on the same data, one through fit() + predict() and the other through
    fit_predict(). Labels, iteration count, convergence flag, lower bound
    and the fitted weights, means and covariances must be identical.

    Both a run that is expected not to converge (max_iter=5) and a run
    that is expected to converge (max_iter=100) are checked.
    """
    # The input data is drawn from a local, seeded generator so that the
    # expected convergence outcomes below are checked against the same
    # data on every run, without touching the global numpy random state.
    rng = np.random.RandomState(0)
    x0 = rng.normal(size=(1000, 2))
    x1 = rng.normal(size=(2000, 2))
    x0 = np.dot(x0, [[1.2, 1], [0, 0.5]]) + [0, 3]
    x1 = np.dot(x1, [[0.4, 0], [1, 2.5]]) + [1, 0]
    x = np.concatenate((x0, x1))
    x_ds = ds.array(x, (1500, 2))

    # We check the cases with and without convergence
    with warnings.catch_warnings():
        # The non-converging run is intentional; silence its warning.
        warnings.simplefilter("ignore", ConvergenceWarning)
        for max_iter, converges in ((5, False), (100, True)):
            gm1 = GaussianMixture(n_components=2, max_iter=max_iter,
                                  random_state=0)
            gm1.fit(x_ds)
            labels1 = gm1.predict(x_ds)

            gm2 = GaussianMixture(n_components=2, max_iter=max_iter,
                                  random_state=0)
            labels2 = gm2.fit_predict(x_ds)

            self.assertTrue(np.all(labels1.collect() == labels2.collect()))
            self.assertEqual(gm1.n_iter, gm2.n_iter)
            self.assertEqual(converges, gm1.converged_)
            self.assertEqual(gm1.converged_, gm2.converged_)
            self.assertEqual(gm1.lower_bound_, gm2.lower_bound_)

            # Synchronize the fitted parameters before comparing them.
            for gm in (gm1, gm2):
                gm.weights_ = compss_wait_on(gm.weights_)
                gm.means_ = compss_wait_on(gm.means_)
                gm.covariances_ = compss_wait_on(gm.covariances_)

            self.assertTrue(np.all(gm1.weights_ == gm2.weights_))
            self.assertTrue(np.all(gm1.means_ == gm2.means_))
            self.assertTrue(np.all(gm1.covariances_ == gm2.covariances_))
def main():
    """Plot GaussianMixture predictions for every covariance type.

    Four synthetic two-component datasets are generated, each one is
    clustered with the 'full', 'tied', 'diag' and 'spherical' covariance
    types, and a grid of scatter plots is shown: one row per dataset, the
    first column with the real labels and one column per covariance type
    with the predicted labels and the accuracy written in the corner.
    """
    # Based on tests.test_gm.GaussianMixtureTest.test_covariance_types
    # Copied code START
    np.random.seed(0)
    n_samples = 600
    n_features = 2
    # The two components are unbalanced: 2/3 and 1/3 of the samples.
    n_big = 2 * n_samples // 3
    n_small = n_samples // 3

    def component_labels():
        """Label 0 for the big component, label 1 for the small one"""
        return np.concatenate((np.zeros(n_big), np.ones(n_small)))

    def create_anisotropic_dataset():
        """Create dataset with 2 anisotropic gaussians of different weight"""
        big = np.random.normal(size=(n_big, n_features))
        small = np.random.normal(size=(n_small, n_features))
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        points = np.concatenate((np.dot(big, transformation),
                                 np.dot(small, transformation) + [0, 3]))
        return points, component_labels()

    def create_spherical_blobs_dataset():
        """Create dataset with 2 spherical gaussians of different weight,
        variance and position"""
        big = np.random.normal(size=(n_big, 2), scale=0.5, loc=[2, 0])
        small = np.random.normal(size=(n_small, 2), scale=2.5)
        return np.concatenate((big, small)), component_labels()

    def create_uncorrelated_dataset():
        """Create dataset with 2 gaussians forming a cross of uncorrelated
        variables"""
        big = np.random.normal(size=(n_big, n_features))
        small = np.random.normal(size=(n_small, n_features))
        big = np.dot(big, [[1.2, 0], [0, 0.5]]) + [0, 3]
        small = np.dot(small, [[0.4, 0], [0, 2.5]]) + [1, 0]
        return np.concatenate((big, small)), component_labels()

    def create_correlated_dataset():
        """Create dataset with 2 gaussians forming a cross of correlated
        variables"""
        points, labels = create_uncorrelated_dataset()
        return np.dot(points, [[1, 1], [-1, 1]]), labels

    # The factories are called in this order; it fixes the random draws.
    factories = (('aniso', create_anisotropic_dataset),
                 ('blobs', create_spherical_blobs_dataset),
                 ('uncorr', create_uncorrelated_dataset),
                 ('corr', create_correlated_dataset))
    generated = {name: make() for name, make in factories}

    real_labels = {name: xy[1] for name, xy in generated.items()}
    datasets = {name: ds.array(xy[0], block_size=(200, xy[0].shape[1]))
                for name, xy in generated.items()}

    covariance_types = 'full', 'tied', 'diag', 'spherical'

    def compute_accuracy(real, predicted):
        """ Computes classification accuracy for binary (0/1) labels.
        The result is the same if the two predicted labels are swapped."""
        ratio = np.count_nonzero(predicted == real) / len(real)
        return max(ratio, 1 - ratio)

    # First launch every fit_predict, and only afterwards collect results.
    pred_labels = {}
    for cov_type in covariance_types:
        model = GaussianMixture(n_components=2, covariance_type=cov_type,
                                random_state=0)
        pred_labels[cov_type] = {name: model.fit_predict(data)
                                 for name, data in datasets.items()}

    accuracy = {}
    for cov_type in covariance_types:
        by_dataset = pred_labels[cov_type]
        accuracy[cov_type] = {}
        for name in by_dataset:
            collected = by_dataset[name].collect()
            # Keep the collected labels: they are reused for plotting.
            by_dataset[name] = collected
            accuracy[cov_type][name] = compute_accuracy(real_labels[name],
                                                        collected)
    # Copied code END

    # Plot START
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)

    n_rows = len(datasets)
    n_cols = len(covariance_types) + 1
    palette = np.array(['#377eb8', '#ff7f00'])

    def draw_panel(row, col, title, points, point_colors):
        """Scatter `points` in grid cell (row, col); title on first row"""
        plt.subplot(n_rows, n_cols, row * n_cols + col + 1)
        if row == 0:
            plt.title(title, size=18)
        plt.scatter(points[:, 0], points[:, 1], s=10, color=point_colors)
        plt.xticks(())
        plt.yticks(())

    for row, (ds_name, data) in enumerate(datasets.items()):
        points = data.collect()

        # First column: the real labels.
        draw_panel(row, 0, 'original', points,
                   palette[real_labels[ds_name].astype(int)])

        # Remaining columns: predictions of each covariance type.
        for col, cov_type in enumerate(covariance_types, start=1):
            draw_panel(row, col, cov_type, points,
                       palette[pred_labels[cov_type][ds_name]])
            plt.text(.99, .01,
                     ('%.2f' % accuracy[cov_type][ds_name]).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')

    plt.show()
def test_covariance_types(self):
    """Check which datasets each GaussianMixture covariance type can fit.

    Four synthetic two-component datasets are clustered with every
    covariance type. The accuracy must be above 0.9 for the combinations
    listed as well fitted below, and below 0.9 for all the others.
    """
    np.random.seed(0)
    n_samples = 600
    n_features = 2
    # The two components are unbalanced: 2/3 and 1/3 of the samples.
    n_big = 2 * n_samples // 3
    n_small = n_samples // 3

    def component_labels():
        """Label 0 for the big component, label 1 for the small one"""
        return np.concatenate((np.zeros(n_big), np.ones(n_small)))

    def create_anisotropic_dataset():
        """Create dataset with 2 anisotropic gaussians of different weight"""
        big = np.random.normal(size=(n_big, n_features))
        small = np.random.normal(size=(n_small, n_features))
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        points = np.concatenate((np.dot(big, transformation),
                                 np.dot(small, transformation) + [0, 3]))
        return points, component_labels()

    def create_spherical_blobs_dataset():
        """Create dataset with 2 spherical gaussians of different weight,
        variance and position"""
        big = np.random.normal(size=(n_big, 2), scale=0.5, loc=[2, 0])
        small = np.random.normal(size=(n_small, 2), scale=2.5)
        return np.concatenate((big, small)), component_labels()

    def create_uncorrelated_dataset():
        """Create dataset with 2 gaussians forming a cross of uncorrelated
        variables"""
        big = np.random.normal(size=(n_big, n_features))
        small = np.random.normal(size=(n_small, n_features))
        big = np.dot(big, [[1.2, 0], [0, 0.5]]) + [0, 3]
        small = np.dot(small, [[0.4, 0], [0, 2.5]]) + [1, 0]
        return np.concatenate((big, small)), component_labels()

    def create_correlated_dataset():
        """Create dataset with 2 gaussians forming a cross of correlated
        variables"""
        points, labels = create_uncorrelated_dataset()
        return np.dot(points, [[1, 1], [-1, 1]]), labels

    # The factories are called in this order; it fixes the random draws.
    dataset_names = ('aniso', 'blobs', 'uncorr', 'corr')
    factories = (create_anisotropic_dataset,
                 create_spherical_blobs_dataset,
                 create_uncorrelated_dataset,
                 create_correlated_dataset)
    generated = {name: make()
                 for name, make in zip(dataset_names, factories)}

    real_labels = {name: xy[1] for name, xy in generated.items()}
    datasets = {name: ds.array(xy[0], block_size=(200, xy[0].shape[1]))
                for name, xy in generated.items()}

    covariance_types = 'full', 'tied', 'diag', 'spherical'

    def compute_accuracy(real, predicted):
        """ Computes classification accuracy for binary (0/1) labels.
        The result is the same if the two predicted labels are swapped."""
        ratio = np.count_nonzero(predicted == real) / len(real)
        return max(ratio, 1 - ratio)

    # First launch every fit_predict, and only afterwards collect results.
    pred_labels = {}
    for cov_type in covariance_types:
        model = GaussianMixture(n_components=2, covariance_type=cov_type,
                                random_state=0)
        pred_labels[cov_type] = {name: model.fit_predict(data)
                                 for name, data in datasets.items()}

    accuracy = {}
    for cov_type in covariance_types:
        accuracy[cov_type] = {}
        for name, pred in pred_labels[cov_type].items():
            accuracy[cov_type][name] = compute_accuracy(real_labels[name],
                                                        pred.collect())

    # Datasets on which each covariance type must reach a good accuracy;
    # on every other dataset the accuracy must stay below the threshold.
    well_fitted = {
        'full': ('aniso', 'blobs', 'uncorr', 'corr'),
        'tied': ('aniso',),
        'diag': ('blobs', 'uncorr'),
        'spherical': ('blobs',),
    }
    for cov_type in covariance_types:
        for name in dataset_names:
            if name in well_fitted[cov_type]:
                self.assertGreater(accuracy[cov_type][name], 0.9)
            else:
                self.assertLess(accuracy[cov_type][name], 0.9)