Beispiel #1
0
 def test_not_converged_warning(self):
     """Tests that GaussianMixture.fit() warns when it does not converge.

     The model is limited to a single iteration (``max_iter=1``) on the
     iris data and ``fit()`` is expected to emit a ``ConvergenceWarning``.
     """
     # Build the input and the estimator outside the assertion context so
     # that only a warning raised by fit() itself can satisfy assertWarns.
     x, _ = load_iris(return_X_y=True)
     x_ds = ds.array(x, (75, 4))
     gm = GaussianMixture(max_iter=1)
     with self.assertWarns(ConvergenceWarning):
         gm.fit(x_ds)
Beispiel #2
0
 def test_init_random(self):
     """Checks GaussianMixture fitting with ``init_params='random'``.

     A 50x3 random ds-array (blocks of 10x3) is fitted with four
     components and the fit must take more than five iterations.
     """
     data = ds.random_array((50, 3), (10, 3), random_state=0)
     model = GaussianMixture(
         init_params='random',
         n_components=4,
         arity=2,
         random_state=170,
     )
     model.fit(data)
     self.assertGreater(model.n_iter, 5)
Beispiel #3
0
 def test_means_init_and_weights_init(self):
     """Checks that fit() converges when weights_init and means_init are
     supplied by the caller (iris data, three components)."""
     samples, _ = load_iris(return_X_y=True)
     # One initial mean per component, four features each.
     initial_means = np.array([[5, 3, 2, 0],
                               [6, 3, 4, 1],
                               [7, 3, 6, 2]])
     initial_weights = [1 / 3] * 3
     model = GaussianMixture(random_state=0,
                             n_components=3,
                             weights_init=initial_weights,
                             means_init=initial_means)
     model.fit(ds.array(samples, (75, 4)))
     self.assertTrue(model.converged_)
Beispiel #4
0
    def test_fit(self):
        """Checks the parameters learned by GaussianMixture.fit().

        Six 2-D points are fitted with two components; the resulting
        weights, means, covariances and Cholesky precision factors are
        compared against reference values with ``np.allclose``.
        """
        points = np.array(
            [[1, 2], [2, 1], [-3, -3], [-1, -2], [-2, -1], [3, 3]])
        gm = GaussianMixture(n_components=2, random_state=666)
        gm.fit(ds.array(points, block_size=(3, 2)))

        # Both components share the same reference covariance and
        # precision factor.
        cov_block = [[0.66671688, 0.33338255],
                     [0.33338255, 0.66671688]]
        pc_block = [[1.22469875, -0.70714834],
                    [0., 1.4141944]]
        reference = (
            ('weights_', np.array([0.5, 0.5])),
            ('means_', np.array([[-2, -2], [2, 2]])),
            ('covariances_', np.array([cov_block, cov_block])),
            ('precisions_cholesky_', np.array([pc_block, pc_block])),
        )

        # First synchronize every fitted attribute, then compare them.
        for attr, _ in reference:
            setattr(gm, attr, compss_wait_on(getattr(gm, attr)))

        for attr, expected in reference:
            self.assertTrue(np.allclose(getattr(gm, attr), expected))
Beispiel #5
0
    def test_predict(self):
        """Checks GaussianMixture.predict() on two mirrored groups of points.

        The model is trained on four points and evaluated on those four
        plus two new ones; points at even positions must share one label
        and points at odd positions the other.
        """
        train_points = np.array([[1, 2], [-1, -2], [2, 1], [-2, -1]])
        model = GaussianMixture(n_components=2, random_state=666)
        model.fit(ds.array(train_points, block_size=(2, 2)))

        test_points = np.concatenate((train_points, [[2, 2], [-1, -3]]))
        ds_test = ds.array(test_points, block_size=(2, 2))
        labels = model.predict(ds_test).collect()

        self.assertTrue(labels[0] != labels[1])
        self.assertTrue(labels[0] == labels[2] == labels[4])
        self.assertTrue(labels[1] == labels[3] == labels[5])
Beispiel #6
0
    def test_fit_predict(self):
        """Checks GaussianMixture.fit_predict() on unbalanced blobs.

        Keeps 500, 100 and 10 samples of blob classes 0, 1 and 2
        respectively and requires more than 99% of the predicted labels
        to match the true ones.
        """
        x, y = make_blobs(n_samples=1500, random_state=170)
        # Number of samples kept for class 0, 1 and 2.
        kept = (500, 100, 10)
        x_filtered = np.vstack(
            [x[y == label][:size] for label, size in enumerate(kept)])
        y_real = np.concatenate(
            [np.full(size, float(label)) for label, size in enumerate(kept)])

        gm = GaussianMixture(n_components=3, random_state=170)
        pred = gm.fit_predict(ds.array(x_filtered, block_size=(300, 2)))
        pred = pred.collect()

        self.assertEqual(len(pred), 610)
        hits = np.count_nonzero(pred == y_real)
        self.assertGreater(hits / len(pred), 0.99)
Beispiel #7
0
    def test_sparse(self):
        """Checks that GaussianMixture labels agree between the sparse and
        the dense load of the same file, for every covariance type."""
        path = "tests/files/libsvm/2"
        block_shape, n_features = (10, 780), 780

        # Same file loaded twice; only the last positional flag differs.
        x_sparse, _ = ds.load_svmlight_file(path, block_shape, n_features,
                                            True)
        x_dense, _ = ds.load_svmlight_file(path, block_shape, n_features,
                                           False)

        for cov_type in ('full', 'tied', 'diag', 'spherical'):
            # The same estimator instance is refitted on both inputs.
            gm = GaussianMixture(n_components=4, random_state=0,
                                 covariance_type=cov_type)
            from_sparse = gm.fit_predict(x_sparse).collect()
            from_dense = gm.fit_predict(x_dense).collect()
            self.assertTrue(np.array_equal(from_sparse, from_dense))
Beispiel #8
0
 def test_init_params(self):
     """Tests that GaussianMixture stores its constructor parameters.

     Every keyword passed to the constructor must be readable back from
     the attribute of the same name.  Array-valued parameters are
     compared with ``np.array_equal``: comparing tuples that contain
     NumPy arrays only succeeds through Python's identity shortcut and
     raises an ambiguous-truth-value ``ValueError`` if the estimator
     stores a copy of the array.
     """
     params = {
         'n_components': 2,
         'covariance_type': 'diag',
         'tol': 1e-4,
         'reg_covar': 1e-5,
         'max_iter': 3,
         'init_params': 'random',
         'weights_init': np.array([0.4, 0.6]),
         'means_init': np.array([[0, 0], [2, 3]]),
         # Placeholder value: only attribute storage is checked here,
         # fit() is never called.
         'precisions_init': 'todo',
         'random_state': RandomState(666),
     }
     gm = GaussianMixture(**params)

     for name, expected in params.items():
         real = getattr(gm, name)
         if isinstance(expected, np.ndarray):
             self.assertTrue(np.array_equal(expected, real), name)
         else:
             self.assertEqual(expected, real, name)
Beispiel #9
0
    def test_precisions_init_spherical(self):
        """Checks that fit() converges with covariance_type='spherical'
        and a user-supplied precisions_init (one value per component)."""
        samples, _ = load_iris(return_X_y=True)
        x_ds = ds.array(samples, (75, 4))

        # Seeds NumPy's global RNG before drawing the initial precisions.
        np.random.seed(0)
        initial_precisions = np.random.rand(3) * 2

        init_kwargs = {
            'weights_init': [1 / 3] * 3,
            'means_init': np.array([[5, 3, 2, 0],
                                    [6, 3, 4, 1],
                                    [7, 3, 6, 2]]),
            'precisions_init': initial_precisions,
        }
        gm = GaussianMixture(covariance_type='spherical', random_state=0,
                             n_components=3, **init_kwargs)
        gm.fit(x_ds)
        self.assertTrue(gm.converged_)
Beispiel #10
0
    def test_verbose(self):
        """Tests that GaussianMixture prints text in verbose mode.

        ``fit()`` is run with ``verbose=True`` while standard output is
        redirected to an in-memory buffer; the buffer must not be empty.
        """
        from contextlib import redirect_stdout

        x = ds.array([[0, 0], [0, 1], [1, 0]], (3, 2))
        gm = GaussianMixture(verbose=True, max_iter=2)

        buffer = io.StringIO()
        # redirect_stdout restores sys.stdout on exit, also when fit()
        # raises.
        with redirect_stdout(buffer):
            gm.fit(x)

        self.assertGreater(len(buffer.getvalue()), 0)
Beispiel #11
0
    def test_precisions_init_tied(self):
        """Checks that fit() converges with covariance_type='tied' and a
        user-supplied 4x4 precisions_init matrix."""
        samples, _ = load_iris(return_X_y=True)
        x_ds = ds.array(samples, (75, 4))

        # Seeds NumPy's global RNG, then builds the initial precision
        # matrix as the product of a random 4x4 matrix and its transpose.
        np.random.seed(0)
        factor = np.random.rand(4, 4)
        initial_precisions = np.matmul(factor, factor.T)

        init_kwargs = {
            'weights_init': [1 / 3] * 3,
            'means_init': [[5, 3, 2, 0],
                           [6, 3, 4, 1],
                           [7, 3, 6, 2]],
            'precisions_init': initial_precisions,
        }
        gm = GaussianMixture(covariance_type='tied', random_state=0,
                             n_components=3, **init_kwargs)
        gm.fit(x_ds)
        self.assertTrue(gm.converged_)
Beispiel #12
0
def main():
    """Measure GaussianMixture.fit() on a random 100M x 100 ds-array.

    The array is split row-wise into 768 chunks; the model uses 50
    components, random initialization, ``tol=0`` and five iterations.
    The call is timed through ``performance.measure``.
    """
    n_samples, n_features = 100000000, 100
    n_chunks, n_clusters = 768, 50

    # Rows per block, rounded up.
    rows_per_chunk = int(np.ceil(n_samples / n_chunks))
    data = ds.random_array((n_samples, n_features),
                           (rows_per_chunk, n_features))

    estimator = GaussianMixture(n_components=n_clusters,
                                max_iter=5,
                                tol=0,
                                init_params="random")
    performance.measure("GMM", "100M", estimator.fit, data)
Beispiel #13
0
 def test_check_tol(self):
     """Tests GaussianMixture tol validation.

     A negative ``tol`` must raise ``ValueError`` when the estimator is
     created or fitted.
     """
     # The block size matches the 3x2 input, as in the sibling validation
     # tests; the previous (10, 2) was larger than the array itself.
     x = ds.array([[0, 0], [0, 1], [1, 0]], block_size=(3, 2))
     with self.assertRaises(ValueError):
         gm = GaussianMixture(tol=-0.1)
         gm.fit(x)
def main():
    """Plot K-Means, DBSCAN and Gaussian mixture results on toy datasets.

    Six 2-D datasets of 1500 points are generated, standardized and
    clustered with each algorithm.  Every (dataset, algorithm) pair gets
    its own subplot, annotated with the wall-clock time of the ``fit``
    call, and the figure is displayed with ``plt.show()``.
    """
    np.random.seed(0)

    # ------------------------------------------------------------------
    # Datasets.  Statement order is kept as in the original because
    # make_moons and np.random.rand receive no explicit random_state.
    # ------------------------------------------------------------------
    n_samples = 1500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05,
                                 random_state=170)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, random_state=8)
    no_structure = (np.random.rand(n_samples, 2), None)

    # Anisotropically distributed data: blobs under a linear map.
    random_state = 170
    blob_points, blob_labels = make_blobs(n_samples=n_samples,
                                          random_state=random_state)
    aniso = (np.dot(blob_points, [[0.6, -0.6], [-0.4, 0.8]]), blob_labels)

    # Blobs with varied variances.
    varied = make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5],
                        random_state=random_state)

    # ------------------------------------------------------------------
    # Figure and per-dataset parameters.
    # ------------------------------------------------------------------
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)

    # Only 'n_clusters' and 'eps' are read further down.
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    datasets = [
        (noisy_circles, {'damping': .77, 'preference': -240,
                         'quantile': .2, 'n_clusters': 2}),
        (noisy_moons, {'damping': .75, 'preference': -220,
                       'n_clusters': 2}),
        (varied, {'eps': .18, 'n_neighbors': 2}),
        (aniso, {'eps': .15, 'n_neighbors': 2}),
        (blobs, {}),
        (no_structure, {}),
    ]

    palette = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628',
               '#984ea3', '#999999', '#e41a1c', '#dede00']

    # Message patterns of the UserWarnings silenced around fit().
    connectivity_msg = ("the number of connected components of the "
                        "connectivity matrix is [0-9]{1,2} > 1. "
                        "Completing it to avoid stopping the tree early.")
    graph_msg = ("Graph is not fully connected, spectral embedding may "
                 "not work as expected.")

    plot_num = 1
    for i_dataset, (dataset, algo_params) in enumerate(datasets):
        # Dataset-specific values override the defaults.
        params = dict(default_base, **algo_params)

        # Normalize the dataset for easier parameter selection.
        points, _ = dataset
        points = StandardScaler().fit_transform(points)

        clustering_algorithms = (
            ('K-Means', KMeans(n_clusters=params["n_clusters"])),
            ('DBSCAN', DBSCAN(eps=params["eps"], n_regions=1)),
            ('Gaussian mixture',
             GaussianMixture(n_components=params["n_clusters"])),
        )

        for name, algorithm in clustering_algorithms:
            t0 = time.time()

            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message=connectivity_msg,
                                        category=UserWarning)
                warnings.filterwarnings("ignore", message=graph_msg,
                                        category=UserWarning)

                data = ds.array(points, block_size=(300, 2))
                algorithm.fit(data)

            t1 = time.time()
            # NOTE: fit_predict runs a second time on the same data, after
            # the timed fit above, to obtain the labels.
            y_pred = algorithm.fit_predict(data).collect()

            plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)

            n_colors = int(max(y_pred) + 1)
            colors = np.array(list(islice(cycle(palette), n_colors)))
            # Black goes last, so a label of -1 (outliers, if any) picks it.
            colors = np.append(colors, ["#000000"])
            plt.scatter(points[:, 0], points[:, 1], s=10,
                        color=colors[y_pred])

            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            elapsed = ('%.2fs' % (t1 - t0)).lstrip('0')
            plt.text(.99, .01, elapsed,
                     transform=plt.gca().transAxes,
                     size=15,
                     horizontalalignment='right')
            plot_num += 1

    plt.show()
def main():
    """Plot GaussianMixture predictions for every covariance type.

    Four synthetic two-cluster datasets are generated and clustered with
    the 'full', 'tied', 'diag' and 'spherical' covariance types.  Each row
    of the figure shows one dataset: first its true labels, then the
    predicted labels per covariance type with the accuracy printed in the
    corner.
    """
    # Based on tests.test_gm.GaussianMixtureTest.test_covariance_types
    # Copied code START
    np.random.seed(0)
    n_samples = 600
    n_features = 2
    # Every dataset holds a 2/3 cluster (label 0) and a 1/3 cluster
    # (label 1).
    n_major = 2 * n_samples // 3
    n_minor = n_samples // 3

    def make_labels():
        """Return the true labels: zeros for the first cluster, then
        ones."""
        return np.concatenate((np.zeros(n_major), np.ones(n_minor)))

    def create_anisotropic_dataset():
        """Create dataset with 2 anisotropic gaussians of different
        weight"""
        first = np.random.normal(size=(n_major, n_features))
        second = np.random.normal(size=(n_minor, n_features))
        skew = [[0.6, -0.6], [-0.4, 0.8]]
        points = np.concatenate(
            (np.dot(first, skew), np.dot(second, skew) + [0, 3]))
        return points, make_labels()

    def create_spherical_blobs_dataset():
        """Create dataset with 2 spherical gaussians of different weight,
        variance and position"""
        first = np.random.normal(size=(n_major, 2), scale=0.5, loc=[2, 0])
        second = np.random.normal(size=(n_minor, 2), scale=2.5)
        return np.concatenate((first, second)), make_labels()

    def create_uncorrelated_dataset():
        """Create dataset with 2 gaussians forming a cross of uncorrelated
        variables"""
        first = np.random.normal(size=(n_major, n_features))
        second = np.random.normal(size=(n_minor, n_features))
        first = np.dot(first, [[1.2, 0], [0, 0.5]]) + [0, 3]
        second = np.dot(second, [[0.4, 0], [0, 2.5]]) + [1, 0]
        return np.concatenate((first, second)), make_labels()

    def create_correlated_dataset():
        """Create dataset with 2 gaussians forming a cross of correlated
        variables"""
        points, labels = create_uncorrelated_dataset()
        return np.dot(points, [[1, 1], [-1, 1]]), labels

    # The generators draw from the global RNG, so they are called in this
    # fixed order.
    raw = {
        'aniso': create_anisotropic_dataset(),
        'blobs': create_spherical_blobs_dataset(),
        'uncorr': create_uncorrelated_dataset(),
        'corr': create_correlated_dataset()
    }
    real_labels = {name: labels for name, (_, labels) in raw.items()}
    datasets = {
        name: ds.array(points, block_size=(200, points.shape[1]))
        for name, (points, _) in raw.items()
    }

    covariance_types = 'full', 'tied', 'diag', 'spherical'

    def compute_accuracy(real, predicted):
        """ Computes classification accuracy for binary (0/1) labels"""
        matches = np.count_nonzero(predicted == real)
        ratio = matches / len(real)
        # Component numbering is arbitrary, so swapped labels count too.
        return max(ratio, 1 - ratio)

    # Launch every fit_predict first; results are collected afterwards.
    pred_labels = {}
    for cov_type in covariance_types:
        gm = GaussianMixture(n_components=2,
                             covariance_type=cov_type,
                             random_state=0)
        pred_labels[cov_type] = {
            name: gm.fit_predict(x) for name, x in datasets.items()
        }

    accuracy = {}
    for cov_type in covariance_types:
        collected = {
            name: pred.collect()
            for name, pred in pred_labels[cov_type].items()
        }
        pred_labels[cov_type] = collected
        accuracy[cov_type] = {
            name: compute_accuracy(real_labels[name], pred)
            for name, pred in collected.items()
        }
    # Copied code END

    # Plot START
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)

    n_rows = len(datasets)
    n_cols = len(covariance_types) + 1
    palette = np.array(['#377eb8', '#ff7f00'])
    plot_num = 1

    for i_ds, (ds_name, x) in enumerate(datasets.items()):
        x = x.collect()

        # First column: the true labels.
        plt.subplot(n_rows, n_cols, plot_num)
        if i_ds == 0:
            plt.title('original', size=18)
        true_colors = palette[real_labels[ds_name].astype(int)]
        plt.scatter(x[:, 0], x[:, 1], s=10, color=true_colors)
        plt.xticks(())
        plt.yticks(())
        plot_num += 1

        # Remaining columns: one prediction per covariance type.
        for cov_type in covariance_types:
            plt.subplot(n_rows, n_cols, plot_num)
            if i_ds == 0:
                plt.title(cov_type, size=18)

            predicted_colors = palette[pred_labels[cov_type][ds_name]]
            plt.scatter(x[:, 0], x[:, 1], s=10, color=predicted_colors)
            plt.xticks(())
            plt.yticks(())
            score = ('%.2f' % accuracy[cov_type][ds_name]).lstrip('0')
            plt.text(.99, .01, score,
                     transform=plt.gca().transAxes,
                     size=15,
                     horizontalalignment='right')
            plot_num += 1

    plt.show()
Beispiel #16
0
 def test_check_max_iter(self):
     """Checks that ``max_iter=0`` is rejected with a ValueError, either
     when the estimator is created or when it is fitted."""
     points = [[0, 0], [0, 1], [1, 0]]
     data = ds.array(points, block_size=(3, 2))
     with self.assertRaises(ValueError):
         GaussianMixture(max_iter=0).fit(data)
Beispiel #17
0
 def test_check_n_components(self):
     """Checks that ``n_components=0`` is rejected with a ValueError,
     either when the estimator is created or when it is fitted."""
     points = [[0, 0], [0, 1], [1, 0]]
     data = ds.array(points, block_size=(3, 2))
     with self.assertRaises(ValueError):
         GaussianMixture(n_components=0).fit(data)
Beispiel #18
0
 def test_check_covariance_type(self):
     """Checks that an empty ``covariance_type`` is rejected with a
     ValueError, either when the estimator is created or when it is
     fitted."""
     points = [[0, 0], [0, 1], [1, 0]]
     data = ds.array(points, block_size=(3, 2))
     with self.assertRaises(ValueError):
         GaussianMixture(covariance_type='').fit(data)
Beispiel #19
0
 def test_check_reg_covar(self):
     """Checks that a negative ``reg_covar`` is rejected with a
     ValueError, either when the estimator is created or when it is
     fitted."""
     points = [[0, 0], [0, 1], [1, 0]]
     data = ds.array(points, block_size=(3, 2))
     with self.assertRaises(ValueError):
         GaussianMixture(reg_covar=-0.1).fit(data)
Beispiel #20
0
 def test_check_init_params(self):
     """Checks that an empty ``init_params`` is rejected with a
     ValueError, either when the estimator is created or when it is
     fitted."""
     points = [[0, 0], [0, 1], [1, 0]]
     data = ds.array(points, block_size=(3, 2))
     with self.assertRaises(ValueError):
         GaussianMixture(init_params='').fit(data)
Beispiel #21
0
 def test_check_initial_parameters(self):
     """Checks validation of weights_init, means_init and precisions_init.

     Each keyword combination below must raise ``ValueError`` when the
     estimator is created or fitted on a 3x2 array.
     """
     x = ds.array([[0, 0], [0, 1], [1, 0]], block_size=(3, 2))

     invalid_settings = [
         {'weights_init': [1, 2]},
         {'means_init': [1, 2]},
     ]
     # The same precisions_init is tried with every covariance type.
     for cov_type in ('full', 'tied', 'diag', 'spherical'):
         invalid_settings.append({'precisions_init': [1, 2],
                                  'covariance_type': cov_type})
     invalid_settings.append({'means_init': [[1, 2, 3]],
                              'precisions_init': [[1, 2], [3, 4]],
                              'covariance_type': 'tied'})

     for kwargs in invalid_settings:
         with self.assertRaises(ValueError):
             gm = GaussianMixture(**kwargs)
             gm.fit(x)
Beispiel #22
0
    def test_fit_predict_vs_fit_and_predict(self):
        """Tests GaussianMixture fit_predict() eq. fit() and predict() for both
        converged and not converged runs (and a fixed random_state).

        Two estimators with identical settings are compared: one through
        ``fit()`` followed by ``predict()``, the other through
        ``fit_predict()``.  Labels, iteration count, convergence flag,
        lower bound and fitted parameters must all be equal.
        """
        # A local, seeded generator keeps the input reproducible without
        # touching NumPy's global RNG state; the convergence expectations
        # below would otherwise depend on unseeded random data.
        rng = np.random.RandomState(0)
        x0 = rng.normal(size=(1000, 2))
        x1 = rng.normal(size=(2000, 2))
        x0 = np.dot(x0, [[1.2, 1], [0, 0.5]]) + [0, 3]
        x1 = np.dot(x1, [[0.4, 0], [1, 2.5]]) + [1, 0]
        x = np.concatenate((x0, x1))
        x_ds = ds.array(x, (1500, 2))

        # We check the cases with and without convergence
        with warnings.catch_warnings():
            # The non-converging run is expected to warn; silence it.
            warnings.simplefilter("ignore", ConvergenceWarning)
            for max_iter, converges in ((5, False), (100, True)):
                gm1 = GaussianMixture(n_components=2, max_iter=max_iter,
                                      random_state=0)
                gm1.fit(x_ds)
                labels1 = gm1.predict(x_ds)

                gm2 = GaussianMixture(n_components=2, max_iter=max_iter,
                                      random_state=0)
                labels2 = gm2.fit_predict(x_ds)

                self.assertTrue(np.all(labels1.collect() == labels2.collect()))
                self.assertEqual(gm1.n_iter, gm2.n_iter)
                self.assertEqual(converges, gm1.converged_)
                self.assertEqual(gm1.converged_, gm2.converged_)
                self.assertEqual(gm1.lower_bound_, gm2.lower_bound_)

                # Synchronize the fitted parameters before comparing them.
                gm1.weights_ = compss_wait_on(gm1.weights_)
                gm1.means_ = compss_wait_on(gm1.means_)
                gm1.covariances_ = compss_wait_on(gm1.covariances_)
                gm2.weights_ = compss_wait_on(gm2.weights_)
                gm2.means_ = compss_wait_on(gm2.means_)
                gm2.covariances_ = compss_wait_on(gm2.covariances_)

                self.assertTrue(np.all(gm1.weights_ == gm2.weights_))
                self.assertTrue(np.all(gm1.means_ == gm2.means_))
                self.assertTrue(np.all(gm1.covariances_ == gm2.covariances_))
Beispiel #23
0
    def test_covariance_types(self):
        """Checks which covariance types separate which synthetic datasets.

        Four two-cluster datasets are clustered with every covariance
        type; the accuracy must be above 0.9 for the combinations listed
        in ``good_fits`` and below 0.9 for all the others.
        """
        np.random.seed(0)
        n_samples = 600
        n_features = 2
        # Every dataset holds a 2/3 cluster (label 0) and a 1/3 cluster
        # (label 1).
        n_major = 2 * n_samples // 3
        n_minor = n_samples // 3

        def make_labels():
            """Return the true labels: zeros for the first cluster, then
            ones."""
            return np.concatenate((np.zeros(n_major), np.ones(n_minor)))

        def create_anisotropic_dataset():
            """Create dataset with 2 anisotropic gaussians of different
            weight"""
            first = np.random.normal(size=(n_major, n_features))
            second = np.random.normal(size=(n_minor, n_features))
            skew = [[0.6, -0.6], [-0.4, 0.8]]
            points = np.concatenate(
                (np.dot(first, skew), np.dot(second, skew) + [0, 3]))
            return points, make_labels()

        def create_spherical_blobs_dataset():
            """Create dataset with 2 spherical gaussians of different weight,
            variance and position"""
            first = np.random.normal(size=(n_major, 2), scale=0.5,
                                     loc=[2, 0])
            second = np.random.normal(size=(n_minor, 2), scale=2.5)
            return np.concatenate((first, second)), make_labels()

        def create_uncorrelated_dataset():
            """Create dataset with 2 gaussians forming a cross of uncorrelated
            variables"""
            first = np.random.normal(size=(n_major, n_features))
            second = np.random.normal(size=(n_minor, n_features))
            first = np.dot(first, [[1.2, 0], [0, 0.5]]) + [0, 3]
            second = np.dot(second, [[0.4, 0], [0, 2.5]]) + [1, 0]
            return np.concatenate((first, second)), make_labels()

        def create_correlated_dataset():
            """Create dataset with 2 gaussians forming a cross of correlated
            variables"""
            points, labels = create_uncorrelated_dataset()
            return np.dot(points, [[1, 1], [-1, 1]]), labels

        # The generators draw from the global RNG, so they are called in
        # this fixed order.
        raw = {'aniso': create_anisotropic_dataset(),
               'blobs': create_spherical_blobs_dataset(),
               'uncorr': create_uncorrelated_dataset(),
               'corr': create_correlated_dataset()}
        real_labels = {name: labels for name, (_, labels) in raw.items()}
        datasets = {
            name: ds.array(points, block_size=(200, points.shape[1]))
            for name, (points, _) in raw.items()
        }

        covariance_types = 'full', 'tied', 'diag', 'spherical'

        def compute_accuracy(real, predicted):
            """ Computes classification accuracy for binary (0/1) labels"""
            matches = np.count_nonzero(predicted == real)
            ratio = matches / len(real)
            # Component numbering is arbitrary, so swapped labels count too.
            return max(ratio, 1 - ratio)

        # Launch every fit_predict first; results are collected afterwards.
        pred_labels = {}
        for cov_type in covariance_types:
            gm = GaussianMixture(n_components=2, covariance_type=cov_type,
                                 random_state=0)
            pred_labels[cov_type] = {
                name: gm.fit_predict(x) for name, x in datasets.items()
            }

        accuracy = {}
        for cov_type in covariance_types:
            accuracy[cov_type] = {
                name: compute_accuracy(real_labels[name], pred.collect())
                for name, pred in pred_labels[cov_type].items()
            }

        # Datasets on which each covariance type must exceed 0.9 accuracy;
        # on every other dataset it must stay below 0.9.
        good_fits = {
            'full': ('aniso', 'blobs', 'uncorr', 'corr'),
            'tied': ('aniso',),
            'diag': ('blobs', 'uncorr'),
            'spherical': ('blobs',),
        }
        for cov_type in covariance_types:
            for name in ('aniso', 'blobs', 'uncorr', 'corr'):
                if name in good_fits[cov_type]:
                    self.assertGreater(accuracy[cov_type][name], 0.9)
                else:
                    self.assertLess(accuracy[cov_type][name], 0.9)
Beispiel #24
0
def initialize(alg_names, args):
    """Instantiate the estimators named in ``alg_names``.

    Each supported name ('KMeans', 'DBSCAN', 'GaussianMixture') maps to a
    factory that builds the estimator from keyword arguments derived from
    ``args`` by the matching ``get_*_kwargs`` helper.

    Returns a list of estimator instances in the order of ``alg_names``.
    Raises ``KeyError`` for a name that is not supported.
    """
    # Factories are lazy: an estimator is only built when its name is
    # actually requested.
    factories = {
        'KMeans': lambda: KMeans(**get_kmeans_kwargs(args)),
        'DBSCAN': lambda: DBSCAN(**get_dbscan_kwargs(args)),
        'GaussianMixture': lambda: GaussianMixture(**get_gm_kwargs(args)),
    }
    estimators = []
    for name in alg_names:
        estimators.append(factories[name]())
    return estimators