Example #1
    def test_sparse(self):
        """ Tests GaussianMixture produces the same results using dense and
        sparse data structures """
        file_ = "tests/files/libsvm/2"

        x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True)
        x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False)

        covariance_types = 'full', 'tied', 'diag', 'spherical'

        for cov_type in covariance_types:
            gm = GaussianMixture(n_components=4, random_state=0,
                                 covariance_type=cov_type)
            labels_sparse = gm.fit_predict(x_sparse).collect()
            labels_dense = gm.fit_predict(x_dense).collect()
            self.assertTrue(np.array_equal(labels_sparse, labels_dense))
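These snippets appear to come from dislib's GaussianMixture test suite. A minimal, hedged sketch of the imports they rely on; the module paths are assumptions and may differ between dislib versions:

import warnings

import numpy as np
from pycompss.api.api import compss_wait_on        # synchronizes PyCOMPSs futures (Example #3)
from sklearn.datasets import make_blobs            # used in Example #2
from sklearn.exceptions import ConvergenceWarning  # suppressed in Example #3

import dislib as ds                                 # ds.array, ds.load_svmlight_file
from dislib.cluster import GaussianMixture          # assumed location of GaussianMixture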
Example #2
    def test_fit_predict(self):
        """Tests GaussianMixture.fit_predict()"""
        x, y = make_blobs(n_samples=1500, random_state=170)
        x_filtered = np.vstack(
            (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))
        y_real = np.concatenate((np.zeros(500), np.ones(100), 2 * np.ones(10)))

        ds_x = ds.array(x_filtered, block_size=(300, 2))

        gm = GaussianMixture(n_components=3, random_state=170)
        pred = gm.fit_predict(ds_x).collect()

        self.assertEqual(len(pred), 610)
        accuracy = np.count_nonzero(pred == y_real) / len(pred)
        self.assertGreater(accuracy, 0.99)
Example #3
    def test_fit_predict_vs_fit_and_predict(self):
        """Tests GaussianMixture fit_predict() eq. fit() and predict() for both
        converged and not converged runs (and a fixed random_state)."""
        x0 = np.random.normal(size=(1000, 2))
        x1 = np.random.normal(size=(2000, 2))
        x0 = np.dot(x0, [[1.2, 1], [0, 0.5]]) + [0, 3]
        x1 = np.dot(x1, [[0.4, 0], [1, 2.5]]) + [1, 0]
        x = np.concatenate((x0, x1))
        x_ds = ds.array(x, (1500, 2))

        # We check the cases with and without convergence
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ConvergenceWarning)
            for max_iter, converges in ((5, False), (100, True)):
                gm1 = GaussianMixture(n_components=2,
                                      max_iter=max_iter,
                                      random_state=0)
                gm1.fit(x_ds)
                labels1 = gm1.predict(x_ds)

                gm2 = GaussianMixture(n_components=2,
                                      max_iter=max_iter,
                                      random_state=0)
                labels2 = gm2.fit_predict(x_ds)

                self.assertTrue(np.all(labels1.collect() == labels2.collect()))
                self.assertEqual(gm1.n_iter, gm2.n_iter)
                self.assertEqual(converges, gm1.converged_)
                self.assertEqual(gm1.converged_, gm2.converged_)
                self.assertEqual(gm1.lower_bound_, gm2.lower_bound_)

                gm1.weights_ = compss_wait_on(gm1.weights_)
                gm1.means_ = compss_wait_on(gm1.means_)
                gm1.covariances_ = compss_wait_on(gm1.covariances_)
                gm2.weights_ = compss_wait_on(gm2.weights_)
                gm2.means_ = compss_wait_on(gm2.means_)
                gm2.covariances_ = compss_wait_on(gm2.covariances_)

                self.assertTrue(np.all(gm1.weights_ == gm2.weights_))
                self.assertTrue(np.all(gm1.means_ == gm2.means_))
                self.assertTrue(np.all(gm1.covariances_ == gm2.covariances_))
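A hedged side note on Example #3: weights_, means_ and covariances_ are PyCOMPSs futures until compss_wait_on resolves them, so they must be synchronized before being compared. A sketch of the same parameter check written with np.allclose (floating-point tolerance instead of exact equality; gm1 and gm2 as defined above, and the attribute loop is a hypothetical rewrite, not the original test):

                # Hypothetical variant of the parameter comparison above.
                for attr in ('weights_', 'means_', 'covariances_'):
                    a = compss_wait_on(getattr(gm1, attr))
                    b = compss_wait_on(getattr(gm2, attr))
                    assert np.allclose(a, b), attr + " differs between runs"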
Example #4
def main():
    # Based on tests.test_gm.GaussianMixtureTest.test_covariance_types
    # Copied code START
    """ Tests GaussianMixture covariance types """
    np.random.seed(0)
    n_samples = 600
    n_features = 2

    def create_anisotropic_dataset():
        """Create dataset with 2 anisotropic gaussians of different
        weight"""
        n0 = 2 * n_samples // 3
        n1 = n_samples // 3
        x0 = np.random.normal(size=(n0, n_features))
        x1 = np.random.normal(size=(n1, n_features))
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x0 = np.dot(x0, transformation)
        x1 = np.dot(x1, transformation) + [0, 3]
        x = np.concatenate((x0, x1))
        y = np.concatenate((np.zeros(n0), np.ones(n1)))
        return x, y

    def create_spherical_blobs_dataset():
        """Create dataset with 2 spherical gaussians of different weight,
        variance and position"""
        n0 = 2 * n_samples // 3
        n1 = n_samples // 3
        x0 = np.random.normal(size=(n0, 2), scale=0.5, loc=[2, 0])
        x1 = np.random.normal(size=(n1, 2), scale=2.5)
        x = np.concatenate((x0, x1))
        y = np.concatenate((np.zeros(n0), np.ones(n1)))
        return x, y

    def create_uncorrelated_dataset():
        """Create dataset with 2 gaussians forming a cross of uncorrelated
        variables"""
        n0 = 2 * n_samples // 3
        n1 = n_samples // 3
        x0 = np.random.normal(size=(n0, n_features))
        x1 = np.random.normal(size=(n1, n_features))
        x0 = np.dot(x0, [[1.2, 0], [0, 0.5]]) + [0, 3]
        x1 = np.dot(x1, [[0.4, 0], [0, 2.5]]) + [1, 0]
        x = np.concatenate((x0, x1))
        y = np.concatenate((np.zeros(n0), np.ones(n1)))
        return x, y

    def create_correlated_dataset():
        """Create dataset with 2 gaussians forming a cross of correlated
        variables"""
        x, y = create_uncorrelated_dataset()
        x = np.dot(x, [[1, 1], [-1, 1]])
        return x, y

    datasets = {
        'aniso': create_anisotropic_dataset(),
        'blobs': create_spherical_blobs_dataset(),
        'uncorr': create_uncorrelated_dataset(),
        'corr': create_correlated_dataset()
    }
    real_labels = {k: v[1] for k, v in datasets.items()}
    for k, v in datasets.items():
        datasets[k] = ds.array(v[0], block_size=(200, v[0].shape[1]))

    covariance_types = 'full', 'tied', 'diag', 'spherical'

    def compute_accuracy(real, predicted):
        """ Computes classification accuracy for binary (0/1) labels"""
        equal_labels = np.count_nonzero(predicted == real)
        equal_ratio = equal_labels / len(real)
        return max(equal_ratio, 1 - equal_ratio)

    pred_labels = {}
    for cov_type in covariance_types:
        pred_labels[cov_type] = {}
        gm = GaussianMixture(n_components=2,
                             covariance_type=cov_type,
                             random_state=0)
        for k, x in datasets.items():
            pred_labels[cov_type][k] = gm.fit_predict(x)
    accuracy = {}
    for cov_type in covariance_types:
        accuracy[cov_type] = {}
        for k, pred in pred_labels[cov_type].items():
            pred = pred.collect()
            pred_labels[cov_type][k] = pred
            accuracy[cov_type][k] = compute_accuracy(real_labels[k], pred)
    # Copied code END

    # Plot START
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02,
                        right=.98,
                        bottom=.001,
                        top=.96,
                        wspace=.05,
                        hspace=.01)
    plot_num = 1

    for i_ds, (ds_name, x) in enumerate(datasets.items()):
        x = x.collect()

        plt.subplot(len(datasets), len(covariance_types) + 1, plot_num)
        if i_ds == 0:
            plt.title('original', size=18)
        colors = np.array(['#377eb8', '#ff7f00'])
        label_colors = colors[real_labels[ds_name].astype(int)]
        plt.scatter(x[:, 0], x[:, 1], s=10, color=label_colors)
        plt.xticks(())
        plt.yticks(())
        plot_num += 1

        for cov_type in covariance_types:
            plt.subplot(len(datasets), len(covariance_types) + 1, plot_num)
            if i_ds == 0:
                plt.title(cov_type, size=18)

            colors = np.array(['#377eb8', '#ff7f00'])
            label_colors = colors[pred_labels[cov_type][ds_name]]
            plt.scatter(x[:, 0], x[:, 1], s=10, color=label_colors)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99,
                     .01, ('%.2f' % accuracy[cov_type][ds_name]).lstrip('0'),
                     transform=plt.gca().transAxes,
                     size=15,
                     horizontalalignment='right')
            plot_num += 1

    plt.show()
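Since Example #4 is a standalone script rather than a test method, it presumably also needs matplotlib and an entry-point guard. A minimal sketch, assuming the imports listed after Example #1:

import matplotlib.pyplot as plt  # required by the plotting section above

if __name__ == '__main__':
    main()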
Example #5
    def test_covariance_types(self):
        """ Tests GaussianMixture covariance types """
        np.random.seed(0)
        n_samples = 600
        n_features = 2

        def create_anisotropic_dataset():
            """Create dataset with 2 anisotropic gaussians of different
            weight"""
            n0 = 2 * n_samples // 3
            n1 = n_samples // 3
            x0 = np.random.normal(size=(n0, n_features))
            x1 = np.random.normal(size=(n1, n_features))
            transformation = [[0.6, -0.6], [-0.4, 0.8]]
            x0 = np.dot(x0, transformation)
            x1 = np.dot(x1, transformation) + [0, 3]
            x = np.concatenate((x0, x1))
            y = np.concatenate((np.zeros(n0), np.ones(n1)))
            return x, y

        def create_spherical_blobs_dataset():
            """Create dataset with 2 spherical gaussians of different weight,
            variance and position"""
            n0 = 2 * n_samples // 3
            n1 = n_samples // 3
            x0 = np.random.normal(size=(n0, 2), scale=0.5, loc=[2, 0])
            x1 = np.random.normal(size=(n1, 2), scale=2.5)
            x = np.concatenate((x0, x1))
            y = np.concatenate((np.zeros(n0), np.ones(n1)))
            return x, y

        def create_uncorrelated_dataset():
            """Create dataset with 2 gaussians forming a cross of uncorrelated
            variables"""
            n0 = 2 * n_samples // 3
            n1 = n_samples // 3
            x0 = np.random.normal(size=(n0, n_features))
            x1 = np.random.normal(size=(n1, n_features))
            x0 = np.dot(x0, [[1.2, 0], [0, 0.5]]) + [0, 3]
            x1 = np.dot(x1, [[0.4, 0], [0, 2.5]]) + [1, 0]
            x = np.concatenate((x0, x1))
            y = np.concatenate((np.zeros(n0), np.ones(n1)))
            return x, y

        def create_correlated_dataset():
            """Create dataset with 2 gaussians forming a cross of correlated
            variables"""
            x, y = create_uncorrelated_dataset()
            x = np.dot(x, [[1, 1], [-1, 1]])
            return x, y

        datasets = {'aniso': create_anisotropic_dataset(),
                    'blobs': create_spherical_blobs_dataset(),
                    'uncorr': create_uncorrelated_dataset(),
                    'corr': create_correlated_dataset()}
        real_labels = {k: v[1] for k, v in datasets.items()}
        for k, v in datasets.items():
            datasets[k] = ds.array(v[0], block_size=(200, v[0].shape[1]))

        covariance_types = 'full', 'tied', 'diag', 'spherical'

        def compute_accuracy(real, predicted):
            """ Computes classification accuracy for binary (0/1) labels"""
            equal_labels = np.count_nonzero(predicted == real)
            equal_ratio = equal_labels / len(real)
            return max(equal_ratio, 1 - equal_ratio)

        pred_labels = {}
        for cov_type in covariance_types:
            pred_labels[cov_type] = {}
            gm = GaussianMixture(n_components=2, covariance_type=cov_type,
                                 random_state=0)
            for k, x in datasets.items():
                pred_labels[cov_type][k] = gm.fit_predict(x)
        accuracy = {}
        for cov_type in covariance_types:
            accuracy[cov_type] = {}
            for k, pred in pred_labels[cov_type].items():
                accuracy[cov_type][k] = \
                    compute_accuracy(real_labels[k], pred.collect())

        # Covariance type 'full'.
        # Assert good accuracy in all tested datasets.
        self.assertGreater(accuracy['full']['aniso'], 0.9)
        self.assertGreater(accuracy['full']['blobs'], 0.9)
        self.assertGreater(accuracy['full']['uncorr'], 0.9)
        self.assertGreater(accuracy['full']['corr'], 0.9)

        # Covariance type 'tied'.
        # Assert good accuracy only for 'aniso'.
        self.assertGreater(accuracy['tied']['aniso'], 0.9)
        self.assertLess(accuracy['tied']['blobs'], 0.9)
        self.assertLess(accuracy['tied']['uncorr'], 0.9)
        self.assertLess(accuracy['tied']['corr'], 0.9)

        # Covariance type 'diag'.
        # Assert good accuracy only for 'blobs' and 'uncorr'.
        self.assertLess(accuracy['diag']['aniso'], 0.9)
        self.assertGreater(accuracy['diag']['blobs'], 0.9)
        self.assertGreater(accuracy['diag']['uncorr'], 0.9)
        self.assertLess(accuracy['diag']['corr'], 0.9)

        # Covariance type 'spherical'.
        # Assert good accuracy only for 'blobs'.
        self.assertLess(accuracy['spherical']['aniso'], 0.9)
        self.assertGreater(accuracy['spherical']['blobs'], 0.9)
        self.assertLess(accuracy['spherical']['uncorr'], 0.9)
        self.assertLess(accuracy['spherical']['corr'], 0.9)