Beispiel #1
0
    def test_randomized_pca_inverse(self):
        # Round trip with the randomized solver: transform followed by
        # inverse_transform should give back (almost) the input.
        seeded = np.random.RandomState(0)
        n_rows, n_cols = 50, 3
        data = mt.tensor(seeded.randn(n_rows, n_cols))  # spherical data
        data[:, 1] *= .00001  # shrink the middle column to a tiny scale
        data += [5, 4, 3]  # move the mean far away from the origin

        # With the middle column squashed the data is almost of rank
        # n_components, so two components must rebuild it to 2 decimals.
        plain = PCA(n_components=2,
                    svd_solver='randomized',
                    random_state=0)
        plain = plain.fit(data)
        projected = plain.transform(data)
        rebuilt = plain.inverse_transform(projected)
        assert_almost_equal(data.execute(), rebuilt.execute(), decimal=2)

        # Whitened variant (approximate reconstruction): bound the largest
        # absolute error relative to the mean magnitude of the data.
        whitened = PCA(n_components=2,
                       whiten=True,
                       svd_solver='randomized',
                       random_state=0)
        whitened = whitened.fit(data)
        projected = whitened.transform(data)
        rebuilt = whitened.inverse_transform(projected)
        abs_error = mt.abs(data - rebuilt)
        mean_magnitude = mt.abs(data).mean()
        worst_relative_error = (abs_error / mean_magnitude).max()
        self.assertLess(worst_relative_error.execute(), 1e-5)
Beispiel #2
0
    def test_singular_values(self):
        # Verify singular_values_ for both the full and randomized solvers.

        rng = np.random.RandomState(0)
        n_samples, n_features = 100, 80

        data = mt.tensor(rng.randn(n_samples, n_features))

        # Both estimators are handed the same RandomState instance; the
        # full solver is fitted first, the randomized one second.
        exact = PCA(n_components=2, svd_solver='full', random_state=rng)
        exact = exact.fit(data)
        approx = PCA(n_components=2, svd_solver='randomized',
                     random_state=rng)
        approx = approx.fit(data)
        assert_array_almost_equal(exact.singular_values_.fetch(),
                                  approx.singular_values_.fetch(),
                                  decimal=1)

        # The sum of squared singular values should equal the squared
        # Frobenius norm of the projected data.
        scores_exact = exact.transform(data)
        scores_approx = approx.transform(data)
        assert_array_almost_equal(
            mt.sum(exact.singular_values_**2.0).execute(),
            (mt.linalg.norm(scores_exact, "fro")**2.0).execute(),
            decimal=12)
        assert_array_almost_equal(
            mt.sum(approx.singular_values_**2.0).execute(),
            (mt.linalg.norm(scores_approx, "fro")**2.0).execute(),
            decimal=0)

        # Each singular value should equal the 2-norm of the matching
        # score column.
        assert_array_almost_equal(
            exact.singular_values_.fetch(),
            mt.sqrt(mt.sum(scores_exact**2.0, axis=0)).execute(),
            decimal=12)
        assert_array_almost_equal(
            approx.singular_values_.fetch(),
            mt.sqrt(mt.sum(scores_approx**2.0, axis=0)).execute(),
            decimal=2)

        # Second scenario: build data with prescribed singular values and
        # check that both solvers report them back.
        rng = np.random.RandomState(0)
        n_samples, n_features = 100, 110

        data = mt.tensor(rng.randn(n_samples, n_features))

        exact = PCA(n_components=3, svd_solver='full', random_state=rng)
        approx = PCA(n_components=3, svd_solver='randomized',
                     random_state=rng)
        scores = exact.fit_transform(data)

        # Normalise every score column to unit length, then rescale the
        # first two columns to the target singular values (third stays 1).
        scores /= mt.sqrt(mt.sum(scores**2.0, axis=0))
        scores[:, 0] *= 3.142
        scores[:, 1] *= 2.718

        rebuilt = mt.dot(scores, exact.components_)
        exact.fit(rebuilt)
        approx.fit(rebuilt)
        assert_array_almost_equal(exact.singular_values_.fetch(),
                                  [3.142, 2.718, 1.0],
                                  decimal=14)
        assert_array_almost_equal(approx.singular_values_.fetch(),
                                  [3.142, 2.718, 1.0],
                                  decimal=14)
Beispiel #3
0
    def testWhitening(self):
        # Whitened PCA scores must have unit variance and zero mean, while
        # unwhitened scores keep strongly differing variances.
        rng = np.random.RandomState(0)
        n_samples, n_features = 100, 80
        n_components = 30
        rank = 50

        # Low-rank data with correlated features, built as
        # (n_samples x rank) . diag(10 .. 1) . (rank x n_features).
        # The two random factors are drawn in this order on purpose: the
        # numeric thresholds below depend on the exact random stream.
        left_factor = rng.randn(n_samples, rank)
        weights = mt.diag(mt.linspace(10.0, 1.0, rank))
        right_factor = rng.randn(rank, n_features)
        X = mt.dot(left_factor, mt.dot(weights, right_factor))
        # scale the first 50 feature columns by 3 so that the spread of the
        # individual features differs markedly
        X[:, :50] *= 3

        self.assertEqual(X.shape, (n_samples, n_features))

        # the per-feature standard deviations are therefore far from uniform
        self.assertGreater(X.std(axis=0).std().execute(), 43.8)

        for solver, copy_flag in product(self.solver_list, (True, False)):
            # settings shared by the whitened and the unwhitened estimator
            shared = dict(n_components=n_components,
                          copy=copy_flag,
                          svd_solver=solver)

            # work on copies so the original X survives every iteration
            pristine = X.copy()
            whitener = PCA(whiten=True,
                           random_state=0,
                           iterated_power=7,
                           **shared)
            # fit_transform and a later transform must agree
            whitened_a = whitener.fit_transform(pristine.copy())
            self.assertEqual(whitened_a.shape, (n_samples, n_components))
            whitened_b = whitener.transform(pristine)
            assert_array_almost_equal(whitened_a.fetch(), whitened_b.fetch())

            unit_std = whitened_a.std(ddof=1, axis=0)
            assert_almost_equal(unit_std.execute(),
                                np.ones(n_components),
                                decimal=6)
            zero_mean = whitened_a.mean(axis=0)
            assert_almost_equal(zero_mean.execute(), np.zeros(n_components))

            pristine = X.copy()
            projector = PCA(whiten=False, **shared).fit(pristine)
            unwhitened = projector.transform(pristine)
            self.assertEqual(unwhitened.shape, (n_samples, n_components))

            # without whitening the component variances still vary widely
            spread = unwhitened.std(axis=0).std()
            assert_almost_equal(spread.execute(), 74.1, 1)
Beispiel #4
0
    def testExplainedVariance(self):
        # Check explained_variance_ and explained_variance_ratio_: the full
        # and the randomized solver must agree with each other, with the
        # variance of the projected data, and with the leading eigenvalues
        # of the empirical covariance matrix.
        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 80

        X = mt.tensor(rng.randn(n_samples, n_features))

        pca = PCA(n_components=2, svd_solver='full').fit(X)
        rpca = PCA(n_components=2, svd_solver='randomized',
                   random_state=42).fit(X)
        assert_array_almost_equal(pca.explained_variance_.execute(),
                                  rpca.explained_variance_.execute(), 1)
        assert_array_almost_equal(pca.explained_variance_ratio_.execute(),
                                  rpca.explained_variance_ratio_.execute(), 1)

        # Compare to empirical variances: the two largest eigenvalues of the
        # sample covariance matrix.  That matrix is symmetric, so eigvalsh is
        # used: it always yields real eigenvalues in ascending order, whereas
        # the general-purpose eig may return a complex result that cannot be
        # sorted.  Reversing and slicing gives the top two, largest first.
        covariance = np.cov(X.execute(), rowvar=False)
        expected_result = np.linalg.eigvalsh(covariance)[::-1][:2]

        # full solver: default (tight) tolerance
        X_pca = pca.transform(X)
        assert_array_almost_equal(pca.explained_variance_.execute(),
                                  mt.var(X_pca, ddof=1, axis=0).execute())
        assert_array_almost_equal(pca.explained_variance_.execute(),
                                  expected_result)

        # randomized solver: only required to match to one decimal
        X_rpca = rpca.transform(X)
        assert_array_almost_equal(rpca.explained_variance_.execute(),
                                  mt.var(X_rpca, ddof=1, axis=0).execute(),
                                  decimal=1)
        assert_array_almost_equal(rpca.explained_variance_.execute(),
                                  expected_result,
                                  decimal=1)

        # Same with correlated data
        X = datasets.make_classification(n_samples,
                                         n_features,
                                         n_informative=n_features - 2,
                                         random_state=rng)[0]
        X = mt.tensor(X)

        # default solver versus the randomized one on the correlated data
        pca = PCA(n_components=2).fit(X)
        rpca = PCA(n_components=2, svd_solver='randomized',
                   random_state=rng).fit(X)
        assert_array_almost_equal(pca.explained_variance_ratio_.execute(),
                                  rpca.explained_variance_ratio_.execute(), 5)
Beispiel #5
0
    def _check_pca_int_dtype_upcast_to_double(self, svd_solver):
        # Integer input of either width must produce float64 results.
        raw_ints = np.random.RandomState(0).randint(0, 1000, (1000, 4))
        ints_64 = mt.tensor(raw_ints).astype(np.int64, copy=False)
        ints_32 = ints_64.astype(np.int32, copy=False)

        fitted_64 = PCA(n_components=3,
                        svd_solver=svd_solver,
                        random_state=0).fit(ints_64)
        fitted_32 = PCA(n_components=3,
                        svd_solver=svd_solver,
                        random_state=0).fit(ints_32)

        # components first, then the transformed data, 64-bit before 32-bit
        for model in (fitted_64, fitted_32):
            self.assertEqual(model.components_.dtype, np.float64)
        for model, sample in ((fitted_64, ints_64), (fitted_32, ints_32)):
            self.assertEqual(model.transform(sample).dtype, np.float64)

        # both fits should find the same components to 5 decimals
        components_64 = fitted_64.components_.execute()
        components_32 = fitted_32.components_.execute()
        assert_array_almost_equal(components_64, components_32, decimal=5)
Beispiel #6
0
    def _check_pca_float_dtype_preservation(self, svd_solver):
        # float32 input must stay float32 (no upcast); float64 stays float64.
        raw_floats = np.random.RandomState(0).rand(1000, 4)
        floats_64 = mt.tensor(raw_floats.astype(np.float64, copy=False))
        floats_32 = floats_64.astype(np.float32)

        fitted_64 = PCA(n_components=3,
                        svd_solver=svd_solver,
                        random_state=0).fit(floats_64)
        fitted_32 = PCA(n_components=3,
                        svd_solver=svd_solver,
                        random_state=0).fit(floats_32)

        # components first, then the transformed data, 64-bit before 32-bit
        cases = ((fitted_64, floats_64, np.float64),
                 (fitted_32, floats_32, np.float32))
        for model, _, expected_dtype in cases:
            self.assertEqual(model.components_.dtype, expected_dtype)
        for model, sample, expected_dtype in cases:
            self.assertEqual(model.transform(sample).dtype, expected_dtype)

        # only 4 decimals are required here: decimal=5 fails on mac with
        # scipy = 1.1.0
        components_64 = fitted_64.components_.execute()
        components_32 = fitted_32.components_.execute()
        assert_array_almost_equal(components_64, components_32, decimal=4)
Beispiel #7
0
    def test_pca_inverse(self):
        # transform followed by inverse_transform should recover the input
        seeded = np.random.RandomState(0)
        n_rows, n_cols = 50, 3
        data = mt.tensor(seeded.randn(n_rows, n_cols))  # spherical data
        data[:, 1] *= .00001  # shrink the middle column to a tiny scale
        data += [5, 4, 3]  # move the mean far away from the origin

        # The data is almost of rank n_components, so projecting onto two
        # components and mapping back must reproduce it to 3 decimals.
        full_pca = PCA(n_components=2, svd_solver='full')
        full_pca = full_pca.fit(data)
        round_trip = full_pca.inverse_transform(full_pca.transform(data))
        assert_almost_equal(data.execute(), round_trip.execute(), decimal=3)

        # Repeat with whitening (approximate reconstruction) for every
        # solver listed on the test case.
        for solver in self.solver_list:
            whitened_pca = PCA(n_components=2, whiten=True, svd_solver=solver)
            whitened_pca.fit(data)
            projected = whitened_pca.transform(data)
            round_trip = whitened_pca.inverse_transform(projected)
            assert_almost_equal(data.execute(),
                                round_trip.execute(),
                                decimal=3)