def test_pca_bad_solver():
    """An unrecognized ``svd_solver`` string must raise ``ValueError`` at fit time."""
    data = np.random.RandomState(0).rand(5, 4)
    estimator = dd.PCA(n_components=3, svd_solver="bad_argument")
    assert_raises(ValueError, estimator.fit, data)
def test_whitening():
    """PCA with ``whiten=True`` must produce unit-variance, zero-mean components."""
    random_state = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    n_components = 30
    rank = 50

    # Low-rank data with correlated features: (100, 50) factor times a
    # scaled (50, 80) factor, with singular values sloping from 10 down to 1.
    low_rank_factor = random_state.randn(n_samples, rank)
    scales = np.diag(np.linspace(10.0, 1.0, rank))
    X = low_rank_factor @ (scales @ random_state.randn(rank, n_features))

    # Inflate the first 50 features so the per-feature variance is highly
    # non-uniform (roughly 3x the mean variance of the remaining 30).
    X[:, :50] *= 3
    assert X.shape == (n_samples, n_features)
    assert X.std(axis=0).std() > 43.8

    dX = da.from_array(X, chunks=(50, n_features))

    for solver, copy in product(solver_list, (True, False)):
        # Fresh copy each iteration so no solver sees mutated input.
        X_ = dX.copy()

        # Whitened projection onto the lower-dimensional subspace.
        whitening_pca = dd.PCA(
            n_components=n_components,
            whiten=True,
            copy=copy,
            svd_solver=solver,
            random_state=0,
            iterated_power=4,
        )
        X_whitened = whitening_pca.fit_transform(X_.copy())
        assert X_whitened.shape == (n_samples, n_components)
        # NOTE: transform() output is not compared against fit_transform()
        # here because the two differ for the randomized solver.
        assert_almost_equal(
            X_whitened.std(ddof=1, axis=0), np.ones(n_components), decimal=6
        )
        assert_almost_equal(X_whitened.mean(axis=0), np.zeros(n_components))

        # Without whitening, the projected components keep varying variances.
        X_ = dX.copy()
        plain_pca = dd.PCA(
            n_components=n_components,
            whiten=False,
            copy=copy,
            svd_solver=solver,
            random_state=0,
        ).fit(X_)
        X_unwhitened = plain_pca.transform(X_)
        assert X_unwhitened.shape == (n_samples, n_components)
        assert_almost_equal(X_unwhitened.std(axis=0).std(), 74.1, 1)
def test_basic():
    """Fitting the dask PCA must yield the same estimator state as scikit-learn's."""
    dask_pca = dd.PCA()
    sklearn_pca = sd.PCA()
    dask_pca.fit(dX)
    sklearn_pca.fit(X)
    assert_estimator_equal(dask_pca, sklearn_pca)