def test_incremental_pca(svd_solver):
    """Exercise IncrementalPCA on the dense iris data wrapped in a dask array.

    Checks, in order:

    * ``fit_transform`` with two components returns one row per sample and
      two columns;
    * the summed explained-variance ratio agrees with ``PCA`` (rtol=1e-3);
    * for several component counts, ``get_covariance() @ get_precision()``
      is the identity (atol=1e-13);
    * the fitted attributes of ``PCA`` are instances of the corresponding
      IncrementalPCA attribute types.

    ``svd_solver`` is supplied by pytest (parametrization is not visible in
    this chunk) and is forwarded to the first pair of estimators only.
    """
    data = da.from_array(iris.data, chunks=(3, -1))
    n_samples, n_features = data.shape[0], data.shape[1]
    batch_size = n_samples // 3

    ipca = IncrementalPCA(n_components=2, batch_size=batch_size, svd_solver=svd_solver)
    pca = PCA(n_components=2, svd_solver=svd_solver)
    pca.fit_transform(data)

    projected = ipca.fit_transform(data)
    assert projected.shape == (n_samples, 2)

    np.testing.assert_allclose(
        ipca.explained_variance_ratio_.sum(),
        pca.explained_variance_ratio_.sum(),
        rtol=1e-3,
    )

    # Covariance and precision must be mutual inverses for every rank,
    # including the full-rank case. These estimators use the default solver.
    identity = np.eye(n_features)
    for n_components in (1, 2, n_features):
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(data)
        product = np.dot(ipca.get_covariance(), ipca.get_precision())
        np.testing.assert_allclose(product, identity, atol=1e-13)

    # `ipca` is now the last (full-rank) estimator from the loop above.
    for attr in (
        "singular_values_",
        "mean_",
        "explained_variance_",
        "explained_variance_ratio_",
    ):
        assert isinstance(getattr(pca, attr), type(getattr(ipca, attr)))
def test_compare_with_sklearn(svd_solver, batch_number):
    """Compare the dask IncrementalPCA against ``sd.IncrementalPCA`` on iris.

    Both estimators are fitted with two components and the same batch size
    (``n_samples // batch_number``); the reference one sees the in-memory
    array, the dask one sees the same data chunked three rows at a time.
    ``components_``, ``explained_variance_`` and ``explained_variance_ratio_``
    must agree to ``atol=1e-13``. ``noise_variance_`` is held to the same
    tolerance except for the randomized solver, where only one decimal is
    required.

    ``svd_solver`` and ``batch_number`` are supplied by pytest
    (parametrization is not visible in this chunk). ``sd`` is presumably
    ``sklearn.decomposition`` -- confirm against the file's imports.
    """
    X = iris.data
    X_da = da.from_array(X, chunks=(3, -1))
    batch_size = X.shape[0] // batch_number

    ipca = sd.IncrementalPCA(n_components=2, batch_size=batch_size)
    ipca.fit(X)

    ipca_da = IncrementalPCA(
        n_components=2, batch_size=batch_size, svd_solver=svd_solver
    )
    ipca_da.fit(X_da)

    # Each attribute is compared exactly once; the original repeated the
    # explained_variance_ check verbatim.
    for attr in ("components_", "explained_variance_", "explained_variance_ratio_"):
        np.testing.assert_allclose(
            getattr(ipca, attr), getattr(ipca_da, attr), atol=1e-13
        )

    if svd_solver == "randomized":
        # noise variance in randomized solver is probabilistic.
        assert_almost_equal(ipca.noise_variance_, ipca_da.noise_variance_, decimal=1)
    else:
        np.testing.assert_allclose(
            ipca.noise_variance_, ipca_da.noise_variance_, atol=1e-13
        )
def test_incremental_pca_set_params():
    """Check that ``partial_fit`` rejects a changed ``n_components``.

    An estimator is fitted with 20 components. After ``set_params`` lowers
    (10) or raises (15) ``n_components``, ``partial_fit`` must raise
    ``ValueError``; restoring the original value (20) makes ``partial_fit``
    succeed again.
    """
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 20

    # Draw the three random matrices first, in order, then wrap them in dask.
    raw = [rng.randn(n_samples, n_features) for _ in range(3)]
    X, X2, X3 = (da.from_array(arr, chunks=[4, -1]) for arr in raw)

    ipca = IncrementalPCA(n_components=20)
    ipca.fit(X)

    # First a decreased, then an increased number of components.
    for changed_components, batch in ((10, X2), (15, X3)):
        ipca.set_params(n_components=changed_components)
        with pytest.raises(ValueError):
            ipca.partial_fit(batch)

    # Returning to the original setting must work.
    ipca.set_params(n_components=20)
    ipca.partial_fit(X)
def test_singular_values(svd_solver):
    """Check that IncrementalPCA reports the correct singular values.

    Part one fits ``PCA`` and ``IncrementalPCA`` on a low-rank matrix
    (``effective_rank=10``) and compares their ``singular_values_`` with each
    other, with the squared Frobenius norm of the transformed data, and with
    the 2-norms of the individual score vectors.

    Part two constructs data whose singular values are set by hand to
    3.142, 2.718 and 1.0 and checks that both estimators return them.

    ``svd_solver`` is supplied by pytest (parametrization is not visible in
    this chunk).
    """
    # ---- Part one: consistency between PCA, IncrementalPCA and the scores.
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    low_rank = datasets.make_low_rank_matrix(
        n_samples, n_features, tail_strength=0.0, effective_rank=10, random_state=rng
    )
    X = da.from_array(low_rank, chunks=[200, -1])

    # PCA receives the same `rng` object that generated the data, so the
    # order of these statements is significant.
    pca = PCA(n_components=10, svd_solver=svd_solver, random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100, svd_solver=svd_solver).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    scores_pca = pca.transform(X)
    scores_ipca = ipca.transform(X)

    pca_energy = np.sum(pca.singular_values_ ** 2.0)
    pca_fro_sq = np.linalg.norm(scores_pca, "fro") ** 2.0
    assert_array_almost_equal(pca_energy, pca_fro_sq, 12)

    ipca_energy = np.sum(ipca.singular_values_ ** 2.0)
    ipca_fro_sq = np.linalg.norm(scores_ipca, "fro") ** 2.0
    assert_array_almost_equal(ipca_energy, ipca_fro_sq, 2)

    # Compare to the 2-norms of the score vectors
    pca_col_norms = np.sqrt(np.sum(scores_pca ** 2.0, axis=0))
    assert_array_almost_equal(pca.singular_values_, pca_col_norms, 12)

    ipca_col_norms = np.sqrt(np.sum(scores_ipca ** 2.0, axis=0))
    assert_array_almost_equal(ipca.singular_values_, ipca_col_norms, 2)

    # ---- Part two: set the singular values and see what we get back.
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    low_rank = datasets.make_low_rank_matrix(
        n_samples, n_features, tail_strength=0.0, effective_rank=3, random_state=rng
    )
    X = da.from_array(low_rank, chunks=[4, -1])

    pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100, svd_solver=svd_solver)

    scores = pca.fit_transform(X)
    # Normalise every score column to unit 2-norm, then rescale the first
    # two columns so the column norms become 3.142, 2.718 and 1.0.
    scores /= np.sqrt(np.sum(scores ** 2.0, axis=0))
    scores[:, 0] *= 3.142
    scores[:, 1] *= 2.718

    # Map the rescaled scores back through the fitted components and refit.
    rebuilt = np.dot(scores, pca.components_)
    pca.fit(rebuilt)
    rebuilt = da.from_array(rebuilt, chunks=(4, -1))
    ipca.fit(rebuilt)

    expected = [3.142, 2.718, 1.0]
    assert_array_almost_equal(pca.singular_values_, expected, 14)
    assert_array_almost_equal(ipca.singular_values_, expected, 14)
def test_incremental_pca_num_features_change():
    """Check that ``partial_fit`` raises when the number of features changes.

    The estimator is fitted on data with 20 features; a subsequent
    ``partial_fit`` on data with 50 features must raise ``ValueError``.
    """
    rng = np.random.RandomState(1999)
    n_samples = 100

    # Draw both matrices before wrapping so the random stream is consumed
    # in the same order: 20-feature data first, 50-feature data second.
    narrow = rng.randn(n_samples, 20)
    wide = rng.randn(n_samples, 50)
    X = da.from_array(narrow, chunks=[4, -1])
    X2 = da.from_array(wide, chunks=[4, -1])

    ipca = IncrementalPCA(n_components=None)
    ipca.fit(X)

    with pytest.raises(ValueError):
        ipca.partial_fit(X2)