def test_pca_singular_values(svd_solver):
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
    X_trans = pca.fit_transform(X)

    # compare to the Frobenius norm
    assert_allclose(np.sum(pca.singular_values_**2),
                    np.linalg.norm(X_trans, "fro")**2)
    # compare to the 2-norms of the score vectors
    assert_allclose(pca.singular_values_,
                    np.sqrt(np.sum(X_trans**2, axis=0)))

    # set the singular values and see what we get back
    n_samples, n_features = 100, 110
    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng)
    X_trans = pca.fit_transform(X)
    X_trans /= np.sqrt(np.sum(X_trans**2, axis=0))
    X_trans[:, 0] *= 3.142
    X_trans[:, 1] *= 2.718
    X_hat = np.dot(X_trans, pca.components_)
    pca.fit(X_hat)
    assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0])

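# A minimal illustrative sketch (not part of the suite; the helper name
# `_svd_score_identities` is hypothetical) of the two identities exercised
# above, assuming a thin SVD of the centered data: the PCA scores are
# T = U @ diag(S), so each singular value is the 2-norm of a score column
# and sum(S**2) equals the squared Frobenius norm of T.
def _svd_score_identities(X):
    Xc = X - X.mean(axis=0)  # PCA centers the data before the SVD
    U, S, _ = np.linalg.svd(Xc, full_matrices=False)
    T = U * S  # score matrix, equivalent to U @ np.diag(S)
    assert np.allclose(S, np.sqrt(np.sum(T**2, axis=0)))
    assert np.allclose(np.sum(S**2), np.linalg.norm(T, "fro")**2)
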
def test_pca_score_consistency_solvers(svd_solver):
    # Check the consistency of score between solvers
    X, _ = datasets.load_digits(return_X_y=True)
    pca_full = PCA(n_components=30, svd_solver='full', random_state=0)
    pca_other = PCA(n_components=30, svd_solver=svd_solver, random_state=0)
    pca_full.fit(X)
    pca_other.fit(X)
    assert_allclose(pca_full.score(X), pca_other.score(X), rtol=5e-6)

def test_pca_svd_solver_auto(data, n_components, expected_solver):
    pca_auto = PCA(n_components=n_components, random_state=0)
    pca_test = PCA(n_components=n_components, svd_solver=expected_solver,
                   random_state=0)
    pca_auto.fit(data)
    pca_test.fit(data)
    assert_allclose(pca_auto.components_, pca_test.components_)

def test_whitening(solver, copy):
    # Check that PCA output has unit variance
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80
    n_components = 30
    rank = 50

    # some low rank data with correlated features
    X = np.dot(rng.randn(n_samples, rank),
               np.dot(np.diag(np.linspace(10.0, 1.0, rank)),
                      rng.randn(rank, n_features)))

    # inflate the first 50 features: their standard deviations are scaled
    # by 3 (their variances by 9) relative to the remaining 30 features
    X[:, :50] *= 3

    assert X.shape == (n_samples, n_features)

    # the component-wise variance is thus highly varying:
    assert X.std(axis=0).std() > 43.8

    # whiten the data while projecting to the lower dim subspace
    X_ = X.copy()  # make sure we keep an original across iterations.
    pca = PCA(n_components=n_components, whiten=True, copy=copy,
              svd_solver=solver, random_state=0, iterated_power=7)
    # test fit_transform
    X_whitened = pca.fit_transform(X_.copy())
    assert X_whitened.shape == (n_samples, n_components)
    X_whitened2 = pca.transform(X_)
    assert_allclose(X_whitened, X_whitened2, rtol=5e-4)

    assert_allclose(X_whitened.std(ddof=1, axis=0), np.ones(n_components))
    assert_allclose(X_whitened.mean(axis=0), np.zeros(n_components),
                    atol=1e-12)

    X_ = X.copy()
    pca = PCA(n_components=n_components, whiten=False, copy=copy,
              svd_solver=solver).fit(X_)
    X_unwhitened = pca.transform(X_)
    assert X_unwhitened.shape == (n_samples, n_components)

    # in that case the output components still have varying variances
    assert X_unwhitened.std(axis=0).std() == pytest.approx(74.1, rel=1e-1)

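# A minimal sketch (textbook SVD view of whitening; the helper is
# illustrative, not sklearn internals) of why the whitened output above has
# unit variance: with centered data Xc = U S Vt, whitening rescales the
# scores U S to U * sqrt(n_samples - 1), whose leading columns are centered,
# unit-norm vectors and therefore have unbiased standard deviation exactly 1.
def _whitened_scores(X, n_components):
    Xc = X - X.mean(axis=0)
    U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
    # keep leading components only, so every kept column has S > 0
    W = U[:, :n_components] * np.sqrt(X.shape[0] - 1)
    assert np.allclose(W.std(ddof=1, axis=0), 1.0)
    return W
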
def test_pca_singular_values_consistency(svd_solver):
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca_full = PCA(n_components=2, svd_solver='full', random_state=rng)
    pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
    pca_full.fit(X)
    pca_other.fit(X)
    assert_allclose(pca_full.singular_values_, pca_other.singular_values_,
                    rtol=5e-3)

def test_singular_values():
    # Check that the PCA and IncrementalPCA outputs have the correct
    # singular values
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(n_samples, n_features,
                                      tail_strength=0.0, effective_rank=10,
                                      random_state=rng)

    pca = PCA(n_components=10, svd_solver='full', random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    assert_array_almost_equal(np.sum(pca.singular_values_**2.0),
                              np.linalg.norm(X_pca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(ipca.singular_values_**2.0),
                              np.linalg.norm(X_ipca, "fro")**2.0, 2)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(pca.singular_values_,
                              np.sqrt(np.sum(X_pca**2.0, axis=0)), 12)
    assert_array_almost_equal(ipca.singular_values_,
                              np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(n_samples, n_features,
                                      tail_strength=0.0, effective_rank=3,
                                      random_state=rng)

    pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100)

    X_pca = pca.fit_transform(X)
    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    ipca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)

def test_feature_union_weights():
    # test feature union with transformer weights
    X = iris.data
    y = iris.target
    pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)

    # check against expected result
    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert X_fit_transformed_wo_method.shape == (X.shape[0], 7)

def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2, svd_solver='randomized', whiten=True)
    clf = SVC(probability=True, random_state=0,
              decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert predict.shape == (n_samples,)

        proba = pipe.predict_proba(X)
        assert proba.shape == (n_samples, n_classes)

        log_proba = pipe.predict_log_proba(X)
        assert log_proba.shape == (n_samples, n_classes)

        decision_function = pipe.decision_function(X)
        assert decision_function.shape == (n_samples, n_classes)

        pipe.score(X, y)

def test_incremental_pca_sparse(matrix_class):
    # Incremental PCA on sparse arrays.
    X = iris.data
    pca = PCA(n_components=2)
    pca.fit_transform(X)
    X_sparse = matrix_class(X)
    batch_size = X_sparse.shape[0] // 3
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)

    X_transformed = ipca.fit_transform(X_sparse)

    assert X_transformed.shape == (X_sparse.shape[0], 2)
    np.testing.assert_allclose(ipca.explained_variance_ratio_.sum(),
                               pca.explained_variance_ratio_.sum(),
                               rtol=1e-3)

    for n_components in [1, 2, X.shape[1]]:
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(X_sparse)
        cov = ipca.get_covariance()
        precision = ipca.get_precision()
        np.testing.assert_allclose(np.dot(cov, precision),
                                   np.eye(X_sparse.shape[1]), atol=1e-13)

    with pytest.raises(
            TypeError,
            match="IncrementalPCA.partial_fit does not support "
                  "sparse input. Either convert data to dense "
                  "or use IncrementalPCA.fit to do so in batches."):
        ipca.partial_fit(X_sparse)

def test_make_union():
    pca = PCA(svd_solver='full')
    mock = Transf()
    fu = make_union(pca, mock)
    names, transformers = zip(*fu.transformer_list)
    assert names == ("pca", "transf")
    assert transformers == (pca, mock)

def test_n_components_mle(svd_solver):
    # Ensure that n_components == 'mle' doesn't raise error for auto/full
    rng = np.random.RandomState(0)
    n_samples, n_features = 600, 10
    X = rng.randn(n_samples, n_features)
    pca = PCA(n_components='mle', svd_solver=svd_solver)
    pca.fit(X)
    # on pure i.i.d. Gaussian noise the MLE retains no components
    assert pca.n_components_ == 0

def test_pca_explained_variance_equivalence_solver(svd_solver):
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca_full = PCA(n_components=2, svd_solver='full')
    pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=0)
    pca_full.fit(X)
    pca_other.fit(X)

    assert_allclose(pca_full.explained_variance_,
                    pca_other.explained_variance_, rtol=5e-2)
    assert_allclose(pca_full.explained_variance_ratio_,
                    pca_other.explained_variance_ratio_, rtol=5e-2)

def test_pca_sparse_input(svd_solver):
    X = np.random.RandomState(0).rand(5, 4)
    X = sp.sparse.csr_matrix(X)
    assert sp.sparse.issparse(X)

    pca = PCA(n_components=3, svd_solver=svd_solver)
    with pytest.raises(TypeError):
        pca.fit(X)

def test_pca_explained_variance_empirical(X, svd_solver):
    pca = PCA(n_components=2, svd_solver=svd_solver, random_state=0)
    X_pca = pca.fit_transform(X)
    assert_allclose(pca.explained_variance_, np.var(X_pca, ddof=1, axis=0))

    expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0]
    expected_result = sorted(expected_result, reverse=True)[:2]
    assert_allclose(pca.explained_variance_, expected_result, rtol=5e-3)

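# A minimal sketch (standard PCA algebra; the helper is illustrative, not
# sklearn internals) of the equivalence tested above: explained_variance_
# corresponds to S**2 / (n_samples - 1), i.e. the descending eigenvalues of
# the unbiased sample covariance matrix.
def _explained_variance_from_svd(X, n_components):
    Xc = X - X.mean(axis=0)
    S = np.linalg.svd(Xc, compute_uv=False)
    var = (S**2 / (X.shape[0] - 1))[:n_components]
    # np.linalg.eigh returns ascending eigenvalues; reverse to descending
    evals = np.linalg.eigh(np.cov(X, rowvar=False))[0][::-1][:n_components]
    assert np.allclose(var, evals)
    return var
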
def check_pca_int_dtype_upcast_to_double(svd_solver):
    # Ensure that all int types will be upcast to float64
    X_i64 = np.random.RandomState(0).randint(0, 1000, (1000, 4))
    X_i64 = X_i64.astype(np.int64, copy=False)
    X_i32 = X_i64.astype(np.int32, copy=False)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i64)
    pca_32 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float64
    assert pca_64.transform(X_i64).dtype == np.float64
    assert pca_32.transform(X_i32).dtype == np.float64

    assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-4)

def test_pca_check_projection_list(svd_solver):
    # Test that the projection of data is correct
    X = [[1.0, 0.0], [0.0, 1.0]]
    pca = PCA(n_components=1, svd_solver=svd_solver, random_state=0)
    X_trans = pca.fit_transform(X)
    assert X_trans.shape == (2, 1)
    assert_allclose(X_trans.mean(), 0.00, atol=1e-12)
    # the 1-D projections of the two points are +/- 1/sqrt(2) ~= 0.71
    assert_allclose(X_trans.std(), 0.71, rtol=5e-3)

def test_incremental_pca_against_pca_iris():
    # Test that IncrementalPCA and PCA are approximate (to a sign flip).
    X = iris.data

    Y_pca = PCA(n_components=2).fit_transform(X)
    Y_ipca = IncrementalPCA(n_components=2, batch_size=25).fit_transform(X)

    assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)

def test_pca_dim():
    # Check automated dimensionality setting
    rng = np.random.RandomState(0)
    n, p = 100, 5
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    pca = PCA(n_components='mle', svd_solver='full').fit(X)
    assert pca.n_components == 'mle'
    assert pca.n_components_ == 1

def test_no_empty_slice_warning():
    # test if we avoid numpy warnings for computing over empty arrays
    n_components = 10
    n_features = n_components + 2  # anything > n_comps triggered it in 0.16
    X = np.random.uniform(-1, 1, size=(n_components, n_features))
    pca = PCA(n_components=n_components)
    with pytest.warns(None) as record:
        pca.fit(X)
    assert not record.list

def test_fit_predict_on_pipeline_without_fit_predict():
    # tests that a pipeline does not have fit_predict method when final
    # step of pipeline does not have fit_predict defined
    scaler = StandardScaler()
    pca = PCA(svd_solver='full')
    pipe = Pipeline([('scaler', scaler), ('pca', pca)])
    assert_raises_regex(AttributeError,
                        "'PCA' object has no attribute 'fit_predict'",
                        getattr, pipe, 'fit_predict')

def test_pca_sanity_noise_variance(svd_solver):
    # Sanity check for the noise_variance_. For more details see
    # https://github.com/scikit-learn/scikit-learn/issues/7568
    # https://github.com/scikit-learn/scikit-learn/issues/8541
    # https://github.com/scikit-learn/scikit-learn/issues/8544
    X, _ = datasets.load_digits(return_X_y=True)
    pca = PCA(n_components=30, svd_solver=svd_solver, random_state=0)
    pca.fit(X)
    assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0)

def test_pca_score(svd_solver):
    # Test that probabilistic PCA scoring yields a reasonable score
    n, p = 1000, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
    pca = PCA(n_components=2, svd_solver=svd_solver)
    pca.fit(X)

    ll1 = pca.score(X)
    h = -0.5 * np.log(2 * np.pi * np.exp(1) * 0.1**2) * p
    assert_allclose(ll1 / h, 1, rtol=5e-2)

    ll2 = pca.score(rng.randn(n, p) * .2 + np.array([3, 4, 5]))
    assert ll1 > ll2

    pca = PCA(n_components=2, whiten=True, svd_solver=svd_solver)
    pca.fit(X)
    ll2 = pca.score(X)
    assert ll1 > ll2

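# A brief illustrative aside (not from the suite; the helper name is
# hypothetical) on the reference value `h` above: for samples drawn from an
# isotropic Gaussian with scale sigma in p dimensions, the expected
# per-sample log-likelihood under the true model is the negative
# differential entropy,
#     E[log N(x; mu, sigma**2 * I)] = -(p / 2) * log(2 * pi * e * sigma**2),
# which is what `h` evaluates with sigma = 0.1 and p = 3, so score(X) / h
# should approach 1 for large n.
def _expected_gaussian_loglik(sigma, p):
    return -0.5 * p * np.log(2 * np.pi * np.exp(1) * sigma**2)
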
def test_pca_deterministic_output(svd_solver):
    rng = np.random.RandomState(0)
    X = rng.rand(10, 10)

    transformed_X = np.zeros((20, 2))
    for i in range(20):
        pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
        transformed_X[i, :] = pca.fit_transform(X)[0]
    assert_allclose(transformed_X,
                    np.tile(transformed_X[0, :], 20).reshape(20, 2))

def test_infer_dim_3():
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 2

def test_n_components_mle_error(svd_solver):
    # Ensure that n_components == 'mle' will raise an error for unsupported
    # solvers
    rng = np.random.RandomState(0)
    n_samples, n_features = 600, 10
    X = rng.randn(n_samples, n_features)
    pca = PCA(n_components='mle', svd_solver=svd_solver)
    err_msg = ("n_components='mle' cannot be a string with svd_solver='{}'"
               .format(svd_solver))
    with pytest.raises(ValueError, match=err_msg):
        pca.fit(X)

def test_incremental_pca_against_pca_random_data():
    # Test that IncrementalPCA and PCA are approximate (to a sign flip).
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 3
    X = rng.randn(n_samples, n_features) + 5 * rng.rand(1, n_features)

    Y_pca = PCA(n_components=3).fit_transform(X)
    Y_ipca = IncrementalPCA(n_components=3, batch_size=25).fit_transform(X)

    assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)

def check_pca_float_dtype_preservation(svd_solver):
    # Ensure that PCA does not upcast the dtype when input is float32
    X_64 = np.random.RandomState(0).rand(1000, 4).astype(np.float64,
                                                         copy=False)
    X_32 = X_64.astype(np.float32)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_64)
    pca_32 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float32
    assert pca_64.transform(X_64).dtype == np.float64
    assert pca_32.transform(X_32).dtype == np.float32

    # the rtol is set such that the test passes on all platforms tested on
    # conda-forge: PR#15775
    # see: https://github.com/conda-forge/scikit-learn-feedstock/pull/113
    assert_allclose(pca_64.components_, pca_32.components_, rtol=2e-4)

def test_infer_dim_2():
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 1

def test_kernel_pca_linear_kernel():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    # for a linear kernel, kernel PCA should find the same projection as PCA
    # modulo the sign (direction)
    # fit only the first four components: fifth is near zero eigenvalue, so
    # can be trimmed due to roundoff error
    assert_array_almost_equal(
        np.abs(KernelPCA(4).fit(X_fit).transform(X_pred)),
        np.abs(PCA(4).fit(X_fit).transform(X_pred)))

def test_infer_dim_1():
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = (rng.randn(n, p) * .1 + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2])
         + np.array([1, 0, 7, 4, 6]))
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)])
    assert ll[1] > ll.max() - .01 * n