def test_pca_singular_values(svd_solver):
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
    X_trans = pca.fit_transform(X)

    # Compare to the Frobenius norm
    assert_allclose(np.sum(pca.singular_values_**2),
                    np.linalg.norm(X_trans, "fro")**2)
    # Compare to the 2-norms of the score vectors
    assert_allclose(pca.singular_values_,
                    np.sqrt(np.sum(X_trans**2, axis=0)))

    # Set the singular values and see what we get back
    n_samples, n_features = 100, 110
    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng)
    X_trans = pca.fit_transform(X)
    X_trans /= np.sqrt(np.sum(X_trans**2, axis=0))
    X_trans[:, 0] *= 3.142
    X_trans[:, 1] *= 2.718
    X_hat = np.dot(X_trans, pca.components_)
    pca.fit(X_hat)
    assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0])

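# Hedged illustration (underscore-prefixed so pytest does not collect it): the
# identities asserted above follow from the SVD of the centered data. If
# X_c = U @ diag(S) @ Vt with orthonormal U, the scores are U @ diag(S), so each
# singular value equals the 2-norm of its score column and the squared singular
# values sum to the squared Frobenius norm of the scores. A minimal sketch
# using only numpy:
def _demo_singular_value_identities():
    rng = np.random.RandomState(0)
    X = rng.randn(50, 8)
    X_centered = X - X.mean(axis=0)
    U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
    scores = X_centered @ Vt.T  # equals U * S because Vt has orthonormal rows
    assert_allclose(np.sqrt(np.sum(scores**2, axis=0)), S)
    assert_allclose(np.sum(S**2), np.linalg.norm(scores, "fro")**2)
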
def test_pca_score_consistency_solvers(svd_solver):
    # Check the consistency of score between solvers
    X, _ = datasets.load_digits(return_X_y=True)
    pca_full = PCA(n_components=30, svd_solver='full', random_state=0)
    pca_other = PCA(n_components=30, svd_solver=svd_solver, random_state=0)
    pca_full.fit(X)
    pca_other.fit(X)
    assert_allclose(pca_full.score(X), pca_other.score(X), rtol=5e-6)

def test_n_components_mle(svd_solver):
    # Ensure that n_components == 'mle' doesn't raise error for auto/full
    rng = np.random.RandomState(0)
    n_samples, n_features = 600, 10
    X = rng.randn(n_samples, n_features)
    pca = PCA(n_components='mle', svd_solver=svd_solver)
    pca.fit(X)
    assert pca.n_components_ == 0

def test_pca_svd_solver_auto(data, n_components, expected_solver):
    pca_auto = PCA(n_components=n_components, random_state=0)
    pca_test = PCA(n_components=n_components, svd_solver=expected_solver,
                   random_state=0)
    pca_auto.fit(data)
    pca_test.fit(data)
    assert_allclose(pca_auto.components_, pca_test.components_)

def test_pca_sparse_input(svd_solver):
    X = np.random.RandomState(0).rand(5, 4)
    X = sp.sparse.csr_matrix(X)
    assert sp.sparse.issparse(X)

    pca = PCA(n_components=3, svd_solver=svd_solver)
    with pytest.raises(TypeError):
        pca.fit(X)

def test_pca_sanity_noise_variance(svd_solver):
    # Sanity check for the noise_variance_. For more details see
    # https://github.com/scikit-learn/scikit-learn/issues/7568
    # https://github.com/scikit-learn/scikit-learn/issues/8541
    # https://github.com/scikit-learn/scikit-learn/issues/8544
    X, _ = datasets.load_digits(return_X_y=True)
    pca = PCA(n_components=30, svd_solver=svd_solver, random_state=0)
    pca.fit(X)
    assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0)

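# Hedged sketch (illustration only, not collected by pytest): noise_variance_
# is documented as the average variance of the components that were not kept,
# which is why no retained explained_variance_ should fall below it. Assuming
# the same digits data and a full spectrum, the quantity it estimates can be
# reproduced as:
def _demo_noise_variance_from_full_spectrum(X, n_components=30):
    full_variances = PCA(svd_solver='full').fit(X).explained_variance_
    # Mean of the trailing (discarded) eigenvalues of the covariance matrix
    return full_variances[n_components:].mean()
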
def test_no_empty_slice_warning():
    # test if we avoid numpy warnings for computing over empty arrays
    n_components = 10
    n_features = n_components + 2  # anything > n_comps triggered it in 0.16
    X = np.random.uniform(-1, 1, size=(n_components, n_features))
    pca = PCA(n_components=n_components)
    with pytest.warns(None) as record:
        pca.fit(X)
    assert not record.list

def test_n_components_mle_error(svd_solver):
    # Ensure that n_components == 'mle' will raise an error for unsupported
    # solvers
    rng = np.random.RandomState(0)
    n_samples, n_features = 600, 10
    X = rng.randn(n_samples, n_features)
    pca = PCA(n_components='mle', svd_solver=svd_solver)
    err_msg = ("n_components='mle' cannot be a string with svd_solver='{}'"
               .format(svd_solver))
    with pytest.raises(ValueError, match=err_msg):
        pca.fit(X)

def test_infer_dim_3():
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 2

def test_infer_dim_2():
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 1

def test_infer_dim_1():
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = (rng.randn(n, p) * .1 + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2]) +
         np.array([1, 0, 7, 4, 6]))
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)])
    assert ll[1] > ll.max() - .01 * n

def test_pca_zero_noise_variance_edge_cases(svd_solver):
    # ensure that noise_variance_ is 0 in edge cases
    # when n_components == min(n_samples, n_features)
    n, p = 100, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])

    pca = PCA(n_components=p, svd_solver=svd_solver)
    pca.fit(X)
    assert pca.noise_variance_ == 0

    pca.fit(X.T)
    assert pca.noise_variance_ == 0

def test_pca_singular_values_consistency(svd_solver):
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca_full = PCA(n_components=2, svd_solver='full', random_state=rng)
    pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
    pca_full.fit(X)
    pca_other.fit(X)

    assert_allclose(pca_full.singular_values_, pca_other.singular_values_,
                    rtol=5e-3)

def test_singular_values():
    # Check that the IncrementalPCA output has the correct singular values
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(n_samples, n_features,
                                      tail_strength=0.0, effective_rank=10,
                                      random_state=rng)

    pca = PCA(n_components=10, svd_solver='full', random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    assert_array_almost_equal(np.sum(pca.singular_values_**2.0),
                              np.linalg.norm(X_pca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(ipca.singular_values_**2.0),
                              np.linalg.norm(X_ipca, "fro")**2.0, 2)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(pca.singular_values_,
                              np.sqrt(np.sum(X_pca**2.0, axis=0)), 12)
    assert_array_almost_equal(ipca.singular_values_,
                              np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(n_samples, n_features,
                                      tail_strength=0.0, effective_rank=3,
                                      random_state=rng)

    pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100)
    X_pca = pca.fit_transform(X)

    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    ipca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)

def test_pca_score3():
    # Check that probabilistic PCA selects the right model.
    # The data is isotropic noise plus a single strong rank-one direction,
    # so the held-out log-likelihood should peak at one component.
    n, p = 200, 3
    rng = np.random.RandomState(0)
    Xl = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
          np.array([1, 0, 7]))
    Xt = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
          np.array([1, 0, 7]))
    ll = np.zeros(p)
    for k in range(p):
        pca = PCA(n_components=k, svd_solver='full')
        pca.fit(Xl)
        ll[k] = pca.score(Xt)

    assert ll.argmax() == 1

def test_pca_vs_spca():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
    spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2)
    pca = PCA(n_components=2)
    pca.fit(Y)
    spca.fit(Y)
    results_test_pca = pca.transform(Z)
    results_test_spca = spca.transform(Z)
    assert_allclose(np.abs(spca.components_.dot(pca.components_.T)),
                    np.eye(2), atol=1e-5)
    # Components are defined only up to a sign; align the signs of the
    # transformed data before comparing the two methods.
    results_test_pca *= np.sign(results_test_pca[0, :])
    results_test_spca *= np.sign(results_test_spca[0, :])
    assert_allclose(results_test_pca, results_test_spca)

def test_pca_explained_variance_equivalence_solver(svd_solver):
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca_full = PCA(n_components=2, svd_solver='full')
    pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=0)
    pca_full.fit(X)
    pca_other.fit(X)

    assert_allclose(pca_full.explained_variance_,
                    pca_other.explained_variance_,
                    rtol=5e-2)
    assert_allclose(pca_full.explained_variance_ratio_,
                    pca_other.explained_variance_ratio_,
                    rtol=5e-2)

def test_pca_score(svd_solver):
    # Test that probabilistic PCA scoring yields a reasonable score
    n, p = 1000, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
    pca = PCA(n_components=2, svd_solver=svd_solver)
    pca.fit(X)

    ll1 = pca.score(X)
    h = -0.5 * np.log(2 * np.pi * np.exp(1) * 0.1**2) * p
    assert_allclose(ll1 / h, 1, rtol=5e-2)

    ll2 = pca.score(rng.randn(n, p) * .2 + np.array([3, 4, 5]))
    assert ll1 > ll2

    pca = PCA(n_components=2, whiten=True, svd_solver=svd_solver)
    pca.fit(X)
    ll2 = pca.score(X)
    assert ll1 > ll2

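# Hedged note (illustration only, not collected by pytest): the reference value
# `h` used above is the negative differential entropy of an isotropic
# p-dimensional Gaussian with standard deviation 0.1, i.e. the average
# per-sample log-likelihood one expects when the model matches the data. A
# numpy-only sanity sketch of that identity:
def _demo_isotropic_gaussian_loglik(sigma=0.1, p=3, n=100000, seed=0):
    rng = np.random.RandomState(seed)
    samples = rng.randn(n, p) * sigma
    # Average log-density of N(0, sigma^2 * I) evaluated at its own samples
    avg_loglik = np.mean(-0.5 * p * np.log(2 * np.pi * sigma**2)
                         - 0.5 * np.sum(samples**2, axis=1) / sigma**2)
    expected = -0.5 * np.log(2 * np.pi * np.e * sigma**2) * p
    assert_allclose(avg_loglik, expected, rtol=1e-2)
    return avg_loglik, expected
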
def test_pca_validation(svd_solver, data, n_components, err_msg):
    # Ensures that solver-specific extreme inputs for the n_components
    # parameter raise errors
    smallest_d = 2  # The smallest dimension
    lower_limit = {'randomized': 1, 'arpack': 1, 'full': 0, 'auto': 0}
    pca_fitted = PCA(n_components, svd_solver=svd_solver)

    solver_reported = 'full' if svd_solver == 'auto' else svd_solver
    err_msg = err_msg.format(n_components, lower_limit[svd_solver],
                             smallest_d, solver_reported)
    with pytest.raises(ValueError, match=err_msg):
        pca_fitted.fit(data)

    # Additional case for arpack
    if svd_solver == 'arpack':
        n_components = smallest_d
        err_msg = ("n_components={}L? must be strictly less than "
                   r"min\(n_samples, n_features\)={}L? with "
                   "svd_solver='arpack'".format(n_components, smallest_d))
        with pytest.raises(ValueError, match=err_msg):
            PCA(n_components, svd_solver=svd_solver).fit(data)

def test_pca(svd_solver, n_components):
    X = iris.data
    pca = PCA(n_components=n_components, svd_solver=svd_solver)

    # check the shape of fit.transform
    X_r = pca.fit(X).transform(X)
    assert X_r.shape[1] == n_components

    # check the equivalence of fit.transform and fit_transform
    X_r2 = pca.fit_transform(X)
    assert_allclose(X_r, X_r2)
    X_r = pca.transform(X)
    assert_allclose(X_r, X_r2)

    # Test get_covariance and get_precision
    cov = pca.get_covariance()
    precision = pca.get_precision()
    assert_allclose(np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-12)

def test_pca_bad_solver():
    X = np.random.RandomState(0).rand(5, 4)
    pca = PCA(n_components=3, svd_solver='bad_argument')
    with pytest.raises(ValueError):
        pca.fit(X)

def test_n_components_none(data, solver, n_components_):
    pca = PCA(svd_solver=solver)
    pca.fit(data)
    assert pca.n_components_ == n_components_

def test_infer_dim_by_explained_variance(X, n_components,
                                         n_components_validated):
    pca = PCA(n_components=n_components, svd_solver='full')
    pca.fit(X)
    assert pca.n_components == pytest.approx(n_components)
    assert pca.n_components_ == n_components_validated

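# Hedged sketch (illustration only, not collected by pytest): when
# 0 < n_components < 1, the fitted n_components_ is the smallest number of
# components whose cumulative explained variance ratio exceeds that fraction.
# Assuming a full PCA fit, the mapping can be reproduced as follows:
def _demo_components_for_variance_fraction(X, fraction):
    ratios = PCA(svd_solver='full').fit(X).explained_variance_ratio_
    # First index whose cumulative sum strictly exceeds `fraction`, plus one
    return np.searchsorted(np.cumsum(ratios), fraction, side='right') + 1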