Esempio n. 1
0
    def testExplainedVariance(self):
        """Check ``explained_variance_`` for the full and randomized solvers.

        A two-component PCA is fitted with ``svd_solver='full'`` and with
        ``svd_solver='randomized'`` on the same random data. The test checks
        that:

        * both solvers report approximately the same ``explained_variance_``
          and ``explained_variance_ratio_`` (1 decimal),
        * each solver's ``explained_variance_`` matches the sample variance
          (``ddof=1``) of the data it transforms, and
        * it matches the two largest eigenvalues of the sample covariance
          matrix of the input.

        The ratio comparison between the two fits is then repeated on data
        generated by ``datasets.make_classification``.
        """
        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 80

        X = mt.tensor(rng.randn(n_samples, n_features))

        pca = PCA(n_components=2, svd_solver='full').fit(X)
        rpca = PCA(n_components=2, svd_solver='randomized',
                   random_state=42).fit(X)
        assert_array_almost_equal(pca.explained_variance_.to_numpy(),
                                  rpca.explained_variance_.to_numpy(), 1)
        assert_array_almost_equal(pca.explained_variance_ratio_.to_numpy(),
                                  rpca.explained_variance_ratio_.to_numpy(), 1)

        # Compare to empirical variances.
        # A covariance matrix is symmetric, so use the symmetric solver:
        # eigvalsh always yields real eigenvalues in ascending order, whereas
        # the general np.linalg.eig may return a complex-typed array, which
        # sorted() cannot order.
        eigenvalues = np.linalg.eigvalsh(np.cov(X.to_numpy(), rowvar=False))
        expected_result = eigenvalues[::-1][:2]  # two largest, descending

        X_pca = pca.transform(X)
        assert_array_almost_equal(pca.explained_variance_.to_numpy(),
                                  mt.var(X_pca, ddof=1, axis=0).to_numpy())
        assert_array_almost_equal(pca.explained_variance_.to_numpy(),
                                  expected_result)

        # The randomized solver is only approximate, hence the looser
        # one-decimal tolerance.
        X_rpca = rpca.transform(X)
        assert_array_almost_equal(rpca.explained_variance_.to_numpy(),
                                  mt.var(X_rpca, ddof=1, axis=0).to_numpy(),
                                  decimal=1)
        assert_array_almost_equal(rpca.explained_variance_.to_numpy(),
                                  expected_result,
                                  decimal=1)

        # Same with correlated data
        X = datasets.make_classification(n_samples,
                                         n_features,
                                         n_informative=n_features - 2,
                                         random_state=rng)[0]
        X = mt.tensor(X)

        pca = PCA(n_components=2).fit(X)
        rpca = PCA(n_components=2, svd_solver='randomized',
                   random_state=rng).fit(X)
        assert_array_almost_equal(pca.explained_variance_ratio_.to_numpy(),
                                  rpca.explained_variance_ratio_.to_numpy(), 5)
Esempio n. 2
0
def test_explained_variance(setup):
    """Check ``explained_variance_ratio_`` of randomized ``TruncatedSVD``.

    Models with 10 and 20 components are fitted on the module-level ``X``
    and on its dense form ``X.toarray()``. The test then checks that:

    * the 10-component ratios agree with the first 10 ratios of the
      20-component model (4 decimals),
    * 20 components explain more total variance than 10,
    * every ratio is positive and each model's ratios sum to less than 1,
    * the sparse and dense fits agree, and
    * the reported ratios equal the per-component variance of the
      transformed data divided by the total variance of ``X``.

    ``setup`` is not referenced in the body; presumably it is a pytest
    fixture that prepares the execution session -- confirm against the
    test module's fixtures.
    """
    # Test sparse data
    svd_r_10_sp = TruncatedSVD(10, algorithm="randomized", random_state=42)
    svd_r_20_sp = TruncatedSVD(20, algorithm="randomized", random_state=42)
    X_trans_r_10_sp = svd_r_10_sp.fit_transform(X)
    X_trans_r_20_sp = svd_r_20_sp.fit_transform(X)

    # Test dense data
    svd_r_10_de = TruncatedSVD(10, algorithm="randomized", random_state=42)
    svd_r_20_de = TruncatedSVD(20, algorithm="randomized", random_state=42)
    X_trans_r_10_de = svd_r_10_de.fit_transform(X.toarray())
    X_trans_r_20_de = svd_r_20_de.fit_transform(X.toarray())

    # helper arrays for tests below
    svds = (svd_r_10_sp, svd_r_20_sp, svd_r_10_de, svd_r_20_de)
    svds_trans = (
        (svd_r_10_sp, X_trans_r_10_sp),
        (svd_r_20_sp, X_trans_r_20_sp),
        (svd_r_10_de, X_trans_r_10_de),
        (svd_r_20_de, X_trans_r_20_de),
    )
    svds_10_v_20 = (
        (svd_r_10_sp, svd_r_20_sp),
        (svd_r_10_de, svd_r_20_de),
    )
    svds_sparse_v_dense = (
        (svd_r_10_sp, svd_r_10_de),
        (svd_r_20_sp, svd_r_20_de),
    )

    # Assert the first 10 components' ratios agree between the two models
    for svd_10, svd_20 in svds_10_v_20:
        assert_array_almost_equal(
            svd_10.explained_variance_ratio_.to_numpy(),
            svd_20.explained_variance_ratio_[:10].to_numpy(),
            decimal=4,
        )

    # Assert that 20 components has higher explained variance than 10
    for svd_10, svd_20 in svds_10_v_20:
        total_ratio_10 = svd_10.explained_variance_ratio_.sum().to_numpy()
        total_ratio_20 = svd_20.explained_variance_ratio_.sum().to_numpy()
        assert total_ratio_20 > total_ratio_10

    # Assert that all the values are greater than 0
    for svd in svds:
        assert_array_less(0.0, svd.explained_variance_ratio_.to_numpy())

    # Assert that total explained variance is less than 1
    for svd in svds:
        assert_array_less(svd.explained_variance_ratio_.sum().to_numpy(), 1.0)

    # Compare sparse vs. dense
    for svd_sparse, svd_dense in svds_sparse_v_dense:
        assert_array_almost_equal(
            svd_sparse.explained_variance_ratio_.to_numpy(),
            svd_dense.explained_variance_ratio_.to_numpy())

    # Test that explained_variance is correct.
    # The total variance of X does not depend on the model under test, so
    # compute it once instead of on every loop iteration.
    total_variance = mt.var(X.toarray(), axis=0).sum().to_numpy()
    for svd, transformed in svds_trans:
        variances = mt.var(transformed, axis=0)
        true_explained_variance_ratio = variances / total_variance

        assert_array_almost_equal(
            svd.explained_variance_ratio_.to_numpy(),
            true_explained_variance_ratio.to_numpy(),
        )