Example #1
import numpy as np
from sklearn.decomposition._pca import _infer_dimension


def test_infer_dim_bad_spec():
    # Test a spectrum that drops to near zero for PR #16224
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
    n_samples = 10
    ret = _infer_dimension(spectrum, n_samples)
    assert ret == 0
Example #2
    def _fit_full_daal4py(self, X, n_components):
        n_samples, n_features = X.shape

        # Because the components must be sign-flipped, a full decomposition is needed
        self._fit_daal4py(X, min(n_samples, n_features))
        U = self._transform_daal4py(X, whiten=True, check_X=False, scale_eigenvalues=True)
        V = self.components_
        U, V = svd_flip(U, V)
        U = U.copy()
        V = V.copy()
        S = self.singular_values_.copy()

        if n_components == 'mle':
            n_components = \
                _infer_dimension(self.explained_variance_, n_samples)
        elif 0 < n_components < 1.0:
            n_components = _n_components_from_fraction(
                self.explained_variance_ratio_, n_components)

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = self.explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = self.components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = self.explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            self.explained_variance_ratio_[:n_components]
        self.singular_values_ = self.singular_values_[:n_components]

        return U, S, V
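A note on the noise_variance_ branch above: it is the probabilistic-PCA maximum-likelihood estimate of the residual variance (eq. 12.46 in Bishop's PRML, as the comment cites), namely the mean of the discarded eigenvalues:

\sigma^2_{\mathrm{ML}} = \frac{1}{d - k} \sum_{j = k + 1}^{d} \lambda_j, \qquad d = \min(n_{\mathrm{samples}}, n_{\mathrm{features}}), \quad k = n_{\mathrm{components}}

This is exactly explained_variance_[n_components:].mean(); when k = d nothing is discarded and the estimate is set to 0.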
Example #3
    def _fit_daal4py(self, X, n_components):
        n_samples, n_features = X.shape
        n_sf_min = min(n_samples, n_features)

        _validate_n_components(n_components, n_samples, n_features)

        if n_components == 'mle':
            daal_n_components = n_features
        elif n_components < 1:
            daal_n_components = n_sf_min
        else:
            daal_n_components = n_components

        fpType = getFPType(X)
        centering_algo = daal4py.normalization_zscore(fptype=fpType,
                                                      doScale=False)
        pca_alg = daal4py.pca(fptype=fpType,
                              method='svdDense',
                              normalization=centering_algo,
                              resultsToCompute='mean|variance|eigenvalue',
                              isDeterministic=True,
                              nComponents=daal_n_components)
        pca_res = pca_alg.compute(X)

        self.mean_ = pca_res.means.ravel()
        variances_ = pca_res.variances.ravel()
        components_ = pca_res.eigenvectors
        explained_variance_ = pca_res.eigenvalues.ravel()
        tot_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / tot_var

        if n_components == 'mle':
            n_components = \
                _infer_dimension(explained_variance_, n_samples)
        elif 0 < n_components < 1.0:
            n_components = _n_components_from_fraction(
                explained_variance_ratio_, n_components)

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < n_sf_min:
            if explained_variance_.shape[0] == n_sf_min:
                self.noise_variance_ = explained_variance_[n_components:].mean()
            else:
                resid_var_ = variances_.sum()
                resid_var_ -= explained_variance_[:n_components].sum()
                self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = np.sqrt(
            (n_samples - 1) * self.explained_variance_)
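Examples 2, 3 and 8 call a helper _n_components_from_fraction that is not shown on this page. Below is a minimal sketch of what it plausibly computes, reconstructed from the inline logic spelled out in Example 7; the name and signature are taken from the call sites, not from a documented API:

import numpy as np
from sklearn.utils.extmath import stable_cumsum


def _n_components_from_fraction(explained_variance_ratio, frac):
    # Smallest number of leading components whose cumulative
    # explained-variance ratio strictly exceeds `frac`; side='right'
    # makes the selected variance strictly greater than the requested
    # fraction (see scikit-learn issue #15669).
    ratio_cumsum = stable_cumsum(explained_variance_ratio)
    return np.searchsorted(ratio_cumsum, frac, side='right') + 1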
Example #4
import numpy as np
from sklearn.decomposition._pca import _assess_dimension, _infer_dimension


def test_small_eigenvalues_mle():
    # Test that ranks associated with tiny eigenvalues are given a
    # log-likelihood of -inf; the inferred rank will then be 1.
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])

    assert _assess_dimension(spectrum, rank=1, n_samples=10) > -np.inf

    for rank in (2, 3):
        assert _assess_dimension(spectrum, rank, 10) == -np.inf

    assert _infer_dimension(spectrum, 10) == 1
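Behind _assess_dimension and _infer_dimension is Minka's MLE for the intrinsic dimensionality of PCA: each candidate rank k is scored with an approximate log-evidence \ell(k) of the probabilistic-PCA model, and the inferred dimension is

\hat{k} = \operatorname*{arg\,max}_{k} \ell(k)

Ranks whose trailing eigenvalues underflow to zero score \ell(k) = -\infty, which is exactly what this test asserts for ranks 2 and 3.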
Example #5
import numpy as np
from sklearn.decomposition import PCA
from sklearn.decomposition._pca import _infer_dimension


def test_infer_dim_3():
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
    pca = PCA(n_components=p, svd_solver="full")
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension(spect, n) > 2
Example #6
import numpy as np
from sklearn.decomposition import PCA
from sklearn.decomposition._pca import _infer_dimension


def test_infer_dim_2():
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    pca = PCA(n_components=p, svd_solver="full")
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension(spect, n) > 1
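The same machinery is reachable through the public estimator: with n_components='mle' and the full solver, fit() routes through _infer_dimension internally. A minimal usage sketch with the standard scikit-learn API:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(1000, 5) * 0.1
X[:10] += np.array([3, 4, 5, 1, 2])  # plant one strong direction

pca = PCA(n_components='mle', svd_solver='full').fit(X)
print(pca.n_components_)  # the rank selected by Minka's MLE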
Example #7
    def _decompose_full(self, mat):

        if self.n_components != "mle":
            if not 0 <= self.n_components <= self.n_samples_:
                raise ValueError("n_components=%r must be between 1 and "
                                 "n_samples=%r with "
                                 "svd_solver='%s'" % (
                                     self.n_components,
                                     self.n_samples_,
                                     self.svd_solver,
                                 ))
            elif self.n_components >= 1:
                if not isinstance(self.n_components, numbers.Integral):
                    raise ValueError(
                        "n_components=%r must be of type int "
                        "when greater than or equal to 1, "
                        "was of type=%r" %
                        (self.n_components, type(self.n_components)))

        U, S, Vt = linalg.svd(mat, full_matrices=False)
        U[:, S < self.tol] = 0.0
        Vt[S < self.tol] = 0.0
        S[S < self.tol] = 0.0

        # flip eigenvectors' sign to enforce deterministic output
        U, Vt = svd_flip(U, Vt)

        # Get variance explained by singular values
        explained_variance_ = (S**2) / (self.n_samples_ - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var

        # Postprocess the number of components required
        if self.n_components == "mle":
            self.n_components = _infer_dimension(explained_variance_,
                                                 self.n_samples_)
        elif 0 < self.n_components < 1.0:
            # Number of components for which the cumulative explained-
            # variance ratio exceeds the desired threshold; side='right'
            # ensures the variance of the selected components is strictly
            # greater than the requested fraction. More discussion in
            # issue: #15669
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            self.n_components = (np.searchsorted(
                ratio_cumsum, self.n_components, side="right") + 1)
        return (
            U[:, :self.n_components],
            S[:self.n_components],
            Vt[:self.n_components],
        )
Example #8
def _fit_full(self, X, n_components):
    """Fit the model by computing full SVD on X"""
    n_samples, n_features = X.shape

    _validate_n_components(n_components, n_samples, n_features)

    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    if X.shape[0] > X.shape[1] and (X.dtype == np.float64
                                    or X.dtype == np.float32):
        U, S, V = _daal4py_svd(X)
    else:
        U, S, V = np.linalg.svd(X, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    components_ = V

    # Get variance explained by singular values
    explained_variance_ = (S**2) / (n_samples - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var

    # Postprocess the number of components required
    if n_components == 'mle':
        n_components = \
            _infer_dimension(explained_variance_, n_samples)
    elif 0 < n_components < 1.0:
        n_components = _n_components_from_fraction(explained_variance_ratio_,
                                                   n_components)

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = S[:n_components]

    return U, S, V
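Several of these examples call svd_flip(U, V) right after the decomposition. The sign of each singular-vector pair is arbitrary (flipping a column of U together with the matching row of V leaves the product unchanged), so sklearn.utils.extmath.svd_flip canonicalises it, by default forcing the largest-magnitude entry of each column of U to be positive. A small demonstration:

import numpy as np
from scipy import linalg
from sklearn.utils.extmath import svd_flip

X = np.random.RandomState(0).randn(6, 3)
U, S, Vt = linalg.svd(X, full_matrices=False)
U, Vt = svd_flip(U, Vt)

# Only the signs are canonicalised; the reconstruction is unchanged.
assert np.allclose((U * S) @ Vt, X)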
Example #9
    def _fit_full(self, X, n_components):
        X = check_array(X, dtype=[np.float64, np.float32])

        n_samples, n_features = X.shape
        self._validate_n_components(n_components, n_samples, n_features)

        self._fit_full_daal4py(X, min(X.shape))

        U = self._transform_daal4py(X,
                                    whiten=True,
                                    check_X=False,
                                    scale_eigenvalues=True)
        V = self.components_
        S = self.singular_values_

        if n_components == 'mle':
            if sklearn_check_version('0.23'):
                n_components = _infer_dimension(self.explained_variance_,
                                                n_samples)
            else:
                n_components = _infer_dimension_(self.explained_variance_,
                                                 n_samples, n_features)
        elif 0 < n_components < 1.0:
            ratio_cumsum = stable_cumsum(self.explained_variance_ratio_)
            n_components = np.searchsorted(
                ratio_cumsum, n_components, side='right') + 1

        if n_components < min(n_features, n_samples):
            self.noise_variance_ = self.explained_variance_[
                n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = self.components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = self.explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            self.explained_variance_ratio_[:n_components]
        self.singular_values_ = self.singular_values_[:n_components]

        return U, S, V
Example #10
    def _fit_full(self, X, n_components):
        n_samples, n_features = X.shape
        self._validate_n_components(n_components, n_samples, n_features)

        self._fit_full_daal4py(X, min(X.shape))

        U = None
        V = self.components_
        S = self.singular_values_

        if n_components == 'mle':
            if sklearn_check_version('0.23'):
                n_components = _infer_dimension(self.explained_variance_,
                                                n_samples)
            else:
                n_components = \
                    _infer_dimension_(self.explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            ratio_cumsum = stable_cumsum(self.explained_variance_ratio_)
            n_components = np.searchsorted(
                ratio_cumsum, n_components, side='right') + 1

        if n_components < min(n_features, n_samples):
            self.noise_variance_ = self.explained_variance_[
                n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = self.components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = self.explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            self.explained_variance_ratio_[:n_components]
        self.singular_values_ = self.singular_values_[:n_components]

        return U, S, V
Example #11
    def _fit_full_daal4py(self, X, n_components):
        n_samples, n_features = X.shape
        n_sf_min = min(n_samples, n_features)

        if n_components == 'mle':
            daal_n_components = n_features
        elif n_components < 1:
            daal_n_components = n_sf_min
        else:
            daal_n_components = n_components

        fpType = getFPType(X)

        covariance_algo = daal4py.covariance(
            fptype=fpType, outputMatrixType='covarianceMatrix')
        covariance_res = covariance_algo.compute(X)

        self.mean_ = covariance_res.mean.ravel()
        covariance = covariance_res.covariance
        variances_ = np.array([covariance[i, i] for i in range(n_features)])

        pca_alg = daal4py.pca(fptype=fpType,
                              method='correlationDense',
                              resultsToCompute='eigenvalue',
                              isDeterministic=True,
                              nComponents=daal_n_components)
        pca_res = pca_alg.compute(X, covariance)

        components_ = pca_res.eigenvectors
        explained_variance_ = np.maximum(pca_res.eigenvalues.ravel(), 0)
        tot_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / tot_var

        if n_components == 'mle':
            if sklearn_check_version('0.23'):
                n_components = _infer_dimension(explained_variance_, n_samples)
            else:
                n_components = \
                    _infer_dimension_(explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(
                ratio_cumsum, n_components, side='right') + 1

        if n_components < n_sf_min:
            if explained_variance_.shape[0] == n_sf_min:
                self.noise_variance_ = explained_variance_[n_components:].mean()
            else:
                resid_var_ = variances_.sum()
                resid_var_ -= explained_variance_[:n_components].sum()
                self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = np.sqrt(
            (n_samples - 1) * self.explained_variance_)
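The final statement converts eigenvalues back into singular values: the SVD-based examples define explained_variance_ as S**2 / (n_samples - 1), so inverting that relation gives

s_j = \sqrt{(n_{\mathrm{samples}} - 1) \, \lambda_j}

which is what np.sqrt((n_samples - 1) * self.explained_variance_) computes here and in Example 3.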
Example #12
    def _fit_full(self, X, n_components):
        self.accountant.check(self.epsilon, 0)

        n_samples, n_features = X.shape

        if self.centered:
            self.mean_ = np.zeros_like(np.mean(X, axis=0))
        else:
            if self.bounds is None:
                warnings.warn(
                    "Bounds parameter hasn't been specified, so falling back to determining range from the data.\n"
                    "This will result in additional privacy leakage. To ensure differential privacy with no "
                    "additional privacy loss, specify `range` for each valued returned by np.mean().",
                    PrivacyLeakWarning)

                self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

            self.bounds = check_bounds(self.bounds, n_features)
            self.mean_ = mean(X,
                              epsilon=self.epsilon / 2,
                              bounds=self.bounds,
                              axis=0,
                              accountant=BudgetAccountant())

        X -= self.mean_

        if self.data_norm is None:
            warnings.warn(
                "Data norm has not been specified and will be calculated on the data provided.  This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify `data_norm` at initialisation.",
                PrivacyLeakWarning)
            self.data_norm = np.linalg.norm(X, axis=1).max()

        X = clip_to_norm(X, self.data_norm)

        XtX = np.dot(X.T, X)

        mech = Wishart().set_epsilon(self.epsilon if self.centered else self.epsilon / 2).\
            set_sensitivity(self.data_norm)
        noisy_input = mech.randomise(XtX)

        u, s, v = np.linalg.svd(noisy_input)
        u, v = svd_flip(u, v)
        s = np.sqrt(s)

        components_ = v

        # Get variance explained by singular values
        explained_variance_ = (s**2) / (n_samples - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = s.copy()  # Store the singular values.

        # Post-process the number of components required
        if n_components == 'mle':
            try:
                n_components = sk_pca._infer_dimension(explained_variance_,
                                                       n_samples)
            except AttributeError:
                n_components = sk_pca._infer_dimension_(
                    explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # Number of components for which the cumulative explained-
            # variance ratio exceeds the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        self.accountant.spend(self.epsilon, 0)

        return u, s, v
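The differential privacy in this variant comes from randomising the Gram matrix before any decomposition: after each row of X is clipped to norm at most data_norm, the mechanism releases

\tilde{A} = X^{\top} X + E

where E is a noise sample drawn by the Wishart mechanism with sensitivity data_norm, and the SVD, component selection and noise-variance steps then run unchanged on \tilde{A}. (This is a high-level reading; the exact Wishart parametrisation lives inside the mechanism class.)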
Example #13
    def _fit_full(self, X, n_components):
        self.accountant.check(self.epsilon, 0)

        n_samples, n_features = X.shape

        if self.centered:
            self.mean_ = np.zeros_like(np.mean(X, axis=0))
        else:
            if self.bounds is None:
                warnings.warn(
                    "Bounds parameter hasn't been specified, so falling back to determining range from the data.\n"
                    "This will result in additional privacy leakage. To ensure differential privacy with no "
                    "additional privacy loss, specify `range` for each valued returned by np.mean().",
                    PrivacyLeakWarning)

                self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

            self.bounds = self._check_bounds(self.bounds, n_features)
            self.mean_ = mean(X, epsilon=self.epsilon / 2, bounds=self.bounds, axis=0, accountant=BudgetAccountant())

        X -= self.mean_

        if self.data_norm is None:
            warnings.warn("Data norm has not been specified and will be calculated on the data provided.  This will "
                          "result in additional privacy leakage. To ensure differential privacy and no additional "
                          "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
            self.data_norm = np.linalg.norm(X, axis=1).max()

        X = self._clip_to_norm(X, self.data_norm)

        sigma_vec, u_mtx = covariance_eig(X, epsilon=self.epsilon if self.centered else self.epsilon / 2,
                                          norm=self.data_norm,
                                          dims=n_components if isinstance(n_components, Integral) else None)
        u_mtx, _ = svd_flip(u_mtx, np.zeros_like(u_mtx).T)
        sigma_vec = np.sqrt(sigma_vec)

        components_ = u_mtx.T

        # Get variance explained by singular values
        explained_variance_ = np.sort((sigma_vec ** 2) / (n_samples - 1))[::-1]
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = sigma_vec.copy()  # Store the singular values.

        # Post-process the number of components required
        if n_components == 'mle':
            n_components = sk_pca._infer_dimension(explained_variance_, n_samples)
        elif 0 < n_components < 1.0:
            # Number of components for which the cumulative explained-
            # variance ratio exceeds the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        self.accountant.spend(self.epsilon, 0)

        return u_mtx, sigma_vec[:n_components], u_mtx.T
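Examples 12 and 13 are internal fit methods of diffprivlib's differentially private PCA. For context, here is a hedged end-user sketch; the parameter names (epsilon, data_norm, bounds) follow the code above, but check the installed diffprivlib version for the exact signature:

import numpy as np
from diffprivlib.models import PCA

rng = np.random.RandomState(0)
X = rng.randn(200, 5)

# In a real deployment these come from domain knowledge; deriving them
# from the data itself is exactly the leakage the warnings above describe.
bounds = (-4.0, 4.0)   # assumed per-feature value range
data_norm = 10.0       # assumed bound on each row's L2 norm

pca = PCA(n_components=2, epsilon=1.0, bounds=bounds, data_norm=data_norm)
pca.fit(X)
print(pca.explained_variance_ratio_)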