Example #1
    def _fit(self, X):
        X, w = weighted_data(X)  # split the input into data matrix and per-sample weights

        X = check_array(X)
        n_samples, n_features = X.shape
        n_samples_weighted = np.sum(w)  # effective (weighted) sample count
        X = as_float_array(X, copy=self.copy)
        # Center data
        # self.mean_ = average(X, axis=0, weights=w)
        # X -= self.mean_
        # Scale each row by sqrt(w_i): the SVD of the reweighted matrix
        # then yields the weighted principal components.
        U, S, V = linalg.svd(X * np.sqrt(w)[:, np.newaxis],
                             full_matrices=True)
        explained_variance_ = (S**2) / n_samples_weighted
        explained_variance_ratio_ = (explained_variance_ /
                                     explained_variance_.sum())

        components_ = V

        n_components = self.n_components
        if n_components is None:
            n_components = n_features
        elif n_components == 'mle':
            if n_samples < n_features:
                raise ValueError("n_components='mle' is only supported "
                                 "if n_samples >= n_features")

            n_components = _infer_dimension_(explained_variance_, n_samples,
                                             n_features)
        elif not 0 <= n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d" %
                             (n_components, n_features))

        if 0 < n_components < 1.0:
            # number of components for which the cumulative explained
            # variance ratio exceeds the desired threshold
            ratio_cumsum = explained_variance_ratio_.cumsum()
            n_components = np.sum(ratio_cumsum < n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # (sigma^2 maximum likelihood estimate, cf. Bishop, PRML, eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        # store n_samples to revert whitening when getting covariance
        self.n_samples_ = n_samples_weighted  # weighted count stands in for n_samples

        self.components_ = components_[:n_components]
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
        self.n_components_ = n_components

        return (U, S, V)
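
The row scaling in Example #1 works because multiplying each sample by sqrt(w_i) turns the ordinary Gram matrix into the weighted covariance, so a plain SVD of the rescaled data gives the weighted principal axes. A minimal sketch verifying that identity (NumPy only, synthetic uncentered data):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(6, 3)
w = rng.rand(6)

# Scaling row i by sqrt(w_i) makes Xw^T Xw equal the weighted sum
# of outer products sum_i w_i x_i x_i^T.
Xw = X * np.sqrt(w)[:, np.newaxis]
weighted_gram = sum(wi * np.outer(xi, xi) for wi, xi in zip(w, X))
assert np.allclose(Xw.T @ Xw, weighted_gram)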
Example #2
def test_infer_dim_3():
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert_greater(_infer_dimension_(spect, n, p), 2)
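
The tests here exercise the private helper directly; the same Minka-style estimate is reachable through the public API via n_components='mle'. A small sketch with the same kind of planted-direction data (standard scikit-learn, nothing beyond the snippets' own imports assumed):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
n, p = 1000, 5
X = rng.randn(n, p) * 0.1
X[:10] += np.array([3, 4, 5, 1, 2])
X[10:20] += np.array([6, 0, 7, 2, -1])

# n_components='mle' runs the same dimensionality estimate that
# _infer_dimension_ implements internally.
pca = PCA(n_components='mle', svd_solver='full').fit(X)
print(pca.n_components_)  # for this data the helper returns a value greater than 1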
Example #4
def test_infer_dim_2():
    # Plant two strong off-mean directions in low-variance noise and
    # check that the inferred dimensionality is at least 2.
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert_greater(_infer_dimension_(spect, n, p), 1)
Example #6
def test_infer_dim_3():
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
    X = da.from_array(X, chunks=(n, p))
    pca = dd.PCA(n_components=p, svd_solver="full")
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 2
Example #7
def test_infer_dim_2():
    # Plant two strong off-mean directions in low-variance noise and
    # check that the inferred dimensionality is at least 2.
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    dX = da.from_array(X, chunks=(n, p))
    pca = dd.PCA(n_components=p, svd_solver="full")
    pca.fit(dX)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 1
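
Examples #6 and #7 re-run the same tests against a dask-backed PCA (dd is presumably dask_ml.decomposition and da is dask.array); the only structural change is wrapping the NumPy array with da.from_array. A minimal sketch of just that wrapping step (dask.array only; the chunk sizes are the tests' single-block choice):

import dask.array as da
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(1000, 5)

# One chunk spanning each axis mirrors the tests; smaller chunks would
# let dask split the computation across blocks.
dX = da.from_array(X, chunks=(1000, 5))
print(dX.chunks)  # ((1000,), (5,))
print(dX.mean(axis=0).compute()[:2])  # computations stay lazy until .compute()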
Example #8
def _fit_full(self, X, n_components):
    """Fit the model by computing full SVD on X"""
    n_samples, n_features = X.shape

    _validate_n_components(n_components, n_samples, n_features)

    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    if X.shape[0] > X.shape[1] and (X.dtype == np.float64
                                    or X.dtype == np.float32):
        U, S, V = _daal4py_svd(X)
    else:
        U, S, V = np.linalg.svd(X, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    components_ = V

    # Get variance explained by singular values
    explained_variance_ = (S**2) / (n_samples - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var

    # Postprocess the number of components required
    if n_components == 'mle':
        n_components = \
            _infer_dimension_(explained_variance_, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulative explained
        # variance ratio exceeds the desired threshold
        ratio_cumsum = explained_variance_ratio_.cumsum()
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # (sigma^2 maximum likelihood estimate, cf. Bishop, PRML, eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = S[:n_components]

    return U, S, V
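
When n_components is a float in (0, 1), the snippets keep the smallest number of components whose cumulative explained-variance ratio reaches the threshold. A standalone check of the searchsorted rule used above (made-up ratios, my own numbers):

import numpy as np

explained_variance_ratio_ = np.array([0.6, 0.25, 0.1, 0.04, 0.01])
ratio_cumsum = explained_variance_ratio_.cumsum()  # [0.6, 0.85, 0.95, 0.99, 1.0]

# searchsorted returns the insertion index of the threshold (here 2),
# so 2 + 1 = 3 components are kept, and cumsum[2] = 0.95 >= 0.9.
n_components = np.searchsorted(ratio_cumsum, 0.9) + 1
print(n_components)  # 3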
Example #9
    def _fit_full_daal4py(self, X, n_components):
        n_samples, n_features = X.shape

        # due to need to flip components, need to do full decomposition
        self._fit_daal4py(X, min(n_samples, n_features))
        U = self._transform_daal4py(X,
                                    whiten=True,
                                    check_X=False,
                                    scale_eigenvalues=True)
        V = self.components_
        U, V = svd_flip(U, V)
        U = U.copy()
        V = V.copy()
        S = self.singular_values_.copy()

        if n_components == 'mle':
            n_components = \
                _infer_dimension_(self.explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulative explained
            # variance ratio exceeds the desired threshold
            ratio_cumsum = stable_cumsum(self.explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # (sigma^2 maximum likelihood estimate, cf. Bishop, PRML, eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = self.explained_variance_[
                n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = self.components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = self.explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            self.explained_variance_ratio_[:n_components]
        self.singular_values_ = self.singular_values_[:n_components]

        return U, S, V
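
All of these snippets estimate noise_variance_ the same way: under the probabilistic PCA model, the maximum-likelihood sigma^2 is the mean of the eigenvalues that were discarded (Bishop, PRML, eq. 12.46). A tiny worked example (made-up spectrum):

import numpy as np

explained_variance_ = np.array([5.0, 2.0, 0.5, 0.1, 0.1])
n_components = 2

# sigma^2 MLE: average variance along the dropped directions.
noise_variance_ = explained_variance_[n_components:].mean()
print(noise_variance_)  # (0.5 + 0.1 + 0.1) / 3 ~= 0.2333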
Example #10
def get_pca_components(X, n_components, get_S=False):
    """Same as in sklearn, but we don't center the data"""
    n_samples, n_features = X.shape
    U, S, V = linalg.svd(X, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    components_ = V

    # Get variance explained by singular values
    explained_variance_ = (S ** 2) / n_samples
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var

    # Postprocess the number of components required
    if n_components == 'mle':
        n_components = _infer_dimension_(explained_variance_, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulative explained
        # variance ratio exceeds the desired threshold
        ratio_cumsum = explained_variance_ratio_.cumsum()
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # (sigma^2 maximum likelihood estimate, cf. Bishop, PRML, eq. 12.46)
    # if n_components < min(n_features, n_samples):
    #     noise_variance_ = explained_variance_[n_components:].mean()
    # else:
    #     noise_variance_ = 0.

    components_ = components_[:n_components]
    # explained_variance_ = explained_variance_[:n_components]
    # explained_variance_ratio_ = explained_variance_ratio_[:n_components]
    # if get_explained_var:
    #     return components_, (explained_variance_ratio_[:n_components])
    if get_S:
        return components_, explained_variance_ratio_[:n_components], S[:n_components]

    return components_, explained_variance_ratio_[:n_components]
Example #11
def get_wpca_components(X, weights, n_components, get_S=False):  # , xi=0):
    """Same as in sklearn, but we don't center the data"""
    # weights = np.repeat(weights[:, np.newaxis], X.shape[1], axis=1)

    weights = weights[:, np.newaxis]
    X_ = X * weights
    covar = np.dot(X_.T, X_)
    covar /= np.dot(weights.T, weights)
    covar[np.isnan(covar)] = 0

    # enhance weights if desired
    # if xi != 0:
    #     Ws = weights.sum(0)
    #     covar *= np.outer(Ws, Ws) ** xi

    eigvals = (0, X.shape[1] - 1)
    evals, evecs = linalg.eigh(covar, eigvals=eigvals)
    components_ = evecs[:, ::-1].T
    explained_variance_ = evals[::-1]
    explained_variance_ratio_ = evals[::-1] / covar.trace()
    n_samples, n_features = X_.shape

    # Postprocess the number of components required
    if n_components == 'mle':
        n_components = _infer_dimension_(explained_variance_, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulative explained
        # variance ratio exceeds the desired threshold
        ratio_cumsum = explained_variance_ratio_.cumsum()
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    components_ = components_[:n_components]
    explained_variance_ratio_ = explained_variance_ratio_[:n_components]

    if get_S:
        return components_, explained_variance_ratio_, evals[:n_components]

    return components_, explained_variance_ratio_
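
get_wpca_components takes the eigendecomposition route (eigh on the weighted covariance) rather than an SVD; with uniform weights the two routes agree up to component sign. A minimal consistency sketch (NumPy only, uncentered data as in both helpers):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(50, 4)

# SVD route: rows of Vt are the principal axes, singular values S.
_, S, Vt = np.linalg.svd(X, full_matrices=False)

# Eigen route: eigh returns ascending eigenpairs, so reverse the order.
evals, evecs = np.linalg.eigh(X.T @ X)
evecs, evals = evecs[:, ::-1], evals[::-1]

# Same axes up to sign; eigenvalues equal squared singular values.
assert np.allclose(np.abs(evecs.T), np.abs(Vt))
assert np.allclose(evals, S**2)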
Example #12
    def _fit_daal4py(self, X, n_components):
        n_samples, n_features = X.shape
        n_sf_min = min(n_samples, n_features)

        _validate_n_components(n_components, n_samples, n_features)

        if n_components == 'mle':
            daal_n_components = n_features
        elif n_components < 1:
            daal_n_components = n_sf_min
        else:
            daal_n_components = n_components

        fpType = getFPType(X)
        centering_algo = daal4py.normalization_zscore(fptype=fpType,
                                                      doScale=False)
        pca_alg = daal4py.pca(fptype=fpType,
                              method='svdDense',
                              normalization=centering_algo,
                              resultsToCompute='mean|variance|eigenvalue',
                              isDeterministic=True,
                              nComponents=daal_n_components)
        pca_res = pca_alg.compute(X)

        self.mean_ = pca_res.means.ravel()
        variances_ = pca_res.variances.ravel()
        components_ = pca_res.eigenvectors
        explained_variance_ = pca_res.eigenvalues.ravel()
        tot_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / tot_var

        if n_components == 'mle':
            n_components = \
                _infer_dimension_(explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulative explained
            # variance ratio exceeds the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # (sigma^2 maximum likelihood estimate, cf. Bishop, PRML, eq. 12.46)
        if n_components < n_sf_min:
            if explained_variance_.shape[0] == n_sf_min:
                self.noise_variance_ = explained_variance_[n_components:].mean()
            else:
                resid_var_ = variances_.sum()
                resid_var_ -= explained_variance_[:n_components].sum()
                self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = np.sqrt(
            (n_samples - 1) * self.explained_variance_)
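
Example #12 gets eigenvalues but no singular values from the solver, so it reconstructs them at the end: for centered data explained_variance_ = S**2 / (n_samples - 1), hence S = sqrt((n_samples - 1) * explained_variance_). A quick round-trip check (NumPy only):

import numpy as np

rng = np.random.RandomState(0)
n_samples = 30
X = rng.randn(n_samples, 4)
X -= X.mean(axis=0)  # center, as the PCA code does

_, S, _ = np.linalg.svd(X, full_matrices=False)
explained_variance_ = S**2 / (n_samples - 1)

# Inverting the relation recovers the singular values exactly.
assert np.allclose(np.sqrt((n_samples - 1) * explained_variance_), S)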
Example #13
def _fit_full(self, X, n_components):
    """Fit the model by computing full SVD on X"""
    n_samples, n_features = X.shape

    if n_components == 'mle':
        if n_samples < n_features:
            raise ValueError("n_components='mle' is only supported "
                             "if n_samples >= n_features")
    elif not 0 <= n_components <= min(n_samples, n_features):
        raise ValueError("n_components=%r must be between 0 and "
                         "min(n_samples, n_features)=%r with "
                         "svd_solver='full'" %
                         (n_components, min(n_samples, n_features)))
    elif n_components >= 1:
        if not isinstance(n_components, (numbers.Integral, np.integer)):
            raise ValueError("n_components=%r must be of type int "
                             "when greater than or equal to 1, "
                             "was of type=%r" %
                             (n_components, type(n_components)))

    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    if X.shape[0] > X.shape[1] and (X.dtype == np.float64
                                    or X.dtype == np.float32):
        U, S, V = _daal4py_svd(X)
    else:
        U, S, V = np.linalg.svd(X, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    components_ = V

    # Get variance explained by singular values
    explained_variance_ = (S**2) / (n_samples - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var

    # Postprocess the number of components required
    if n_components == 'mle':
        n_components = \
            _infer_dimension_(explained_variance_, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulative explained
        # variance ratio exceeds the desired threshold
        ratio_cumsum = explained_variance_ratio_.cumsum()
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # (sigma^2 maximum likelihood estimate, cf. Bishop, PRML, eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = S[:n_components]

    return U, S, V
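
Several snippets call svd_flip to make the decomposition deterministic: for each component, the sign is chosen so that the largest-magnitude entry of the corresponding column of U is positive. A minimal reimplementation of that convention (my own sketch, not scikit-learn's exact code):

import numpy as np

def svd_flip_sketch(U, Vt):
    # Index of the largest-|entry| row per column of U; flip signs so
    # that entry is positive, applying the same flip to the rows of Vt.
    max_abs_rows = np.argmax(np.abs(U), axis=0)
    signs = np.sign(U[max_abs_rows, np.arange(U.shape[1])])
    return U * signs, Vt * signs[:, np.newaxis]

rng = np.random.RandomState(0)
X = rng.randn(8, 3)
U, S, Vt = np.linalg.svd(X, full_matrices=False)
U, Vt = svd_flip_sketch(U, Vt)
assert np.allclose((U * S) @ Vt, X)  # flipped factors still reconstruct X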