def _fit(self, X):
    X, w = weighted_data(X)
    X = check_array(X)
    n_samples, n_features = X.shape
    n_samples_weighted = np.sum(w)
    X = as_float_array(X, copy=self.copy)

    # Center data
    # self.mean_ = np.average(X, axis=0, weights=w)
    # X -= self.mean_

    # Scale each sample (row) by sqrt(w) so that S**2 / sum(w) gives the
    # weighted variance along each principal direction
    U, S, V = linalg.svd((X.T * np.reshape(np.sqrt(w), (1, len(X)))).T,
                         full_matrices=True)

    explained_variance_ = (S ** 2) / n_samples_weighted
    explained_variance_ratio_ = (explained_variance_ /
                                 explained_variance_.sum())

    components_ = V

    n_components = self.n_components
    if n_components is None:
        n_components = n_features
    elif n_components == 'mle':
        if n_samples < n_features:
            raise ValueError("n_components='mle' is only supported "
                             "if n_samples >= n_features")
        n_components = _infer_dimension_(explained_variance_,
                                         n_samples, n_features)
    elif not 0 <= n_components <= n_features:
        raise ValueError("n_components=%r invalid for n_features=%d"
                         % (n_components, n_features))

    if 0 < n_components < 1.0:
        # number of components for which the cumulated explained variance
        # percentage is superior to the desired threshold
        ratio_cumsum = explained_variance_ratio_.cumsum()
        n_components = np.sum(ratio_cumsum < n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    # store n_samples to revert whitening when getting covariance
    self.n_samples_ = n_samples_weighted  # n_samples

    self.components_ = components_[:n_components]
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
    self.n_components_ = n_components

    return (U, S, V)

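# A minimal standalone sketch (plain NumPy/SciPy, illustrative names) of the
# row-weighting trick used in `_fit` above: scaling each sample by sqrt(w)
# before the SVD makes S**2 / sum(w) equal to the eigenvalues of the weighted
# (uncentered) covariance sum_i w_i x_i x_i^T / sum_i w_i.
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
X = rng.randn(50, 4)
w = rng.rand(50)

# SVD of the sqrt(w)-scaled data matrix
_, S, _ = linalg.svd(X * np.sqrt(w)[:, np.newaxis], full_matrices=False)
ev_svd = S ** 2 / w.sum()

# Eigenvalues of the weighted covariance, computed directly
C = (X * w[:, np.newaxis]).T @ X / w.sum()
ev_direct = np.sort(linalg.eigvalsh(C))[::-1]

assert np.allclose(ev_svd, ev_direct)
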
def test_infer_dim_3():
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert_greater(_infer_dimension_(spect, n, p), 2)

def test_infer_dim_2():
    # Two groups of samples are shifted away from the origin, so the MLE
    # dimension estimate should find more than one significant component.
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert_greater(_infer_dimension_(spect, n, p), 1)

def test_infer_dim_3():
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
    X = da.from_array(X, chunks=(n, p))
    pca = dd.PCA(n_components=p, svd_solver="full")
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 2

def test_infer_dim_2():
    # Same setup as the NumPy test above: two shifted sample groups should
    # make the MLE dimension estimate exceed one.
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    dX = da.from_array(X, chunks=(n, p))
    pca = dd.PCA(n_components=p, svd_solver="full")
    pca.fit(dX)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 1

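# For intuition only: `_infer_dimension_` (a scikit-learn private helper used
# by the tests above) implements Minka's Bayesian model selection. The crude
# stand-in below is NOT the sklearn algorithm; it is a hypothetical sketch
# that just illustrates what "inferring the dimension" from a variance
# spectrum means: count eigenvalues that clearly dominate the noise floor.
import numpy as np

def naive_infer_dimension(explained_variance, noise_factor=10.0):
    """Count components whose variance exceeds `noise_factor` times the
    smallest (noise-level) eigenvalue. Purely illustrative."""
    noise_floor = explained_variance.min()
    return int(np.sum(explained_variance > noise_factor * noise_floor))
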
def _fit_full(self, X, n_components):
    """Fit the model by computing full SVD on X"""
    n_samples, n_features = X.shape
    _validate_n_components(n_components, n_samples, n_features)

    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    if X.shape[0] > X.shape[1] and (X.dtype == np.float64 or
                                    X.dtype == np.float32):
        U, S, V = _daal4py_svd(X)
    else:
        U, S, V = np.linalg.svd(X, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    components_ = V

    # Get variance explained by singular values
    explained_variance_ = (S ** 2) / (n_samples - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var

    # Postprocess the number of components required
    if n_components == 'mle':
        n_components = \
            _infer_dimension_(explained_variance_, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is superior to the desired threshold
        ratio_cumsum = explained_variance_ratio_.cumsum()
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = S[:n_components]

    return U, S, V

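# Brief illustration of why `svd_flip` is called above: the sign of each
# singular-vector pair is arbitrary, so two SVD runs (or two backends, e.g.
# LAPACK vs. the DAAL path) can differ by sign flips. `svd_flip` from
# sklearn.utils.extmath fixes a sign convention so the output is
# deterministic. Variable names here are illustrative.
import numpy as np
from sklearn.utils.extmath import svd_flip

rng = np.random.RandomState(0)
A = rng.randn(20, 3)
U, S, Vt = np.linalg.svd(A, full_matrices=False)

# Negating a (u_k, v_k) pair leaves the decomposition valid...
U2, Vt2 = -U, -Vt
assert np.allclose(U2 @ np.diag(S) @ Vt2, A)

# ...but svd_flip maps both variants to the same canonical signs.
Ua, Va = svd_flip(U.copy(), Vt.copy())
Ub, Vb = svd_flip(U2.copy(), Vt2.copy())
assert np.allclose(Ua, Ub) and np.allclose(Va, Vb)
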
def _fit_full_daal4py(self, X, n_components):
    n_samples, n_features = X.shape

    # due to need to flip components, need to do full decomposition
    self._fit_daal4py(X, min(n_samples, n_features))
    U = self._transform_daal4py(X, whiten=True, check_X=False,
                                scale_eigenvalues=True)
    V = self.components_
    U, V = svd_flip(U, V)
    U = U.copy()
    V = V.copy()
    S = self.singular_values_.copy()

    if n_components == 'mle':
        n_components = \
            _infer_dimension_(self.explained_variance_,
                              n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is superior to the desired threshold
        ratio_cumsum = stable_cumsum(self.explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = \
            self.explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = self.components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = self.explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        self.explained_variance_ratio_[:n_components]
    self.singular_values_ = self.singular_values_[:n_components]

    return U, S, V

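# The whitened, eigenvalue-scaled transform above recovers the left singular
# vectors U without running a second SVD. The underlying identity, checked
# here with plain NumPy on illustrative data: if X_c = U S Vt (centered X),
# then U = X_c @ Vt.T / S.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(30, 4)
X_c = X - X.mean(axis=0)

U, S, Vt = np.linalg.svd(X_c, full_matrices=False)
assert np.allclose(X_c @ Vt.T / S, U)
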
def get_pca_components(X, n_components, get_S=False):
    """Same as in sklearn, but we don't center the data"""
    n_samples, n_features = X.shape

    U, S, V = linalg.svd(X, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    components_ = V

    # Get variance explained by singular values
    explained_variance_ = (S ** 2) / n_samples
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var

    # Postprocess the number of components required
    if n_components == 'mle':
        n_components = _infer_dimension_(explained_variance_,
                                         n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is superior to the desired threshold
        ratio_cumsum = explained_variance_ratio_.cumsum()
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    # if n_components < min(n_features, n_samples):
    #     noise_variance_ = explained_variance_[n_components:].mean()
    # else:
    #     noise_variance_ = 0.

    components_ = components_[:n_components]
    # explained_variance_ = explained_variance_[:n_components]
    # explained_variance_ratio_ = explained_variance_ratio_[:n_components]
    # if get_explained_var:
    #     return components_, explained_variance_ratio_[:n_components]

    if get_S:
        return (components_, explained_variance_ratio_[:n_components],
                S[:n_components])
    return components_, explained_variance_ratio_[:n_components]

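# The fractional-`n_components` branch above picks the smallest number of
# components whose cumulative explained-variance ratio reaches the requested
# threshold. A standalone sketch of that rule with made-up ratios:
import numpy as np

explained_variance_ratio_ = np.array([0.6, 0.25, 0.1, 0.04, 0.01])
threshold = 0.9
ratio_cumsum = explained_variance_ratio_.cumsum()  # [0.6, 0.85, 0.95, ...]
n_components = np.searchsorted(ratio_cumsum, threshold) + 1
assert n_components == 3  # first 3 components explain >= 90% of the variance
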
def get_wpca_components(X, weights, n_components, get_S=False):  # , xi=0):
    """Same as in sklearn, but we don't center the data"""
    # weights = np.repeat(weights[:, np.newaxis], X.shape[1], axis=1)
    weights = weights[:, np.newaxis]

    X_ = X * weights
    covar = np.dot(X_.T, X_)
    covar /= np.dot(weights.T, weights)
    covar[np.isnan(covar)] = 0

    # enhance weights if desired
    # if xi != 0:
    #     Ws = weights.sum(0)
    #     covar *= np.outer(Ws, Ws) ** xi

    # NOTE: the `eigvals` keyword is the legacy SciPy spelling; recent SciPy
    # releases call it `subset_by_index`. The range below selects all
    # eigenvalues either way.
    eigvals = (0, X.shape[1] - 1)
    evals, evecs = linalg.eigh(covar, eigvals=eigvals)

    # eigh returns ascending order; reverse to descending
    components_ = evecs[:, ::-1].T
    explained_variance_ = evals[::-1]
    explained_variance_ratio_ = evals[::-1] / covar.trace()

    n_samples, n_features = X_.shape

    # Postprocess the number of components required
    if n_components == 'mle':
        n_components = _infer_dimension_(explained_variance_,
                                         n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is superior to the desired threshold
        ratio_cumsum = explained_variance_ratio_.cumsum()
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    components_ = components_[:n_components]
    explained_variance_ratio_ = explained_variance_ratio_[:n_components]

    if get_S:
        # return the *largest* eigenvalues (descending), consistent with
        # S[:n_components] in get_pca_components; the original sliced the
        # ascending `evals`, which picked the smallest ones
        return (components_, explained_variance_ratio_,
                explained_variance_[:n_components])
    return components_, explained_variance_ratio_

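# Sanity check (illustrative, assuming the two helpers above are in scope and
# a SciPy version that accepts the legacy `eigvals=` keyword): with uniform
# weights, the weighted covariance reduces to X.T @ X / n, so the components
# should match get_pca_components up to per-eigenvector sign (only
# get_pca_components applies svd_flip).
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
w = np.ones(100)

P, _ = get_pca_components(X, n_components=3)
Pw, _ = get_wpca_components(X, w, n_components=3)
assert np.allclose(np.abs(P), np.abs(Pw))
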
def _fit_daal4py(self, X, n_components):
    n_samples, n_features = X.shape
    n_sf_min = min(n_samples, n_features)

    _validate_n_components(n_components, n_samples, n_features)

    if n_components == 'mle':
        daal_n_components = n_features
    elif n_components < 1:
        daal_n_components = n_sf_min
    else:
        daal_n_components = n_components

    fpType = getFPType(X)
    centering_algo = daal4py.normalization_zscore(fptype=fpType,
                                                  doScale=False)
    pca_alg = daal4py.pca(fptype=fpType,
                          method='svdDense',
                          normalization=centering_algo,
                          resultsToCompute='mean|variance|eigenvalue',
                          isDeterministic=True,
                          nComponents=daal_n_components)
    pca_res = pca_alg.compute(X)

    self.mean_ = pca_res.means.ravel()
    variances_ = pca_res.variances.ravel()
    components_ = pca_res.eigenvectors
    explained_variance_ = pca_res.eigenvalues.ravel()
    tot_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / tot_var

    if n_components == 'mle':
        n_components = \
            _infer_dimension_(explained_variance_, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is superior to the desired threshold
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < n_sf_min:
        if explained_variance_.shape[0] == n_sf_min:
            self.noise_variance_ = \
                explained_variance_[n_components:].mean()
        else:
            # only the leading eigenvalues were computed; recover the
            # residual variance from the per-feature variances instead
            resid_var_ = variances_.sum()
            resid_var_ -= explained_variance_[:n_components].sum()
            self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = np.sqrt(
        (n_samples - 1) * self.explained_variance_)

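# Quick check (illustrative) of the identity used in the last line above: for
# centered data, singular_values_ == sqrt((n_samples - 1) *
# explained_variance_), since explained_variance_ = S**2 / (n_samples - 1).
# Verified here with scikit-learn's public PCA.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(40, 6)

pca = PCA(n_components=3, svd_solver='full').fit(X)
assert np.allclose(pca.singular_values_,
                   np.sqrt((X.shape[0] - 1) * pca.explained_variance_))
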
def _fit_full(self, X, n_components):
    """Fit the model by computing full SVD on X"""
    n_samples, n_features = X.shape

    if n_components == 'mle':
        if n_samples < n_features:
            raise ValueError("n_components='mle' is only supported "
                             "if n_samples >= n_features")
    elif not 0 <= n_components <= min(n_samples, n_features):
        raise ValueError("n_components=%r must be between 0 and "
                         "min(n_samples, n_features)=%r with "
                         "svd_solver='full'"
                         % (n_components, min(n_samples, n_features)))
    elif n_components >= 1:
        if not isinstance(n_components, (numbers.Integral, np.integer)):
            raise ValueError("n_components=%r must be of type int "
                             "when greater than or equal to 1, "
                             "was of type=%r"
                             % (n_components, type(n_components)))

    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    if X.shape[0] > X.shape[1] and (X.dtype == np.float64 or
                                    X.dtype == np.float32):
        U, S, V = _daal4py_svd(X)
    else:
        U, S, V = np.linalg.svd(X, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    components_ = V

    # Get variance explained by singular values
    explained_variance_ = (S ** 2) / (n_samples - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var

    # Postprocess the number of components required
    if n_components == 'mle':
        n_components = \
            _infer_dimension_(explained_variance_, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is superior to the desired threshold
        ratio_cumsum = explained_variance_ratio_.cumsum()
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = S[:n_components]

    return U, S, V

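# Illustrative end-to-end use of the 'mle' branch handled above, via
# scikit-learn's public API: with n_components='mle' and svd_solver='full',
# Minka's MLE picks the dimension automatically. The data below embeds 3
# strong latent directions in 10-dimensional low-noise observations.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(500, 3) @ rng.randn(3, 10) + 0.01 * rng.randn(500, 10)

pca = PCA(n_components='mle', svd_solver='full').fit(X)
print(pca.n_components_)  # expected to recover a small intrinsic dimension
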