import daal4py as d4p
import numpy as np


def main(readcsv=read_csv, method='svdDense'):
    # read_csv is the CSV-loading helper defined alongside this example;
    # it returns the file contents as a numpy array / table that daal4py accepts.
    infile = "./data/batch/pca_normalized.csv"

    # 'normalization' is an optional parameter to PCA; we use z-score,
    # which could be configured differently
    zscore = d4p.normalization_zscore()

    # configure a PCA object
    algo = d4p.pca(resultsToCompute="mean|variance|eigenvalue",
                   isDeterministic=True, normalization=zscore)

    # let's provide a file directly, not a table/array
    result1 = algo.compute(infile)

    # We can also load the data ourselves and provide the numpy array
    data = readcsv(infile)
    result2 = algo.compute(data)

    # PCA result objects provide eigenvalues, eigenvectors, means and variances
    assert np.allclose(result1.eigenvalues, result2.eigenvalues)
    assert np.allclose(result1.eigenvectors, result2.eigenvectors)
    assert np.allclose(result1.means, result2.means)
    assert np.allclose(result1.variances, result2.variances)
    assert result1.eigenvalues.shape == (1, data.shape[1])
    assert result1.eigenvectors.shape == (data.shape[1], data.shape[1])
    assert result1.means.shape == (1, data.shape[1])
    assert result1.variances.shape == (1, data.shape[1])

    return result1
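# Minimal driver sketch for the example above. The __main__ guard and the
# print statements are illustrative additions (not part of the original
# example); they assume daal4py, numpy and the read_csv helper are available.
if __name__ == "__main__":
    result = main()
    print("Eigenvalues:\n", result.eigenvalues)
    print("Eigenvectors:\n", result.eigenvectors)
    print("Means:\n", result.means)
    print("Variances:\n", result.variances)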
def pca_fit_daal(X, n_components, method):
    # pca and normalization_zscore come from daal4py; getFPType is a helper
    # defined in the surrounding benchmark code that maps X.dtype to the
    # corresponding DAAL floating-point type.
    if n_components < 1:
        n_components = min(X.shape)

    fptype = getFPType(X)

    centering_algo = normalization_zscore(
        fptype=fptype,
        doScale=False
    )

    pca_algorithm = pca(
        fptype=fptype,
        method=method,
        normalization=centering_algo,
        resultsToCompute='mean|variance|eigenvalue',
        isDeterministic=True,
        nComponents=n_components
    )

    pca_result = pca_algorithm.compute(X)
    eigenvectors = pca_result.eigenvectors
    eigenvalues = pca_result.eigenvalues.ravel()
    singular_values = np.sqrt((X.shape[0] - 1) * eigenvalues)

    return pca_result, eigenvalues, eigenvectors, singular_values
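# Standalone sanity-check sketch (numpy only, hypothetical random data) for
# the eigenvalue/singular-value relation used in pca_fit_daal above: for
# mean-centered X, the singular values satisfy
# singular_values == sqrt((n_samples - 1) * eigenvalues of the covariance).
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
Xc = X - X.mean(axis=0)                      # center the data
cov = Xc.T @ Xc / (Xc.shape[0] - 1)          # sample covariance matrix
eigenvalues = np.sort(np.linalg.eigvalsh(cov))[::-1]
singular_values = np.linalg.svd(Xc, compute_uv=False)
assert np.allclose(singular_values, np.sqrt((X.shape[0] - 1) * eigenvalues))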
def _fit_daal4py(self, X, n_components):
    n_samples, n_features = X.shape
    n_sf_min = min(n_samples, n_features)

    _validate_n_components(n_components, n_samples, n_features)

    if n_components == 'mle':
        daal_n_components = n_features
    elif n_components < 1:
        daal_n_components = n_sf_min
    else:
        daal_n_components = n_components

    fpType = getFPType(X)
    centering_algo = daal4py.normalization_zscore(fptype=fpType,
                                                  doScale=False)
    pca_alg = daal4py.pca(fptype=fpType,
                          method='svdDense',
                          normalization=centering_algo,
                          resultsToCompute='mean|variance|eigenvalue',
                          isDeterministic=True,
                          nComponents=daal_n_components)
    pca_res = pca_alg.compute(X)

    self.mean_ = pca_res.means.ravel()
    variances_ = pca_res.variances.ravel()
    components_ = pca_res.eigenvectors
    explained_variance_ = pca_res.eigenvalues.ravel()
    tot_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / tot_var

    if n_components == 'mle':
        n_components = _infer_dimension(explained_variance_, n_samples)
    elif 0 < n_components < 1.0:
        n_components = _n_components_from_fraction(
            explained_variance_ratio_, n_components)

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < n_sf_min:
        if explained_variance_.shape[0] == n_sf_min:
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            resid_var_ = variances_.sum()
            resid_var_ -= explained_variance_[:n_components].sum()
            self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
    self.singular_values_ = np.sqrt(
        (n_samples - 1) * self.explained_variance_)
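# Numpy-only sketch (hypothetical data) of the Probabilistic PCA noise
# variance used above: sigma2 is the mean of the discarded eigenvalues, which
# is what the "eq. 12.46" comment in _fit_daal4py refers to. The residual-
# variance form in the else-branch gives the same value when the full
# spectrum is known, as this toy check illustrates.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(50, 6)
Xc = X - X.mean(axis=0)
explained_variance = np.sort(
    np.linalg.eigvalsh(Xc.T @ Xc / (Xc.shape[0] - 1)))[::-1]
n_components = 2
noise_variance = explained_variance[n_components:].mean()
# equivalent form via the residual variance
resid = explained_variance.sum() - explained_variance[:n_components].sum()
assert np.isclose(noise_variance,
                  resid / (explained_variance.size - n_components))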
def compute(data):
    # 'normalization' is an optional parameter to PCA; we use z-score,
    # which could be configured differently
    zscore = d4p.normalization_zscore()

    # configure a PCA object
    algo = d4p.pca(resultsToCompute="mean|variance|eigenvalue",
                   isDeterministic=True, normalization=zscore)

    return algo.compute(data)
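# Hypothetical usage of compute() above: pass any 2D numpy array. This is an
# illustrative call, not part of the original example; it assumes daal4py is
# installed and imported as d4p as shown earlier.
import numpy as np

data = np.random.rand(100, 10)
res = compute(data)
print(res.eigenvalues)    # shape (1, 10)
print(res.eigenvectors)   # shape (10, 10)
print(res.means)          # shape (1, 10)
print(res.variances)      # shape (1, 10)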
def main(readcsv=read_csv, method='defaultDense'):
    infile = "./data/batch/normalization.csv"

    # configure a z-score normalization object; doScale toggles scaling to
    # unit variance and may not be supported by older daal4py builds, hence
    # the fallback
    try:
        algo = d4p.normalization_zscore(doScale=True)
    except (AttributeError, TypeError):
        algo = d4p.normalization_zscore()

    # let's provide a file directly, not a table/array
    result1 = algo.compute(infile)

    # We can also load the data ourselves and provide the numpy array
    data = readcsv(infile)
    result2 = algo.compute(data)

    # normalization result objects provide the normalized data
    assert np.allclose(result1.normalizedData, result2.normalizedData)

    return result1
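# Numpy-only sketch (hypothetical data) of what the z-score result above
# should resemble: each column is centered and, when doScale=True, divided by
# its standard deviation. Whether oneDAL uses the population (ddof=0) or the
# sample (ddof=1) standard deviation is not asserted here; treat this as an
# illustrative cross-check rather than a strict test.
import numpy as np

data = np.random.rand(20, 4)
centered = data - data.mean(axis=0)
scaled = centered / data.std(axis=0, ddof=1)
# compare against the algorithm's output, e.g.:
#   np.allclose(result1.normalizedData, scaled)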