def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)

    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import sklearn.externals.joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            spca = SparsePCA(n_components=3, n_jobs=2, random_state=0,
                             alpha=alpha).fit(Y)
            U2 = spca.transform(Y)
        finally:
            joblib_par.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        spca = SparsePCA(n_components=3, n_jobs=2, method='lars',
                         alpha=alpha, random_state=0).fit(Y)
        U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)

    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)

    # Test multiple CPUs
    spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                     random_state=0).fit(Y)
    U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)

    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)

    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)

    # Test that deprecated ridge_alpha parameter throws warning
    warning_msg = "The ridge_alpha parameter on transform()"
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=0.01)
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=None)
def test_fit_transform_tall():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    spca_lars = SparsePCA(n_components=3, method='lars', random_state=rng)
    U1 = spca_lars.fit_transform(Y)
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=rng)
    U2 = spca_lasso.fit(Y).transform(Y)
    assert_array_almost_equal(U1, U2)
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method="lars", alpha=alpha, random_state=0)
    spca_lars.fit(Y)

    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method="cd", random_state=0, alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_correct_shapes():
    rng = np.random.RandomState(0)
    X = rng.randn(12, 10)
    spca = SparsePCA(n_components=8, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (8, 10))
    assert_equal(U.shape, (12, 8))
    # test overcomplete decomposition
    spca = SparsePCA(n_components=13, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (13, 10))
    assert_equal(U.shape, (12, 13))
def test_fit_transform_parallel():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method="lars", alpha=alpha, random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    spca = SparsePCA(
        n_components=3, n_jobs=2, method="lars", alpha=alpha, random_state=0
    ).fit(Y)
    U2 = spca.transform(Y)
    assert not np.all(spca_lars.components_ == 0)
    assert_array_almost_equal(U1, U2)
def _explainedvar(X, n_components=None, onehot=False, random_state=None,
                  n_jobs=-1, verbose=3):
    # Create the model
    if sp.issparse(X):
        if verbose >= 3: print('[pca] >Fitting using Truncated SVD..')
        model = TruncatedSVD(n_components=n_components,
                             random_state=random_state)
    elif onehot:
        if verbose >= 3: print('[pca] >Fitting using Sparse PCA..')
        model = SparsePCA(n_components=n_components,
                          random_state=random_state, n_jobs=n_jobs)
    else:
        if verbose >= 3: print('[pca] >Fitting using PCA..')
        model = PCA(n_components=n_components, random_state=random_state)
    # Fit model
    model.fit(X)
    # Do the reduction
    if verbose >= 3: print('[pca] >Computing loadings and PCs..')
    loadings = model.components_  # also called the coefficients
    PC = model.transform(X)
    if not onehot:
        # Compute explained variance, top 95% variance
        if verbose >= 3: print('[pca] >Computing explained variance..')
        percentExplVar = model.explained_variance_ratio_.cumsum()
    else:
        percentExplVar = None
    # Return
    return (model, PC, loadings, percentExplVar)
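# Hedged usage sketch (an assumption, not part of the original source):
# exercising _explainedvar above on a small dense matrix. Assumes numpy as np
# plus the scipy.sparse and sklearn imports used by the function are in scope.
import numpy as np

X_demo = np.random.RandomState(0).randn(50, 12)
model, PC, loadings, pev = _explainedvar(X_demo, n_components=4, verbose=0)
assert PC.shape == (50, 4) and loadings.shape == (4, 12)
# In the onehot branch the model is SparsePCA, which has no
# explained_variance_ratio_ attribute, so percentExplVar comes back as None.
_, PC_sp, _, pev_sp = _explainedvar(X_demo, n_components=4, onehot=True, verbose=0)
assert pev_sp is None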
def compress_l1(W, b, fac, alpha):
    # This is the Sparse-Coreset setting
    W_s = W.reshape(W.shape[0], W.size // W.shape[0])
    b_s = b.reshape(b.shape[0], b.size // b.shape[0])
    X = np.concatenate([W_s, b_s], axis=1).transpose()
    if fac == -1:
        # Use 4/sqrt(3) * median singular value as the truncation value;
        # we have to do SVD on the full matrix to find the median singular value
        Ux, sx, Vx = np.linalg.svd(X)
        r = (4.0 / (3.0 ** 0.5)) * np.median(sx)
        n_comp = np.sum(np.array([int(j >= r) for j in sx]))
        print('Optimal singular value: %f, Number of components selected: %d/%d'
              % (r, n_comp, len(sx)))
    elif fac == 0:
        print('No compression, Number of components selected: %d/%d'
              % (W.shape[0], W.shape[0]))
        return (W, b)
    else:
        n_comp = int(W.shape[0] * fac)
        print('Predefined ratio, Number of components selected: %d/%d'
              % (n_comp, W.shape[0]))
    # Perform truncation
    pca = SparsePCA(n_components=n_comp, alpha=alpha)
    Xs = pca.fit_transform(X)
    Xr = np.dot(Xs, pca.components_)
    # Approximate the original weights
    Wf = Xr.transpose()[:, :-1].reshape(W.shape)
    bf = Xr.transpose()[:, -1].reshape(b.shape)
    return Wf, bf
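# Hedged usage sketch (an assumption, not from the original source):
# compressing a small dense layer's weights with compress_l1 above.
# W_demo and b_demo are hypothetical stand-ins for real layer parameters.
import numpy as np

W_demo = np.random.RandomState(1).randn(8, 16)  # 8 units, 16 inputs each
b_demo = np.random.RandomState(2).randn(8)
Wf, bf = compress_l1(W_demo, b_demo, fac=0.5, alpha=1.0)  # keep 8*0.5 = 4 comps
assert Wf.shape == W_demo.shape and bf.shape == b_demo.shape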
def fit(self, data):
    '''Fit the dimensionality-reduction model on the data.'''
    # Use n_comp as the baseline; if the cumulative explained variance
    # falls short of cum_std, refit PCA with cum_std as the variance target.
    if self.method == 'pca':
        self.dr_model = PCA(n_components=self.n_comp)
        self.dr_model.fit(data)
        if self.dr_model.explained_variance_ratio_.cumsum()[-1] < self.cum_std:
            # A float in (0, 1) makes PCA keep enough components to reach
            # that fraction of explained variance.
            self.dr_model = PCA(n_components=self.cum_std)
            self.dr_model.fit(data)
    elif self.method == 'kpca':
        self.dr_model = KernelPCA(n_components=self.n_comp, kernel="rbf")
        self.dr_model.fit(data)
    elif self.method == 'fa':
        self.dr_model = FactorAnalysis(n_components=self.n_comp)
        self.dr_model.fit(data)
    elif self.method == 'spca':
        self.dr_model = SparsePCA(n_components=self.n_comp)
        self.dr_model.fit(data)
    elif self.method == 'tsvd':
        self.dr_model = TruncatedSVD(n_components=self.n_comp)
        self.dr_model.fit(data)
    elif self.method == 'ipca':
        self.dr_model = IncrementalPCA(n_components=self.n_comp)
        self.dr_model.fit(data)
    self.data_col = data.columns
def featureVect(X_train, y, components, feature_para):
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 25), stop_words="english")
    X_2 = bigram_vectorizer.fit_transform(X_train).toarray()
    vectorizer = TfidfVectorizer(ngram_range=(1, 25), stop_words="english")
    X_2_DFIDF = vectorizer.fit_transform(X_train).toarray()
    X = np.multiply(X_2, X_2_DFIDF)

    # This dataset is way too high-dimensional. Better do PCA:
    # pca = PCA(n_components=400)
    pca = SparsePCA(n_components=components[0])

    # Build estimator from PCA and Univariate selection:
    # ("dfr", selection_fdr), ("fwe", selection_fwe), ("fpr", selection_fpr),
    # ("univ_select", selection)
    feature_list = [("pca", pca)]
    feature_list += feature_para
    combined_features = FeatureUnion(feature_list)

    # Use combined features to transform dataset:
    X_features = combined_features.fit(X, y).transform(X)
    select_chi = chi2(X_2, y)
    ind = np.argpartition(select_chi[0], -components[1])[-components[1]:]
    selection_chi2 = X_2[:, ind]
    X_features = np.concatenate((X_features, selection_chi2), axis=1)
    return [X_features, combined_features, bigram_vectorizer, vectorizer, ind]
def factorization(method='TruncatedSVD', n_components=10):
    # Unsupervised feature selection: matrix factorization with `method`
    # (`n_components` components).
    sparse = {
        'LatentDirichletAllocation': LatentDirichletAllocation(
            n_components=n_components, n_jobs=-1, learning_method='online'),
        'TruncatedSVD': Pipeline([
            ("selector", TruncatedSVD(n_components=n_components)),
            ("normalizer", MinMaxScaler())]),
        'NMF': Pipeline([
            ("selector", NMF(n_components=n_components, tol=0.01)),
            ("normalizer", MinMaxScaler())]),
    }
    model = sparse.get(method, None)
    if model is not None:
        return model
    dense = {
        'PCA': PCA(n_components=n_components),
        'SparsePCA': SparsePCA(n_components=n_components),
        'FactorAnalysis': FactorAnalysis(n_components=n_components),
    }
    model = dense.get(method, None)
    if model is not None:
        return Pipeline([("densifier", Densifier()),
                         ("selector", model),
                         ("normalizer", MinMaxScaler())])
    else:
        return Pipeline([("selector", TruncatedSVD(n_components=n_components)),
                         ("normalizer", MinMaxScaler())])
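# Hedged usage sketch (an assumption, not from the original source): the
# 'TruncatedSVD' branch returns a self-contained Pipeline, so it can be
# exercised without the custom Densifier that the dense branch requires.
import numpy as np

pipe = factorization('TruncatedSVD', n_components=3)
X_demo = np.random.RandomState(0).rand(40, 10)
Z = pipe.fit_transform(X_demo)  # reduced to 3 dims, then scaled into [0, 1]
assert Z.shape == (40, 3)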
def createSparsePCADecomposition(params):
    # params['method'] = {'lars', 'cd'}
    # params['alpha'] = {1}
    # params['ridge_alpha'] = {1}
    cls = SparsePCA()
    return cls
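# Hedged variant (an assumption, not from the original source): the factory
# above documents params but never applies them; a version that does might
# look like this, falling back to scikit-learn's defaults when a key is absent.
def createSparsePCADecompositionWithParams(params):
    return SparsePCA(method=params.get('method', 'lars'),
                     alpha=params.get('alpha', 1),
                     ridge_alpha=params.get('ridge_alpha', 0.01))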
def _sparse_pca(self, x, y):
    """
    Computes the adaptive weights based on sparse principal component analysis.
    """
    # Compute sparse pca
    x_center = x - x.mean(axis=0)
    total_variance_in_x = np.sum(np.var(x, axis=0))
    spca = SparsePCA(n_components=np.min((x.shape[0], x.shape[1])),
                     alpha=self.spca_alpha, ridge_alpha=self.spca_ridge_alpha)
    t = spca.fit_transform(x_center)
    p = spca.components_.T
    # Obtain explained variance using spca as explained in the original paper
    # (based on QR decomposition)
    t_spca_qr_decomp = np.linalg.qr(t)  # QR decomposition of modified PCs
    r_spca = t_spca_qr_decomp[1]
    t_spca_variance = np.diag(r_spca) ** 2 / x.shape[0]
    # compute variance_ratio
    fractions_of_explained_variance = np.cumsum(
        t_spca_variance / total_variance_in_x)
    # Update variability_pct
    self.variability_pct = np.min(
        (self.variability_pct, np.max(fractions_of_explained_variance)))
    n_comp = np.argmax(
        fractions_of_explained_variance >= self.variability_pct) + 1
    unpenalized_model = ASGL(model=self.model, penalization=None,
                             intercept=True, tau=self.tau)
    unpenalized_model.fit(x=t[:, 0:n_comp], y=y)
    beta_qr = unpenalized_model.coef_[0][1:]
    # Recover an estimation of the beta parameters and use it as weight
    tmp_weight = np.abs(np.dot(p[:, 0:n_comp], beta_qr)).flatten()
    return tmp_weight
def reduce_dimension(self, X):
    """
    Perform dimensionality reduction.

    Inputs:
        X: (DataFrame) Independent variables.

    Returns:
        pd_new_X: (DataFrame) Reduced-dimension independent variables.
        mode: (str) Dimensionality reduction used (PCA | SparsePCA | tSNE).
    """
    if self.dimension_reduction_mode.lower() == 'pca':
        model = PCA(n_components=self.projection_dim)
        column_prefix = 'pc'
    elif self.dimension_reduction_mode.lower() == 'sparsepca':
        model = SparsePCA(n_components=self.projection_dim)
        column_prefix = 'pc'
    elif self.dimension_reduction_mode.lower() == 'tsne':
        model = TSNE(n_components=self.projection_dim)
        column_prefix = 'embedding'
    else:
        raise ValueError('Invalid mode: {}'.format(self.dimension_reduction_mode))
    pd_new_X = pd.DataFrame(
        model.fit_transform(X),
        index=X.index,
        columns=[column_prefix + str(i + 1) for i in range(self.projection_dim)])
    return pd_new_X, self.dimension_reduction_mode
def sparse_pca(df_train, df_test, n_components=30):
    print("sparse_pca")
    cols = [
        "card1", "card2", "card3", "card4", "card5", "card6",
        "addr1", "addr2", "id_19", "DeviceInfo"
    ]
    # Collapse rare category levels (75 occurrences or fewer) into "others"
    for col in cols:
        valid = pd.concat([df_train[[col]], df_test[[col]]])
        valid = valid[col].value_counts()
        valid = valid[valid > 75]
        valid = list(valid.index)
        df_train[col] = np.where(df_train[col].isin(valid), df_train[col], "others")
        df_test[col] = np.where(df_test[col].isin(valid), df_test[col], "others")

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    X_all = pd.concat([df_train, df_test])[cols]
    X_all = pd.get_dummies(X_all, columns=cols, sparse=True).astype(np.int8)
    print(X_all.shape)
    X_all = SparsePCA(n_components=n_components).fit_transform(X_all)
    col_names = ["cat_SparsePCA_{}".format(x) for x in range(n_components)]
    df_train = pd.DataFrame(X_all[:len(df_train)], columns=col_names)
    df_test = pd.DataFrame(X_all[len(df_train):], columns=col_names)
    return df_train, df_test
def SPCA(X, reg, reg2):
    X = StandardScaler().fit_transform(X)
    transformer = SparsePCA(n_components=9, alpha=reg, ridge_alpha=reg2)
    transformer.fit(X)
    # Normalize each sparse component to unit length
    norm_comps = np.array(
        [i / np.linalg.norm(i) for i in transformer.components_])
    return norm_comps
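# Hedged usage sketch (an assumption, not from the original source): SPCA
# above standardizes X and returns nine unit-norm sparse loading vectors.
import numpy as np

X_demo = np.random.RandomState(0).randn(40, 12)
comps = SPCA(X_demo, reg=0.1, reg2=0.01)
print(comps.shape)  # (9, 12); a component with all-zero loadings would yield NaNs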
def get_dim_reds_scikit(pct_features):
    n_components = max(int(pct_features * num_features), 1)
    return [
        LinearDiscriminantAnalysis(n_components=n_components),
        TruncatedSVD(n_components=n_components),
        # SparseCoder(n_components=n_components),
        DictionaryLearning(n_components=n_components),
        FactorAnalysis(n_components=n_components),
        SparsePCA(n_components=n_components),
        NMF(n_components=n_components),
        PCA(n_components=n_components),
        # RandomizedPCA was removed from scikit-learn (0.20+); on recent
        # versions use PCA(svd_solver='randomized') instead.
        RandomizedPCA(n_components=n_components),
        KernelPCA(kernel="linear", n_components=n_components),
        KernelPCA(kernel="poly", n_components=n_components),
        KernelPCA(kernel="rbf", n_components=n_components),
        KernelPCA(kernel="sigmoid", n_components=n_components),
        KernelPCA(kernel="cosine", n_components=n_components),
        Isomap(n_components=n_components),
        LocallyLinearEmbedding(n_components=n_components, eigen_solver='auto',
                               method='standard'),
        LocallyLinearEmbedding(n_neighbors=n_components,
                               n_components=n_components,
                               eigen_solver='auto', method='modified'),
        LocallyLinearEmbedding(n_neighbors=n_components,
                               n_components=n_components,
                               eigen_solver='auto', method='ltsa'),
        SpectralEmbedding(n_components=n_components)
    ]
def _explainedvar(X, n_components=None, sparse=False, random_state=None,
                  verbose=3):
    # Create the model
    if sp.issparse(X):
        if verbose >= 3: print('[TruncatedSVD] Fit..')
        model = TruncatedSVD(n_components=n_components,
                             random_state=random_state)
    elif sparse:
        if verbose >= 3: print('[PCA] Fit sparse dataset..')
        model = SparsePCA(n_components=n_components, random_state=random_state)
    else:
        if verbose >= 3: print('[PCA] Fit..')
        model = PCA(n_components=n_components, random_state=random_state)
    # Fit model
    model.fit(X)
    # Do the reduction
    loadings = model.components_  # also called the coefficients
    PC = model.transform(X)
    # Compute explained variance, top 95% variance.
    # Note: SparsePCA has no explained_variance_ratio_ attribute, so the
    # sparse branch will raise an AttributeError here.
    percentExplVar = model.explained_variance_ratio_.cumsum()
    # Return
    return (model, PC, loadings, percentExplVar)
def pca_svm(filename):
    data = pd.read_csv('archive/' + filename, usecols=['label', 'tweet'])
    vectorizer = TfidfVectorizer()
    vectorized = vectorizer.fit_transform(data['tweet'])
    vectorized = vectorized.todense()
    X_tr, X_te, y_tr, y_te = train_test_split(vectorized, data['label'],
                                              test_size=0.2)
    pca = SparsePCA()
    X_tr = pca.fit_transform(X_tr)
    X_te = pca.transform(X_te)
    clf = SVC(kernel='rbf')
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    y_pred_tr = clf.predict(X_tr)
    accuracy = accuracy_score(y_te, y_pred)
    accuracy_train = accuracy_score(y_tr, y_pred_tr)
    # plot_confusion_matrix was removed in scikit-learn 1.2; on recent
    # versions use ConfusionMatrixDisplay.from_estimator(clf, X_te, y_te)
    plot_confusion_matrix(clf, X_te, y_te)
    plt.show()
def train_reduc(data, reduc_type='pca', kernel='rbf', n_c=8, eps=0.01,
                random_state=2020):
    if reduc_type == 'pca':
        reduc = PCA(n_components=n_c)
    elif reduc_type == 'spca':
        reduc = SparsePCA(n_components=n_c)
    elif reduc_type == 'kpca':
        reduc = KernelPCA(n_components=n_c, kernel=kernel)
    elif reduc_type == 'ica':
        reduc = FastICA(n_components=n_c)
    elif reduc_type == 'grp':
        reduc = GaussianRandomProjection(n_components=n_c, eps=eps,
                                         random_state=random_state)
    elif reduc_type == 'srp':
        reduc = SparseRandomProjection(n_components=n_c, density='auto',
                                       eps=eps, dense_output=True,
                                       random_state=random_state)
    else:
        # Fail fast instead of hitting an UnboundLocalError below
        raise ValueError('Unknown reduc_type: {}'.format(reduc_type))
    reduced = reduc.fit_transform(data)
    print('Reduc Complete')
    return reduced, reduc
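# Hedged usage sketch (an assumption, not from the original source): fitting
# the 'spca' branch of train_reduc above and reusing the fitted reducer.
import numpy as np

data_demo = np.random.RandomState(0).randn(100, 20)
reduced, reduc = train_reduc(data_demo, reduc_type='spca', n_c=8)
assert reduced.shape == (100, 8)
# The returned estimator can project new rows from the same feature space:
new_rows = reduc.transform(np.random.RandomState(1).randn(5, 20))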
def __init__(self, num_components=10, catalog_name='unknown', alpha=0.1,
             ridge_alpha=0.01, max_iter=2000, tol=1e-9, n_jobs=1,
             random_state=None):
    self._decomposition = 'Sparse PCA'
    self._num_components = num_components
    self._catalog_name = catalog_name
    self._alpha = alpha
    self._ridge_alpha = ridge_alpha
    self._n_jobs = n_jobs
    self._max_iter = max_iter
    self._tol = tol
    self._random_state = random_state
    self._SPCA = SparsePCA(n_components=self._num_components,
                           alpha=self._alpha,
                           ridge_alpha=self._ridge_alpha,
                           n_jobs=self._n_jobs,
                           max_iter=self._max_iter,
                           tol=self._tol,
                           random_state=self._random_state)
def _fitted_sparse_pca(X, d, unscaled_alpha, **kwargs):
    # this seems to work better than initializing with MiniBatchSparsePCA,
    # svd of cov mat, or basically anything else I tried
    U, _, Vt = randomized_svd(X, n_components=d, random_state=123)
    U = U[:, :d]
    V = Vt.T[:d]

    # SparsePCA (and all the sklearn dictionary learning stuff)
    # internally uses sum of squared errs for each sample, and L1 norm
    # of parameter matrix; to make alpha meaningful across datasets,
    # want to scale by number of examples (so it's effectively using MSE)
    # and divide by L1 norm (which grows linearly with size of parameter
    # matrix / vector); also scale by variance of data for similar reasons
    N, D = X.shape
    alpha = unscaled_alpha * np.var(X - X.mean(axis=0)) * N / D

    verbose = 1
    # Note: normalize_components was deprecated in scikit-learn 0.22 and
    # removed in 0.24; on recent versions components are always normalized.
    pca = SparsePCA(
        n_components=d,
        alpha=alpha,
        normalize_components=True,
        method='lars',
        U_init=U,
        V_init=V,
        max_iter=10,
        ridge_alpha=max(1, len(X) * X.std() * 10),
        # ridge_alpha=1e8,
        verbose=verbose,
        random_state=123)
    if verbose > 0:
        print("fitting sparse pca...")
    return pca.fit(X)
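# Hedged numeric sketch (an assumption, not from the original source): how the
# alpha rescaling above behaves. With N=100 roughly unit-variance samples in
# D=20 dimensions, the effective alpha is about unscaled_alpha * 100 / 20.
import numpy as np

X_demo = np.random.RandomState(0).randn(100, 20)
N_demo, D_demo = X_demo.shape
alpha_demo = 0.5 * np.var(X_demo - X_demo.mean(axis=0)) * N_demo / D_demo
print(alpha_demo)  # close to 0.5 * 100 / 20 = 2.5 for unit-variance data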
def calc_principal_components(self, df, n_comp=20, method='PCA'):
    '''
    Run PCA or Sparse PCA on a feature table.

    :param df: feature table containing 'Gene_Name' and the target column
    :param n_comp: number of components to extract
    :param method: 'PCA' or 'SparsePCA'
    :return: (fitted decomposition model, DataFrame of principal components)
    '''
    print(">> Running " + method + "...")
    if df.shape[1] <= n_comp:
        n_comp = df.shape[1] - 1
    tmp_drop_cols = ['Gene_Name', self.cfg.Y]
    X = df.drop(tmp_drop_cols, axis=1)
    pca_data = X.copy()

    pca = None
    if method == 'SparsePCA':
        pca = SparsePCA(n_components=n_comp)
    else:
        pca = PCA(n_components=n_comp)
    principal_components = pca.fit_transform(pca_data)

    columns = []
    for i in range(1, n_comp + 1):
        columns.append('PC' + str(i))
    pca_df = pd.DataFrame(data=principal_components, columns=columns)
    pca_df = pd.concat([pca_df, df[tmp_drop_cols]], axis=1)

    filepath = str(self.cfg.unsuperv_out / (method + ".table.tsv"))
    pca_df.to_csv(filepath, sep='\t', index=None)
    return pca, pca_df
def __init__(self, *args, sparse=False, kernel=None, **kwargs):
    super().__init__(*args, **kwargs)
    if kernel:
        self.model = KernelPCA(
            n_components=self.n_latent,
            kernel=kernel,
            random_state=self.random_state,
            copy_X=False
        )
    elif sparse:
        self.model = SparsePCA(
            n_components=self.n_latent,
            random_state=self.random_state
        )
    else:
        self.model = PCA(
            n_components=self.n_latent,
            random_state=self.random_state
        )
def spca(components, train_matrix, test_matrix):
    """Sparse principal component analysis routine.

    Parameters
    ----------
    components : int
        The number of components to be returned.
    train_matrix : array
        The training features.
    test_matrix : array
        The test features.

    Returns
    -------
    new_train : array
        Extracted training features.
    new_test : array
        Extracted test features.
    """
    msg = 'The number of components must be a positive integer.'
    assert components > 0, msg
    pca = SparsePCA(n_components=components)
    model = pca.fit(X=train_matrix)
    new_train = model.transform(train_matrix)
    new_test = model.transform(test_matrix)

    return new_train, new_test
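# Hedged usage sketch (an assumption, not part of the original module):
# fitting on a toy train split and transforming both splits with spca above.
import numpy as np

rng = np.random.RandomState(0)
train_demo, test_demo = rng.randn(60, 10), rng.randn(20, 10)
new_train, new_test = spca(2, train_demo, test_demo)
assert new_train.shape == (60, 2) and new_test.shape == (20, 2)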
def test_transform_nan():
    # Test that SparsePCA won't return NaN when there is a feature that is
    # zero in all samples.
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    Y[:, 0] = 0
    estimator = SparsePCA(n_components=8)
    assert not np.any(np.isnan(estimator.fit_transform(Y)))
def test_initialization():
    rng = np.random.RandomState(0)
    U_init = rng.randn(5, 3)
    V_init = rng.randn(3, 4)
    model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init,
                      max_iter=0, random_state=rng)
    model.fit(rng.randn(5, 4))
    assert_array_equal(model.components_, V_init)
def sccodedirect():
    """Obtain the sparse PCA (RPCA) result for the no-glasses face data."""
    nglassmodel = np.load('nglassline.npy').astype('f')
    from sklearn.decomposition import SparsePCA
    learning = SparsePCA(n_components=500, verbose=True)
    learning.fit(nglassmodel)
    # cPickle and file() are Python 2 only; use pickle and open() on Python 3
    import pickle
    with open('sparsepcadirect', 'wb') as f:
        pickle.dump(learning, f, -1)
def sparse_pca(K, alpha, ridge_alpha):
    # Note: normalize_components was deprecated in scikit-learn 0.22 and
    # removed in 0.24, so this call requires an older sklearn version.
    transformer = SparsePCA(n_components=1, alpha=alpha,
                            ridge_alpha=ridge_alpha,
                            normalize_components=False, random_state=0)
    transformer.fit(K)
    val = transformer.components_[0]
    print('#nnz: ', np.sum(np.abs(val) > 1.0e-10))
    # print(np.sum(val * val))
    # val = np.random.randn(K.shape[1])
    return val / np.linalg.norm(val)
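# Hedged usage sketch (an assumption, not from the original source; needs an
# sklearn version old enough to accept normalize_components). Extracts the
# leading sparse direction of a small Gram-style matrix.
import numpy as np

A = np.random.RandomState(0).randn(30, 6)
K_demo = A.T @ A  # 6 x 6 symmetric positive semi-definite matrix
v = sparse_pca(K_demo, alpha=1.0, ridge_alpha=0.01)
print(v.shape, np.linalg.norm(v))  # (6,) and ~1.0 unless the component is all zero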