def SPCA(X, reg, reg2):
    """Fit a 9-component SparsePCA on standardized X.

    reg is the sparsity penalty (alpha), reg2 the ridge penalty used at
    transform time (ridge_alpha).  Returns the learned components rescaled
    to unit Euclidean norm, as a numpy array.
    """
    scaled = StandardScaler().fit_transform(X)
    model = SparsePCA(n_components=9, alpha=reg, ridge_alpha=reg2)
    model.fit(scaled)
    # Normalize every component row to unit length.
    return np.array([comp / np.linalg.norm(comp) for comp in model.components_])
def test_fit_transform():
    """lars fit, parallel fit, and cd fit should agree on wide toy data."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    lars_model = SparsePCA(n_components=3, method='lars', alpha=alpha,
                           random_state=0)
    lars_model.fit(Y)
    U1 = lars_model.transform(Y)

    # Exercise the multi-CPU code path.  On win32 real multiprocessing is not
    # usable here, so temporarily disable joblib's multiprocessing instead.
    if sys.platform == 'win32':
        import sklearn.externals.joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            par_model = SparsePCA(n_components=3, n_jobs=2, random_state=0,
                                  alpha=alpha).fit(Y)
            U2 = par_model.transform(Y)
        finally:
            # Always restore the module attribute, even if the fit raises.
            joblib_par.multiprocessing = _mp
    else:
        par_model = SparsePCA(n_components=3, n_jobs=2, method='lars',
                              alpha=alpha, random_state=0).fit(Y)
        U2 = par_model.transform(Y)

    assert_true(not np.all(lars_model.components_ == 0))
    assert_array_almost_equal(U1, U2)

    # Coordinate descent should recover components close to lars'.
    cd_model = SparsePCA(n_components=3, method='cd', random_state=0,
                         alpha=alpha)
    cd_model.fit(Y)
    assert_array_almost_equal(cd_model.components_, lars_model.components_)
def test_initialization():
    """With max_iter=0, fitting must leave the V_init components untouched."""
    rng = np.random.RandomState(0)
    U_init, V_init = rng.randn(5, 3), rng.randn(3, 4)
    model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init,
                      max_iter=0, random_state=rng)
    model.fit(rng.randn(5, 4))
    assert_array_equal(model.components_, V_init)
def test_fit_transform():
    """cd and lars fits must agree; transform-time ridge_alpha is deprecated."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    lars_model = SparsePCA(n_components=3, method='lars', alpha=alpha,
                           random_state=0)
    lars_model.fit(Y)

    # Coordinate descent should give components close to lars'.
    cd_model = SparsePCA(n_components=3, method='cd', random_state=0,
                         alpha=alpha)
    cd_model.fit(Y)
    assert_array_almost_equal(cd_model.components_, lars_model.components_)

    # Passing ridge_alpha to transform() must raise a DeprecationWarning,
    # whether a value or None is given.
    warning_msg = "The ridge_alpha parameter on transform()"
    for ridge in (0.01, None):
        assert_warns_message(DeprecationWarning, warning_msg,
                             lars_model.transform, Y, ridge_alpha=ridge)
def test_fit_transform():
    """A 2-job fit must match the single-job lars fit; cd must agree too."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    model_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                           random_state=0)
    model_lars.fit(Y)
    U1 = model_lars.transform(Y)

    # Same settings with two workers: projections should be identical.
    model_par = SparsePCA(n_components=3, n_jobs=2, method='lars',
                          alpha=alpha, random_state=0).fit(Y)
    U2 = model_par.transform(Y)
    assert_true(not np.all(model_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)

    # Coordinate descent should recover similar components.
    model_cd = SparsePCA(n_components=3, method='cd', random_state=0,
                         alpha=alpha)
    model_cd.fit(Y)
    assert_array_almost_equal(model_cd.components_, model_lars.components_)
def test_fit_transform():
    """Serial lars, parallel, and cd variants must all produce matching results."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    ref = SparsePCA(n_components=3, method='lars', alpha=alpha, random_state=0)
    ref.fit(Y)
    U1 = ref.transform(Y)

    # Multi-CPU path; win32 cannot run real multiprocessing here, so the
    # joblib backend is faked by nulling its multiprocessing module.
    if sys.platform == 'win32':
        import sklearn.externals.joblib.parallel as joblib_par
        saved_mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            fitted = SparsePCA(n_components=3, n_jobs=2, random_state=0,
                               alpha=alpha).fit(Y)
            U2 = fitted.transform(Y)
        finally:
            # Restore the patched attribute no matter what.
            joblib_par.multiprocessing = saved_mp
    else:
        fitted = SparsePCA(n_components=3, n_jobs=2, method='lars',
                           alpha=alpha, random_state=0).fit(Y)
        U2 = fitted.transform(Y)

    assert_true(not np.all(ref.components_ == 0))
    assert_array_almost_equal(U1, U2)

    # cd solver should land near the lars solution.
    cd = SparsePCA(n_components=3, method='cd', random_state=0, alpha=alpha)
    cd.fit(Y)
    assert_array_almost_equal(cd.components_, ref.components_)
class SparsePCA():
    """Column-subset SparsePCA transformer for pandas DataFrames.

    Fits sklearn's SparsePCA on `cols` only, then replaces those columns
    with the derived features spca_1 .. spca_<n_components>.

    BUG FIX: the original ``self.model = SparsePCA(n_components=...)``
    resolved to this class itself (this class shadows the sklearn name),
    so constructing an instance failed.  The sklearn estimator is now
    imported under an alias inside ``__init__``.
    """

    def __init__(self, cols, n_components):
        # Local aliased import: this class shadows sklearn's SparsePCA name.
        from sklearn.decomposition import SparsePCA as _SkSparsePCA
        self.n_components = n_components
        self.model = _SkSparsePCA(n_components=n_components)
        self.columns = cols

    def fit(self, data):
        """Fit the underlying estimator on the selected columns of `data`."""
        self.model.fit(data[self.columns])

    def _as_frame(self, transformed):
        # Wrap the component matrix with the spca_<i> naming convention.
        return pd.DataFrame(
            transformed,
            columns=["spca_" + str(i + 1) for i in range(self.n_components)])

    def fit_transform(self, data):
        """Fit on `data` and return it with `cols` replaced by components."""
        transformed = self._as_frame(
            self.model.fit_transform(data[self.columns]))
        data = pd.concat([data, transformed], axis=1)
        return data.drop(self.columns, axis=1)

    def transform(self, data):
        """Project `data` and return it with `cols` replaced by components."""
        transformed = self._as_frame(self.model.transform(data[self.columns]))
        data = pd.concat([data, transformed], axis=1)
        return data.drop(self.columns, axis=1)
def test_initialization():
    """Zero iterations of fitting must return V_init as the components."""
    rng = np.random.RandomState(0)
    U_init = rng.randn(5, 3)
    V_init = rng.randn(3, 4)
    est = SparsePCA(n_components=3, U_init=U_init, V_init=V_init, max_iter=0,
                    random_state=rng)
    est.fit(rng.randn(5, 4))
    # With max_iter=0 no dictionary-learning step runs.
    assert_array_equal(est.components_, V_init)
def sccodedirect():
    """Fit SparsePCA on the no-glasses data and pickle the fitted model."""
    # NOTE(review): Python 2 only — relies on cPickle and the file() builtin.
    # Load the pre-saved line data as float32.
    nglassmodel = np.load('nglassline.npy').astype('f')
    from sklearn.decomposition import SparsePCA
    # 500 components, verbose progress output.
    learning = SparsePCA(500,verbose=True)
    learning.fit(nglassmodel)
    import cPickle
    # Highest pickle protocol (-1) for compactness.
    cPickle.dump(learning,file('sparsepcadirect','wb'),-1)
def sparse_pca(self):
    """
    Runs PCA on view and returns projected view, the principle
    components, and explained variance.
    """
    # `param` is presumably a module-level configuration mapping — TODO confirm.
    estimator = SparsePCA(n_components=param['components'],
                          alpha=param['sparse_pca_alpha'])
    estimator.fit(self.view)
    projected = estimator.transform(self.view)
    return projected, estimator.components_
def sparse_pca(K, alpha, ridge_alpha):
    """Return the unit-norm first sparse principal component of K."""
    model = SparsePCA(n_components=1, alpha=alpha, ridge_alpha=ridge_alpha,
                      normalize_components=False, random_state=0)
    model.fit(K)
    component = model.components_[0]
    # Report how many entries are effectively non-zero.
    print('#nnz: ', np.sum(np.abs(component) > 1.0e-10))
    #print(np.sum(val * val))
    #val = np.random.randn(K.shape[1])
    return component / np.linalg.norm(component)
def test_initialization():
    """With max_iter=0 the components equal V_init normalized to unit rows."""
    rng = np.random.RandomState(0)
    U_init = rng.randn(5, 3)
    V_init = rng.randn(3, 4)
    est = SparsePCA(n_components=3, U_init=U_init, V_init=V_init, max_iter=0,
                    random_state=rng)
    est.fit(rng.randn(5, 4))
    # Each initial row is rescaled to unit norm by the estimator.
    expected = V_init / np.linalg.norm(V_init, axis=1)[:, None]
    assert_allclose(est.components_, expected)
def test_fit_transform():
    """Coordinate descent must reproduce the lars components on toy data."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    via_lars = SparsePCA(n_components=3, method="lars", alpha=alpha,
                         random_state=0)
    via_lars.fit(Y)

    via_cd = SparsePCA(n_components=3, method="cd", random_state=0,
                       alpha=alpha)
    via_cd.fit(Y)
    assert_array_almost_equal(via_cd.components_, via_lars.components_)
def do_sparse_pca(sparse_matrix):
    """Fit a 6-component SparsePCA on a scipy sparse matrix; return components.

    See http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.SparsePCA.html
    """
    # SparsePCA needs a dense array, so densify via BSR first.
    dense_matrix = sparse_matrix.tobsr().toarray()
    spca = SparsePCA(n_components=6,
                     alpha=0.01,
                     ridge_alpha=0.01,
                     max_iter=1000,
                     tol=1e-08,
                     method='lars',
                     n_jobs=1,
                     U_init=None,
                     V_init=None,
                     verbose=False,
                     random_state=None)
    spca.fit(dense_matrix)
    return spca.components_
def test_initialization(norm_comp):
    """max_iter=0 keeps V_init; normalization only rescales rows if requested."""
    rng = np.random.RandomState(0)
    U_init = rng.randn(5, 3)
    V_init = rng.randn(3, 4)
    est = SparsePCA(n_components=3, U_init=U_init, V_init=V_init, max_iter=0,
                    random_state=rng, normalize_components=norm_comp)
    est.fit(rng.randn(5, 4))
    if norm_comp:
        # Rows are rescaled to unit Euclidean norm.
        assert_allclose(est.components_,
                        V_init / np.linalg.norm(V_init, axis=1)[:, None])
    else:
        assert_allclose(est.components_, V_init)
def spca(data, num_components=None, alpha=1):
    """Sparse PCA over a list of arrays with adjusted explained variance.

    Flattens and stacks the inputs (dropping any containing NaN), centers
    them, fits SparsePCA, normalizes the components, then computes the
    adjusted explained variance via a QR decomposition (Zou, Hastie,
    Tibshirani, "Sparse Principal Component Analysis").  Returns the
    components sorted by explained variance and the variance list.
    """
    # creates a matrix with sparse principal component analysis
    # build matrix with all data
    data = [d.flatten() for d in data if not any(isnan(d))]
    datamatrix = row_stack(data)
    # center data
    cdata = datamatrix - mean(datamatrix, axis=0)
    if num_components is None:
        num_components = cdata.shape[0]
    # do spca on matrix
    spca = SparsePCA(n_components=num_components, alpha=alpha)
    spca.fit(cdata)
    # normalize components (skip zero-norm components to avoid divide-by-zero)
    components = spca.components_.T
    for r in range(0, components.shape[1]):
        compnorm = numpy.apply_along_axis(numpy.linalg.norm, 0, components[:, r])
        if not compnorm == 0:
            components[:, r] /= compnorm
    components = components.T
    # calc adjusted explained variance from "Sparse Principal Component Analysis" by Zou, Hastie, Tibshirani
    spca.components_ = components
    #nuz = spca.transform(cdata).T
    # NOTE(review): solver='dense_cholesky' was removed from modern sklearn
    # (use 'cholesky') — this line pins an old sklearn version; confirm.
    nuz = ridge_regression(spca.components_.T, cdata.T, 0.01, solver='dense_cholesky').T
    #nuz = dot(components, cdata.T)
    # QR of the projections; cumulative variance comes from the R factor.
    q, r = qr(nuz.T)
    cumulative_var = []
    for i in range(1, num_components + 1):
        cumulative_var.append(trace(r[0:i, ] * r[0:i, ]))
    # Per-component adjusted variance = successive differences of sqrt(cumulative).
    explained_var = [math.sqrt(cumulative_var[0])]
    for i in range(1, num_components):
        explained_var.append(
            math.sqrt(cumulative_var[i]) - math.sqrt(cumulative_var[i - 1]))
    # Sort components by decreasing explained variance.
    order = numpy.argsort(explained_var)[::-1]
    components = numpy.take(components, order, axis=0)
    evars = numpy.take(explained_var, order).tolist()
    #evars = numpy.take(explained_var,order)
    #order2 = [0,1,2,4,5,7,12,19]
    #components = numpy.take(components,order2,axis=0)
    #evars = numpy.take(evars,order2).tolist()
    return components, evars
def test_fit_transform():
    """lars and cd solvers must land on near-identical components."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    lars_est = SparsePCA(n_components=3, method='lars', alpha=alpha,
                         random_state=0)
    lars_est.fit(Y)

    cd_est = SparsePCA(n_components=3, method='cd', random_state=0,
                       alpha=alpha)
    cd_est.fit(Y)
    assert_array_almost_equal(cd_est.components_, lars_est.components_)
class SPCAEstimator():
    """Sparse-PCA estimator wrapper that sorts components by projected variance."""

    def __init__(self, n_components, alpha=10.0):
        self.n_components = n_components
        self.whiten = False
        self.alpha = alpha  # higher alpha => sparser components
        #self.transformer = MiniBatchSparsePCA(n_components, alpha=alpha, n_iter=100,
        #    batch_size=max(20, n_components//5), random_state=0, normalize_components=True)
        self.transformer = SparsePCA(
            n_components,
            alpha=alpha,
            ridge_alpha=0.01,
            max_iter=100,
            random_state=0,
            n_jobs=-1,
            normalize_components=True)  # TODO: warm start using PCA result?
        self.batch_support = False  # maybe through memmap and HDD-stored tensor
        # Per-component standard deviations of the projected data.
        self.stdev = np.zeros((n_components, ))
        # Total variance of the training data (for variance ratios).
        self.total_var = 0.0

    def get_param_str(self):
        """Short identifier string encoding the hyperparameters."""
        return "spca_c{}_a{}{}".format(self.n_components, self.alpha,
                                       '_w' if self.whiten else '')

    def fit(self, X):
        """Fit the transformer on X and sort components by projected stdev."""
        self.transformer.fit(X)

        # Save variance for later
        self.total_var = X.var(axis=0).sum()

        # Compute projected standard deviations
        # NB: cannot simply project with dot product!
        self.stdev = self.transformer.transform(X).std(
            axis=0)  # X = (n_samples, n_features)

        # Sort components based on explained variance (descending stdev);
        # the components are reordered IN PLACE on the fitted transformer.
        idx = np.argsort(self.stdev)[::-1]
        self.stdev = self.stdev[idx]
        self.transformer.components_[:] = self.transformer.components_[idx]

        # Check orthogonality: pairwise dot products of components should
        # all be ~0; warn if not.
        dotps = [
            np.dot(*self.transformer.components_[[i, j]])
            for (i, j) in itertools.combinations(range(self.n_components), 2)
        ]
        if not np.allclose(dotps, 0, atol=1e-4):
            print('SPCA components not orghogonal, max dot',
                  np.abs(dotps).max())

    def get_components(self):
        """Return (components, stdevs, variance ratios)."""
        var_ratio = self.stdev**2 / self.total_var
        return self.transformer.components_, self.stdev, var_ratio  # SPCA outputs are normalized
class DimensionalityReducer(object):
    """Standardize-then-SparsePCA pipeline with separate fit/transform steps."""

    def __init__(self):
        self.sc = None
        self.pca = None

    def fitPCA(self, X, nfeats=3):
        """Fit the scaler and a SparsePCA with `nfeats` components on X."""
        self.sc = StandardScaler()
        self.pca = SparsePCA(n_components=nfeats)
        scaled = self.sc.fit_transform(X)
        self.pca.fit(scaled)

    def transformPCA(self, X):
        """Scale X with the fitted scaler and project onto the components."""
        return self.pca.transform(self.sc.transform(X))
def test_fit_transform_parallel():
    """Fitting with n_jobs=2 must reproduce the serial lars projection."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    serial = SparsePCA(n_components=3, method='lars', alpha=alpha,
                       random_state=0)
    serial.fit(Y)
    U1 = serial.transform(Y)

    parallel = SparsePCA(n_components=3, n_jobs=2, method='lars',
                         alpha=alpha, random_state=0).fit(Y)
    U2 = parallel.transform(Y)
    assert_true(not np.all(serial.components_ == 0))
    assert_array_almost_equal(U1, U2)
class SparsePCAImpl:
    """Thin adapter that forwards fit/transform to a wrapped Op estimator."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; y is forwarded only when provided."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate transformation to the wrapped model."""
        return self._wrapped_model.transform(X)
def test_fit_transform_parallel():
    """Two-worker lars fit must match the single-worker projection exactly."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    one_job = SparsePCA(n_components=3, method="lars", alpha=alpha,
                        random_state=0)
    one_job.fit(Y)
    U1 = one_job.transform(Y)

    two_jobs = SparsePCA(
        n_components=3, n_jobs=2, method="lars", alpha=alpha, random_state=0
    ).fit(Y)
    U2 = two_jobs.transform(Y)

    # Components should be non-trivial and the projections identical.
    assert not np.all(one_job.components_ == 0)
    assert_array_almost_equal(U1, U2)
def test_pca_vs_spca():
    """With zero penalties SparsePCA must span the same subspace as PCA."""
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)

    spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2)
    pca = PCA(n_components=2)
    pca.fit(Y)
    spca.fit(Y)

    proj_pca = pca.transform(Z)
    proj_spca = spca.transform(Z)

    # The two bases should agree up to sign: cross products form identity.
    assert_allclose(np.abs(spca.components_.dot(pca.components_.T)),
                    np.eye(2), atol=1e-5)

    # Fix signs before comparing projections elementwise.
    proj_pca *= np.sign(proj_pca[0, :])
    proj_spca *= np.sign(proj_spca[0, :])
    assert_allclose(proj_pca, proj_spca)
def spca(components, train_matrix, test_matrix):
    """Sparse principal component analysis routine.

    Parameters
    ----------
    components : int
        The number of components to be returned.
    train_matrix : array
        The training features.
    test_matrix : array
        The test features.

    Returns
    -------
    new_train : array
        Extracted training features.
    new_test : array
        Extracted test features.
    """
    msg = 'The number of components must be a positive int greater than 0.'
    assert components > 0, msg

    model = SparsePCA(n_components=components).fit(X=train_matrix)
    # Project both splits with the model fitted on training data only.
    return model.transform(train_matrix), model.transform(test_matrix)
def test_pca_vs_spca():
    """Unpenalized, normalized SparsePCA must reproduce ordinary PCA."""
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)

    sparse_est = SparsePCA(alpha=0, ridge_alpha=0, n_components=2,
                           normalize_components=True)
    dense_est = PCA(n_components=2)
    dense_est.fit(Y)
    sparse_est.fit(Y)

    out_pca = dense_est.transform(Z)
    out_spca = sparse_est.transform(Z)

    # Bases agree up to sign flips.
    assert_allclose(np.abs(sparse_est.components_.dot(dense_est.components_.T)),
                    np.eye(2), atol=1e-5)

    # Align signs, then projections must match.
    out_pca *= np.sign(out_pca[0, :])
    out_spca *= np.sign(out_spca[0, :])
    assert_allclose(out_pca, out_spca)
def _fitted_sparse_pca(X, d, unscaled_alpha, **kwargs):
    """Fit a SparsePCA of rank d on X, initialized from a randomized SVD.

    `unscaled_alpha` is rescaled (see below) so it is comparable across
    datasets of different size and variance.  Returns the fitted estimator.
    """
    # this seems to work better than initializing with MiniBatchSparsePCA,
    # svd of cov mat, or basically anything else I tried
    U, _, Vt = randomized_svd(X, n_components=d, random_state=123)
    U = U[:, :d]
    V = Vt.T[:d]

    # SparsePCA (and all the sklearn dictionary learning stuff)
    # internally uses sum of squared errs for each sample, and L1 norm
    # of parameter matrix; to make alpha meaningful across datasets,
    # want to scale by number of examples (so it's effectively using MSE)
    # and divide by L1 norm (which grows linearly with size of parameter
    # matrix / vector); also scale by variance of data for similar reasons
    N, D = X.shape
    alpha = unscaled_alpha * np.var(X - X.mean(axis=0)) * N / D

    verbose = 1
    pca = SparsePCA(
        n_components=d,
        alpha=alpha,
        normalize_components=True,
        method='lars',
        U_init=U,
        V_init=V,
        max_iter=10,
        # Large data-dependent ridge to stabilize the transform step.
        ridge_alpha=max(1, len(X) * X.std() * 10),
        # ridge_alpha=1e8,
        verbose=verbose,
        random_state=123)
    if verbose > 0:
        print("fitting sparse pca...")
    return pca.fit(X)
def spca(data, num_components=None, alpha=1):
    """Sparse PCA with adjusted explained variance (Python 2 variant).

    Same algorithm as the py3 `spca`: stack NaN-free flattened inputs,
    center, fit SparsePCA, normalize components, and compute adjusted
    explained variance via QR (Zou, Hastie, Tibshirani).  Returns the
    variance-sorted components and the variance list.
    """
    # creates a matrix with sparse principal component analysis
    # build matrix with all data
    data = [d.flatten() for d in data if not any(isnan(d))]
    datamatrix = row_stack(data)
    # center data
    cdata = datamatrix - mean(datamatrix, axis=0)
    if num_components is None:
        num_components = cdata.shape[0]
    # do spca on matrix
    spca = SparsePCA(n_components=num_components, alpha=alpha)
    spca.fit(cdata)
    # normalize components (guard against zero-norm columns)
    components = spca.components_.T
    for r in xrange(0,components.shape[1]):
        compnorm = numpy.apply_along_axis(numpy.linalg.norm, 0, components[:,r])
        if not compnorm == 0:
            components[:,r] /= compnorm
    components = components.T
    # calc adjusted explained variance from "Sparse Principal Component Analysis" by Zou, Hastie, Tibshirani
    spca.components_ = components
    #nuz = spca.transform(cdata).T
    # NOTE(review): solver='dense_cholesky' only exists in old sklearn
    # releases (renamed 'cholesky' later) — version-pinned; confirm.
    nuz = ridge_regression(spca.components_.T, cdata.T, 0.01, solver='dense_cholesky').T
    #nuz = dot(components, cdata.T)
    # Cumulative variance from the R factor of a QR decomposition.
    q,r = qr(nuz.T)
    cumulative_var = []
    for i in range(1,num_components+1):
        cumulative_var.append(trace(r[0:i,]*r[0:i,]))
    explained_var = [math.sqrt(cumulative_var[0])]
    for i in range(1,num_components):
        explained_var.append(math.sqrt(cumulative_var[i])-math.sqrt(cumulative_var[i-1]))
    # Sort by decreasing explained variance.
    order = numpy.argsort(explained_var)[::-1]
    components = numpy.take(components,order,axis=0)
    evars = numpy.take(explained_var,order).tolist()
    #evars = numpy.take(explained_var,order)
    #order2 = [0,1,2,4,5,7,12,19]
    #components = numpy.take(components,order2,axis=0)
    #evars = numpy.take(evars,order2).tolist()
    return components, evars
def testSparse(n_components, alpha):
    """Fit SparsePCA on the module-level `data` and scatter-plot the result."""
    model = SparsePCA(n_components=n_components, alpha=alpha)
    projected = model.fit(data).transform(data)
    # Color points by the module-level `labels`.
    plt.scatter(projected[:, 0], projected[:, 1], c=labels,
                cmap='nipy_spectral')
    plt.show()
def test_fit_transform_tall():
    """fit_transform and fit().transform() must agree on a tall array."""
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    one_shot = SparsePCA(n_components=3, method="lars", random_state=rng)
    U1 = one_shot.fit_transform(Y)
    two_step = SparsePCA(n_components=3, method="cd", random_state=rng)
    U2 = two_step.fit(Y).transform(Y)
    assert_array_almost_equal(U1, U2)
def test_fit_transform_tall():
    """lars fit_transform equals cd fit-then-transform on tall data."""
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    via_lars = SparsePCA(n_components=3, method='lars', random_state=rng)
    U_lars = via_lars.fit_transform(Y)
    via_cd = SparsePCA(n_components=3, method='cd', random_state=rng)
    U_cd = via_cd.fit(Y).transform(Y)
    assert_array_almost_equal(U_lars, U_cd)
def _OnClick2(self, event):
    """Toggle the Sparse-PCA switch, run a 2-component SparsePCA on the
    loaded CSV data, plot the clusters, and export the components to Excel.

    BUG FIX: the original ``print(len(np.unique(label)) + "***...")`` added
    an int to a str and raised TypeError; the count is now stringified.
    """
    if self.var2.get() == "Off":
        self.var2.set("On")
    elif self.var2.get() == "On":
        self.var2.set("Off")
    print("Sparse PCA is running...")
    label = pd.read_csv(self.labelVar, header=None)[0].tolist()
    df = pd.read_csv(self.dfLabel, header=None)
    data, label = df, label
    #Standdardize the data
    data = StandardScaler().fit_transform(data)
    # apply PCA
    sparsepca = SparsePCA(n_components=2)
    # get 1st and 2nd components
    sparsepca.fit(data)
    SparseprincipalComponents = sparsepca.fit_transform(data)
    SparseprincipalDf = pd.DataFrame(
        data=SparseprincipalComponents,
        columns=['Component 1', 'Component 2'])
    print("Our principal components are: ")
    print(SparseprincipalComponents)
    X_r1 = SparseprincipalComponents[:, 0]
    X_r2 = SparseprincipalComponents[:, 1]
    unique = np.unique(label)
    # FIX: int + str raised TypeError at runtime; convert to str first.
    print(str(len(np.unique(label))) + "*************************")
    try:
        plt.scatter(X_r1, X_r2, c=label)
    except Exception:
        # Narrowed from a bare except; still a best-effort user warning.
        print(
            "Data matrix does not match label matrix (Select input file and label, remove headers)"
        )
    name = 'Sparse_PCA'  #CHANGE FILENAME HERE *************************************************************************
    #plt.legend(unique, loc=8, ncol=5,fontsize='x-small')
    plt.title(name + " Clusters: " + str(len(unique)))
    # NOTE(review): show() before savefig() may save an empty figure on some
    # backends — kept as-is to preserve existing behavior.
    plt.show()
    plt.savefig(name + ".png")
    plt.clf()
    # save 1st and 2nd components to csv
    SparseprincipalDf.to_excel(
        "Sparse_PCA_Components.xlsx"
    )  #Names of 1st and 2nd components to EXCEL here *************************************************************************
def test_fit_transform_variance():
    """SparsePCA's explained_variance_ matches PCA's on identical components."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    sparse_est = SparsePCA(n_components=3, method='lars', alpha=alpha,
                           random_state=0, variance=True)
    dense_est = PCA(n_components=3, random_state=0)
    dense_est.fit(Y)
    # no need to fit spca for this
    sparse_est.fit(Y)

    # Inject PCA's components so both estimators measure variance of the
    # same basis, then compare the reported explained variances.
    sparse_est.components_ = dense_est.components_
    assert_array_almost_equal(dense_est.explained_variance_,
                              sparse_est.explained_variance_)
class SPCA:
    """SparsePCA wrapper that drops NaN/inf rows before fit/predict and
    re-inserts NaN rows into the prediction output at their original
    positions.  RFE/RFE-CV is not supported for PCA-style models."""

    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        # Remaining args are forwarded verbatim to sklearn's SparsePCA.
        self.model = SparsePCA(*args, **kwargs)

    def fit(self, X, y):
        """Fit on rows of X (paired with y) that contain no NaN/inf values."""
        # Stack X and y so a row is dropped if EITHER side has NaN/inf.
        Z = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        # Map +/- inf to NaN so one mask catches both.
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        X_, y_ = X[~pandas.isna(Z).any(axis=1), :], y[~pandas.isna(Z).any(
            axis=1)]
        if Z.shape[0] != X.shape[0]:
            print(
                'FIT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            # NOTE: y is intentionally unused by the unsupervised fit.
            self.model.fit(X_)

    def predict(self, X):
        """Transform X; rows dropped for NaN/inf come back as NaN rows."""
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        # Keep the mask so outputs can be realigned with the input rows.
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        if Z.shape[0] != X.shape[0]:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            predicted = self.model.transform(X_)
            # Re-inflate to the original row count, NaN where input was bad.
            Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                           fill_value=numpy.nan,
                           dtype=numpy.float64)
            Z[nan_mask, :] = predicted
            return Z
def test_fit_transform():
    """cd matches lars, and transform-time ridge_alpha warns as deprecated."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    fitted_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                            random_state=0)
    fitted_lars.fit(Y)

    fitted_cd = SparsePCA(n_components=3, method='cd', random_state=0,
                          alpha=alpha)
    fitted_cd.fit(Y)
    assert_array_almost_equal(fitted_cd.components_, fitted_lars.components_)

    # Both a concrete value and None must trigger the deprecation warning.
    warning_msg = "The ridge_alpha parameter on transform()"
    for deprecated_value in (0.01, None):
        assert_warns_message(DeprecationWarning, warning_msg,
                             fitted_lars.transform, Y,
                             ridge_alpha=deprecated_value)
def compute_robust_low_rank(data, total_k, range_kprime, d):
    # PCA + sparse PCA
    """For each k' in range_kprime, build a rank-total_k projector from the
    top-k' eigenvectors of `data` plus sparse PCA directions of the residual,
    then score each projector with solveGroth.

    Returns [projection_error, sdp_val, minPi, min_sdp_val].
    """
    Count = 0
    Count += 1
    projection_error = []
    sdp_val = []
    minPi = []
    min_sdp_val = 1000000  # NOTE(review): never updated below; returned as-is.
    for k in range_kprime:
        print("processing k=", k)
        # Top-k eigenpairs (largest algebraic) of the data matrix.
        eigs, eigvecs = lasp.eigsh(data, k=k, which='LA', tol=0.00001)
        Pi = np.matmul(eigvecs, eigvecs.T)
        projected_data = np.matmul(Pi, np.matmul(data, Pi)) 
        if k < total_k:
            # Fill the remaining total_k - k directions with sparse PCA of
            # the (scaled) residual.
            spca = SparsePCA(n_components=total_k - k,
                             random_state=0,
                             alpha=1e-5,
                             normalize_components=True)
            spca.fit(100 * (data - projected_data))
            u = spca.components_
            # Project the sparse directions onto the complement of Pi and
            # re-orthogonalize via an eigendecomposition.
            A = np.matmul(np.eye(d) - Pi, np.matmul(u.T, u))
            B = np.matmul(A, np.eye(d) - Pi)
            eigval, U = lasp.eigsh(B, k=total_k - k, which='LA', tol=0.00001)
            # Keep only directions with (numerically) positive eigenvalues.
            D = 1.0 * np.diag(eigval > 0.00001)
            U = np.matmul(U, D)
            sPi = Pi + np.matmul(U, U.T)
        else:
            sPi = Pi
        projected_data = np.matmul(sPi, np.matmul(data, sPi))
        # Unexplained variance = trace difference under the projector.
        projection_error.append(np.trace(data) - np.trace(projected_data))
        [curr_y, min_val, curr_alpha, avg_y_val] = solveGroth(sPi, d)
        sdp_val.append(min_val)
        minPi.append(sPi)
    return [projection_error, sdp_val, minPi, min_sdp_val]
def spca_run(alpha=1):
    """Fit a 2-component SparsePCA on the module-level `data` and plot it."""
    model = SparsePCA(n_components=2, alpha=alpha)
    projected = model.fit(data).transform(data)
    fig, axs = plt.subplots(1, 1)
    # Color the scatter by the module-level `labels`.
    axs.scatter(projected[:, 0], projected[:, 1], c=labels, cmap='rainbow')
    axs.set_xlabel('PC1')
    axs.set_ylabel('PC2')
    plt.show()
def tu_spca(self, dataname="kong", components_n=1, data=None):
    """Fit SparsePCA on blob test data and render component/error bar charts.

    NOTE(review): the fits below use the generated X regardless of `data`
    (and `if data == None` should be `is None`) — confirm whether `data`
    was ever meant to be used.
    """
    # Test data: 4 Gaussian blobs in 3-D.
    X, y = make_blobs(n_samples=10000,
                      n_features=3,
                      centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                      cluster_std=[0.2, 0.1, 0.2, 0.2],
                      random_state=9)
    if data == None:
        data = X
    message = []
    # Fit the sparse PCA model.
    spca = SparsePCA(n_components=components_n,
                     normalize_components=True,
                     random_state=0)
    spca.fit(X)
    # Persist the transformed data, then chart components and errors.
    value = spca.transform(X)
    save_helper.save_txt_helper(value, dataname)
    components = spca.components_
    error = spca.error_
    page2 = Page()
    # One bar chart per sparse component.
    for j in range(0, components.shape[0]):
        bar1 = Bar("稀疏组建" + str(j))
        bar1.add("", [
            "components_" + str(i) for i in range(0, components.shape[1])
        ], components[j])
        page2.add(bar1)
    message.append("我们仅提供稀疏组建和数据误差供给分析")
    print(error)
    # Chart the per-iteration reconstruction error.
    bar2 = Bar("数据误差分析")
    bar2.add("", ["error" + str(i) for i in range(0, len(error))], error)
    page2.add(bar2)
    save_helper.save_tu_helper(page2, dataname)
    return message
def spca_fn(X):
    """Project X onto its first dense-PCA and first sparse-PCA component.

    Transposes X first if it is not already shaped (7501, 6).
    Returns (pca_projection, sparse_pca_projection).
    """
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from sklearn.decomposition import SparsePCA

    if X.shape != (7501, 6):
        X = np.transpose(X)

    dense = PCA(n_components=1)
    dense_proj = dense.fit(X).transform(X)

    sparse = SparsePCA(n_components=1)
    sparse_proj = sparse.fit(X).transform(X)
    return dense_proj, sparse_proj
def spca_alpha(Y_matrix, max_features, eps = 1e-4):
    """
    Y_matrix = input matrix
    max_features = maximum number of non-zero elements in the first sparse principal component
    eps = convergence parameter
    """
    # Bisect alpha in [0, 1] until the first component has exactly
    # max_features non-zeros, or the interval shrinks below eps.
    low, high = 0, 1
    while True:
        mid = 0.5 * (high + low)
        candidate = SparsePCA(n_components=1, alpha=mid)
        candidate.fit(Y_matrix)
        nnz = sum([entry != 0 for entry in candidate.components_[0]])
        if nnz == max_features:
            return mid
        if high - low < eps:
            # No alpha in the interval achieves the requested sparsity.
            return None
        if nnz < max_features:
            # Too sparse: lower alpha by shrinking the upper bound.
            high = mid
        else:
            low = mid
def WeightsEstimatedFromSparsePCAWithWeightedCovariance(ret_p, n_com=30):
    """Estimate portfolio weights from SparsePCA fitted on a weighted
    covariance matrix of portfolio returns; sign-adjust and L1-normalize
    each component, then order the resulting portfolios by Sharpe ratio."""
    ret_port = ret_p.dropna(how='all', axis=1)
    tf = SparsePCA(n_components=n_com)  # , random_state=0)
    cov_matrix = WeightedCovariance(ret_port)
    tf.fit(cov_matrix)
    # Mind the scale of the inputs here.
    tf.transform(
        ret_port.fillna(0.0)
    )  # .apply(lambda x:x.where(~x.isnull(),x.mean()),axis=0))#,index=date_investing[date_investing<'2019-12'])
    # Flip the sign of each component portfolio according to its mean return,
    # and normalize weights to unit L1 norm.
    weights = pd.DataFrame(tf.components_, columns=cov_matrix.columns).T
    ret_transformed_port = (ret_port.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    for c in ret_transformed_port.columns:
        weights[c] = weights[c] * np.sign(
            ret_transformed_port[c].mean()) / np.abs(weights[c]).sum()
    # Recompute returns with the adjusted weights.
    ret_transformed_port = (ret_port.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    # Select by Sharpe ratio (alternatively could rank by t-value).
    select_port = np.abs(
        PortfolioAnalysis(ret_transformed_port)).T.sort_values(
            by='SR', ascending=False).index
    for p in select_port:
        weights[p] *= np.sign(ret_transformed_port[p].mean())
    return weights[select_port]
def WeightsEstimatedFromSparsePCA(ret_port, n_com=25): tf = SparsePCA(n_components=n_com) # , random_state=0) tf.fit(ret_port.agg(lambda x: x - x.mean()).fillna(0.0)) # 注意量级 tf.transform( ret_port.fillna(0.0) ) # .apply(lambda x:x.where(~x.isnull(),x.mean()),axis=0))#,index=date_investing[date_investing<'2019-12']) # 根据组合的组合的平均收益,调整组合的符号 weights = pd.DataFrame(tf.components_, columns=signal_names.split(',')).T ret_transformed_port = (cov_chara_ret.fillna(0.0) @ weights).replace( 0.0, np.nan) for c in weights.columns: weights[c] = weights[c] * np.sign( ret_transformed_port[c].mean()) / np.abs(weights[c]).sum() ret_transformed_port = (cov_chara_ret.fillna(0.0) @ weights).replace( 0.0, np.nan) # 按t值选,还是按SR选择 select_port = np.abs( PortfolioAnalysis(ret_transformed_port.dropna( how='all', axis=1))).T.sort_values(by='SR', ascending=False).index[:int(n_com * 0.67)] for p in select_port: weights[p] *= np.sign(ret_transformed_port[p].mean()) return weights[select_port]
def main():
    """Build per-account activity sessions from CSV logs, cluster them with
    SparsePCA + KMeans, and plot session clusters and outcome histograms."""
    # Load accounts, node visits, and submissions keyed by account id.
    accounts = csv_to_dict('accounts.csv', 0, cast_evals=[str, read_time, readOutcome], type="account")
    account_nodes = csv_to_dict('nodevisits.csv', 1, cast_evals=[str, str, read_time, str], type="node")
    account_submissions = csv_to_dict('submissions.csv', 1, cast_evals=[str, str, read_time, str, str], type="submission")
    # Merge node visits and submissions per account, sorted chronologically.
    account_visits = account_nodes
    for acc in account_visits:
        account_visits[acc].extend(account_submissions[acc])
        account_visits[acc] = sorted(account_visits[acc], key=lambda k: k['time'])
    session_length(account_visits)
    #Build sessions based on time scale determined from previous code as 15 minutes
    sessions = []
    for acc in account_visits:
        actions = []
        for idx, visit in enumerate(account_visits[acc]):
            if idx == 0:
                actions = {"node": [], "submission": [], "learning_outcome": accounts[acc][0]["learning_outcome"]}
                actions[account_visits[acc][idx]["type"]].append(visit)
            else:
                #Time between visits in minutes
                delta_time = delta_minutes(visit["time"], account_visits[acc][idx-1]["time"])
                #New session, defined as 15 minutes from above
                if delta_time > 15:
                    sessions.append(actions)
                    actions = {"node": [], "submission": [], "learning_outcome": accounts[acc][0]["learning_outcome"]}
                    actions[account_visits[acc][idx]["type"]].append(visit)
                else:
                    actions[account_visits[acc][idx]["type"]].append(visit)
        # Flush the account's final (open) session.
        sessions.append(actions)
    # Attach start/end times from whichever event lists are non-empty.
    for session in sessions:
        if len(session["node"]) > 0 and len(session["submission"]) > 0:
            session["start_time"] = min(session["node"][0]["time"], session["submission"][0]["time"])
            session["end_time"] = max(session["node"][len(session["node"]) -1]["time"], session["submission"][len(session["submission"]) -1]["time"])
        elif len(session["node"]) > 0:
            session["start_time"] = session["node"][0]["time"]
            # NOTE(review): len(session["submission"]) is 0 here, so this
            # index is -1 and picks the LAST node — correct by accident;
            # session["node"][-1] was presumably intended.
            session["end_time"] = session["node"][len(session["submission"]) -1]["time"]
        else:
            session["start_time"] = session["submission"][0]["time"]
            session["end_time"] = session["submission"][len(session["submission"]) -1]["time"]
    #Remove sessions without any time difference or no nodes visited
    sessions = [session for session in sessions if delta_minutes(session["end_time"], session["start_time"]) != 0]
    # Featurize, standardize, and reduce to 2 sparse principal components.
    X = session_properties(sessions)
    X = standardize(X)
    pca = SparsePCA(n_components = 2)
    #Negative one just makes plot easier to look at, PCA is sign insensitive so no real effect
    X_r = -1 * pca.fit(X).transform(X)
    # Cluster the 2-D projection into 4 groups.
    kmeans = cluster.KMeans(n_clusters=4)
    group = kmeans.fit_predict(X_r)
    fig = plt.figure(figsize=(6,6))
    ax = fig.add_subplot(111)
    plt.rc('font', family='serif', size=20)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.scatter(X_r[:,0], X_r[:,1],s=20,marker = 'o', c=group)
    plt.show()
    # Histogram cluster membership per learning outcome (0 = no certificate).
    outcomes = np.asarray([session["learning_outcome"] for session in sessions])
    session_by_outcome = []
    tags = []
    labels = get_labels(X_r, group, 4)
    for result in range(0, 4):
        session_by_outcome.append(group[outcomes == result])
        if result == 0:
            tags.append("No certificate achieved")
        else:
            tags.append("Mastery Level = " + str(result))
    plot_hist(session_by_outcome, x_min = 0, x_max = 4, y_min = 0, y_max = 1, bins = 4, tags = tags, y_label = "Fraction of sessions", labels=labels)
# Demo: compare classical PCA vs sparse PCA on a spiked-covariance model.
# N samples of a P-dimensional Gaussian whose covariance is I + T * v v^T,
# where v is a K-sparse unit vector.
N = 500
P = 10
MU = [0] * P
T = 1  # spike level
K = 2  # sparsity level
# Sparse direction: K non-zero entries (1..K), rest zero, then unit-normalized.
V = list(range(1,K+1)) + [0]*(P-K)
V = V / np.linalg.norm(V)
# NOTE(review): np.matrix is deprecated in modern NumPy; kept as-is.
SIG = np.identity(P) + T * np.matrix(V).transpose() * np.matrix(V)
X = np.matrix(np.random.multivariate_normal(MU,SIG,N))
#####
# using scikit-learn method for Sparse PCA (like an l1-regularized dictionary learning problem)
from sklearn.decomposition import SparsePCA
spca = SparsePCA(n_components=1, alpha=5)
spca.fit(X)
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(X)
print('Classical 1st principal component:', pca.components_)
print('Sparse 1st principal component:', spca.components_)
#####
# TODO: SDP implementation a la El Ghaoui, Bach, D'Aspremont
import cvxopt
# TWO CONSTRAINTS
# trace = 1 (multiply with identity)
# l1 norm <= k (multiply with all 1s matrix)
class SPCA(object):
    """
    Wrapper for sklearn package.  Performs sparse PCA

    SPCA has 5 methods:
       - fit(waveforms)
       update class instance with ICA fit

       - fit_transform()
       do what fit() does, but additionally return the projection onto ICA space

       - inverse_transform(A)
       inverses the decomposition, returns waveforms for an input A, using Z

       - get_basis()
       returns the basis vectors Z^\dagger

       - get_params()
       returns metadata used for fits.
    """

    def __init__(self, num_components=10,
                 catalog_name='unknown',
                 alpha = 0.1,
                 ridge_alpha = 0.01,
                 max_iter = 2000,
                 tol = 1e-9,
                 n_jobs = 1,
                 random_state = None):
        # Store hyperparameters and construct the underlying sklearn estimator.
        self._decomposition  = 'Sparse PCA'
        self._num_components = num_components
        self._catalog_name   = catalog_name
        self._alpha          = alpha
        self._ridge_alpha    = ridge_alpha
        self._n_jobs         = n_jobs
        self._max_iter       = max_iter
        self._tol            = tol
        self._random_state   = random_state

        self._SPCA = SparsePCA(n_components=self._num_components,
                              alpha = self._alpha,
                              ridge_alpha = self._ridge_alpha,
                              n_jobs = self._n_jobs,
                              max_iter = self._max_iter,
                              tol = self._tol,
                              random_state = self._random_state)

    def fit(self,waveforms):
        """Fit the decomposition on `waveforms` and keep a reference to them."""
        # TODO make sure there are more columns than rows (transpose if not)
        # normalize waveforms
        self._waveforms = waveforms
        self._SPCA.fit(self._waveforms)

    def fit_transform(self,waveforms):
        """Fit on `waveforms` and return the projection matrix A."""
        # TODO make sure there are more columns than rows (transpose if not)
        # normalize waveforms
        self._waveforms = waveforms
        self._A = self._SPCA.fit_transform(self._waveforms)
        return self._A

    def inverse_transform(self,A):
        """Map projections A back to waveform space using the fitted basis."""
        # convert basis back to waveforms using fit
        new_waveforms = self._SPCA.inverse_transform(A)
        return new_waveforms

    def get_params(self):
        """Return estimator parameters, renamed for this wrapper's metadata."""
        # TODO know what catalog was used! (include waveform metadata)
        params = self._SPCA.get_params()
        params['num_components'] = params.pop('n_components')
        params['Decompositon'] = self._decomposition
        return params

    def get_basis(self):
        """ Return the SPCA basis vectors (Z^\dagger)"""
        Zt = self._SPCA.components_
        return Zt
def fit(self, dif_df):
    """Fit a SparsePCA factor model on the difference DataFrame.

    Skips the first row of values, records the column (ticker) names used,
    and stores the fitted estimator on self.factorization.
    """
    model = SparsePCA(n_components=self.n_components, alpha=0.03)
    # Drop the first row before fitting.
    values = dif_df.values[1:]
    self.ticker_symbols_used = dif_df.columns.values
    model.fit(values)
    self.factorization = model