def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import sklearn.externals.joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            spca = SparsePCA(n_components=3, n_jobs=2, random_state=0,
                             alpha=alpha).fit(Y)
            U2 = spca.transform(Y)
        finally:
            joblib_par.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        spca = SparsePCA(n_components=3, n_jobs=2, method='lars',
                         alpha=alpha, random_state=0).fit(Y)
        U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                     random_state=0).fit(Y)
    U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_fit_transform_parallel():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method="lars", alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    spca = SparsePCA(
        n_components=3, n_jobs=2, method="lars", alpha=alpha, random_state=0
    ).fit(Y)
    U2 = spca.transform(Y)
    assert not np.all(spca_lars.components_ == 0)
    assert_array_almost_equal(U1, U2)
def test_fit_transform_parallel():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                     random_state=0).fit(Y)
    U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
def pca_svm(filename):
    data = pd.read_csv('archive/' + filename, usecols=['label', 'tweet'])
    vectorizer = TfidfVectorizer()
    vectorized = vectorizer.fit_transform(data['tweet'])
    # use a dense ndarray; np.matrix from .todense() confuses scikit-learn
    vectorized = vectorized.toarray()
    X_tr, X_te, y_tr, y_te = train_test_split(vectorized, data['label'],
                                              test_size=0.2)
    pca = SparsePCA()
    X_tr = pca.fit_transform(X_tr)
    X_te = pca.transform(X_te)
    clf = SVC(kernel='rbf')
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    y_pred_tr = clf.predict(X_tr)
    accuracy = accuracy_score(y_te, y_pred)
    accuracy_train = accuracy_score(y_tr, y_pred_tr)
    plot_confusion_matrix(clf, X_te, y_te)
    plt.show()
# This wrapper class shadows sklearn's SparsePCA, so the estimator is
# imported under an alias; otherwise __init__ would instantiate the
# wrapper itself recursively.
from sklearn.decomposition import SparsePCA as SkSparsePCA


class SparsePCA():
    def __init__(self, cols, n_components):
        self.n_components = n_components
        self.model = SkSparsePCA(n_components=n_components)
        self.columns = cols

    def fit(self, data):
        self.model.fit(data[self.columns])

    def fit_transform(self, data):
        transformed = self.model.fit_transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["spca_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data

    def transform(self, data):
        transformed = self.model.transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["spca_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data
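# A usage sketch for the DataFrame wrapper above (not from the original
# sources): the toy frame and column names below are invented for
# illustration. It shows that the named feature columns are replaced by
# spca_* columns while other columns pass through untouched.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(20, 4), columns=["f1", "f2", "f3", "f4"])
df["keep"] = np.arange(20)  # a column that should survive untouched

reducer = SparsePCA(cols=["f1", "f2", "f3", "f4"], n_components=2)
out = reducer.fit_transform(df)
print(out.columns.tolist())  # ['keep', 'spca_1', 'spca_2']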
def test_scaling_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    spca_lars = SparsePCA(n_components=3, method="lars", alpha=alpha,
                          random_state=rng)
    results_train = spca_lars.fit_transform(Y)
    results_test = spca_lars.transform(Y[:10])
    assert_allclose(results_train[0], results_test[0])
def sparse_pca(self):
    """
    Runs SparsePCA on the view and returns the projected view and the
    sparse principal components.
    """
    model = SparsePCA(n_components=param['components'],
                      alpha=param['sparse_pca_alpha'])
    model.fit(self.view)
    return model.transform(self.view), model.components_
def test_scaling_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=rng, normalize_components=True)
    results_train = spca_lars.fit_transform(Y)
    results_test = spca_lars.transform(Y[:10])
    assert_allclose(results_train[0], results_test[0])
class SPCAEstimator():
    def __init__(self, n_components, alpha=10.0):
        self.n_components = n_components
        self.whiten = False
        self.alpha = alpha  # higher alpha => sparser components
        #self.transformer = MiniBatchSparsePCA(n_components, alpha=alpha, n_iter=100,
        #    batch_size=max(20, n_components//5), random_state=0, normalize_components=True)
        self.transformer = SparsePCA(n_components, alpha=alpha, ridge_alpha=0.01,
                                     max_iter=100, random_state=0, n_jobs=-1,
                                     normalize_components=True)
        # TODO: warm start using PCA result?
        self.batch_support = False  # maybe through memmap and HDD-stored tensor
        self.stdev = np.zeros((n_components,))
        self.total_var = 0.0

    def get_param_str(self):
        return "spca_c{}_a{}{}".format(self.n_components, self.alpha,
                                       '_w' if self.whiten else '')

    def fit(self, X):
        self.transformer.fit(X)

        # Save variance for later
        self.total_var = X.var(axis=0).sum()

        # Compute projected standard deviations
        # NB: cannot simply project with dot product!
        self.stdev = self.transformer.transform(X).std(axis=0)  # X = (n_samples, n_features)

        # Sort components based on explained variance
        idx = np.argsort(self.stdev)[::-1]
        self.stdev = self.stdev[idx]
        self.transformer.components_[:] = self.transformer.components_[idx]

        # Check orthogonality
        dotps = [np.dot(*self.transformer.components_[[i, j]])
                 for (i, j) in itertools.combinations(range(self.n_components), 2)]
        if not np.allclose(dotps, 0, atol=1e-4):
            print('SPCA components not orthogonal, max dot', np.abs(dotps).max())

    def get_components(self):
        var_ratio = self.stdev**2 / self.total_var
        return self.transformer.components_, self.stdev, var_ratio  # SPCA outputs are normalized
class SparsePCAImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
class DimensionalityReducer(object):
    def __init__(self):
        self.sc = None
        self.pca = None

    def fitPCA(self, X, nfeats=3):
        self.sc = StandardScaler()
        self.pca = SparsePCA(n_components=nfeats)
        self.pca.fit(self.sc.fit_transform(X))

    def transformPCA(self, X):
        components = self.pca.transform(self.sc.transform(X))
        return components
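# The same scale-then-SparsePCA pattern expressed as a scikit-learn
# Pipeline: a minimal sketch on synthetic data, assuming a plain
# StandardScaler + SparsePCA chain is an acceptable substitute for the
# hand-rolled class above.
import numpy as np
from sklearn.decomposition import SparsePCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.random.RandomState(0).randn(50, 10)
pipe = make_pipeline(StandardScaler(), SparsePCA(n_components=3, random_state=0))
components = pipe.fit_transform(X)
print(components.shape)  # (50, 3)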
def WeightsEstimatedFromSparsePCA(ret_port, n_com=25):
    tf = SparsePCA(n_components=n_com)  # , random_state=0)
    # Mind the magnitude of the inputs
    tf.fit(ret_port.agg(lambda x: x - x.mean()).fillna(0.0))
    tf.transform(ret_port.fillna(0.0))  # result unused
    # .apply(lambda x: x.where(~x.isnull(), x.mean()), axis=0))  # , index=date_investing[date_investing<'2019-12'])
    # Flip each portfolio's sign according to its mean return
    weights = pd.DataFrame(tf.components_, columns=signal_names.split(',')).T
    ret_transformed_port = (cov_chara_ret.fillna(0.0) @ weights).replace(0.0, np.nan)
    for c in weights.columns:
        weights[c] = weights[c] * np.sign(
            ret_transformed_port[c].mean()) / np.abs(weights[c]).sum()
    ret_transformed_port = (cov_chara_ret.fillna(0.0) @ weights).replace(0.0, np.nan)
    # Select either by t-value or by Sharpe ratio
    select_port = np.abs(
        PortfolioAnalysis(ret_transformed_port.dropna(how='all', axis=1))
    ).T.sort_values(by='SR', ascending=False).index[:int(n_com * 0.67)]
    for p in select_port:
        weights[p] *= np.sign(ret_transformed_port[p].mean())
    return weights[select_port]
def WeightsEstimatedFromSparsePCAWithWeightedCovariance(ret_p, n_com=30):
    ret_port = ret_p.dropna(how='all', axis=1)
    tf = SparsePCA(n_components=n_com)  # , random_state=0)
    cov_matrix = WeightedCovariance(ret_port)
    # Mind the magnitude of the inputs
    tf.fit(cov_matrix)
    tf.transform(ret_port.fillna(0.0))  # result unused
    # .apply(lambda x: x.where(~x.isnull(), x.mean()), axis=0))  # , index=date_investing[date_investing<'2019-12'])
    # Flip each portfolio's sign according to its mean return
    weights = pd.DataFrame(tf.components_, columns=cov_matrix.columns).T
    ret_transformed_port = (ret_port.fillna(0.0) @ weights).replace(0.0, np.nan)
    for c in ret_transformed_port.columns:
        weights[c] = weights[c] * np.sign(
            ret_transformed_port[c].mean()) / np.abs(weights[c]).sum()
    ret_transformed_port = (ret_port.fillna(0.0) @ weights).replace(0.0, np.nan)
    # Select either by t-value or by Sharpe ratio
    select_port = np.abs(
        PortfolioAnalysis(ret_transformed_port)
    ).T.sort_values(by='SR', ascending=False).index
    for p in select_port:
        weights[p] *= np.sign(ret_transformed_port[p].mean())
    return weights[select_port]
def test_pca_vs_spca():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
    spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2)
    pca = PCA(n_components=2)
    pca.fit(Y)
    spca.fit(Y)
    results_test_pca = pca.transform(Z)
    results_test_spca = spca.transform(Z)
    assert_allclose(np.abs(spca.components_.dot(pca.components_.T)),
                    np.eye(2), atol=1e-5)
    results_test_pca *= np.sign(results_test_pca[0, :])
    results_test_spca *= np.sign(results_test_spca[0, :])
    assert_allclose(results_test_pca, results_test_spca)
def test_pca_vs_spca():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
    spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2,
                     normalize_components=True)
    pca = PCA(n_components=2)
    pca.fit(Y)
    spca.fit(Y)
    results_test_pca = pca.transform(Z)
    results_test_spca = spca.transform(Z)
    assert_allclose(np.abs(spca.components_.dot(pca.components_.T)),
                    np.eye(2), atol=1e-5)
    results_test_pca *= np.sign(results_test_pca[0, :])
    results_test_spca *= np.sign(results_test_spca[0, :])
    assert_allclose(results_test_pca, results_test_spca)
class SPCA:
    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = SparsePCA(*args, **kwargs)

    def fit(self, X, y):
        Z = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        X_, y_ = X[~pandas.isna(Z).any(axis=1), :], y[~pandas.isna(Z).any(axis=1)]
        # compare row counts after dropping NaNs (Z always has X's row count)
        if X_.shape[0] != X.shape[0]:
            print('FIT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                  .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            self.model.fit(X_)

    def predict(self, X):
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        # compare row counts after dropping NaNs
        if X_.shape[0] != X.shape[0]:
            print('PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                  .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            predicted = self.model.transform(X_)
            Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                           fill_value=numpy.nan, dtype=numpy.float64)
            Z[nan_mask, :] = predicted
            return Z
def tu_spca(self, dataname="kong", components_n=1, data=None):
    # Synthetic test data
    X, y = make_blobs(n_samples=10000, n_features=3,
                      centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                      cluster_std=[0.2, 0.1, 0.2, 0.2], random_state=9)
    if data is None:  # '== None' fails on arrays; use 'is None'
        data = X
    message = []
    # Fit the model
    spca = SparsePCA(n_components=components_n, normalize_components=True,
                     random_state=0)
    spca.fit(data)
    # Save the transformed data
    value = spca.transform(data)
    save_helper.save_txt_helper(value, dataname)
    components = spca.components_
    error = spca.error_
    page2 = Page()
    # Plotting
    for j in range(0, components.shape[0]):
        bar1 = Bar("Sparse component " + str(j))
        bar1.add("",
                 ["components_" + str(i) for i in range(0, components.shape[1])],
                 components[j])
        page2.add(bar1)
    message.append("Only the sparse components and the data error are provided for analysis")
    print(error)
    bar2 = Bar("Data error analysis")
    bar2.add("", ["error" + str(i) for i in range(0, len(error))], error)
    page2.add(bar2)
    save_helper.save_tu_helper(page2, dataname)
    return message
# print (matrix_KPCA_BF)
# print ("get KPCA mean matrix")
# print (matrix_KPCA_mean)
matrix_KPCA_BF.to_csv(gl.get_value("outputFile") + "_KPCA_BF.txt",
                      sep='\t', header=True, index=True)
matrix_KPCA_mean.to_csv(gl.get_value("outputFile") + "_KPCA_mean.txt",
                        sep='\t', header=True, index=True)

if gl.get_value("SPCA_Flag"):
    spca = SparsePCA(n_components=gl.get_value("SPCA_n_components"))
    spca.fit(wholeData)
    expre_SPCA = spca.transform(expre_data)
    # print ("get SPCA data")
    matrix_SPCA = Methods.get_matrix_dist(
        data=expre_SPCA, lab=lab, clusters=clusters,
        average_number=gl.get_value("SPCA_AvgNum"),
        caculation_number=gl.get_value("SPCA_CalNum"))
    # print ("get SPCA matrix")
    matrix_SPCA_BF = Methods.disMatrix_to_bfMatrix(matrix_SPCA, clusters)
    matrix_SPCA_mean = Methods.disMatrix_to_meanMatrix(matrix_SPCA, clusters)
    # print ("get SPCA BF matrix")
    # print (matrix_SPCA_BF)
    matrix_SPCA_BF.to_csv(gl.get_value("outputFile") + "_SPCA_BF.txt",
                          sep='\t',
def get_cv_accuracy(dpath, site, dtype, description, RESULTPATH,
                    k_tune_params={}, knn_params={}, USE_NCA=False,
                    graphParams={}, nca_train_params={},
                    elastic_net_params={}, USE_PCA=False,
                    USE_BAGGING=False, bagging_params={}):
    """
    Get KNN cross validation accuracy with or without PCA and NCA
    """
    # Get a dict of function params and save
    params_all = locals()
    with open(RESULTPATH + description + 'params_all.pkl', 'wb') as f:
        _pickle.dump(params_all, f)

    #%% =======================================================================
    # Define relevant methods
    #==========================================================================

    def _get_numpc_optim(feats_train, feats_valid,
                         T_train, C_train, T_valid, C_valid):
        """
        Given PCA-transformed training and validation sets, find the
        optimal no of principal components to maximize the Ci
        """
        print("\nFinding optimal number of PC's.")
        print("\n\tnumpc\tCi")
        print("\t--------------")
        cis = []
        numpc_max = np.min([feats_train.shape[1], 200])
        for numpc in range(4, numpc_max, 4):
            feats_train_new = feats_train[:, 0:numpc]
            feats_valid_new = feats_valid[:, 0:numpc]
            # get neighbor indices
            neighbor_idxs = knnmodel._get_neighbor_idxs(feats_valid_new,
                                                        feats_train_new,
                                                        norm=norm)
            # Predict validation set
            _, Ci = knnmodel.predict(neighbor_idxs,
                                     Survival_train=T_train,
                                     Censored_train=C_train,
                                     Survival_test=T_valid,
                                     Censored_test=C_valid,
                                     K=elastic_net_params['K'],
                                     Method=Method)
            cis.append([numpc, Ci])
            print("\t{}\t{}".format(numpc, Ci))
        # now get optimal no of PC's
        cis = np.array(cis)
        numpc_optim = cis[cis[:, 1].argmax(), 0]
        print("\nnumpc_optim = {}".format(round(numpc_optim, 3)))
        return int(numpc_optim)

    #%% =======================================================================
    # Begin main body
    #==========================================================================

    print("\n--------------------------------------")
    print("Getting cv accuracy: {}, {}".format(site, dtype))
    print("--------------------------------------\n")

    print("Loading data.")
    #Data = loadmat(dpath)
    #Features = Data[dtype + '_X']
    #N = Features.shape[0]
    with open(dpath.split('.mat')[0] + '_splitIdxs.pkl', 'rb') as f:
        splitIdxs = _pickle.load(f)

    #
    # result structure
    #
    RESULTPATH_NCA = RESULTPATH + "nca/"
    RESULTPATH_KNN = RESULTPATH + "knn/"
    LOADPATH = None
    os.system('mkdir ' + RESULTPATH_NCA)
    os.system('mkdir ' + RESULTPATH_KNN)

    # Go through outer folds, optimize and get accuracy
    #==========================================================================

    # Instantiate a KNN survival model.
    knnmodel = knn.SurvivalKNN(RESULTPATH_KNN, description=description)

    #
    # initialize
    #
    n_outer_folds = len(splitIdxs['idx_optim'])
    n_folds = len(splitIdxs['fold_cv_test'][0])
    CIs = np.zeros([n_folds, n_outer_folds])

    #
    # iterate through folds
    #
    #outer_fold = 0
    for outer_fold in range(n_outer_folds):
        print("\nOuter fold {} of {}\n".format(outer_fold, n_outer_folds - 1))

        # Note, this is done for each outer loop
        # since they will be modified locally in each outer loop
        print("Loading data ...")
        Data = loadmat(dpath)
        X = Data[dtype + '_X'].copy()
        N = X.shape[0]
        Survival = Data['Survival'].reshape([N, ])
        Censored = Data['Censored'].reshape([N, ])
        Data = None

        # Isolate optimization set (and divide into training and validation)
        optimIdxs = splitIdxs['idx_optim'][outer_fold]

        if (USE_NCA or USE_PCA):
            stoppoint = int(elastic_net_params['VALID_RATIO'] * len(optimIdxs))
            optimIdxs_valid = optimIdxs[0:stoppoint]
            optimIdxs_train = optimIdxs[stoppoint:]
            x_train = X[optimIdxs_train, :]
            x_valid = X[optimIdxs_valid, :]

        #%% ===================================================================
        # Unsupervised dimensionality reduction - PCA
        #======================================================================

        if USE_PCA:
            # Find optimal number of PC's
            pca = PCA()
            x_train = pca.fit_transform(x_train)
            x_valid = pca.transform(x_valid)

            # keep optimal number of PC's
            numpc_optim = _get_numpc_optim(feats_train=x_train,
                                           feats_valid=x_valid,
                                           T_train=Survival[optimIdxs_train],
                                           C_train=Censored[optimIdxs_train],
                                           T_valid=Survival[optimIdxs_valid],
                                           C_valid=Censored[optimIdxs_valid])
            x_train = x_train[:, 0:numpc_optim]
            x_valid = x_valid[:, 0:numpc_optim]

            # Now learn final PC matrix on full optimization set
            print("\nLearning final PCA matrix.")
            pca = PCA(n_components=numpc_optim)
            pca.fit(X[optimIdxs, :])
            X = pca.transform(X)

        #%% ===================================================================
        # Supervised dimensionality reduction - NCA
        #======================================================================

        if USE_NCA:
            # instantiate NCA model
            ncamodel = nca.SurvivalNCA(RESULTPATH_NCA,
                                       description=description,
                                       LOADPATH=LOADPATH)

            #
            # Finding optimal values for ALPHA and LAMBDA (regularization)
            #
            ALPHAS = np.arange(0, 1.1, 0.2)
            LAMBDAS = np.arange(0, 1.1, 0.2)
            cis = []

            for ALPHA in ALPHAS:
                for LAMBDA in LAMBDAS:
                    if ((LAMBDA == 0) and (ALPHA > ALPHAS.min())):
                        continue

                    graphParams['ALPHA'] = ALPHA
                    graphParams['LAMBDA'] = LAMBDA

                    w = ncamodel.train(features=x_train,
                                       survival=Survival[optimIdxs_train],
                                       censored=Censored[optimIdxs_train],
                                       COMPUT_GRAPH_PARAMS=graphParams,
                                       **nca_train_params)
                    W = np.zeros([len(w), len(w)])
                    np.fill_diagonal(W, w)

                    ncamodel.reset_TrainHistory()

                    # transform
                    x_valid_transformed = np.dot(x_valid, W)
                    x_train_transformed = np.dot(x_train, W)

                    # get neighbor indices
                    neighbor_idxs = knnmodel._get_neighbor_idxs(
                        x_valid_transformed, x_train_transformed, norm=norm)

                    # Predict validation set
                    _, Ci = knnmodel.predict(neighbor_idxs,
                                             Survival_train=Survival[optimIdxs_train],
                                             Censored_train=Censored[optimIdxs_train],
                                             Survival_test=Survival[optimIdxs_valid],
                                             Censored_test=Censored[optimIdxs_valid],
                                             K=elastic_net_params['K'],
                                             Method=Method)

                    cis.append([ALPHA, LAMBDA, Ci])

                    print("\n----------------------")
                    print("ALPHA\tLAMBDA\tCi")
                    print("{}\t{}\t{}".format(ALPHA, LAMBDA, round(Ci, 3)))
                    print("----------------------\n")

            cis = np.array(cis)
            optimal = cis[:, 2].argmax()
            ALPHA_OPTIM = cis[optimal, 0]
            LAMBDA_OPTIM = cis[optimal, 1]
            print("\nOptimal Alpha, Lambda = {}, {}".format(ALPHA_OPTIM, LAMBDA_OPTIM))

            #
            # Learn final NCA matrix on optimization set
            #
            print("\nLearning final NCA matrix\n")

            graphParams['ALPHA'] = ALPHA_OPTIM
            graphParams['LAMBDA'] = LAMBDA_OPTIM

            # Learn NCA matrix
            w = ncamodel.train(features=X[optimIdxs, :],
                               survival=Survival[optimIdxs],
                               censored=Censored[optimIdxs],
                               COMPUT_GRAPH_PARAMS=graphParams,
                               **nca_train_params)
            W = np.zeros([len(w), len(w)])
            np.fill_diagonal(W, w)

            # Transform features according to learned nca model
            X = np.dot(X, W)

        #%% ===================================================================
        # Now get accuracy
        #======================================================================

        print("\nGetting accuracy.")
        ci, _ = knnmodel.cv_accuracy(X, Survival, Censored,
                                     splitIdxs, outer_fold=outer_fold,
                                     k_tune_params=k_tune_params,
                                     USE_BAGGING=USE_BAGGING,
                                     bagging_params=bagging_params)

        # record result
        CIs[:, outer_fold] = ci

    #%%
    print("\nAccuracy")
    print("------------------------")
    print("25th percentile = {}".format(np.percentile(CIs, 25)))
    print("50th percentile = {}".format(np.percentile(CIs, 50)))
    print("75th percentile = {}".format(np.percentile(CIs, 75)))

    # Save results
    print("\nSaving final results.")
    with open(RESULTPATH + description + 'testing_Ci.txt', 'wb') as f:
        np.savetxt(f, CIs, fmt='%s', delimiter='\t')
# Sparse PCA
from sklearn.decomposition import SparsePCA

n_components = 27
alpha = 0.0001
random_state = 2018
n_jobs = -1

sparsePCA = SparsePCA(n_components=n_components, alpha=alpha,
                      random_state=random_state, n_jobs=n_jobs)
sparsePCA.fit(X_train.loc[:, :])
X_train_sparsePCA = sparsePCA.transform(X_train)
X_train_sparsePCA = pd.DataFrame(data=X_train_sparsePCA, index=X_train.index)

scatterPlot(X_train_sparsePCA, y_train, "Sparse PCA")

# In[46]:

X_train_sparsePCA_inverse = np.array(X_train_sparsePCA).dot(
    sparsePCA.components_) + np.array(X_train.mean(axis=0))
X_train_sparsePCA_inverse = pd.DataFrame(data=X_train_sparsePCA_inverse,
                                         index=X_train.index)

anomalyScoresSparsePCA = anomalyScores(X_train, X_train_sparsePCA_inverse)
preds = plotResults(y_train, anomalyScoresSparsePCA, True)

# In[47]:
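# Self-contained sketch of the manual "inverse transform" used above:
# SparsePCA exposes no inverse_transform here, so the reconstruction is
# X_reduced.dot(components_) plus the column means. Synthetic data, and the
# result is approximate, since transform() solves a ridge regression rather
# than an exact projection.
import numpy as np
from sklearn.decomposition import SparsePCA

rng = np.random.RandomState(2018)
X = rng.randn(200, 20)
spca = SparsePCA(n_components=5, alpha=0.0001, random_state=2018)
X_red = spca.fit_transform(X)
X_rec = X_red.dot(spca.components_) + X.mean(axis=0)
print(((X - X_rec) ** 2).mean())  # per-element reconstruction error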
# Sparse PCA
from sklearn.decomposition import SparsePCA

n_components = 100
alpha = 0.0001
random_state = 2020
n_jobs = -1

sparsePCA = SparsePCA(n_components=n_components, alpha=alpha,
                      random_state=random_state, n_jobs=n_jobs)
sparsePCA.fit(X_train.loc[:10000, :])  # fit on the rows labeled up to 10000 only
X_train_sparsePCA = sparsePCA.transform(X_train)
X_train_sparsePCA = pd.DataFrame(data=X_train_sparsePCA, index=train_index)

X_validation_sparsePCA = sparsePCA.transform(X_validation)
X_validation_sparsePCA = pd.DataFrame(data=X_validation_sparsePCA,
                                      index=validation_index)

scatterPlot(X_train_sparsePCA, y_train, "Sparse PCA")

# In[ ]:

# Kernel PCA
from sklearn.decomposition import KernelPCA

n_components = 100
kernel = 'rbf'
class SPCA(object):
    def __init__(self, n_components=None, alpha=1, ridge_alpha=0.01,
                 max_iter=1000, tol=1e-8, method='lars', n_jobs=None,
                 U_init=None, V_init=None, verbose=False, random_state=None,
                 normalize_components='deprecated'):
        """
        :param n_components:
        :param alpha:
        :param ridge_alpha:
        :param max_iter:
        :param tol:
        :param method:
        :param n_jobs:
        :param U_init:
        :param V_init:
        :param verbose:
        :param random_state:
        :param normalize_components:
        """
        self.model = SparsePCA(n_components=n_components, alpha=alpha,
                               ridge_alpha=ridge_alpha, max_iter=max_iter,
                               tol=tol, method=method, n_jobs=n_jobs,
                               U_init=U_init, V_init=V_init, verbose=verbose,
                               random_state=random_state,
                               normalize_components=normalize_components)

    def fit(self, x, y):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y=None):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self):
        return self.model.get_params(deep=True)

    def set_params(self, **params):
        return self.model.set_params(**params)

    def get_attributes(self):
        components = self.model.components_
        error = self.model.error_
        n_iter = self.model.n_iter_
        mean = self.model.mean_
        return components, error, n_iter, mean
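# Minimal sketch exercising the wrapper above on synthetic data (not from
# the original sources). Assumes a scikit-learn version (roughly 0.20-0.23)
# that still accepts the normalize_components argument, and relies on the
# fixed transform() returning a value.
import numpy as np

X = np.random.RandomState(0).randn(60, 8)
wrapper = SPCA(n_components=2, random_state=0)
wrapper.fit(X, y=None)
Z = wrapper.transform(X)
components, error, n_iter, mean = wrapper.get_attributes()
print(Z.shape, components.shape)  # (60, 2) (2, 8)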
for index, sentence in enumerate(sentences):
    if labels[index] not in all_jiras:
        all_jiras[labels[index]] = [
            "https://oktainc.atlassian.net/browse/" + str(issues[index]['key'])
        ]
    else:
        all_jiras[labels[index]].append(
            "https://oktainc.atlassian.net/browse/" + str(issues[index]['key']))

unique, counts = np.unique(labels, return_counts=True)
res = dict(zip(unique, counts))
res = sorted(res.items(), key=lambda x: x[1], reverse=True)

pca = SparsePCA(n_components=2).fit(X.toarray())
coords = pca.transform(X.toarray())

label_colors = [
    '#2AB0E9', '#2BAF74', '#D7665E', '#CCCCCC', '#D2CA0D', '#522A64',
    '#A3DB05', '#FC6514'
]
colors = [label_colors[i % len(label_colors)] for i in labels]
plt.scatter(coords[:, 0], coords[:, 1], c=colors)

centroids = clf.cluster_centers_
centroid_coords = pca.transform(centroids)
plt.title("Principal Component Analysis Diagram of Classes")
plt.scatter(centroid_coords[:, 0], centroid_coords[:, 1],
            marker='X', s=200, linewidth=2, c='#444d61')
## get columns which are nonzero in at least one trace
wv_mat = np.zeros((len(wv_coefs), len(wv_coefs[0]["coef"])))
files = []
channels = []
for i, coef in enumerate(wv_coefs):
    wv_mat[i, :] = coef["coef"]
    files.append(coef["file"])
    channels.append(coef["channel"])

wv_mat = wv_mat[:, np.where(wv_mat.any(axis=0))[0]]
wv_df = pd.DataFrame({"fname": files, "channel": channels})
wv_df = pd.merge(wv_df, metadata)

pca_wv = SparsePCA(n_components=4, ridge_alpha=2, alpha=1).fit(wv_mat)
scores = pca_wv.transform(wv_mat)
wv_df = pd.DataFrame({
    "x": scores[:, 0],
    "y": scores[:, 1],
    "z": scores[:, 2],
    "fname": files,
    "channel": channels
})
wv_df = pd.merge(wv_df, metadata)

(ggplot(wv_df) +
 geom_point(aes(x="x", y="y", size="z", color="genotype", shape="target")) +
 scale_size_continuous(range=(0.3, 0.7)) +
 # ylim(-0.7, 0.55) +
 # xlim(-1.5, 2.6) +
    if idx_arr != idx_lopo_cv
]
training_label = [
    arr for idx_arr, arr in enumerate(label_bal)
    if idx_arr != idx_lopo_cv
]

# Concatenate the data
training_data = np.vstack(training_data)
training_label = np.ravel(
    label_binarize(np.hstack(training_label).astype(int), classes=[0, 255]))
print('Create the training set ...')

# Learn the PCA projection
pca = SparsePCA(n_components=sp)
training_data = pca.fit_transform(training_data)
testing_data = pca.transform(testing_data)

# Perform the classification for the current cv and the
# given configuration
crf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
pred_prob = crf.fit(training_data,
                    np.ravel(training_label)).predict_proba(testing_data)

result_cv.append([pred_prob, crf.classes_])

results_sp.append(result_cv)

# Save the information
path_store = '/data/prostate/results/mp-mri-prostate/exp-3/selection-extraction/sparse-pca/mrsi'
if not os.path.exists(path_store):
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
# pca = PCA(n_components=n_comp, random_state=420)
# pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
# pca2_results_test = pca.transform(test)

# sparse PCA
spca = SparsePCA(n_components=n_comp, random_state=420)
spca2_results_train = spca.fit_transform(train.drop(["y"], axis=1))
spca2_results_test = spca.transform(test)

# Kernel PCA
kpca = KernelPCA(n_components=n_comp, random_state=420)
kpca2_results_train = kpca.fit_transform(train.drop(["y"], axis=1))
kpca2_results_test = kpca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)
def transform(xTrain, yTrain, xTest):
    pca = SparsePCA(n_components=2)
    newXTrain = pca.fit_transform(xTrain, yTrain)
    newXTest = pca.transform(xTest)
    return newXTrain, newXTest
def sparse_pca(data, dim=3):
    transformer = SparsePCA(n_components=dim, random_state=0)
    transformer.fit(data)
    result = transformer.transform(data)
    return result
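# Quick illustration of what the alpha penalty buys (a sketch on random
# data, not from the original sources): with a nonzero alpha many of the
# component loadings are driven exactly to zero.
import numpy as np
from sklearn.decomposition import SparsePCA

X = np.random.RandomState(0).randn(100, 30)
model = SparsePCA(n_components=3, alpha=1, random_state=0)
model.fit(X)
sparsity = np.mean(model.components_ == 0)
print("fraction of exactly-zero loadings:", sparsity)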
sns.scatterplot(data=Xn_indexed,
                x="retail_and_recreation_percent_change_from_baseline",
                y="grocery_and_pharmacy_percent_change_from_baseline",
                hue="Rt_binarized")
plt.show()

# 2D projection
sparse_pca = SparsePCA(n_components=2, random_state=0, alpha=2)
X_scaled = minmax_scale(X)
sparse_pca.fit(X=X_scaled)
print(pd.DataFrame(sparse_pca.components_,
                   columns=[_.replace("_percent_change_from_baseline", "")
                            for _ in X.columns]))

X_tf = sparse_pca.transform(X_scaled)
X_tf_Rt = pd.DataFrame(X_tf, columns=["X1", "X2"])
X_tf_Rt["Rt_binarized"] = X["Rt_binarized"]  # column name must match the hue below
sns.scatterplot(data=X_tf_Rt, x="X1", y="X2", hue="Rt_binarized")
plt.show()

ax = fig.add_subplot(111, projection='3d')

svc = LinearSVC(random_state=0, penalty="l1", loss="squared_hinge",
                dual=False, max_iter=10000)
svc.fit(X=X_normed, y=X["Rt_binarized"])
def fit_model(self):
    #for filename in glob.glob(os.path.join(self.data_path, '*MP034_2017-09-11.mat')):
    #for filename in glob.glob(os.path.join(self.data_path, '*.mat')):
    #self.mat_file_lst=[#'natimg2800_M170717_MP034_2017-09-11.mat',#'natimg2800_M160825_MP027_2016-12-14.mat',
    #'natimg2800_M161025_MP030_2017-05-29.mat'#,
    #'natimg2800_M170604_MP031_2017-06-28.mat','natimg2800_M170714_MP032_2017-09-14.mat','natimg2800_M170714_MP032_2017-08-07.mat','natimg2800_M170717_MP033_2017-08-20.mat'
    #]
    for filename in self.mat_file_lst:
        print(filename)
        data = io.loadmat(self.data_path + filename)
        resp = data['stim'][0]['resp'][0]
        spont = data['stim'][0]['spont'][0]
        if self.model == 'EnsemblePursuit':
            X = subtract_spont(spont, resp)
            for lambd_ in self.lambdas:
                neuron_init_dict = {'method': 'top_k_corr',
                                    'parameters': {'n_av_neurons': 100,
                                                   'n_of_neurons': 1,
                                                   'min_assembly_size': 8}}
                print(str(neuron_init_dict['parameters']['n_av_neurons']))
                ep = EnsemblePursuitPyTorch()
                start = time.time()
                U_V, nr_of_neurons, U, V, cost_lst, seed_neurons, ensemble_neuron_lst = \
                    ep.fit_transform(X, lambd_, self.nr_of_components, neuron_init_dict)
                end = time.time()
                tm = end - start
                print('Time', tm)
                #np.save(self.save_path+filename[45:85]+'_n_av_n_'+str(neuron_init_dict['parameters']['n_av_neurons'])+'_'+str(lambd_)+'_'+str(self.nr_of_components)+'_V_ep.npy',V)
                np.save(self.save_path + filename + '_V_ep.npy', V)
                np.save(self.save_path + filename + '_U_ep.npy', U)
                np.save(self.save_path + filename + '_ensemble_pursuit_lst_ep.npy',
                        ensemble_neuron_lst)
                np.save(self.save_path + filename + '_seed_neurons_ep.npy', seed_neurons)
                np.save(self.save_path + filename + '_time_ep.npy', tm)
                #np.save(self.save_path+filename[45:85]+'_n_av_n_'+str(neuron_init_dict['parameters']['n_av_neurons'])+'_'+str(lambd_)+'_'+str(self.nr_of_components)+'_U_ep.npy',U)
                #np.save(self.save_path+filename[45:85]+'_n_av_n_'+str(neuron_init_dict['parameters']['n_av_neurons'])+'_'+str(lambd_)+'_'+str(self.nr_of_components)+'_cost_ep.npy',cost_lst)
                #np.save(self.save_path+filename[45:85]+'_n_av_n_'+str(neuron_init_dict['parameters']['n_av_neurons'])+'_'+str(lambd_)+'_'+str(self.nr_of_components)+'_n_neurons_ep.npy',nr_of_neurons)
                #np.save(self.save_path+filename[45:85]+'_n_av_n_'+str(neuron_init_dict['parameters']['n_av_neurons'])+'_'+str(lambd_)+'_'+str(self.nr_of_components)+'_ensemble_neuron_lst.npy',ensemble_neuron_lst)
                #np.save(self.save_path+filename[45:85]+'_n_av_n_'+str(neuron_init_dict['parameters']['n_av_neurons'])+'_'+str(lambd_)+'_'+str(self.nr_of_components)+'_time_ep.npy',tm)
                #np.save(self.save_path+filename[45:85]+'_n_av_n_'+str(neuron_init_dict['parameters']['n_av_neurons'])+'_'+str(lambd_)+'_'+str(self.nr_of_components)+'_seed_neurons.npy',seed_neurons)
        if self.model == 'SparsePCA':
            X = subtract_spont(spont, resp)
            X = stats.zscore(X)
            print(X.shape)
            for alpha in self.alphas:
                sPCA = SparsePCA(n_components=self.nr_of_components, alpha=alpha,
                                 random_state=7, max_iter=100, n_jobs=-1, verbose=1)
                #X = X.T
                start = time.time()
                model = sPCA.fit(X)
                end = time.time()
                elapsed_time = end - start
                U = model.components_
                print('U', U.shape)
                #errors = model.error_
                V = sPCA.transform(X)
                print('V', V.shape)
                np.save(self.save_path + filename[45:85] + '_' + str(alpha) + '_'
                        + str(self.nr_of_components) + '_U_sPCA.npy', U)
                np.save(self.save_path + filename[45:85] + '_' + str(alpha) + '_'
                        + str(self.nr_of_components) + '_V_sPCA.npy', V)
                np.save(self.save_path + filename[45:85] + '_' + str(alpha) + '_'
                        + str(self.nr_of_components) + '_time_sPCA.npy', elapsed_time)
                #np.save(self.save_path+filename[45:85]+'_'+str(alpha)+'_'+str(self.nr_of_components)+'_errors_sPCA.npy',errors)
        if self.model == 'NMF':
            X = subtract_spont(spont, resp)
            X -= X.min(axis=0)
            for alpha in self.alphas:
                model = NMF(n_components=self.nr_of_components, init='nndsvd',
                            random_state=7, alpha=alpha)
                start = time.time()
                V = model.fit_transform(X)
                end = time.time()
                time_ = end - start
                print(end - start)
                U = model.components_
                np.save(self.save_path + filename[45:85] + '_' + str(alpha) + '_'
                        + str(self.nr_of_components) + '_U_NMF.npy', U)
                np.save(self.save_path + filename[45:85] + '_' + str(alpha) + '_'
                        + str(self.nr_of_components) + '_V_NMF.npy', V)
                np.save(self.save_path + filename[45:85] + '_' + str(alpha) + '_'
                        + str(self.nr_of_components) + '_time_NMF.npy', time_)
        if self.model == 'PCA':
            X = subtract_spont(spont, resp)
            X = stats.zscore(X)
            pca = PCA(n_components=self.nr_of_components)
            start = time.time()
            V = pca.fit_transform(X)
            U = pca.components_
            end = time.time()
            elapsed_time = end - start
            #V = pca.components_
            var = pca.explained_variance_
            np.save(self.save_path + filename[45:85] + '_'
                    + str(self.nr_of_components) + '_V_pca.npy', V)
            np.save(self.save_path + filename[45:85] + '_'
                    + str(self.nr_of_components) + '_time_pca.npy', elapsed_time)
            np.save(self.save_path + filename[45:85] + '_'
                    + str(self.nr_of_components) + '_var_pca.npy', var)
            np.save(self.save_path + filename[45:85] + '_'
                    + str(self.nr_of_components) + '_U_pca.npy', U)
        if self.model == 'LDA':
            X = resp
            X -= X.min(axis=0)
            lda = LatentDirichletAllocation(n_components=self.nr_of_components,
                                            random_state=7)
            start = time.time()
            V = lda.fit_transform(X)
            end = time.time()
            elapsed_time = end - start
            print('time', elapsed_time)
            U = lda.components_
            np.save(self.save_path + filename[45:85] + '_'
                    + str(self.nr_of_components) + '_V_lda.npy', V)
            np.save(self.save_path + filename[45:85] + '_'
                    + str(self.nr_of_components) + '_U_lda.npy', U)
            np.save(self.save_path + filename[45:85] + '_'
                    + str(self.nr_of_components) + '_time_lda.npy', elapsed_time)
def fit_model(self):
    for filename in self.mat_file_lst:
        print(filename)
        data = io.loadmat(self.data_path + filename)
        resp = data['stim'][0]['resp'][0]
        spont = data['stim'][0]['spont'][0]
        if self.model == 'EnsemblePursuit_numpy':
            X = subtract_spont(spont, resp).T
            options_dict = {'seed_neuron_av_nr': 100, 'min_assembly_size': 8}
            ep_np = EnsemblePursuitNumpy(n_ensembles=self.nr_of_components,
                                         lambd=self.lambd_,
                                         options_dict=options_dict)
            start = time.time()
            U, V = ep_np.fit_transform(X)
            end = time.time()
            tm = end - start
            print('Time', tm)
            np.save(self.save_path + filename + '_V_ep_numpy.npy', V)
            np.save(self.save_path + filename + '_U_ep_numpy.npy', U)
            np.save(self.save_path + filename + '_timing_ep_numpy.npy', tm)
        if self.model == 'EnsemblePursuit_pytorch':
            X = subtract_spont(spont, resp).T
            options_dict = {'seed_neuron_av_nr': 100, 'min_assembly_size': 8}
            ep_pt = EnsemblePursuitPyTorch(n_ensembles=self.nr_of_components,
                                           lambd=self.lambd_,
                                           options_dict=options_dict)
            start = time.time()
            U, V = ep_pt.fit_transform(X)
            end = time.time()
            tm = end - start
            print('Time', tm)
            np.save(self.save_path + filename + '_V_ep_pytorch.npy', V)
            np.save(self.save_path + filename + '_U_ep_pytorch.npy', U)
            np.save(self.save_path + filename + '_timing_ep_pytorch.npy', tm)
        if self.model == 'EnsemblePursuit_adaptive':
            X = subtract_spont(spont, resp).T
            options_dict = {'seed_neuron_av_nr': 100, 'min_assembly_size': 8}
            ep_pt = EnsemblePursuitPyTorch(n_ensembles=self.nr_of_components,
                                           lambd=self.lambd_,
                                           options_dict=options_dict)
            start = time.time()
            U, V = ep_pt.fit_transform(X)
            end = time.time()
            tm = end - start
            print('Time', tm)
            np.save(self.save_path + filename + '_V_ep_adaptive.npy', V)
            np.save(self.save_path + filename + '_U_ep_adaptive.npy', U)
        if self.model == 'SparsePCA':
            X = subtract_spont(spont, resp)
            X = zscore(X)
            sPCA = SparsePCA(n_components=self.nr_of_components, random_state=7,
                             max_iter=100, n_jobs=-1, verbose=1)
            start = time.time()
            model = sPCA.fit(X)
            end = time.time()
            elapsed_time = end - start
            U = model.components_
            V = sPCA.transform(X)
            np.save(self.save_path + filename + '_U_sPCA.npy', U)
            np.save(self.save_path + filename + '_V_sPCA.npy', V)
            np.save(self.save_path + filename + '_time_sPCA.npy', elapsed_time)
        if self.model == 'ICA':
            X = subtract_spont(spont, resp)
            X = zscore(X)
            ICA = FastICA(n_components=self.nr_of_components, random_state=7)
            start = time.time()
            V = ICA.fit_transform(X)
            end = time.time()
            elapsed_time = end - start
            U = ICA.components_
            np.save(self.save_path + filename + '_U_ICA.npy', U)
            np.save(self.save_path + filename + '_V_ICA.npy', V)
            np.save(self.save_path + filename + '_time_ICA.npy', elapsed_time)
cov_tmp[cov_EWMA.columns] @ V_tmp[:, w_top5]).loc['t_NW_adjusted']
pd.DataFrame(V_tmp[:, w_top5], index=cov_EWMA.columns).mean(axis=1)

# TODO: EWMA + same-frequency historical data
# Sparse PCA
from sklearn.decomposition import SparsePCA

transformer = SparsePCA(n_components=30)  # , random_state=0)
# TODO: feed the raw matrix (optionally standardized) or the covariance matrix?
# Mind the magnitude of the inputs: neither too small nor too large;
# around 100 works well (not divided by std)
transformer.fit(
    cov_chara_ret.dropna(how='all', axis=0).agg(lambda x: x - x.mean()).fillna(0.0)
    * 10000.0)  # .cov()*1e4)
transformer.transform(
    cov_chara_ret.dropna(how='all', axis=0).fillna(0.0)
)  # .apply(lambda x: x.where(~x.isnull(), x.mean()), axis=0))  # , index=date_investing[date_investing<'2019-12'])

# Flip each portfolio's sign according to its mean return
weights = pd.DataFrame(transformer.components_, columns=signal_names.split(',')).T
weights
ret_transformed_port = (
    cov_chara_ret.fillna(0.0) @ transformer.components_.T).replace(0.0, np.nan)
ret_transformed_port
for c in weights.columns:
    # Flip the weight signs and rescale the weights
    weights[c] = weights[c] * np.sign(
        ret_transformed_port[c].mean()) / np.abs(weights[c]).sum()
ret_transformed_port = (cov_chara_ret.fillna(0.0) @ weights).replace(0.0, np.nan)
                normalize_components=True,
                random_state=1000)
spca.fit(X)

# Show the components
sns.set()
fig, ax = plt.subplots(3, 10, figsize=(22, 8))
for i in range(3):
    for j in range(10):
        ax[i, j].imshow(spca.components_[(3 * j) + i].reshape((8, 8)),
                        cmap='gray')
        ax[i, j].set_xticks([])
        ax[i, j].set_yticks([])
plt.show()

# Transform X[0]
y = spca.transform(X[0].reshape(1, -1)).squeeze()

# Show the absolute magnitudes
fig, ax = plt.subplots(figsize=(22, 10))
ax.bar(np.arange(1, 31, 1), np.abs(y))
ax.set_xticks(np.arange(1, 31, 1))
ax.set_xlabel('Component', fontsize=16)
ax.set_ylabel('Coefficient (absolute values)', fontsize=16)
plt.show()
TPW = np.zeros(100)
FPW = np.zeros(100)
# k ones followed by (Number_of_blocks - k) zeros; insert indices must be ints
nblocks = np.insert(np.ones(k), np.zeros(Number_of_blocks - k, dtype=int), 0)
print(nblocks)

from tqdm import tqdm

for j in tqdm(range(100)):
    true_block = np.random.permutation(nblocks)
    X = rv.rvs(n)
    SPCA = SparsePCA(n_components=Number_of_blocks)
    Xfit = SPCA.fit(X)
    XPCA = SPCA.transform(X)
    signal = 2 * (np.log(p) / n)**.5
    beta = np.zeros(p)
    counter = 0
    for i in range(Number_of_blocks):
        if true_block[i] == 1:
            beta[counter:counter + blocks_size[i]] = c * signal * np.random.dirichlet(
                np.ones(blocks_size[i]), 1)
        counter += blocks_size[i]
    y = np.matmul(X, beta) + np.random.normal(0, 1, n)