class PCAImpl():
    def __init__(self, n_components=None, copy=True, whiten=False,
                 svd_solver='auto', tol=0.0, iterated_power='auto',
                 random_state=None):
        self._hyperparams = {
            'n_components': n_components,
            'copy': copy,
            'whiten': whiten,
            'svd_solver': svd_solver,
            'tol': tol,
            'iterated_power': iterated_power,
            'random_state': random_state}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
def pca_plot(fp_list, clusters):
    # Convert RDKit fingerprints to numpy vectors
    np_fps = []
    for fp in fp_list:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    pca = PCA(n_components=3)
    pca.fit(np_fps)
    np_fps_r = pca.transform(np_fps)
    p1 = figure(x_axis_label="PC1", y_axis_label="PC2",
                title="PCA clustering of PAINS")
    p2 = figure(x_axis_label="PC2", y_axis_label="PC3",
                title="PCA clustering of PAINS")
    color_vector = ["blue", "red", "green", "orange", "pink", "cyan",
                    "magenta", "brown", "purple"]
    print(len(set(clusters)))
    for clust_num in set(clusters):
        print(clust_num)
        # Collect only the points that belong to the current cluster
        local_cluster = []
        for i in range(len(clusters)):
            if clusters[i] == clust_num:
                local_cluster.append(np_fps_r[i])
        print(len(local_cluster))
        local_cluster = numpy.asarray(local_cluster)
        # Plot the current cluster's points in its own colour
        p1.scatter(local_cluster[:, 0], local_cluster[:, 1],
                   color=color_vector[clust_num])
        p2.scatter(local_cluster[:, 1], local_cluster[:, 2],
                   color=color_vector[clust_num])
    return HBox(p1, p2)
def pca(tx, ty, rx, ry):
    # n_components must be an integer, hence the floor division
    compressor = PCA(n_components=tx[1].size // 2)
    compressor.fit(tx, y=ty)
    #eigenvalues = compressor.explained_variance_
    print("PCA")
    # for eigenvalue, eigenvector in zip(eigenvalues, compressor.components_):
    #     print(eigenvalue)
    # variance = compressor.explained_variance_ratio_  # calculate variance ratios
    # var = np.cumsum(np.round(compressor.explained_variance_ratio_, decimals=3)*100)
    # print(var)
    #print(compressor.explained_variance_)
    #print(compressor.explained_variance_ratio_)
    print(compressor.explained_variance_ratio_.cumsum())
    print(compressor.singular_values_)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    #em(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    #km(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    # var = np.cumsum(np.round(compressor.explained_variance_ratio_, decimals=3)*100)
    # print(var)
    # plt.ylabel('% Variance Explained')
    # plt.xlabel('# of Features')
    # plt.title('PCA Analysis')
    # plt.ylim(30, 100.5)
    # plt.style.context('seaborn-whitegrid')
    # plt.plot(var)
    # plt.savefig('PCA.png')
    # plt.show()
    nn(newtx, ty, newrx, ry, add="wPCA")
def main(): print('Reading data file') data = pd.read_csv(path + 'Sentiment Analysis Dataset.csv', usecols=['Sentiment', 'SentimentText'], error_bad_lines=False) print('Preprocess') corpus = data['SentimentText'] vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode', stop_words='english', tokenizer=tokenize) X = vectorizer.fit_transform(corpus.values) y = data['Sentiment'].values print('Train sentiment classification') classifier = MultinomialNB() classifier.fit(X, y) print('Word2Vec') corpus = corpus.map(lambda x: tokenize(x)) word2vec = Word2Vec(corpus.tolist(), size=100, window=4, min_count=10, workers=4) word2vec.init_sims(replace=True) print('Fitting 2 PCA') #word_vectors = [word2vec[word] for word in word2vec.vocab] # pre -1.0.0 word_vectors = [word2vec[word] for word in word2vec.wv.vocab] # in genism 1.0.0+ should use pca = PCA(n_components=2) pca.fit(word_vectors)
def PComponent_(train_Set, test_Set, var_Threshold=None, components=None):
    if var_Threshold is None and components is None:
        print("please give a threshold for PComponent - either var threshold or components")
        quit()
    if var_Threshold is not None and components is not None:
        print("give only one threshold")
        quit()
    if var_Threshold is not None:
        pca = PCA()
        pca.fit(train_Set)
        # variance ratio in percentage
        explain_Variance = around(pca.explained_variance_ratio_, decimals=4)
        explain_Variance = explain_Variance.tolist()
        explain_Variance = [x * 100 for x in explain_Variance]
        # cumulative variance
        temp = 0
        for x in range(len(explain_Variance)):
            explain_Variance[x] = temp + explain_Variance[x]
            temp = explain_Variance[x]
        explain_Variance = [x for x in explain_Variance if x < var_Threshold]
        n_components = len(explain_Variance)
        pca = PCA(n_components=n_components)
        return (pca.fit_transform(train_Set), pca.transform(test_Set))
    else:
        pca = PCA(n_components=components)
        return (pca.fit_transform(train_Set), pca.transform(test_Set))
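# A related shortcut (a minimal sketch, not part of the function above):
# scikit-learn's PCA also accepts a float between 0 and 1 for n_components,
# in which case it keeps the smallest number of components whose cumulative
# explained variance reaches that fraction. The train_Set / test_Set names
# below are placeholders reused from the function signature above.
from sklearn.decomposition import PCA

# Keep enough components to explain at least 95% of the variance
# (note: a fraction, not a percentage threshold as in PComponent_).
pca = PCA(n_components=0.95, svd_solver='full')
train_reduced = pca.fit_transform(train_Set)
test_reduced = pca.transform(test_Set)
print(pca.n_components_, pca.explained_variance_ratio_.sum())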
class PCADecomposition(AbstractPreProcessor):
    pca = None
    no_components = 2

    def fit(self, data, y=None):
        self.pca = PCA(n_components=self.no_components)
        self.pca.fit(data)

    def fit_transform(self, data, y=None):
        self.fit(data, y)
        return self.transform(data, y)

    def transform(self, data, y=None):
        data = self._check_input(data)
        output = self.pca.transform(data)
        output = self._check_output(data, output)
        return output

    def _check_output(self, data, output):
        if isinstance(data, pd.DataFrame):
            columns = ['Component ' + str(x + 1) for x in range(self.no_components)]
            output = pd.DataFrame(data=output, columns=columns, index=data.index)
        return output
def pca(target, control, title, name_one, name_two):
    np_fps = []
    for fp in target + control:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    ys_fit = [1] * len(target) + [0] * len(control)
    names = ["PAINS", "Control"]
    pca = PCA(n_components=3)
    pca.fit(np_fps)
    np_fps_r = pca.transform(np_fps)
    p1 = figure(x_axis_label="PC1", y_axis_label="PC2", title=title)
    p1.scatter(np_fps_r[:len(target), 0], np_fps_r[:len(target), 1],
               color="blue", legend=name_one)
    p1.scatter(np_fps_r[len(target):, 0], np_fps_r[len(target):, 1],
               color="red", legend=name_two)
    p2 = figure(x_axis_label="PC2", y_axis_label="PC3", title=title)
    p2.scatter(np_fps_r[:len(target), 1], np_fps_r[:len(target), 2],
               color="blue", legend=name_one)
    p2.scatter(np_fps_r[len(target):, 1], np_fps_r[len(target):, 2],
               color="red", legend=name_two)
    return HBox(p1, p2)
def init_from_linear_case(self, Y, d_): """ Solve the equation min ||(Y-\hat{Y}) - M(Y-\hat{Y})||2_2 Here we take PCA on Y, which compute the eigen-decomposition on YY^{T} = USU^{T} and M = U_{d_} * U_{d_}^{T}, where U_{d_} are the first d_ eignvectors and b = \hat{y} - M\hat{y} @Parameters: Y: ndarray with shape (d, num_imags * H' * W' * sample_ratio) d_: the number of eigenvectors to remain @Returns: M: d * d_ b = d * 1 """ logger.debug("Init M, b from linear-case...") pca = PCA(n_components=d_) # pca = PCA() # with shape d_, * d pca.fit(Y.transpose()) # d_ * d U = pca.components_ # d * d M = U.transpose().dot(U) mean_Y = np.average(Y, axis=1) mean_Y = mean_Y.reshape(mean_Y.shape[0], 1) b = mean_Y - M.dot(mean_Y) Err = (Y - mean_Y) - M.dot(Y - mean_Y) logger.debug("Linear-case loss:{:.3f}".format(np.linalg.norm(Err))) logger.debug("Linear-case: M.max:{:.2f}, M.min:{:.2f}, b.max:{:.2f}," " b.min:{:.2f}".format(M.max(), M.min(), b.max(), b.min())) return M, U.transpose(), U, b
def pca_prefit(weights, xs):
    """
    Preprocessing step for computing the initial values of a SOM.
    A linear transformation makes the principal components and their
    eigenvalues of the weight vectors match those of the input vectors.
    :param weights: initial weight vectors
    :param xs: input vectors
    :return: preprocessed weight vectors
    """
    n = np.shape(xs)[1]
    pca_w = PCA(n_components=n)
    pca_w.fit(weights)
    pca_x = PCA(n_components=n)
    pca_x.fit(xs)
    mean_w = np.mean(weights, axis=0)
    mean_x = np.mean(xs, axis=0)
    com_w = pca_w.components_
    com_x = pca_x.components_
    var_w = pca_w.explained_variance_
    var_x = pca_x.explained_variance_
    # Guard against zero-variance components before dividing
    var_w[var_w == 0] = np.max(var_w) * 1e-6
    new_w = (weights - mean_w).dot(com_w.T) / np.sqrt(var_w)
    new_w = (new_w * np.sqrt(var_x)).dot(com_x) + mean_x
    return new_w
def dim_redux(): directions = ['left', 'right', 'up', 'down'] df = pandas.read_csv(FILE_RECORD_MOVES, sep='|', header=None) df = df.iloc[:20, :] columns = df.columns.tolist() index_direction = columns[-1] # df = df[columns[:len(columns) // 2] + [index_direction]] x = df[columns[:len(columns) // 2]] y = df[index_direction] # Set 1 column for each direction {0, 1} for direction in directions: df[direction] = df[index_direction].map( lambda s: s == direction and 1 or 0) vectors_to_keep = [] for direction in directions: x_train = x[y == direction] pca = PCA(n_components=2) pca.fit(x_train) eigenval = pca.explained_variance_ratio_ eigenvect = pca.components_ vectors_to_keep.append(eigenvect[0]) if eigenval[1] > 0.1: vectors_to_keep.append(eigenvect[1]) vectors_to_keep = reduce_space_to_base(vectors_to_keep) print("Base :") print(vectors_to_keep)
def main(): print('Reading in data file...') data = pd.read_csv(path + 'Sentiment Analysis Dataset.csv', usecols=['Sentiment', 'SentimentText'], error_bad_lines=False) print('Pre-processing tweet text...') corpus = data['SentimentText'] vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode', stop_words='english', tokenizer=tokenize) X = vectorizer.fit_transform(corpus.values) y = data['Sentiment'].values print('Training sentiment classification model...') classifier = MultinomialNB() classifier.fit(X, y) print('Training word2vec model...') corpus = corpus.map(lambda x: tokenize(x)) word2vec = Word2Vec(corpus.tolist(), size=100, window=4, min_count=10, workers=4) word2vec.init_sims(replace=True) print('Fitting PCA transform...') word_vectors = [word2vec[word] for word in word2vec.vocab] pca = PCA(n_components=2) pca.fit(word_vectors) print('Saving artifacts to disk...') joblib.dump(vectorizer, path + 'vectorizer.pkl') joblib.dump(classifier, path + 'classifier.pkl') joblib.dump(pca, path + 'pca.pkl') word2vec.save(path + 'word2vec.pkl') print('Process complete.')
def pca(tx, ty, rx, ry):
    compressor = PCA(n_components=tx[1].size // 2)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    km(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    nn(newtx, ty, newrx, ry, add="wPCAr")


def pca(tx, ty, rx, ry):
    compressor = PCA(n_components=tx[1].size // 2)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    km(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    nn(newtx, ty, newrx, ry, add="wPCAtr")


def pca(tx, ty, rx, ry):
    print("pca")
    compressor = PCA(n_components=tx[1].size // 2)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wPCAtr")
    km(newtx, ty, newrx, ry, add="wPCAtr")
    nn(newtx, ty, newrx, ry, add="wPCAtr")
    print("pca done")
def PCA佮SVM模型(self, 問題, 答案):
    # 問題 = questions (features), 答案 = answers (labels)
    sample_weight_constant = np.ones(len(問題))
    clf = svm.SVC(C=1)
    pca = PCA(n_components=100)
    # clf = svm.NuSVC()
    print('Training PCA')
    pca.fit(問題)
    print('Training SVM')
    clf.fit(pca.transform(問題), 答案, sample_weight=sample_weight_constant)
    print('Training done')
    return lambda 問: clf.predict(pca.transform(問))
def get_diversity_fom(ndim, data, return_pca=False):
    pca = PCA(n_components=ndim)
    pca.fit(data)
    if return_pca:
        return pca
    vec = pca.explained_variance_ratio_ + 1e-15
    div = (-vec * np.log(vec)).sum(-1) * pca.explained_variance_.sum(-1)
    div /= ndim
    return div
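# A minimal usage sketch for get_diversity_fom above; the random data and
# shapes are illustrative assumptions, not taken from the original project.
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(200, 8))          # 200 samples, 8 features
print(get_diversity_fom(ndim=5, data=data))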
def pca(data, whiten_bool, components):
    # Set PCA parameters
    pca = PCA(n_components=components, whiten=whiten_bool, svd_solver="full")
    # Fit PCA to data
    pca.fit(data)
    np.set_printoptions(suppress=True)
    print("PCA Components Explained Variance Ratio: " +
          str(np.around(pca.explained_variance_ratio_ * 100, 2)))
    # Calculate loading matrix
    loadings_matrix = (pca.components_.T * np.sqrt(pca.explained_variance_)).T
    # Transform data
    data_transformed = pca.transform(data)
    return data_transformed
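# A small usage sketch for the pca() helper above (the random data and names
# are illustrative assumptions). With standardized input, each entry of the
# loading matrix it computes can be read as the correlation between a feature
# and a principal component.
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(42)
raw = rng.normal(size=(100, 4))
scaled = StandardScaler().fit_transform(raw)
reduced = pca(scaled, whiten_bool=False, components=2)
print(reduced.shape)  # (100, 2)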
def caller(tx, ty, rx, ry):
    nums = [4, 8, 12, 16]
    for n in nums:
        print("PCA")
        print(n)
        compressor = PCA(n_components=n)
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        newrx = compressor.transform(rx)
        nnTable(newtx, ty, newrx, ry, alg="PCA")
    for n in nums:
        print("ICA")
        print(n)
        compressor = ICA(n_components=n)
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        newrx = compressor.transform(rx)
        nnTable(newtx, ty, newrx, ry, alg="ICA")
    for n in nums:
        print("RandProj")
        print(n)
        compressor = RandomProjection(n)
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        newrx = compressor.transform(rx)
        nnTable(newtx, ty, newrx, ry, alg="RandProj")  # label the algorithm actually used
    for n in nums:
        print("kbest")
        print(n)
        compressor = best(k=n)
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        newrx = compressor.transform(rx)
        nnTable(newtx, ty, newrx, ry, alg="kbest")  # label the algorithm actually used
def graphCallerNN(tx, ty, rx, ry): n = tx[1].size/2 compressor = PCA(n_components = n) compressor.fit(tx, y=ty) newtx = compressor.transform(tx) newrx = compressor.transform(rx) newtx = oneem(newtx, ty, newrx, ry) myNN(newtx, ty, newrx, ry, "EM-PCA") # nnTable(newtx, ty, newrx, ry, alg="EM-PCA") compressor = ICA(n_components = n) compressor.fit(tx, y=ty) newtx = compressor.transform(tx) newrx = compressor.transform(rx) newtx = oneem(newtx, ty, newrx, ry) nnTable(newtx, ty, newrx, ry, alg="EM-ICA") myNN(newtx, ty, newrx, ry, "EM-Ica") compressor = RandomProjection(n) compressor.fit(tx, y=ty) newtx = compressor.transform(tx) newrx = compressor.transform(rx) newtx = oneem(newtx, ty, newrx, ry) nnTable(newtx, ty, newrx, ry, alg="EM-RP") myNN(newtx, ty, newrx, ry, "EM-RP") compressor = best(k=n) compressor.fit(tx, y=ty) newtx = compressor.transform(tx) newrx = compressor.transform(rx) newtx = oneem(newtx, ty, newrx, ry) nnTable(newtx, ty, newrx, ry, alg="EM-KB") myNN(newtx, ty, newrx, ry, "EM-KB")
def do_train_with_freq(): tf_mix = TrainFiles(train_path=train_path_mix, labels_file=labels_file, test_size=0.) tf_freq = TrainFiles(train_path=train_path_freq, labels_file=labels_file, test_size=0.) X_m, Y_m, _, _ = tf_mix.prepare_inputs() X_f, Y_f, _, _ = tf_freq.prepare_inputs() X = np.c_[X_m, X_f] Y = Y_f X, Xt, Y, Yt = train_test_split(X, Y, test_size=0.1) sl = SKSupervisedLearning(SVC, X, Y, Xt, Yt) sl.fit_standard_scaler() pca = PCA(250) pca.fit(np.r_[sl.X_train_scaled, sl.X_test_scaled]) X_pca = pca.transform(sl.X_train_scaled) X_pca_test = pca.transform(sl.X_test_scaled) #sl.train_params = {'C': 100, 'gamma': 0.0001, 'probability' : True} #print "Start SVM: ", time_now_str() #sl_ll_trn, sl_ll_tst = sl.fit_and_validate() #print "Finish Svm: ", time_now_str() ##construct a dataset for RBM #X_rbm = X[:, 257:] #Xt_rbm = X[:, 257:] #rng = np.random.RandomState(123) #rbm = RBM(X_rbm, n_visible=X_rbm.shape[1], n_hidden=X_rbm.shape[1]/4, numpy_rng=rng) #pretrain_lr = 0.1 #k = 2 #pretraining_epochs = 200 #for epoch in xrange(pretraining_epochs): # rbm.contrastive_divergence(lr=pretrain_lr, k=k) # cost = rbm.get_reconstruction_cross_entropy() # print >> sys.stderr, 'Training epoch %d, cost is ' % epoch, cost trndata, tstdata = createDataSets(X_pca, Y, X_pca_test, Yt) fnn = train(trndata, tstdata, epochs=1000, test_error=0.025, momentum=0.2, weight_decay=0.0001)
def test_no_X_PCA_but_explained_variance():
    with pytest.raises(ValueError,
                       match='If `explained variance` is not None, the '
                             '`X_pca` values should not be `None`.'):
        X, y = iris_data()
        pca = PCA(n_components=2)
        pca.fit(X)
        eigen = pca.explained_variance_
        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=None,
                                   explained_variance=eigen)
def train_pca(pains_fps, num_components=3):
    '''
    Dimensional reduction of fingerprint bit vectors to principal components
    :param pains_fps: list of RDKit fingerprint bit vectors
    :param num_components: number of principal components to keep
    :return: PCA-reduced fingerprint vectors
    '''
    np_fps = []
    for fp in pains_fps:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    pca = PCA(n_components=num_components)
    pca.fit(np_fps)
    fps_reduced = pca.transform(np_fps)
    return fps_reduced
def calc_pca(bnd, npc=None, preaverage=False, use_unbiased=False,
             method='mdp'):
    '''
    Parameters
    ----------
    bnd : BinnedData
      binned data
    npc : int or None, optional
      number of PCs to calculate, defaults to None
    preaverage : bool
      average across repeats?

    Returns
    -------
    score : ndarray
      (npc, nobs)
    weight : ndarray
      (npc, nvar)
    '''
    assert method in ['mdp', 'skl']
    data = format_for_fa(bnd, preaverage=preaverage, use_unbiased=use_unbiased)
    if method == 'mdp':
        pca_node = mdp.nodes.PCANode(output_dim=npc)
        score = pca_node.execute(data)
        weight = pca_node.get_projmatrix()
    elif method == 'skl':
        pca_obj = PCA(n_components=npc)
        score = pca_obj.fit(data).transform(data)
        weight = pca_obj.components_.T
    return score.T, weight.T
def pca_analysis(model, dataset, out):
    pca = PCA(n_components=len(dataset[0]))
    pca.fit(dataset)
    columns = ['W id', 'component vector id', 'dot']
    analysis_result = pd.DataFrame(columns=columns)\
        .astype({'W id': int, 'component vector id': int, 'dot': float})
    for n, (i, j) in enumerate(
            itertools.product(range(len(model.W.T)), range(len(pca.components_)))):
        analysis_result.loc[n] = [
            i, j, np.dot(model.W[:, i], pca.components_[j])
        ]
    analysis_result.to_csv(out.joinpath('pca_analysis.csv'))
    plot_pca_analysis(analysis_result, out)
def do_pca(X, c=3):
    """Do PCA"""
    from sklearn import preprocessing
    from sklearn.decomposition import PCA  # the old sklearn.decomposition.pca path is deprecated

    #do PCA
    #S = standardize_data(X)
    S = pd.DataFrame(preprocessing.scale(X), columns=X.columns)
    pca = PCA(n_components=c)
    pca.fit(S)
    print(pca.explained_variance_ratio_)
    #print pca.components_
    w = pd.DataFrame(pca.components_, columns=S.columns)  #, index=['PC1','PC2']
    #print w.T.max(1).sort_values()
    pX = pca.fit_transform(S)
    pX = pd.DataFrame(pX, index=X.index)
    return pX
def reduction(data, params):
    # Parse parameters: exec() cannot reliably create local variables inside
    # a function in Python 3, so read the expected key from the dict directly
    # (the key name is kept flexible because the original relied on exec).
    n_components = params.get('n_components', params.get('components'))

    # apply PCA
    pca = PCA(n_components=n_components)
    pca.fit(data)
    X = pca.transform(data)
    return X
def airline_pca():
    # Normalize once, then fit and project the same data
    X = normalize(np.array(pca_data))
    pca = PCA(n_components=3)
    pca.fit(X)
    Y = pca.transform(X)
    fig = plt.figure(1, figsize=(8, 6))
    ax = Axes3D(fig, elev=-150, azim=110)
    colordict = {carrier: i for i, carrier in enumerate(major_carriers)}
    pointcolors = [colordict[carrier] for carrier in target_carrier]
    ax.scatter(Y[:, 0], Y[:, 1], Y[:, 2], c=pointcolors)
    ax.set_title("First three PCA directions")
    ax.set_xlabel("1st eigenvector")
    ax.w_xaxis.set_ticklabels([])
    ax.set_ylabel("2nd eigenvector")
    ax.w_yaxis.set_ticklabels([])
    ax.set_zlabel("3rd eigenvector")
    ax.w_zaxis.set_ticklabels([])
def pca_no_labels(target, title="PCA clustering of PAINS", color="blue"):
    np_fps = []
    for fp in target:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    pca = PCA(n_components=3)
    pca.fit(np_fps)
    np_fps_r = pca.transform(np_fps)
    p3 = figure(x_axis_label="PC1", y_axis_label="PC2", title=title)
    p3.scatter(np_fps_r[:, 0], np_fps_r[:, 1], color=color)
    p4 = figure(x_axis_label="PC2", y_axis_label="PC3", title=title)
    p4.scatter(np_fps_r[:, 1], np_fps_r[:, 2], color=color)
    return HBox(p3, p4)
def main(): print('Read Project Sentiment Analysis Dataset from Sentiment140 ...') data = pd.read_csv(path + 'HCI Project-Sentiment Analysis Dataset.csv', usecols=['Sentiment', 'SentimentText'], error_bad_lines=False) print('Victorize Tweets with Sentiment (It takes some time) ...') corpus = data['SentimentText'] vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode', stop_words='english', tokenizer=tokenize) X = vectorizer.fit_transform(corpus.values) y = data['Sentiment'].values print( 'Classify Sentiment Texts using sklearn Naive Bayes Classifier for Multinomial Models ...' ) classifier = MultinomialNB() classifier.fit(X, y) print( 'Produce a Vector Space using Word2Vec Model (It takes some time) ...') corpus = corpus.map(lambda x: tokenize(x)) word2vec = Word2Vec(corpus.tolist(), size=100, window=4, min_count=10, workers=4) word2vec.init_sims(replace=True) print( 'Reduce Word Dimensions using Principal Component Analysis (PCA) ...') word_vectors = [word2vec[word] for word in word2vec.wv.vocab] pca = PCA(n_components=2) pca.fit(word_vectors) print('Save Analyzed Data to Corresponding Files ...') joblib.dump(vectorizer, path + 'vectorizer_data.pkl') joblib.dump(classifier, path + 'classifier_data.pkl') joblib.dump(pca, path + 'pca_data.pkl') word2vec.save(path + 'word2vec_data.pkl') print('Sentiment Analysis for Dataset -> Done')
def plot_embeddings(embeddings, labels, protecteds, plot3d=False, subsample=False, label_names=None, protected_names=None): if protected_names is None: protected_names = ["A0", "A1"] if label_names is None: label_names = ["L0", "L1"] n = embeddings.shape[0] if not subsample: subsample = n inds = np.random.permutation(n)[:subsample] pca = PCA(n_components=3 if plot3d else 2) labels = labels.astype(bool)[inds] protecteds = protecteds.astype(bool)[inds] pca.fit(embeddings) embs = pca.transform(embeddings)[inds, :] fig = plt.figure() if plot3d: ax = fig.add_subplot(111, projection='3d') else: ax = fig.add_subplot(111) for l in [False, True]: # labels for p in [False, True]: # protecteds idxs = np.logical_and(labels == l, protecteds == p) embs_slice = embs[idxs, :] data_vectors = [embs_slice[:, 0], embs_slice[:, 1]] if plot3d: data_vectors.append(embs_slice[:, 2]) color = "b" if p else "r" marker = "o" if l else "x" name = "{} {}".format(protected_names[p], label_names[l]) ax.scatter( *data_vectors, edgecolors=color, marker=marker, facecolors=[color, 'none'][l], # only leave circles unfilled label=name) ax.legend(fontsize="small") plt.show()
def dimensional(tx, ty, rx, ry, add=None): print "pca" for j in range(tx[1].size): i = j + 1 print "===" + str(i) compressor = PCA(n_components = i) t0 = time() compressor.fit(tx, y=ty) newtx = compressor.transform(tx) runtime=time() - t0 V = compressor.components_ print runtime, V.shape, compressor.score(tx) distances = np.linalg.norm(tx-compressor.inverse_transform(newtx)) print distances print "pca done" print "ica" for j in range(tx[1].size): i = j + 1 print "===" + str(i) compressor = ICA(whiten=True) t0 = time() compressor.fit(tx, y=ty) newtx = compressor.transform(tx) runtime=time() - t0 print newtx.shape, runtime distances = np.linalg.norm(tx-compressor.inverse_transform(newtx)) print distances print "ica done" print "RP" for j in range(tx[1].size): i = j + 1 print "===" + str(i) compressor = RandomProjection(n_components=i) t0 = time() compressor.fit(tx, y=ty) newtx = compressor.transform(tx) runtime=time() - t0 shape = newtx.shape print runtime, shape print "RP done" print "K-best" for j in range(tx[1].size): i = j + 1 print "===" + str(i) compressor = best(add, k=i) t0 = time() compressor.fit(tx, y=ty.ravel()) newtx = compressor.transform(tx) runtime=time() - t0 shape = newtx.shape print runtime, shape print "K-best done"
def showDataTable():
    title = "Descriptive statistics"
    df = frame[cols]
    data_dsc = df.describe().transpose()
    # dsc = df.describe()
    pca = PCA(n_components=5)
    pca.fit(df)
    pc = pca.explained_variance_ratio_
    data_corr = df.corr()
    eigenValues, eigenVectors = LA.eig(data_corr)
    idx = eigenValues.argsort()[::-1]
    # print(sorted(eigenValues, key=int, reverse=True))
    print(eigenValues.argsort()[::-1])
    print(eigenValues.argsort())
    eigenValues = pd.DataFrame(eigenValues[idx]).transpose()
    eigenVectors = pd.DataFrame(eigenVectors[:, idx])
    return render_template("showDataTable.html", title=title, data=df,
                           data_dsc=data_dsc, pca=pd.DataFrame(pc).transpose(),
                           data_corr=data_corr, w=eigenValues, v=eigenVectors)
def do_pca(X, c=3):
    """Do PCA"""
    from sklearn import preprocessing
    from sklearn.decomposition import PCA  # the old sklearn.decomposition.pca path is deprecated

    #do PCA
    #S = standardize_data(X)
    #remove non numeric
    X = X._get_numeric_data()
    S = pd.DataFrame(preprocessing.scale(X), columns=X.columns)
    pca = PCA(n_components=c)
    pca.fit(S)
    out = 'explained variance %s' % pca.explained_variance_ratio_
    print(out)
    #print pca.components_
    w = pd.DataFrame(pca.components_, columns=S.columns)
    #print w.T.max(1).sort_values()
    pX = pca.fit_transform(S)
    pX = pd.DataFrame(pX, index=X.index)
    return pX, pca
def dim_redux_tst():
    df = pandas.read_csv(FILE_RECORD_MOVES, sep='|', header=None)
    # df = df.iloc[:10, :]
    columns = df.columns.tolist()
    index_direction = columns[-1]
    x = df[columns[:len(columns) // 2]]
    for direction in ['left', 'right', 'up', 'down']:
        print('\n' + direction)
        x_dir = x[df[index_direction] == direction]
        # y = df[df[index_direction] == direction][index_direction]
        # y = y.map(lambda s: s and 1 or 0)
        pca = PCA()
        pca.fit(x_dir)
        eigenval = pca.explained_variance_ratio_
        for i in range(len(eigenval)):
            val = eigenval[i]
            vect = pca.components_[i]
            print(val, vect)
        yield direction, pca
def pca(X, y, components, max_cluster, num_classes, run_nn=False): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, shuffle=True) pca_compress = PCA(n_components=components, whiten=True) pca_compress.fit(X_train, y=y_train) X_train_new = pca_compress.transform(X_train) X_test_new = pca_compress.transform(X_test) X_original = pca_compress.inverse_transform(X_test_new) loss = ((X_test - X_original)**2).mean() print("Reconstruction Error " + str(loss)) eigenvalues = pca_compress.explained_variance_ print(eigenvalues) if run_nn: mlp_classifier(X_train_new, y_train, 0.3, plot=True, X_test=X_test_new, y_test=y_test) X_new = np.concatenate((X_train_new, X_test_new), axis=0) y = np.concatenate((y_train, y_test), axis=0) kmeans(X_new, y,max_cluster, num_classes, run_nn=run_nn, plot_cluster=True, reduction_algo='PCA') expectation_max(X_new, y, max_cluster, num_classes, run_nn=run_nn, plot_cluster=True, reduction_algo='PCA')
def reduction(data, params):
    # parse parameters
    possible_keys = ['components']
    for item in params:
        if item not in possible_keys:
            ERROR(item)
    # exec() cannot reliably create local variables inside a function in
    # Python 3, so read the expected key from the dict directly.
    n_components = params['components']

    # apply PCA
    pca = PCA(n_components=n_components)
    pca.fit(data)
    X = pca.transform(data)
    return X
class Model(BaseModel):
    '''
    classdocs
    '''

    def __init__(self, n_components):
        '''
        Constructor
        '''
        # Use the requested number of components instead of a hard-coded 5
        self.model = PCA(n_components=n_components)
        self.model_name = 'pca'

    def fit(self, X):
        '''
        Performs a principal component analysis.
        '''
        self.model.fit(X)
        variance = self.model.explained_variance_ratio_
        print(variance)

    def transform(self, X):
        '''
        Transforms the given data with the eigenvalues found in the
        principal component analysis.
        '''
        return self.model.transform(X)

    def save(self, filepath):
        '''
        Persists the trained model to a file.
        '''
        joblib.dump(self.model,
                    create_filename(filepath, '%s.pkl' % self.model_name))

    def load(self, filepath):
        '''
        Loads an already trained model from a file to perform predictions.
        '''
        self.model = joblib.load(
            create_filename(filepath, '%s.pkl' % self.model_name))
def pipeline(align, X_S, X_T, y_S): # Intitlaise objects Norm = Normalizer() Random_Forest = RandomForestClassifier(250, random_state=42) SVM = SVC(random_state=42, gamma='scale') # Change these values depending on the specifics of the data subspace_dimension = 3 # Normalise the data X_S = Norm.fit_transform(X_S) X_T = Norm.fit_transform(X_T) # Create the PCA pca_train = PCA(n_components=subspace_dimension, random_state=42) pca_test = PCA(n_components=subspace_dimension, random_state=42) # Create the PCA components to reduce to subspace P_S = np.transpose(pca_train.fit(X_S).components_) P_T = np.transpose(pca_test.fit(X_T).components_) # In both cases:reduce source to subspace and reduce target to subspace # If using SA then rotate target data into source allignment if align == True: X_S_A = np.matmul(X_S, P_S) X_T_A = np.matmul(X_T, np.matmul(P_T, np.matmul(np.transpose(P_T), P_S))) else: X_S_A = np.matmul(X_S, P_S) X_T_A = np.matmul(X_T, P_S) # Train the classifiers on the source data Random_Forest.fit(X_S_A, y_S) SVM.fit(X_S_A, y_S) # Predict the labels Random_Forest_pred = Random_Forest.predict(X_T_A) SVM_pred = SVM.predict(X_T_A) return (np.array([Random_Forest_pred, SVM_pred]))
def get_center_point(config, points, object_class, past_points=None):
    # if object_class == 1:
    #     half_w = config.float('car_avg_w') / 2
    #     half_h = config.float('car_avg_h') / 2
    #     half_l = config.float('car_avg_l') / 2
    # else:
    #     half_w = config.float('pedestrian_avg_w') / 2
    #     half_h = config.float('pedestrian_avg_h') / 2
    #     half_l = config.float('pedestrian_avg_l') / 2
    points = np.asarray(points)
    x_mean, y_mean, z_mean = np.median(points, axis=0)
    theta_best = 0
    if past_points is not None:
        current = np.median(points, axis=0)
        past = np.median(past_points, axis=0)
        # Only trust the motion direction if the object actually moved
        if np.linalg.norm(current - past, 2) > 0.7:
            dy = current[2] - past[2]
            dx = current[0] - past[0]
            theta_best = np.arctan(dy / dx)
    if theta_best == 0:
        # Fall back to the dominant direction of the point cloud in the x-z plane
        ps = points
        ps = np.column_stack((ps[:, 0], ps[:, 2]))
        from sklearn.decomposition import PCA  # sklearn.decomposition.pca is deprecated
        pca = PCA(n_components=2)
        pca.fit(ps)
        theta_best = np.arctan(pca.components_[0][1] / pca.components_[0][0])
        theta_best = theta_best + np.pi / 2
    return np.array((x_mean, y_mean, z_mean, theta_best))
def plot_similarity_clusters(desc1, desc2, files, plot=None):
    """
    find similar sounds using Affinity Propagation clusters

    :param desc1: first descriptor values
    :param desc2: second descriptor values
    :returns:
      - euclidean_labels: labels of clusters
    """
    if plot == True:
        print((Fore.MAGENTA + "Clustering"))
    else:
        pass

    min_max = preprocessing.scale(np.vstack((desc1, desc2)).T,
                                  with_mean=False, with_std=False)
    pca = PCA(n_components=2, whiten=True)
    y = pca.fit(min_max).transform(min_max)
    euclidean = AffinityPropagation(convergence_iter=1800, affinity='euclidean')
    euclidean_labels = euclidean.fit_predict(y)

    if plot == True:
        time.sleep(5)
        print((Fore.WHITE + "Each number represents the group the sound belongs to "
                            "as an exemplar of other sounds. Group '0' is colored blue, "
                            "group '1' red, and group '2' yellow. Look at the plot to "
                            "see which sounds are exemplars of the others."))
        print(np.vstack((euclidean_labels, files)).T)
        time.sleep(6)
        plt.scatter(y[euclidean_labels == 0, 0], y[euclidean_labels == 0, 1], c='b')
        plt.scatter(y[euclidean_labels == 1, 0], y[euclidean_labels == 1, 1], c='r')
        plt.scatter(y[euclidean_labels == 2, 0], y[euclidean_labels == 2, 1], c='y')
        plt.scatter(y[euclidean_labels == 3, 0], y[euclidean_labels == 3, 1], c='g')
        plt.show()
    else:
        pass

    return euclidean_labels
def calc_pcs_variance_explained(bnd, preaverage=False, use_unbiased=False,
                                method='skl'):
    '''
    Parameters
    ----------
    bnd : BinnedData
      binned data
    preaverage : bool
      average across repeats?
    use_unbiased : bool
      use the unbiased spike rates calculated using Rob Kass's spike rate method
    '''
    assert type(method) == str
    data = format_for_fa(bnd, preaverage=preaverage, use_unbiased=use_unbiased)
    if method == 'skl':
        pca_obj = PCA()
        score = pca_obj.fit(data)
        return pca_obj.explained_variance_ratio_
    else:
        raise ValueError('method %s not implemented' % method)
def pca(tx, ty, rx, ry, dataset): ncomponents = tx[1].size/2 compressor = PCA(n_components = ncomponents) xarr = [] for i in range(0, ncomponents): xarr.append(i+1) compressor.fit(tx, y=ty) arr = compressor.explained_variance_ plt.figure() plt.title('Phishing PCA Explained Variance') plt.rc('legend',**{'fontsize':10}) plt.plot(xarr, arr, '-', label='explained variance') plt.legend() plt.ylabel('explained variance') plt.xlabel('number of components') plt.savefig("phishingPCAVar" + dataset + ".png") compressor = PCA(n_components = tx[1].size/2) compressor.fit(tx, y=ty) newtx = compressor.transform(tx) newrx = compressor.transform(rx) em(newtx, ty, newrx, ry, add="wPCAtr", times=21, dataset=dataset, alg="PCA") em(newtx, ty, newrx, ry, PCA(n_components=2).fit_transform(tx), add="wPCAtr", times=9, dataset=dataset, alg="PCA") nn(newtx, ty, newrx, ry, add="wPCAtr") km(newtx, ty, newrx, ry, add="wPCAtr", times=10) myNN(newtx, ty, newrx, ry, "PCA") km(newtx, ty, newrx, ry, [], add="", times=4, dataset=dataset, alg="PCA") reduced_data = PCA(n_components=2).fit_transform(tx) em(tx, ty, rx, ry, reduced_data, add="", times=4, dataset=dataset, alg="PCA") pca = PCA(n_components=2) pca.fit(tx) result=pd.DataFrame(pca.transform(tx), columns=['PCA%i' % i for i in range(2)]) my_color = pd.Series(ty).astype('category').cat.codes fig = plt.figure() ax = fig.add_subplot(111) ax.scatter(result['PCA0'], result['PCA1'], c=my_color, cmap="Dark2_r", s=60) ax.set_xlabel("PC1") ax.set_ylabel("PC2") ax.set_title("PCA on the phishing data set") plt.show()
from sklearn import datasets
from sklearn.decomposition import PCA  # sklearn.decomposition.pca is a deprecated import path

iris = datasets.load_iris()
pca = PCA(n_components=2)
fit = pca.fit(iris.data)
print(fit.explained_variance_ratio_)
print(fit.components_)
plt.savefig("explained_variance.png", dpi=300, transparent=True) fig = plt.figure() ax = Axes3D(fig) ax.scatter(new_data[p, 0], new_data[p, 1], new_data[p, 2], label="Piano") ax.scatter(new_data[v, 0], new_data[v, 1], new_data[v, 2], label="Violin") #plt.savefig("pca_3d.png",dpi=300,transparent=True) # %% Train an SVM on raw data, plot an ROC curve from sklearn.svm import SVC from sklearn.metrics import roc_curve, auc mdl = SVC(kernel="poly", degree=2, probability=True) mdl.fit(df_seg, df_info["Instrument"]) proba = mdl.predict_proba(df_seg) fpr, tpr, thresholds = roc_curve(df_info["Instrument"], proba[:, 0], pos_label="Piano") roc_auc = auc(fpr, tpr) plt.figure() plt.plot(fpr, tpr) plt.xlabel("FPR") plt.ylabel("TPR") plt.grid(b=True) plt.savefig("ROC_2class.png", dpi=300, transparent=True) # %% Train an SVM on raw data to classify. with split of test and training
import os
from operator import itemgetter

import numpy
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

data_dir = '../../data/'
n_pca_components = 10
eps_range = numpy.arange(0.01, 20, 0.1)
min_samples_range = [2, 3, 5, 10]
allowed_noise_ratio = 0.2

# data
derivatives = numpy.loadtxt(os.path.join(data_dir, 'derivatives.dat'))

# PCA
pca = PCA(n_components=n_pca_components)
pca.fit(derivatives)
X = pca.transform(derivatives)
X = StandardScaler().fit_transform(X)

results = []
for eps in eps_range:
    for minsamp in min_samples_range:
        model = DBSCAN(eps=eps, min_samples=minsamp, algorithm='kd_tree')
        model.fit(X)
        labels = model.labels_
        noise_ratio = float(sum(labels == -1)) / len(labels)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        if noise_ratio <= allowed_noise_ratio:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA                      # sklearn.decomposition.pca is deprecated
from sklearn.model_selection import GridSearchCV           # replaces the removed sklearn.grid_search
from sklearn.svm import SVC
from sklearn.mixture import GaussianMixture as GMM         # GMM was removed from sklearn.mixture
from sklearn.base import BaseEstimator
import matplotlib.pyplot as plt

X_test = pd.read_csv('Data/test.csv', header=None).to_numpy()   # .as_matrix() was removed from pandas
y = pd.read_csv('Data/trainLabels.csv', header=None)[0].to_numpy()
X = pd.read_csv('Data/train.csv', header=None).to_numpy()

pca2 = PCA(n_components=2, whiten=True)
pca2.fit(np.r_[X, X_test])
X_pca = pca2.transform(X)

i0 = np.argwhere(y == 0)[:, 0]
i1 = np.argwhere(y == 1)[:, 0]
X0 = X_pca[i0, :]
X1 = X_pca[i1, :]
plt.plot(X0[:, 0], X0[:, 1], 'ro')
plt.plot(X1[:, 0], X1[:, 1], 'b*')

pca = PCA(whiten=True)
X_all = pca.fit_transform(np.r_[X, X_test])
print(pca.explained_variance_ratio_)


def kde_plot(x):
    from scipy.stats import gaussian_kde
    kde = gaussian_kde(x)
    positions = np.linspace(x.min(), x.max())
plt.title("correlation matrix") plt.savefig("correlation_matrix.png") show() ### center the variables before performing PCA # center to the mean, but DO NOT component wise scale to unit variance # by centering the variables, principal components remain the same, # by standardizing the variables, principal components change X_train = pp.scale(X_train, with_mean=True, with_std=False) X_test = pp.scale(X_test, with_mean=True, with_std=False) ### dimensionality reduction using PCA # since data is uncorrelated and with variance almost equal to 1, # whitening is not necessary pca40 = PCA(n_components=40, whiten=False) pca40.fit(X_train) print(pca40.explained_variance_ratio_) # plot all the principal components with their relative explained variance features = [x for x in range(1,41)] plt.figure(3) # percentage of variance explained by each of the selected components. # The sum of explained variances is equal to 1.0 plt.plot(features, pca40.explained_variance_ratio_, 'g--', marker='o') plt.axis([1, 40, 0, 0.3]) plt.grid(True) plt.xlabel("principal components"), plt.ylabel("variance explained") plt.title("scree plot") plt.savefig("scree_plot.png") # from the scree plot we choose to pick the first 12 principal components
        print(it, inertia)
        if ((old_extra - extra) ** 2).sum() < tol:
            print("finished at iteration %d" % it)
            break
        old_extra = extra.copy()
    return labels


if __name__ == "__main__":
    X, Y = data.libras_movement()
    labels = kernel_k_means(X, k=15)

    # To visualise the data, use PCA
    pca = PCA(n_components=2)
    pca.fit(X)
    Xt = pca.transform(X)

    fig = pl.figure()
    colors = ['#334433', '#6699aa', '#88aaaa', '#aacccc', '#447799',
              '#225533', '#44bbcc', '#88dddd', '#bbeeff', '#0055bb',
              '#220000',
def _calc_factors(data, npc=None):
    pca_obj = PCA(n_components=npc)
    score = pca_obj.fit(data).transform(data)
    # transpose here makes the output match with mdp
    weight = pca_obj.components_.T
    return score.T, weight.T
#!/usr/bin/env python
# encoding: utf-8
'''
John Doe

PCA, simple stuff, simple plot
'''
from data import Hallem
import numpy as np
from sklearn.decomposition import PCA  # sklearn.decomposition.pca is deprecated
import pylab as pl

hallem = Hallem()
matrix = np.transpose(hallem.response)
print(matrix.shape)

x = PCA()
x.fit(matrix)

fig = pl.figure()
ax = fig.add_subplot(111)
a = x.explained_variance_ratio_
b = x.explained_variance_
print(len(a))
ax.plot(range(len(a)), a)
ax.plot(np.cumsum(a))
ax.grid()
pl.show()
""" http://stats.stackexchange.com/questions/82050/principal-component-analysis- \and-regression-in-python """ import pandas as pd from sklearn.decomposition.pca import PCA source = pd.read_csv('../files/multicollinearity.csv') frame = pd.DataFrame(source) cols = [col for col in frame.columns if col not in ['response']] frame2 = frame[cols] pca = PCA(n_components=5) pca.fit(frame2) # The amount of variance that each PC explains? print pca.explained_variance_ratio_ # What are these? Eigenvectors? print pca.components_ # Are these the eigenvalues? print pca.explained_variance_ # it looks like sklearn won't operate directly on a pandas dataframe. # Let's say that I convert it to a numpy array: npa = frame2.values npa
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA  # package for principal component analysis
from sklearn import svm
import csv

X_train = pd.read_csv('train.csv', header=None).to_numpy()   # .as_matrix() was removed from pandas
X_test = pd.read_csv('test.csv', header=None).to_numpy()
trainLabels = np.loadtxt(open('trainLabels.csv', 'rb'), delimiter=',', skiprows=0)

pca = PCA(n_components=12, whiten=True)
#pca.fit(np.r_[X_train, X_test], trainLabels)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

clf = svm.SVC(C=3, gamma=0.6)
clf.fit(X_train_pca, trainLabels)
predictions = clf.predict(X_test_pca)

# csv files are written in text mode ('w', newline='') in Python 3
with open('svm_model_submission.csv', 'w', newline='') as prediction_file:
    writer = csv.writer(prediction_file, delimiter=',')
    writer.writerow(['Id', 'Solution'])
    for i in range(0, len(predictions)):
        writer.writerow([i + 1, int(predictions[i])])
import numpy as np
from sklearn import tree
from sklearn.decomposition import PCA  # sklearn.decomposition.pca is deprecated

import mnist_loader as loader
import mnist_writer as writer

print('Reading data...')
train_data, train_labels = loader.load_train_data()
test_data = loader.load_test_data()

# convert to numpy arrays
train_data = np.array(train_data)
train_labels = np.array(train_labels)
test_data = np.array(test_data)

print('PCA analysis...')
pca = PCA(n_components=35, whiten=True)
pca.fit(train_data)
train_data = pca.transform(train_data)
test_data = pca.transform(test_data)

print('Fitting decision tree...')
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_labels)

print('Making predictions...')
predict = clf.predict(test_data)

print('Writing results...')
writer.write_predictions(predict, '/Users/clint/Development/data/mnist/predict_tree.csv')
# plot the first 10 normalized faces image_grid(arr_norm[:10,:],H,W) # # Principal Component Analysis # In[9]: from sklearn.decomposition.pca import PCA # In[10]: pca = PCA() pca.fit(arr_norm) # ## Scree Plot # In[11]: # Let's make a scree plot pve = pca.explained_variance_ratio_ pve.shape plt.plot(range(len(pve)), pve) plt.title("Scree Plot") plt.ylabel("Proportion of Variance Explained") plt.xlabel("Principal Component Number")
lbls = np.array(dgts_lbl)
dt = dgts_data.T

# remove mean values from each row
mn = np.mean(dt, axis=0).reshape(1, dt.shape[1])
print(dt.shape)
print(mn.shape)

# now subtract the mean
dt = dt - mn
sigma = np.dot(dt, dt.T) / dt.shape[0]
print(sigma)

u, s, v = linalg.svd(sigma)
dt_rot = np.dot(u.T, dt)
sigma1 = np.cov(dt_rot)

pc = PCA()
pc.fit(dt)
ab = pc.transform(dt)
print(ab)
print(sigma1)

abc = np.divide(s, np.sqrt(s + 0.000001))
pcawhite = np.dot(abc, np.dot(u.T, dt))
print(pcawhite)
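# For comparison, a short sketch (an addition, not from the original script)
# of letting scikit-learn do the whitening via PCA(whiten=True): each
# transformed component then has approximately unit variance. It reuses the
# mean-centred dt matrix from above.
from sklearn.decomposition import PCA
import numpy as np

pca_w = PCA(whiten=True)
dt_white = pca_w.fit_transform(dt)
print(np.var(dt_white, axis=0, ddof=1))  # ~1 for every component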
r('test3.hex <- h2o.importFile(h2oServer, path = TEST, header = F, sep = ",", destination_frame="test3.hex")') ## Generate predictions r('predictions_dl <- h2o.predict(dlmodel, test3.hex)') r('head(predictions_dl)') ## new predictions pred = r('as.matrix(predictions_dl)') return var(pred -test) ################################################################ figure() variances_table = [] for i in range(2,11,1): pca = PCA(n_components=i) der = derivatives[train_mask_TL] pca.fit(der) X = pca.transform(derivatives[test_mask]) pred_pca_temp = (pca.inverse_transform(X)) # var_fraction_pca_TL = var(pred_pca_temp-derivatives[test_mask])/var(derivatives[test_mask]) #plot([i], [var(pred_pca_temp-derivatives[test_mask])],'D') var_fraction_DL_TL = DL( derivatives[train_mask_TL], derivatives[test_mask], i)/var(derivatives[test_mask]) #plot([i], [var_DL_TL ],'Dk') pca = PCA(n_components=i) der = derivatives[train_mask_no_TL] pca.fit(der) X = pca.transform(derivatives[test_mask]) pred_pca_temp = (pca.inverse_transform(X))