def test_kernel_pca_consistent_transform():
    # X_fit_ needs to retain the old, unmodified copy of X
    state = np.random.RandomState(0)
    X = state.rand(10, 10)
    kpca = KernelPCA(random_state=state).fit(X)
    transformed1 = kpca.transform(X)

    X_copy = X.copy()
    X[:, 0] = 666
    transformed2 = kpca.transform(X_copy)
    assert_array_almost_equal(transformed1, transformed2)
def main():
    # set the timer
    start = time.time()

    # load the data
    trainX = np.load('trainX.npy')
    testX = np.load('testX.npy')
    trainY = np.load('trainY.npy')
    testY = np.load('testY.npy')
    print('\n!!! Data Loading Completed !!!\n')

    # get the 1st digit zero and plot it
    zero = trainX[14].reshape(28, 28)
    plt.imshow(zero, cmap=cm.Greys_r)
    plt.savefig("original" + str(trainY[14]) + ".png")
    # plt.show()

    # apply kpca
    kpca = KernelPCA(kernel='rbf', gamma=1, fit_inverse_transform=True)
    kpca.fit(trainX[0:3000])
    trainX_kpca = kpca.transform(trainX)
    testX_kpca = kpca.transform(testX)

    # do inverse transform and plot the result
    orig = kpca.inverse_transform(trainX_kpca)
    img = orig[14].reshape(28, 28)
    plt.imshow(img, cmap=cm.Greys_r)
    plt.savefig("reconstructed" + str(trainY[14]) + ".png")
    # plt.show()

    # keep the most discriminative 5% of the kPCA features
    selector = SelectPercentile(f_classif, percentile=5)
    selector.fit(trainX_kpca, trainY)
    trainX = selector.transform(trainX_kpca)
    testX = selector.transform(testX_kpca)

    # fit a classifier
    parameters = {'n_neighbors': list(np.arange(15) + 1)}
    clf = GridSearchCV(KNeighborsClassifier(weights='distance', n_jobs=-1), parameters)
    clf.fit(trainX, trainY)
    pred = clf.predict(testX)

    print(accuracy_score(testY, pred))
    print(confusion_matrix(testY, pred))
    # print(clf.best_params_)
    print('total : %d, correct : %d, incorrect : %d\n'
          % (len(pred), np.sum(pred == testY), np.sum(pred != testY)))
    print('Test Time : %f Minutes\n' % ((time.time() - start) / 60))
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert_equal(kwargs, {})    # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly", histogram):
            # histogram kernel produces singular matrix inside linalg.solve
            # XXX use a least-squares approximation?
            inv = not callable(kernel)

            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=inv)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert_not_equal(X_fit_transformed.size, 0)

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            if inv:
                X_pred2 = kpca.inverse_transform(X_pred_transformed)
                assert_equal(X_pred2.shape, X_pred.shape)
class RegionSplitter_PCA_KMean():

    def __init__(self, data, label):
        data_dim_num = len(data[0])
        label_dim_num = len(label[0])

        self.n_comp = max(1, data_dim_num)
        self.pca = PCA(n_components=self.n_comp)
        data = self.pca.fit_transform(data)
        data_zipped = list(zip(*data))

        # k-means cluster for the dimension
        self.clusterer = KMeans(n_clusters=2, init='k-means++')
        self.clusterer.fit(list(zip(*data_zipped)))

    def classify(self, data):
        if not isinstance(data, tuple):
            raise TypeError("data must be a tuple")
        # wrap the single sample so transform/predict receive a 2-D array
        data = tuple(self.pca.transform([data])[0])
        group = self.clusterer.predict([data])[0]
        return group == 0
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=True)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert_not_equal(X_fit_transformed, [])

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            X_pred2 = kpca.inverse_transform(X_pred_transformed)
            assert_equal(X_pred2.shape, X_pred.shape)
def test_compare_clinical_kernel(self):
    x_full, y, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1',
                                     standardize_numeric=False, to_numeric=False)

    trans = ClinicalKernelTransform()
    trans.fit(x_full)

    x = encode_categorical(standardize(x_full))

    kpca = KernelPCA(kernel=trans.pairwise_kernel)
    xt = kpca.fit_transform(x)

    nrsvm = FastSurvivalSVM(optimizer='rbtree', tol=1e-8, max_iter=1000, random_state=0)
    nrsvm.fit(xt, y)

    rsvm = FastKernelSurvivalSVM(optimizer='rbtree', kernel=trans.pairwise_kernel,
                                 tol=1e-8, max_iter=1000, random_state=0)
    rsvm.fit(x, y)

    pred_nrsvm = nrsvm.predict(kpca.transform(x))
    pred_rsvm = rsvm.predict(x)

    self.assertEqual(len(pred_nrsvm), len(pred_rsvm))

    c1 = concordance_index_censored(y['fstat'], y['lenfol'], pred_nrsvm)
    c2 = concordance_index_censored(y['fstat'], y['lenfol'], pred_rsvm)

    self.assertAlmostEqual(c1[0], c2[0])
    self.assertTupleEqual(c1[1:], c2[1:])
def main():
    # set the timer
    start = time.time()

    # load the data
    mnist = fetch_mldata('MNIST original')
    mnist.target = mnist.target.astype(np.int32)

    seed = np.random.randint(1, 30000)
    rand = np.random.RandomState(seed)
    items = len(mnist.target)
    indices = rand.randint(items, size=70000)
    trindex = indices[0:30000]
    tsindex = indices[30000:]

    # scale down features to the range [0, 1]
    mnist.data = mnist.data / 255.0
    mnist.data = mnist.data.astype(np.float32)

    trainX = mnist.data[trindex]
    testX = mnist.data[tsindex]
    trainY = mnist.target[trindex]
    testY = mnist.target[tsindex]

    # extract the features using KPCA
    kpca = KernelPCA(kernel='precomputed')
    kpca_train = arc_cosine(trainX[0:1000], trainX[0:1000])

    # fit the model from data in X
    kpca.fit(kpca_train)

    kernel_train = arc_cosine(trainX, trainX[0:1000])
    kernel_test = arc_cosine(testX, trainX[0:1000])

    trainX_kpca = kpca.transform(kernel_train)
    testX_kpca = kpca.transform(kernel_test)
    print(testX_kpca.shape)

    # fit the svm model and compute accuracy measure
    clf = svm.SVC(kernel=arc_cosine)
    clf.fit(trainX_kpca, trainY)

    pred = clf.predict(testX_kpca)
    print(accuracy_score(testY, pred))
    print('total : %d, correct : %d, incorrect : %d\n'
          % (len(pred), np.sum(pred == testY), np.sum(pred != testY)))
    print('Test Time : %f Minutes\n' % ((time.time() - start) / 60))
def train_kmeans_on_pca(train_data, train_labels, test_data, test_labels, n_components=2):
    from sklearn.decomposition import KernelPCA
    from sklearn.cluster import KMeans

    pca = KernelPCA(n_components=n_components).fit(train_data)
    transformed_train_data = pca.transform(train_data)
    transformed_test_data = pca.transform(test_data)

    kmeans = KMeans(n_clusters=2, random_state=0).fit(transformed_train_data)
    kmeans_pred = kmeans.predict(transformed_test_data)
    kmeans_pred[kmeans_pred == 0] = -1

    acc = accuracy_score(test_labels, kmeans_pred)
    return pca, kmeans, max(acc, 1 - acc)
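# A minimal usage sketch for train_kmeans_on_pca (assumptions: labels are coded as
# -1/+1 to match the cluster relabelling above, the data below is synthetic, and
# accuracy_score is importable in the enclosing module).
import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(-2, 1, (50, 5)), rng.normal(2, 1, (50, 5))])
y = np.array([-1] * 50 + [1] * 50)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

pca, kmeans, acc = train_kmeans_on_pca(X_tr, y_tr, X_te, y_te)
print(acc)  # max(acc, 1 - acc) compensates for the arbitrary cluster numbering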
def doKernelPCA(q, components=40):
    global data

    # load test query
    loadFile('test', q)

    # fit model
    kpca = KernelPCA(components, kernel="rbf")
    kpca.fit(data)

    # transform and print test query
    data = kpca.transform(data)
    printFile('test{}'.format(q))

    for kind in ['train', 'vali']:
        loadFile(kind)
        data = kpca.transform(data)
        printFile(kind + str(q))
def generate_kpca_compression(X, n_components=16):
    """
    Compresses the data using sklearn KernelPCA implementation.

    :param X: Data (n_samples, n_features)
    :param n_components: Number of dimensions for PCA to keep
    :return: X_prime (the compressed representation), pca
    """
    kpca = KernelPCA(n_components=n_components, kernel='rbf',
                     eigen_solver='arpack', fit_inverse_transform=False)
    kpca.fit(X)
    return kpca.transform(X), kpca
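# A minimal usage sketch for the helper above (assumption: the random data is only
# illustrative; the returned estimator can also project new samples later).
import numpy as np

X = np.random.RandomState(0).rand(200, 64)
X_prime, kpca = generate_kpca_compression(X, n_components=16)
print(X_prime.shape)      # (200, 16)

X_new_prime = kpca.transform(np.random.RandomState(1).rand(10, 64))
print(X_new_prime.shape)  # (10, 16)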
def test_kernel_pca_sparse():
    rng = np.random.RandomState(0)
    X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
    X_pred = sp.csr_matrix(rng.random_sample((2, 4)))

    for eigen_solver in ("auto", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=False)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])
def reduction(data, params):
    # parse parameters (plain dict access instead of exec, which does not create
    # local variables inside a function in Python 3)
    n_components = params['n_components']
    kernel = params['kernel']

    # apply PCA
    kpca = KernelPCA(n_components=n_components, kernel=kernel)
    kpca.fit(data)
    X = kpca.transform(data)
    return X
def kpca(data, n_components, train, test, kernel='linear', gamma=None, degree=3,
         coef0=1, alpha=0.1, evaluation=False):
    # Kernel PCA
    kpca = KernelPCA(n_components, fit_inverse_transform=True, kernel=kernel,
                     gamma=gamma, degree=degree, coef0=coef0, alpha=alpha).fit(data[train])
    data_reduced = kpca.transform(data)

    if evaluation:
        data_rec = kpca.inverse_transform(data_reduced)
        loss = mean_squared_error(data[test], data_rec[test])
        return loss

    # name = 'Kernel PCA ('+kernel+')'
    name = 'Kernel PCA'
    return data_reduced, name, kpca.inverse_transform
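# A minimal usage sketch for the kpca() helper above (assumption: `train` and `test`
# are index arrays into `data`, as suggested by data[train] / data[test]).
import numpy as np

data = np.random.RandomState(0).rand(100, 10)
train, test = np.arange(0, 80), np.arange(80, 100)

# reconstruction MSE on the held-out rows
loss = kpca(data, 5, train, test, kernel='rbf', gamma=0.5, evaluation=True)

# reduced data plus a callable for mapping back to the original space
data_reduced, name, inverse = kpca(data, 5, train, test, kernel='rbf', gamma=0.5)
data_back = inverse(data_reduced)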
class KernelPCAReduction(AbstractReduction):
    """
    Use kernel PCA to reduce dimensionality

    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html
    """

    def __init__(self, n_components, **kwargs):
        self.pca = KernelPCA(n_components=n_components, **kwargs)
        # store under a private name so the attribute does not shadow the
        # n_components() accessor below
        self._n_components = n_components

    def n_components(self):
        return self._n_components

    def fit(self, X, Y=None):
        self.pca.fit(X)

    def transform(self, X):
        return self.pca.transform(X)
def __init__(self,master): super (Linear_PCA_vs_SPY ,self).__init__(master) fig = Figure(figsize=(10,10), dpi = 100) ax= fig.add_subplot(1,1,1, axisbg='#cccccc') demo_port = ['SPY','BA', 'WFC', 'PEP', 'AMGN', 'BAX', 'BK', 'FB', 'COST', 'DIS', 'LOW', 'FDX', 'TWX', 'AIG', 'MSFT', 'IBM', 'SBUX', 'FCX', 'PG', 'BMY', 'MDT', 'SPG', 'VZ', 'OXY', 'CL', 'GILD', 'CVS', 'AMZN', 'GE', 'ABT', 'JNJ', 'UTX', 'WMT', 'ALL', 'PFE', 'FOXA', 'MO', 'MCD', 'MMM', 'SO', 'MON', 'APC', 'NOV', 'APA', 'CMCSA', 'DVN', 'ACN', 'CAT', 'EXC', 'TXN', 'UNP', 'HPQ', 'V', 'LMT', 'RTN', 'CSCO', 'DOW', 'LLY', 'NSC', 'JPM', 'C', 'HAL', 'INTC', 'ABBV', 'UNH', 'MA', 'GM', 'XOM', 'KO', 'EBAY', 'MET', 'GS', 'CVX', 'HON', 'MRK', 'AXP', 'USB', 'EMC', 'DD', 'HD', 'AAPL', 'PM', 'F', 'T', 'UPS', 'SLB', 'AEP', 'EMR', 'COF', 'MDLZ', 'GOOG', 'NKE', 'COP', 'QCOM', 'TGT', 'ORCL', 'GD', 'MS', 'BAC'] data = pd.DataFrame() for symbol in demo_port: data[symbol] = web.DataReader(symbol, data_source='yahoo')['Close'] data = data.dropna() spy = pd.DataFrame(data.pop('SPY')) #normalize data scale_func = lambda x: ( x-x.mean())/x.std() #apply PCA pca = KernelPCA().fit(data.apply(scale_func)) get_we = lambda x: x/x.sum() #print (get_we(pca.lambdas_)[:20]) pca_one = KernelPCA(n_components = 1).fit(data.apply(scale_func)) spy['PCA_1'] = pca_one.transform(data) # Plotting spy.apply(scale_func) ax= fig.add_subplot(1,1,1, axisbg='#cccccc') lin_reg = np.polyval(np.polyfit(spy['PCA_1'],spy['SPY'],1) , spy['PCA_1']) ax.scatter(spy['PCA_1'], spy['SPY'], c = data.index) ax.plot(spy['PCA_1'], lin_reg, 'r', lw = 2) ax.set_xlabel('PCA_1') ax.set_ylabel('SPY') canvas = FigureCanvasTkAgg(fig, self) canvas.show() canvas.get_tk_widget().pack(side = tk.TOP,fill= tk.BOTH,expand = True) toolbar = NavigationToolbar2TkAgg(canvas, self) toolbar.update() canvas._tkcanvas.pack(side = tk.TOP, fill = tk.BOTH, expand = True)
def getKPCAcomp(dict_read):
    A = np.arange(10000)
    for key in dict_read.keys():
        if key <= 1000:
            [sample_rate, X] = dict_read.get(key)
            # if song doesn't have 10000 features, then add 0s at the end
            # (this usually isn't the case)
            if len(X) < 10000:
                dif = 10000 - len(X)
                for i in range(dif):
                    X = np.hstack((X, 0.0))
            A = np.vstack((A, X[:10000]))
        else:
            break

    A = np.delete(A, 0, 0)
    A = A.astype(float)

    kpca = KernelPCA(n_components=100, kernel="rbf")
    kpca.fit(A)
    A = kpca.transform(A)
    return A
def perform_kpca(input_data):
    '''
    Apply kernel PCA on the outlier-removed data, using the scikit-learn KernelPCA module.
    '''
    from sklearn.decomposition import KernelPCA

    # Specify the kernel function used in the kernel PCA
    KERNEL = raw_input('Enter the kernal of kernalPCA(options are :cosine,rbf,linear,sigmoid:')
    kpca = KernelPCA(n_components=len(input_data.T), kernel=KERNEL)

    # Scale the input dataset
    from sklearn.preprocessing import scale
    scld_input_data = scale(input_data, axis=0, with_mean=True, with_std=True, copy=True)
    kpca.fit(scld_input_data)

    # Transform the dataset onto the given PCs
    kpca_input_data = kpca.transform(scld_input_data)

    # Percentage of variance represented by each kernel PCA eigenvalue
    Kpca_percent = np.array([lam / sum(kpca.lambdas_) for lam in kpca.lambdas_])
    Var_explanied = np.c_[Kpca_percent.reshape(len(Kpca_percent), 1)]
    print('\nVariance explained by eigenvalues of KPca ')
    print(['Kpca'])
    print(Var_explanied)

    return kpca_input_data
def test_compare_rbf(self):
    x, y, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1')

    kpca = KernelPCA(kernel="rbf")
    xt = kpca.fit_transform(x)

    nrsvm = FastSurvivalSVM(optimizer='rbtree', tol=1e-8, max_iter=1000, random_state=0)
    nrsvm.fit(xt, y)

    rsvm = FastKernelSurvivalSVM(optimizer='rbtree', kernel="rbf",
                                 tol=1e-8, max_iter=1000, random_state=0)
    rsvm.fit(x, y)

    pred_nrsvm = nrsvm.predict(kpca.transform(x))
    pred_rsvm = rsvm.predict(x)

    self.assertEqual(len(pred_nrsvm), len(pred_rsvm))

    c1 = concordance_index_censored(y['fstat'], y['lenfol'], pred_nrsvm)
    c2 = concordance_index_censored(y['fstat'], y['lenfol'], pred_rsvm)

    self.assertAlmostEqual(c1[0], c2[0])
    self.assertTupleEqual(c1[1:], c2[1:])
class PCAKernel(PCAnalyzer):
    """ Non-linear PCA as wrapper over SciKitLearn Kernels """

    def __init__(self, components, ktype='poly'):
        PCAnalyzer.__init__(self)
        if isinstance(components, int):
            self.n_components = components
        self.pca = KernelPCA(kernel=ktype, n_components=components)
        self.type = 'kernel'

    def solve(self, X):
        self.dim = np.prod(X.shape[1:])
        self.pca.fit(X.reshape(len(X), self.dim))
        self.trainsize = len(X)

    def project(self, X):
        if isinstance(X, list):
            X = np.array(X)
        dimX = np.prod(X.shape[1:])
        if dimX != self.dim:
            logging.error('Projection Error in KPCA: Cannot reshape/project %s size data '
                          'using PC Vects of size, %s', str(X.shape), str(self.dim))
            return None
        projection = self.pca.transform(X.reshape(len(X), dimX))
        return projection
print('LDA transform_support vector machines_training score: ', svm.score(X_train_lda, y_train))
print('LDA transform_support vector machines_testing score: ', svm.score(X_test_lda, y_test))

# kPCA
gamma_space = np.logspace(-2, 0, 10)
lr_train = []
lr_test = []
svm_train = []
svm_test = []

for gamma in gamma_space:
    kPCA = KernelPCA(n_components=2, kernel='rbf', gamma=gamma)
    X_train_kpca = kPCA.fit_transform(X_train_std, y_train)
    X_test_kpca = kPCA.transform(X_test_std)

    lr = LogisticRegression()
    lr = lr.fit(X_train_kpca, y_train)
    lr_train.append(lr.score(X_train_kpca, y_train))
    lr_test.append(lr.score(X_test_kpca, y_test))

    svm = SVC(kernel='linear', C=1.0, random_state=1)
    svm.fit(X_train_kpca, y_train)
    svm_train.append(svm.score(X_train_kpca, y_train))
    svm_test.append(svm.score(X_test_kpca, y_test))

print("gamma lr_train lr_test svm_train svm_test")
for i in range(10):
    print('%.3f, %.3f, %.3f, %.3f, %.3f' % (gamma_space[i], lr_train[i], lr_test[i],
                                            svm_train[i], svm_test[i]))
class RegionSplitter_PCA_oudeyer(): def __init__(self, data, label): self.cut_dim = 0 self.cut_val = 0 num_candidates = 50 data_dim_num = len(data[0]) label_dim_num = len(label[0]) self.n_comp = max(1, data_dim_num) self.pca = PCA(n_components=self.n_comp, kernel='linear') # self.ica = ICA(n_components=self.n_comp) data = self.pca.fit_transform(data) #data = self.ica.fit_transform(data) data_zipped = list(zip(*data)) data_dim_num = len(data[0]) label_dim_num = len(label[0]) # sort in each dimension dim_min = float("inf") for i in range(data_dim_num): for k in range(num_candidates): # pick a random value max_val = max(data_zipped[i]) min_val = min(data_zipped[i]) cut_val = random.choice(np.linspace(min_val, max_val, num=500)) groups = [[label[j] for j in range(len(data_zipped[i])) if data_zipped[i][j] <= cut_val], [label[j] for j in range(len(data_zipped[i])) if data_zipped[i][j] > cut_val]] # check if any of the group is 0 if len(groups[0]) == 0 or len(groups[1]) == 0: continue weighted_avg_variance = [] for group in groups: num_sample = len(group) group = zip(*group) variance = [] for group_k in group: mean = math.fsum(group_k)/len(group_k) norm = max(math.fsum([x**2 for x in group_k])/len(group_k), 1) variance.append(math.fsum([((x - mean)**2)/norm for x in group_k])) weighted_avg_variance.append(math.fsum(variance)/len(variance)*num_sample) in_group_variance = math.fsum(weighted_avg_variance) if in_group_variance < dim_min: dim_min = in_group_variance self.cut_dim = i self.cut_val = cut_val # just cut in half #self.cut_val = exemplars[int(sample_num/2)][0][self.cut_dim] def classify(self, data): if not isinstance(data, tuple): raise(TypeError, "data must be a tuple") data = tuple(self.pca.transform(data)[0]) # data = tuple(self.ica.transform(data)[0]) group = data[self.cut_dim] <= self.cut_val return group == 0
if (0):
    #%% K-PCA
    # Calculate accumulated variance
    kpca = KernelPCA(kernel="rbf", gamma=gamma)
    kpca.fit_transform(Xtrain)
    eigenvals = kpca.lambdas_[0:220]

    # Calculate classification scores for each component
    # (cast to int so the values can be used as slice bounds)
    nComponents = np.linspace(1, 500, 100, endpoint=True).astype(int)
    kpcaScores = np.zeros((5, len(nComponents)))
    kpca = KernelPCA(n_components=Ntrain, kernel="rbf", gamma=gamma)
    kpca.fit(Xtrain)
    XtrainT = kpca.transform(Xtrain)
    XtestT = kpca.transform(Xtest)
    for i in range(len(nComponents)):
        kpcaScores[:, i] = util.classify(XtrainT[:, :nComponents[i]],
                                         XtestT[:, :nComponents[i]],
                                         labelsTrain, labelsTest)

    #%% Plot accuracies for kPCA
    plt.figure()
    for i in range(5):
        plt.plot(nComponents, kpcaScores[i, :], lw=3)
    plt.xlim(1, np.amax(nComponents))
    plt.title('kPCA accuracy')
    plt.xlabel('Number of components')
    plt.ylabel('accuracy')
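# The "accumulated variance" mentioned above can be derived from the kernel PCA
# eigenvalues; a minimal sketch (assumes a fitted kpca as in the block above, whose
# lambdas_ attribute holds the eigenvalues in decreasing order):
cumulative_variance_ratio = np.cumsum(kpca.lambdas_) / np.sum(kpca.lambdas_)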
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Kernel PCA
from sklearn.decomposition import KernelPCA
Kpca = KernelPCA(n_components=2, kernel='rbf')
X_train = Kpca.fit_transform(X_train)
X_test = Kpca.transform(X_test)

# Fitting Logistic Regression to the Training Set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
class DimensionalityReductionPool: # common param random_state = None n_jobs = None runtime = None # PCA param pca_obj = None # LDA param lda_obj = None # Kernel PCA param gamma_kernel_pca = None kernel_pca_obj = None def __init__(self, random_state, n_jobs, gamme_kernel_pca): self.random_state = random_state self.n_jobs = n_jobs self.runtime = numpy.zeros(3) # PCA object self.pca_obj = PCA(n_components='mle', svd_solver='full', random_state=self.random_state) # LDA object self.lda_obj = LinearDiscriminantAnalysis(n_components=None) # Kernel PCA object self.gamma_kernel_pca = gamme_kernel_pca self.kernel_pca_obj = KernelPCA(n_components=None, kernel='rbf', gamma=self.gamma_kernel_pca, random_state=self.random_state, n_jobs=self.n_jobs) def PCA_fit(self, X): # Record fitting runtime start_time = time.time() print("Fitting PCA on X") self.pca_obj.fit(X=X) self.runtime[0] = time.time() - start_time print("Fitting ended in " + "{0:.2f}".format(round(self.runtime[0], 2)) + " seconds") def LDA_fit(self, X, Y): # Record fitting runtime start_time = time.time() print("Fitting LDA on X and Y") self.lda_obj.fit(X=X, y=Y) self.runtime[1] = time.time() - start_time print("Fitting ended in " + "{0:.2f}".format(round(self.runtime[1], 2)) + " seconds") def KernelPCA_fit(self, X): # Record fitting runtime start_time = time.time() print("Fitting Kernel PCA on X") self.kernel_pca_obj.fit(X=X) self.runtime[2] = time.time() - start_time print("Fitting ended in " + "{0:.2f}".format(round(self.runtime[2], 2)) + " seconds") def Fit_all(self, X, Y): self.PCA_fit(X) self.LDA_fit(X, Y) self.KernelPCA_fit(X) def Transform_all(self, X): transformed_X_dict = { 'PCA': self.pca_obj.transform(X), 'LDA': self.lda_obj.transform(X), 'KernelPCA': self.kernel_pca_obj.transform(X) } return transformed_X_dict def GetDimRedObjects(self): DimRedObj = { 'PCA': self.pca_obj, 'LDA': self.lda_obj, 'KernelPCA': self.kernel_pca_obj } return DimRedObj
symbols = ['AAPL', 'AXP', 'BA', 'CAT', 'CSCO', 'CVX', 'DD', 'DIS', 'GE', 'GS',
           'HD', 'IBM', 'INTC', 'JNJ', 'JPM', 'KO', 'MCD', 'MMM', 'MRK', 'MSFT',
           'NKE', 'PFE', 'PG', 'TRV', 'UNH', 'UTX', 'V', 'VZ', 'WMT', 'XOM', '^DJI']

data = pd.DataFrame()
for sym in symbols:
    data[sym] = web.DataReader(sym, data_source='yahoo')['Adj Close']
data = data.dropna()

dji = pd.DataFrame(data.pop('^DJI'))

scale_function = lambda x: (x - x.mean()) / x.std()

pca = KernelPCA().fit(data.apply(scale_function))
pca.lambdas_[:10].round()

get_we = lambda x: x / x.sum()
get_we(pca.lambdas_)[:10]
get_we(pca.lambdas_)[:5].sum()

pca = KernelPCA(n_components=1).fit(data.apply(scale_function))
dji['PCA_1'] = pca.transform(-data)

import matplotlib.pyplot as plt
dji.apply(scale_function).plot(figsize=(20, 10))
plt.show()
sc, x_train, x_cv = feature_Scaling(x_train, x_cv)

# ### Applying Kernel PCA and fitting the logistic regression model on the training set

# In[12]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# In[13]:
# Applying KPCA
pca = KPCA(n_components=2, kernel='rbf')
x_train = pca.fit_transform(x_train)
x_cv = pca.transform(x_cv)
# explained_varience = pca.explained_variance_ratio_

# In[14]:
# fitting logistic regression to the training set
classifier = LogisticRegression(random_state=0)
classifier = classifier.fit(x_train, y_train)

# In[15]:
# predict y data
y_pred = classifier.predict(x_cv)

# In[16]:
data = pd.read_csv('data/Social_Network_Ads.csv')
X = data.iloc[:, [2, 3]].values
Y = data.iloc[:, -1].values

# pre-processing: it's important to scale the features
sc = StandardScaler()
X = sc.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

# using kernel PCA for dimension reduction
k_pca = KernelPCA(n_components=2, kernel='rbf')
X_train_transformed = k_pca.fit_transform(X_train)
X_test_transformed = k_pca.transform(X_test)

classifier = LogisticRegression()
classifier.fit(X_train_transformed, Y_train)

y_pre = classifier.predict(X_test_transformed)
cm = confusion_matrix(Y_test, y_pre)
accuracy = accuracy_score(Y_test, y_pre)

# visualising data
X_set, Y_set = X_train_transformed, Y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
pca.fit(X_train_noisy)
_ = kernel_pca.fit(X_train_noisy)

# %%
# Reconstruct and denoise test images
# -----------------------------------
#
# Now, we can transform and reconstruct the noisy test set. Since we used fewer
# components than the number of original features, we will get an approximation
# of the original set. Indeed, by dropping the components explaining variance
# in PCA the least, we hope to remove noise. Similar thinking happens in kernel
# PCA; however, we expect a better reconstruction because we use a non-linear
# kernel to learn the PCA basis and a kernel ridge to learn the mapping
# function.
X_reconstructed_kernel_pca = kernel_pca.inverse_transform(
    kernel_pca.transform(X_test_noisy))
X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test_noisy))

# %%
plot_digits(X_test, "Uncorrupted test images")
plot_digits(
    X_reconstructed_pca,
    f"PCA reconstruction\nMSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}",
)
plot_digits(
    X_reconstructed_kernel_pca,
    "Kernel PCA reconstruction\n"
    f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}",
)

# %%
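# The fragment above uses `pca` and `kernel_pca` without showing how they were built.
# A plausible construction sketch; the exact parameter values are assumptions, not
# taken from this fragment (a low-rank PCA, and an RBF kernel PCA fitted with
# fit_inverse_transform=True so that inverse_transform is available):
from sklearn.decomposition import PCA, KernelPCA

pca = PCA(n_components=32)
kernel_pca = KernelPCA(n_components=400, kernel="rbf", gamma=1e-3,
                       fit_inverse_transform=True, alpha=5e-3)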
data_matrix = np.zeros((m, n))
print("There are %d hurricanes for cross validation. So I need to perform %d comparisons."
      % (m, m * n))

count = 0
for i in range(len(cv_hurricane_list)):
    for j in range(n):
        data_matrix[i][j] = M2(cv_ts_list[i], ts_list[j], delta, eps)
        count = count + 1
        if count % 1000 == 0:
            print(count)

cv_feature_coords = kpca.transform((data_matrix ** 2) * -0.5)
print(cv_feature_coords)

[cv_mean, cv_mfx] = reg.fit(cv_feature_coords)
print(cv_mean)

cv_predicted = np.zeros(m)
cv_high_prob = np.zeros(m) + 0.5
num_high_prob = 0
thresh = 0.05
for i in range(m):
    if cv_mean[i] >= 0.5:
        cv_predicted[i] = 1
    if cv_mean[i] <= thresh:
        num_high_prob = num_high_prob + 1
plt.figure()
plt.subplot(2, 2, 1, aspect="equal")
plt.title("Original space")
reds = y == 0
blues = y == 1

plt.scatter(X[reds, 0], X[reds, 1], c="red", s=20, edgecolor="k")
plt.scatter(X[blues, 0], X[blues, 1], c="blue", s=20, edgecolor="k")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50))
X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T

# projection on the first principal component (in the phi space)
Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)
plt.contour(X1, X2, Z_grid, colors="grey", linewidths=1, origin="lower")

plt.subplot(2, 2, 3, aspect="equal")
plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=20, edgecolor="k")
plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20, edgecolor="k")
plt.title("Projection by KPCA")
plt.xlabel(r"1st principal component in space induced by $\phi$")
plt.ylabel("2nd component")

plt.subplot(2, 2, 4, aspect="equal")
from graspologic.embed import ClassicalMDS

plt.tight_layout()
plt.show()
def kernel_pincipal_component(train_x, test_x, n_com, kernel_type='rbf'):
    pca = KernelPCA(n_components=n_com, kernel=kernel_type)
    train_x = pca.fit_transform(train_x)
    test_x = pca.transform(test_x)
    return train_x, test_x
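# A minimal usage sketch for the helper above (assumption: purely synthetic data;
# the train split fits the kernel PCA, the test split is only projected).
import numpy as np

rng = np.random.RandomState(0)
train_x, test_x = rng.rand(80, 10), rng.rand(20, 10)
train_2d, test_2d = kernel_pincipal_component(train_x, test_x, n_com=2, kernel_type='rbf')
print(train_2d.shape, test_2d.shape)   # (80, 2) (20, 2)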
def pid_pca(args): # import modular data processing toolkit import mdp # load data file tblfilename = "bf_optimize_mavlink.h5" h5file = tb.open_file(tblfilename, mode = "a") # table = h5file.root.v1.evaluations # get tabke handle table = h5file.root.v2.evaluations # sort rows if not table.cols.mse.is_indexed: table.cols.mse.createCSIndex() if not args.sorted: pids = [ [x["alt_p"], x["alt_i"], x["alt_d"], x["vel_p"], x["vel_i"], x["vel_d"]] for x in table.iterrows() ] mses = [ [x["mse"]] for x in table.iterrows() ] else: pids = [ [x["alt_p"], x["alt_i"], x["alt_d"], x["vel_p"], x["vel_i"], x["vel_d"]] for x in table.itersorted("mse")] mses = [ [x["mse"]] for x in table.itersorted("mse")] print "best two", pids mses_a = np.log(np.clip(np.array(mses), 0, 200000.)) mses_a /= np.max(mses_a) # FIXME: try kernel pca on this from sklearn.decomposition import PCA, KernelPCA, SparsePCA kpca = KernelPCA(n_components = None, kernel="rbf", degree=6, fit_inverse_transform=True, gamma=1/6., alpha=1.) # kpca = SparsePCA(alpha=2., ridge_alpha=0.1) X_kpca = kpca.fit_transform(np.asarray(pids).astype(float)) # X_back = kpca.inverse_transform(X_kpca) Z_kpca = kpca.transform(np.asarray(pids).astype(float)) print Z_kpca.shape, X_kpca.shape print "|Z_kpca|", np.linalg.norm(Z_kpca, 2, axis=1) # for i in range(8): # pl.subplot(8,1,i+1) # pl.plot(Z_kpca[:,i]) # pl.legend() # pl.show() # fast PCA # pid_p = mdp.pca(np.array(pids).astype(float)) pid_array = np.array(pids).astype(float) print "pid_array.shape", pid_array.shape pcanode = mdp.nodes.PCANode(output_dim = 6) # pcanode.desired_variance = 0.75 pcanode.train(np.array(pids).astype(float)) pcanode.stop_training() print "out dim", pcanode.output_dim pid_p = pcanode.execute(np.array(pids).astype(float)) # pid_array_mse = np.hstack((np.array(pids).astype(float), mses_a)) pid_ica = mdp.fastica(np.array(pids).astype(float)) print "ica.shape", pid_ica.shape # pid_p = np.asarray(pids)[:,[0, 3]] # pid_p = pids[:,0:2] # [:,0:2] sl_start = 0 sl_end = 100 sl = slice(sl_start, sl_end) print "expl var", pcanode.explained_variance pl.subplot(111) colors = np.zeros((100, 3)) # colors = np.hstack((colors, 1-(0.5*mses_a))) colors = np.hstack((colors, 1-(0.8*mses_a))) # print colors.shape # pl.scatter(pid_p[sl,0], pid_p[sl,1], color=colors) # ica spektrum pid_ica_sum = np.sum(np.square(pid_ica), axis=0) # pid_ica_sum_sort = np.sort(pid_ica_sum) pid_ica_sum_0 = np.argmax(pid_ica_sum) pid_ica_sum[pid_ica_sum_0] = 0 pid_ica_sum_1 = np.argmax(pid_ica_sum) # pl.scatter(pid_p[sl,0], pid_p[sl,1], color=colors) pl.scatter(pid_ica[sl,pid_ica_sum_0], pid_ica[sl,pid_ica_sum_1], color=colors) # pl.scatter(X_kpca[:,0], X_kpca[:,1], color=colors) pl.gca().set_aspect(1) # pl.scatter(pid_p[:,0], pid_p[:,1], alpha=1.) # pl.show() # plot raw pid values pl.subplot(411) pl.plot(pid_array[sl,[0,3]], "o") pl.xlim((sl_start - 0.2, sl_end + 0.2)) pl.subplot(412) pl.plot(pid_array[sl,[1,4]], "o") pl.xlim((sl_start - 0.2, sl_end + 0.2)) pl.subplot(413) pl.plot(pid_array[sl,[2,5]], "o") # plot compressed pid values: pca, ica, ... 
# pl.subplot(211) # pl.plot(pid_p, ".") # pl.plot(pid_p[sl], "o") # pl.plot(pid_ica[sl] + np.random.uniform(-0.01, 0.01, size=pid_ica[sl].shape), "o") pl.xlim((sl_start - 0.2, sl_end + 0.2)) # pl.plot(Z_kpca[:,:], "-o", label="kpca") # pl.plot(Z_kpca[:,:], ".", label="kpca") # pl.legend() # pl.subplot(212) pl.subplot(414) pl.plot(mses_a[sl], "ko") # pl.gca().set_yscale("log") pl.xlim((sl_start - 0.2, sl_end + 0.2)) pl.show() # gp fit x = mses_a[sl] x_sup = np.atleast_2d(np.arange(0, x.shape[0])).T x_ones = x != 1. x_ones[0:20] = False print x, x_sup, x_ones, x_ones.shape print "x[x_ones]", x[x_ones].shape print "x_sup[x_ones]", x_sup[x_ones].shape from sklearn.gaussian_process import GaussianProcess # gp = GaussianProcess(regr='constant', corr='absolute_exponential', # theta0=[1e-4] * 1, thetaL=[1e-12] * 1, # thetaU=[1e-2] * 1, nugget=1e-2, optimizer='Welch') gp = GaussianProcess(corr="squared_exponential", theta0=1e-2, thetaL=1e-4, thetaU=1e-1, nugget=1e-1/x[x_ones]) gp.fit(x_sup[x_ones,np.newaxis], x[x_ones,np.newaxis]) x_pred, sigma2_pred = gp.predict(x_sup, eval_MSE=True) print x_pred, sigma2_pred from sklearn import linear_model clf = linear_model.Ridge (alpha = .5) clf.fit(x_sup[x_ones,np.newaxis], x[x_ones,np.newaxis]) x_pred = clf.predict(x_sup[20:100]) pl.subplot(111) pl.plot(mses_a[sl], "ko") x_mean = np.mean(x[0:20]) pl.plot(np.arange(0, 20), np.ones((20, )) * x_mean, "k-", alpha=0.5) pl.plot(np.arange(20, 100), x_pred, "k-", alpha=0.5) pl.axhspan(0.5, 1.1, 0, 0.19, facecolor="0.5", alpha=0.25) # pl.plot(x_pred + sigma2_pred, "k-", alpha=0.5) # pl.plot(x_pred - sigma2_pred, "k-", alpha=0.5) # pl.gca().set_yscale("log") pl.xlim((sl_start - 0.2, sl_end + 0.2)) pl.ylim((0.5, 1.1)) pl.text(5, 0.6, "Random\ninitialization") pl.text(40, 0.6, "Optimizer\nsuggestions") pl.xlabel("Episode #") pl.ylabel("MSE") if args.plotsave: pl.gcf().set_size_inches((10, 3)) pl.gcf().savefig("%s-mse.pdf" % (sys.argv[0][:-3]), dpi=300, bbox_inches="tight") pl.show()
    x_red[0] = pca.fit_transform(x[2])
    x_red[1] = pca.transform(x[3])
    x_red[2] = pca.transform(x[4])
    print("Dimensionality Reduction method used: ", pca)

if args.dimensionality_reduction_method == "LDA":
    lda = LDA(n_components=k)
    x_red[0] = lda.fit_transform(x[2], y[2])
    x_red[1] = lda.transform(x[3])
    x_red[2] = lda.transform(x[4])
    print("Dimensionality Reduction method used: ", lda)

if args.dimensionality_reduction_method == "KPCA":
    kpca = KPCA(n_components=k, kernel=args.kernel_pca)
    x_red[0] = kpca.fit_transform(x[2])
    x_red[1] = kpca.transform(x[3])
    x_red[2] = kpca.transform(x[4])
    print("Dimensionality Reduction method used: ", kpca)

# training the model
if args.C == None:
    C = [0.5, 5, 10, 20]
else:
    C = [args.C]

if args.gamma == None:
    gam = [0.01, 0.05, 0.1, 0.5, 1]
else:
    gam = [args.gamma]

if args.training_model == "LR":
pca.fit(dat)

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
kpca.fit(dat)

## project data into PC space

# 0,1 denote PC1 and PC2; change values for other PCs
xvector = pca.components_[0]  # see 'prcomp(my_data)$rotation' in R
yvector = pca.components_[1]

xs = pca.transform(dat)[:, 0]  # see 'prcomp(my_data)$x' in R
ys = pca.transform(dat)[:, 1]

kxs = kpca.transform(dat)[:, 0]  # see 'prcomp(my_data)$x' in R
kys = kpca.transform(dat)[:, 1]

## visualize projections

## Note: scale values for arrows and text are a bit inelegant as of now,
## so feel free to play around with them
for i in range(len(xvector[:n])):
    # arrows project features (ie columns from csv) as vectors onto PC axes
    plt.arrow(0, 0, xvector[i] * max(xs), yvector[i] * max(ys),
              color='r', width=0.0005,
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Applying kernel PCA
from sklearn.decomposition import KernelPCA
# n_components - No. of extracted features that you need that will explain most variance
kpca = KernelPCA(n_components=2, kernel='rbf')
# Fitting PCA to training set
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

# Fitting Logistic Regression to the Training Data
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, Y_train)

y_pred = classifier.predict(X_test)

# Create Confusion Matrix
# Class has capitals while functions have small letters
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)

# Visualizing Training set results
from matplotlib.colors import ListedColormap
class PCAUmap: def __init__( self, n_neighbors=15, use_pca=1, kernel='linear', min_dist=0.1, n_components=2, random_state=None, transform_seed=None, scaler=True, metric="euclidean", augment_size=3, impute_rate=0.1, ): if kernel == 'linear': self.pca = PCA() else: self.pca = KernelPCA(kernel=kernel, fit_inverse_transform=True) self.umap = UMAP( random_state=random_state, transform_seed=transform_seed, n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, metric=metric, ) self.use_pca = use_pca self.random_state = random_state self.scaler = StandardScaler() self.data = None self.pca_features = None self.embedding = None self.imputer = KNNImputer() self.augment_size = augment_size self.impute_rate = impute_rate def fit(self, data): self.data = pd.DataFrame(data) augmented_data = self.augumentation(self.augment_size, self.impute_rate) if self.scaler is None: if self.use_pca is None: self.umap.fit(augmented_data) self.embedding = self.umap.transform(data) else: self.umap.fit(self.pca.fit_transform(augmented_data)) self.pca_features = self.pca.transform(data) self.embedding = self.umap.transform(self.pca_features) else: if self.use_pca is None: self.umap.fit(self.scaler.fit_transform(augmented_data)) self.embedding = self.umap.transform( self.scaler.transform(data)) else: self.umap.fit( self.pca.fit_transform( self.scaler.fit_transform(augmented_data))) self.pca_features = self.pca.transform( self.scaler.transform(data)) self.embedding = self.umap.transform(self.pca_features) return self def transform(self, data): self.data = pd.DataFrame(data) if self.scaler is None: if self.pca is None: self.embedding = self.umap.transform(data) return self.embedding else: self.pca_features = self.pca.transform(data) self.embedding = self.umap.transform(self.pca_features) return self.embedding else: if self.pca is None: self.embedding = self.umap.transform( self.scaler.transform(data)) return self.embedding else: self.pca_features = self.pca.transform( self.scaler.transform(data)) self.embedding = self.umap.transform(self.pca_features) return self.embedding def fit_transform(self, data): self.fit(data) return self.transform(data) def inverse_transform(self, embedded): if self.scaler is None: if self.pca is None: return self.umap.inverse_transform(embedded) else: return self.pca.inverse_transform( self.umap.inverse_transform(embedded)) else: if self.pca is None: return self.scaler.inverse_transform( self.umap.inverse_transform(embedded)) else: return self.scaler.inverse_transform( self.pca.inverse_transform( self.umap.inverse_transform(embedded))) def pca_summary(self, c=None): plt.figure(figsize=(6, 6)) if c is None: plt.scatter(self.pca_features[:, 0], self.pca_features[:, 1], alpha=0.5) else: plt.scatter(self.pca_features[:, 0], self.pca_features[:, 1], alpha=0.5, c=c) plt.xlabel("PC1 ({}%)".format( int(self.pca.explained_variance_ratio_[0] * 100))) plt.ylabel("PC2 ({}%)".format( int(self.pca.explained_variance_ratio_[1] * 100))) plt.grid() plt.show() plt.figure(figsize=(6, 6)) plt.scatter(self.pca.components_[0], self.pca.components_[1], alpha=0.5) plt.xlabel("loading 1") plt.ylabel("loading 2") plt.grid() plt.show() plt.figure(figsize=(6, 6)) plt.plot([0] + list(np.cumsum(self.pca.explained_variance_ratio_)), "-o") plt.xlabel("Number of principal components") plt.ylabel("Cumulative contribution ratio") plt.grid() plt.show() def map_predicted_values( self, model, c=None, alpha=0.5, edgecolors="k", figsize=(8, 6), h=0.2, cm=plt.cm.jet, ): x_min = self.embedding[:, 0].min() - 0.5 x_max = 
self.embedding[:, 0].max() + 0.5 y_min = self.embedding[:, 1].min() - 0.5 y_max = self.embedding[:, 1].max() + 0.5 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) plt.figure(figsize=figsize) if hasattr(model, "predict_proba"): Z = model.predict_proba( self.inverse_transform(np.c_[xx.ravel(), yy.ravel()]))[:, 1] elif hasattr(model, "decision_function"): Z = model.decision_function( self.inverse_transform(np.c_[xx.ravel(), yy.ravel()])) else: Z = model.predict( self.inverse_transform(np.c_[xx.ravel(), yy.ravel()])) Z = Z.reshape(xx.shape) plt.contourf(xx, yy, Z, alpha=alpha, cmap=cm) plt.colorbar() if c is None: plt.scatter( self.embedding[:, 0], self.embedding[:, 1], alpha=alpha, edgecolors=edgecolors, ) else: plt.scatter( self.embedding[:, 0], self.embedding[:, 1], alpha=alpha, c=c, edgecolors=edgecolors, ) plt.grid() plt.show() def augumentation(self, augment_size, rate): augmented_data = pd.concat([self.data] * augment_size).values augmented_data = fill_randomly(augmented_data, np.nan, rate) augmented_data = pd.DataFrame( self.imputer.fit_transform(augmented_data)) augmented_data = pd.concat([self.data, augmented_data]) return augmented_data
                          np.arange(number_of_molecules_COMP))
random_indices = random_indices[:number_of_data_COMP]
frames_benchmark, Y_benchmark, mol_indices_comp = load_COMP(random_indices)
X_benchmark = compute_soap_matrix(frames_benchmark)

weights = ridge_regression(Y, X, np.ones_like(X[0, :].T))
full_errorMAE, full_errorMSE = compute_loss(X_benchmark, weights, Y_benchmark)

# In[5]:

from sklearn.decomposition import KernelPCA

pca = KernelPCA(n_components=4000, kernel='precomputed')
pca.fit(np.dot(X.T, X))
XPCA = pca.transform(X)
X_benchmarkPCA = pca.transform(X_benchmark)
XPCA.shape

# In[25]:

methods = ['F', 'FPS', 'PCA']
indices = {
    'F': ind_F_test,
    'FPS': ind_FPS,
}
numbers_steps = 2 * np.logspace(0, 3, 7).astype(int)
vecMAE_AL = np.zeros([len(numbers_steps), len(methods)])
vecMSE_AL = np.zeros([len(numbers_steps), len(methods)])

for i, number_of_feature in enumerate(numbers_steps):
svm.fit(X_train_lda, y_train)
svm_pred_test_lda = svm.predict(X_test_lda)
svm_pred_train_lda = svm.predict(X_train_lda)
print(accuracy_score(svm_pred_train_lda, y_train))
print(accuracy_score(svm_pred_test_lda, y_test))

######################################## kpca_lr
gamma_space = np.arange(0.01, 5, 0.05)
acc_lp_kpca_train = np.empty(len(gamma_space))
acc_lp_kpca_test = np.empty(len(gamma_space))

for j, i in enumerate(gamma_space):
    scikit_kpca = KernelPCA(n_components=2, kernel='rbf', gamma=i)
    X_train_kpca = scikit_kpca.fit_transform(X_train_std)
    X_test_kpca = scikit_kpca.transform(X_test_std)
    lr_kpca = lr.fit(X_train_kpca, y_train)
    lr_pred_train = lr.predict(X_train_kpca)
    lr_pred_test = lr.predict(X_test_kpca)
    acc_lp_kpca_train[j] = accuracy_score(lr_pred_train, y_train)
    acc_lp_kpca_test[j] = accuracy_score(lr_pred_test, y_test)

plt.title('lr accuracy varies according to gamma')
plt.plot(gamma_space, acc_lp_kpca_train, label='training accuracy')
plt.plot(gamma_space, acc_lp_kpca_test, label='testing accuracy')
_ = plt.xlabel('gamma')
_ = plt.ylabel('accuracy')
plt.show()
print(max(acc_lp_kpca_train))
print(max(acc_lp_kpca_test))
# Splitting the data
d = data.values
x_train, x_test, y_train, y_test = train_test_split(d[:, 0:12], d[:, 12:],
                                                    test_size=0.25, random_state=0)

# Feature Scaling
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Applying PCA
pca = KernelPCA(n_components=8, kernel='rbf')
X_train = pca.fit_transform(x_train)
X_test = pca.transform(x_test)
# explained_variance = pca.explained_variance_ratio_

# Model building
# (note: the classifiers below are fitted on the scaled features x_train,
# not on the kernel-PCA-reduced X_train)

# Fitting model to KNN
knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_classifier.fit(x_train, y_train)

# Fitting model to kernel SVM
svm_classifier = SVC(kernel='rbf', random_state=0)
svm_classifier.fit(x_train, y_train)

# Fitting model to naive bayes
nb_classifier = GaussianNB()
nb_classifier.fit(x_train, y_train)
# For PCA we need to normalize our data with some function
dt_features = StandardScaler().fit_transform(dt_features)

# train_test_split holds out part of the data for testing (30% here); setting
# random_state fixes the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(dt_features, dt_target,
                                                    test_size=0.3, random_state=42)

# n_components (optional) tells KernelPCA to keep the 4 components that carry the most information
kpca = KernelPCA(n_components=4, kernel='poly')
kpca.fit(X_train)

dt_train = kpca.transform(X_train)
dt_test = kpca.transform(X_test)

logistic = LogisticRegression(solver='lbfgs')
logistic.fit(dt_train, y_train)
print("SCORE KPCA: ", logistic.score(dt_test, y_test))

print(X_train.shape)  # Table shape
print(y_train.shape)  # target data (0-1): presence or absence of heart disease

# n_components = min(n_samples, n_features)
pca = PCA(n_components=3)
pca.fit(X_train)
    distortion.append(sum(numpy.min(cdist(delta_noname, kmeans.cluster_centers_, 'euclidean'),
                                    axis=1)) / delta_noname.shape[0])

plt.plot(K, distortion, 'bx-')
plt.title('The Elbow Method showing the optimal k')
plt.show()

# In[277]:

# PCA with RBF kernel
from sklearn.decomposition import PCA, KernelPCA
kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10, n_components=4)
kpca.fit(delta_noname)
delta_noname_components_rbf = kpca.transform(delta_noname)

# In[278]:

# to get the variance being explained by the components
import numpy
explained_variance = numpy.var(delta_noname_components_rbf, axis=0)
explained_variance_ratio = explained_variance / numpy.sum(explained_variance)
print(explained_variance_ratio)

# In[279]:
# split dataset into training and test datasets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# apply kernelPCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=2, kernel='rbf')
x_train = kpca.fit_transform(x_train)
x_test = kpca.transform(x_test)

# logistic regression
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)

# making the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)

# model selection: "kfold cross validation"
class RegionSplitter_PCA_oudeyer_modified(): def __init__(self, data, label): self.cut_dim = 0 self.cut_val = 0 num_candidates = 500 min_group_size = 20 data_dim_num = len(data[0]) self.n_comp = max(1, data_dim_num) self.pca = PCA(n_components=self.n_comp, kernel='linear') #self.pca = ICA(n_components=self.n_comp) data = self.pca.fit_transform(data) data_dim_num = len(data[0]) label_dim_num = len(label[0]) data_zipped = list(zip(*data)) # model used to evaluate the data model = linear_model.LinearRegression() # the error of whole partition n_fold = 2 kf = KFold(len(data), n_folds=n_fold) rms_error_whole = 0 for train_index, test_index in kf: data_train, data_test = np.array(data)[train_index], np.array(data)[test_index] label_train, label_test = np.array(label)[train_index], np.array(label)[test_index] model = linear_model.LinearRegression() model.fit(data_train, label_train) label_predict = model.predict(data_test) rms_error_whole += metrics.mean_squared_error(label_test, label_predict) rms_error_whole /= n_fold # sort in each dimension dim_min = float("inf") for i in range(data_dim_num): for k in range(num_candidates): # pick a random value max_val = max(data_zipped[i]) min_val = min(data_zipped[i]) cut_val = random.uniform(min_val, max_val) groups = [[[data[j], label[j]] for j in range(len(data_zipped[i])) if data_zipped[i][j] <= cut_val], [[data[j], label[j]] for j in range(len(data_zipped[i])) if data_zipped[i][j] > cut_val]] # check if any of the group is 0 or 1 if len(groups[0]) < min_group_size or len(groups[1]) < min_group_size: continue avg_error = [] weighted_avg_variance = [] for group in groups: # calculate error with a linear model data_k = list(zip(*group))[0] label_k = list(zip(*group))[1] # the split groups error n_fold = 2 kf = KFold(len(data_k), n_folds=n_fold) rms_error_split = 0 for train_index, test_index in kf: data_train, data_test = np.array(data_k)[train_index], np.array(data_k)[test_index] label_train, label_test = np.array(label_k)[train_index], np.array(label_k)[test_index] model.fit(data_train, label_train) label_predict = model.predict(data_test) rms_error_split += metrics.mean_squared_error(label_test, label_predict) rms_error_split /= n_fold avg_error.append(rms_error_split) num_sample = len(group) group = zip(*group[0]) # calculate variance of data points variance = [] for group_k in group: mean = math.fsum(group_k)/len(group_k) norm = max(math.fsum([x**2 for x in group_k])/len(group_k), 1) variance.append(math.fsum([((x - mean)**2)/norm for x in group_k])) weighted_avg_variance.append(math.fsum(variance)/len(variance)*num_sample) error_diff = (avg_error[0] - avg_error[1])**2 smallest_error = min(avg_error) biggest_error_reduction = max(rms_error_whole - avg_error[0], rms_error_whole-avg_error[1]) in_group_variance = math.fsum(weighted_avg_variance) #print('cut_dim=%d cut_val=%f avg_err=%f var=%f'%(i, cut_val, smallest_error, in_group_variance)) try: score = ((in_group_variance+1)*(smallest_error+1)) / (error_diff*(biggest_error_reduction**0.5)) except ZeroDivisionError: score = float("inf") if score < dim_min: dim_min = score self.cut_dim = i self.cut_val = cut_val # just cut in half #self.cut_val = exemplars[int(sample_num/2)][0][self.cut_dim] def classify(self, data): if not isinstance(data, tuple): raise(TypeError, "data must be a tuple") data = tuple(self.pca.transform(data)[0]) group = data[self.cut_dim] <= self.cut_val return group == 0
def apply_Kenel_PCA(self, X_training, variance, kernel):
    # note: the first positional argument of KernelPCA is n_components, so
    # `variance` here is interpreted as the number of components to keep
    pca = KernelPCA(variance, kernel=kernel, degree=4)
    pca.fit(X_training)
    return pca.transform(X_training)
data = data.dropna()
dax = pd.DataFrame(data.pop('DJIA'))
data[data.columns[:6]].head()

scale_function = lambda x: (x - x.mean()) / x.std()

# PCA considering multiple components
pca = KernelPCA().fit(data.apply(scale_function))
len(pca.lambdas_)
pca.lambdas_[:10].round()

# normalize the eigenvalues
get_we = lambda x: x / x.sum()
get_we(pca.lambdas_)[:10]
get_we(pca.lambdas_)[:5].sum()

# construct the PCA index
# a PCA index containing only the first component
pca = KernelPCA(n_components=1).fit(data.apply(scale_function))
dax['PCA_1'] = pca.transform(-data)
dax.apply(scale_function).plot(figsize=(8, 4))

# compute the weighted average of the individual resulting components
pca = KernelPCA(n_components=5).fit(data.apply(scale_function))
pca_components = pca.transform(-data)
weights = get_we(pca.lambdas_)
dax['PCA_5'] = np.dot(pca_components, weights)
dax.apply(scale_function).plot(figsize=(8, 4))

# scatter plot
mpl_dates = mpl.dates.date2num([n for n in pd.to_datetime(data.index)])
# mpl_dates = mpl.dates.date2num(data.index)
mpl_dates
plt.figure(figsize=(8, 4))
plt.scatter(dax['PCA_5'], dax['DJIA'], c=mpl_dates)
lin_reg = np.polyval(np.polyfit(dax['PCA_5'], dax['DJIA'], 1), dax['PCA_5'])
# PCA dimensionality reduction
# from sklearn.decomposition import PCA
# pca = PCA(n_components=3)
# np_data_3d = pca.fit(np_data)
# # return the percentage of variance explained by each of the n retained components
# print(pca.explained_variance_ratio_)
# print(pca.explained_variance_)

# Kernel PCA
from sklearn.decomposition import KernelPCA
pca = KernelPCA(n_components=6, kernel='rbf', gamma=15)
np_data_3d = pca.fit(np_data)
data_new_3d = pca.transform(np_data)

# show the shape of the processed data
print(data_new_3d.shape)

# clustering algorithms and evaluation metrics
# k-means
from sklearn.cluster import KMeans
import sklearn.metrics as metrics
from sklearn.cluster import DBSCAN

## k-means++
y_pred = KMeans(n_clusters=3, random_state=9).fit_predict(data_new_3d)
# y_pred = DBSCAN(eps=0.4,        # neighbourhood radius
#                 min_samples=5,  # minimum number of sample points, MinPts
def pca_transform(self, nb_PC=4, remove_mean0=False, remove_mean1=False, standard=False, sklearn=False, sklearn_kernel=False, cov=True): """ Perform the Principal component analysis with SKlearn using singular value fft The dataframe is standardize parameters: standard: default = True, standardize the dataframe nb_PC: default = 4, number of principal components to be used sklearn: if True (default=False) use svd by sklearn cov: if true (by default) sue the correlation matrix to perform the PCA analysis Stock in the object Dataframe with: eigenvalues eigenvectors scores list of vectors: eigenpairs NOTE: By default sklearn remove the mean from the dataset. So I cant use it to perform the downscalling References: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html#projection-onto-the-new-feature-space """ df = self.df self.nb_PC = nb_PC if remove_mean0: print('remove_mean0') df = df.subtract(df.mean(axis=0), axis='columns') if remove_mean1: print('remove_mean1') df = df.subtract(df.mean(axis=1), axis='index') print(df) if standard: # standardize # df_std = StandardScaler().fit_transform(df) self.standard = True df = (df - df.mean(axis=0)) / df.std( axis=0) # another way to standardise #======================================================================= # Sklearn #======================================================================= if sklearn: print("o" * 80) print("SVD sklearn used") print("o" * 80) if sklearn_kernel: print('sklearn_kernel') pca = KernelPCA(nb_PC, kernel="rbf", fit_inverse_transform=True, gamma=10) #Create a PCA model with nb_PC principal components else: pca = PCA(nb_PC) # fit data pca.fit(df) #Get the components from transforming the original data. scores = pca.transform(df) # or PCs eigenvalues = pca.explained_variance_ eigenvectors = pca.components_ # or loading # Make a list of (eigenvalue, eigenvector) tuples self.eigpairs = [(np.abs(self.eigenvalues[i]), self.eigenvector[i, :]) for i in range(len(self.eigenvalues))] #======================================================================= # Covariance Matrix #======================================================================= if cov: print("o" * 80) print("Covariance used") print("o" * 80) X = df.values cov_mat = np.cov(X.T) eigenvalues, eigenvectors = np.linalg.eig(cov_mat) scores = X.dot(eigenvectors) scores = pd.DataFrame(scores, columns=np.arange(1, len(df.columns) + 1), index=df.index) eigenvalues = pd.Series(eigenvalues, index=np.arange(1, len(df.columns) + 1)) eigenvectors = pd.DataFrame(eigenvectors.T, columns=df.columns, index=np.arange( 1, len(df.columns) + 1)) self.scores = scores.iloc[:, 0:nb_PC] self.eigenvalues = eigenvalues #[0:nb_PC] self.eigenvectors = eigenvectors[0:nb_PC] tot = sum(eigenvalues) self.var_exp = [(i / tot) * 100 for i in sorted(eigenvalues, reverse=True)]
coeff_slices = []
for (i, x) in enumerate(X):
    norm = normalize(x[1][:, np.newaxis], axis=0).ravel()
    coeffs = pywt.wavedec(norm, 'sym13', level=2)
    arr, coeff_slice = pywt.coeffs_to_array(coeffs)
    arrays.append(arr)
    coeff_slices.append(coeff_slice)

plt.plot(arrays[1])
plt.savefig('ja_dwt')
plt.clf()
plt.plot(arrays[0])
plt.savefig('tymo_dwt')
plt.clf()

pca = KernelPCA(kernel='sigmoid').fit(arrays)
transformed_X = pca.transform(arrays)

plt.plot(transformed_X[1])
plt.savefig('ja_pca')
plt.clf()
plt.plot(transformed_X[0])
plt.savefig('tymo_pca')
plt.clf()

plt.scatter(transformed_X[1:21][:, 0], transformed_X[1:21][:, 1], c=y,
            cmap=matplotlib.colors.ListedColormap(["red", "blue"]))
# plt.scatter(transformed_X[0][:, 0], transformed_X[0][:, 1])
plt.title("2D")
plt.savefig('2d.png')

clf = SVC()
X = dataset.iloc[:, 2:4].values
Y = dataset.iloc[:, 4].values

# Split of data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Applying Kernel PCA (assign the transformed arrays, otherwise the reduction is discarded)
from sklearn.decomposition import KernelPCA
kernalpca = KernelPCA(n_components=2, kernel='rbf')
X_train = kernalpca.fit_transform(X_train)
X_test = kernalpca.transform(X_test)

# Logistic regression
from sklearn.linear_model import LogisticRegression
regression = LogisticRegression(random_state=0)
regression.fit(X_train, Y_train)

Y_pred = regression.predict(X_test)

# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
plt.figure(figsize=(10, 10))
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o', markerfacecolor='blue',
         markersize=12, color='blue', linewidth=4)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()

# Applying Kernel PCA
# Please turn off when applying PCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=32, kernel='rbf')
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

# Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=35)
X_train = lda.fit_transform(X_train, Y_train)
# transform (not refit) the test set with the LDA learned on the training data
X_test = lda.transform(X_test)

# Fitting SVM to the Training Set
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0)  # kernel can be changed to 'linear' for a linear SVM
classifier.fit(X_train, Y_train)

# Fitting Decision Tree to the Training Set
class RGNN_PCA(): """ (Kernel) PCA module with fit and transform functions embedding_model = RGNN-based model to compute embeddings {'deep', 'arma', 'pool'} T = depth of each RGNN stack K = number of parallel stacks in_scaling = scaling of the input weights in the RGNNs in the first layer hid_scaling = scaling of the input weights in the RGNNs in all the other layers return_last = use as representation only the states from the last layer in the stack aggregation = global pooling method to obtain a graph embedding {'sum', 'average'} kwargs = dict specifying RGNN hyperparams """ def __init__(self, embedding_model=None, T=None, K=None, in_scaling=None, hid_scaling=None, return_last=None, aggregation=None, **kwargs): if embedding_model == 'deep': model = deep elif embedding_model == 'arma': model = ARMA elif embedding_model == 'pool': model = pool else: raise NotImplementedError('unsupported model type') # Reservoir-based model self.embedding_model = model(K=K, T=T, in_scaling=in_scaling, hid_scaling=hid_scaling, return_last=return_last, aggregation=aggregation, **kwargs) self.pca = None self.embeddings_tr = None def fit(self, *args): print('Fitting model') # Generate embeddings embeddings = [] for elem in tqdm.tqdm(zip(*args)): emb = self.embedding_model.get_embeddings(*elem) embeddings.append(emb) embeddings = np.vstack(embeddings) self.embeddings_tr = embeddings # Compute empirical covariance matrix (linear kernel) - Train vs Train K_tr = np.dot(self.embeddings_tr, self.embeddings_tr.T) self.pca = KernelPCA(n_components=2, kernel='precomputed') embeddings_pca = self.pca.fit_transform(K_tr) # self.pca = umap.UMAP() # embeddings = StandardScaler().fit_transform(embeddings) # embeddings_pca = self.pca.fit_transform(embeddings) return embeddings_pca def transform(self, *args): print('Evaluating model') # Generate embeddings embeddings = [] for elem in tqdm.tqdm(zip(*args)): emb = self.embedding_model.get_embeddings(*elem) embeddings.append(emb) embeddings = np.vstack(embeddings) # Compute empirical covariance matrix (linear kernel) - Test vs Train K_te = np.dot(embeddings, self.embeddings_tr.T) embeddings_pca = self.pca.transform(K_te) return embeddings_pca
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA, KernelPCA
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold
import matplotlib.cm as cm
import scipy.io as sio

# Import test data and labels
import_test = sio.loadmat(file_loc + 'Test.mat')
import_train = sio.loadmat(file_loc + 'Train.mat')
X_train = import_train['Xtrain']
X_testing = import_test['Xtest']
Y_train = import_train['Ytrain']

# 'degree' is ignored by the 'rbf' kernel; assign the projection returned by fit_transform
pca = KernelPCA(kernel="rbf", degree=5, gamma=10)
X_train = pca.fit_transform(X_train)
#print(pca.explained_variance_ratio_)  # KernelPCA does not expose explained_variance_ratio_

#k_fold = cross_validation.KFold(len(X_train), 5)
Y_kf = Y_train.ravel()
k_fold = StratifiedKFold(Y_kf, n_folds=5)
print(k_fold)

#X, X_test, Y, Y_test = cross_validation.train_test_split(X_train, Y_train, test_size=0.2, random_state=0)
#y = Y.ravel()
#X_test = X[401:,:]
#X = X[:400,:]
#X = X[:, :2]
#Y_test = Y[401:,:]
#Y = Y[:400,:]

'''
eventsTrain = import_train['eventsTrain']
def classifyPHC():
    data = readFile()
    #data = equalizeClasses(data)
    features, labels = splitData(data)

    #fraction of the data held out for validation (1 = 100%)
    validation_size = 0.2

    #split the data into training and validation sets based on validation_size
    features_train, features_validation, labels_train, labels_validation = model_selection.train_test_split(
        features, labels, test_size=validation_size)

    #normalize data in the range [-1, 1]
    scaler = MinMaxScaler(feature_range=(-1, 1))
    #fit the scaler on the training data only, then apply it to the other data without refitting
    scaler.fit(features_train)
    features_train_scalar = scaler.transform(features_train)
    #transform the validation features without fitting on them
    features_validation_scalar = scaler.transform(features_validation)

    #configure the kernel PCA and choose the number of dimensions to keep
    pca = KernelPCA(n_components=5, kernel='rbf', fit_inverse_transform=True)
    #fit only the training features
    pca.fit(features_train_scalar)
    #dimensionality reduction of the training features
    features_train_pca = pca.transform(features_train_scalar)
    #dimensionality reduction of the validation features
    features_validation_pca = pca.transform(features_validation_scalar)

    #reconstruction error on the training data
    reconstruct_data = pca.inverse_transform(features_train_pca)
    #element-wise reconstruction error between the scaled training data and its reconstruction
    #(the original code used error_matrix without defining it; an absolute difference is assumed here)
    error_matrix = np.abs(features_train_scalar - reconstruct_data)
    error_percentage = (
        sum(sum(error_matrix)) /
        (len(features_train_scalar) * len(features_train_scalar[0]))) * 100
    #len(features_train_scalar) = len(reconstruct_data) = 89
    #len(features_train_scalar[0]) = len(reconstruct_data[0]) = 13
    #len(error_matrix) = 89, one row per sample
    #len(error_matrix[0]) = 13, one entry per feature of every sample
    #the inner sum gives the total error per feature, so we sum again
    #and divide by the 89 samples * 13 features
    print 'Information loss of KernelPCA:', error_percentage, '% \n'

    lda = LinearDiscriminantAnalysis()
    lda.fit(features_train_pca, labels_train)
    features_train_pca = lda.transform(features_train_pca)
    features_validation_pca = lda.transform(features_validation_pca)

    #print the shapes of the arrays just to check
    print 'feature training array: ', features_train_pca.shape, 'and label training array: ', labels_train.shape
    print 'feature testing array: ', features_validation_pca.shape, 'and label testing array: ', labels_validation.shape, '\n'

    #take the best pair of parameters from the parameter-tuning procedure
    #paramTuning(features_train, labels_train, 5)

    #initialize our model
    #svm = SVC(kernel='rbf', C=10, gamma=0.0001, decision_function_shape='ovo')
    svm = KNeighborsClassifier(n_neighbors=3)

    #train our model with the data that we previously processed
    svm.fit(features_train_pca, labels_train)

    #now test our model with the validation data
    predicted_labels = svm.predict(features_validation_pca)
    accuracy = accuracy_score(labels_validation, predicted_labels)
    print 'Classification accuracy: ', accuracy * 100, '\n'

    #see the accuracy of the training procedure
    predicted_labels_train = svm.predict(features_train_pca)
    accuracy_train = accuracy_score(labels_train, predicted_labels_train)
    print 'Training accuracy: ', accuracy_train * 100, '\n'

    #confusion matrix to illustrate the misclassifications of each class
    conf_matrix = confusion_matrix(labels_validation, predicted_labels)
    print 'Confusion matrix: \n', conf_matrix, '\n'

    print 'Support class 0 class 1 class 2:'
    #calculate the support of each class
    print ' ', conf_matrix[0][0] + conf_matrix[0][1] + conf_matrix[0][2], \
        ' ', conf_matrix[1][0] + conf_matrix[1][1] + conf_matrix[1][2], \
        ' ', conf_matrix[2][0] + conf_matrix[2][1] + conf_matrix[2][2], '\n'

    #calculate the accuracy of each class (float casts avoid integer division under Python 2)
    edema = (float(conf_matrix[0][0]) /
             (conf_matrix[0][0] + conf_matrix[0][1] + conf_matrix[0][2])) * 100
    paralysis = (float(conf_matrix[1][1]) /
                 (conf_matrix[1][0] + conf_matrix[1][1] + conf_matrix[1][2])) * 100
    normal = (float(conf_matrix[2][2]) /
              (conf_matrix[2][0] + conf_matrix[2][1] + conf_matrix[2][2])) * 100

    #see the details of the classification
    print 'For class 0 edema cases:', conf_matrix[0][0], 'classified correctly and', \
        conf_matrix[0][1] + conf_matrix[0][2], 'misclassified,', edema, 'accuracy \n'
    print 'For class 1 paralysis cases:', conf_matrix[1][1], 'classified correctly and', \
        conf_matrix[1][0] + conf_matrix[1][2], 'misclassified,', paralysis, 'accuracy\n'
    print 'For class 2 normal cases:', conf_matrix[2][2], 'classified correctly and', \
        conf_matrix[2][0] + conf_matrix[2][1], 'misclassified,', normal, 'accuracy \n'

    #try 5-fold cross validation
    scores = cross_val_score(svm, features_train_pca, labels_train, cv=5)
    print 'cross validation scores for 5-fold', scores, '\n'
    print 'parameters of the model: \n', svm.get_params(), '\n'
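# A small self-contained sketch (not part of classifyPHC) of the same information-loss
# measure: the nested sums divided by samples * features are just the mean absolute
# reconstruction error, expressed here as a percentage on random stand-in data.
import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.RandomState(0)
X_demo = rng.rand(89, 13)   # same shape as the comments above assume
kpca_demo = KernelPCA(n_components=5, kernel='rbf', fit_inverse_transform=True).fit(X_demo)
X_rec = kpca_demo.inverse_transform(kpca_demo.transform(X_demo))
information_loss = np.mean(np.abs(X_demo - X_rec)) * 100   # equivalent to the nested-sum formula
print('Information loss of KernelPCA:', information_loss, '%')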
#Perform PCA
pca = KernelPCA().fit(data.apply(scale_function).fillna(0))

#Review eigenvalues (only look at first ten)
#(lambdas_ was renamed to eigenvalues_ in newer scikit-learn versions)
pca.lambdas_[:10].round()

#Get relative weights
get_we = lambda x: x / x.sum()
get_we(pca.lambdas_)[:10]
#First component explains ~65% of the variability

#Construct a PCA index with just the first component
#(the minus sign presumably flips the projection so the index co-moves with the original series)
pca = KernelPCA(n_components=1).fit(data.apply(scale_function).fillna(0))
dax['PCA_1'] = pca.transform(-data.fillna(0))
dax.apply(scale_function).plot(figsize=(8, 4))

#Add in more components
pca = KernelPCA(n_components=5).fit(data.apply(scale_function).fillna(0))
pca_components = pca.transform(data.fillna(0))
weights = get_we(pca.lambdas_)
dax['PCA_5'] = np.dot(pca_components, weights)
dax.apply(scale_function).plot(figsize=(8, 4))

#############################
pl.figure()
pl.subplot(2, 2, 1, aspect='equal')
pl.title("Original space")
reds = y == 0
blues = y == 1

pl.plot(X[reds, 0], X[reds, 1], "ro")
pl.plot(X[blues, 0], X[blues, 1], "bo")
pl.xlabel("$x_1$")
pl.ylabel("$x_2$")

X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50))
X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T
# projection on the first principal component (in the phi space)
Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)
pl.contour(X1, X2, Z_grid, colors='grey', linewidths=1, origin='lower')

pl.subplot(2, 2, 2, aspect='equal')
pl.plot(X_pca[reds, 0], X_pca[reds, 1], "ro")
pl.plot(X_pca[blues, 0], X_pca[blues, 1], "bo")
pl.title("Projection by PCA")
pl.xlabel("1st principal component")
pl.ylabel("2nd component")

pl.subplot(2, 2, 3, aspect='equal')
pl.plot(X_kpca[reds, 0], X_kpca[reds, 1], "ro")
pl.plot(X_kpca[blues, 0], X_kpca[blues, 1], "bo")
pl.title("Projection by KPCA")
pl.xlabel(r"1st principal component in space induced by $\phi$")
pl.ylabel("2nd component")
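# A minimal setup sketch for the names the plotting code above relies on (X, y, kpca,
# X_pca, X_kpca); the two-circles dataset and the gamma value are assumptions, not part
# of the original script.
import numpy as np
import pylab as pl
from sklearn.datasets import make_circles
from sklearn.decomposition import PCA, KernelPCA

X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
kpca = KernelPCA(kernel="rbf", gamma=10, fit_inverse_transform=True)
X_kpca = kpca.fit_transform(X)               # kernel PCA projection used in the third panel
X_pca = PCA(n_components=2).fit_transform(X)  # linear PCA projection used in the second panel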
# fit_transform both fits and projects, so a separate fit() call beforehand is redundant
kpca_data = kpca_0.fit_transform(data)
fig = plt.figure(4)
if kpca_num == 2:
    for i in range(len(labels_predict)):
        plt.scatter(kpca_data[i, 0], kpca_data[i, 1],
                    c=color[labels_predict[i]], marker='o')
else:
    ax = fig.add_subplot(111, projection='3d')
    for i in range(len(labels_predict)):
        ax.scatter(kpca_data[i, 0], kpca_data[i, 1], kpca_data[i, 2],
                   c=color[labels_predict[i]], marker='o')

kpca_1 = KernelPCA(n_components=kpca_num, kernel='rbf', gamma=gamma, degree=degree)
kpca_1.fit(data_fs1)
kpca_data_fs1 = kpca_1.transform(data_fs1)
fig = plt.figure(5)
if kpca_num == 2:
    for i in range(len(labels_fs1_predict)):
        plt.scatter(kpca_data_fs1[i, 0], kpca_data_fs1[i, 1],
                    c=color[labels_fs1_predict[i]], marker='o')
else:
    ax = fig.add_subplot(111, projection='3d')
    for i in range(len(labels_fs1_predict)):
        ax.scatter(kpca_data_fs1[i, 0], kpca_data_fs1[i, 1], kpca_data_fs1[i, 2],
                   c=color[labels_fs1_predict[i]], marker='o')

kpca_2 = KernelPCA(n_components=kpca_num, kernel='rbf', gamma=gamma, degree=degree)
kpca_2.fit(data_fs2)
kpca_data_fs2 = kpca_2.transform(data_fs2)
fig = plt.figure(6)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    sparse_corpus_tfidf_transpose, df.iloc[:, 1], test_size=0.2, random_state=seed)

from sklearn.decomposition import KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# reduce dimensions
print('Starting dimensionality reduction')
reducer = KernelPCA(n_components=1500, kernel="cosine", random_state=seed)
corpus_train_tfidf_kpca = reducer.fit_transform(X_train)
corpus_test_tfidf_kpca = reducer.transform(X_test)
print('Finished dimensionality reduction')

# Initialize logistic regression
log_reg = LogisticRegression(C=1.0)
log_reg.fit(corpus_train_tfidf_kpca, y_train)
a = log_reg.score(corpus_test_tfidf_kpca, y_test)

print('Starting logistic regression 2')
log_reg.fit(X_train, y_train)
b = log_reg.score(X_test, y_test)
svm_lda = clf.best_estimator_
svm_lda.fit(X_train_lda, y_train)
y_train_pred = svm_lda.predict(X_train_lda)
print('Accuracy of LDA SVM model (training set):')
print(metrics.accuracy_score(y_train, y_train_pred))
y_test_pred = svm_lda.predict(X_test_lda)
print('Accuracy of LDA SVM model (testing set):')
print(metrics.accuracy_score(y_test, y_test_pred))
print("")

# kPCA (unsupervised, so the labels are not needed for fitting)
kpca = KernelPCA(n_components=10, kernel='rbf')
X_train_kpca = kpca.fit_transform(X_train_std)
X_test_kpca = kpca.transform(X_test_std)

# Logistic regression after kPCA
tuned_parameters = [{'C': np.arange(0.01, 1.0, 0.01).tolist(), 'multi_class': ['ovr']}]
clf = GridSearchCV(LogisticRegression(), tuned_parameters, scoring='accuracy', cv=5)
clf.fit(X_train_kpca, y_train)
lr_kpca = clf.best_estimator_
lr_kpca.fit(X_train_kpca, y_train)
y_train_pred = lr_kpca.predict(X_train_kpca)
print('Accuracy of kPCA Logistic Regression model (training set):')
print(metrics.accuracy_score(y_train, y_train_pred))
plt.clf()  # Clear any existing figure
axes = scatter_matrix(pca_sample_df, diagonal=d, **scatter_kwds)
plt.savefig(scatter_matrix_fp_fmt.format('pca_' + d))

# Kernel PCA feature reduction
kernels = [
    'linear',
    'poly',
    'rbf',
    # 'sigmoid',
    'cosine',
]

# Kernel PCAs are compute and memory intensive so fit on a random sample
X_sample = X.sample(n=1000)
print('Kernel PCA sample shape: {}'.format(X_sample.shape))

for kernel in kernels:
    kpca = KernelPCA(n_components=3, kernel=kernel, n_jobs=4)
    start_time = time.perf_counter()
    kpca.fit(X_sample)
    end_time = time.perf_counter()
    print('Time to fit: {:.1f}s'.format(end_time - start_time))

    kpca_df = pd.DataFrame(data=kpca.transform(X_sample), index=X_sample.index)
    kpca_df[data.DEPENDENT] = y

    for d in diagonals:
        plt.clf()  # Clear any existing figure
        axes = scatter_matrix(kpca_df, diagonal=d, **scatter_kwds)
        plt.savefig(scatter_matrix_fp_fmt.format(kernel + '_pca_' + d))

    # kernel_pcas[kernel] = pca
    print(kernel, dt.datetime.now())
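# A small follow-up sketch (assuming the X / kpca objects above): after fitting on the
# 1000-row sample, the rest of the frame can still be projected with transform(); the
# cost is one kernel evaluation per (row of X, row of X_sample) pair rather than a
# full O(n^2) fit on all rows.
X_full_kpca = pd.DataFrame(kpca.transform(X), index=X.index)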
components = 0.99  # parameter: explained-variance target, only valid for plain PCA
if PCAflag == 1:
    pca = PCA(n_components=components, svd_solver='full')
    pca.fit(train)
    train_new = pca.transform(train)
    sim_new = pca.transform(sim)
    print('pca.explained_variance_ratio_', pca.explained_variance_ratio_)
    print('sum(pca.explained_variance_ratio_)', sum(pca.explained_variance_ratio_))
    print(pca.singular_values_)
else:
    # KernelPCA only accepts an integer n_components (or None to keep all components),
    # and it has no explained_variance_ratio_ / singular_values_; its kernel-space
    # eigenvalues are available as lambdas_ (eigenvalues_ in newer scikit-learn)
    kpca = KernelPCA(n_components=None, kernel="rbf", fit_inverse_transform=True)
    kpca.fit(train)
    train_new = kpca.transform(train)
    sim_new = kpca.transform(sim)
    print('kpca.lambdas_', kpca.lambdas_)

print('train.shape', train.shape)
print('train_new.shape', train_new.shape)

if plotflag == 1:
    plt.figure(figsize=(10, 8))
    for i in range(0, category):
        plt.subplot(1, 2, 1)
        plt.scatter(train_new[i * traindataset:(i + 1) * traindataset, 0],
                    train_new[i * traindataset:(i + 1) * traindataset, 1],
                    marker='o',
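# A minimal self-contained sketch (not part of the original script) of how a KernelPCA
# analogue of explained_variance_ratio_ can be approximated from the kernel-space
# eigenvalues; the attribute is lambdas_ in older scikit-learn and eigenvalues_ from 1.0 on.
import numpy as np
from sklearn.datasets import make_circles
from sklearn.decomposition import KernelPCA

X_demo, _ = make_circles(n_samples=200, factor=0.3, noise=0.05, random_state=0)
kpca_demo = KernelPCA(n_components=10, kernel='rbf', gamma=2.0).fit(X_demo)
eigvals = getattr(kpca_demo, 'eigenvalues_', None)
if eigvals is None:              # fall back for older scikit-learn releases
    eigvals = kpca_demo.lambdas_
ratio = eigvals / eigvals.sum()  # share of kernel-space variance per component
print(ratio.round(3))
print(ratio.cumsum().round(3))   # cumulative share, analogous to the PCA plot above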