def reduceDataset(self, nr=3, method='PCA'):
    '''Reduce the dimensionality of the dataset using techniques from scikit-learn.

    Methods available: 'PCA', 'FactorAnalysis', 'KPCArbf', 'KPCApoly',
    'KPCAcosine', 'KPCAsigmoid', 'IPCA', 'FastICADeflation',
    'FastICAParallel', 'Isomap', 'LLE', 'LLEmodified', 'LLEltsa'
    '''
    dataset = self.ModelInputs['Dataset']
    # PCA
    if method == 'PCA':
        sklearn_pca = sklearnPCA(n_components=nr)
        reduced = sklearn_pca.fit_transform(dataset)
    # Factor Analysis
    elif method == 'FactorAnalysis':
        fa = FactorAnalysis(n_components=nr)
        reduced = fa.fit_transform(dataset)
    # Kernel PCA with RBF kernel
    elif method == 'KPCArbf':
        kpca = KernelPCA(nr, kernel='rbf')
        reduced = kpca.fit_transform(dataset)
    # Kernel PCA with polynomial kernel
    elif method == 'KPCApoly':
        kpca = KernelPCA(nr, kernel='poly')
        reduced = kpca.fit_transform(dataset)
    # Kernel PCA with cosine kernel
    elif method == 'KPCAcosine':
        kpca = KernelPCA(nr, kernel='cosine')
        reduced = kpca.fit_transform(dataset)
    # Kernel PCA with sigmoid kernel
    elif method == 'KPCAsigmoid':
        kpca = KernelPCA(nr, kernel='sigmoid')
        reduced = kpca.fit_transform(dataset)
    # Incremental PCA (the original comment mislabeled this branch as ICA)
    elif method == 'IPCA':
        ipca = IncrementalPCA(nr)
        reduced = ipca.fit_transform(dataset)
    # Fast ICA
    elif method == 'FastICAParallel':
        fip = FastICA(nr, algorithm='parallel')
        reduced = fip.fit_transform(dataset)
    elif method == 'FastICADeflation':
        fid = FastICA(nr, algorithm='deflation')
        reduced = fid.fit_transform(dataset)
    # Apply every technique at once
    elif method == 'All':
        self.dimensionalityReduction(nr=nr)
        return self
    self.ModelInputs.update({method: reduced})
    self.datasetsAvailable.append(method)
    return self
def Kernel_PCA(HE_MI_train_test, kernel, invTran, degree):
    '''Overview: apply Kernel PCA to the HE/MI train and test splits.'''
    my_HEtraining, my_MItraining, my_HEtest, my_MItest = HE_MI_train_test

    kpca = KernelPCA(kernel=kernel, fit_inverse_transform=invTran, degree=degree)
    # Fit once on the combined training data (assuming numpy is imported as
    # np) so that all four splits are projected into the same embedding
    # space; refitting per split, as the original code did, yields
    # incomparable coordinates.
    kpca.fit(np.concatenate([my_HEtraining, my_MItraining]))
    HE_training_kpca = kpca.transform(my_HEtraining)
    MI_training_kpca = kpca.transform(my_MItraining)
    HE_test_kpca = kpca.transform(my_HEtest)
    MI_test_kpca = kpca.transform(my_MItest)

    # Keep only the first two components of each projection.
    HE_training_KPCA_2dim = [(pt[0], pt[1]) for pt in HE_training_kpca]
    MI_training_KPCA_2dim = [(pt[0], pt[1]) for pt in MI_training_kpca]
    HE_test_KPCA_2dim = [(pt[0], pt[1]) for pt in HE_test_kpca]
    MI_test_KPCA_2dim = [(pt[0], pt[1]) for pt in MI_test_kpca]

    return [HE_training_KPCA_2dim, MI_training_KPCA_2dim,
            HE_test_KPCA_2dim, MI_test_KPCA_2dim]
def dimensionalityReduction(self, nr=5):
    '''Apply all the dimensionality reduction techniques available in this class.

    Techniques available: 'PCA', 'FactorAnalysis', 'KPCArbf', 'KPCApoly',
    'KPCAcosine', 'KPCAsigmoid', 'IPCA', 'FastICADeflation',
    'FastICAParallel', 'Isomap', 'LLE', 'LLEmodified', 'LLEltsa'
    '''
    dataset = self.ModelInputs['Dataset']
    sklearn_pca = sklearnPCA(n_components=nr)
    p_components = sklearn_pca.fit_transform(dataset)
    fa = FactorAnalysis(n_components=nr)
    factors = fa.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='rbf')
    rbf = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='poly')
    poly = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='cosine')
    cosine = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='sigmoid')
    sigmoid = kpca.fit_transform(dataset)
    ipca = IncrementalPCA(nr)
    i_components = ipca.fit_transform(dataset)
    # Note: the original code swapped these two assignments, storing the
    # 'parallel' projection under the 'FastICADeflation' key and vice versa.
    fip = FastICA(nr, algorithm='parallel')
    fid = FastICA(nr, algorithm='deflation')
    ficaP = fip.fit_transform(dataset)
    ficaD = fid.fit_transform(dataset)
    '''isomap = Isomap(n_components=nr).fit_transform(dataset)
    try:
        lle1 = LocallyLinearEmbedding(n_components=nr).fit_transform(dataset)
    except ValueError:
        lle1 = LocallyLinearEmbedding(n_components=nr, eigen_solver='dense').fit_transform(dataset)
    try:
        lle2 = LocallyLinearEmbedding(n_components=nr, method='modified').fit_transform(dataset)
    except ValueError:
        lle2 = LocallyLinearEmbedding(n_components=nr, method='modified', eigen_solver='dense').fit_transform(dataset)
    try:
        lle3 = LocallyLinearEmbedding(n_components=nr, method='ltsa').fit_transform(dataset)
    except ValueError:
        lle3 = LocallyLinearEmbedding(n_components=nr, method='ltsa', eigen_solver='dense').fit_transform(dataset)'''
    values = [p_components, factors, rbf, poly, cosine, sigmoid,
              i_components, ficaD, ficaP]  # , isomap, lle1, lle2, lle3]
    keys = ['PCA', 'FactorAnalysis', 'KPCArbf', 'KPCApoly', 'KPCAcosine',
            'KPCAsigmoid', 'IPCA', 'FastICADeflation', 'FastICAParallel']
    # , 'Isomap', 'LLE', 'LLEmodified', 'LLEltsa']
    self.ModelInputs.update(dict(zip(keys, values)))
    for key in keys:
        self.datasetsAvailable.append(key)
    return self
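The two methods above maintain a mapping from technique names to reduced datasets. A self-contained sketch of that pattern, assuming only scikit-learn and NumPy (the random dataset and the variable names are illustrative, not part of the class above):

import numpy as np
from sklearn.decomposition import (PCA, KernelPCA, FactorAnalysis,
                                   IncrementalPCA, FastICA)

rng = np.random.RandomState(0)
dataset = rng.rand(100, 6)
nr = 3
reducers = {
    'PCA': PCA(n_components=nr),
    'FactorAnalysis': FactorAnalysis(n_components=nr),
    'KPCArbf': KernelPCA(nr, kernel='rbf'),
    'KPCApoly': KernelPCA(nr, kernel='poly'),
    'KPCAcosine': KernelPCA(nr, kernel='cosine'),
    'KPCAsigmoid': KernelPCA(nr, kernel='sigmoid'),
    'IPCA': IncrementalPCA(nr),
    'FastICAParallel': FastICA(nr, algorithm='parallel'),
    'FastICADeflation': FastICA(nr, algorithm='deflation'),
}
# Build a dict of named reductions, like the ModelInputs mapping above.
model_inputs = {name: r.fit_transform(dataset) for name, r in reducers.items()}
for name, reduced in model_inputs.items():
    print(name, reduced.shape)  # each is (100, 3)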
def _dimReduce(df, method='pca', n_components=2, labels=None,
               standardize=False, smatFunc=None, ldaShrinkage='auto'):
    if method == 'kpca':
        """By using KernelPCA for dimensionality reduction we don't need
        to impute missing values"""
        if smatFunc is None:
            smatFunc = corrTSmatFunc
        pca = KernelPCA(kernel='precomputed', n_components=n_components)
        smat = smatFunc(df).values
        xy = pca.fit_transform(smat)
        pca.components_ = pca.alphas_
        pca.explained_variance_ratio_ = pca.lambdas_ / pca.lambdas_.sum()
        return xy, pca
    elif method == 'pca':
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean()) / vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        pca = PCA(n_components=n_components)
        xy = pca.fit_transform(normed)
        return xy, pca
    elif method == 'lda':
        if labels is None:
            raise ValueError('labels needed to perform LDA')
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean()) / vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        if df.shape[1] > df.shape[0]:
            """Pre-PCA step"""
            ppca = PCA(n_components=int(df.shape[0] / 1.5))
            # use the centered/standardized data; the original fit on raw df
            normed = ppca.fit_transform(normed)
        lda = LinearDiscriminantAnalysis(solver='eigen',
                                         shrinkage=ldaShrinkage,
                                         n_components=n_components)
        lda.fit(normed, labels.values)
        lda.explained_variance_ratio_ = (
            np.abs(lda.explained_variance_ratio_) /
            np.abs(lda.explained_variance_ratio_).sum())
        xy = lda.transform(normed)
        return xy, lda  # the original branch fell through without returning
    elif method == 'pls':
        if labels is None:
            raise ValueError('labels needed to perform PLS')
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean()) / vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        pls = PLSRegression(n_components=n_components)
        pls.fit(normed, labels)
        pls.explained_variance_ratio_ = np.zeros(n_components)
        xy = pls.x_scores_
        return xy, pls
def test_remove_zero_eig():
    X = np.array([[1 - 1e-30, 1], [1, 1], [1, 1 - 1e-20]])

    # n_components=None (default) => remove_zero_eig is True
    kpca = KernelPCA()
    Xt = kpca.fit_transform(X)
    assert_equal(Xt.shape, (3, 0))

    kpca = KernelPCA(n_components=2)
    Xt = kpca.fit_transform(X)
    assert_equal(Xt.shape, (3, 2))

    kpca = KernelPCA(n_components=2, remove_zero_eig=True)
    Xt = kpca.fit_transform(X)
    assert_equal(Xt.shape, (3, 0))
def MyPCA():
    X, y = circle_data()
    kpca = KernelPCA(kernel='rbf', fit_inverse_transform=True, gamma=10)
    X_kpca = kpca.fit_transform(X)
    pca = PCA()
    x_pca = pca.fit_transform(X)
    return X_kpca
def kPCA_visualization1d(X, y):
    kpca = KernelPCA(kernel="linear", fit_inverse_transform=True, gamma=10,
                     n_components=2)
    X_kpca = kpca.fit_transform(X)
    X_back = kpca.inverse_transform(X_kpca)
    pca = PCA(n_components=1)
    X_pca = pca.fit_transform(X)

    # Collect the first kPCA component of each sample, split by class.
    class_1 = []
    class_0 = []
    for i in range(0, len(y)):
        if y[i] == 1:
            class_1.append(X_kpca[i][0])
        else:
            class_0.append(X_kpca[i][0])
    print("check")
    print(class_1[:10])

    from matplotlib import pyplot
    pyplot.hist(class_1, 50, alpha=0.5, label='class 1')
    pyplot.hist(class_0, 50, alpha=0.5, label='class 0')
    pyplot.legend(loc='upper right')
    pyplot.show()
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=True)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            # (comparing an ndarray to [] is ambiguous; check size instead)
            assert_not_equal(X_fit_transformed.size, 0)

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            X_pred2 = kpca.inverse_transform(X_pred_transformed)
            assert_equal(X_pred2.shape, X_pred.shape)
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert_equal(kwargs, {})  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly", histogram):
            # histogram kernel produces singular matrix inside linalg.solve
            # XXX use a least-squares approximation?
            inv = not callable(kernel)

            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=inv)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert_not_equal(X_fit_transformed.size, 0)

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            if inv:
                X_pred2 = kpca.inverse_transform(X_pred_transformed)
                assert_equal(X_pred2.shape, X_pred.shape)
def test_compare_clinical_kernel(self):
    x_full, y, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1',
                                     standardize_numeric=False,
                                     to_numeric=False)

    trans = ClinicalKernelTransform()
    trans.fit(x_full)

    x = encode_categorical(standardize(x_full))

    kpca = KernelPCA(kernel=trans.pairwise_kernel)
    xt = kpca.fit_transform(x)

    nrsvm = FastSurvivalSVM(optimizer='rbtree', tol=1e-8, max_iter=1000,
                            random_state=0)
    nrsvm.fit(xt, y)

    rsvm = FastKernelSurvivalSVM(optimizer='rbtree',
                                 kernel=trans.pairwise_kernel,
                                 tol=1e-8, max_iter=1000, random_state=0)
    rsvm.fit(x, y)

    pred_nrsvm = nrsvm.predict(kpca.transform(x))
    pred_rsvm = rsvm.predict(x)

    self.assertEqual(len(pred_nrsvm), len(pred_rsvm))

    c1 = concordance_index_censored(y['fstat'], y['lenfol'], pred_nrsvm)
    c2 = concordance_index_censored(y['fstat'], y['lenfol'], pred_rsvm)

    self.assertAlmostEqual(c1[0], c2[0])
    self.assertTupleEqual(c1[1:], c2[1:])
class RegionSplitter_PCA_KMean():

    def __init__(self, data, label):
        data_dim_num = len(data[0])
        label_dim_num = len(label[0])

        self.n_comp = max(1, data_dim_num)
        self.pca = PCA(n_components=self.n_comp)
        data = self.pca.fit_transform(data)

        # k-means clustering on the PCA-projected data
        # (the original zipped and unzipped the data, a round-trip no-op)
        self.clusterer = KMeans(n_clusters=2, init='k-means++')
        self.clusterer.fit(data)

    def classify(self, data):
        if not isinstance(data, tuple):
            raise TypeError("data must be a tuple")
        # transform/predict expect 2D arrays, so wrap the single sample
        data = self.pca.transform([data])[0]
        group = self.clusterer.predict([data])[0]
        return group == 0
def pca(X, gamma1):
    kpca = KernelPCA(kernel='rbf', fit_inverse_transform=False, gamma=gamma1)
    X_kpca = kpca.fit_transform(X)
    print('X', X.shape)
    # alphas_/lambdas_ were renamed eigenvectors_/eigenvalues_ in scikit-learn 1.0
    print('alphas', kpca.alphas_.shape)
    print('lambdas', kpca.lambdas_.shape)
    # X_back = kpca.inverse_transform(X_kpca)
    return X_kpca
def isomap(self, num_dims=None, directed=None):
    '''Isomap embedding.

    num_dims : dimension of embedded coordinates, defaults to input dimension
    directed : used for .shortest_path() calculation
    '''
    W = -0.5 * self.shortest_path(directed=directed) ** 2
    kpca = KernelPCA(n_components=num_dims, kernel='precomputed')
    return kpca.fit_transform(W)
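The same trick in a self-contained form, assuming scikit-learn and SciPy are available and that the neighbour graph comes out connected (the dataset and neighbour count are arbitrary choices): Isomap amounts to Kernel PCA on -0.5 * D**2, where D holds graph shortest-path distances.

import numpy as np
from sklearn.datasets import make_swiss_roll
from sklearn.decomposition import KernelPCA
from sklearn.neighbors import kneighbors_graph
from scipy.sparse.csgraph import shortest_path

X, _ = make_swiss_roll(n_samples=300, random_state=0)
# k-nearest-neighbour graph with edge weights equal to Euclidean distances
graph = kneighbors_graph(X, n_neighbors=10, mode='distance')
D = shortest_path(graph, method='auto', directed=False)
K = -0.5 * D ** 2  # KernelPCA centers this Gram-like matrix internally
embedding = KernelPCA(n_components=2, kernel='precomputed').fit_transform(K)
print(embedding.shape)  # (300, 2)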
class StyloPCA(StyloClassifier):

    def __init__(self, corpus, n_components=2, kernel=None):
        StyloClassifier.__init__(self, corpus)
        data = self.data_frame[self.cols].values
        self.n_components = n_components
        self.kernel = kernel
        if not kernel:
            self.pca = PCA(n_components=self.n_components)
        else:
            self.pca = KernelPCA(kernel=kernel, gamma=10)
        self.pca_data = self.pca.fit_transform(
            StandardScaler().fit_transform(data))

    def plot_pca(self, out_file=None):
        self.create_plot_pca()
        plt.show()
        # if out_file:
        #     plt.savefig(out_file)

    def create_plot_pca(self):
        plt.figure(1)
        plt.clf()
        all_authors = set(self.data_frame["Author"])
        for a in all_authors:
            rows = self.data_frame.loc[self.data_frame["Author"] == a]
            indices = self.data_frame.loc[self.data_frame["Author"] == a].index
            plt.plot(self.pca_data[indices, 0], self.pca_data[indices, 1],
                     'o', markersize=7,
                     color=(random.random(), random.random(), random.random()),
                     alpha=0.5, label=rows["Author_Orig"][indices[0]])
        plt.xlabel(self.cols[0])
        plt.ylabel(self.cols[1])
        plt.legend()
        plt.title('Transformed stylometry data using PCA')

    def plot_explained_variance(self, out_file=None):
        self.create_plot_explained_variance()
        plt.show()

    def create_plot_explained_variance(self):
        if not self.kernel:
            evr = self.pca.explained_variance_
        else:
            evr = self.pca.lambdas_
        print(evr)
        fig = plt.figure()
        ax = fig.add_subplot(111)
        tot = sum(evr)
        var_exp = [(i / tot) * 100 for i in sorted(evr, reverse=True)]
        cum_var_exp = np.cumsum(var_exp)
        plt.plot(range(1, len(cum_var_exp) + 1), cum_var_exp, 'b*-')
        width = .8
        plt.bar(range(1, len(var_exp) + 1), var_exp, width=width)
        plt.grid(True)
        ax.set_ylim((0, 110))
        plt.xlabel('n_components')
        plt.ylabel('Percentage of variance explained')
        plt.title('Variance Explained vs. n_components')
def Kernel_PCA(HE_MI_train_test, kernel, invTran, degree):
    '''Overview: apply Kernel PCA to the HE/MI train and test splits.'''
    my_HEtraining, my_MItraining, my_HEtest, my_MItest = HE_MI_train_test

    kpca = KernelPCA(kernel=kernel, fit_inverse_transform=invTran, degree=degree)
    # Fit once on the combined training data (assuming numpy as np), then
    # transform every split; refitting per split puts each one in a
    # different embedding space.
    kpca.fit(np.concatenate([my_HEtraining, my_MItraining]))
    HE_training_kpca = kpca.transform(my_HEtraining)
    MI_training_kpca = kpca.transform(my_MItraining)
    HE_test_kpca = kpca.transform(my_HEtest)
    MI_test_kpca = kpca.transform(my_MItest)

    return [HE_training_kpca, MI_training_kpca, HE_test_kpca, MI_test_kpca]
def main():
    definition = load_definition()
    data = np.load(os.path.join(ROOT, definition.embedding))
    uuids = np.load(os.path.join(ROOT, definition.uuids))
    pca = KernelPCA(**definition.pca)
    tsne = TSNE(**definition.tsne)
    data = pca.fit_transform(data)
    data = tsne.fit_transform(data)
    plot_vectors(data, uuids, definition.sources, definition.output)
def kernelPCA(data, labels, new_dimension):
    print("start kernel pca...")
    if hasattr(data, "toarray"):
        data = data.toarray()
    start = time.time()
    pca = KernelPCA(fit_inverse_transform=True, gamma=10,
                    n_components=new_dimension, alpha=2)
    reduced = pca.fit_transform(data)
    end = time.time()
    return (reduced, end - start)
def try_kpca(kernel, invTran, degree):
    '''Overview: apply Kernel PCA.'''
    MyDataSet = training_test(24, 150)
    my_HEtraining = MyDataSet[0]
    my_MItraining = MyDataSet[1]
    my_HEtest = MyDataSet[2]
    my_MItest = MyDataSet[3]

    from sklearn.decomposition import PCA, KernelPCA
    import numpy as np
    kpca = KernelPCA(kernel=kernel, fit_inverse_transform=invTran, degree=degree)
    # Fit once on the combined training data so the four projections share
    # one embedding space (the original refit the model for every split).
    kpca.fit(np.concatenate([my_HEtraining, my_MItraining]))
    HE_training_kpca = kpca.transform(my_HEtraining)
    MI_training_kpca = kpca.transform(my_MItraining)
    HE_test_kpca = kpca.transform(my_HEtest)
    MI_test_kpca = kpca.transform(my_MItest)
    return [HE_training_kpca, MI_training_kpca, HE_test_kpca, MI_test_kpca]
def fit(self, X, num, method='dijkstra'):
    # Construct the k-nearest-neighbour graph
    knn = KNN(num).fit(X)
    # Find shortest paths through the graph
    if method == 'dijkstra':
        result = dijkstra(knn)
    else:
        result = shortest_path(knn, method=method)
    # Multidimensional scaling via Kernel PCA on the precomputed
    # -0.5 * D**2 matrix (classical MDS); a linear KernelPCA on the raw
    # distance matrix, as the original did, is not equivalent.
    model = KernelPCA(n_components=num, kernel='precomputed')
    return model.fit_transform(-0.5 * result ** 2)
def test_kernel_pca_deterministic_output():
    rng = np.random.RandomState(0)
    X = rng.rand(10, 10)
    eigen_solver = ('arpack', 'dense')

    for solver in eigen_solver:
        transformed_X = np.zeros((20, 2))
        for i in range(20):
            kpca = KernelPCA(n_components=2, eigen_solver=solver,
                             random_state=rng)
            transformed_X[i, :] = kpca.fit_transform(X)[0]
        assert_allclose(transformed_X,
                        np.tile(transformed_X[0, :], 20).reshape(20, 2))
def isomap(X, n_neighbors, metric):
    """
    Based on sklearn, Author: Jake Vanderplas -- <*****@*****.**>
    License: BSD, (C) 2011
    """
    # the original referenced an undefined name D here; use the input X
    kng = kneighbors_graph(X, n_neighbors=n_neighbors, metric=metric)
    dist_matrix_ = graph_shortest_path(kng, method='auto', directed=False)
    kernel_pca_ = KernelPCA(n_components=2, kernel="precomputed",
                            eigen_solver='auto')
    G = dist_matrix_ ** 2
    G *= -0.5
    return kernel_pca_.fit_transform(G)
def reduce_kpca(X, kern, retall=False):
    """
    reduce_kpca(X, kern, retall=False)

    Reduce dimensionality by Kernel PCA.
    """
    kpca = KernelPCA(kernel=kern, fit_inverse_transform=True)
    X_kpca = kpca.fit_transform(X)
    X_back = kpca.inverse_transform(X_kpca)
    if not retall:
        return X_kpca, X_back
    else:
        return X_kpca, X_back, kpca
def RunKPCAScikit(q):
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    data = np.genfromtxt(self.dataset, delimiter=',')

    with totalTimer:
        # Get the new dimensionality, if it is necessary.
        dimension = re.search('-d (\d+)', options)
        if not dimension:
            d = data.shape[1]
        else:
            d = int(dimension.group(1))
            if (d > data.shape[1]):
                Log.Fatal("New dimensionality (" + str(d) + ") cannot be greater "
                          + "than existing dimensionality (" + str(data.shape[1]) + ")!")
                q.put(-1)
                return -1

        # Get the kernel type and make sure it is valid.
        kernel = re.search("-k ([^\s]+)", options)
        try:
            if not kernel:
                Log.Fatal("Choose kernel type, valid choices are 'linear',"
                          + " 'hyptan' and 'polynomial'.")
                q.put(-1)
                return -1
            elif kernel.group(1) == "linear":
                model = KernelPCA(n_components=d, kernel="linear")
            elif kernel.group(1) == "hyptan":
                model = KernelPCA(n_components=d, kernel="sigmoid")
            elif kernel.group(1) == "polynomial":
                degree = re.search('-D (\d+)', options)
                degree = 1 if not degree else int(degree.group(1))
                model = KernelPCA(n_components=d, kernel="poly", degree=degree)
            else:
                Log.Fatal("Invalid kernel type (" + kernel.group(1) + "); valid "
                          + "choices are 'linear', 'hyptan' and 'polynomial'.")
                q.put(-1)
                return -1

            out = model.fit_transform(data)
        except Exception as e:
            q.put(-1)
            return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def project(X, kde=False, kernel=False, gamma=10):
    if kernel:
        kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=gamma)
        reduced_data = kpca.fit_transform(X)
    else:
        pca = PCA(n_components=2).fit(X)
        print(pca.explained_variance_ratio_)
        print(pca.components_)
        reduced_data = pca.transform(X)
    if kde:
        with sns.axes_style("white"):
            sns.jointplot(reduced_data[:, 0], reduced_data[:, 1], kind="kde")
        plt.show()
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    return reduced_data
def test_kernel_pca_sparse():
    rng = np.random.RandomState(0)
    X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
    X_pred = sp.csr_matrix(rng.random_sample((2, 4)))

    for eigen_solver in ("auto", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=False)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])
def kPCA_visualization2d(X, y):
    kpca = KernelPCA(kernel="linear", fit_inverse_transform=True, gamma=10,
                     n_components=2)
    X_kpca = kpca.fit_transform(X)
    X_back = kpca.inverse_transform(X_kpca)
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    class_1 = []
    class_0 = []
    for i in range(0, len(y)):
        if y[i] == 1:
            class_1.append(X_kpca[i])
        else:
            class_0.append(X_kpca[i])

    class_0_x = [x[0] for x in class_0]
    class_0_y = [x[1] for x in class_0]
    class_1_x = [x[0] for x in class_1]
    class_1_y = [x[1] for x in class_1]

    # Plot the first two kPCA components per class
    # (the original set a second, stale "Projection by PCA" title)
    plt.title("kPCA kernel = linear")
    plt.plot(class_0_x, class_0_y, "ro")
    plt.plot(class_1_x, class_1_y, "go")
    plt.xlabel("1st principal component")
    plt.ylabel("2nd component")
    plt.show()
def test_nested_circles():
    # Test the linear separability of the first 2D KPCA transform
    X, y = make_circles(n_samples=400, factor=0.3, noise=0.05,
                        random_state=0)

    # 2D nested circles are not linearly separable
    train_score = Perceptron().fit(X, y).score(X, y)
    assert_less(train_score, 0.8)

    # Project the circles data into the first 2 components of a RBF Kernel
    # PCA model.
    # Note that the gamma value is data dependent. If this test breaks
    # and the gamma value has to be updated, the Kernel PCA example will
    # have to be updated too.
    kpca = KernelPCA(kernel="rbf", n_components=2,
                     fit_inverse_transform=True, gamma=2.0)
    X_kpca = kpca.fit_transform(X)

    # The data is perfectly linearly separable in that space
    train_score = Perceptron().fit(X_kpca, y).score(X_kpca, y)
    assert_equal(train_score, 1.0)
def main():
    filename1 = "HE.csv"
    filename2 = "MI.csv"
    csv_data = read_csv(filename1=filename1, filename2=filename2)
    csv_data = np.array(csv_data)

    total_matrix = []
    matrix_key = []
    train_mat = []
    test_mat = []
    for x in csv_data:
        for idx in x:
            matrix_key.append(idx)
            total_matrix.append(x[idx])
    total_matrix = np.array(total_matrix)
    print(total_matrix.shape)

    y = [1] * 37 + [-1] * 208
    print(total_matrix[0])

    kpca = KernelPCA(n_components=2, kernel='rbf')
    A = kpca.fit_transform(total_matrix)
    '''
    for x in total_matrix[:37]:
        plt.plot(x, 'b')
    for x in total_matrix[37:]:
        plt.plot(x, 'r')
    '''
    for idx in A[:37]:
        plt.plot(idx[0], idx[1], 'bo')
    for idx in A[37:]:
        plt.plot(idx[0], idx[1], 'ro')
    plt.show()
def dim_reduce(self, reduce_method=None, n_components=None):
    '''
    Dimensionality reduction only.

    :param reduce_method: The method for dimensionality reduction. Can be ...
    :param n_components: Target dimension.
    '''
    if reduce_method is None:
        return self.X_train
    elif reduce_method == 'pca':
        print('performing pca dimensionality reduction.')
        pca = PCA(n_components=n_components, whiten=False)
        self.X_train = pca.fit_transform(self.X_train)
    elif reduce_method == 'kpca':
        print('performing kpca dimensionality reduction.')
        kpca = KernelPCA(n_components=n_components, kernel='rbf',
                         eigen_solver='arpack')
        self.X_train = kpca.fit_transform(self.X_train)
    # return the (possibly reduced) training data in every branch;
    # the original returned it only for reduce_method=None
    return self.X_train
def plotModuleEmbedding(dmatDf, labels, dropped=None, method='kpca',
                        plotLabels=True, plotDims=[0, 1], weights=None,
                        txtSize='large'):
    """Embed cytokine correlation matrix to visualize cytokine clusters"""
    uLabels = np.unique(labels).tolist()
    n_components = max(plotDims) + 1
    dmat = dmatDf.values

    if method == 'kpca':
        """By using KernelPCA for dimensionality reduction we don't need
        to impute missing values"""
        pca = KernelPCA(kernel='precomputed', n_components=n_components)
        # turn the distance matrix into a similarity (Gram-like) matrix
        gram = 1 - (dmat / dmat.max())
        xy = pca.fit_transform(gram)
    elif method == 'tsne':
        xy = tsne.run_tsne(dmat)
    elif method == 'sklearn-tsne':
        tsneObj = TSNE(n_components=n_components, metric='precomputed',
                       random_state=0)
        xy = tsneObj.fit_transform(dmat)

    colors = palettable.colorbrewer.get_map('Set1', 'qualitative',
                                            len(uLabels)).mpl_colors
    figh = plt.gcf()
    figh.clf()
    axh = figh.add_axes([0.03, 0.03, 0.94, 0.94])
    axh.axis('off')
    figh.set_facecolor('white')
    annotationParams = dict(xytext=(0, 5), textcoords='offset points',
                            size=txtSize)
    for cyi, cy in enumerate(dmatDf.columns):
        if dropped is not None and dropped[cy]:
            cyLab = '*' + cy
            alpha = 0.3
        else:
            cyLab = cy
            alpha = 0.8
        if plotLabels:
            axh.annotate(cyLab, xy=(xy[cyi, plotDims[0]], xy[cyi, plotDims[1]]),
                         **annotationParams)
        col = colors[uLabels.index(labels[cyi])]
        if weights is None:
            s = 100
        else:
            s = weights[cy] * 200 + 10
        axh.scatter(xy[cyi, plotDims[0]], xy[cyi, plotDims[1]], marker='o',
                    s=s, alpha=alpha, c=col)
    plt.draw()
# print(np.shape(data_train_poly))
# pca = PCA(n_components=2)
# data_proj = pca.fit_transform(np.transpose(data_train_poly))
# print(np.shape(data_proj))
# ae_proj = data_proj[ae_index, :]
# ao_proj = data_proj[ao_index, :]
# uw_proj = data_proj[uw_index, :]
# iy_proj = data_proj[iy_index, :]
# aa_proj = data_proj[aa_index, :]
# eh_proj = data_proj[eh_index, :]

# In[34]:

kpca = KernelPCA(kernel="poly", coef0=2, degree=2, n_components=2)
data_train = np.transpose(data_train)
data_proj = kpca.fit_transform(data_train)
print(np.shape(data_proj))
ae_proj = data_proj[ae_index, :]
ao_proj = data_proj[ao_index, :]
uw_proj = data_proj[uw_index, :]
iy_proj = data_proj[iy_index, :]
aa_proj = data_proj[aa_index, :]
eh_proj = data_proj[eh_index, :]

# In[35]:

plt.scatter(ae_proj[:, 0], ae_proj[:, 1], c='r', marker='o')
plt.scatter(ao_proj[:, 0], ao_proj[:, 1], c='g', marker='o')
plt.scatter(uw_proj[:, 0], uw_proj[:, 1], c='b', marker='o')
plt.scatter(iy_proj[:, 0], iy_proj[:, 1], c='y', marker='o')
plt.scatter(aa_proj[:, 0], aa_proj[:, 1], c='c', marker='o')
# sklearn.cross_validation was removed; these utilities now live in
# sklearn.model_selection
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA, KernelPCA
import matplotlib.cm as cm

# Import test data and labels
import_test = sio.loadmat(file_loc + 'Test.mat')
import_train = sio.loadmat(file_loc + 'Train.mat')
X_train = import_train['Xtrain']
X_testing = import_test['Xtest']
Y_train = import_train['Ytrain']

# note: degree is ignored by the rbf kernel
pca = KernelPCA(kernel="rbf", degree=5, gamma=10)
pca.fit(X_train)
# print(pca.explained_variance_ratio_)  # KernelPCA has no such attribute
X_train = pca.transform(X_train)

Y_kf = Y_train.ravel()
k_fold = StratifiedKFold(n_splits=5)  # folds come from k_fold.split(X, y)
print(k_fold)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import KernelPCA
from sklearn import datasets
import pandas as pd

iris = datasets.load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names

data1 = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                     columns=iris['feature_names'] + ['target'])

kpca = KernelPCA(n_components=2, kernel="rbf")
# fit on the features only; the original fit on data1, which leaks the
# 'target' column into the projection
data1_kpca = kpca.fit_transform(X)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
colors = (
    (1, 0, 0), (0, 1, 0), (0, 0, 1),
    (0.5, 0.5, 0), (0, 0.5, 0.5), (0.5, 0, 0.5),
    (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
    (0.5, 0.3, 0.2),
)
print('LDA_LR train/test accuracies %.3f/%.3f' %
      (lda_lr_train_score, lda_lr_test_score))

##########LDA&SVM##########################################
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(X_train_lda, y_train)
lda_svm_train_pred = svm.predict(X_train_lda)
lda_svm_test_pred = svm.predict(X_test_lda)
lda_svm_train_score = accuracy_score(y_train, lda_svm_train_pred)
lda_svm_test_score = accuracy_score(y_test, lda_svm_test_pred)
print('LDA_SVM train/test accuracies %.3f/%.3f' %
      (lda_svm_train_score, lda_svm_test_score))

##########kPCA#############################################
kpca = KernelPCA(n_components=2, kernel='rbf', gamma=0.1)
X_train_kpca = kpca.fit_transform(X_train_std)
X_test_kpca = kpca.transform(X_test_std)
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(X_train_kpca, y_train)
kpca_train_pred = svm.predict(X_train_kpca)
kpca_test_pred = svm.predict(X_test_kpca)
kpca_train_score = accuracy_score(y_train, kpca_train_pred)
kpca_test_score = accuracy_score(y_test, kpca_test_pred)
print('KPCA train/test accuracies %.3f/%.3f' %
      (kpca_train_score, kpca_test_score))

gammas = [0.001, 0.01, 0.1, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20]
kpca_lr_train_scores = []
kpca_lr_test_scores = []
kpca_svm_train_scores = []
def dimRed(self, method="MDS", mapName=None, return_embedding=True,
           n_components=2, **kwargs):
    """
    Perform dimensionality reduction using the specified embedding method.

    Currently supported algorithms are "KPCA", "MDS", "TSNE", and "UMAP".
    The number of components (i.e. the number of dimensions to reduce the
    data to) defaults to 2, but other values may be chosen. For other
    keyword arguments (algorithm parameter settings), please refer to the
    documentation for each algorithm.

    Args
    ----
    method: str
        Which dimensionality reduction algorithm to use: "KPCA", "MDS",
        "TSNE", or "UMAP".
    mapName: str
        Optional name for the particular embedding.
    return_embedding: bool
        Return the embedding information as an embedding dataclass object.
        It is also stored in the StructureMap.embeddings attribute. Useful
        for IPython environments (see example 2).
    n_components: int
        Number of components to reduce the data to, i.e. n_components=2
        for a 2D map, n_components=3 for a 3D map.

    Documentation and helpful resources
    -----------------------------------
    k-PCA: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html
    MDS: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html
    TSNE: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
        A useful resource advising on parameter settings and interpretation:
        https://distill.pub/2016/misread-tsne/
    UMAP: https://umap-learn.readthedocs.io/en/latest/parameters.html
    """
    assert self._k is not None, "Calculate kernel matrix first!"
    method = method.strip().upper()

    # initiate the embedding scheme algorithm
    if method == "MDS":
        a = eval(f"{method}(dissimilarity='precomputed', **kwargs)")
    elif method == "KPCA":
        a = KernelPCA(n_components=n_components, kernel="precomputed", **kwargs)
    else:
        a = eval(f"{method}(metric='precomputed', **kwargs)")

    # extract coordinates
    if method in ["MDS", "TSNE"]:
        c = a.fit(self.d).embedding_
    elif method == "UMAP":
        c = a.fit_transform(self.d)
    elif method == "KPCA":
        # note: k-PCA takes the kernel form, not the distance matrix
        c = a.fit_transform(self._k)

    # store coordinates in a dataclass, along with information about the
    # scaling/soap_parameter/embedding_parameter settings
    if mapName is not None and mapName in [m.name for m in self.maps]:
        raise ValueError(f"mapName '{mapName}' already used!")
    if mapName is None:
        mapName = f"map_{len(self.embeddings)+1:02d}"

    # create the dataclass
    e = embedding(mapName, method, self._soap_parameters, self._scaling, c)
    # store the embedding in the embeddings set
    self.embeddings |= {e}
    # optionally return the embedding
    if return_embedding:
        return e
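A self-contained sketch of the "KPCA" branch above, assuming only scikit-learn (the random data, gamma, and variable names are illustrative): Kernel PCA with kernel="precomputed" consumes a kernel (similarity) matrix, not a distance matrix.

import numpy as np
from sklearn.decomposition import KernelPCA
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X = rng.rand(50, 8)
K = rbf_kernel(X, gamma=0.5)  # the precomputed kernel, like self._k above
coords = KernelPCA(n_components=2, kernel="precomputed").fit_transform(K)
print(coords.shape)  # (50, 2)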
    startInd = index * 100
    endInd = (index + 1) * 100
    allXData[startInd:endInd, :] = curX
    index = index + 1

for kk in range(len(validationIDs)):
    curFile = 'AlexNetFeatures2D/feats3D_conv2_' + validationIDs[kk] + '.npy'
    curX = np.load(curFile)
    startInd = index * 100
    endInd = (index + 1) * 100
    allXData[startInd:endInd, :] = curX
    index = index + 1

transformer = KernelPCA(n_components=20)
# transformer = random_projection.GaussianRandomProjection()
allXnew = transformer.fit_transform(allXData)
print(allXnew.shape)

allPats = np.zeros((1595, 20 * 100))
for kk in range(len(trainTestIDs) + len(validationIDs)):
    startInd = kk * 100
    endInd = (kk + 1) * 100
    curPat = allXnew[startInd:endInd, :]
    allPats[kk, :] = np.reshape(curPat, 20 * 100)

allPatsNew = transformer.fit_transform(allPats)
print(allPatsNew.shape)

Xdata = allPatsNew[0:numTrainTest, :]
Xvalid = allPatsNew[numTrainTest:(numTrainTest + numValid), :]
                                     random_state=0)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(train_img)
# Apply transform to both the training set and the test set.
train_img = scaler.transform(train_img)
test_img = scaler.transform(test_img)

# Applying kernel PCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=80, kernel='rbf')
train_img = kpca.fit_transform(train_img)
test_img = kpca.transform(test_img)  # the original re-transformed train_img here

# `pca` is assumed to be defined earlier in the script
pca.fit(train_img)
pca.n_components_
train_img = pca.transform(train_img)
test_img = pca.transform(test_img)

from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults;
# the default solver is incredibly slow, which is why it was changed to 'lbfgs'
logisticRegr = LogisticRegression(solver='lbfgs')
            s=500)
plt.legend(scatterpoints=1)
plt.tight_layout()
# plt.savefig('images/05_18.png', dpi=300)
plt.show()

# ## Kernel principal component analysis in scikit-learn

# In[51]:

from sklearn.decomposition import KernelPCA

X, y = make_moons(n_samples=100, random_state=123)
scikit_kpca = KernelPCA(n_components=2, kernel='rbf', gamma=15)
X_skernpca = scikit_kpca.fit_transform(X)

plt.scatter(X_skernpca[y == 0, 0], X_skernpca[y == 0, 1],
            color='red', marker='^', alpha=0.5)
plt.scatter(X_skernpca[y == 1, 0], X_skernpca[y == 1, 1],
            color='blue', marker='o', alpha=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.tight_layout()
transformed_data = np.array(transformed_data)
label = df['class'].unique()
print(label)

with plt.style.context("seaborn-darkgrid"):
    # iterate over the labels directly; the original's `for l in zip(label)`
    # produced 1-tuples that break the boolean mask below
    for l in label:
        plt.scatter(transformed_data[y == l, 0], transformed_data[y == l, 1],
                    label=l)
    plt.xlabel("PC 1")
    plt.ylabel("PC 2")
    plt.legend()
    plt.show()

kpca0 = KernelPCA(n_components=2, kernel='poly')
Y = kpca0.fit_transform(X)
with plt.style.context("seaborn-darkgrid"):
    for l in label:
        plt.scatter(Y[y == l, 0], Y[y == l, 1], label=l)
    plt.xlabel("PC 1")
    plt.ylabel("PC 2")
    plt.legend()
    plt.show()

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
Y_ = pca.fit_transform(X)
with plt.style.context("seaborn-darkgrid"):
    for l in label:
print('svm_train_score_cv:', LDA_svm_train_score_cv)
print('svm_test_score_cv:', LDA_svm_test_score_cv)

# kPCA transformation (test several different values for gamma)
from sklearn.decomposition import KernelPCA
print('\n' + 'kPCA Transformation:')
kPCA_df = pd.DataFrame()
for g in [0.018, 0.01899, 0.019, 0.0195, 0.02, 0.03, 0.05, 0.055, 0.06,
          0.065, 0.07, 0.08, 0.09, 0.1, 0.2]:
    kpca = KernelPCA(n_components=2, kernel='rbf', gamma=g)
    kPCA_X_train = kpca.fit_transform(X_train_std)
    kPCA_X_test = kpca.transform(X_test_std)
    array = {}  # the original's `{f: s for f, s in zip()}` is just an empty dict
    array['gamma'] = g

    # kPCA lr
    lr.fit(kPCA_X_train, y_train)
    kPCA_lr_train_score_cv = np.average(
        cross_val_score(lr, kPCA_X_train, y_train, cv=10))
    kPCA_lr_test_score_cv = np.average(
        cross_val_score(lr, kPCA_X_test, y_test, cv=10))
    array['lr_train_score_cv'] = kPCA_lr_train_score_cv
    array['lr_test_score_cv'] = kPCA_lr_test_score_cv

    # kPCA SVM
    svm.fit(kPCA_X_train, y_train)
                          x_train_sc)
# Apply the fitted transformation to the test set
x_test_pca = pca.transform(x_test_sc)

# LDA - principal components
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# Applying LDA
lda = LDA(n_components=1)
x_train_lda = lda.fit_transform(x_train_sc, y_train)
x_test_lda = lda.transform(x_test_sc)  # was x_test; use the scaled test set

# Kernel PCA - principal components
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=1, kernel='rbf')
x_train_kpca = kpca.fit_transform(x_train_sc)
x_test_kpca = kpca.transform(x_test_sc)

# Logistic regression #########################################################
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
logistic = LogisticRegression(random_state=0)

# 1) Original variables
logistic.fit(x_train, y_train)  # change x here
y_est_train = logistic.predict(x_train)  # change x, y here
# Confusion matrix
cm = confusion_matrix(y_train, y_est_train)  # change y here
prec_train = (cm[0, 0] + cm[1, 1]) / np.sum(cm)  # change the name here
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles

np.random.seed(0)
X, y = make_circles(n_samples=400, factor=.3, noise=.05)

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
X_kpca = kpca.fit_transform(X)
X_back = kpca.inverse_transform(X_kpca)
pca = PCA()
X_pca = pca.fit_transform(X)

print(X_pca.shape)
print(X_back.shape)
# print(X_kpca.shape)

# Plot results
plt.figure()
plt.subplot(2, 2, 1, aspect='equal')
plt.title("Original space")
reds = y == 0
blues = y == 1
plt.scatter(X[reds, 0], X[reds, 1], c="red", s=20, edgecolor='k')
plt.scatter(X[blues, 0], X[blues, 1], c="blue", s=20, edgecolor='k')
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
def kernelPCA(k, X):
    X = np.array(X)
    kpca = KernelPCA(n_components=k)
    kpcaresult = kpca.fit_transform(X)
    np.savetxt("KernelPCA_out.csv", kpcaresult, delimiter=",")
    return None
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
from sklearn.decomposition import KernelPCA
import numpy as np
from matplotlib.ticker import FormatStrFormatter
import rbf_kernel_pca as RKP

X, y = make_moons(n_samples=100, random_state=123)
skKpca = KernelPCA(n_components=2, kernel='rbf', gamma=15)
X_skernpca = skKpca.fit_transform(X)

plt.scatter(X_skernpca[y == 0, 0], X_skernpca[y == 0, 1],
            color='r', marker='^', alpha=0.5)
plt.scatter(X_skernpca[y == 1, 0], X_skernpca[y == 1, 1],
            color='b', marker='o', alpha=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()
y = dataset.iloc[:, 4]

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Applying Kernel PCA
kpca = KernelPCA(n_components=2, kernel='rbf')
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

# Build logistic classifier
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Predicting the Test set results
y_predict = clf.predict(X_test)

# Find confusion matrix
cm = confusion_matrix(y_test, y_predict)

# Find accuracy score
print("Accuracy is: ", accuracy_score(y_test, y_predict) * 100, "%")
y = dataset.iloc[:, 4].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.decomposition import KernelPCA
kernel_pca = KernelPCA(n_components=2, kernel='rbf')
X_train = kernel_pca.fit_transform(X_train)
X_test = kernel_pca.transform(X_test)

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_swiss_roll
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)

if 0:
    rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.04)
    X_reduced = rbf_pca.fit_transform(X)

lin_pca = KernelPCA(n_components=2, kernel="linear",
                    fit_inverse_transform=True)
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.0433,
                    fit_inverse_transform=True)
sig_pca = KernelPCA(n_components=2, kernel="sigmoid", gamma=0.001, coef0=1,
                    fit_inverse_transform=True)

y = t > 6.9
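A sketch of the pattern the GridSearchCV and Pipeline imports above suggest (the grid values are illustrative assumptions): since kPCA is unsupervised, a common way to tune its kernel and gamma is to cross-validate a downstream classifier on the labels y defined above.

clf = Pipeline([
    ("kpca", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression()),
])
param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kpca__kernel": ["rbf", "sigmoid"],
}]
# pick the kPCA settings that give the best classification accuracy
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)
print(grid_search.best_params_)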
# Importing the data set with pandas and taking the necessary variables
dataset = pd.read_csv('Social_Network_Ads.csv')
x = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the data into test and training set
# (sklearn.cross_validation was removed; use sklearn.model_selection, and
# note the correct unpacking order: X train/test first, then y train/test)
from sklearn.model_selection import train_test_split as tts
xTrain, xTest, yTrain, yTest = tts(x, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
xTrain = ss.fit_transform(xTrain)
xTest = ss.transform(xTest)

# Applying Kernel PCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=2, kernel='rbf')
xTrain = kpca.fit_transform(xTrain)
xTest = kpca.transform(xTest)  # transform, don't refit on the test set

# Checking for the most significant components
# (for two classes LDA can produce at most one discriminant component)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda
ld = lda(n_components=1)
xTrain = ld.fit_transform(xTrain, yTrain)
xTest = ld.transform(xTest)

# Fitting logistic regression to the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(xTrain, yTrain)

# Predicting the test set results
yPred = classifier.predict(xTest)

# Evaluating model performance by using the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(yTest, yPred)
#%%
# convert to a sparse, sklearn-compatible format for dimensionality reduction
sparse_corpus_tfidf = matutils.corpus2csc(corpus_tfidf)
sparse_corpus_tfidf_transpose = sparse_corpus_tfidf.transpose()
# .ix was removed from pandas; use .iloc for positional indexing
train_tfidf, test_tfidf, train_category, test_category = train_test_split(
    sparse_corpus_tfidf_transpose, united.iloc[:, 1],
    test_size=0.2, random_state=seed)

#%%
print('starting dimensionality reduction')
# reduce dimensions
from sklearn.decomposition import KernelPCA
reducer = KernelPCA(n_components=30, kernel="cosine", random_state=seed)
corpus_train_tfidf_kpca = reducer.fit_transform(train_tfidf)
corpus_test_tfidf_kpca = reducer.transform(test_tfidf)
corpus_train_tfidf_reduced = corpus_train_tfidf_kpca
corpus_test_tfidf_reduced = corpus_test_tfidf_kpca

#print('starting tsne')
#from sklearn.manifold import TSNE
#reducer = TSNE(n_components = 30, learning_rate=1000.0, n_iter=1000, metric='cosine')
#corpus_train_tfidf_reduced = reducer.fit_transform(corpus_train_tfidf_kpca)
#corpus_test_tfidf_reduced = reducer.transform(corpus_test_tfidf_kpca)

#%%
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles

np.random.seed(7)
X, y = make_circles(n_samples=500, factor=0.2, noise=0.04)

pca = PCA()
X_pca = pca.fit_transform(X)

kernel_pca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
X_kernel_pca = kernel_pca.fit_transform(X)
X_inverse = kernel_pca.inverse_transform(X_kernel_pca)

class_0 = np.where(y == 0)
class_1 = np.where(y == 1)

plt.figure()
plt.title("Original data")
plt.plot(X[class_0, 0], X[class_0, 1], "ko", mfc='none')
plt.plot(X[class_1, 0], X[class_1, 1], "kx")
plt.xlabel("1st dimension")
plt.ylabel("2nd dimension")

plt.figure()
plt.plot(X_pca[class_0, 0], X_pca[class_0, 1], "ko", mfc='none')
plt.plot(X_pca[class_1, 0], X_pca[class_1, 1], "kx")
plt.title("Data transformed using PCA")
for doc in corpus_tfidf:
    print(doc)
    break

# convert to a sparse and compatible format for dimensionality reduction using sklearn
sparse_corpus_tfidf = matutils.corpus2csc(corpus_tfidf)
sparse_corpus_tfidf_transpose = sparse_corpus_tfidf.transpose()

#%%
print('starting dimensionality reduction')
# reduce dimensions
from sklearn.decomposition import KernelPCA
import numpy as np
reducer = KernelPCA(n_components=700, kernel="cosine", random_state=seed)
corpus_tfidf_kpca = reducer.fit_transform(sparse_corpus_tfidf_transpose)

# KernelPCA has no explained_variance_ratio_, so use the variance of the
# transformed components as a proxy
explained_variance = np.var(corpus_tfidf_kpca, axis=0)
#total_variance = np.var(train_tfidf, axis=0)
#explained_variance_ratio = explained_variance / np.sum(total_variance)
explained_variance_ratio = explained_variance / np.sum(explained_variance)
cum_explained_variance = np.cumsum(explained_variance_ratio)

#%%
import matplotlib.pyplot as plt
plt.figure()
plt.plot(explained_variance_ratio)
#plt.figure()
#plt.plot(cum_explained_variance)
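An alternative proxy, assuming the fitted `reducer` from the snippet above: normalize the kernel eigenvalues directly (stored as lambdas_ before scikit-learn 1.0 and as eigenvalues_ from 1.0 on).

# read whichever eigenvalue attribute this scikit-learn version provides
ev = getattr(reducer, "eigenvalues_", None)
if ev is None:
    ev = reducer.lambdas_
eigen_based_ratio = ev / ev.sum()
print(eigen_based_ratio[:10])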
value, counts = np.unique(y_train, return_counts=True)
minority_class = value[np.argmin(counts)]
majority_class = value[np.argmax(counts)]
idx_min = np.where(y_train == minority_class)[0]
idx_maj = np.where(y_train == majority_class)[0]
full_X = np.concatenate((X_train, X_test))
full_y = np.concatenate((y_train, y_test))
number_of_clusters = len(idx_min)

# Adding PCA method
transformer = KernelPCA(n_components=math.ceil(X_train.shape[1] / 3),
                        kernel='poly')
X_transformed = transformer.fit_transform(full_X)

# Training the k-means model; note that it runs on the raw features here,
# so X_transformed above is computed but never used
kmeans = KMeans(n_clusters=number_of_clusters)
kmeans.fit(full_X)
points_under_each_cluster = {i: np.where(kmeans.labels_ == i)[0]
                             for i in range(kmeans.n_clusters)}
centers = kmeans.cluster_centers_

# From each cluster, remove the test instances
for i in points_under_each_cluster.keys():
    temp = []
    for j in range(len(points_under_each_cluster[i])):
        if points_under_each_cluster[i][j] not in test_index:
            temp.append(points_under_each_cluster[i][j])
    points_under_each_cluster[i] = np.array(temp)
print(cm)
print('Train accuracy : ', round(accuracy_score(y_train, y_pred_t) * 100, 2), '%')

# Predicting using Test Data
y_pred = clf.predict(x_test)

# Visualizing predicted data using a confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Test accuracy : ', round(accuracy_score(y_test, y_pred) * 100, 2), '%')

# Graphical Visualization of Data
# Plot Code Starts
from sklearn.decomposition import KernelPCA
pca = KernelPCA(n_components=2)
principalComponents = pca.fit_transform(x, y)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['principal component 1',
                                    'principal component 2'])
finalDf = pd.concat([principalDf, dataset[['CLASS']]], axis=1)

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Component 1', fontsize=15)
ax.set_ylabel('Component 2', fontsize=15)
ax.set_title('2 component kernel PCA', fontsize=20)
targets = [0, 1, 2, 3]
colors = ['r', 'g', 'b', 'y']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['CLASS'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
# Load libraries
from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles

# Create linearly inseparable data
features, _ = make_circles(n_samples=1000, random_state=1, noise=0.1,
                           factor=0.1)

# Apply kernel PCA with a radial basis function (RBF) kernel
kpca = KernelPCA(kernel="rbf", gamma=15, n_components=1)
features_kpca = kpca.fit_transform(features)

print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kpca.shape[1])
def RunKPCAScikit(q):
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    data = np.genfromtxt(self.dataset, delimiter=',')

    with totalTimer:
        # Get the new dimensionality, if it is necessary.
        dimension = re.search('-d (\d+)', options)
        if not dimension:
            d = data.shape[1]
        else:
            d = int(dimension.group(1))
            if (d > data.shape[1]):
                Log.Fatal("New dimensionality (" + str(d) + ") cannot be greater "
                          + "than existing dimensionality (" + str(data.shape[1]) + ")!")
                q.put(-1)
                return -1

        # Get the kernel type and make sure it is valid.
        kernel = re.search("-k ([^\s]+)", options)
        try:
            if not kernel:
                Log.Fatal("Choose kernel type, valid choices are 'linear',"
                          + " 'hyptan' and 'polynomial'.")
                q.put(-1)
                return -1
            elif kernel.group(1) == "linear":
                model = KernelPCA(n_components=d, kernel="linear")
            elif kernel.group(1) == "hyptan":
                model = KernelPCA(n_components=d, kernel="sigmoid")
            elif kernel.group(1) == "polynomial":
                degree = re.search('-D (\d+)', options)
                degree = 1 if not degree else int(degree.group(1))
                model = KernelPCA(n_components=d, kernel="poly", degree=degree)
            elif kernel.group(1) == "cosine":
                # the original passed `degree` here and below, but it is only
                # bound in the polynomial branch (and ignored by these kernels)
                model = KernelPCA(n_components=d, kernel="cosine")
            elif kernel.group(1) == "gaussian":
                model = KernelPCA(n_components=d, kernel="rbf")
            else:
                Log.Fatal("Invalid kernel type (" + kernel.group(1) + "); valid "
                          + "choices are 'linear', 'hyptan' and 'polynomial'.")
                q.put(-1)
                return -1

            out = model.fit_transform(data)
        except Exception as e:
            q.put(-1)
            return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

# Dimension Reduction Technique (PCA):
'''
We keep the 2 largest eigenvalues, which give us the 2 eigenvectors
carrying the most variance. Our nonlinear data is mapped to a higher
dimension using the Gaussian kernel trick, and kernel PCA then reduces
the dimension to the requested 2.
'''
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=2, kernel='rbf')
x_train = kpca.fit_transform(x_train)
x_test = kpca.transform(x_test)

# Fitting the data to the Logistic Regression module:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(x_train, y_train)

# Prediction of Training Data:
y_predict_train = classifier.predict(x_train)

# Prediction of Testing Data:
y_predict_test = classifier.predict(x_test)

# Confusion matrix to evaluate the prediction:
from sklearn.metrics import confusion_matrix
def kpca_transformed(dataset_name, kernel_name, gamma):
    kpca = KernelPCA(n_components=2, kernel=kernel_name, gamma=gamma)
    X, y = datasets[dataset_name]
    X_transformed = kpca.fit_transform(X)
    return X, X_transformed, y
if (0):
    #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% NONLINEAR METHODS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%#

    # Heuristic RBF bandwidth from the mean squared pairwise distance
    d = pair.pairwise_distances(Xtrain, Xtrain)
    aux = np.triu(d)
    sigma = np.sqrt(np.mean(np.power(aux[aux != 0], 2) * 0.5))
    gamma = 1 / (2 * sigma ** 2)

    if (0):
        #%% K-PCA
        # Calculate accumulated variance
        kpca = KernelPCA(kernel="rbf", gamma=gamma)
        kpca.fit_transform(Xtrain)
        eigenvals = kpca.lambdas_[0:220]

        # Calculate classification scores for each component count;
        # cast to int so the values can be used as slice bounds
        nComponents = np.linspace(1, 500, 100, endpoint=True).astype(int)
        kpcaScores = np.zeros((5, len(nComponents)))
        kpca = KernelPCA(n_components=Ntrain, kernel="rbf", gamma=gamma)
        kpca.fit(Xtrain)
        XtrainT = kpca.transform(Xtrain)
        XtestT = kpca.transform(Xtest)
        for i in range(len(nComponents)):
            kpcaScores[:, i] = util.classify(XtrainT[:, :nComponents[i]],
                                             XtestT[:, :nComponents[i]],
                                             labelsTrain, labelsTest)
def kernel_pca(data, dim=3):
    scikit_kpca = KernelPCA(n_components=dim, kernel='rbf', gamma=15)
    result = scikit_kpca.fit_transform(data)
    return result
print(X.shape, y.shape)
print(X[:5])
print(y[:5])

fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.hot)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
# ax.view_init(180, 0)
plt.show()

# Kernel PCA: principal component analysis using the kernel trick
lin_kpca = KernelPCA(n_components=2, kernel='linear', random_state=1)
X_reduced = lin_kpca.fit_transform(X)
print(X_reduced.shape)
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap=plt.cm.hot)
plt.show()

rbf_kpca = KernelPCA(n_components=2, kernel='rbf', gamma=0.043,
                     random_state=1)
X_reduced = rbf_kpca.fit_transform(X)
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap=plt.cm.hot)
plt.show()

sigmoid_kpca = KernelPCA(n_components=2, kernel='sigmoid',