def pca_hash(x_train, XX, nbits, manhattan_hash=False, manhattan_bit=2):
    """
    Compute the hash code with Principal Component Analysis (PCA).

    Args:
        x_train: training data with shape (#train, #dimension of feature);
            it is handed to ``pca`` and its row count is forwarded to
            ``manhattan_quant``.
        XX: training and testing data with shape (#data, #dimension of
            feature); this is the data that gets projected and hashed.
        nbits: the number of dimensions of the resultant binary code.
        manhattan_hash: when truthy, the projections are quantised with
            ``manhattan_quant`` instead of being thresholded at zero.
        manhattan_bit: divisor applied to ``nbits`` when ``manhattan_hash``
            is enabled (presumably the number of bits per projection).

    Returns:
        Y: the value returned by ``compactbit``; documented by the original
            author as the compact binary code (#data, nbits).
    """
    (n_train, _) = x_train.shape

    if manhattan_hash:
        # Divide in floating point: under Python 2 ``nbits / manhattan_bit``
        # floors two ints before ``ceil`` sees the value, which silently
        # drops a projection whenever nbits is not a multiple of
        # manhattan_bit.
        nbits = int(ceil(float(nbits) / manhattan_bit))

    (eigvec, _) = pca(x_train, nbits)
    eigvec = eigvec.real

    # Y has one row per row of XX and one column per retained eigenvector.
    Y = np.dot(XX, eigvec)
    # Formatted so the output is the same text on Python 2 and Python 3.
    print("Shape (training set) after pca:  %s" % (Y.shape,))

    if manhattan_hash:
        Y = manhattan_quant(Y, n_train, nbits, manhattan_bit)
    else:
        Y = Y >= 0
    Y = compactbit(Y)
    return Y
def dpca(dihedrals, unit='degree', verbose=False):
    """Perform a dihedral pca

    Every angle phi is replaced by the pair x = cos(phi), y = sin(phi); the
    resulting coordinates are passed to ``pca`` and then projected onto the
    eigenvectors it returns.

    Input:
      dihedrals: dihedral angle data (array with ``shape`` and ``dtype``);
                 rows (first index) are the angles and columns (second
                 index) observations
      unit:      "degree" or "radian"
      verbose:   forwarded to ``pca``; also enables progress/timing output

    Returns:
      eigvals, eigvecs: the two values returned by ``pca``
      projectedcoords:  array of coordinates in the space spanned by the
                        dihedral principal components

    Raises:
      SystemExit: with a non-zero status when ``unit`` is neither "degree"
                  nor "radian"
    """
    if unit == "degree":
        angles = const.pi / 180.0 * dihedrals
    elif unit == "radian":
        angles = dihedrals
    else:
        print("Angular unit must be degree or radian but not {}".format(unit))
        # This is an error path, so do not report success to the shell.
        sys.exit(1)

    # Cosines and sines are real-valued: an integer buffer (e.g. angles
    # given as whole degrees) would truncate them on assignment.
    coord_dtype = dihedrals.dtype
    if not np.issubdtype(coord_dtype, np.floating):
        coord_dtype = np.float64

    # Create cartesian coordinate space of x = cos(phi), y = sin(phi);
    # even rows hold the cosines, odd rows the sines.
    n_angles = dihedrals.shape[0]
    cartcoords = np.zeros([2 * n_angles, dihedrals.shape[1]], dtype=coord_dtype)
    cartcoords[0::2, :] = np.cos(angles)
    cartcoords[1::2, :] = np.sin(angles)

    # Compute pca
    eigvals, eigvecs = pca(cartcoords, verbose=verbose)

    # Project data on principal components, one eigenvector at a time
    if verbose:
        print("Projecting data on principal components:", end="")
        starttime = time.time()
    projectedcoords = np.zeros_like(cartcoords)
    n_eig = eigvecs.shape[0]
    msg = ""
    for eig_idx in range(n_eig):
        if verbose:
            # Backspace over the previous percentage before printing the new one.
            print(len(msg) * "\b", end="")
            msg = " {:3.0f}%".format(100.0 * eig_idx / n_eig)
            print(msg, end="")
            sys.stdout.flush()
        # Column eig_idx of eigvecs, broadcast against every observation.
        product = cartcoords * eigvecs[:, eig_idx].reshape([n_eig, 1])
        projectedcoords[eig_idx, :] = product.sum(0)
    if verbose:
        print(len(msg) * "\b", end="")
        print(" {:.2f} sec.".format(time.time() - starttime))

    return eigvals, eigvecs, projectedcoords
def reduce3D(embeddings, model, anchors_pre_emb):
    """Reduce embeddings and encoded anchors to three principal components.

    Args:
        embeddings: 2-D array of already-encoded items (one row per item).
        model: object exposing ``encode``; used to embed the anchors.
        anchors_pre_emb: raw anchors, encoded with ``model.encode`` and
            appended after the rows of ``embeddings``.

    Returns:
        (dim3, anchors): ``dim3`` is the ``'PC'`` entry of the fit result as
        an array; ``anchors`` wraps the last three rows of ``dim3`` in an
        extra leading axis.
    """
    anchors_post_emb = model.encode(anchors_pre_emb)
    data = np.append(embeddings, np.array(anchors_post_emb), axis=0)
    data_frame = pd.DataFrame(data)

    pca_model = pca(n_components=3)
    # The frame built above is what must be fitted; the previous code passed
    # an undefined name ``df`` and raised NameError.
    results = pca_model.fit_transform(data_frame)
    dim3 = np.array(results['PC'])

    # NOTE(review): always takes the last three rows, i.e. assumes exactly
    # three anchors were appended -- confirm against callers.
    anchors = np.array([dim3[-3:]])
    return dim3, anchors
def NN_dim_red(dim_red):
    """Train a small dense network on (optionally reduced) wine data and plot it.

    Args:
        dim_red: which reduction to apply before training: "pca", "ica",
            "rp" or "cs". Any other value (e.g. "original") leaves the
            features untouched.

    Side effects:
        Saves the accuracy-per-epoch figure and a learning-curve figure
        under ``graphs/``. Returns None.
    """
    # Swap in get_breast_cancer_data() to run on the other dataset.
    X, Y, cols, name = get_wine_data()

    if dim_red == "pca":
        X = pca(X, 2)
    elif dim_red == "ica":
        X = ica(X, 2)
    elif dim_red == "rp":
        X = rp(X, 2)
    elif dim_red == "cs":
        X = cs(X, Y)

    Y_ohc = one_hot_encoding(Y)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y_ohc, random_state=1, shuffle=True)
    op_shape = Y_ohc.shape[1]

    model = Sequential()
    model.add(Dense(32, input_dim=X.shape[1], activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(op_shape, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    results = model.fit(X_train, Y_train, epochs=50, batch_size=32,
                        validation_data=(X_test, Y_test), verbose=0)

    # Keras reports the metric under 'acc' in older releases and under
    # 'accuracy' in newer ones; accept whichever key is present.
    history = results.history
    acc_key = 'acc' if 'acc' in history else 'accuracy'
    plt.plot(history[acc_key], label="Training")
    plt.plot(history['val_' + acc_key], label="Testing")
    plt.legend(loc='lower right')
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")

    # Only the title / file stem of the accuracy figure depends on whether a
    # reduction was applied; everything after it is shared.
    if dim_red != "original":
        figure_name = "NN_" + dim_red + "_" + str(name)
    else:
        figure_name = "NN_" + str(name)
    plt.title(figure_name)
    plt.savefig("graphs/" + figure_name + ".png")

    clf = MLPClassifier(solver='adam', hidden_layer_sizes=(32, 16, 4),
                        random_state=0, activation='relu', max_iter=50,
                        batch_size=32)
    clf = clf.fit(X_train, Y_train)
    curve_name = "NN_" + dim_red + "_lc_" + str(name)
    plt2 = plot_learning_curve(clf, curve_name, X, Y, ylim=[0, 1])
    plt2.savefig("graphs/" + curve_name)
def main(argv=None):
    """Interactive menu: plot the data, run PCA, run Fisher, or exit.

    Args:
        argv: accepted for the usual ``main(argv)`` convention; not used.

    Returns:
        None, once the user chooses option 4.
    """
    # Read the data file. NOTE(review): the lines are not used below
    # (read_data() supplies the dataset); kept so a missing file still
    # fails before the menu starts.
    with open('../../data/iris.data', 'r') as f:
        lines = f.readlines()

    dataset = read_data()

    while True:
        raw = input('1. Plot data\n2. PCA analysis\n3. Fisher\n4. Exit\n')
        # input() yields text on Python 3, so the reply must be converted
        # before it can match the integer menu entries; anything that is
        # not an integer is reported as invalid below.
        try:
            choice = int(raw)
        except (TypeError, ValueError):
            choice = None

        if choice == 1:
            plot(dataset)
        elif choice == 2:
            pca(dataset)
        elif choice == 3:
            fischer(dataset)
        elif choice == 4:
            return
        else:
            print('Invalid input. Try again.')
def test(n=100):
    """Scatter ``linear_testdata(n)`` and overlay the two columns returned by pca.

    The second value returned by ``pca`` (presumably the eigenvectors) is
    printed, then its first column is drawn from the origin in red and its
    second column in green.
    """
    samples = linear_testdata(n)
    _, axes = pca(samples)
    print(axes)

    plt.plot(samples[0, :], samples[1, :], '.')
    for column, colour in enumerate('rg'):
        plt.plot([0, axes[0, column]], [0, axes[1, column]], colour)

    plt.xlim(-0.5, 1.5)
    plt.ylim(-0.5, 1.5)
    plt.show()
def run_kmeans_pca():
    """Cluster the pca(..., 2) output of the breast-cancer data with KMeans.

    One cluster is requested per distinct label value; the scatter plot,
    coloured by predicted cluster, is saved under ``graphs/``.
    """
    # Alternative dataset: get_wine_data()
    features, labels, _cols, name = get_breast_cancer_data()
    reduced = pca(features, 2)

    n_clusters = len(np.unique(labels))
    clusterer = KMeans(n_clusters=n_clusters)
    clusterer.fit(reduced)
    assignments = clusterer.predict(reduced)

    title = "Kmeans_pca_" + str(name)
    plt.scatter(reduced[:, 0], reduced[:, 1], c=assignments, cmap='viridis')
    plt.xlabel("X1")
    plt.ylabel("X2")
    plt.title(title)
    plt.savefig("graphs/" + title + ".png")
def run_gmm_pca():
    """Fit a Gaussian mixture to the pca(..., 2) output of the breast-cancer data.

    One mixture component is requested per distinct label value; the
    scatter plot, coloured by predicted component, is saved under
    ``graphs/``.
    """
    # Alternative dataset: get_wine_data()
    features, labels, _cols, name = get_breast_cancer_data()
    reduced = pca(features, 2)

    n_components = len(np.unique(labels))
    mixture = GaussianMixture(n_components=n_components)
    mixture.fit(reduced)
    components = mixture.predict(reduced)

    title = "GMM_pca_" + str(name)
    plt.gca()
    plt.scatter(reduced[:, 0], reduced[:, 1], c=components, cmap='viridis')
    plt.xlabel("X1")
    plt.ylabel("X2")
    plt.title(title)
    plt.savefig("graphs/" + title + ".png")
def set_up_clustering(self):
    """
    Prepare the clustering task: run PCA on ``self.data``, keep the first
    two rows of the result and randomly split its columns into a training
    and a validation set.

    :return: None
    """
    projected, _variances, _eigenvectors = pca(self.data)

    # keep only the first two dimensions
    # (to test GMM before PCA is implemented, use self.data[:2, :] here instead)
    top_two = projected[:2, :]

    _, n_examples = top_two.shape

    # random boolean mask over the columns; True means "use for training"
    in_train = np.random.rand(n_examples) < 0.5
    self.train_inds = in_train
    self.train_data = top_two[:, in_train]
    self.val_data = top_two[:, ~in_train]
def test_pca(self):
    """
    Perform PCA on the synthetic data and check that the returned values are as expected.

    Checks, in order: zero mean of the transformed data, dominance of the
    first two variances, decreasing variance along the first three
    eigenvector directions, preservation of the distance between examples
    0 and 1, orthonormal eigenvectors, and a constant reconstruction offset.

    :return: None
    """
    new_data, variances, eigenvectors = pca(self.data)

    assert np.allclose(np.zeros(64), np.mean(new_data, 1)), \
        "The data is not centered to be zero-mean."

    assert variances[0] + variances[1] > np.sum(variances[2:]), \
        "Variance of the first two dimensions should be greater than the variance of the rest"

    # Message pieces are now joined with the space that adjacent-literal
    # concatenation used to swallow ("thanvariances").
    assert np.sum(variances[:2]) > np.sum(variances[2:]), \
        "Variances of first two dimensions were not larger than " \
        "variances of the rest of the noise dimensions"

    # Variance of the data projected on each of the first three directions.
    projected_var = [np.var(eigenvectors[:, k].T.dot(self.data)) for k in range(3)]

    assert projected_var[0] > projected_var[1], \
        "First principle direction doesn't have more variance than second principle direction"

    assert projected_var[1] > projected_var[2], \
        "Second principle direction doesn't have more variance than third principle direction"

    # The two difference vectors have opposite sign conventions, but only
    # their norms are compared.
    vector_0_1 = self.data[:, 1] - self.data[:, 0]
    new_vector_0_1 = new_data[:, 0] - new_data[:, 1]

    assert np.allclose(np.linalg.norm(vector_0_1), np.linalg.norm(new_vector_0_1)), \
        "Distance between example 0 and 1 is not the same before and after PCA"

    assert np.allclose(eigenvectors.T.dot(eigenvectors), np.eye(64)), \
        "Eigenvectors were not orthogonal"

    # Reconstruction may differ from the input by a constant offset, so the
    # offsets of two examples are compared with each other.
    reconstructed = eigenvectors.dot(new_data)

    assert np.allclose(reconstructed[:, 0] - self.data[:, 0],
                       reconstructed[:, 1] - self.data[:, 1]), \
        "Reconstructed points are not similar."
# Load the example dataset and show it as a scatter plot.
data = sio.loadmat('ex7data1.mat')
X = data['X']
plt.scatter(X[:, 0], X[:, 1])
plt.axis([0.5, 6.5, 2, 8])
plt.show()
input('Program paused. Press enter to continue.\n')

# =============== Part 2: Principal Component Analysis ===============
# You should now implement PCA, a dimension reduction technique. You
# should complete the code of the pca function called below (the text
# originally pointed at pca.m, presumably a leftover from a MATLAB
# version of this exercise).
#
print('\nRunning PCA on example dataset.\n')
X_norm, mu, sigma = featureNormalize(X)
U, S = pca(X_norm)
# Print the shapes of the intermediate results.
print(X_norm.shape)
print(mu.shape)
print(sigma.shape)
print(U.shape)
print(S.shape)
# Draw one line per column of U, starting at mu and scaled by the matching
# entry of S, then redraw the data with the same axis limits as before.
drawLine(mu, mu + 1.5 * S[0] * U[:, 0])
drawLine(mu, mu + 1.5 * S[1] * U[:, 1])
plt.scatter(X[:, 0], X[:, 1])
plt.axis([0.5, 6.5, 2, 8])
plt.show()
print('Top eigenvector: \nU[:, 0] = {}'.format(U[:, 0]))
print('You should expect to see [-0.707107 -0.707107]')
input('Program paused. Press ENTER to continue')
continue num += 1 # 最后选出来的列数代表维度(根据原始数据p*n line = line[1:] # 去掉第一个字符串 line = list(map(float, line)) data.append(line) f.close() print("Data has been read successfully.") print("The dimension is " + str(num)) print(data[0][0]) data = np.array( data ).T # 原始数据一列代表一个example,一行代表一个维度,现在要求转置,变成可以求PCA的形式,也就是一行代表一个example,一列代表一个维度。 print("Now reducing dimension...") lowDData = pca(dataMat=data, percentage=0, k=k_dimentions) print("Finished, the new dimension is :" + str(len(lowDData[0]))) print("Start writing new data...") destfile = '../../data_dimRed_' + str(PCA_percentage) + '.txt' print(len(lowDData)) f = open(destfile, 'w') for i in range(0, len(lowDData)): for j in range(0, len(lowDData[i])): f.write(str(lowDData[i][j]) + '\t') f.write('\n') end_time = time.time() duration = end_time - start_time print('Time cost: %fs.\n' % float(duration)) print("Finished the whole work.")
import scipy.optimize as op
import matplotlib
from matplotlib import pyplot as plt
from pca import *

if __name__ == '__main__':
    # figsize=(10, 5) controls the size of the generated figure
    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    ex7data1 = np.load('ex7data1.npz')
    x = ex7data1['X']
    m, n = x.shape
    # Left panel: the raw data.
    ax[0].scatter(x[:, 0], x[:, 1], c='b', marker='o')
    ax[0].set_title('Original Data')
    # Right panel: the normalized data.
    norm, mean, std = normalize(x)
    ax[1].scatter(norm[:, 0], norm[:, 1], c='b', marker='*')
    ax[1].set_title('Normalized Data')
    # get the eigenvectors and eigenvalues
    u, s = pca(norm)
    z = project(norm, u, 1)
    # recovery from projected data
    xr = recovery(z, u, 1)
    ax[1].scatter(xr[:, 0], xr[:, 1], c='r', marker='+')
    # reverse operation of normalization on approximate reconstruction xr
    # (the inverse of the normalization step)
    xrr = revernorm(xr, mean, std)
    ax[0].scatter(xrr[:, 0], xrr[:, 1], c='r', marker='*')
    # Connect every original point to its reconstruction with a dashed line.
    for i in range(0, m):
        print(x[i])
        line0 = np.vstack((x[i], xrr[i]))
        print(line0)
        # NOTE(review): line1 is built but not plotted within the visible
        # part of this script -- the loop body may continue past this view.
        line1 = np.vstack((norm[i], xr[i]))
        ax[0].plot(line0[:, 0], line0[:, 1], 'k--')
datas_=datas(d) for i in data: for j in range(d): datas_.append_var(j, float(i[j])) for i in label: datas_.append_var(d,float(i)) return datas_ def pca(data,d): data_ = nolabel(data, d) pca_(data_, d) def fld(data,label,d): data_ = labelling(data,label,d) fld_(data_, d) def k_means_clustering(data,k): d=len(data[0]) data_=nolabel(data,d) k_means_clustering_(data_,k,d) def spectral_clustering(affinity_matrix,k): spectral_clustering_(affinity_matrix, k) y=pca(data,d) y=fld(data,label,d) labels=k_means_clustering(data,k) labels=spectral_clustering(affinity_matrix, k)
print(i / 200) if float(line[1]) < 10: continue num += 1 line = line[1:] line = list(map(float, line)) data.append(line) f.close() print("Data has been read successfully.") print("The dimension is " + str(num)) print(data[0][0]) data = np.array(data).T print("Now reducing dimension...") lowDData = pca(data, 0.99) print("Finished, the new dimension is :" + str(len(lowDData[0]))) print("Start writing new data...") destfile = '../../data_dimRed_0.99.txt' print(len(lowDData)) f = open(destfile, 'w') for i in range(0, len(lowDData)): for j in range(0, len(lowDData[i])): f.write(str(lowDData[i][j]) + '\t') f.write('\n') print("Finished the whole work.") # testArray = np.array([[4,3,2],[3,2,1],[2,0,0]]) # lowd,res = pca(testArray)
#element wise multiplication distance = np.inner(test_data -self.central_point[i], test_data - self.central_point[i]) #print distance distance = np.sum(distance) distances.append(distance) distances = np.array(distances) #print distances t = np.argmin(distances) return t k_means = K_means_classifier(10) all_data = np.vstack((train_image, test_image)) all_data = pca(all_data, topNfeat = 700) train_image = all_data[:60000] test_image = all_data[60000:] k_means.train(train_image, train_label) #print k_means.central_point cnt = 0 for i in range(1, num_test): t = k_means.predict(test_image[i]) if t == test_label[i]: cnt += 1 if i % 1000 == 0: print i print float(cnt) / i log.write('test size = ' + str(i) + ' test accuracy: ')
# Pearson coefficient for every ordered pair of columns, as one flat list.
correlacao = [stats.pearsonr(n.array(col), n.array(coluna))[0] for col in colunas for coluna in colunas]
# Cut the flat list into rows of 8 values (assumes 8 columns -- TODO confirm).
for i in range(8):
    m.append(correlacao[i*8 : i*8 + 8])
# pearson == m
print 'PEARSON'
for linha in m:
    print [str(round(x, ndigits=2)) for x in linha]
# PCA computation
#T, P, E = pca.PCA_nipals(nn)
matriz_cov, autovetores, autovalores, autovalores_prop, dados_finais = pca(nn)
T = dados_finais
P = autovetores.T
E = autovalores
# First two columns of T.
princ = T[:,:2]
# eigenvalues as percentages
print 'AUTOVALORES', E * 100
# contributions: the first two rows of P, each scaled so that its absolute
# values sum to 100
c1 = P[0]
c2 = P[1]
cc1 = c1 / sum(abs(c1)) * 100
cc2 = c2 / sum(abs(c2)) * 100
print 'CONTRIBUICOES'
print 'C1', [abs(x) for x in cc1]
col = 10 row = int(np.ceil(n / 10.0)) fig, ax = plt.subplots(row, col) #rnd = np.random.randint(0,np.size(x,0),n) rnd = np.arange(0, n) xl = x[rnd].reshape((n, picsize[0], picsize[1])) for i in range(0, row): for j in range(0, col): # if not transpose xl[i*col+j], the picture will show horizontally ax[i, j].imshow(xl[i * col + j].T, cmap=plt.cm.gray) ax[i, j].set_xticks([]) ax[i, j].set_yticks([]) ax[i, j].set_title(str) plt.axis('off') if __name__ == '__main__': k = 100 x = np.load('ex7faces.npz')['X'] norm, mean, std = normalize(x) u, s = pca(norm) z = project(norm, u, k) # recovery from projected data xr = recovery(z, u, k) randomShow(x, 50, (32, 32), 'x') #原始图像,包含1024个特征的图像 randomShow(norm, 50, (32, 32), 'norm') #经过标准化之后的图像 randomShow(z, 50, (10, 10), 'z') #经过pca之后压缩的图像,此时只用最重要的前100个特征来表示该图像 randomShow(xr, 50, (32, 32), 'xr') #使用前100个主成分复原的图像 plt.show()
# Flatten each image into a single row vector. reshape(1, -1) lets NumPy
# infer the row length; the previous numpy.product(...) call no longer
# exists in NumPy 2.0.
img_3 = numpy.array(I_3).reshape(1, -1)
img_4 = numpy.array(I_4).reshape(1, -1)
img_5 = numpy.array(I_5).reshape(1, -1)

# Stack the five flattened images: one image per row.
cat_dataset = numpy.vstack((img_1, img_2, img_3, img_4, img_5))

# Run pca with 500 as its second argument and persist both results.
lowDDataMat, reconMat = pca(cat_dataset, 500)
numpy.save(file="./data/mat/lowDDataMat.npy", arr=lowDDataMat)
numpy.save(file="./data/mat/reconMat.npy", arr=reconMat)
print(lowDDataMat.shape)
print(reconMat.shape)

# Example of turning the first reconstructed row back into an image:
# reimg_1 = numpy.vsplit(reconMat,5)[0]
# print(reimg_1.shape)
# trans_img1 = numpy.reshape(reimg_1,numpy.array(I_1).shape)
# print(trans_img1.shape)
# reimg_1 = Image.fromarray(trans_img1).convert('RGB')
# reimg_1.save('./data/reconimg/01.png')
fig = plt.figure() ax = fig.add_subplot(111) # flatten()方法能将matrix的元素变成一维的,A能使matrix变成array,A[0]或者数组数据 ax.scatter(dataMat[:, 0].flatten().A[0], dataMat[:, 1].flatten().A[0], marker='^', s=90, c='green') ax.scatter(reconMat[:, 0].flatten().A[0], reconMat[:, 1].flatten().A[0], marker='o', s=50, c='red') plt.show() if __name__ == "__main__": # 1 加载数据,并转化数据类型为float dataMat = loadDataSet('./testSet.txt') # print('加载原始特征数据:\n',dataMat) # 2 主成分分析降维特征向量设置 lowDmat, reconMat = pca(dataMat, 1) # print(shape(lowDmat)) # 只需要2个特征向量,和原始数据一致,没任何变化 # lowDmat, reconMat = pca(dataMat, 2) # print(shape(lowDmat)) # 3 将降维后的数据和原始数据一起可视化 show_picture(dataMat, reconMat)
line = f.readline() line = line[:-1].split('\t') if string.atof(line[1])<10: continue num+=1 line = line[1:] line = map(string.atof,line) data.append(line) f.close() print "Data has been read successfully." print "The dimension is "+str(num) data = np.array(data).T print "Now reducing dimension..." lowDData = pca(data,0.90) print "Finished, the new dimension is :"+str(len(lowDData[0])) print "Start writing new data..." destfile = '../data/data_dimRed.txt' f = open(destfile,'wb') for i in range(0,len(lowDData)): for j in range(0,len(lowDData[i])): f.write(str(lowDData[i][j])+'\t') f.write('\n') print "Finished the whole work." # testArray = np.array([[4,3,2],[3,2,1],[2,0,0]]) # lowd,res = pca(testArray) # print res
def compressITQ(mx, bit, iters):
    """Reduce ``mx`` with ``pca`` and run ``itq`` on the result.

    Args:
        mx: data handed to ``pca``.
        bit: second argument of ``pca`` (presumably the code length).
        iters: second argument of ``itq`` (presumably its iteration count).

    Returns:
        Whatever ``itq`` returns. The result used to be discarded, so the
        function always returned None.
    """
    Y = pca(mx, bit)
    return itq(Y, iters)
# %%
import pca
print(pca.__version__)

# %%
from sklearn.datasets import load_iris
import pandas as pd
# From here on the name ``pca`` refers to the class, not the module.
from pca import pca

# Initialize
model = pca(n_components=3)

# Dataset (loaded once and reused for values, column names and index)
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names, index=iris.target)

# Fit transform
out = model.fit_transform(X)

# Make plots
model.scatter()
ax = model.biplot(n_feat=4)
ax = model.plot()

# Make 3d plots
model.scatter3d()
ax = model.biplot3d()

# Normalize out PCs
model = pca()
from pca import * # ---------------------------------------------- # # SCRIPT TO RUN PCA ON SLOO # # ---------------------------------------------- # # Filepath to SLOO Data trainName = "../../dataset/sloo/train.csv" testName = "../../dataset/sloo/test.csv" pca(trainName, testName)
print() print( "pca - can be followed by two distinct integers between 0 and 15 for custom use of columns" ) print( "isomap - must be followed by a positive integer determining the number of nearest neighbors" ) shutdown() mat, zoo_type, zoo_name = get_data_matrix() x = None if arg == "pca": if len(sys.argv) < 3: x = pca(mat) elif len(sys.argv) == 3: print("Invalid number of PCA integer arguments") shutdown() else: try: t1 = int(sys.argv[2]) t2 = int(sys.argv[3]) x = pca(mat, [t1, t2]) except: print("PCA integer argument invalid") shutdown() elif arg == "mds-data": mat = center_matrix(mat) x = mds_data(mat)
#Example: using PCA to reduce the dimensionality of # semiconductor manufacturing data #Author: Justin Nie #Date: 2018/2/15 from numpy import * from pca import * dataset = load_dataset('secom.data', ' ') data_mat = mat(dataset) data_mat = replace_nan(data_mat) check_eigen(data_mat, 20) low_data_mat, new_data_mat = pca(data_mat, 20) print(shape(low_data_mat)) print(shape(data_mat))
#-*-coding:utf-8-*- #对半导体材料数据降维 from pca import * dataMat = replaceNanWithMean() lowDataMat, reconMat = pca(dataMat, 6) print(lowDataMat)
# Dump the eigenvalues and eigenvectors computed earlier in the script.
print(shape(eigVals))
print(eigVals)
print(shape(eigVects))
print(eigVects)

import matplotlib.pyplot as plt

# Share of the total contributed by each entry of eigVals (first 20 plotted).
Var = eigVals
Var_sum = sum(Var)
Var_rate = Var / Var_sum
plt.plot(Var_rate[:20], 's-')
plt.show()

# Running (cumulative) share of the total (first 20 plotted).
Var = eigVals
Var_sum = sum(Var)
Var_add = zeros_like(Var)
for i in range(len(Var)):
    Var_add[i] = sum(Var[:i + 1]) / Var_sum
plt.plot(Var_add[:20], 's-')
plt.show()

# Run pca with 6 and then with 20 as its second argument, printing both
# results each time.
lowDMat, reconMat = pca(dataMat, 6)
print(lowDMat)
print(reconMat)
lowDMat, reconMat = pca(dataMat, 20)
print(lowDMat)
print(reconMat)
from pca import * #import plotter model = pca('data-1.txt') #plotter.pca_plotter(model)