def main(args):
    """Fit a 5-component PCA on the MNIST training images and plot the components.

    Args:
        args: Parsed command-line arguments exposing ``mnist_train_data`` and
            ``mnist_train_labels``, the paths of the gzip-compressed image and
            label files.

    Side effects:
        Saves the figure as ``comps-Hrach.png``.
    """
    n_samples = 60000
    n_components = 5

    # Read data file into numpy matrices: a 16-byte big-endian header
    # followed by rows * columns unsigned bytes per image.
    with gzip.open(args.mnist_train_data, 'rb') as in_gzip:
        magic, num, rows, columns = struct.unpack('>IIII', in_gzip.read(16))
        pixels = rows * columns
        image_format = '>{}B'.format(pixels)
        all_data = [
            np.array(struct.unpack(image_format, in_gzip.read(pixels)))
            for _ in range(n_samples)
        ]

    # Read labels file into labels: an 8-byte header, then one byte per label.
    # The labels are parsed but not used by the plot below.
    with gzip.open(args.mnist_train_labels, 'rb') as in_gzip:
        magic, num = struct.unpack('>II', in_gzip.read(8))
        all_labels = struct.unpack('>{}B'.format(n_samples),
                                   in_gzip.read(n_samples))

    pca = PCA(n_components)
    pca.fit(all_data)
    # Use the image size from the file header rather than a hard-coded 28x28.
    components = np.reshape(pca.return_components(),
                            (n_components, rows, columns))

    # The original also built a second PCA, called fit() on it without any
    # data and discarded the result; that dead code has been removed.

    f, axarr = plt.subplots(1, n_components, figsize=(18, 4), sharey=True)
    for i in range(n_components):
        axarr[i].imshow(components[i])
        axarr[i].set_aspect('equal')
        axarr[i].set_title('Component {}'.format(i + 1))
    plt.tight_layout()

    name = 'Hrach'
    plt.savefig('comps-{}.png'.format(name), dpi=320)
def buildPCA(self, marks):
    """Align ``marks`` with ``pa.GPA`` and build a PCA model from the result.

    The mean shape returned by ``pa.GPA`` is stored on ``self.mean_shape``.

    Args:
        marks: Input passed unchanged to ``pa.GPA``.

    Returns:
        The ``PCA`` object built from the aligned marks with an accuracy
        argument of 0.98.
    """
    aligned_marks, self.mean_shape = pa.GPA(marks)
    target_accuracy = 0.98
    return PCA(np.asarray(aligned_marks), target_accuracy)
def test_pca():
    """Visualise PCA output on MNIST for several component counts.

    For each entry of ``dimensionality`` one figure row is drawn, showing the
    samples listed in ``element`` taken from the second value returned by
    ``pca.compute_pca(X, k)``, each reshaped to 28x28. A second figure plots
    the tail sums of the eigenvalues returned by ``pca.compute_pca(X)``.
    """
    data_ingestor = DataIngestor()
    X, _, _, _ = data_ingestor.load_mnist()
    X = X.T
    pca = PCA()

    dimensionality = [1, 10, 100, 500, 784]
    element = [1, 2, 3, 4]
    n_rows, n_cols = len(dimensionality), len(element)

    fig, axes = plt.subplots(n_rows, 1, sharey=True)
    plt.gray()
    for i, (n_pcs, big_ax) in enumerate(zip(dimensionality, axes)):
        # The row-wide axes only carries the title; hide its ticks and frame.
        big_ax.set_title('PCs = ' + str(n_pcs))
        # tick_params expects booleans here; the string 'off' is truthy.
        big_ax.tick_params(labelcolor=(1., 1., 1., 0.0),
                           top=False, bottom=False, left=False, right=False)
        # Public setter instead of writing the private _frameon attribute.
        big_ax.set_frame_on(False)

        _, X_tilde = pca.compute_pca(X, n_pcs)
        for j, sample in enumerate(element):
            ax = fig.add_subplot(n_rows, n_cols, i * n_cols + j + 1)
            ax.imshow(X_tilde.T[sample].reshape([28, 28]))
            plt.axis('off')
    plt.show()

    eigvals, _ = pca.compute_pca(X)
    # Entry i is the sum of the eigenvalues from index i onwards.
    reconstruction_error = [np.sum(eigvals[i:]) for i in range(len(eigvals))]
    plt.plot(range(len(reconstruction_error)), reconstruction_error)
    plt.axhline(0, color='black')
    plt.title('Average Construction Error')
    plt.show()
def get_data(pca_ON=False, print_shapes=False):
    """Load ``mnist_train.csv`` and split it into train and test sets.

    Column 0 is used as the target and the remaining columns as features.
    The last 10000 rows form the test set; everything before them is the
    training set.

    Args:
        pca_ON: When true, fit ``PCA(n_components=30)`` on the training
            features and transform both feature matrices with it.
        print_shapes: When true, print the shapes of the resulting arrays
            (and of ``pca.evecs`` when ``pca_ON`` is set).

    Returns:
        dict: ``{'train': (Xtrain, Ytrain), 'test': (Xtest, Ytest)}``.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent accessor available in old and new pandas alike.
    data = pd.read_csv('mnist_train.csv').values

    Xtrain, Ytrain = data[:-10000, 1:], data[:-10000, 0]
    Xtest, Ytest = data[-10000:, 1:], data[-10000:, 0]

    if pca_ON:
        pca = PCA(n_components=30)
        pca.fit(Xtrain)
        if print_shapes:
            print('\nEigenvectors size:', pca.evecs.shape)
        Xtrain = pca.transform(Xtrain)
        Xtest = pca.transform(Xtest)

    if print_shapes:
        print('\nXtrain: {}, Ytrain: {}'.format(Xtrain.shape, Ytrain.shape))
        print('Xtest: {}, Ytest: {}'.format(Xtest.shape, Ytest.shape))

    return {'train': (Xtrain, Ytrain), 'test': (Xtest, Ytest)}
def runPCA(data, elems=None, components=None, threshold=None):
    """Run the Theano-backed ``PCA`` on selected rows of ``data``.

    Args:
        data: Array-like input; converted to ``theano.config.floatX`` and
            stored in a shared variable.
        elems: Optional index vector selecting the rows to process. Defaults
            to every row (``np.arange(len(data), dtype='int64')``).
        components: Optional value forwarded to ``PCA`` wrapped in a shared
            variable. Mutually exclusive with ``threshold``.
        threshold: Optional value forwarded to ``PCA`` wrapped in a shared
            variable. Mutually exclusive with ``components``.

    Returns:
        The result of evaluating the compiled ``pca.process()`` graph.

    Note:
        Terminates the process with exit status -1 when both ``components``
        and ``threshold`` are supplied.
    """
    t_data = theano.shared(np.asarray(data, dtype=theano.config.floatX),
                           name='data', borrow=True)

    if components is not None and threshold is not None:
        # The original message read "You Can Run ...", the opposite of the
        # condition being reported.
        print("You Can't Run PCA Using Threshold And Components")
        exit(-1)

    t_components = None
    t_threshold = None
    if components is not None:
        t_components = theano.shared(value=components, name='components',
                                     borrow=True)
    elif threshold is not None:
        # Was mistakenly named 'components' (copy-paste from the branch above).
        t_threshold = theano.shared(value=threshold, name='threshold',
                                    borrow=True)

    idx = T.lvector('idx')
    m_data = T.matrix('data')
    pca = PCA(data=m_data, components=t_components, threshold=t_threshold)
    # The symbolic matrix is substituted by the selected rows of the shared data.
    theanoPCA = theano.function(inputs=[idx], outputs=pca.process(),
                                givens={m_data: t_data[idx]})

    if elems is None:
        elems = np.arange(len(data), dtype='int64')
    return theanoPCA(elems)
def main(args):
    """Compare true MNIST labels with k-means clusters in PCA space.

    Reads the first 16000 images and labels, fits ``PCA(15)``, clusters the
    transformed data into 10 groups with ``KMeans`` and draws a 2x10 grid:
    the top row scatters each digit's first two transformed coordinates, the
    bottom row does the same per cluster. The label/cluster coincidence
    matrix is printed and the figure saved to
    ``labels_vs_kmeans_clusters.jpg``.

    Args:
        args: Parsed command-line arguments exposing ``mnist_train_data`` and
            ``mnist_train_labels``.
    """
    # Read data file into numpy matrices
    with gzip.open(args.mnist_train_data, 'rb') as in_gzip:
        magic, num, rows, columns = struct.unpack('>IIII', in_gzip.read(16))
        images = []
        for _ in range(16000):
            pixels = struct.unpack('>{}B'.format(rows * columns),
                                   in_gzip.read(rows * columns))
            images.append(np.array(pixels))
        all_data = np.array(images)

    with gzip.open(args.mnist_train_labels, 'rb') as in_gzip:
        magic, num = struct.unpack('>II', in_gzip.read(8))
        all_labels = struct.unpack('>16000B', in_gzip.read(16000))

    # Group the raw images by their true digit.
    label_array = np.array(all_labels)
    each_label = [all_data[label_array == digit] for digit in range(10)]

    pca = PCA(15)
    pca.fit(all_data)
    all_data_transform = pca.transform(all_data)
    kmeans_labels = KMeans(n_clusters=10, random_state=0).fit_predict(
        all_data_transform)

    # Group the first two transformed coordinates by assigned cluster.
    first_two = all_data_transform[:, :2]
    cluster_array = np.array(kmeans_labels)
    each_cluster = [first_two[cluster_array == cluster]
                    for cluster in range(10)]

    f, axarr = plt.subplots(2, 10, figsize=(18, 4), sharey=True)
    for digit in range(10):
        projected = pca.transform(each_label[digit])
        axarr[0][digit].scatter(projected.T[0], projected.T[1], s=1)
    for cluster in range(10):
        members = each_cluster[cluster]
        axarr[1][cluster].scatter(members.T[0], members.T[1], s=1)

    # Rows are true labels, columns are cluster ids.
    coincidence_matrix = np.zeros((10, 10), dtype=int)
    for true_label, cluster in zip(all_labels, kmeans_labels):
        coincidence_matrix[true_label, cluster] += 1
    print(coincidence_matrix)
    plt.savefig("labels_vs_kmeans_clusters.jpg")
def test_pca(filename):
    """Run PCA on a whitespace-separated numeric file and plot the results.

    Prints each eigenvalue's share of the total, plots the cumulative share,
    then plots the data projected onto two dimensions.

    Args:
        filename: Path of a text file with one record per line, values
            separated by whitespace.
    """
    # Load the dataset. The context manager closes the file (the original
    # leaked the handle); blank lines are skipped instead of crashing float().
    with open(filename) as data_file:
        X = np.array([[float(d) for d in line.split()]
                      for line in data_file if line.strip()])

    # Principal component analysis
    # (passing True would normalize by dividing by the variance)
    pca = PCA(X, False)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(pca.eigenvalues / np.sum(pca.eigenvalues))

    # Show the cumulative contribution ratio. The original accumulated into a
    # list and then applied `/=` to it, which raises TypeError.
    cumulative = np.cumsum(pca.eigenvalues)
    accm = cumulative / cumulative[-1]
    plt.plot(accm, 'b-')
    plt.show()

    # Show the projection onto the principal component space
    X_pca = pca.project(dim=2)
    plt.plot(X_pca, 'b.')
    plt.show()
def getLayerDimensionality(layer, index, inputDict):
    """Build a report line with the PCA dimensionality of a layer's weights.

    Args:
        layer: Object exposing ``name`` and ``get_weights()``.
        index: Layer index, used only in the message.
        inputDict: Mapping whose ``'ValidDimLayers'`` entry is a single string
            or a tuple of strings; only layers whose name starts with one of
            them are analysed.

    Returns:
        str: The dimensionality message, or an "N/A" message for layers whose
        name does not match.
    """
    valid_prefixes = inputDict['ValidDimLayers']
    if not layer.name.startswith(valid_prefixes):
        return "Layer {}: Dimensonality is N/A".format(index)

    first_weights = layer.get_weights()[0]
    dimensionality = PCA(matrix=first_weights).dimensionality()
    return "Layer {}: Dimensionality: {:4.2f}".format(index, dimensionality)
def main(args):
    """Fit a 5-component PCA on 16000 MNIST images and plot the components.

    Prints the shape of the component matrix and of every reshaped component,
    then saves the figure as ``comps-TODO.png``.

    Args:
        args: Parsed command-line arguments exposing ``mnist_train_data`` and
            ``mnist_train_labels``.
    """
    # Read data file into numpy matrices
    with gzip.open(args.mnist_train_data, 'rb') as in_gzip:
        magic, num, rows, columns = struct.unpack('>IIII', in_gzip.read(16))
        images = []
        for _ in range(16000):
            pixels = struct.unpack('>{}B'.format(rows * columns),
                                   in_gzip.read(rows * columns))
            images.append(np.array(pixels))
        all_data = np.array(images)

    with gzip.open(args.mnist_train_labels, 'rb') as in_gzip:
        magic, num = struct.unpack('>II', in_gzip.read(8))
        all_labels = struct.unpack('>16000B', in_gzip.read(16000))

    # Images labelled 0; selected here but not used further down.
    is_zero = np.array(all_labels) < 0.5
    zeros = all_data[is_zero]

    pca = PCA(5)
    pca.fit(all_data)
    print(pca.return_components().shape)
    components = pca.return_components().reshape(5, 28, 28)

    f, axarr = plt.subplots(1, 5, figsize=(18, 4), sharey=True)
    for i, component in enumerate(components):
        panel = axarr[i]
        panel.imshow(component)
        print(i, component.shape)
        panel.set_aspect('equal')
        panel.set_title('Component {}'.format(i + 1))
    plt.tight_layout()

    name = 'TODO'  # TODO: Replace name with your name
    plt.savefig('comps-{}.png'.format(name), dpi=320)
def test_PCA_dtype():
    """
    Check that constructing a PCA from something that is not a pandas
    dataframe (here a plain string) raises a TypeError
    """
    not_a_dataframe = "A wrong data type of type string"
    with pytest.raises(TypeError):
        PCA(not_a_dataframe)
def train_PCA_train():
    """
    Check that PCA.train() returns a truthy value for a small dataframe

    NOTE(review): the name lacks the ``test_`` prefix, so pytest's default
    discovery will presumably not collect this function - confirm whether
    that is intended.
    """
    frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
    model = PCA(frame)
    assert model.train()
def compress_images(DATA, k):
    """Run ``PCA(DATA, k).perform_PCA()``, rescale the result and save it.

    Args:
        DATA: Image data handed to the ``PCA`` constructor.
        k: Second ``PCA`` constructor argument.
    """
    reconstructed = PCA(DATA, k).perform_PCA()
    save_images(rescale_images(reconstructed))
def run_pca():
    """Train a 2-dimensional PCA on ``Data(FILENAME).x1`` and save a plot.

    Plots ``pca.pc[0]`` against ``pca.pc[1]`` as red dots, writes the figure
    to ``pca`` and clears the current figure.
    """
    dataset = Data(FILENAME)
    n_dims = 2
    model = PCA()
    model.train(dataset.x1.T, n_dims)

    horizontal, vertical = model.pc[0], model.pc[1]
    plt.plot(horizontal, vertical, 'ro')
    plt.savefig("pca")
    plt.clf()
def fit(self, X, y):
    """Fit PCA on ``X``, then LDA on the PCA projection, and combine them.

    Stores the fitted models on ``self.pca`` and ``self.lda`` and the product
    of their ``pro_subspace`` matrices on ``self.subspace``.

    Args:
        X: Training samples passed to ``PCA.fit`` and ``PCA.project``.
        y: Targets passed to ``LDA.fit``.

    Returns:
        ``self``, so calls can be chained.
    """
    pca_model = PCA(n_components=self.pca_components)
    self.pca = pca_model.fit(X)
    projected = self.pca.project(X)

    lda_model = LDA(n_components=self.n_components)
    self.lda = lda_model.fit(projected, y)

    self.subspace = np.dot(self.pca.pro_subspace, self.lda.pro_subspace)
    return self
def test_PCA_init():
    """
    Build a PCA from a pandas dataframe and check that getData() hands back
    an equal dataframe.
    """
    frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
    stored = PCA(frame).getData()
    assert frame.equals(stored)
def reduceLayerDimensionality(layer, index, inputDict):
    """Replace a layer's first weight matrix with its PCA-filtered version.

    Args:
        layer: Object exposing ``name``, ``get_weights()`` and
            ``set_weights()``.
        index: Layer index, used only in the message.
        inputDict: Mapping with ``'ValidDimLayers'`` (a single string or a
            tuple of strings matched against the start of the layer name) and
            ``'targetRatio'`` (forwarded to ``computeTargetPCs``).

    Returns:
        str: A message stating whether the layer was reduced or left as is.
    """
    valid_prefixes = inputDict['ValidDimLayers']
    if not layer.name.startswith(valid_prefixes):
        return "Layer {}: Dimensionality unchanged".format(index)

    weights = layer.get_weights()
    reducer = PCA(matrix=weights[0])
    n_components = reducer.computeTargetPCs(
        targetRatio=inputDict['targetRatio'])
    weights[0] = reducer.filterMatrix(n=n_components)
    layer.set_weights(weights)
    return "Layer {}: Reduced layer dimensionality".format(index)
def test():
    """Run ``PCA`` on a fixed 3x3 matrix and print its attributes.

    Prints ``mean``, ``covar``, ``eval``, ``evec``, ``esort``, ``pc`` and
    ``pca`` of the resulting object, one per line.
    """
    a = array([[5, 9, 7], [3, 7, 4], [2, 3, 9]])
    pca = PCA(a, 2)
    # The print statements were Python 2 only; print(...) with a single
    # argument behaves the same on Python 2 and Python 3.
    print("mean: %s" % pca.mean)
    print("covar: %s" % pca.covar)
    print("eval: %s" % pca.eval)
    print("evec: %s" % pca.evec)
    print("esort: %s" % pca.esort)
    print("pc: %s" % pca.pc)
    print("pca: %s" % pca.pca)
def model(self, k):
    """Regress ``self.Y`` on the first ``k`` PCA scores of ``self.X``.

    Decomposes ``self.X`` into a score matrix and a loading matrix, fits an
    ``MLR`` model on the scores and stores ``P . mlr.A`` on ``self.A``.

    Args:
        k: Value forwarded to ``PCAdecompose``.
    """
    decomposer = PCA(self.X)
    # Must not be removed: per the original author, PCAdecompose relies on
    # results computed by this call.
    _u, _s, _v, _compare = decomposer.SVDdecompose()

    # Obtain the score matrix and the loading matrix
    scores, loadings = decomposer.PCAdecompose(k)

    regression = MLR(scores, self.Y)
    regression.modelling()
    self.A = np.dot(loadings, regression.A)
def test_PCA_convert():
    """
    Train a PCA with 2 as the train argument on a 4x3 dataframe and check
    that convert() returns a result with 4 rows and 2 columns
    """
    frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
    model = PCA(frame)
    model.train(2)
    converted = model.convert(frame)
    n_rows, n_cols = converted.shape[0], converted.shape[1]
    assert n_rows == 4
    assert n_cols == 2
def __init__(self, index_test):
    """Run PCA on the training data, then derive the LDA projection from it.

    Reads the module-level ``data_train``, ``data_test`` and ``ORL_face``
    objects. Intermediate and final results are stored on ``self`` and their
    shapes are printed at the end.

    Args:
        index_test: Index (converted with ``int``) of the test sample in
            ``data_test`` that is passed to ``pca.calc_pca``.
    """
    data_test_ke = data_test[int(index_test)]
    pca = PCA(data_train)
    # The returned pair is not used below; the call is kept as in the original.
    # NOTE(review): presumably calc_pca also populates pca.bobot_train and
    # pca.matrix_proyeksi, which are read next - confirm.
    orang, pose = pca.calc_pca(data_test_ke)
    Input_LDA = {}
    Input_LDA['bobot'] = pca.bobot_train
    Input_LDA['proyeksi'] = pca.matrix_proyeksi
    # total number of classes (40), not poses
    Input_LDA['jumlah_kelas'] = ORL_face.data.shape[0]
    # total number of training poses
    Input_LDA['jumlah_pose'] = len(ORL_face.list_data_train)
    Input_LDA['data_train'] = data_train
    # self.input_LDA temporarily holds this dict; it is overwritten with the
    # result of get_input_LDA a few lines further down.
    self.input_LDA = Input_LDA
    jumlah_kelas = self.input_LDA['jumlah_kelas']
    jumlah_pose_train = self.input_LDA['jumlah_pose']
    jumlah_data = jumlah_kelas * jumlah_pose_train
    self.proyeksi_pca_baru = self.get_proyeksi_pca_baru(
        self.input_LDA['proyeksi'], jumlah_data, jumlah_kelas)
    self.input_LDA = self.get_input_LDA(self.input_LDA['data_train'],
                                        self.proyeksi_pca_baru)
    self.rata_per_kelas = self.get_rata_tiap_kelas(self.input_LDA,
                                                   jumlah_kelas,
                                                   jumlah_pose_train)
    self.rata_total_kelas = self.get_rata_total_kelas(self.input_LDA)
    self.Sb = self.get_between_class_scatter(self.rata_per_kelas,
                                             self.rata_total_kelas,
                                             jumlah_kelas)
    self.Sw = self.get_within_class_scatter(self.input_LDA,
                                            self.rata_per_kelas,
                                            jumlah_data, jumlah_kelas,
                                            jumlah_pose_train)
    self.eigen_value, self.eigen_vector = self.get_eigen(self.Sb, self.Sw)
    self.descending_eigen_vector = self.descending(self.eigen_value,
                                                   self.eigen_vector)
    # Keep the first (jumlah_kelas - 1) columns, transposed.
    self.wFid = np.transpose(
        self.descending_eigen_vector[:, 0:jumlah_kelas - 1])
    self.proyeksi = self.get_proyeksi(self.wFid, self.proyeksi_pca_baru)
    self.bobot_train = self.get_bobot(data_train, self.proyeksi)
    # Debug output: shapes of every intermediate result.
    print("\nLDA", "==" * 30)
    print("proyeksi lama", self.proyeksi_pca_baru.shape)
    print("input LDA", self.input_LDA.shape)
    print("rata per kelas", self.rata_per_kelas.shape)
    print("rata semua kelas", self.rata_total_kelas.shape)
    print("Sb", self.Sb.shape)
    print("Sw", self.Sw.shape)
    print("eva", self.eigen_value.shape)
    print("eve", self.eigen_vector.shape, self.descending_eigen_vector.shape)
    print("wFid", self.wFid.shape)
    print("proyeksi", self.proyeksi.shape)
    print("bobot", self.bobot_train.shape)
def isCorrect(self, attemptFile, correctFile, transMatrixFile, standDevFile):
    """Report joints whose error exceeds five times the stored deviation.

    Prints one line per offending joint (error scaled by ``180 / np.pi``), or
    ``true`` when none is found. Nothing is returned.

    Args:
        attemptFile: Attempt file passed to ``self.getErrors``.
        correctFile: Reference file passed to ``self.getErrors``.
        transMatrixFile: File read with ``PCA().readMatrix``.
        standDevFile: File read with ``PCA().readVector``.
    """
    errors = self.getErrors(correctFile, attemptFile)

    # transform back into array of joint angles
    # NOTE(review): assuming readMatrix returns a NumPy array, `1 / matrix`
    # is an element-wise reciprocal of the transpose, not a matrix inverse -
    # confirm that this is the intended back-transformation.
    transposed = PCA().readMatrix(transMatrixFile).transpose()
    invTransMatrix = 1 / transposed
    jointErrors = np.dot(invTransMatrix, np.array(errors))

    # read in standard deviation file to get joint angle error bounds
    sdVector = PCA().readVector(standDevFile)

    # find and report joint angle errors above the "acceptable" threshold
    bad_joint_count = 0
    for joint, joint_error in enumerate(jointErrors):
        if joint_error > sdVector[joint] * 5:
            print(indexJoints.get(joint) + ": " +
                  str(joint_error * 180 / np.pi))
            bad_joint_count += 1

    if not bad_joint_count:
        print("true")
def get_descriptors(img, imageName, database):
    """Compute PCA-reduced ORB descriptors per image block and store them.

    The image is contrast-equalised (CLAHE), enhanced and binarised. It is
    then cut into a 2x2 grid of blocks (row offsets 0 and 200, column offsets
    0 and 137, each up to 200x137 pixels). For every block, Harris corners
    above a fixed threshold become keypoints, ORB descriptors are computed
    for them and passed through ``PCA``. The list of reduced descriptors is
    stored in ``database`` under ``imageName``.

    Args:
        img: Input image, passed to ``clahe.apply``.
        imageName: Key under which the descriptor list is stored.
        database: Dict updated in place.
    """
    equaliser = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    img = equaliser.apply(img)
    img = image_enhance.image_enhance(img)
    img = np.array(img, dtype=np.uint8)

    # Threshold
    _, img = cv2.threshold(img, 127, 255,
                           cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    # Normalize to 0 and 1 range
    img[img == 255] = 1

    # Thinning. The skeleton is computed but the blocks below are cut from
    # `img`, exactly as in the original.
    skeleton = skeletonize(img)
    skeleton = np.array(skeleton, dtype=np.uint8)
    skeleton = removedot(skeleton)

    block_height, block_width = 200, 137
    threshold_harris = 125
    DescriptorList = []
    for top in range(0, 400, block_height):
        for left in range(0, 274, block_width):
            blockImg = img[top:top + block_height, left:left + block_width]

            # Harris corners
            harris_corners = cv2.cornerHarris(blockImg, 3, 3, 0.04)
            harris_normalized = cv2.normalize(harris_corners, 0, 255,
                                              norm_type=cv2.NORM_MINMAX,
                                              dtype=cv2.CV_32FC1)

            # Extract keypoints: cv2.KeyPoint takes (x, y), i.e. (col, row).
            n_rows, n_cols = harris_normalized.shape[0], harris_normalized.shape[1]
            keypoints = [
                cv2.KeyPoint(col, row, 1)
                for row in range(0, n_rows)
                for col in range(0, n_cols)
                if harris_normalized[row][col] > threshold_harris
            ]

            # Define descriptor
            orb = cv2.ORB_create()
            # Compute descriptors
            _, des = orb.compute(blockImg, keypoints)

            Reduced_des = PCA(des)
            print(type(Reduced_des))
            print(Reduced_des.shape)
            DescriptorList.append(Reduced_des)

    database.update({imageName: DescriptorList})
def main():
    """Project the wine dataset with PCA, cluster it and report the result.

    Loads ``../input/wine.csv``, projects the features with ``PCA``, clusters
    the projection into three groups with ``KMeans``, then evaluates,
    visualises and saves the predictions. The evaluation result is printed.
    """
    dataset_dir = '../input/wine.csv'
    dim, num_class = 2, 3

    train_x, train_y, raw_data = data_loader(dataset_dir)

    reducer = PCA(first_k=dim, use_threshold=False, threshold=0.5)
    proj = reducer.fit(train_x)

    clusterer = KMeans(K=num_class)
    center, predict_y = clusterer.fit(proj)

    result = evaluate(proj, train_y, predict_y, k=num_class)
    visualization(center, proj, predict_y, dim)
    save_to_csv(raw_data, predict_y)
    print(result)
def analysisPCA(cryo_data, normalize=True):
    """Plot the custom PCA next to scikit-learn's PCA on the cryo dataset.

    The last column of ``cryo_data`` is passed to the plots as the label
    column; all other columns are the features.

    Args:
        cryo_data: DataFrame holding features plus a trailing label column.
        normalize: Forwarded to the custom ``PCA``; for the scikit-learn run
            it decides whether the features go through ``sklearn_SS`` first.
    """
    ### Get results on my own PCA on this dataset
    custom_projection = PCA(cryo_data, normalize=normalize)
    plotResults_2D(custom_projection, cryo_data.iloc[:,-1], 'Custom PCA Results on cryo Dataset - Normalized = '+str(normalize))

    ### Get results to compare to using the sklearn version of PCA on this dataset
    pca = sklearn_PCA(n_components=2)
    features = cryo_data.iloc[:,:-1]
    if normalize:
        features = sklearn_SS().fit_transform(features)
    sklearn_new_data = pca.fit_transform(features)
    plotResults_2D(pd.DataFrame(sklearn_new_data), cryo_data.iloc[:,-1], 'Sklearn PCA Results on cryo Dataset - Normalized = '+str(normalize))
def analysisEM_GMM(cryo_data, use_PCA=True, normalize=True, title="EM_GMM Results"):
    """Reduce the cryo dataset with PCA or LDA and run EM_GMM on the result.

    Args:
        cryo_data: DataFrame whose last column is passed to ``EM_GMM`` as the
            label column.
        use_PCA: Use the custom ``PCA`` when true, otherwise ``LDA`` with
            ``user_dims=2``.
        normalize: Forwarded to the chosen reducer.
        title: Forwarded to ``EM_GMM``.
    """
    ### Define a seed that doesn't push two Gaussians right next to each other
    np.random.seed(1)

    ### Reduce dimensionality to 2D
    if use_PCA:
        reduced = PCA(cryo_data, normalize=normalize)
    else:
        reduced = LDA(cryo_data, user_dims=2, normalize=normalize)

    ### Run the EM_GMM algorithm to attempt to classify our data points
    labels = cryo_data.iloc[:,-1]
    EM_GMM(reduced, labels, 2, max_iters=10, title=title)
def pca_data(train_x, test_x, fold):
    """Transform a train/test split with ``PCA().process`` and cast to float.

    Args:
        train_x: Training features handed to ``PCA.process``.
        test_x: Test features handed to ``PCA.process``.
        fold: Fold identifier; not used by the current implementation.

    Returns:
        tuple: ``(train_x, test_x)`` as new float arrays.
    """
    pca = PCA()
    train_x, test_x = pca.process(train_x, test_x)
    # The np.float alias was removed in NumPy 1.24; it was just the builtin
    # float, so the resulting dtype is unchanged.
    train_x = train_x.astype(float, copy=True)
    test_x = test_x.astype(float, copy=True)
    return train_x, test_x
def computePCs(self):
    """Pick the smallest number of leading components reaching a 95% share.

    Walks the ``compare`` values returned by ``PCA(self.X).SVDdecompose()``
    and stops at the first position where their running sum reaches 95% of
    the total.

    Sets:
        self.kcount: Number of entries in ``compare`` (maximum number of
            principal components).
        self.k: Number of components selected; only assigned once the
            threshold is reached.
    """
    # Determine the principal components
    Percentage = 0.95
    pca = PCA(self.X)
    U, S, V, compare = pca.SVDdecompose()
    self.kcount = len(compare)  # record the maximum number of components

    comSum = 0
    cSum = sum(compare)
    # Use the position directly. The original recovered it with
    # np.where(compare == value)[0][-1], which returns the LAST occurrence
    # and therefore overshoots when the crossing value is repeated.
    for position, value in enumerate(compare):
        comSum += value
        if comSum / cSum >= Percentage:
            # record the best number of components meeting the condition
            self.k = position + 1
            break
def recommend_songs(self, songnumber):
    """Find the 20 nearest neighbours of a seed song in the reduced space.

    ``self.X`` is reduced with ``PCA`` when the module-level ``use_pca`` flag
    is set, otherwise with ``LEM`` when ``use_lem`` is set. When neither flag
    is set, ``self.transformed`` is left untouched.

    Args:
        songnumber: Index of the seed song in the reduced data.

    Sets:
        self.transformed, self.p (the seed point) and self.idx (first element
        of the ``kNN`` result).
    """
    # Get reduced (2 dimensions) data using PCA or LEM
    reducer = PCA if use_pca else (LEM if use_lem else None)
    if reducer is not None:
        self.transformed = reducer(self.X)

    # Get seed data point
    seed_point = self.transformed[songnumber]
    self.p = seed_point

    # Get 20 nearest neighbors of seed
    neighbours = kNN(self.transformed, self.p, 20)
    self.idx = neighbours[0]
def test_pca(n, m):
    """Plot scikit-learn PCA against the homemade PCA on random data.

    Draws an ``n`` x ``m`` grid; every cell uses a fresh random 10x100 matrix
    and shows the 2-D scikit-learn projection in blue and the homemade one in
    red.

    Args:
        n: Number of subplot rows.
        m: Number of subplot columns.
    """
    for cell in tqdm(range(n * m)):
        # make some toy data for random test: [10, 100]
        test_data = np.random.rand(10, 100)

        # reference projection: [10, 2]
        reference_model = decomposition.PCA(n_components=2)
        new_data = reference_model.fit_transform(test_data)
        # homemade projection: [10, 2]
        new_data_homemade = PCA(test_data, 2)

        plt.subplot(n, m, cell + 1)
        for points, colour in ((new_data, 'blue'), (new_data_homemade, 'red')):
            plt.scatter(points[:, 0], points[:, 1], c=colour)
    plt.show()
def test_pca2(self):
    """Show face images side by side before and after PCA.

    Loads the faces, runs ``PCA`` on the flattened images and draws, for each
    index from ``START_INDEX`` to ``END_INDEX - 1``, the original image in the
    left column and the PCA output in the right column.
    """
    pic_num = END_INDEX - START_INDEX
    images = loadFace()
    print(images)
    m, n = images[0].shape
    images_in = images.copy()

    # Dimensionality reduction across different images; per the original
    # author this makes the images look alike.
    flattened = [image.reshape(-1) for image in images_in]
    pca = PCA(np.array([flattened]))
    pca_ims = pca.ret()

    for row, i in enumerate(range(START_INDEX, END_INDEX)):
        before_pca = Image.fromarray(images[i])
        after_pca = Image.fromarray(pca_ims[0][i].reshape(m, n))
        fig = plt.figure('pca')
        # Two columns per row: odd slot for the original, even for the result.
        left_ax = fig.add_subplot(pic_num, 2, 2 * row + 1)
        left_ax.imshow(before_pca, cmap='gray', vmin=0, vmax=255)
        right_ax = fig.add_subplot(pic_num, 2, 2 * row + 2)
        right_ax.imshow(after_pca, cmap='gray', vmin=0, vmax=255)
    plt.show()