def getCxtSubspace(wl, dim, var_threshold=0.45):
    # Collect embeddings for the words that exist in the vector dictionary.
    emb = []
    for word in wl:
        if word not in vecDict:
            print("non-exist:", word)
            continue
        emb.append(vecDict[word])
    emb = np.array(emb)
    # First fit: count how many components are needed to reach the
    # explained-variance threshold.
    pca = PCA()
    pca.fit(emb)
    varList = pca.explained_variance_ratio_
    cand = 0
    varSum = 0
    for var in varList:
        varSum += var
        cand += 1
        if varSum >= var_threshold:
            break
    # Second fit: keep exactly that many components as the context subspace.
    pca = PCA(n_components=cand)
    pca.fit(emb)
    top_embed = pca.components_
    print("dim:", len(top_embed.tolist()), cand)
    return top_embed.tolist()
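# Usage sketch for getCxtSubspace (illustrative only): the function reads
# `vecDict` as a module-level global, so it is stubbed here with random
# 50-dimensional embeddings.
vecDict = {w: np.random.randn(50) for w in ("king", "queen", "crown")}
subspace = getCxtSubspace(["king", "queen", "crown"], dim=50)
# `subspace` spans the top principal directions covering >= 45% of variance.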
def hack_pca(filename):
    '''
    Input: filename -- input image file name/path
    Output: img -- image without rotation
    '''
    # YOUR CODE HERE
    # begin answer
    img_r = (plt.imread(filename)).astype(np.float64)
    # Collect coordinates of bright foreground pixels (alpha > 0).
    data = []
    for i in range(img_r.shape[0]):
        for j in range(img_r.shape[1]):
            if img_r[i][j][3] > 0:
                # Integer luminance approximation of the RGB value.
                tt = (img_r[i][j][0] * 299 + img_r[i][j][1] * 587
                      + img_r[i][j][2] * 114 + 500) / 1000
                if tt > 150:
                    data.append([i, j])
    data = np.array(data)
    # Project the coordinates onto the principal axes to undo the rotation.
    w, _ = PCA(data)
    result = np.dot(data, w)
    # Normalize the projected coordinates into a 100 x 100 canvas.
    result -= np.min(result, axis=0)
    result_max = np.max(result, axis=0)
    result[:, 0] = (result[:, 0] / result_max[0]) * 99
    result[:, 1] = (result[:, 1] / result_max[1]) * 99
    new_img_r = np.zeros((100, 100, 3))
    for i in range(result.shape[0]):
        new_img_r[-int(result[i][1])][-int(result[i][0])] = [100, 100, 100]
    return new_img_r
def main():
    # Load dataset
    data = datasets.load_iris()

    X = normalize(data.data[data.target != 0])
    y = data.target[data.target != 0]
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, seed=1)

    clf = LogisticRegression(gradient_descent=True)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    pca = PCA()
    pca.plot_in_2d(X_test, y_pred, title="Logistic Regression", accuracy=accuracy)
def getCentVec(self, contextVecs):
    sample, rank, dim = contextVecs.shape
    contexts = np.reshape(contextVecs, (sample * rank, dim))
    pca = PCA(n_components=1)
    pca.fit(contexts)
    return pca.components_[0]
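# Usage sketch for getCentVec (illustrative only; `model` stands in for an
# instance of the surrounding class): a (samples, rank, dim) stack of context
# vectors is collapsed to its first principal direction.
# cent = model.getCentVec(np.random.randn(100, 5, 300))   # cent.shape == (300,)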
def pic_handle(img_data, sd, ori_size):
    """
    Apply PCA, reconstruct back to the original dimensionality, display the
    result, then report the peak signal-to-noise ratio (PSNR).
    """
    Pca = PCA(sd, img_data)
    c_data, w_star = Pca.pca()  # PCA dimensionality reduction; get the projection matrix
    w_star = np.real(w_star)
    print(w_star)
    new_data = w_star * w_star.T * c_data + Pca.mean  # reconstruct to the original dimensionality
    total_img = []  # stitch the images together horizontally
    for i in range(Pca.data_size):
        if len(total_img) == 0:
            total_img = new_data[:, i].T.reshape(ori_size)
        else:
            total_img = np.hstack(
                [total_img, new_data[:, i].T.reshape(ori_size)])
    # Compute the PSNR of each reconstruction
    print('PSNR:')
    for i in range(Pca.data_size):
        a = psnr(np.array(img_data[:, i].T), np.array(new_data[:, i].T))
        print('Image', i, 'PSNR:', a, 'dB')
    # Write and display the stitched image
    total_img = np.array(total_img).astype(np.uint8)
    cv2.imwrite('pca image.jpg', total_img)
    cv2.imshow('pca image', total_img)
    cv2.waitKey(0)
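# Usage sketch for pic_handle (illustrative only): `img_data` is assumed to be
# a (pixels x n_images) matrix of flattened images, `sd` the reduced PCA
# dimensionality, and `ori_size` the (height, width) of one original image.
# pic_handle(img_data, sd=30, ori_size=(64, 64))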
def compress(self, X):
    n = X.shape[0]

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)

    # Construct nearest neighbour graph
    G = np.zeros([n, n])
    for i in range(n):
        neighbours = np.argsort(D[i])[:self.nn + 1]
        for j in neighbours:
            G[i, j] = D[i, j]
            G[j, i] = D[j, i]

    # Compute ISOMAP distances
    D = utils.dijkstra(G)

    # If two points are disconnected (distance is Inf)
    # then set their distance to the maximum
    # distance in the graph, to encourage them to be far apart.
    D[np.isinf(D)] = D[~np.isinf(D)].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    Z = pca.compress(X)

    # Solve for the minimizer
    z, f = findMin(self._fun_obj_z, Z.flatten(), 500, D)
    Z = z.reshape(n, self.k)
    return Z
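# Usage sketch (illustrative only; assumes an ISOMAP-style class wrapping the
# compress method above, constructed with k output dimensions and nn
# neighbours):
# model = ISOMAP(k=2, nn=3)
# Z = model.compress(X)   # (n x 2) embedding of the n points in X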
def compress(self, X):
    n = X.shape[0]

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)

    # TODO:
    D = self.construct_dist_graph(X, D)

    # If two points are disconnected (distance is Inf)
    # then set their distance to the maximum
    # distance in the graph, to encourage them to be far apart.
    D[np.isinf(D)] = D[~np.isinf(D)].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    Z = pca.compress(X)

    # Solve for the minimizer
    z, f = findMin(self._fun_obj_z, Z.flatten(), 500, D)
    Z = z.reshape(n, self.k)
    return Z
def faceLoader() -> None:
    '''
    Face loader and visualizer example code
    '''
    gall = importGallery()
    print(gall.shape)
    gall = gall[:, :10]
    print(gall.shape)

    # Show first image
    plt.figure(0)
    plt.title('First face')
    n = 0
    nComponents = 10
    pca = PCA(nComponents)
    face = gall[:, :1]
    print(face.shape)
    # face = face.reshape(24576, 1)
    # print(face.shape)
    mu, U, C, data = pca.train(gall)
    alpha = pca.to_pca(data)
    # print(alpha.shape)
    # faceId = gall.item(n)[0][0]
    # print('Face got face id: {}'.format(faceId))
    face = alpha[:, :1]
    print(face.shape)
    face = face.reshape(192, 128)
    plt.imshow(face, cmap='gray')
    plt.show()
def hack_pca(filename):
    '''
    Input: filename -- input image file name/path
    Output: img -- image without rotation
    '''
    img_r = (plt.imread(filename)).astype(np.float64)  # 4 channels: R,G,B,A
    img_gray = img_r[:, :, 0] * 0.3 + img_r[:, :, 1] * 0.59 + img_r[:, :, 2] * 0.11
    X_int = np.array(np.where(img_gray > 0))
    X = X_int.astype(np.float64)
    D, N = X.shape
    eigen_vec, eigen_val = PCA(X)
    print(eigen_vec, eigen_val)
    Y = np.matmul(X.T, eigen_vec).T
    Y_int = Y.astype(np.int32)
    dmin = np.min(Y_int, axis=1).reshape(D, 1)
    Y_int = Y_int - dmin
    bound = np.max(Y_int, axis=1) + 1
    new_img = np.zeros(bound)
    for t in range(Y_int.shape[1]):
        new_img[tuple(Y_int[:, t])] = img_gray[tuple(X_int[:, t])]
    new_img = new_img.T[::-1, ::-1]
    return new_img
def plot_iris(y, y_classes, maxit=25, *args, **kwargs):
    # np.random.seed(0)
    fig, ax = plot_grid(5)

    # Variational Bayes
    vbpca = VBPCA(y, *args, **kwargs)
    for i in range(maxit):
        vbpca.update()
    plot_scatter(vbpca.transform(), y_classes, ax[0])
    ax[0].set_title('VBPCA')

    # Laplace approximation
    lbpca = LBPCA(y.T)
    lbpca.fit(maxit)
    plot_scatter(lbpca.transform(2).T, y_classes, ax[1])
    ax[1].set_title('LBPCA')

    # Streaming LBPCA
    stream = create_distributed(np.copy(y.T), 10)
    stream.randomized_fit(1)
    plot_scatter(stream.transform(y.T, 2).T, y_classes, ax[2])
    ax[2].set_title('Batch BPCA')

    # Distributed LBPCA
    stream = create_distributed(np.copy(y.T), 10)
    stream.averaged_fit(maxit)
    plot_scatter(stream.transform(y.T, 2).T, y_classes, ax[3])
    ax[3].set_title('Parallel BPCA')

    # PCA
    pca = PCA(y.T)
    plot_scatter(pca.fit_transform().T, y_classes, ax[4])
    ax[4].set_title('PCA')
    plt.show()
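# Usage sketch for plot_iris (illustrative only): `y` is assumed to be a
# (features x samples) iris data matrix, given that VBPCA consumes it directly
# while the other models receive y.T.
# iris = datasets.load_iris()
# plot_iris(iris.data.T, iris.target, maxit=25)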
def toyExample():
    mat = scipy.io.loadmat('../data/toy_data.mat')
    data = mat['toy_data']

    # TODO: Train PCA
    pca = PCA(-1)
    pca.train(data)

    print("Variance of the data")
    # TODO 1.2: Compare the data variance to the S vector computed by the PCA
    data_variance = np.var(data, axis=1)
    print(data_variance)
    print(np.power(pca.S, 2) / data.shape[1])

    # TODO 1.3: Compare the variance of the data projected into 1D to the S vector computed by the PCA
    Xout = pca.project(data, 1)
    print("Variance of the projected data")
    data_variance = np.var(Xout, axis=1)
    print(data_variance)
    print(np.power(pca.S[0], 2) / data.shape[1])

    plt.figure()
    plt.title('PCA plot')
    plt.subplot(1, 2, 1)
    # Visualize given data and principal components
    # TODO 1.1: Plot original data (hint: use the plot_pca function)
    pca.plot_pca(data)
    plt.subplot(1, 2, 2)
    # TODO 1.3: Plot data projected into 1 dimension
    pca.S[1] = 0
    pca.plot_pca(Xout)
    plt.show()
def compress(self, X):
    n = X.shape[0]

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)  # D is a symmetric matrix

    geoD = np.zeros((n, n))

    # find the nn nearest neighbours of each point
    for i in range(n):
        sort = np.argsort(D[:, i])
        neigh = np.setdiff1d(sort[0:self.nn + 1], i)  # the nn + 1 smallest indices that are not i
        for j in range(len(neigh)):
            t = neigh[j]
            geoD[i, t] = D[i, t]
            geoD[t, i] = D[t, i]

    D = utils.dijkstra(geoD)
    # For disconnected vertices (distance is Inf), set their distance to the
    # maximum distance in the graph to encourage them to be far apart.
    D[np.isinf(D)] = D[~np.isinf(D)].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    Z = pca.compress(X)

    # Solve for the minimizer
    z, f = findMin(self._fun_obj_z, Z.flatten(), 500, D)
    Z = z.reshape(n, self.k)
    return Z
def get_update7(id):
    entry_to_update = PCA_table.query.get_or_404(id)
    if request.method == 'POST':
        entry_to_update.clump = float(request.form['clump'])
        entry_to_update.unifsize = float(request.form['unifsize'])
        entry_to_update.unifshape = float(request.form['unifshape'])
        entry_to_update.margadh = float(request.form['margadh'])
        entry_to_update.singepisize = float(request.form['singepisize'])
        entry_to_update.barenuc = float(request.form['barenuc'])
        entry_to_update.blandchrom = float(request.form['blandchrom'])
        entry_to_update.normnucl = float(request.form['normnucl'])
        entry_to_update.mit = float(request.form['mit'])
        payload = [
            entry_to_update.clump, entry_to_update.unifsize,
            entry_to_update.unifshape, entry_to_update.margadh,
            entry_to_update.singepisize, entry_to_update.barenuc,
            entry_to_update.blandchrom, entry_to_update.normnucl,
            entry_to_update.mit
        ]
        payload = np.array(payload)
        entry_to_update.cell_class = PCA(payload)
        try:
            db.session.commit()
            return redirect('/PCA')
        except Exception:
            return 'Contact DBA'
    else:
        return render_template('PCA_update.html', entry_to_update=entry_to_update)
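# Routing sketch (illustrative only): this handler reads request.method, so in
# the full app it is presumably registered with something like
# @app.route('/update7/<int:id>', methods=['GET', 'POST'])
# The exact rule is not shown in this excerpt.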
def hullselect(self):

    def selectHullPoints(data, n=20):
        """ select data points for pairwise projections of the first n
        dimensions """

        # iterate over all projections and select data points
        idx = np.array([])

        # iterate over some pairwise combinations of dimensions
        for i in combinations(range(n), 2):
            # sample convex hull points in 2D projection
            convex_hull_d = quickhull(data[i, :].T)

            # get indices for convex hull data points
            idx = np.append(idx, dist.vq(data[i, :], convex_hull_d.T))
            idx = np.unique(idx)

        return np.int32(idx)

    # determine convex hull data points only if the total
    # amount of available data is >50
    # if self.data.shape[1] > 50:
    pcamodel = PCA(self.data, show_progress=self._show_progress)
    pcamodel.factorize()

    idx = selectHullPoints(pcamodel.H, n=self._base_sel)

    # set the number of subsampled data
    self.nsub = len(idx)

    return idx
def compress(self, X):
    n = X.shape[0]
    k = self.k
    K = self.K

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)

    # Keep only the K nearest neighbours of each point in the graph
    nbrs = np.argsort(D, axis=1)[:, 1:K + 1]
    G = np.zeros((n, n))
    for i in range(n):
        for j in nbrs[i]:
            G[i, j] = D[i, j]
            G[j, i] = D[j, i]

    D = utils.dijkstra(G)

    # Replace infinite (disconnected) distances with the maximum finite distance
    D[D == np.inf] = -np.inf
    max_dist = np.max(D)
    D[D == -np.inf] = max_dist

    # Initialize low-dimensional representation with PCA
    Z = PCA(k).fit(X).compress(X)

    # Solve for the minimizer
    z = find_min(self._fun_obj_z, Z.flatten(), 500, False, D)
    Z = z.reshape(n, k)
    return Z
def hack_pca(filename):
    '''
    Input: filename -- input image file name/path
    Output: img -- image without rotation
    '''
    img_r = (plt.imread(filename)).astype(np.float64) / 255
    img_r = rgb2gray(img_r)
    plt.imshow(img_r, cmap='gray')
    plt.show()
    m, n = img_r.shape
    xy = []
    xyv = []
    for i in range(m):
        for j in range(n):
            if img_r[i, j] > 0:
                xy.append((i, j))
                xyv.append((i, j, img_r[i, j]))
    xy = np.array(xy)
    vector, value = PCA(xy)
    d = np.array(np.round(np.dot(xy, vector))).astype(int)
    min_xy = np.min(d, axis=0)
    d -= min_xy
    max_xy = np.max(d, axis=0)
    img = np.zeros((max_xy[1] + 1, max_xy[0] + 1))
    for i in range(xy.shape[0]):
        img[max_xy[1] - d[i, 1], max_xy[0] - d[i, 0]] = xyv[i][2]
    plt.imshow(img, cmap='gray')
    plt.show()
    return img
def compress(self, X):
    n = X.shape[0]

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)

    sorted_indices = np.argsort(D)
    G = np.zeros((n, n))
    for i in range(D.shape[0]):
        for j in range(self.nn + 1):
            G[i, sorted_indices[i, j]] = D[i, sorted_indices[i, j]]
            G[sorted_indices[i, j], i] = D[sorted_indices[i, j], i]

    dist = utils.dijkstra(G)
    dist[np.isinf(dist)] = dist[~np.isinf(dist)].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    Z = pca.compress(X)

    # Solve for the minimizer
    z, f = findMin(self._fun_obj_z, Z.flatten(), 500, dist)
    Z = z.reshape(n, self.k)
    return Z
def __init__(self, img_dataset: np.ndarray):
    """
    :param img_dataset: An image dataset, which is a matrix with the shape of
        (N x H x W), where:
        - N: number of images
        - H: height of images
        - W: width of images
        - each item of the matrix is a real value between 0 and 1
    Notes: All images should have the same width and height
    """
    # Get the shape of the input data
    super().__init__()
    assert len(img_dataset.shape) == 3
    self._n_samples, self._height, self._width = img_dataset.shape
    self.logger.info({
        'msg': 'Image dataset shape',
        'shape': img_dataset.shape
    })
    self._n_features = self._height * self._width
    # Flatten the images of shape (height, width) to vectors of length height x width
    self._flatten_dataset = img_dataset.reshape(
        (self._n_samples, self._n_features))
    # Build the PCA transformer
    self._pca_transformer = PCA(self._flatten_dataset)
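# Construction sketch (illustrative only; the surrounding class name is not
# shown in this excerpt, so `EigenImageModel` is a stand-in):
# dataset = np.random.rand(100, 32, 32)   # 100 images of 32x32, values in [0, 1]
# model = EigenImageModel(dataset)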
def hack_pca(filename, threshold=0.6):
    '''
    Input: filename -- input image file name/path
    Output: img -- image without rotation
    '''
    img_r = (plt.imread(filename)).astype(np.float64) / 255
    # YOUR CODE HERE
    img = img_r[:, :, 0] * 0.299 + img_r[:, :, 1] * 0.587 + img_r[:, :, 2] * 0.114
    H, W = img.shape
    data = []
    for i in range(H):      # x axis
        for j in range(W):  # y axis
            if img[i, j] >= threshold:
                data.append([i, j])
    data = np.array(data)
    N = data.shape[0]
    eigvectors, eigvalues = PCA(data)
    (vx, vy) = eigvectors[:, 0]
    (vx, vy) = (vx, vy) if vy >= 0 else (-vx, -vy)
    theta = -math.asin(-vx) * 180 / math.pi
    R = np.array([[vy, vx], [-vx, vy]])  # rotation matrix
    odata = np.matmul(data, R)
    odata -= np.min(odata, axis=0)
    odata = odata.astype(int)
    nH, nW = np.max(odata, axis=0)
    oimg = np.zeros((nH + 1, nW + 1))
    for i in range(N):
        oimg[odata[i, 0], odata[i, 1]] = 1.
    return img, oimg, theta
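# Usage sketch (illustrative only; the path is a placeholder):
# img, oimg, theta = hack_pca('rotated.png', threshold=0.6)
# `img` is the grayscale input, `oimg` the de-rotated binary image, and
# `theta` the estimated rotation angle in degrees.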
def compress(self, X):
    n = X.shape[0]
    # nearest_neighbours = np.zeros((n, self.nn))

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)

    # Build the weighted adjacency matrix over each point's nearest neighbours
    adjacency_matrix = np.zeros((n, n))
    nearest_neighbours = self.knn(X)
    for i, j in enumerate(nearest_neighbours):
        for neighbour in j:
            adjacency_matrix[i, neighbour] = D[i, neighbour]
            adjacency_matrix[neighbour, i] = D[neighbour, i]

    # If two points are disconnected (distance is Inf)
    # then set their distance to the maximum
    # distance in the graph, to encourage them to be far apart.
    dijkstra = utils.dijkstra(adjacency_matrix)
    dijkstra[np.isinf(dijkstra)] = dijkstra[~np.isinf(dijkstra)].max()

    # Initialize low-dimensional representation with PCA
    Z = PCA(self.k).fit(X).compress(X)

    # Solve for the minimizer
    z = find_min(self._fun_obj_z, Z.flatten(), 500, False, dijkstra)
    Z = z.reshape(n, self.k)
    return Z
def main():
    # Reduce the dimensionality of the data to two dimensions and plot the results.
    data = datasets.load_digits()
    X = data.data
    y = data.target

    # Project the data onto the 2 primary principal components
    X_trans = PCA().transform(X, 2)

    x1 = X_trans[:, 0]
    x2 = X_trans[:, 1]

    cmap = plt.get_cmap('viridis')
    colors = [cmap(i) for i in np.linspace(0, 1, len(np.unique(y)))]

    class_distr = []
    # Plot the different class distributions
    for i, l in enumerate(np.unique(y)):
        _x1 = x1[y == l]
        _x2 = x2[y == l]
        _y = y[y == l]
        class_distr.append(plt.scatter(_x1, _x2, color=colors[i]))

    plt.legend(class_distr, y, loc=1)
    plt.suptitle("PCA Dimensionality Reduction")
    plt.title("Digit Dataset")
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()
def train_models():
    images, labels, labels_dic = collect_dat_set()
    rec_eig = PCA(500, 5)
    if images:
        rec_eig.train(images, labels)
    return rec_eig, labels_dic
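# Usage sketch (illustrative only):
# recognizer, labels_dic = train_models()
# Note that the recognizer is only trained when collect_dat_set() returned a
# non-empty image list.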
def test_pca(self):
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    pca = PCA(n_comp=2)
    pca.fit(X)
    self.assertTrue(
        np.allclose(pca.explained_variance, np.array([0.9924, 0.0075]),
                    atol=1e-3))
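# Cross-check sketch (illustrative only): the same six points are the
# canonical scikit-learn PCA example, whose explained-variance ratios are
# approximately [0.9924, 0.0075], matching the assertion above.
# from sklearn.decomposition import PCA as SkPCA
# print(SkPCA(n_components=2).fit(X).explained_variance_ratio_)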
def testePCA(self):
    pca = PCA()
    matrizX = Matrizx()
    idModelo = self.txtIdModelo.get()
    matrizPrincipal = matrizX.selectMatrizXModeloMMM(idModelo)
    pca.testePCA(matrizPrincipal)
def test_transform_inverse_transform(self):
    X = self.data(500)
    pca = PCA(X)
    x = pca.transform(X, ndims=3)
    self.assertTrue(
        np.allclose(np.cov(x.T), np.eye(3), atol=1e-4, rtol=1e-2))
    X_ = pca.inverse_transform(x)
    self.assertTrue(np.allclose(X, X_))
def toyExample() -> None:
    ## Toy Data Set
    mat = scipy.io.loadmat('../data/toy_data.mat')
    data = mat['toy_data']
    data = importGallery()
    ## limit data for testing purposes
    data = data[:, :144].T
    print(data.shape)

    ## Iris dataset. Just for testing purposes
    #iris = datasets.load_iris()
    #data = iris['data'].astype(np.float32)  # a 150x4 matrix with features
    #data = data.T

    # TODO: Train PCA
    nComponents = 25
    pca = PCA(nComponents)

    ## 1.1 Calculate PCA manually. SVD is following
    #pca.pca_manuel(data)

    ## 1.2 Calculate PCA via SVD
    mu, U, C, dataCenter = pca.train(data)

    ## 2. Transform RAW data using first n principal components
    alpha = pca.to_pca(dataCenter)

    ## 3. Backtransform alpha to Raw data
    Xout = pca.from_pca(alpha)

    print("Variance")
    # TODO 1.2: Compare the data variance to the eigenvalue vector computed by the PCA
    print(f'Total Variance: {np.var(data)}')
    print(f'Eigenvalues: {C} \n')

    # TODO 1.3: Compare the variance of the data projected into 1D to the S vector computed by the PCA
    print(f'Total Variance Transform: {np.var(alpha)}')
    print(f'Mean Eigenvalues: {np.mean(C)}')

    ## Plot only if exactly 2 components
    if nComponents == 2:
        plt.figure()
        plt.title('PCA plot')
        plt.subplot(1, 2, 1)
        # Visualize given data and principal components
        # TODO 1.1: Plot original data (hint: use the plot_pca function)
        pca.plot_pca(data)
        plt.subplot(1, 2, 2)
        # TODO 1.3: Plot data projected into 1 dimension
        pca.plot_pca(Xout)
        plt.show()
    ## Plot variances
    else:
        x = np.arange(1, len(C) + 1)
        plt.bar(x, C)
        plt.show()
def generate_pca_embedding_files():
    '''
    Generate PCA embedding csv files for the experiments.
    '''
    raw = genfromtxt('digits-raw.csv', delimiter=',')
    X = raw[:, 2:]
    pca = PCA(10)
    X_new = pca.fit_transform(X)
    raw_new = hstack((raw[:, :2], X_new))
    savetxt('digits-pca-embedding.csv', raw_new, delimiter=',')
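# Usage sketch (illustrative only): expects a `digits-raw.csv` whose first two
# columns are identifiers and the rest pixel features; writes the identifiers
# plus a 10-dimensional PCA embedding to `digits-pca-embedding.csv`.
# generate_pca_embedding_files()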
def test_invariance_to_many_transforms(self):
    X = self.data(500)
    pca = PCA(X)
    x = pca.transform(X, ndims=2)  # error introduced here
    self.assertTrue(
        np.allclose(np.cov(x.T), np.eye(2), atol=1e-4, rtol=1e-2))
    X1 = pca.inverse_transform(x)
    x1 = pca.transform(X1, ndims=2)
    X2 = pca.inverse_transform(x1)
    self.assertTrue(np.allclose(X1, X2))
def main(matrix, pcomps):
    pca = PCA(pcomps, matrix)
    # get covariance matrix - saved as instance var
    pca.covariance_matrix()
    # find evecs and evals of covariance matrix
    pca.evecs_and_evals()
    # get top x principal components aka evecs corresponding to top evals
    pca.principal_components()
    # reduce image on those components
    return pca.get_evec_matrix()
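# Usage sketch (illustrative only): reduce a flattened image matrix onto its
# top 50 principal components.
# evec_matrix = main(image_matrix, pcomps=50)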
def main():
    whiten = False
    if len(sys.argv) > 1 and sys.argv[1] == '--whiten':
        whiten = True
        del sys.argv[1]

    if len(sys.argv) <= 3:
        print('Usage: %s pcaDims n_hidden learningRate' % sys.argv[0])
        sys.exit(1)

    # loads data like datasets = ((train_x, train_y), ([], None), (test_x, None))
    datasets = loadUpsonData('../data/upson_rovio_1/train_15_50000.pkl.gz',
                             '../data/upson_rovio_1/test_15_50000.pkl.gz')
    img_dim = 15  # must match actual size of training data
    print('done loading.')

    pcaDims = int(sys.argv[1])
    pca = PCA(datasets[0][0])  # train
    datasets[0][0] = pca.toPC(datasets[0][0], pcaDims, whiten=whiten)  # train
    datasets[1][0] = pca.toPC(
        datasets[1][0], pcaDims, whiten=whiten) if len(datasets[1][0]) > 0 else array([])  # valid
    datasets[2][0] = pca.toPC(datasets[2][0], pcaDims, whiten=whiten)  # test
    print('reduced by PCA to')
    print('(%d, %d, %d) %d dimensional examples in (train, valid, test)' %
          (datasets[0][0].shape[0], datasets[1][0].shape[0],
           datasets[2][0].shape[0], datasets[0][0].shape[1]))

    # plot mean and principal components
    image = Image.fromarray(
        tile_raster_images(X=pca.meanAndPc(pcaDims).T,
                           img_shape=(img_dim, img_dim),
                           tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save(os.path.join(resman.rundir, 'meanAndPc.png'))

    # plot fractional stddev in PCA dimensions
    pyplot.semilogy(pca.fracStd, 'bo-')
    if pcaDims is not None:
        pyplot.axvline(pcaDims)
    pyplot.savefig(os.path.join(resman.rundir, 'fracStd.png'))
    pyplot.clf()

    test_rbm(datasets=datasets,
             training_epochs=45,
             img_dim=img_dim,
             n_input=pcaDims if pcaDims else img_dim * img_dim,
             n_hidden=int(sys.argv[2]),
             learning_rate=float(sys.argv[3]),
             output_dir=resman.rundir,
             quickHack=False,
             visibleModel='real',
             initWfactor=.01,
             imgPlotFunction=lambda xx: pca.fromPC(xx, unwhiten=whiten))
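# Invocation sketch (illustrative only; values are placeholders):
# python main.py 100 200 0.002           # pcaDims=100, n_hidden=200, learningRate=0.002
# python main.py --whiten 100 200 0.002  # same, with PCA whitening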