def compress(self, X):
    n = X.shape[0]
    # nearest_neighbours = np.zeros((n, self.nn))

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)

    # Construct the nearest-neighbour graph
    adjacency_matrix = np.zeros((n, n))
    nearest_neighbours = self.knn(X)
    for i, j in enumerate(nearest_neighbours):
        for neighbour in j:
            adjacency_matrix[i, neighbour] = D[i, neighbour]
            adjacency_matrix[neighbour, i] = D[neighbour, i]

    # Compute geodesic (shortest-path) distances
    dijkstra = utils.dijkstra(adjacency_matrix)

    # If two points are disconnected (distance is Inf)
    # then set their distance to the maximum
    # distance in the graph, to encourage them to be far apart.
    dijkstra[np.isinf(dijkstra)] = dijkstra[~np.isinf(dijkstra)].max()

    # Initialize low-dimensional representation with PCA
    Z = PCA(self.k).fit(X).compress(X)

    # Solve for the minimizer
    z = find_min(self._fun_obj_z, Z.flatten(), 500, False, dijkstra)
    Z = z.reshape(n, self.k)
    return Z
def hullselect(self):

    def selectHullPoints(data, n=20):
        """ select data points for pairwise projections of the first n
        dimensions """

        # iterate over all projections and select data points
        idx = np.array([])

        # iterate over some pairwise combinations of dimensions
        for i in combinations(range(n), 2):
            # sample convex hull points in 2D projection
            convex_hull_d = quickhull(data[i, :].T)

            # get indices for convex hull data points
            idx = np.append(idx, dist.vq(data[i, :], convex_hull_d.T))
            idx = np.unique(idx)

        return np.int32(idx)

    # determine convex hull data points only if the total
    # amount of available data is >50
    #if self.data.shape[1] > 50:
    pcamodel = PCA(self.data, show_progress=self._show_progress)
    pcamodel.factorize()
    idx = selectHullPoints(pcamodel.H, n=self._base_sel)

    # set the number of subsampled data
    self.nsub = len(idx)

    return idx
def __init__(self, img_dataset: np.ndarray):
    """
    :param img_dataset: An image dataset, which is a matrix with the shape of (N x H x W), where:
        - N: number of images
        - H: height of images
        - W: width of images
        - each item of the matrix is a real value between 0 and 1

    Notes: All images should have the same width and height
    """
    # Get the shape of the input data
    super().__init__()
    assert len(img_dataset.shape) == 3
    self._n_samples, self._height, self._width = img_dataset.shape
    self.logger.info({
        'msg': 'Image dataset shape',
        'shape': img_dataset.shape
    })
    self._n_features = self._height * self._width

    # Flatten the images of shape (height, width) to vectors of length height x width
    self._flatten_dataset = img_dataset.reshape(
        (self._n_samples, self._n_features))

    # Build the PCA transformer
    self._pca_transformer = PCA(self._flatten_dataset)
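# Added usage sketch (illustrative only, not from the original source): the constructor
# above expects an (N, H, W) array of values in [0, 1] and flattens it to (N, H*W)
# before handing it to PCA. The array below is random, hypothetical data.
#
#     import numpy as np
#     images = np.random.rand(12, 8, 8)       # 12 grayscale 8x8 images in [0, 1]
#     flattened = images.reshape(12, 8 * 8)   # same flattening the constructor performs
#     assert flattened.shape == (12, 64)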
def compress(self, X):
    n = X.shape[0]

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)

    # Construct nearest neighbour graph
    G = np.zeros([n, n])
    for i in range(n):
        neighbours = np.argsort(D[i])[:self.nn + 1]
        for j in neighbours:
            G[i, j] = D[i, j]
            G[j, i] = D[j, i]

    # Compute ISOMAP distances
    D = utils.dijkstra(G)

    # If two points are disconnected (distance is Inf)
    # then set their distance to the maximum
    # distance in the graph, to encourage them to be far apart.
    D[np.isinf(D)] = D[~np.isinf(D)].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    Z = pca.compress(X)

    # Solve for the minimizer
    z, f = findMin(self._fun_obj_z, Z.flatten(), 500, D)
    Z = z.reshape(n, self.k)
    return Z
def compress(self, X):
    n = X.shape[0]

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)

    # TODO:
    D = self.construct_dist_graph(X, D)

    # If two points are disconnected (distance is Inf)
    # then set their distance to the maximum
    # distance in the graph, to encourage them to be far apart.
    D[np.isinf(D)] = D[~np.isinf(D)].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    Z = pca.compress(X)

    # Solve for the minimizer
    z, f = findMin(self._fun_obj_z, Z.flatten(), 500, D)
    Z = z.reshape(n, self.k)
    return Z
def toyExample():
    mat = scipy.io.loadmat('../data/toy_data.mat')
    data = mat['toy_data']

    # TODO: Train PCA
    pca = PCA(-1)
    pca.train(data)

    print("Variance of the data")
    # TODO 1.2: Compare the data variance to the S vector computed by the PCA
    data_variance = np.var(data, axis=1)
    print(data_variance)
    print(np.power(pca.S, 2) / data.shape[1])

    # TODO 1.3: Compare the variance of the projected data (into 1D) to the S vector computed by the PCA
    Xout = pca.project(data, 1)
    print("Variance of the projected data")
    data_variance = np.var(Xout, axis=1)
    print(data_variance)
    print(np.power(pca.S[0], 2) / data.shape[1])

    plt.figure()
    plt.title('PCA plot')
    plt.subplot(1, 2, 1)
    # Visualize given data and principal components
    # TODO 1.1: Plot original data (hint: use the plot_pca function)
    pca.plot_pca(data)

    plt.subplot(1, 2, 2)
    # TODO 1.3: Plot data projected into 1 dimension
    pca.S[1] = 0
    pca.plot_pca(Xout)
    plt.show()
def faceLoader() -> None:
    '''
    Face loader and visualizer example code
    '''
    gall = importGallery()
    print(gall.shape)
    gall = gall[:, :10]
    print(gall.shape)

    # Show first image
    plt.figure(0)
    plt.title('First face')
    n = 0
    nComponents = 10
    pca = PCA(nComponents)

    face = gall[:, :1]
    print(face.shape)
    # face = face.reshape(24576, 1)
    # print(face.shape)

    mu, U, C, data = pca.train(gall)
    alpha = pca.to_pca(data)
    # print(alpha.shape)

    # faceId = gall.item(n)[0][0]
    # print('Face got face id: {}'.format(faceId))

    face = alpha[:, :1]
    print(face.shape)
    face = face.reshape(192, 128)
    plt.imshow(face, cmap='gray')
    plt.show()
def compress(self, X):
    n = X.shape[0]

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)  # D is a symmetric matrix

    geoD = np.zeros((n, n))

    # find nn-neighbours
    for i in range(n):
        sort = np.argsort(D[:, i])
        # find the nn+1 smallest indexes that are not i
        neigh = np.setdiff1d(sort[0:self.nn + 1], i)
        for j in range(len(neigh)):
            t = neigh[j]
            geoD[i, t] = D[i, t]
            geoD[t, i] = D[t, i]

    D = utils.dijkstra(geoD)

    # for disconnected vertices (distance is Inf)
    # set their dist = max_dist(graph)
    # to encourage them to be far away from each other
    D[np.isinf(D)] = D[~np.isinf(D)].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    Z = pca.compress(X)

    # Solve for the minimizer
    z, f = findMin(self._fun_obj_z, Z.flatten(), 500, D)
    Z = z.reshape(n, self.k)
    return Z
def compress(self, X):
    n = X.shape[0]

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)

    sorted_indices = np.argsort(D)
    G = np.zeros((n, n))
    for i in range(D.shape[0]):
        for j in range(self.nn + 1):
            G[i, sorted_indices[i, j]] = D[i, sorted_indices[i, j]]
            G[sorted_indices[i, j], i] = D[sorted_indices[i, j], i]

    dist = utils.dijkstra(G)
    dist[np.isinf(dist)] = dist[~np.isinf(dist)].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    Z = pca.compress(X)

    # Solve for the minimizer
    z, f = findMin(self._fun_obj_z, Z.flatten(), 500, dist)
    Z = z.reshape(n, self.k)
    return Z
def pic_handle(img_data, sd, ori_size):
    """
    Run PCA, reconstruct back to the original dimension, display the result,
    then report the peak signal-to-noise ratio (PSNR).
    """
    Pca = PCA(sd, img_data)
    c_data, w_star = Pca.pca()  # run PCA and get the projection matrix
    w_star = np.real(w_star)
    print(w_star)
    new_data = w_star * w_star.T * c_data + Pca.mean  # reconstruct to the original dimension

    total_img = []  # concatenate the images side by side
    for i in range(Pca.data_size):
        if len(total_img) == 0:
            total_img = new_data[:, i].T.reshape(ori_size)
        else:
            total_img = np.hstack(
                [total_img, new_data[:, i].T.reshape(ori_size)])

    # compute the PSNR
    print('PSNR:')
    for i in range(Pca.data_size):
        a = psnr(np.array(img_data[:, i].T), np.array(new_data[:, i].T))
        print('image', i, 'PSNR:', a, 'dB')

    # save the image
    total_img = np.array(total_img).astype(np.uint8)
    cv2.imwrite('pca image.jpg', total_img)

    # display the image
    cv2.imshow('pca image', total_img)
    cv2.waitKey(0)
def plot_iris(y, y_classes, maxit=25, *args, **kwargs):
    # np.random.seed(0)
    fig, ax = plot_grid(5)

    # Variational Bayes
    vbpca = VBPCA(y, *args, **kwargs)
    for i in range(maxit):
        vbpca.update()
    plot_scatter(vbpca.transform(), y_classes, ax[0])
    ax[0].set_title('VBPCA')

    # Laplace approximation
    lbpca = LBPCA(y.T)
    lbpca.fit(maxit)
    plot_scatter(lbpca.transform(2).T, y_classes, ax[1])
    ax[1].set_title('LBPCA')

    # Streaming LBPCA
    stream = create_distributed(np.copy(y.T), 10)
    stream.randomized_fit(1)
    plot_scatter(stream.transform(y.T, 2).T, y_classes, ax[2])
    ax[2].set_title('Batch BPCA')

    # Distributed LBPCA
    stream = create_distributed(np.copy(y.T), 10)
    stream.averaged_fit(maxit)
    plot_scatter(stream.transform(y.T, 2).T, y_classes, ax[3])
    ax[3].set_title('Parallel BPCA')

    # PCA
    pca = PCA(y.T)
    plot_scatter(pca.fit_transform().T, y_classes, ax[4])
    ax[4].set_title('PCA')

    plt.show()
def compress(self, X):
    n = X.shape[0]
    k = self.k
    K = self.K

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)

    nbrs = np.argsort(D, axis=1)[:, 1:K + 1]
    G = np.zeros((n, n))
    for i in range(n):
        for j in nbrs[i]:
            G[i, j] = D[i, j]
            G[j, i] = D[j, i]

    D = utils.dijkstra(G)
    D[D == np.inf] = -np.inf
    max_dist = np.max(D)
    D[D == -np.inf] = max_dist

    # Initialize low-dimensional representation with PCA
    Z = PCA(k).fit(X).compress(X)

    # Solve for the minimizer
    z = find_min(self._fun_obj_z, Z.flatten(), 500, False, D)
    Z = z.reshape(n, k)
    return Z
def getCentVec(self, contextVecs):
    sample, rank, dim = contextVecs.shape
    contexts = np.reshape(contextVecs, (sample * rank, dim))
    pca = PCA(n_components=1)
    pca.fit(contexts)
    return pca.components_[0]
def main():
    # Load dataset
    data = datasets.load_iris()
    X = normalize(data.data[data.target != 0])
    y = data.target[data.target != 0]
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, seed=1)

    clf = LogisticRegression(gradient_descent=True)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    pca = PCA()
    pca.plot_in_2d(X_test, y_pred, title="Logistic Regression", accuracy=accuracy)
def testePCA(self):
    pca = PCA()
    matrizX = Matrizx()
    idModelo = self.txtIdModelo.get()
    matrizPrincipal = matrizX.selectMatrizXModeloMMM(idModelo)
    pca.testePCA(matrizPrincipal)
def test_pca(self):
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    pca = PCA(n_comp=2)
    pca.fit(X)
    self.assertEqual(
        np.allclose(pca.explained_variance,
                    np.array([0.9924, 0.0075]),
                    atol=1e-3),
        True)
def train_classifiers(all_training_idx, samples_training, color_channels):
    logging.debug("Training classifiers..")
    classifiers = []
    for channel in color_channels:
        classifier = PCA(samples_training)
        classifier.train(channel[all_training_idx])
        classifiers.append(classifier)
    return classifiers
def test_transform_inverse_transform(self):
    X = self.data(500)
    pca = PCA(X)
    x = pca.transform(X, ndims=3)
    self.assertTrue(
        np.allclose(np.cov(x.T), np.eye(3), atol=1e-4, rtol=1e-2))
    X_ = pca.inverse_transform(x)
    self.assertTrue(np.allclose(X, X_))
def train_models():
    images, labels, labels_dic = collect_dat_set()
    rec_eig = PCA(500, 5)
    if images:
        rec_eig.train(images, labels)
    return rec_eig, labels_dic
def generate_pca_embedding_files():
    '''
    Generate PCA embedding csv files for the experiments.
    '''
    raw = genfromtxt('digits-raw.csv', delimiter=',')
    X = raw[:, 2:]
    pca = PCA(10)
    X_new = pca.fit_transform(X)
    raw_new = hstack((raw[:, :2], X_new))
    savetxt('digits-pca-embedding.csv', raw_new, delimiter=',')
def main():
    random.seed(1)

    img_dim = 15   # 10, 15, ...
    datasets = loadUpsonData('../data/upson_rovio_1/train_%d_50000.pkl.gz' % img_dim,
                             '../data/upson_rovio_1/test_%d_50000.pkl.gz' % img_dim)
    print 'done loading.'

    train_set_x_data, train_set_y = datasets[0]
    pca = PCA(train_set_x_data)
    print 'done PCA.'

    image = Image.fromarray(tile_raster_images(
        X=train_set_x_data,
        img_shape=(img_dim, img_dim), tile_shape=(10, 10),
        tile_spacing=(1, 1)))
    image.save(os.path.join(resman.rundir, 'samplesData.png'))

    pyplot.figure()
    pyplot.subplot(221); pyplot.semilogy(pca.var);     pyplot.title('pca.var')
    pyplot.subplot(222); pyplot.semilogy(pca.std);     pyplot.title('pca.std')
    pyplot.subplot(223); pyplot.semilogy(pca.fracVar); pyplot.title('pca.fracVar')
    pyplot.subplot(224); pyplot.semilogy(pca.fracStd); pyplot.title('pca.fracStd')
    pyplot.savefig(os.path.join(resman.rundir, 'varstd.png'))
    pyplot.close()

    #font = ImageFont.truetype('/usr/share/fonts/truetype/ttf-lyx/cmr10.ttf', 10)
    font = ImageFont.truetype('/usr/share/texmf/fonts/opentype/public/lm/lmmono12-regular.otf', 14)

    def plotImage(xx, filename, str=None):
        arr = tile_raster_images(X=xx,
                                 img_shape=(img_dim, img_dim), tile_shape=(10, 10),
                                 tile_spacing=(1, 1))
        arrHeight = arr.shape[0]
        if str is not None:
            arr = vstack((arr, zeros((20, arr.shape[1]), dtype=arr.dtype)))
        image = Image.fromarray(arr)
        if str is not None:
            draw = ImageDraw.Draw(image)
            draw.text((2, arrHeight + 2), str, 255, font=font)
            draw = ImageDraw.Draw(image)
        image.save(os.path.join(resman.rundir, filename))

    plotImage(pca.pc().T, 'pc.png')

    for dims in [1, 2, 5, 10, 20, 50, 100, 200, 225]:
        plotImage(pca.pcaAndBack(train_set_x_data, dims),
                  'samplesPCA_%03d.png' % dims,
                  'dims=%d' % dims)

    for ee, epsilon in enumerate([0, 1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 5]):
        plotImage(pca.toZca(train_set_x_data, dims, epsilon=epsilon),
                  'samplesZCA_%03d_%02d.png' % (dims, ee),
                  'dims=%d, eps=%s' % (dims, repr(epsilon)))
def main():
    whiten = False
    if len(sys.argv) > 1 and sys.argv[1] == '--whiten':
        whiten = True
        del sys.argv[1]
    if len(sys.argv) <= 3:
        print 'Usage: %s pcaDims n_hidden learningRate' % sys.argv[0]
        sys.exit(1)

    # loads data like datasets = ((train_x, train_y), ([], None), (test_x, None))
    datasets = loadUpsonData('../data/upson_rovio_1/train_15_50000.pkl.gz',
                             '../data/upson_rovio_1/test_15_50000.pkl.gz')
    img_dim = 15   # must match actual size of training data
    print 'done loading.'

    pcaDims = int(sys.argv[1])
    pca = PCA(datasets[0][0])  # train
    datasets[0][0] = pca.toPC(datasets[0][0], pcaDims, whiten=whiten)  # train
    datasets[1][0] = pca.toPC(datasets[1][0], pcaDims, whiten=whiten) if len(datasets[1][0]) > 0 else array([])  # valid
    datasets[2][0] = pca.toPC(datasets[2][0], pcaDims, whiten=whiten)  # test
    print 'reduced by PCA to'
    print ('(%d, %d, %d) %d dimensional examples in (train, valid, test)'
           % (datasets[0][0].shape[0], datasets[1][0].shape[0],
              datasets[2][0].shape[0], datasets[0][0].shape[1]))

    # plot mean and principal components
    image = Image.fromarray(tile_raster_images(
        X=pca.meanAndPc(pcaDims).T,
        img_shape=(img_dim, img_dim), tile_shape=(10, 10),
        tile_spacing=(1, 1)))
    image.save(os.path.join(resman.rundir, 'meanAndPc.png'))

    # plot fractional stddev in PCA dimensions
    pyplot.semilogy(pca.fracStd, 'bo-')
    if pcaDims is not None:
        pyplot.axvline(pcaDims)
    pyplot.savefig(os.path.join(resman.rundir, 'fracStd.png'))
    pyplot.clf()

    test_rbm(datasets=datasets,
             training_epochs=45,
             img_dim=img_dim,
             n_input=pcaDims if pcaDims else img_dim * img_dim,
             n_hidden=int(sys.argv[2]),
             learning_rate=float(sys.argv[3]),
             output_dir=resman.rundir,
             quickHack=False,
             visibleModel='real',
             initWfactor=.01,
             imgPlotFunction=lambda xx: pca.fromPC(xx, unwhiten=whiten))
def toyExample() -> None:
    ## Toy Data Set
    mat = scipy.io.loadmat('../data/toy_data.mat')
    data = mat['toy_data']
    data = importGallery()

    ## limit data for testing purposes
    data = data[:, :144].T
    print(data.shape)

    ## Iris dataset. Just for testing purposes
    #iris = datasets.load_iris()
    #data = iris['data'].astype(np.float32)  # a 150x4 matrix with features
    #data = data.T

    # TODO: Train PCA
    nComponents = 25
    pca = PCA(nComponents)

    ## 1.1 Calculate PCA manually. SVD is following
    #pca.pca_manuel(data)

    ## 1.2 Calculate PCA via SVD
    mu, U, C, dataCenter = pca.train(data)

    ## 2. Transform RAW data using first n principal components
    alpha = pca.to_pca(dataCenter)

    ## 3. Backtransform alpha to Raw data
    Xout = pca.from_pca(alpha)

    print("Variance")
    # TODO 1.2: Compare the data variance to the eigenvalue vector computed by the PCA
    print(f'Total Variance: {np.var(data)}')
    print(f'Eigenvalues: {C} \n')

    # TODO 1.3: Compare the variance of the projected data (into 1D) to the S vector computed by the PCA
    print(f'Total Variance Transform: {np.var(alpha)}')
    print(f'Mean Eigenvalues: {np.mean(C)}')

    ## Plot only if exactly 2 components are used
    if nComponents == 2:
        plt.figure()
        plt.title('PCA plot')
        plt.subplot(1, 2, 1)
        # Visualize given data and principal components
        # TODO 1.1: Plot original data (hint: use the plot_pca function)
        pca.plot_pca(data)

        plt.subplot(1, 2, 2)
        # TODO 1.3: Plot data projected into 1 dimension
        pca.plot_pca(Xout)
        plt.show()
    ## Plot variances
    else:
        x = np.arange(1, len(C) + 1)
        plt.bar(x, C)
        plt.show()
def main():
    data = datasets.load_iris()
    X = normalize(data.data)
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, 0.3)

    knn = KNN(3)
    y_pred = knn.predict(X_test, X_train, y_train)
    accuracy = accuracy_score(y_pred, y_test)
    print("accuracy is ", accuracy)

    pca = PCA()
    pca.plot_in_2d(X_test, y_pred, title="knn", accuracy=accuracy)
def main(matrix, pcomps):
    pca = PCA(pcomps, matrix)
    # get covariance matrix - saved as instance var
    pca.covariance_matrix()
    # find evecs and evals of covariance matrix
    pca.evecs_and_evals()
    # get top x principal components aka evecs corresponding to top evals
    pca.principal_components()
    # reduce image on those components
    return pca.get_evec_matrix()
def getCxtSubspace(wl, dim, var_threshold=0.45):
    emb = []
    for word in wl:
        if (word not in vecDict):
            print "non-exist:", word
            continue
        wordEmbed = vecDict[word]
        emb.append(wordEmbed)
    emb = np.array(emb)

    # first pass: find how many components are needed to reach the variance threshold
    pca = PCA()
    pca.fit(emb)
    varList = pca.explained_variance_ratio_
    cand = 0
    varSum = 0
    for var in varList:
        varSum += var
        cand += 1
        if (varSum >= var_threshold):
            break

    # second pass: keep only that many components
    pca = PCA(n_components=cand)
    pca.fit(emb)
    top_embed = pca.components_
    print "dim:", len(top_embed.tolist()), cand
    return top_embed.tolist()
def Bonus3():
    '''
    Scatter plot of samples projected onto the first two eigenvectors.
    '''
    raw = genfromtxt('digits-raw.csv', delimiter=',')
    X = raw[:, 2:]
    pca = PCA(2)
    X_new = pca.fit_transform(X)
    perm = permutation(X.shape[0])[:1000]
    labels = array(raw[perm, 1], dtype=int)
    colors = rand(10, 3)[labels, :]
    scatter(X_new[perm, 0], X_new[perm, 1], c=colors, alpha=0.9, s=10)
    show()
def compress(self, X):
    n = X.shape[0]
    k = self.k

    # Compute Euclidean distances
    D = utils.euclidean_dist_squared(X, X)
    D = np.sqrt(D)

    # Initialize low-dimensional representation with PCA
    Z = PCA(k).fit(X).compress(X)

    # Solve for the minimizer
    z = find_min(self._fun_obj_z, Z.flatten(), 500, False, D)
    Z = z.reshape(n, k)
    return Z
def hack_pca(filename, threshold=0.6):
    '''
    Input: filename -- input image file name/path
    Output: img -- image without rotation
    '''
    img_r = (plt.imread(filename)).astype(np.float64) / 255

    # YOUR CODE HERE
    img = img_r[:, :, 0] * 0.299 + img_r[:, :, 1] * 0.587 + img_r[:, :, 2] * 0.114
    H, W = img.shape
    data = []
    for i in range(H):      # x axis
        for j in range(W):  # y axis
            if img[i, j] >= threshold:
                data.append([i, j])
    data = np.array(data)
    N = data.shape[0]

    eigvectors, eigvalues = PCA(data)
    (vx, vy) = eigvectors[:, 0]
    (vx, vy) = (vx, vy) if vy >= 0 else (-vx, -vy)
    theta = -math.asin(-vx) * 180 / math.pi
    R = np.array([[vy, vx], [-vx, vy]])  # rotation matrix

    odata = np.matmul(data, R)
    odata -= np.min(odata, axis=0)
    odata = odata.astype(int)
    nH, nW = np.max(odata, axis=0)
    oimg = np.zeros((nH + 1, nW + 1))
    for i in range(N):
        oimg[odata[i, 0], odata[i, 1]] = 1.
    return img, oimg, theta
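# Added note (illustration, not from the original source): assuming the PCA helper
# returns unit-length eigenvector columns, the matrix R = [[vy, vx], [-vx, vy]] used
# above maps the principal direction (vx, vy) onto (0, 1):
#
#     [vx, vy] @ R = [vx*vy - vy*vx, vx*vx + vy*vy] = [0, 1]
#
# so multiplying the foreground pixel coordinates by R aligns the dominant axis of
# the thresholded shape with the column axis of the output image.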
def hack_pca(filename):
    '''
    Input: filename -- input image file name/path
    Output: img -- image without rotation
    '''
    img_r = (plt.imread(filename)).astype(np.float64)  # 4 channels: R, G, B, A
    img_gray = img_r[:, :, 0] * 0.3 + img_r[:, :, 1] * 0.59 + img_r[:, :, 2] * 0.11

    X_int = np.array(np.where(img_gray > 0))
    X = X_int.astype(np.float64)
    D, N = X.shape

    eigen_vec, eigen_val = PCA(X)
    print(eigen_vec, eigen_val)

    Y = np.matmul(X.T, eigen_vec).T
    Y_int = Y.astype(np.int32)
    dmin = np.min(Y_int, axis=1).reshape(D, 1)
    Y_int = Y_int - dmin
    bound = np.max(Y_int, axis=1) + 1

    new_img = np.zeros(bound)
    for t in range(Y_int.shape[1]):
        new_img[tuple(Y_int[:, t])] = img_gray[tuple(X_int[:, t])]
    new_img = new_img.T[::-1, ::-1]
    return new_img
def hack_pca(filename):
    '''
    Input: filename -- input image file name/path
    Output: img -- image without rotation
    '''
    img_r = (plt.imread(filename)).astype(np.float64) / 255
    img_r = rgb2gray(img_r)
    plt.imshow(img_r, cmap='gray')
    plt.show()

    m, n = img_r.shape
    xy = []
    xyv = []
    for i in range(m):
        for j in range(n):
            if img_r[i, j] > 0:
                xy.append((i, j))
                xyv.append((i, j, img_r[i, j]))
    xy = np.array(xy)

    vector, value = PCA(xy)
    d = np.array(np.round(np.dot(xy, vector))).astype(int)
    min_xy = np.min(d, axis=0)
    d -= min_xy
    max_xy = np.max(d, axis=0)

    img = np.zeros((max_xy[1] + 1, max_xy[0] + 1))
    for i in range(xy.shape[0]):
        img[max_xy[1] - d[i, 1], max_xy[0] - d[i, 0]] = xyv[i][2]
    plt.imshow(img, cmap='gray')
    plt.show()
    return img
def main():
    # Reduce the dimensionality of the data to two dimensions and plot the results.
    data = datasets.load_digits()
    X = data.data
    y = data.target

    # Project the data onto the 2 primary principal components
    X_trans = PCA().transform(X, 2)

    x1 = X_trans[:, 0]
    x2 = X_trans[:, 1]

    cmap = plt.get_cmap('viridis')
    colors = [cmap(i) for i in np.linspace(0, 1, len(np.unique(y)))]

    class_distr = []
    # Plot the different class distributions
    for i, l in enumerate(np.unique(y)):
        _x1 = x1[y == l]
        _x2 = x2[y == l]
        _y = y[y == l]
        class_distr.append(plt.scatter(_x1, _x2, color=colors[i]))

    plt.legend(class_distr, y, loc=1)
    plt.suptitle("PCA Dimensionality Reduction")
    plt.title("Digit Dataset")
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()
def pcaSenEmb(sent_vecs, var_threshold=0.6):
    """
    output: basis of context space
    """
    pca = PCA()
    pca.fit(sent_vecs)
    var_list = pca.explained_variance_ratio_
    cand = 0
    var_sum = 0
    for var in var_list:
        var_sum += var
        cand += 1
        if (var_sum >= var_threshold):
            break
    basis = pca.components_
    return basis
class PCANNClassifier(object):
    def __init__(self, trials, trial_angle, **kwargs):
        if "pca" in kwargs and not kwargs["pca"]:
            self.pca = None
        else:
            # Prepare PCA only on training data:
            self.pca = PCA(np.vstack(trials), None, 0.95)

        tr = []
        tr_a = []
        for t, a in zip(trials, trial_angle):
            # Project it onto PCA space
            p = t
            if self.pca:
                p = self.pca.proj(t)
            for i in range(len(p)):
                tr.append(p[i, :])
                tr_a.append(a)

        # Prepare NN classifier
        self.nn = NearestNeighbors(n_neighbors=19)
        self.nn.fit(np.vstack(tr))
        self.class_labels = tr_a

    def classify(self, trials):
        rv = []
        for tr in trials:
            p = tr
            if self.pca:
                p = self.pca.proj(tr)
            p = np.array(p)
            votes = defaultdict(lambda: 0)
            for i in range(len(p)):
                dist, idx = self.nn.kneighbors(p[i, :])
                for j in idx[0, :]:
                    votes[self.class_labels[j]] += 1
            max_a, max_c = max(votes.iteritems(), key=lambda x: x[1])
            rv.append(max_a)
        return rv
def __init__(self, trials, bin_width, trial_angles, trial_moves):
    self.trials = trials
    self.bin_width = bin_width
    self.trial_angles = trial_angles
    self.trial_moves = trial_moves
    self.pca = PCA(np.vstack(trials), 3)

    fig = plt.figure()
    self.ax = fig.add_subplot(111, projection='3d')
    self.ax.set_xlabel('PCA 1')
    self.ax.set_ylabel('PCA 2')
    self.ax.set_zlabel('PCA 3')
def update_w(self):
    """ compute new W """

    def select_hull_points(data, n=3):
        """ select data points for pairwise projections of the first n
        dimensions """

        # iterate over all projections and select data points
        idx = np.array([])

        # iterate over some pairwise combinations of dimensions
        for i in combinations(range(n), 2):
            # sample convex hull points in 2D projection
            convex_hull_d = quickhull(data[i, :].T)

            # get indices for convex hull data points
            idx = np.append(idx, vq(data[i, :], convex_hull_d.T))
            idx = np.unique(idx)

        return np.int32(idx)

    # determine convex hull data points using either PCA or random
    # projections
    method = 'randomprojection'
    if method == 'pca':
        pcamodel = PCA(self.data)
        pcamodel.factorize(show_progress=False)
        proj = pcamodel.H
    else:
        R = np.random.randn(self._base_sel, self._data_dimension)
        proj = np.dot(R, self.data)

    self._hull_idx = select_hull_points(proj, n=self._base_sel)
    aa_mdl = AA(self.data[:, self._hull_idx], num_bases=self._num_bases)

    # determine W
    aa_mdl.factorize(niter=50, compute_h=True, compute_w=True,
                     compute_err=True, show_progress=False)

    self.W = aa_mdl.W
    self._map_w_to_data()
def __analyze(self, img):
    _data = PCA.load_image(img)
    data = self.__featrue.extract(_data)
    min = 65536
    l = None
    _l = 97
    for _d in self.__crops:
        _m = Analyzer.__compare(_d, data)
        if _m < min:
            min = _m
            l = chr(_l)
        _l += 1
    return l
class View3D(object):
    def __init__(self, trials, bin_width, trial_angles, trial_moves):
        self.trials = trials
        self.bin_width = bin_width
        self.trial_angles = trial_angles
        self.trial_moves = trial_moves
        self.pca = PCA(np.vstack(trials), 3)

        fig = plt.figure()
        self.ax = fig.add_subplot(111, projection='3d')
        self.ax.set_xlabel('PCA 1')
        self.ax.set_ylabel('PCA 2')
        self.ax.set_zlabel('PCA 3')

    def scatter_plot(self):
        for angle, trial in zip(self.trial_angles, self.trials):
            projection = self.pca.proj_whiten(trial)
            projection = np.array(projection)
            xs = np.array(projection[0])
            ys = np.array(projection[1])
            zs = np.array(projection[2])
            self.ax.scatter(xs, ys, zs, c=COLOR_MAP[angle])
        plt.show()

    def line_plot(self):
        for angle, trial in zip(self.trial_angles, self.trials):
            projection = self.pca.proj_whiten(trial)
            projection = np.array(projection)
            xs = np.array(projection[0])
            ys = np.array(projection[1])
            zs = np.array(projection[2])
            self.ax.plot(xs, ys, zs, c=COLOR_MAP[angle])
        plt.show()
def __init__(self, trials, trial_angle, **kwargs):
    if "pca" in kwargs and not kwargs["pca"]:
        self.pca = None
    else:
        # Prepare PCA only on training data:
        self.pca = PCA(np.vstack(trials), None, 0.95)

    tr = []
    tr_a = []
    for t, a in zip(trials, trial_angle):
        # Project it onto PCA space
        p = t
        if self.pca:
            p = self.pca.proj(t)
        for i in range(len(p)):
            tr.append(p[i, :])
            tr_a.append(a)

    # Prepare NN classifier
    self.nn = NearestNeighbors(n_neighbors=19)
    self.nn.fit(np.vstack(tr))
    self.class_labels = tr_a
def _choose_rule_pca(self, data, show_progress):
    """Our projections are onto the PCA vectors"""
    pca = PCA(data, num_bases=1)
    pca.factorize(show_progress)
    primary_vec = pca.W.reshape(self._data_dimension)
    return self._choose_rule_vecproject(data, primary_vec)
imlist = glob.glob("data/a_thumbs/*.jpg")
#print imlist
im = array(Image.open(imlist[0]))
#imshow(im)
#show()
m, n = im.shape[0:2]
imageCount = len(imlist)

# create matrix to store all flattened images
imageMatrix = array([array(Image.open(im)).flatten() for im in imlist], 'f')

# perform pca
V, S, immean = PCA.pca(imageMatrix)

# show some images (mean and 7 first modes)
figure()
gray()
subplot(2, 4, 1)
imshow(immean.reshape(m, n))
for i in range(7):
    subplot(2, 4, i + 2)
    # reshape the flattened mode back into image dimensions
    imshow(V[i].reshape(m, n))
    #imshow(imageMatrix[i].reshape(m,n))
#show()

f = open('font_pca_modes.pkl', 'wb')
from ResultsManager import resman
from pca import PCA

if __name__ == '__main__':
    resman.start('junk', diary=True)

    datasets = loadUpsonData('../data/upson_rovio_1/train_15_50000.pkl.gz',
                             '../data/upson_rovio_1/test_15_50000.pkl.gz')

    #meanTrain = mean(datasets[0][0])
    #stdTrain = std(datasets[0][0])
    #datasets[0][0] = (datasets[0][0] - meanTrain) / stdTrain
    #datasets[2][0] = (datasets[2][0] - meanTrain) / stdTrain

    pca = PCA(datasets[0][0])
    datasets[0][0] = pca.toZca(datasets[0][0], None, epsilon=.1)
    datasets[2][0] = pca.toZca(datasets[2][0], None, epsilon=.1)
    print 'done loading.'

    test_rbm(datasets=datasets,
             training_epochs=45,
             img_dim=15,   # must match actual size of training data
             n_hidden=int(sys.argv[1]),
             learning_rate=float(sys.argv[2]),
             output_dir=resman.rundir,
             quickHack=False,
             visibleModel='real',
             initWfactor=.01,
             pcaDims=None)
def reduceModel(model, atoms, selstr):
    """Return reduced NMA model.

    Reduces a :class:`NMA` model to a subset of *atoms* matching a selection
    *selstr*.  This function behaves differently depending on the type of the
    *model* argument.  For ANM and GNM or other NMA models, this function
    derives the force constant matrix for the system of interest (specified by
    *selstr*) from the force constant matrix of the *model* by assuming that
    for any given displacement of the system of interest, the other atoms move
    along in such a way as to minimize the potential energy.  This is based on
    the formulation in [KH00]_.  For PCA models, this function simply takes
    the sub-covariance matrix for the selected atoms.

    :arg model: dynamics model
    :type model: :class:`ANM`, :class:`GNM`, or :class:`PCA`
    :arg atoms: atoms that were used to build the model
    :arg selstr: a selection string specifying subset of atoms"""

    linalg = importLA()

    if not isinstance(model, NMA):
        raise TypeError("model must be an NMA instance, not {0:s}".format(type(model)))
    if not isinstance(atoms, (AtomGroup, AtomSubset, AtomMap)):
        raise TypeError("atoms type is not valid")
    if len(atoms) <= 1:
        raise TypeError("atoms must contain more than 1 atoms")

    if isinstance(model, GNM):
        matrix = model._kirchhoff
    elif isinstance(model, ANM):
        matrix = model._hessian
    elif isinstance(model, PCA):
        matrix = model._cov
    else:
        raise TypeError("model does not have a valid type derived from NMA")
    if matrix is None:
        raise ValueError("model matrix (Hessian/Kirchhoff/Covariance) is not "
                         "built")

    system = SELECT.getBoolArray(atoms, selstr)
    other = np.invert(system)
    n_sel = sum(system)
    if n_sel == 0:
        LOGGER.warning("selection has 0 atoms")
        return None
    if len(atoms) == n_sel:
        LOGGER.warning("selection results in same number of atoms, "
                       "model is not reduced")
        return None

    if model.is3d():
        system = np.tile(system, (3, 1)).transpose().flatten()
        other = np.tile(other, (3, 1)).transpose().flatten()
    ss = matrix[system, :][:, system]
    if isinstance(model, PCA):
        eda = PCA(model.getTitle() + " reduced")
        eda.setCovariance(ss)
        return eda, system
    so = matrix[system, :][:, other]
    os = matrix[other, :][:, system]
    oo = matrix[other, :][:, other]
    matrix = ss - np.dot(so, np.dot(linalg.inv(oo), os))

    if isinstance(model, GNM):
        gnm = GNM(model.getTitle() + " reduced")
        gnm.setKirchhoff(matrix)
        return gnm, system
    elif isinstance(model, ANM):
        anm = ANM(model.getTitle() + " reduced")
        anm.setHessian(matrix)
        return anm, system
    elif isinstance(model, PCA):
        eda = PCA(model.getTitle() + " reduced")
        eda.setCovariance(matrix)
        return eda, system
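# Added note (illustration, not from the original source): the reduction above,
#     matrix = ss - np.dot(so, np.dot(linalg.inv(oo), os))
# is the Schur complement of the "other" block, i.e. the effective force-constant
# matrix K_eff = K_ss - K_so * K_oo^{-1} * K_os described by the docstring's [KH00]
# reference, obtained by letting the non-selected atoms relax so as to minimize the
# potential energy. A minimal NumPy sketch on a hypothetical 4x4 symmetric matrix:
#
#     import numpy as np
#     K = np.arange(16.0).reshape(4, 4)
#     K = K + K.T + 10 * np.eye(4)                       # symmetric, invertible blocks
#     system = np.array([True, True, False, False])      # selected degrees of freedom
#     other = ~system
#     ss = K[system][:, system]; so = K[system][:, other]
#     os_ = K[other][:, system]; oo = K[other][:, other]
#     K_eff = ss - so @ np.linalg.inv(oo) @ os_          # 2x2 reduced matrix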
def main(self, algo="KNN", textview=None):
    # Replaces "print"
    def print_output(text):
        if textview != None:
            buf = textview.get_buffer()
            buf.insert_at_cursor(text + "\n")
            textview.scroll_mark_onscreen(buf.get_insert())
        else:
            log.info(text)

    # list of set types
    if self.validation == 1:
        listeTypesSet = ["train", "validation", "test"]
    else:
        listeTypesSet = ["train", "test"]

    # list of results used for the curves
    listeRes = []

    # create the train and test files
    log.debug("Construction des fichiers d'entrainement")
    tools.constructLfwNamesCurrent(self.nbExemples)  # TODO: this is actually no longer needed
    (nbClassesLFW, nbClassesORL) = tools.trainAndTestConstruction(self.pourcentageTrain, self.nbExemples)

    # load the data
    dataTrain, dataTrainIndices, nClass = tools.loadImageData("train", self.categorie)

    # PCA transformation
    print_output("Calcul des vecteurs propres...")
    pca_model = PCA(dataTrain)
    pca_model.transform()  # project the data into the eigenspace

    ##### KNN search
    if algo == "KNN":
        print_output("Début de l'algorithme des K plus proches voisins...")

        # build the model for the KNN search
        knn_model = KNN(pca_model.getWeightsVectors(), dataTrainIndices, nClass, self.K)
        # build the Parzen model
        parzen_model = ParzenWindows(pca_model.getWeightsVectors(), dataTrainIndices, nClass, self.Theta)

        ## TEST ###########################
        # TODO: this whole part should be reworked to produce
        # train/validation/test plots
        for trainTest in listeTypesSet:
            if trainTest == "train":
                dataTest, dataTestIndices = dataTrain, dataTrainIndices
            else:
                ### if no validation is performed, concatenate the initial test and validation entries to obtain the test set
                #if "validation" not in listeTypesSet:
                    #dataTestInitial, dataTestInitialIndices, nClass = tools.loadImageData("test", self.categorie)
                    #dataValidation, dataValidationIndices, nClass = tools.loadImageData("validation", self.categorie)
                    #dataTest = np.zeros(dataTestInitial.size + dataValidation.size)
                    #dataTestIndices = np.zeros(dataTest.size)
                    #dataTest[:dataTestInitial.size], dataTestIndices[:dataTestInitial.size] = dataTestInitial, dataTestInitialIndices
                    #dataTest[dataTestInitial.size:], dataTestIndices[dataTestInitial.size:] = dataValidation, dataValidationIndices
                #else:
                dataTest, dataTestIndices, nClass = tools.loadImageData(trainTest, self.categorie)

            print_output("Projection des données de test...")
            dataTest_proj = pca_model.getProjection(dataTest)

            # counters of correct results
            nbGoodResult = 0
            nbGoodResult2 = 0
            nbGoodResult3 = 0

            t_start = time.clock()
            for i in range(0, int(dataTest.shape[1])):
                # k = 1, for reference; force k
                knn_model.setK(1)
                result1NN = knn_model.compute_predictions(dataTest_proj[:, i])
                if(result1NN == dataTestIndices[i]):
                    nbGoodResult += 1

                # k = n; restore k to its initial value
                knn_model.setK(self.K)
                resultKNN = knn_model.compute_predictions(dataTest_proj[:, i])
                if(resultKNN == dataTestIndices[i]):
                    nbGoodResult2 += 1

                resultParzen = parzen_model.compute_predictions(dataTest_proj[:, i])
                if(resultParzen == dataTestIndices[i]):
                    nbGoodResult3 += 1

                out_str = "Classic method: " + str(result1NN) + " | KNN method: " + str(resultKNN) + " | KNN+Parzen method: " + str(resultParzen) + " | Expected: " + str(dataTestIndices[i]) + "\n"  # +1 because the matrix index starts at 0
                print_output(out_str)

            resClassic = (float(nbGoodResult) / float(dataTest.shape[1])) * 100.
            out_str = "\nAccuracy with classic method: %.3f" % resClassic + "%\n"
            resKNN = (nbGoodResult2 / float(dataTest.shape[1])) * 100.
            out_str += "Accuracy with KNN method (k=" + str(self.K) + "): %.3f" % resKNN + "%\n"
            res = (nbGoodResult3 / float(dataTest.shape[1])) * 100.
            out_str += "Accuracy with KNN + Parzen window method (theta=" + str(self.Theta) + "): %.3f" % res + "%\n"
            print_output(out_str)

            t_stop = time.clock()
            log.info("Temps total: %.4fs\n" % float(t_stop - t_start))

            #### collect the final error values
            listeRes.append(100 - resClassic)
            listeRes.append(100 - resKNN)
            listeRes.append(100 - res)

    #### NNET search
    elif algo == "NNET":
        print_output("Début de l'algorithme du Perceptron multicouche...")

        # parameters, data, etc.
        dataTrain = pca_model.getWeightsVectors()
        dataTrainTargets = (dataTrainIndices - 1).reshape(dataTrainIndices.shape[0], -1)
        #! unlike KNN, the NNET takes feature vectors as rows rather than columns
        train_set = np.concatenate((dataTrain.T, dataTrainTargets), axis=1)

        # load the validation data
        dataValidation, dataValidationIndices, nClass = tools.loadImageData("validation", self.categorie)
        print_output("Projection des données de validation...")
        dataValidation_proj = pca_model.getProjection(dataValidation)
        dataValidationTargets = (dataValidationIndices - 1).reshape(dataValidationIndices.shape[0], -1)
        validation_set = np.concatenate((dataValidation_proj.T, dataValidationTargets), axis=1)

        # load the test data
        dataTest, dataTestIndices, nClass = tools.loadImageData("test", self.categorie)
        print_output("Projection des données de test...")
        dataTest_proj = pca_model.getProjection(dataTest)
        dataTestTargets = (dataTestIndices - 1).reshape(dataTestIndices.shape[0], -1)
        test_set = np.concatenate((dataTest_proj.T, dataTestTargets), axis=1)

        # build and train the neural network model
        nnet_model = NeuralNetwork(dataTrain.shape[0], self.n_hidden, nClass, self.lr, self.wd)
        if self.validation == 1:
            train_out, valid_out, test_out = nnet_model.train(train_set, self.n_epoch, self.batch_size, valid_set=validation_set, test_set=test_set)
        else:
            train_out, test_out = nnet_model.train(train_set, self.n_epoch, self.batch_size, test_set=test_set)

        # plot the training curves
        x = []
        y = []
        y_err = []
        color = []
        legend = []
        legend_err = []
        filename = IMG_DIR + "Risque__Epoch_" + str(self.n_epoch) + "_Hidden_" + str(self.n_hidden) + "_Lr_" + str(self.lr) + "_L2_" + str(self.wd) + "_Categorie_" + str(self.categorie) + "_Batch_" + str(self.batch_size) + "_"
        filename_err = IMG_DIR + "Erreur_classification__Epoch_" + str(self.n_epoch) + "_Hidden_" + str(self.n_hidden) + "_Lr_" + str(self.lr) + "_L2_" + str(self.wd) + "_Categorie_" + str(self.categorie) + "_Batch_" + str(self.batch_size) + "_"

        train_out = np.array(train_out)
        x.append(np.array(xrange(train_out.shape[0])))

        # train curve parameters
        color.append('g-')
        legend.append("R Train")
        filename += "_Train"
        y.append(train_out[:, 0])
        y_err.append(train_out[:, 1])
        legend_err.append("Err Train")
        filename_err += "_Train"

        # validation curve parameters
        if self.validation == 1:
            valid_out = np.array(valid_out)
            x.append(np.array(xrange(valid_out.shape[0])))
            y.append(valid_out[:, 0])
            y_err.append(valid_out[:, 1])
            color.append('b-')
            legend.append("R Validation")
            legend_err.append("Err Validation")
            filename += "_Validation"
            filename_err += "_Validation"

        # test curve parameters
        test_out = np.array(test_out)
        x.append(np.array(xrange(test_out.shape[0])))
        y.append(test_out[:, 0])
        y_err.append(test_out[:, 1])
        color.append('r-')
        legend.append("R Test")
        legend_err.append("Err Test")
        filename += "_Test"
        filename_err += "_Test"

        # display
        title = u"\nEpoque: " + str(self.n_epoch) + " - Taille du batch: " + str(self.batch_size) + u" - Neurones cachés: " + str(self.n_hidden) + "\nL2: " + str(self.wd) + " - Taux d'apprentissage: " + str(self.lr) + u" - Catégorie: " + str(self.categorie)
        tools.drawCurves(x, y, color, legend, bDisplay=True, filename=filename, title=title, xlabel="Epoque", ylabel=u"Risque régularisé")
        tools.drawCurves(x, y_err, color, legend_err, bDisplay=True, filename=filename_err, title=title, xlabel="Epoque", ylabel="Erreur classification")

        #### build a file for improved curves
        if self.stock == 1:
            fichier = open("curvErrorNNet" + ''.join(''.join(title.split(' ')).split('\n')), "w")
            fichier.write("#epoch errorTrain errorValidation errorTest\n")
            if len(x) == 3:
                for j in range(len(x[0])):
                    fichier.write(str(x[0][j]) + " " + str(y[0][j]) + " " + str(y[1][j]) + " " + str(y[2][j]) + "\n")
            fichier.close()

        """
        /!\ This part is no longer needed because it is done inside the nnet during training

        ## TEST ###########################
        # TODO: this whole part should be reworked to produce
        # train/validation/test plots

        # counters of correct results
        nbGoodResult = 0
        for i in range(0, int(dataTest.shape[1])):
            resultNNET = np.argmax(nnet_model.compute_predictions(dataTest_proj[:, i]), axis=1)[0]
            if(resultNNET == dataTestTargets[i]):
                nbGoodResult += 1
            out_str = "Result: " + str(resultNNET) + " | Expected: " + str(dataTestTargets[i]) + "\n"  # +1 because the matrix index starts at 0
            print_output(out_str)

        res = (float(nbGoodResult) / float(dataTest.shape[1])) * 100.
        out_str = "\nAccuracy : %.3f" % res + "%\n"
        print_output(out_str)
        """

    return listeRes
def reduceModel(model, atoms, select):
    """Return reduced NMA model.

    Reduces a :class:`~.NMA` model to a subset of *atoms* matching *select*.
    This function behaves differently depending on the type of the *model*
    argument.  For :class:`.ANM` and :class:`.GNM` or other :class:`.NMA`
    models, the force constant matrix for the system of interest (specified by
    *select*) is derived from the force constant matrix for the *model* by
    assuming that for any given displacement of the system of interest, other
    atoms move along in such a way as to minimize the potential energy.  This
    is based on the formulation in [KH00]_.  For :class:`.PCA` models, this
    function simply takes the sub-covariance matrix for the selection.

    :arg model: dynamics model
    :type model: :class:`.ANM`, :class:`.GNM`, or :class:`.PCA`
    :arg atoms: atoms that were used to build the model
    :type atoms: :class:`.Atomic`
    :arg select: an atom selection or a selection string
    :type select: :class:`.Selection`, str

    :returns: (:class:`.NMA`, :class:`.Selection`)"""

    linalg = importLA()

    if not isinstance(model, NMA):
        raise TypeError('model must be an NMA instance, not {0:s}'
                        .format(type(model)))
    if not isinstance(atoms, (AtomGroup, AtomSubset, AtomMap)):
        raise TypeError('atoms type is not valid')
    if len(atoms) <= 1:
        raise TypeError('atoms must contain more than 1 atoms')

    if isinstance(model, GNM):
        matrix = model._kirchhoff
    elif isinstance(model, ANM):
        matrix = model._hessian
    elif isinstance(model, PCA):
        matrix = model._cov
    else:
        raise TypeError('model does not have a valid type derived from NMA')
    if matrix is None:
        raise ValueError('model matrix (Hessian/Kirchhoff/Covariance) is not '
                         'built')

    if isinstance(select, str):
        system = SELECT.getBoolArray(atoms, select)
        n_sel = sum(system)
        if n_sel == 0:
            raise ValueError('select matches 0 atoms')
        if len(atoms) == n_sel:
            raise ValueError('select matches all atoms')

        if isinstance(atoms, AtomGroup):
            ag = atoms
            which = np.arange(len(atoms))[system]
        else:
            ag = atoms.getAtomGroup()
            which = atoms._getIndices()[system]
        sel = Selection(ag, which, select, atoms.getACSIndex())

    elif isinstance(select, AtomSubset):
        sel = select
        if isinstance(atoms, AtomGroup):
            if sel.getAtomGroup() != atoms:
                raise ValueError('select and atoms do not match')
            system = np.zeros(len(atoms), bool)
            system[sel._getIndices()] = True
        else:
            if atoms.getAtomGroup() != sel.getAtomGroup():
                raise ValueError('select and atoms do not match')
            elif not sel in atoms:
                raise ValueError('select is not a subset of atoms')
            idxset = set(atoms._getIndices())
            system = np.array([idx in idxset for idx in sel._getIndices()])

    else:
        raise TypeError('select must be a string or a Selection instance')

    other = np.invert(system)

    if model.is3d():
        system = np.tile(system, (3, 1)).transpose().flatten()
        other = np.tile(other, (3, 1)).transpose().flatten()
    ss = matrix[system, :][:, system]
    if isinstance(model, PCA):
        eda = PCA(model.getTitle() + ' reduced')
        eda.setCovariance(ss)
        return eda, system
    so = matrix[system, :][:, other]
    os = matrix[other, :][:, system]
    oo = matrix[other, :][:, other]
    matrix = ss - np.dot(so, np.dot(linalg.inv(oo), os))

    if isinstance(model, GNM):
        gnm = GNM(model.getTitle() + ' reduced')
        gnm.setKirchhoff(matrix)
        return gnm, sel
    elif isinstance(model, ANM):
        anm = ANM(model.getTitle() + ' reduced')
        anm.setHessian(matrix)
        return anm, sel
    elif isinstance(model, PCA):
        eda = PCA(model.getTitle() + ' reduced')
        eda.setCovariance(matrix)
        return eda, sel
kernelpot = KernelPotential(options)

# FILL KERNEL
generate = False
if generate:
    for struct in structures:
        print struct.label
        kernelpot.acquire(struct, 1., label=struct.label)
    print kernelpot.IX.shape
    np.savetxt('out.kernelpot.ix.txt', kernelpot.IX)
else:
    IX = np.loadtxt('out.kernelpot.ix.txt')
    kernelpot.importAcquire(IX, 1.)

# KERNEL PCA
pca = PCA()
pca.compute(IX, normalize_mean=False, normalize_std=False)
#pca = IPCA()
#pca.compute(IX, normalize_mean=False, normalize_std=False)

# =============================
# CHECK COMPONENT NORMALIZATION
# =============================

ones_vec = np.zeros(567)
ones_vec.fill(1.)
np.savetxt('out.pca.unnorm.txt', pca.unnormBlock(ones_vec))

# =================
# INDEX CUTOFF SCAN