def test_fitCov(self):
    """ Test PCA using the covariance matrix approach. """
    testRequestedDim = 1
    generatedDatasetsGaussian = 10  # for covariance PCA
    testDataSet = np.array([[0, 1, 2, 3, 4, 0, 0],
                            [0, 0, 0, 0, 0, 1, -1]], dtype=np.float64)
    expRes = np.array([[-1.42857, -0.42857, 0.57143, 1.57143, 2.57143, -1.42857, -1.42857]])  # Beware the 2-dimensionality
    resCov = pca.pca(testDataSet, testRequestedDim, "cov")
    npt.assert_array_almost_equal(expRes, resCov)
    ## with randomly generated data
    for i in range(generatedDatasetsGaussian):
        ## non memmap-version
        testDataSet = np.load("unitTest/testData/" + "testGaussianClasses" + str(i+1) + ".npy")
        expRes = np.load("unitTest/testData/" + "testGaussianClassesTransformed" + str(i+1) + ".npy")
        resCov = pca.pca(testDataSet, 1, "cov")
        npt.assert_array_almost_equal(expRes, resCov)
        ## memmap-version
        testDataSet = np.memmap("unitTest/testData/" + "testGaussianClassesMmap" + str(i+1) + ".npy",
                                dtype="float64", mode="r+", shape=(2, 2000))
        resCov = pca.pca(testDataSet, 1, "cov")
        npt.assert_array_almost_equal(expRes, resCov)
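# A minimal sketch (not the module under test) of the covariance-PCA contract
# the test above exercises: rows are features, columns are samples, and the
# result is the data projected onto the top `dim` eigenvectors. On the fixture
# above this reproduces expRes up to the usual eigenvector sign ambiguity.
import numpy as np

def pca_cov_sketch(data, dim):
    centered = data - data.mean(axis=1, keepdims=True)
    cov = centered @ centered.T / (data.shape[1] - 1)
    evals, evecs = np.linalg.eigh(cov)                 # ascending eigenvalues
    top = evecs[:, np.argsort(evals)[::-1][:dim]]      # top-dim eigenvectors
    return top.T @ centered                            # dim x n_samples projection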
def mainTest(X_train, X_test, y_train, y_test, k):
    print("--Test 1--")
    M = 3
    # PCA work
    print("\nTraining data:")
    comp_1 = pca.pca(X_train, M)
    X_train_t = pca.transform(X_train, comp_1)
    print("\nTesting data:")
    comp_2 = pca.pca(X_test, M)
    X_test_t = pca.transform(X_test, comp_2)
    # Print base results.
    print("\nBefore PCA - Dim ", len(X_train[0]))
    classifier = svm.train(X_train, y_train, k, C=None)
    info = svm.classify(classifier, X_test, return_sums=True)
    printResults(info[1], y_test, info[0])
    # Print transformed results.
    print("After PCA - Dim ", M)
    X_train = X_train_t
    X_test = X_test_t
    classifier = svm.train(X_train, y_train, k, C=None)
    info = svm.classify(classifier, X_test, return_sums=True)
    printResults(info[1], y_test, info[0])
def predict(self, X, V_truncate, gmm):
    assert X.ndim == 4
    print("Extracting Fisher features on testing data")
    n = X.shape[0]
    ret = []
    local_feature_extractor = load_features.get_feature_extractor(
        self.local_feature_extractor_name)
    local_features = local_feature_extractor.predict(X, unflatten=True)
    # NOTE: n_components is not defined in this function; it presumably comes
    # from an enclosing scope or should be an attribute such as self.n_components.
    if self.local_feature_extractor_name == 'hog':
        # local_features is a 3d array
        _, V_truncate = pca(local_features.reshape(
            -1, local_features.shape[-1]), components=n_components)
    elif self.local_feature_extractor_name == 'sift':
        # local_features is a list of 2d arrays
        _, V_truncate = pca(_concat_2d_arrays(local_features),
                            components=n_components)
    else:
        raise Exception("Unknown local feature extractor")
    local_features_pca = []
    for i in range(n):
        local_features_pca.append(
            numpy.array(numpy.matrix(local_features[i]) * V_truncate))
    fisher_vector = FisherVector(self.nclasses, len(local_features_pca[0][0]),
                                 gmm.pi, gmm.mu, gmm.sigma)
    for i in tqdm(range(n)):
        ret.append(fisher_vector.predict(local_features_pca[i]))
    return numpy.array(ret)
def relativeExtremaSegments(self, rawData, maxMin="max", minSegSize=50):
    from scipy.signal import argrelmax, argrelmin
    PCs = pca(rawData, n_components=1)[0]
    if maxMin == 'max':
        return argrelmax(PCs[:, 0], order=minSegSize)[0]
    if maxMin == 'min':
        return argrelmin(PCs[:, 0], order=minSegSize)[0]
def DimensionalityReduction(d, eigenvalue_filename, eigenvector_filename):
    print "starting PCA"
    d_pca = pca(np.array(d))
    d_eigenvalue = [i[0] for i in d_pca]
    d_eigenvector = [i[1] for i in d_pca]
    if len(eigenvector_filename) != 0:
        # there exists an eigenvector filename
        eigenvector_output = open(eigenvector_filename, 'w')
        for i in range(len(d_eigenvector)):
            for j in range(len(d_eigenvector[i])):
                eigenvector_output.write(str(d_eigenvector[i][j]) + "\t")
            eigenvector_output.write("\n")
        eigenvector_output.close()
    else:
        # this is an empty eigenvector filename
        pass
    d_eigenvalue_total = 0.0  # initialised here so the return below works even when no eigenvalue file is written
    if len(eigenvalue_filename) != 0:
        # there exists an eigenvalue filename; open only once we know the name is non-empty
        eigenvalue_output = open(eigenvalue_filename, 'w')
        for i in range(len(d_eigenvalue)):
            d_eigenvalue_total += d_eigenvalue[i]
        d_eigenvalue_sum = 0.0
        for i in range(len(d_eigenvalue)):
            d_eigenvalue_sum += d_eigenvalue[i] / d_eigenvalue_total
            print >> eigenvalue_output, d_eigenvalue[i] / d_eigenvalue_total, "\t", d_eigenvalue_sum
        eigenvalue_output.close()
    else:
        # this is an empty eigenvalue filename
        pass
    return d_eigenvalue, d_eigenvector, d_eigenvalue_total
def PCABySensor(self, data, n_components=3):
    dataBySensor = self.dataBySensor(data)
    pcaDict = {}
    for k, v in dataBySensor.items():
        pcaDict[k] = pca(v, n_components)[0]
    pcaDict['Time'] = dataBySensor['Time']
    return pcaDict
def shit_plot():
    optvals = np.genfromtxt('./data/optvals.csv', delimiter=',')
    errs = np.genfromtxt('./data/errs.csv')
    npts = optvals.shape[0]
    npts_toplot = 0
    optvals_toplot = np.empty((npts, 3))
    for i in range(npts):
        if errs[i] < 8e-9:
            optvals_toplot[npts_toplot] = optvals[i]
            npts_toplot = npts_toplot + 1
    print npts_toplot
    optvals_toplot = optvals_toplot[:npts_toplot, :]
    optvals_toplot[:, 2] = optvals_toplot[:, 2] * 1e8
    optvals_toplot[:, 2] = optvals_toplot[:, 2] - np.amin(optvals_toplot[:, 2])
    print optvals_toplot[:, 2]
    sing_vals, right_sing_vect = pca.pca(optvals_toplot)
    print sing_vals
    print right_sing_vect
    fig = plt.figure()
    ax = fig.add_subplot(111)
    proj = np.dot(optvals_toplot, right_sing_vect[:, :2])
    ax.scatter(proj[:, 0], proj[:, 1], c=optvals_toplot[:, 2])
    ax.set_ylim((np.amin(proj[:, 1]), np.amax(proj[:, 1])))
    plt.show(fig)
def main(): print("PCA") pca.pca() print("RandomForest") rf.rf(2) print("KNN") knn.knn(2) print("SVC") svc.svc() print("GRID_SVC") svc.gridSearchScore() print("Logistic") logistic.Logistic().fit() print("DNN Classifier") classifier_model = classifier.classifier() classifier_model.fit()
def plot_clustering_2d(encodings, myCluster, output, **kw):
    if myCluster != 0:
        if kw['sof'] == 'sample':
            data = np.array(encodings)[1:, 1:].astype(float)
        else:
            data = np.array(encodings).T[1:, 1:].astype(float)
        labels = np.array(myCluster)[0:, 1:].reshape(-1, )
        e = ''
        try:
            Y = tsne.tsne(data, 2, 50, 20.0)
        except RuntimeWarning as e:
            # fall back to PCA when t-SNE fails
            Y = pca.pca(data, n_components=2)
        df = pd.DataFrame({'X': Y[:, 0], 'Y': Y[:, 1], 'L': labels})
        fig = plt.figure(0)
        mySet = set(labels)
        if len(mySet) > 5:
            plt.scatter(Y[:, 0], Y[:, 1], 20, labels)
        else:
            for l in mySet:
                newData = df.loc[df.loc[:, "L"] == l, :]
                plt.scatter(np.array(newData.X), np.array(newData.Y), 20,
                            label="Cluster_%s" % l)
            plt.legend(loc='best')
        plt.savefig('%s.png' % output)
        plt.close(0)
def getLowDimensionalSegments(highDimensionalData, n_components=2, plt=False,
                              title="Latent space segments"):
    (lowDimensionalData, explainedVariance) = pca.pca(highDimensionalData, n_components)
    (mins, maxs) = segment.segmentationPoints(lowDimensionalData[:, 0])
    segments = pl.split(lowDimensionalData, maxs)[1:-1]
    if plt:
        plot.plotGridOf2Ds(segments, title)
    return (segments, explainedVariance)
def osp_helper(hsi_data, tgt_sig, kwargs):
    n_dim_ss = kwargs['n_dim_ss']
    # see Eismann, pp. 670
    n_band, n_pixel = hsi_data.shape
    mu = np.mean(hsi_data, 1)
    mu = mu[:, np.newaxis]
    x = hsi_data - mu
    # get PCA rotation, no dim reduction
    _, _, evecs, _, _ = pca(hsi_data, 1)
    s = tgt_sig - mu
    # get a subspace that theoretically encompasses the background
    B = evecs[:, :n_dim_ss]
    PB = B @ np.linalg.pinv(B.T @ B) @ B.T
    PperpB = np.eye(n_band) - PB
    f = s.T @ PperpB
    osp_data = np.zeros(n_pixel)
    for i in range(n_pixel):
        osp_data[i] = f @ x[:, i]
    return osp_data, {}
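# A quick, self-contained numerical check (illustrative only, not part of the
# detector above): the orthogonal-complement projector PperpB maps every
# vector in the background subspace to ~0, which is what makes the OSP filter
# f insensitive to background. The basis B here is random stand-in data.
import numpy as np

rng = np.random.default_rng(0)
B = rng.standard_normal((20, 3))              # stand-in background basis, 20 bands x 3 dims
PB = B @ np.linalg.pinv(B.T @ B) @ B.T        # projector onto col(B)
PperpB = np.eye(20) - PB                      # projector onto its orthogonal complement
print(np.allclose(PperpB @ B, 0))             # True: background is annihilated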
def test_pca():
    """Generates a noisy 2d plane embedded in 3d, then performs PCA to find
    better coordinates for the data"""
    # define a planar equation
    z = lambda x, y: 3*x + y + 4
    npoints = 200
    stdev = 1.0
    noise = stdev*np.random.normal(size=npoints)
    xvals = np.random.uniform(low=2, high=6, size=npoints)
    yvals = np.random.uniform(low=2, high=10, size=npoints)
    zvals = z(xvals, yvals) + noise
    data = np.transpose(np.array([xvals, yvals, zvals]))
    pcomp, pvar = pca.pca(data, 2)
    # plot projections along various components
    ncomp = pvar.shape[0]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    cs = ['b', 'r', 'g']
    data_avg = np.average(data, 0)
    centered_data = data - data_avg
    # do not bother plotting "full" projection, which would give back the original data
    # proj = np.dot(pcomp.T, centered_data.T)
    # print np.dot(pcomp.T, pcomp), pvar
    # ax.scatter(proj[0,:]+data_avg[0], proj[1,:]+data_avg[1], 0, c='g')
    proj = np.dot(pcomp, np.dot(pcomp.T, centered_data.T))
    ax.scatter(data[:,0], data[:,1], data[:,2], alpha=1.0, s=80, c='#96031E')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    # hide labels and grid, too squashed/noisy
    # ax.grid(False)
    ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    plt.tick_params(axis='both', which='major', labelsize=0)
    # ax.scatter(proj[0,:]+data_avg[0], proj[1,:]+data_avg[1], proj[2,:]+data_avg[2], color='g')
    # # plot lines from orig data to planar projection
    # for i in range(npoints):
    #     pts = np.array((proj[:,i] + data_avg, data[i,:]))
    #     ax.plot(pts[:,0], pts[:,1], pts[:,2], c='y')
    # sort based on z val and wireframe
    # data_avg.shape = (3,1)
    # sorted_indices = np.argsort(np.linalg.norm(proj + data_avg, axis=0))
    # proj = proj[:,sorted_indices]
    # fake pca result, true in the limit of infinite data
    # xgrid, ygrid = np.meshgrid(np.linspace(2,6,10), np.linspace(2,10,10))
    # ax.plot_wireframe(xgrid, ygrid, z(xgrid, ygrid), color='#12227A', alpha=0.5)
    # grid_coord = (2,2)
    # for i in range(2):
    #     ax.plot((xgrid[grid_coord], xgrid[grid_coord] + pcomp[0,i]), (ygrid[grid_coord], ygrid[grid_coord] + pcomp[1,i]), (z(xgrid[grid_coord], ygrid[grid_coord]), z(xgrid[grid_coord], ygrid[grid_coord]) + pcomp[2,i]), c='k')
    for ii in xrange(0, 360, 1):
        ax.view_init(elev=20.0, azim=ii)
        if ii >= 180:
            xgrid, ygrid = np.meshgrid(np.linspace(2,6,10), np.linspace(2,10,10))
            ax.plot_wireframe(xgrid, ygrid, z(xgrid, ygrid), color='#12227A', alpha=0.5)
        plt.savefig('/home/alexander/workspace/sloppy_models/rawlings_model/figs/pca/pca' + str(ii) + '.png')
def test_plot_combinations(self):
    X = load_iris().data
    labels = load_iris().feature_names
    y = load_iris().target
    X = pd.DataFrame(data=load_iris().data, columns=load_iris().feature_names,
                     index=load_iris().target)
    param_grid = {
        'n_components': [None, 0.01, 1, 0.95, 2, 100000000000],
        'row_labels': [None, [], y],
        'detect_outliers': [None, 'ht2', 'spe'],
    }
    allNames = param_grid.keys()
    combinations = it.product(*(param_grid[Name] for Name in allNames))
    combinations = list(combinations)
    for combination in combinations:
        model = pca(n_components=combination[0])
        model.fit_transform(X)
        assert model.plot()
        assert model.biplot(y=y, SPE=True, hotellingt2=True)
        assert model.biplot3d(y=y, SPE=True, hotellingt2=True)
        assert model.biplot(y=y, SPE=True, hotellingt2=False)
        assert model.biplot(y=y, SPE=False, hotellingt2=True)
        assert model.biplot(y=y, SPE=False, hotellingt2=False)
def segment(k, m, inciset, trainingset, radiographs, colors, leftout, mode=0):
    # get image training set
    trainimgs = [radiographs[i] for i in trainingset]
    # read landmarks from file
    lmtrain, lmtest = landmarks.get(trainingset)
    # align all landmarks, plot depending on mode
    aligns, means = landmarks.align(lmtrain)
    if mode == 0:
        ui.plotalign(colors, means, aligns)
    # do pca, plot depending on mode
    eva, evc = pca.pca(aligns, means)
    if mode == 0:
        ui.plotpca(means, eva, evc)
    # get initial estimate, manual or auto depending on mode;
    # draw init also depending on mode
    est, greymodels = model.estimate(k, m, inciset, means, trainimgs, lmtrain,
                                     radiographs[leftout], colors, mode)
    if mode == 2:
        ui.plotinit(est, radiographs[leftout], colors, leftout)
    # fit init estimate and get plot mask
    if mode == 0 or mode == 1:
        X = fit.fit(est, inciset, eva, evc, means, greymodels,
                    radiographs[leftout], k, m, 3.0)
        mask = ui.plotfit(radiographs[leftout], list(est), X, len(inciset), colors)
        return mask
def csd_anomaly(hsi_img, n_dim_bg, n_dim_tgt, tgt_orth):
    """
    Complementary Subspace Detector
    assumes background and target are complementary subspaces
    of the PCA variance-ranked space

    Ref: A. Schaum, "Joint subspace detection of hyperspectral targets,"
    2004 IEEE Aerospace Conference Proceedings (IEEE Cat. No.04TH8720),
    2004, pp. 1824 Vol.3. doi: 10.1109/AERO.2004.1367963

    inputs:
      hsi_image - n_row x n_col x n_band
      n_dim_bg  - number of leading dimensions to assign to background subspace
      n_dim_tgt - number of dimensions to assign to target subspace;
                  use None (the MATLAB original used an empty matrix) to use
                  all remaining after background assignment
      tgt_orth  - True/False, set target subspace orthogonal to background subspace

    8/7/2012 - Taylor C. Glenn
    5/5/2018 - Edited by Alina Zare
    11/2018 - Python Implementation by Yutai Zhou
    """
    n_row, n_col, n_band = hsi_img.shape
    n_pixel = n_row * n_col
    hsi_data = hsi_img.reshape((n_pixel, n_band), order='F').T

    # PCA rotation, no reduction
    pca_data, _, evecs, evals, _ = pca(hsi_data, 1)

    # whiten the data so that later steps are equivalent to Mahalanobis distance
    z = np.diag(1 / np.sqrt(evals)) @ pca_data

    # figure out background and target subspaces
    bg_rg = np.array(range(0, n_dim_bg))
    if tgt_orth:
        # set target to orthogonal complement of background
        if n_dim_tgt is None:
            n_dim_tgt = n_band - n_dim_bg
        tgt_rg = np.array(range(n_dim_bg, n_dim_tgt))
    else:
        # target and background overlap
        if n_dim_tgt is None:
            n_dim_tgt = n_band
        tgt_rg = np.array(range(0, n_dim_tgt))

    # set background and target subspaces
    B = evecs[:, bg_rg]
    S = evecs[:, tgt_rg]

    # run the detector
    csd_data = np.zeros(n_pixel)
    for i in range(n_pixel):
        Sz = S.T @ z[:, i]
        Bz = B.T @ z[:, i]
        csd_data[i] = Sz.T @ Sz - Bz.T @ Bz

    csd_out = csd_data.reshape(n_row, n_col, order='F')
    return csd_out
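# Hypothetical usage sketch for csd_anomaly; the cube shape and parameter
# values are made up for illustration, and `pca` is assumed to be the same
# in-house helper (returning 5 values) that the module above imports.
import numpy as np

hsi_img = np.random.rand(50, 40, 30)          # n_row x n_col x n_band
csd_out = csd_anomaly(hsi_img, n_dim_bg=3, n_dim_tgt=None, tgt_orth=True)
print(csd_out.shape)                          # (50, 40): one detector score per pixel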
def __initialise_latent(self, reduced_dimensionality):
    """
    Initialises latent variables with Principal Component Analysis and keeps
    a copy for resetting purposes.
    """
    if self.__init_latent.shape[0] == 0 or \
            self.__init_latent.shape[1] != reduced_dimensionality:
        self.__init_latent = pca(self._Y, reduced_dimensionality)
    self._X = np.copy(self.__init_latent)
def manuallySegment(inputFile, listOfSegmentationPoints, outputFilesPrefix):
    data = readRaw(inputFile)[:, 4:]
    pcaData = pca(data, 3)[0]
    segments = np.split(data, listOfSegmentationPoints)
    pcaSegments = np.split(pcaData, listOfSegmentationPoints)
    for i, seg, pcaSeg in zip(range(len(segments)), segments, pcaSegments):
        np.savetxt("%s%i%s" % (outputFilesPrefix, i, "RAW.txt"), seg, delimiter=",")
        np.savetxt("%s%i%s" % (outputFilesPrefix, i, "PCA.txt"), pcaSeg, delimiter=",")
def nonopt_correlations():
    corr_results = {}
    for i, tsp in enumerate(timbrespace_db.keys()):
        print('Processing', tsp)
        corr_results[tsp] = {}
        target_data = load.timbrespace_dismatrix(tsp, timbrespace_db)
        for rs in sorted(representations):
            aud_repres = load.timbrespace_features(
                tsp,
                representations=[rs],
                window=None,
                timbrespace_db=None,
                verbose=False)[rs]
            tab_red = []
            rs_type = rs.split('_')[-1]
            if rs_type == 'strf':
                n_components = 1
                for i in range(len(aud_repres)):
                    # print('PCA on sound %02i' % (i + 1))
                    strf_reduced = pca.pca(
                        np.absolute(aud_repres[i]),
                        aud_repres[i].shape[1],
                        n_components=n_components).flatten()
                    tab_red.append(strf_reduced / np.max(strf_reduced))
                tab_red = np.transpose(np.asarray(tab_red))
            elif rs_type == 'spectrogram' or rs_type == 'mps':
                for i in range(len(aud_repres)):
                    tab_red.append(aud_repres[i].flatten())
                tab_red = np.transpose(np.asarray(tab_red))
            elif rs_type == 'spectrum':
                for i in range(len(aud_repres)):
                    tab_red.append(aud_repres[i])
                # 128 x nb sounds (time or freq?)
                tab_red = np.transpose(np.asarray(tab_red))
            input_data = tab_red / np.mean(np.std(tab_red, axis=0))
            # plt.plot(input_data)
            # plt.show()
            ndims, ninstrus = input_data.shape[0], input_data.shape[1]
            no_samples = ninstrus * (ninstrus - 1) / 2
            idx_triu = np.triu_indices(target_data.shape[0], k=1)
            target_v = target_data[idx_triu]
            mean_target = np.mean(target_v)
            std_target = np.std(target_v)
            kernel = np.zeros((ninstrus, ninstrus))
            for i in range(ninstrus):
                for j in range(i + 1, ninstrus):
                    kernel[i, j] = np.sum(
                        np.power(input_data[:, i] - input_data[:, j], 2))
            kernel_v = kernel[idx_triu]
            mean_kernel = np.mean(kernel_v)
            std_kernel = np.std(kernel_v)
            Jn = np.sum(
                np.multiply(kernel_v - mean_kernel, target_v - mean_target))
            Jd = (no_samples - 1) * std_target * std_kernel
            corr_results[tsp][rs] = Jn / Jd
            print('  {} : {}'.format(rs, Jn / Jd))
    pickle.dump(corr_results, open('correlations_results.pkl', 'wb'))
def pca_rho_kappa_embedding_figs():
    """Performs PCA on a collection of network stationary states arising from
    a range of $m$ and $\kappa$ values, plots the projection of the data along
    PC1 and PC2, and also plots the variances.

    **To generate data:**::

        ./rho_kappa_embedding 0 2000
        cp ./embedding_data/rho_kappa_graph_embeddings.csv ./manuscript-materials/data
        cp ./embedding_data/rho_kappa_params.csv ./manuscript-materials/data
    """
    # project graph embeddings with PCA
    embeddings = np.genfromtxt(data_directory + 'rho_kappa_graph_embeddings.csv', delimiter=',')
    k = 6
    pcs, variances = pca(embeddings, k)
    projections = np.dot(pcs.T, embeddings.T)  # (k, n) array
    # plot variances
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(np.arange(1, k+1), variances)
    ax.semilogy(np.arange(1, k+1), variances)
    ax.set_xlabel(r'$i$')
    ax.set_ylabel(r'$\sigma^2_i$')
    # plot projection along first and second principal component
    params = np.genfromtxt(data_directory + 'rho_kappa_params.csv', delimiter=',')
    # color by log(kappa)
    fs = 64
    s = 50
    fig = plt.figure()
    ax = fig.add_subplot(111)
    c = ax.scatter(projections[0], projections[1], c=np.log10(params[:,1]), s=s)
    cb = fig.colorbar(c)
    cb.set_label(r'$\log(\kappa)$', fontsize=fs)
    ax.set_xlabel(r'$w_1$', fontsize=fs)
    ax.set_ylabel(r'$w_2$', fontsize=fs)
    ax.set_xlim((1.05*np.min(projections[0]), 1.4*np.max(projections[0])))
    ax.set_ylim(bottom=1.05*np.min(projections[1]))
    formatter = FormatAxis(ax, has_zaxis=False)
    formatter.format('x', projections[0], '%d', nticks=3)
    formatter.format('y', projections[1], '%d', nticks=3)
    fig.subplots_adjust(bottom=0.15)
    # color by rho
    fig = plt.figure()
    ax = fig.add_subplot(111)
    c = ax.scatter(projections[0], projections[1], c=params[:,0], s=s)
    cb = fig.colorbar(c)
    cb.set_label(label=r'$\frac{2m}{n}$', fontsize=1.5*fs)
    ax.set_xlabel(r'$w_1$', fontsize=fs)
    ax.set_ylabel(r'$w_2$', fontsize=fs)
    ax.set_xlim((1.05*np.min(projections[0]), 1.4*np.max(projections[0])))
    ax.set_ylim(bottom=1.05*np.min(projections[1]))
    formatter = FormatAxis(ax, has_zaxis=False)
    formatter.format('x', projections[0], '%d', nticks=3)
    formatter.format('y', projections[1], '%d', nticks=3)
    fig.subplots_adjust(bottom=0.15)
    plt.show()
def test_hand_written(c=200, epsilon=0.0001, max_iter=10000, kernel=linear_kernel,
                      parallel=False, skip_rate=0, vrate=0.90):
    train_dir = 'data/Ch02/digits/trainingDigits'
    test_dir = 'data/Ch02/digits/testDigits'
    begin_progress('Reading train data')
    train_xs, train_ys = load_digits(train_dir, skip_rate)
    end_progress()
    begin_progress('Reading test data')
    test_xs, test_ys = load_digits(test_dir, skip_rate)
    end_progress()
    pcs, means = pca.pca(train_xs, vrate=vrate)
    train_xs = pca.transform(train_xs, pcs, means)
    test_xs = pca.transform(test_xs, pcs, means)
    print("Dimension reduction from {} to {}".format(*pcs.shape))
    begin_progress('Train svms')
    num_classes = 10
    svms = [None] * num_classes
    if not parallel:
        k_cache = create_k_cache(train_xs, kernel)
        for i in range(num_classes):
            k, os = train_svm(i, train_xs, train_ys, c, epsilon, max_iter,
                              kernel, k_cache)
            svms[k] = os
            progress()
    else:
        def done_hook(future):
            nonlocal svms
            i, svm = future.result()
            svms[i] = svm
            progress()
        with ProcessPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(train_svm, i, train_xs, train_ys, c,
                                       epsilon, max_iter, kernel)
                       for i in range(num_classes)]
            for future in futures:
                future.add_done_callback(done_hook)
    end_progress()
    print('Testing svms:')
    train_er = multi_get_error_rate(svms, train_xs, train_ys) * 100
    print('SVM handwritten error rate on train set: %{}'.format(train_er))
    test_er = multi_get_error_rate(svms, test_xs, test_ys) * 100
    print('SVM handwritten error rate on test set: %{}'.format(test_er))
    return train_er, test_er
def __init__(self, datacube, method='cor', verbose=True, radii=None,
             path=None, name='PCA', header=None):
    """
    Constructor of the pca_imagecube class.
    Input:
        - datacube: a 3d numpy array
        - method: 'cor' for correlation (default option), 'cov' for covariance
            or 'ssq' for sum of squares
        - verbose: True or False if you want some information printed on the terminal
        - radii: an array containing the radii in pixels of the annuli in
            which the PCA must be calculated. For instance, radii=[10,100,200]
            means the PCA will be computed in 2 annuli defined by 10px-100px
            and 100px-200px. By default, the whole image is used.
        - path: the path where results must be saved. If no path is specified,
            then results can't be saved.
        - name: a string; all output files will start with this name. A good
            practice here is to use the name of the target and/or date of
            observation. By default it is 'PCA'
        - header: the header to use for the output files.
    """
    if datacube.ndim != 3:
        raise IndexError('The input datacube must be a 3D numpy array !')
    self.nframes, self.ny, self.nx = datacube.shape
    if radii is None:
        radii = [0, int(np.round(np.sqrt((self.ny // 2)**2 + (self.nx // 2)**2)))]
    self.method = method
    self.set_path(path, verbose=verbose)
    self.set_prefix(name + '_' + method + '_' +
                    '-'.join(['{0:d}'.format(i) for i in radii]))
    self.header = header
    distarr = distance_array((self.ny, self.nx), verbose=False)
    self.region_map = np.zeros((self.ny, self.nx), dtype=int)
    self.nb_annuli = len(radii) - 1
    self.Nobj_array = np.ndarray(self.nb_annuli)
    self.pca_array = []
    self.x_indices_array = []
    self.y_indices_array = []
    if verbose:
        print('There are {0:d} frames and {1:d} regions.'.format(
            self.nframes, self.nb_annuli))
    for i in range(self.nb_annuli):
        y_indices, x_indices = np.where(
            np.logical_and(distarr >= radii[i], distarr < radii[i + 1]))
        self.y_indices_array.append(y_indices)
        self.x_indices_array.append(x_indices)
        self.Nobj_array[i] = len(y_indices)
        self.region_map[y_indices, x_indices] = i + 1
        # Transpose is used to get a shape (Nobj x Katt) where Katt is the
        # number of frames of the datacube
        data = datacube[:, y_indices, x_indices].T
        self.pca_array.append(pca.pca(data, method=method, verbose=verbose))
        if verbose:
            self.pca_array[i].print_explained_inertia(modes=5)
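# Hypothetical usage sketch for the constructor above; the cube shape and
# radii values are invented for illustration, and the in-house pca and
# distance_array helpers are assumed importable exactly as in the source.
import numpy as np

cube = np.random.rand(20, 128, 128)           # nframes x ny x nx
# PCA computed separately in two annuli: 0-40px and 40-90px from the centre
ic = pca_imagecube(cube, method='cov', verbose=True, radii=[0, 40, 90],
                   path=None, name='PCA')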
def pca_features(image):
    im = np.array(Image.open(image).convert('L'))
    process_image(image, 'empire.sift')
    features = np.loadtxt('empire.sift')
    os.remove('empire.sift')
    V, S, m = pca.pca(features)
    V = V[:50]
    features = array([dot(V, f - m) for f in features])
    np.savetxt('fea.txt', features)
    print 'done'
def test_for_outliers_and_transparency(self):
    X = np.array(np.random.normal(0, 1, 500)).reshape(100, 5)
    outliers = np.array(np.random.uniform(5, 10, 25)).reshape(5, 5)
    X = np.vstack((X, outliers))
    model = pca(alpha=0.05)
    # Fit transform
    out = model.fit_transform(X)
    assert X[out['outliers']['y_bool'], :].shape[0] == 5
    assert out['outliers'].shape[1] == 5

    ######## TEST FOR HT2 ########
    model = pca(alpha=0.05, detect_outliers=['ht2'])
    # Fit transform
    out = model.fit_transform(X)
    assert X[out['outliers']['y_bool'], :].shape[0] == 5

    ######## TEST FOR SPE/DMOX ########
    model = pca(alpha=0.05, detect_outliers=['spe'])
    # Fit transform
    out = model.fit_transform(X)
    assert 'y_bool_spe' in out['outliers'].columns

    ######## TEST WITHOUT OUTLIERS ########
    model = pca(alpha=0.05, detect_outliers=None)
    # Fit transform
    out = model.fit_transform(X)
    assert out['outliers'].empty

    ######## TEST FOR TRANSPARENCY WITH MATPLOTLIB VERSION ########
    assert model.scatter(alpha_transparency=0.1)
    assert model.scatter3d(alpha_transparency=0.1)
    assert model.biplot(alpha_transparency=0.1)
    assert model.biplot3d(alpha_transparency=0.1)
    assert model.scatter(alpha_transparency=None)
    assert model.scatter3d(alpha_transparency=None)
    assert model.biplot(alpha_transparency=None)
    assert model.biplot3d(alpha_transparency=None)
    assert model.scatter(alpha_transparency=0.5)
    assert model.scatter3d(alpha_transparency=0.5)
    assert model.biplot(alpha_transparency=0.5)
    assert model.biplot3d(alpha_transparency=0.5)
def pca_contour():
    """Plots the pca of the ellipsoid, which projects into a two-dimensional
    ellipse as expected"""
    data = np.genfromtxt('./data/output/contour_KVSt_to_dmaps.csv',
                         skip_header=1, delimiter=',')
    npts_to_dmaps = 5000
    slice_size = data.shape[0]/npts_to_dmaps
    data = data[::slice_size]
    npts = data.shape[0]
    ndims = 2
    pcs, variances = pca(data, ndims)
    plot_dmaps.plot_xy(np.dot(pcs[:,0].T, data.T), np.dot(pcs[:,1].T, data.T),
                       color=data[:,1]/data[:,2], scatter=True)
def main():
    # prepare the dataset
    print('Parsing training set...')
    train_dir = os.path.join(args.data_root, 'OLHWDB1.1trn')
    train_set = olhwdb.OLHWDB(train_dir)
    print('Parsing test set...')
    test_dir = os.path.join(args.data_root, 'OLHWDB1.1tst')
    test_set = olhwdb.OLHWDB(test_dir)
    train_x = torch.from_numpy(train_set.x)  # .to(args.gpu_id)
    train_y = train_set.y
    test_x = torch.from_numpy(test_set.x)  # .to(args.gpu_id)
    test_y = test_set.y
    # Compress the data with PCA if args.dims is set
    if args.dims is not None:
        assert (args.dims < train_x.size(1))
        print('Compressing dataset to %d dims...' % (args.dims))
        train_x, sigma, mean = pca.pca(train_x, args.dims)
        test_x = test_x - mean.view(1, -1)
        test_x = test_x.matmul(sigma)
    # create lvq model
    net = lvq.LVQ(train_x.size(1), train_set.num_classes, args.k)
    # net.to(args.gpu_id)
    # init lvq prototypes with kmeans
    net.init_prototypes(train_x, train_y, args.kmeans_iter)
    test_acc = test(net, test_x, test_y)
    best_acc = test_acc
    print('Test acc for k-means initialization: %.2f' % (test_acc))
    # create optimizer
    optimizer = optim.lvq2_1(net.prototype, net.label, 0.1, 0.25)
    for epoch in range(args.epochs):
        print('Epoch: %d | %d...' % (epoch + 1, args.epochs))
        start = time.time()
        idx = [i for i in range(train_x.size(0))]
        np.random.shuffle(idx)
        for i in idx:
            x = train_x[i, :].view(1, -1)
            y = train_y[i]
            d = net(x)
            optimizer.step(x, y, d)
        test_acc = test(net, test_x, test_y)
        if test_acc > best_acc:
            best_acc = test_acc
        end = time.time()
        print('Runtime: %.2f min. Test acc: %.2f. Best acc: %.2f' %
              ((end - start) / 60, test_acc, best_acc))
def elbowCore(channelDataAll, a, k, iRate, schedule):
    n = np.shape(channelDataAll[0])[1]  # number of columns
    p = len(channelDataAll)  # number of pages
    sub = n >> a
    rates_C = []
    rates_U = []
    rates_S = []
    for g in range(1 << a):
        # report progress
        schedule[1] += 1
        tmpSchedule = schedule[1]
        print(u'Part ' + str(tmpSchedule) + u' of ' + str(schedule[0]) + u' started!')
        channelData = []
        for h in range(p):
            channelDataPage = channelDataAll[h]
            channelData.append(channelDataPage[:, g * sub:(g + 1) * sub])
        covMatrixList = tools.getCovMatrixList(channelData)
        allCovMatrix = tools.matrixListToMatrix(covMatrixList)
        # cluster the covariance matrices
        centroids, clusterAssment = kmeans.KMeansOushi(allCovMatrix, k)
        centroidList = tools.matrixToMatrixList(centroids)
        # compute the original channel information content, the
        # covariance-matrix eigenvalues, and the transform matrices
        informations, SigmaList, UList = tools.getInformations(covMatrixList)
        # evaluate the PCA result: how much information is retained
        tmpRates = pca.pca(channelData, informations, centroidList,
                           clusterAssment, iRate)[3][0][:, 1]
        rates_C.append(np.mean(tmpRates))
        # cluster the transform matrices
        allU = tools.matrixListToMatrix_U(UList)
        weights = tools.matrixListToMatrix_U(SigmaList)
        centroids, clusterAssment = kmeans.KMeansOushi_U(allU, k, weights, iRate)
        centroidList = tools.matrixToMatrixList_U(centroids)
        # evaluate the PCA result: how much information is retained
        tmpRates = pca.pca_U(channelData, informations, centroidList,
                             clusterAssment, iRate)[3][0][:, 1]
        rates_U.append(np.mean(tmpRates))
        # no clustering, PCA directly
        tmpRates = pca.pca_S(SigmaList, iRate)[0][:, 1]
        rates_S.append(np.mean(tmpRates))
        # report progress
        print(u'Part ' + str(tmpSchedule) + u' of ' + str(schedule[0]) +
              u' finished; ' + str(schedule[1]) + u' parts done, progress: ' +
              '%.2f%%' % (schedule[1] / schedule[0] * 100) + u'!')
    rate_C = np.mean(rates_C)
    rate_U = np.mean(rates_U)
    rate_S = np.mean(rates_S)
    return rate_S.real, rate_C.real, rate_U.real
def handler(req):
    uriParts = req.uri.split("/")
    tmp = uriParts.index("chemspace")
    if len(uriParts) == tmp + 2:
        ## return the list of available spaces
        req.content_type = "text/xml"
        req.write(_getChemicalSpaceDocument([("default", "AlogP, TPSA, num rot bond, MW", 4)]))
        return apache.OK
    if len(uriParts) < tmp + 3 and len(uriParts) < tmp + 4:
        return apache.HTTP_NOT_FOUND
    spaceDef = uriParts[tmp + 1]
    if spaceDef not in ["default"]:
        return apache.HTTP_NOT_FOUND
    ## see if we have a number of components specified
    try:
        numComponent = int(uriParts[tmp + 2])
        molecules = [x.strip() for x in ("/".join(uriParts[(tmp + 3):])).split(",")]
    except:
        ## wasn't a single number
        numComponent = 2
        molecules = [x.strip() for x in ("/".join(uriParts[(tmp + 2):])).split(",")]
    ## get descriptor values
    if len(molecules) < 3:
        return apache.HTTP_NOT_FOUND
    descriptors = []
    for molecule in molecules:
        data = _getDescriptors(molecule)
        descriptors.append(data)
    if numComponent > len(descriptors[0]):
        numComponent = len(descriptors[0])
    ## do PCA
    import pca
    import numpy
    data = numpy.asarray(descriptors)
    mean, pcs, norm_pcs, variances, positions, norm_positions = pca.pca(data, "svd")
    centeredData = data - mean
    scores = numpy.dot(centeredData, numpy.transpose(pcs))
    scores = scores[:(scores.shape[0]), :numComponent]
    headers_in = req.headers_in
    try:
        accept = headers_in["Accept"]
        accept = accept.split(",")
    except KeyError as e:
        ## we don't throw an exception, since at least one client
        ## (Google Spreadsheets) does not provide an Accept header
        accept = ["text/html"]
def test_kPCA(self):
    generatedDatasetsCircle = 5  # for kPCA with randomly generated data
    for i in range(generatedDatasetsCircle):
        testDataSet = np.load("unitTest/testData/" + "testCircles" + str(i+1) + ".npy")
        expRes = np.load("unitTest/testData/" + "testCirclesTransformed" + str(i+1) + ".npy")
        resKpca = pca.pca(testDataSet, 1, "kernel")
        npt.assert_array_almost_equal(expRes, resKpca)
def test_fitSvd(self):
    """ Test PCA using the SVD approach. """
    testRequestedDim = 1
    generatedDatasetsGaussian = 5  # for svd PCA
    testDataSet = np.array([[0, 1, 2, 3, 4, 0, 0],
                            [0, 0, 0, 0, 0, 1, -1]], dtype=np.float64)
    expRes = np.array([[-1.42857, -0.42857, 0.57143, 1.57143, 2.57143, -1.42857, -1.42857]])  # Beware the 2-dimensionality
    resSvd = pca.pca(testDataSet, testRequestedDim, "svd")
    npt.assert_array_almost_equal(expRes, resSvd)
    ## with randomly generated data
    for i in range(generatedDatasetsGaussian):
        testDataSet = np.load("unitTest/testData/" + "testGaussianClasses" + str(i+1) + ".npy")
        expRes = np.load("unitTest/testData/" + "testGaussianClassesTransformed" + str(i+1) + ".npy")
        resSvd = pca.pca(testDataSet, 1, "svd")
        npt.assert_array_almost_equal(expRes, resSvd)
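# The SVD route is expected to reproduce the covariance result exactly (same
# fixture, same expected array as test_fitCov). A minimal sketch of that
# equivalence, again assuming features in rows, samples in columns, and sign
# not fixed: the left singular vectors of the centered data are the
# eigenvectors of its covariance (cov = U diag(s^2/(n-1)) U.T).
import numpy as np

def pca_svd_sketch(data, dim):
    centered = data - data.mean(axis=1, keepdims=True)
    U, svals, Vt = np.linalg.svd(centered, full_matrices=False)
    return U[:, :dim].T @ centered   # same subspace as the covariance route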
def bootstrapListOfFiles(
    files,
    directory="",
    firstGuess={"M0": 0.0, "SEP": 10.0, "PA": 90.0},
    maxResiduals=None,
    doNotFit=None,
    N=50,
    plot=False,
):
    """
    Bootstrapped version of fitListOfFiles.

    LIMITATIONS: Only works with one Target, i.e. PA and SEP result in arcseconds.

    'PCA' is the principal component reduction of the error ellipse.
    """
    res = []
    p = Pool()
    cb_boot(None, init=True)
    for k in range(N):
        seed = np.random.randint(1e9)
        p.apply_async(f_boot, (files, directory, firstGuess, doNotFit, seed),
                      callback=cb_boot)
    p.close()
    p.join()
    res = cb_boot(None, ret=True)
    # delta DEC
    X = np.array([z["BEST"]["SEP"] * np.cos(z["BEST"]["PA"] * np.pi / 180) for z in res])
    # delta RA cos(DEC)
    Y = np.array([z["BEST"]["SEP"] * np.sin(z["BEST"]["PA"] * np.pi / 180) for z in res])
    p = pca.pca(np.transpose(np.array([(X - X.mean()), (Y - Y.mean())])))
    err0 = p.coef[:, 0].std()
    err1 = p.coef[:, 1].std()
    if plot:
        pyplot.figure(10)
        pyplot.clf()
        pyplot.axes().set_aspect("equal", "datalim")
        pyplot.plot(Y, X, ".k", label="bootstrapped positions", alpha=0.5)
        pyplot.legend()
        pyplot.ylabel(r"$\Delta$ dec [arcsec]")
        pyplot.xlabel(r"$\Delta$ RA $\cos$(dec) [arcsec]")
    # results in the usual coordinate frame; units: arcseconds
    result = {
        "Delta RA cos(DEC)": Y,
        "Delta DEC": X,
        "AVG Delta RA cos(DEC)": Y.mean(),
        "AVG Delta DEC": X.mean(),
        "PCA": (list(p.base[0]), list(p.base[1])),
        "errs": (err0, err1),
    }
    return result
def main():
    rootdir = 'fotos'
    kernel_degree = 2
    kernel_ctx = 1
    kernel_denom = 30
    people_number = 6
    train_number_kpca = 6
    test_number_kpca = 4
    train_number_pca = 3
    test_number_pca = 7
    action_op = input('Choose the action to perform:\n 1 -> Test a method (pca or kpca) \n 2 -> Classify an image \n Choice (1 or 2): ')
    method_op = input('Choose the method:\n 1 -> PCA \n 2 -> KPCA \n Choice (1 or 2): ')
    if action_op == '1':
        if method_op == '1':
            pca.pca(rootdir, people_number, train_number_pca, test_number_pca)
        elif method_op == '2':
            kpca.kpca(rootdir, people_number, train_number_kpca, test_number_kpca,
                      kernel_denom, kernel_ctx, kernel_degree)
        else:
            print("Invalid method")
            exit(1)
    elif action_op == '2':
        name_face = input('Enter the person\'s name.\nOptions:\tagustin\n\t\taugusto\n\t\tcatalina\n\t\tfrancisco\n\t\tguido\n\t\tnicolas\nYour choice: ')
        number_face = input('Enter the photo number [1-10]: ')
        if method_op == '1':
            pca.classify_face_by_pca(rootdir, people_number, 6, name_face, number_face)
        elif method_op == '2':
            kpca.classify_face_by_kpca(rootdir, people_number, 4, name_face, number_face)
        else:
            print("Invalid method")
            exit(1)
    else:
        print("Invalid action")
        exit(1)
def visualize_km(data, classes, k=3, num_components=5, perplexity=30, alpha=0.5):
    # Compute k-means without dimensionality reduction
    labels_without = kmeans(data, k)[:, -1]
    # Compute k-means with PCA
    pca_data = pca(data, False)[:, :num_components]
    labels_pca = kmeans(pca_data, k)[:, -1]
    # Project data to t-SNE
    tsne = manifold.TSNE(2, perplexity=perplexity)
    tsne_data = tsne.fit_transform(data)
    fig1, axes1 = plt.subplots(3, 2, figsize=(8, 8))
    plt.subplots_adjust(top=0.961, bottom=0.062, left=0.1, right=0.991,
                        hspace=0.4, wspace=0.3)
    # Plot original labels with 2 principal components (PCA)
    axes1[0, 0].set_title('(a) Original labels (PCA)')
    axes1[0, 0].set_xlabel('1st principal component')
    axes1[0, 0].set_ylabel('2nd principal component')
    axes1[0, 0].scatter(pca_data[:, 0], pca_data[:, 1], c=classes,
                        alpha=alpha, cmap="rainbow")
    # Plot original labels with 2 principal components (t-SNE)
    axes1[0, 1].set_title('(b) Original labels (t-SNE)')
    axes1[0, 1].set_xlabel('1st principal component')
    axes1[0, 1].set_ylabel('2nd principal component')
    axes1[0, 1].scatter(tsne_data[:, 0], tsne_data[:, 1], c=classes,
                        alpha=alpha, cmap="rainbow")
    # Plot k-means without dim red (PCA)
    axes1[1, 0].set_title('(c) K-Means with original data (PCA)')
    axes1[1, 0].set_xlabel('1st principal component')
    axes1[1, 0].set_ylabel('2nd principal component')
    axes1[1, 0].scatter(pca_data[:, 0], pca_data[:, 1], c=labels_without,
                        alpha=alpha, cmap="rainbow")
    # Plot k-means without dim red (t-SNE)
    axes1[1, 1].set_title('(d) K-Means with original data (t-SNE)')
    axes1[1, 1].set_xlabel('1st principal component')
    axes1[1, 1].set_ylabel('2nd principal component')
    axes1[1, 1].scatter(tsne_data[:, 0], tsne_data[:, 1], c=labels_without,
                        alpha=alpha, cmap="rainbow")
    # Plot k-means after PCA (PCA)
    axes1[2, 0].set_title('(e) K-Means after PCA (PCA)')
    axes1[2, 0].set_xlabel('1st principal component')
    axes1[2, 0].set_ylabel('2nd principal component')
    axes1[2, 0].scatter(pca_data[:, 0], pca_data[:, 1], c=labels_pca,
                        alpha=alpha, cmap="rainbow")
    # Plot k-means after PCA (t-SNE)
    axes1[2, 1].set_title('(f) K-Means after PCA (t-SNE)')
    axes1[2, 1].set_xlabel('1st principal component')
    axes1[2, 1].set_ylabel('2nd principal component')
    axes1[2, 1].scatter(tsne_data[:, 0], tsne_data[:, 1], c=labels_pca,
                        alpha=alpha, cmap="rainbow")
    plt.show()
def get_mapping(self, labels):
    self.z_mapping = []
    self.c_mapping = []
    for i in self.elements:
        # compute z once and append it (the original computed it twice)
        temp = self.compute_z(labels[i])
        self.z_mapping.append(temp)
    self.z_mapping = np.asarray(self.z_mapping)
    Y, P, mu = pca.pca(np.asarray(self.z_mapping), 1)
    for y in Y:
        self.c_mapping.append(1 if y > 0 else 0)
    self.c_mapping = np.asarray(self.c_mapping)
def data_pre_treatment(training_data, validation_data):
    training_data, validation_data = pca(training_data, validation_data)
    log("CLASSIF",
        "Transformed datasets using PCA.\nTraining Data: {} vectors; "
        "Validation Data: {} vectors".format(len(training_data), len(validation_data)),
        time_start)
    return training_data, validation_data
def linearRegression(tr, te, m, Zee, k=False):
    trainData = tr
    testData = te
    Z = Zee
    if k:
        featureVectors = kMeans(trainData, m)
    else:
        # pca matrix, transposed
        mean, featureVectors = pca.pca(trainData, m)
    # to extract 1st column: pcaMatT[:, 0]
    trainData = np.matrix(trainData).transpose()
    testData = np.matrix(testData).transpose()
    featureVectors = np.pad(featureVectors, ((0, 1), (0, 0)), 'constant',
                            constant_values=1)
    # get compressed training data
    ctData = featureVectors * trainData
    ctestData = featureVectors * testData
    Phi = ctData.transpose()
    # Compute the Wopt
    Wopt = (inv(Phi.transpose() * Phi) * Phi.transpose() * Z.transpose()).transpose()
    # print(Wopt.shape)
    SEkTrain = 0
    MRTrain = 0
    SEkTest = 0
    MRTest = 0
    # Calculate the mean square errors and misclassification ratios
    # for the training and testing sets
    for i in range(0, 1000):
        SEkTrain += pow(norm((Wopt * ctData[:, i] - Z[:, i])[:-1, 0]), 2)  # removing the padding
    SEkTrain /= 1000
    for i in range(0, 1000):
        MRTrain += getMCBool(Wopt, ctData[:, i], Z[:, i])
    MRTrain /= 1000.0
    for i in range(0, 1000):
        test = pow(norm((Wopt * ctestData[:, i] - Z[:, i])[:-1, 0]), 2)  # removing the padding
        SEkTest += test
    SEkTest /= 1000
    for i in range(0, 1000):
        MRTest += getMCBool(Wopt, ctestData[:, i], Z[:, i])
    MRTest /= 1000.0
    # print SEkTrain, MRTrain
    # print SEkTest, MRTest
    return SEkTrain, MRTrain, SEkTest, MRTest
def projections(table, numdim):
    matrix = []
    for i in range(len(table.data[0])):
        tmp = []
        for j in range(len(table.name) - 1):
            # print tables[ite][1][0].data[j][i]
            tmp += [float(table.data[j][i])]
        matrix += [tmp]
    M = numpy.matrix(matrix)
    # C = table.data[table.klass[0]]
    px, py = pca.pca(M, numdim)  # project onto numdim dimensions via PCA
    # LDAM = lda.lda(M, C, numdim)
    return widen(table, px, py)
def plotDisp2D(all_loc):
    # NOTE: k (the target dimensionality) is assumed to be defined at module scope
    ZMat = pca(all_loc.T, k).T
    figure = plt.figure()
    ax = figure.add_subplot(111)
    ax.scatter(ZMat[:, 0].tolist(), ZMat[:, 1].tolist(), s=50, c='green',
               marker='.')
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.savefig("THREE2TWO.png")
    plt.show()
def main(mlp_experiment, net, nCxt, outLayer, feat_dir):
    ### Use dropout???
    if sum(net.dropouts):
        useDropout = True
    else:
        useDropout = False
    print 'useDropout', useDropout

    ### Create output directory
    outFeatDir = 'some path to output features/'
    # feat template: if your output feat directories obey a particular structure,
    # have empty directories at this path
    feat_template = 'path for template directory structure needed, if needed'
    os.system('cp -R ' + feat_template + ' ' + feat_dir + outFeatDir)
    assert(os.path.isdir(feat_dir + outFeatDir + 'test_feat_16k/test_07'))
    print 'Output feat_dir:', feat_dir + outFeatDir

    ### Training files
    featList = 'files_train_noisy_16k.txt'  # list of training data files
    Nframes = 5438715  # total number of vectors in the dataset
    print 'Transforming training features pre-PCA... '
    t1 = time.time()
    X = apply_nn_train_prePCA(net, nCxt, outLayer, feat_dir, featList,
                              outFeatDir, Nframes, useDropout)
    t2 = time.time()
    print 'Total time taken for xfing training prePCA: ', (t2 - t1)/60, 'minutes'
    np.save('X_prePCA_scale0.05', X)
    # X = np.load('train_prePCA_X_likeMatlab.npy')

    print
    print 'Performing PCA...'
    P = pca(X, 39)

    print
    print 'Transforming Training features post-PCA...'
    t1 = time.time()
    featList = 'files_train_noisy_16k_prePCA.txt'
    apply_nn_train_PCA(P, feat_dir, featList, outFeatDir)
    t2 = time.time()
    print 'Total time taken for applying PCA to training data:', (t2 - t1)/60, 'minutes'

    ### Testing data
    print
    print 'Transforming Test data...'
    featList = 'files_test_16k.txt'
    t1 = time.time()
    apply_nn_test(P, net, nCxt, outLayer, feat_dir, featList, outFeatDir, useDropout)
    t2 = time.time()
    print 'Total time taken for xfing testing features: ', (t2 - t1)/60, 'minutes'
def main():
    # read the data; the second argument is the number of columns
    data, labels = read("Homework2_pca_c.txt", 12)
    # data = data.astype(np.float)
    my_pca_res = mypca.pca(data)
    sklearn_pca_res = skap.apply_pca(data)
    sklearn_svd_res = skap.apply_svd(data)
    sklearn_tsne_res = skap.apply_tsne(data)
    vs.visualization(my_pca_res, labels, 'my_pca', 'PC')
    vs.visualization(sklearn_pca_res, labels, 'sklearn_pca', 'PC')
    vs.visualization(sklearn_svd_res, labels, 'sklearn_svd', 'SV')
    vs.visualization(sklearn_tsne_res, labels, 'sklearn_tsne', 'tSNE')
def st_sne(data, dim, layers=2, perplexity=30, verbose=True, E=[]):
    '''Space-Time Embedding

    data is the NxDIM data matrix,
    dim is the embedding dimension (dim << DIM),
    return the Nxdim embedding coordinates
    '''
    if data.shape[1] > 30:
        if verbose:
            print('PCA %d->%d' % (data.shape[1], 30))
        data = pca.pca(data, 30)
    return st_sned(dist2(data), dim, layers, perplexity, verbose, E)
def __init__(self, x, y, inputs, eta_b=0.3, eta_n=0.1, nSize=0.5, alpha=1,
             usePCA=1, useBCs=0, eta_bfinal=0.03, eta_nfinal=0.01,
             nSizefinal=0.05):
    self.nData = np.shape(inputs)[0]
    self.nDim = np.shape(inputs)[1]
    self.mapDim = 2
    self.x = x
    self.y = y
    self.eta_b = eta_b
    self.eta_bfinal = eta_bfinal
    self.eta_n = eta_n
    self.eta_nfinal = eta_nfinal
    self.nSize = nSize
    self.nSizefinal = nSizefinal
    self.alpha = alpha
    self.map = np.mgrid[0:1:complex(0, x), 0:1:complex(0, y)]
    self.map = np.reshape(self.map, (2, x * y))
    if usePCA:
        dummy1, dummy2, evals, evecs = pca.pca(inputs, 2)
        self.weights = np.zeros((self.nDim, x * y))
        for i in range(x * y):
            for j in range(self.mapDim):
                self.weights[:, i] += (self.map[j, i] - 0.5) * 2 * evecs[:, j]
    else:
        self.weights = (np.random.rand(self.nDim, x * y) - 0.5) * 2
    self.mapDist = np.zeros((self.x * self.y, self.x * self.y))
    if useBCs:
        for i in range(self.x * self.y):
            for j in range(i + 1, self.x * self.y):
                # wrap-around (toroidal) distances; note np.min needs a single
                # sequence, not separate positional arguments
                xdist = np.min([(self.map[0, i] - self.map[0, j]) ** 2,
                                (self.map[0, i] + 1 + 1. / self.x - self.map[0, j]) ** 2,
                                (self.map[0, i] - 1 - 1. / self.x - self.map[0, j]) ** 2,
                                (self.map[0, i] - self.map[0, j] + 1 + 1. / self.x) ** 2,
                                (self.map[0, i] - self.map[0, j] - 1 - 1. / self.x) ** 2])
                ydist = np.min([(self.map[1, i] - self.map[1, j]) ** 2,
                                (self.map[1, i] + 1 + 1. / self.y - self.map[1, j]) ** 2,
                                (self.map[1, i] - 1 - 1. / self.y - self.map[1, j]) ** 2,
                                (self.map[1, i] - self.map[1, j] + 1 + 1. / self.y) ** 2,
                                (self.map[1, i] - self.map[1, j] - 1 - 1. / self.y) ** 2])
                self.mapDist[i, j] = np.sqrt(xdist + ydist)
                self.mapDist[j, i] = self.mapDist[i, j]
    else:
        for i in range(self.x * self.y):
            for j in range(i + 1, self.x * self.y):
                self.mapDist[i, j] = np.sqrt(
                    (self.map[0, i] - self.map[0, j]) ** 2 +
                    (self.map[1, i] - self.map[1, j]) ** 2)
                self.mapDist[j, i] = self.mapDist[i, j]
def main():
    data_base1 = r'List03\Databases\KC1.csv'
    data_base2 = r'List03\Databases\CM1.csv'
    columns_names = "loc,v(g),ev(g),iv(g),n,v,l,d,i,e,b,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects".split(',')
    df = pd.read_csv(data_base1, names=columns_names)  # switch to data_base2 for CM1
    data = df.iloc[:, :-1].copy()  # data without target
    target = df['defects']  # target
    class_values = df['defects'].unique()  # number of classes
    k_components = 3  # [1,3,5,9,15,20] components for PCA
    # PCA, LDA instances
    pca_instance = pca.pca(data, target)
    lda_instance = lda.lda(df, target, class_values)
    # PCA ------------------------------------------------------------------
    cov_matriz = pca_instance.cov_matriz()
    eigenvalues, eigenvectors = pca_instance.get_eigen_value_vector(cov_matriz)
    eigen_vec = pca_instance.get_eigenvecs(eigenvalues, eigenvectors, k_components)
    pca_instance.normalize()
    new_dataset = pca_instance.change_base(eigen_vec, pca_instance.normalize_data)
    # LDA ------------------------------------------------------------------
    mean_vectors = lda_instance.calc_mean_vect()
    data_class = lda_instance.get_data_per_class()
    s_w = lda_instance.calc_sw(mean_vectors, data_class)
    s_b = lda_instance.calc_sb(mean_vectors)
    eig_pairs = lda_instance.get_eigs(s_w, s_b)
    lda_components = lda_instance.get_k_eigenvcs(eig_pairs, len(class_values) - 1)
    new_space = pd.DataFrame(lda_instance.transform(lda_components))
    skf = StratifiedKFold(n_splits=3)  # number of folds
    knns = [1, 3, 5]
    print("Components PCA: %.1d" % k_components)
    for j in knns:
        print("KNN = %.1d" % j)
        print("PCA")
        accuracy_pca = pca_instance.knn(new_dataset, j, skf)
        accuracy_without_pca = pca_instance.knn(data, j, skf)
        print("Accuracy with PCA: %.3f " % np.mean(accuracy_pca))
        print("Accuracy without PCA: %.3f\n" % np.mean(accuracy_without_pca))
        print("LDA")
        accuracy_lda = lda_instance.knn(new_space, j, skf)
        accuracy_without_lda = lda_instance.knn(data, j, skf)
        print("Accuracy with LDA: %.3f " % np.mean(accuracy_lda))
        print("Accuracy without LDA: %.3f\n" % np.mean(accuracy_without_lda))
def train_and_save(a):
    process_data('train/', 'train_sifts/', a)  # process training data
    features, labels = read_gesture_feature_labels('train_sifts/')
    classnames = unique(labels)  # sorted list of unique class names
    V, S, m = pca.pca(features)
    # keep the most important dimensions
    dims = 50
    V = V[:dims]
    features = array([dot(V, f - m) for f in features])
    blist = [features[where(labels == c)[0]] for c in classnames]
    with open('features.pkl', 'wb') as f:
        pickle.dump(blist, f)
        pickle.dump(classnames, f)
        pickle.dump(V, f)
        pickle.dump(m, f)
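# The matching load side (illustrative): successive pickle.load calls return
# the objects in the same order they were dumped above, so a classifier can
# recover the per-class features along with the PCA basis V and mean m.
import pickle

with open('features.pkl', 'rb') as f:
    blist = pickle.load(f)
    classnames = pickle.load(f)
    V = pickle.load(f)
    m = pickle.load(f)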
def __init__(self, x, y, inputs, eta_b=0.3, eta_n=0.1, nSize=0.5, alpha=1,
             usePCA=1, useBCs=0, eta_bfinal=0.03, eta_nfinal=0.01,
             nSizefinal=0.05):
    self.nData = np.shape(inputs)[0]
    self.nDim = np.shape(inputs)[1]
    # output map size
    # TODO make more universal
    self.mapDim = 2
    self.x = x
    self.y = y
    self.eta_b = eta_b
    self.eta_bfinal = eta_bfinal
    self.eta_n = eta_n
    self.eta_nfinal = eta_nfinal
    self.nSize = nSize
    self.nSizefinal = nSizefinal
    self.alpha = alpha
    self.map = np.mgrid[0:1:complex(0, x), 0:1:complex(0, y)]
    self.mapDim = 2
    self.map = np.reshape(self.map, (2, x*y))
    # weights initialization
    if usePCA:
        dummy1, dummy2, evals, evecs = pca.pca(inputs, 2)
        self.weights = np.zeros((self.nDim, x*y))
        for i in xrange(self.x*self.y):
            for j in range(self.mapDim):
                self.weights[:, i] += (self.map[j, i] - 0.5) * 2 * evecs[:, j]
    else:
        # random values from the interval <-1,1>
        self.weights = (np.random.rand(self.nDim, x*y) - 0.5) * 2
    # pre-computing the map distances
    self.mapDist = np.zeros((self.x*self.y, self.x*self.y))
    if useBCs:
        for i in xrange(self.x*self.y):
            for j in xrange(i+1, self.x*self.y):
                xdist = np.min([(self.map[0, i] - self.map[0, j])**2,
                                (self.map[0, i] + 1 + 1./self.x - self.map[0, j])**2,
                                (self.map[0, i] - 1 - 1./self.x - self.map[0, j])**2,
                                (self.map[0, i] - self.map[0, j] + 1 + 1./self.x)**2,
                                (self.map[0, i] - self.map[0, j] - 1 - 1./self.x)**2])
                ydist = np.min([(self.map[1, i] - self.map[1, j])**2,
                                (self.map[1, i] + 1 + 1./self.y - self.map[1, j])**2,
                                (self.map[1, i] - 1 - 1./self.y - self.map[1, j])**2,
                                (self.map[1, i] - self.map[1, j] + 1 + 1./self.y)**2,
                                (self.map[1, i] - self.map[1, j] - 1 - 1./self.y)**2])
                self.mapDist[i, j] = np.sqrt(xdist + ydist)
                self.mapDist[j, i] = self.mapDist[i, j]
    else:
        for i in xrange(self.x*self.y):
            for j in xrange(i+1, self.x*self.y):
                self.mapDist[i, j] = np.sqrt((self.map[0, i] - self.map[0, j])**2 +
                                             (self.map[1, i] - self.map[1, j])**2)
                self.mapDist[j, i] = self.mapDist[i, j]
def train(self, root_training_images_folder):
    self.projected_classes = []
    self.list_of_arrays_of_images, self.labels_list, \
        list_of_matrices_of_flattened_class_samples = \
        read_images(root_training_images_folder)
    # create matrix to store all flattened images
    images_matrix = np.array([np.array(Image.fromarray(img)).flatten()
                              for img in self.list_of_arrays_of_images], 'f')
    # perform PCA
    self.eigenfaces_matrix, variance, self.mean_Image = pca.pca(images_matrix)
    # Project each class sample (as a class matrix), then use the class
    # average as the class weights for comparison with the target image
    for class_sample in list_of_matrices_of_flattened_class_samples:
        class_weights_vertex = self.project_image(class_sample)
        self.projected_classes.append(class_weights_vertex.mean(0))
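# self.project_image is defined elsewhere in this class; a plausible sketch of
# it under standard eigenfaces assumptions (subtract the mean image, then take
# inner products with the eigenface rows). This is an illustration of the
# technique, not the project's actual implementation.
import numpy as np

def project_image_sketch(samples, eigenfaces_matrix, mean_image):
    # samples: (n, d) flattened images; eigenfaces_matrix: (k, d); mean_image: (d,)
    return np.dot(samples - mean_image, eigenfaces_matrix.T)  # (n, k) weights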
def run_pca(sdata, pca_fraction=0.85, eigenvector_weight=0.25):
    """
    Create a binary matrix via gen_matrix, normalise it, and then run PCA
    to reduce dimensionality.

    Usage: run_pca(sdata, pca_fraction, eigenvector_weight)

    sdata              - parsers.Parse object with sample data as raw sequences
    pca_fraction       - The top pca_fraction fraction of principal components to keep
    eigenvector_weight - The top fraction of SNPs to keep which occur with
                         high weights in those principal components

    Returns: modified parsers.Parse object

    This function runs makeplot once the data in sdata has been converted to
    binary and then normalised. It calls console to log its results to screen
    and to logfile.
    """
    console = display.ConsoleDisplay(logname='PCA results')
    M = numpy.array([x.data for x in sdata.samples])
    console.log("Normalising %sx%s matrix" % (len(sdata.samples), len(sdata.samples[0].data)))
    # Only center the data
    M = pca.normalise(M, log2=False, sub_medians=False, center=True, scale=False)
    # Unrolling pca.select_genes_by_pca...
    V = pca.pca(M, pca_fraction)  # from SVD
    SNP_indices = pca.select_genes(V, eigenvector_weight)
    console.log("Found %s principal components in the top %s fraction" % (len(V), pca_fraction))  # 166
    console.log("Found %s reliable SNPs occurring with high weight (top %s by absolute value)" % (len(SNP_indices), eigenvector_weight))  # 410
    # Don't reduce dimensionality right away, we need to take a picture
    for i in xrange(len(sdata.samples)):
        sdata.samples[i].data = M[i]
    makeplot(sdata, V, 'PCA results - All samples')
    # Reduce dimensions
    for i in xrange(len(sdata.samples)):
        sdata.samples[i].data = M[i].take(SNP_indices)
    return sdata
def plotSegments(self, rawData, segPoints="pca", subplot=False, applyPCA=False):
    if "currentTime" in rawData.columns:
        rawData = rawData.drop("currentTime", axis=1)
    if not subplot:
        plt.figure(figsize=(11, 9))
    if segPoints == "pca":
        segPoints = self.pca_segmenter.findSegments(rawData)
    if segPoints == "minExtrema":
        segPoints = self.relativeExtremaSegments(rawData, maxMin="min")
    if segPoints == "maxExtrema":
        segPoints = self.relativeExtremaSegments(rawData, maxMin="max")
    if applyPCA:
        plt.plot(pca(rawData, n_components=1)[0])
    else:
        plt.plot(rawData)
    for s in segPoints:
        if s <= len(rawData):
            plt.axvline(s, color='black', linewidth=2)
    if not subplot:
        plt.show()
def setup_X(self):
    ''' PCA project the observation matrix '''
    # transpose then mean-subtract the matrix;
    # these are necessary steps for PCA
    #
    # first transpose, resulting in an FxN matrix where F is the number
    # of features and N is the number of instances
    self.indices_to_ids, self.X = self.unlabeled_datasets[0].to_numpy_arr(
        indicator=True, build_id_dict=True)
    X_t = self.X.T
    # subtract the mean from each row
    X_t_bar = [r - r.mean() for r in X_t]
    # build a new matrix
    X_t_bar = numpy.array(X_t_bar)
    # now run pca on the mean-subtracted transposed matrix
    self.V, P = pca.pca(X_t_bar)
    self.full_P = P
    # pdb.set_trace()
    # keep only the top r principal components; the slice stays inside the
    # if-block, since P[:, :-0] would otherwise yield an empty matrix
    drop_n = 0
    if self.r is not None:
        drop_n = P.shape[1] - self.r
        P = P[:, :-drop_n]
    self.P = P.T
    # finally, project X onto the lower-dimensional space. note that PX will
    # be an r x N matrix wherein each column corresponds to an instance
    # projected onto the top r principal components
    self.PX = numpy.dot(P.T, X_t)
def findSegments(self, rawData, minSegSize=20):
    PCs = pca(rawData, n_components=2)[0]
    minSegs = argrelmin(PCs[:, 0], order=minSegSize)[0]
    maxSegs = argrelmax(PCs[:, 0], order=minSegSize)[0]
    minSegs_secondComponentValues = [[PCs[:, 1][s]] for s in minSegs]
    maxSegs_secondComponentValues = [[PCs[:, 1][s]] for s in maxSegs]
    clf = cluster.KMeans(2)
    min_clusters = clf.fit_predict(minSegs_secondComponentValues)
    max_clusters = clf.fit_predict(maxSegs_secondComponentValues)
    indexes_min_cluster0 = [s for (s, c) in zip(minSegs, min_clusters) if c == 0]
    indexes_min_cluster1 = [s for (s, c) in zip(minSegs, min_clusters) if c == 1]
    indexes_max_cluster0 = [s for (s, c) in zip(maxSegs, max_clusters) if c == 0]
    indexes_max_cluster1 = [s for (s, c) in zip(maxSegs, max_clusters) if c == 1]
    # first-component values of each cluster; list comprehensions keep this
    # working on both Python 2 and 3 (map() returns a lazy iterator on 3,
    # which would break the len() calls below)
    values_min_cluster0 = [PCs[:, 0][x] for x in indexes_min_cluster0]
    values_min_cluster1 = [PCs[:, 0][x] for x in indexes_min_cluster1]
    values_max_cluster0 = [PCs[:, 0][x] for x in indexes_max_cluster0]
    values_max_cluster1 = [PCs[:, 0][x] for x in indexes_max_cluster1]
    average_min_cluster0 = abs(sum(values_min_cluster0) / float(len(values_min_cluster0)))
    average_min_cluster1 = abs(sum(values_min_cluster1) / float(len(values_min_cluster1)))
    average_max_cluster0 = abs(sum(values_max_cluster0) / float(len(values_max_cluster0)))
    average_max_cluster1 = abs(sum(values_max_cluster1) / float(len(values_max_cluster1)))
    max_average = max(average_min_cluster0, average_min_cluster1,
                      average_max_cluster0, average_max_cluster1)
    # return the cluster of extrema whose mean first-component magnitude is largest
    if max_average == abs(average_min_cluster0):
        return indexes_min_cluster0
    if max_average == abs(average_min_cluster1):
        return indexes_min_cluster1
    if max_average == abs(average_max_cluster0):
        return indexes_max_cluster0
    if max_average == abs(average_max_cluster1):
        return indexes_max_cluster1
'''
Created on Jun 14, 2011

@author: Song Yu
'''
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
import pca

def replaceNanWithMean():
    datMat = pca.loadDataSet('secom.data', ' ')
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        # mean of the values that are not NaN (not a number)
        meanVal = mean(datMat[nonzero(~isnan(datMat[:, i].A))[0], i])
        # set NaN values to the mean
        datMat[nonzero(isnan(datMat[:, i].A))[0], i] = meanVal
    return datMat

dataMat = replaceNanWithMean()
lowDDataMat, reconMat, total, varPercentage = pca.pca(dataMat, topNfeat=9999999)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(range(1, 51), varPercentage[:50], marker='^')
plt.xlabel('Principal Component Number')
plt.ylabel('Percentage of Variance')
plt.show()
        # (fragment: the loop header and the groupNum == 0 branch are truncated)
        x = r0 + 8.0
        y = 1.0*r1 + x
        xcord1.append(x)
        ycord1.append(y)
    elif groupNum == 2:
        x = r0 + 0.0
        y = 1.0*r1 + x
        xcord2.append(x)
        ycord2.append(y)
    fw.write("%f\t%f\t%d\n" % (x, y, groupNum))
fw.close()
fig = plt.figure()
ax = fig.add_subplot(211)
ax.scatter(xcord0, ycord0, marker='^', s=90)
ax.scatter(xcord1, ycord1, marker='o', s=50, c='red')
ax.scatter(xcord2, ycord2, marker='v', s=50, c='yellow')
ax = fig.add_subplot(212)
myDat = pca.loadDataSet('testSet3.txt')
lowDDat, reconDat = pca.pca(myDat[:, 0:2], 1)
label0Mat = lowDDat[nonzero(myDat[:, 2] == 0)[0], :2][0]  # get the items with label 0
label1Mat = lowDDat[nonzero(myDat[:, 2] == 1)[0], :2][0]  # get the items with label 1
label2Mat = lowDDat[nonzero(myDat[:, 2] == 2)[0], :2][0]  # get the items with label 2
# ax.scatter(label0Mat[:, 0], label0Mat[:, 1], marker='^', s=90)
# ax.scatter(label1Mat[:, 0], label1Mat[:, 1], marker='o', s=50, c='red')
# ax.scatter(label2Mat[:, 0], label2Mat[:, 1], marker='v', s=50, c='yellow')
ax.scatter(label0Mat[:, 0], zeros(shape(label0Mat)[0]), marker='^', s=90)
# ax.scatter(label1Mat[:, 0], zeros(shape(label1Mat)[0]), marker='o', s=50, c='red')
# ax.scatter(label2Mat[:, 0], zeros(shape(label2Mat)[0]), marker='v', s=50, c='yellow')
plt.show()
for i in range(numberOfTestUnits):
    x = np.random.multivariate_normal([10, 10], [[1, 0], [0, 50]], size).T
    y = np.random.multivariate_normal([10, 60], [[1, 0], [0, 50]], size).T
    x = np.concatenate((x, y), axis=1)
    np.save("unitTest/testData/" + "testGaussianClasses" + str(i+1), x)
    # Memory-mapped version
    xm = np.memmap("unitTest/testData/" + "testGaussianClassesMmap" + str(i+1) + ".npy",
                   dtype="float64", mode="w+", shape=(2, size*2))
    xm[:] = x
    xm.flush()
    # do PCA
    x = np.load("unitTest/testData/" + "testGaussianClasses" + str(i+1) + ".npy")
    p = pca.pca(x, 1, mode="svd")
    # save the PCA result
    np.save("unitTest/testData/" + "testGaussianClassesTransformed" + str(i+1), p)
    # Generate concentric circles
    x, y = dts.make_circles(n_samples=1000, noise=0.1, factor=0.25)
    y = np.reshape(y, (1000, 1))  # these are class labels: 0, 1
    x = np.concatenate((x, y), axis=1)
    np.save("unitTest/testData/" + "testCircles" + str(i+1), x.T)
    # Memory-mapped circles
    # xm = np.memmap("unitTest/testData/" + "testCirclesMmap" + str(i+1) + ".npy",
    #                dtype="float64", mode="w+", shape=(3, 1000))
    # xm[:] = x.T
    # xm.flush()
def plotAll(self, sensors=['gyroX', 'gyroY', 'gyroZ', 'accelX', 'accelY',
                           'accelZ', 'magX', 'magY', 'magZ'],
            LR='L', segment=None, applyPCA=False):
    plt.figure(figsize=(12, 8), dpi=80)

    foot = plt.subplot(511)
    plt.title("Foot")
    data = self.feet[2][LR][sensors]
    if segment:
        self.plotSegments(data, segPoints=segment, subplot=True, applyPCA=applyPCA)
    elif applyPCA:
        plt.plot(pca(data, n_components=1)[0])
    else:
        data.plot(ax=foot)
    plt.xlabel("")
    plt.setp(foot.get_xticklabels(), visible=False)

    shin = plt.subplot(512, sharex=foot)
    plt.title("Shin")
    data = self.shins[2][LR][sensors]
    if segment:
        self.plotSegments(data, segPoints=segment, subplot=True, applyPCA=applyPCA)
    elif applyPCA:
        plt.plot(pca(data, n_components=1)[0])
    else:
        data.plot(ax=shin)
    plt.xlabel("")
    plt.setp(shin.get_xticklabels(), visible=False)

    thigh = plt.subplot(513, sharex=foot)
    plt.title("Thigh")
    data = self.thighs[1][LR][sensors]
    if segment:
        self.plotSegments(data, segPoints=segment, subplot=True, applyPCA=applyPCA)
    elif applyPCA:
        plt.plot(pca(data, n_components=1)[0])
    else:
        data.plot(ax=thigh)
    plt.xlabel("")
    plt.setp(thigh.get_xticklabels(), visible=False)

    hip = plt.subplot(514, sharex=foot)
    plt.title("Hip")
    data = self.hips[1][LR][sensors]
    if segment:
        self.plotSegments(data, segPoints=segment, subplot=True, applyPCA=applyPCA)
    elif applyPCA:
        plt.plot(pca(data, n_components=1)[0])
    else:
        data.plot(ax=hip)
    plt.xlabel("")
    plt.setp(hip.get_xticklabels(), visible=False)

    chest = plt.subplot(515, sharex=foot)
    plt.title("Chest")
    data = self.chest[1][LR][sensors]
    if segment:
        self.plotSegments(data, segPoints=segment, subplot=True, applyPCA=applyPCA)
    elif applyPCA:
        plt.plot(pca(data, n_components=1)[0])
    else:
        data.plot(ax=chest)
    plt.show()
from PIL import Image
import pca
import numpy as np
import pylab
import os

indir = 'data/a_thumbs'
imlist = [os.path.join(indir, f) for f in os.listdir(indir)]

im = np.array(Image.open(imlist[0]))
m, n = im.shape[0:2]
count = len(imlist)

immatrix = np.array([np.array(Image.open(i)).flatten() for i in imlist], 'f')

V, S, immean = pca.pca(immatrix)

pylab.figure()
pylab.gray()
pylab.subplot(2, 4, 1)
pylab.imshow(immean.reshape(m, n))
for i in range(7):
    pylab.subplot(2, 4, i+2)
    pylab.imshow(V[i].reshape(m, n))
pylab.show()