def build_classifier(train_data_x_in, train_data_y, classifier_in="svc_basic"):
    print "Attempting to build classifier."
    train_data_x = train_data_x_in
    transformer = ""
    # classifier = grid_search.GridSearchCV(svm.SVC(), parameters).fit(train_data_x, train_data_y)
    if classifier_in == "svc_basic":
        classifier = svm.SVC()
        print "Selection was basic svm.SVC."
    elif classifier_in == "svc_extensive":
        classifier = svm.SVC(kernel="linear", C=0.025, gamma=0.01)
        print "Selection was extensive svm.SVC, with linear kernel, C==0.025 and gamma==0.01."
    elif classifier_in == "kneighbors_basic":
        transformer = RandomizedPCA(n_components=2000)
        train_data_x = transformer.fit_transform(train_data_x)
        classifier = KNeighborsClassifier()
        print "Selection was KNeighbors basic, using RandomizedPCA to transform data first. n_components==2000."
    elif classifier_in == "bagging_basic":
        classifier = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
        print "Selection was Bagging basic, with max_samples==0.5 and max_features==0.5."
    elif classifier_in == "spectral_basic":
        transformer = SpectralEmbedding(n_components=2000)
        train_data_x = transformer.fit_transform(train_data_x)
        classifier = KNeighborsClassifier()
        print "Selection was Spectral basic, using KNeighbors with SpectralEmbedding-transformed data. n_components==2000."
    else:
        # default to SVC in case of any sort of parsing error.
        print "Error in selecting classifier class. Reverting to SVC."
        classifier = svm.SVC()
    classifier.fit(train_data_x, train_data_y)
    print "Doing classifier estimation."
    return classifier, train_data_x, transformer
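# Note: the snippets in this file target older scikit-learn releases.
# RandomizedPCA was deprecated in scikit-learn 0.18 and later removed; on
# current versions the equivalent call (a sketch, not the original author's
# code) is:
#
#     from sklearn.decomposition import PCA
#     transformer = PCA(n_components=2000, svd_solver="randomized")
#     train_data_x = transformer.fit_transform(train_data_x)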
def reduce_features(features, var_explained=0.9, n_components=0, verbose=False):
    """
    Performs feature reduction using PCA. Automatically selects the number of
    components needed to explain at least var_explained of the variance.

    :param features: Features.
    :param var_explained: Minimal variance explained.
    :param n_components: Nr. of components.
    :param verbose: Verbosity.
    :return: Reduced feature set.
    """
    if n_components == 0:
        # Run full PCA to estimate nr. components for explaining given
        # percentage of variance.
        estimator = RandomizedPCA()
        estimator.fit_transform(features)
        variance = 0.0
        for i in range(len(estimator.explained_variance_ratio_)):
            variance += estimator.explained_variance_ratio_[i]
            if variance > var_explained:
                n_components = i + 1
                if verbose:
                    print('{} % of variance explained using {} components'.format(var_explained, n_components))
                break
    # Re-run PCA with only estimated nr. components
    estimator = RandomizedPCA(n_components=n_components)
    features = estimator.fit_transform(features)
    return features
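# A compact alternative for picking the component count (a sketch, not part of
# the original reduce_features code): read it off the cumulative
# explained-variance ratio of a single full PCA fit.
def estimate_n_components(features, var_explained=0.9):
    import numpy as np
    from sklearn.decomposition import RandomizedPCA

    estimator = RandomizedPCA()
    estimator.fit(features)
    cumulative = np.cumsum(estimator.explained_variance_ratio_)
    # Index of the first component at which the cumulative ratio reaches the
    # target, converted to a 1-based component count.
    return int(np.searchsorted(cumulative, var_explained) + 1)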
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result
    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def pca(imageData=[]):
    labels = ["shoe", "shirt"]
    is_train = np.random.uniform(0, 1, len(imageData)) <= 0.7
    # y == 1 corresponds to "shirt"
    y = np.where(np.array(labels) == "shirt", 1, 0)
    train_x, train_y = imageData[is_train], y[is_train]
    test_x, test_y = imageData[is_train == False], y[is_train == False]

    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(imageData)
    df = pd.DataFrame({
        "x": X[:, 0],
        "y": X[:, 1],
        "label": np.where(y == 1, "shirt", "shoe")
    })
    colors = ["red", "yellow"]
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    pl.legend()
    pl.show()

    pca2 = RandomizedPCA(n_components=5)
    train_x = pca2.fit_transform(train_x)
    test_x = pca2.transform(test_x)
    print train_x[:5]

    knn = KNeighborsClassifier()
    knn.fit(train_x, train_y)
    return 0
def pcaAndPlot(X, x_to_centroids, centroids, no_dims=2):
    pca = RandomizedPCA(n_components=no_dims)
    x_trans = pca.fit_transform(X)
    x_sizes = np.full((x_trans.shape[0]), 30, dtype=np.int)
    plt.scatter(x_trans[:, 0], x_trans[:, 1], s=x_sizes, c=x_to_centroids)
    # Project the centroids with the PCA fitted on X so both sets of points
    # live in the same 2D space.
    centroids_trans = pca.transform(centroids)
    centroids_col = np.arange(centroids.shape[0])
    centroids_sizes = np.full((centroids.shape[0]), 70, dtype=np.int)
    plt.scatter(centroids_trans[:, 0], centroids_trans[:, 1],
                s=centroids_sizes, c=centroids_col)
    plt.show()
def principal_component_analysis(x):
    sizes = np.shape(x)
    cols = sizes[1]
    # Obtain the Principal Components, which are ordered by eigenvalues
    principal_components = RandomizedPCA(n_components=cols)
    principal_components.fit_transform(x)
    eigenvalues = principal_components.explained_variance_
    # Maximum eigenvalues reflect importance of each feature
    feature_order = np.argsort(eigenvalues)[::-1][:cols]
    return feature_order
def read_data_sets():
    class DataSets(object):
        pass

    NUM_CLASSES = 7
    start = time.time()
    data_sets = DataSets()

    # Load the training data
    mat_contents = sio.loadmat('labeled_images.mat')
    train_labels = mat_contents['tr_labels']
    train_identities = mat_contents['tr_identity']
    train_images = mat_contents['tr_images']

    # Load the test data
    mat_contents = sio.loadmat('public_test_images.mat')
    test_images = mat_contents['public_test_images']
    test_set_length = len(test_images[0][0])

    # Flatten images
    test_images = flattenImages(test_images)
    train_images = flattenImages(train_images)

    # Split train into validation set of size ~ test_set_length
    train_images, train_labels, validation_images, validation_labels = splitSet(
        train_images, train_labels, train_identities, test_set_length)

    # Convert labels to one hot vectors
    train_labels = convertToOneHot(train_labels, NUM_CLASSES)
    validation_labels = convertToOneHot(validation_labels, NUM_CLASSES)

    # Normalize the images
    sd = np.sqrt(np.var(train_images) + 0.01)
    train_images = (train_images - np.mean(train_images)) / sd
    sd = np.sqrt(np.var(validation_images) + 0.01)
    validation_images = (validation_images - np.mean(validation_images)) / sd

    pca = RandomizedPCA(n_components=15)
    train_images = pca.fit_transform(train_images)
    # Reuse the PCA fitted on the training set for the validation set.
    validation_images = pca.transform(validation_images)

    # Setup the matrices into an accessible data set class
    data_sets.train_set = DataSet(train_images, train_labels)
    data_sets.validation_set = DataSet(validation_images, validation_labels)
    data_sets.test_set = DataSet(test_images,
                                 np.zeros((len(test_images), NUM_CLASSES)))

    print('Finished setting up data! Took {} seconds'.format(time.time() - start))

    return data_sets
def get_features_from_images_PCA(img_dir, data_set):
    """
    Takes in a directory and gets all the images from it and extracts the
    pixel values, flattens the matrix into an array and performs principal
    component analysis to get a representative subset of features from the
    pixel values of the image.
    """
    print "\nExtracting features from given images..."
    img_names = [f for f in os.listdir(img_dir)]
    images = [img_dir + f for f in os.listdir(img_dir)]
    #print images
    print "\nConverting images to vectors"
    data = []
    for image in images:
        # print image
        img = img_to_matrix(image)
        img = flatten_image(img)
        data.append(img)
    print "Converting image data to numpy array"
    time.sleep(5)
    data = np.array(data)
    print "Finished Conversion"
    time.sleep(5)

    print "\nPerforming PCA to get reqd features"
    features = []
    pca = RandomizedPCA(n_components=14)
    # Process the images in batches of 100; fit the PCA on the first batch and
    # reuse it to transform the remaining batches so the features of all
    # batches live in the same space.
    for i in xrange(len(data) / 100):
        if features == []:
            split = data[0:100]
            features = pca.fit_transform(split)
        else:
            split = data[100 * i:100 * (i + 1)]
            features = np.concatenate((features, pca.transform(split)), axis=0)

    print "Writing feature data to file"
    f = open(data_set + "_extracted_features.txt", "w")
    for i in xrange(len(img_names)):
        s = str(img_names[i])
        for value in features[i]:
            s += " " + str(value)
        s += "\n"
        f.write(s)
    f.close()
    print "Write completed"
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required=True, help="Path to the image")
    args = vars(ap.parse_args())

    image = cv2.imread(args["image"])
    rects, img = detect(image)

    cropped = []
    for idx, (x1, y1, x2, y2) in enumerate(rects):
        crop_img = image[y1:y1 + (y2 - y1), x1:x1 + (x2 - x1)]
        crop_img = cv2.resize(crop_img, (100, 100), interpolation=cv2.INTER_AREA)
        cv2.imshow("image" + str(idx), crop_img)
        new_img = crop_img.reshape(crop_img.shape[0] * crop_img.shape[1], 3)
        cropped.append(new_img.flatten())

    # reduce feature size
    cropped_pca = []
    pca = RandomizedPCA(n_components=100)
    cropped_pca = pca.fit_transform(cropped)

    # training (hardcoded for now)
    clf = SVC(probability=True)
    train = cropped_pca[:7]
    test = cropped_pca[7:13]

    # clf.fit([[0,0],[1,1]], [1, 2])
    clf.fit(train, [1, 2, 2, 1, 2, 1, 1])

    for item in test:
        print clf.predict_proba(item)
        print clf.predict(item)

    cv2.waitKey(0)
def _prepare_pca(self, data, max_n_components):
    """ Helper Function """
    from sklearn.decomposition import RandomizedPCA

    # sklearn < 0.11 does not support random_state argument
    kwargs = {'n_components': max_n_components, 'whiten': False}

    aspec = inspect.getargspec(RandomizedPCA.__init__)
    if 'random_state' not in aspec.args:
        warnings.warn('RandomizedPCA does not support random_state '
                      'argument. Update scikit-learn to version 0.11 '
                      'or newer to get reproducible results.')
    else:
        kwargs['random_state'] = 0

    pca = RandomizedPCA(**kwargs)
    pca_data = pca.fit_transform(data.T)

    if self._explained_var > 1.0:
        if self.n_components is not None:  # normal n case
            self._comp_idx = np.arange(self.n_components)
            to_ica = pca_data[:, self._comp_idx]
        else:  # None case
            to_ica = pca_data
            self.n_components = pca_data.shape[1]
            self._comp_idx = np.arange(self.n_components)
    else:  # float case
        expl_var = pca.explained_variance_ratio_
        self._comp_idx = (np.where(expl_var.cumsum() <
                                   self._explained_var)[0])
        to_ica = pca_data[:, self._comp_idx]
        self.n_components = len(self._comp_idx)

    return to_ica, pca
def detect(self, imageURLs, params):
    array = []
    for param in params:
        img = self.img_to_matrix(param['imageURL'])
        data = self.flatten_image(img)
        array.append(data)
    array = np.array(array)

    pca = RandomizedPCA(n_components=5)
    n_data = pca.fit_transform(array)

    clf = joblib.load('src/resource/models/model.pkl')
    result = clf.predict(n_data).tolist()

    for param, r in zip(params, result):
        raw_img = urllib2.urlopen(param['imageURL']).read()
        if r == 1:
            cntr = len([i for i in os.listdir("test/images/rain/") if 'rain' in i]) + 1
            path = "static/images/rain_" + str(cntr) + '.jpg'
            f = open(path, 'wb')
            f.write(raw_img)
            f.close()

            # Create the event information
            when = {'type': 'timestamp', 'time': param['time']}
            where = {"type": "Point",
                     "coordinates": [param['longitude'], param['latitude']]}
            what = {'topic': {'value': u'雨'}, 'tweet': param['value']}
            who = [{"type": "url", "value": param['imageURL']},
                   {"value": "evwh <*****@*****.**>", "type": "author"}]
            event = {'observation': {'what': what, 'when': when,
                                     'where': where, 'who': who}}
            self.connection['event']['TwitterImageRainSensor'].insert(event)
def rpca(numpy_file='../data/Paintings/two_class/Paintings_train.csv'):
    """
    Performs randomized PCA on given numpy file.

    Given a numpy file of n-rows and n-cols, where the last column is the
    label and rest are features, n-rows are the samples.

    :type numpy_file: string
    :param numpy_file: The file name of numpy file to be analyzed.
    """
    import numpy as np
    import matplotlib.pyplot as pl
    import pandas as pd
    from sklearn.decomposition import RandomizedPCA

    all_data = np.loadtxt(numpy_file, delimiter=',')
    data = all_data[:, :-1]
    y = all_data[:, -1]
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(data)
    df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1],
                       "label": np.where(y == 1, "realism", "abstract")})
    colors = ["red", "yellow"]
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    pl.legend()
    pl.title('Randomized PCA analysis')
    pl.show()
def main():
    img_dir = 'images/'
    images = [img_dir + f for f in os.listdir(img_dir)]
    labels = [f.split('/')[-1].split('_')[0] for f in images]
    label2ids = {v: i for i, v in
                 enumerate(sorted(set(labels), key=labels.index))}
    y = np.array([label2ids[l] for l in labels])

    data = []
    for image_file in images:
        img = img_to_matrix(image_file)
        img = flatten_image(img)
        data.append(img)
    data = np.array(data)

    # training samples
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    train_X, train_y = data[is_train], y[is_train]

    # training a classifier
    pca = RandomizedPCA(n_components=5)
    train_X = pca.fit_transform(train_X)
    multi_svm = OneVsRestClassifier(LinearSVC())
    multi_svm.fit(train_X, train_y)

    # evaluating the model
    test_X, test_y = data[is_train == False], y[is_train == False]
    test_X = pca.transform(test_X)
    print pd.crosstab(test_y, multi_svm.predict(test_X),
                      rownames=['Actual'], colnames=['Predicted'])
def make_pca_datapoints(terms_map, stopwords, clusters):
    new_terms_map = {}
    raw_data = []
    target = []
    for line in open(tweets_file):
        tokens = line.split()
        terms = [terms_map[int(term)] for term in tokens[3].split(',')
                 if terms_map[int(term)] not in stopwords]
        for term in terms:
            if not term in new_terms_map:
                new_terms_map[term] = len(new_terms_map)
        new_term_ids = [new_terms_map[term] for term in terms]
        tags = [terms_map[int(term)] for term in tokens[4].split(',')]
        raw_data.append(new_term_ids)
        target.append(tags)

    data = lil_matrix((len(raw_data), len(new_terms_map)))
    count = 0
    for cur_vector in raw_data:
        for point in cur_vector:
            data[(count, point)] += 1
        count += 1

    pca = RandomizedPCA(n_components=100)
    transformed_data = pca.fit_transform(data)

    xs = []
    ys = []
    count = 0
    for datum in transformed_data:
        for tag in target[count]:
            if (len(tag) > 1) and tag[1:] in clusters:
                xs.append(datum)
                ys.append(clusters[tag[1:]])
        count += 1
    del transformed_data
    return xs, ys
def do_pca(corr_matrix: _nested_ndarray, num_dim: int,
           min_var_explanation: float = 0.7) -> _nested_ndarray:
    '''
    This method performs PCA on a self-correlation matrix, reducing the number
    of columns to `num_dim`. If such analysis does not sufficiently explain
    the underlying variance in the data, an exception is thrown.

    Args:
    * `corr_matrix` - a square matrix of correlations
    * `num_dim` - the number of dimensions to which the data should be reduced
    * `min_var_explanation` - the minimum fraction of the underlying data
      variance that should be explained

    Returns:
    > A matrix of the PCA result on `corr_matrix`.
    '''
    num_dim = int(num_dim)
    pca = PCA(n_components=num_dim, random_state=0)
    pca_result = pca.fit_transform(corr_matrix)
    var_ratio = pca.explained_variance_ratio_
    if sum(var_ratio) < min_var_explanation:
        raise PcaAccuracyException(
            'PCA doesn\'t explain enough of the variance in the data')
    return pca_result
def calc_hog(fpaths, save=False):
    '''
    Compute histogram of gradients (HOG). Saves in batches to prevent memory issues.
    Input:
        fpaths : files on which HOG will be computed
        save : if true, output is saved to disk
    '''
    hogs = np.empty((len(fpaths), 15876))

    for i, fpath in enumerate(fpaths):
        img = imread(os.path.join(imgdir, fpath))
        if len(img.shape) == 3:
            img = rgb2gray(img)
        # rescale so all feature vectors are the same length
        img_resize = resize(img, (128, 128))
        img_hog = hog(img_resize)
        hogs[i, :] = img_hog

    hogs_sc = scale(hogs)
    n_components = 15
    pca = RandomizedPCA(n_components=n_components)
    hogs_decomp = pca.fit_transform(hogs_sc)

    df = pd.DataFrame(hogs_decomp, index=[os.path.split(i)[1] for i in fpaths])
    df.index.name = 'fpath'
    df.columns = ['feat_hog_%2.2u' % i for i in range(1, n_components + 1)]

    if save:
        df.to_csv('hog.csv')

    return df
def Q4():
    data = datasets.fetch_olivetti_faces(shuffle=True, random_state=0)
    X = data.data
    y = data.target
    image_shape = (64, 64)
    n = X.shape[0]

    n_components = 10
    model = RandomizedPCA(n_components=n_components)
    Z = model.fit_transform(X)

    Z_c = Z  # - Z.mean(axis=1).reshape((n, 1)) !!!! ERROR IN COURSERA
    Z_c = Z_c * Z_c
    Z_tot = Z_c.sum(axis=1).reshape((n, 1))
    Cos = Z_c / Z_tot

    i_s = []
    for j in range(n_components):
        i = np.argmax(Cos[:, j])
        i_s.append(i)
        image = X[i, :].reshape(image_shape)
        #plt.imshow(image)
        #plt.show()

    utils.PATH.SAVE_RESULT((3, 2), (1, 4), i_s)
    return
def scatter(data, labels=None, title=None, name=None):
    """2d PCA scatter plot with optional class info

    Return the pca model to be able to introspect the components or transform
    new data with the same model.
    """
    data = atleast2d_or_csr(data)

    if data.shape[1] == 2:
        # No need for a PCA; there is no fitted model to return in this case.
        pca = None
        data_2d = data
    else:
        pca = RandomizedPCA(n_components=2)
        data_2d = pca.fit_transform(data)

    for i, c, m in zip(np.unique(labels), cycle(COLORS), cycle(MARKERS)):
        plt.scatter(data_2d[labels == i, 0], data_2d[labels == i, 1],
                    c=c, marker=m, label=i, alpha=0.5)

    plt.legend(loc='best')
    if title is None:
        title = "2D PCA scatter plot"
    if name is not None:
        title += " for " + name
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.title(title)
    return pca
def main():
    protein = sys.argv[1]
    X = load_file(protein)
    """
    scores = np.loadtxt("../LSDMap/{protein}.scores.txt".format(**locals()))
    if scores.shape[0] != RMSD.shape[0]:
        scores = scores[-RMSD.shape[0]:]
        print("selecting last N")
    models = select_N_models(RMSD[1:,1:], scores, 10000)
    keep = np.r_[0, models + 1]
    n_neigh = np.min(np.sum(RMSD < 6, axis=0)[models + 1])
    RMSD = RMSD[keep,:][:,keep]
    """
    #models = np.arange(N)
    #np.savetxt("output/{protein}/pca/kept.txt".format(**locals()), models)
    #np.save("output/{protein}/pca/RMSD.npy".format(**locals()), RMSD[0,:])
    pca = RandomizedPCA(n_components=100, copy=False)
    proj = pca.fit_transform(X)
    acc_var = calcAccumVar(pca.explained_variance_ratio_)
    np.savetxt("output/{protein}/pca/acc_var.txt".format(**locals()), acc_var)
    np.save("output/%s/pca/proj.npy" % protein, proj)
    np.save("output/{protein}/pca/proj2D.npy".format(**locals()), proj[:, :2])
def pca(self, y):
    # select a random subset of Y dimensions (possibly gives robustness as well as speed)
    rand_dims = np.sort(np.random.choice(
        y.shape[1],
        np.minimum(self.tree_params['num_dims_for_pca'], y.shape[1]),
        replace=False))
    y_dim_subset = y.take(rand_dims, 1)

    pca = RandomizedPCA(n_components=1)  # compute for all components

    # optional: select a subset of exs (not so important if PCA is fast)
    if self.tree_params['sub_sample_exs_pca']:
        rand_exs = np.sort(np.random.choice(
            y.shape[0],
            np.minimum(self.tree_params['num_exs_for_pca'], y.shape[0]),
            replace=False))
        pca.fit(y_dim_subset.take(rand_exs, 0))
        return pca.transform(y_dim_subset)
    else:
        # perform PCA
        return pca.fit_transform(y_dim_subset)
def dimentionality_reduction(train_x, test_x):
    print "Dimensionality reduction to 10D on training and test data..."
    pca = RandomizedPCA(n_components=10)
    train_x = pca.fit_transform(train_x)
    test_x = pca.transform(test_x)
    print "Done."
    return train_x, test_x
def randomized_pca(train_data_images, train_data_split_images,
                   test_data_images, IMG_SIZE):
    train_data_features = []
    test_data_features = []
    train_data = []
    test_data = []
    train_data_split_crossfold = []

    for image in train_data_images:
        img = img_to_matrix(image, IMG_SIZE)
        img = flatten_image(img)
        train_data.append(img)

    for image in train_data_split_images:
        img = img_to_matrix(image, IMG_SIZE)
        img = flatten_image(img)
        train_data_split_crossfold.append(img)

    for image in test_data_images:
        img = img_to_matrix(image, IMG_SIZE)
        img = flatten_image(img)
        test_data.append(img)

    pca = RandomizedPCA(50)
    return (pca.fit_transform(train_data), pca.transform(test_data))
def pca_knn():
    Xtrain, ytrain, Xtest, ytest = getSplitData()
    Xtrain, Xtest = getScaledData(Xtrain, Xtest)
    ntest = Xtest.shape[0]
    #Your code here
    for n in range(5, 8):
        pca = RandomizedPCA(n_components=n)
        pca_Xtrain = pca.fit_transform(Xtrain, ytrain)
        # Project the test data with the PCA fitted on the training data.
        pca_Xtest = pca.transform(Xtest)

        neigh = KNeighborsClassifier(n_neighbors=5)
        neigh.fit(pca_Xtrain, ytrain)
        yPredict = neigh.predict(pca_Xtest)

        print "parameter: n_components = ", n
        print "parameter: n_neighbors = 5"
        print "pca_knn classification accuracy: ", accuracy_score(ytest, yPredict)
def run_pca(self, features):
    """Run a principal component analysis on the training data."""
    pca = RandomizedPCA(n_components=5)
    features_pca = pca.fit_transform(features)
    return features_pca
def main():
    # get the file path from the command prompt
    if len(sys.argv) > 1:
        TEST_FILE = sys.argv[1]
    else:
        print("error: please specify a file path")
        exit()

    print("TRAINING STARTED!")
    print("pulling images from files...")
    # Store image paths and labels
    images = []
    rawlabels = []
    for subdir, dirs, files in os.walk(DATA_DIR):
        for file in files:
            if (subdir.split('/')[1]) != "test":
                rawlabels.append(subdir.split('/')[1])
                images.append(os.path.join(subdir, file))

    print("converting images to arrays...")
    # Create a massive data array
    data = []
    labels = []
    counter = 0
    for imagePath in images:
        #print imagePath
        img = []
        try:
            img = imgToArray(imagePath)
            data.append(img)
            labels.append(rawlabels[counter])
        except IOError:
            pass
        counter += 1
    data = np.array(data)

    print("reducing arrays using randomizedPCA...")
    # randomizedPCA on training set
    # this reduces the huge amount of data points
    pca = RandomizedPCA(n_components=4)
    data = pca.fit_transform(data)

    # generate a 2D plot that shows the groupings
    #generatePlot(data,labels)

    print("using K-closest neighbors to classify data...")
    # fit the KNeighbors classifier
    knn = KNeighborsClassifier()
    knn.fit(data, labels)

    print("-----------------------------------")
    print("TESTING STARTED!")
    # test the image
    print "The test image, " + TEST_FILE + " is a:"
    test = string_to_img(TEST_FILE, pca)
    print classify_image(test, knn)
def rca1_decompose(dataset, n):
    rca = RandomizedPCA(n_components=n)
    reduced_features = rca.fit_transform(dataset.all.features)
    training_size = dataset.training_size
    training = Data(reduced_features[:training_size, :],
                    dataset.all.target[:training_size])
    testing = Data(reduced_features[training_size:, :],
                   dataset.all.target[training_size:])
    return DataSet(training, testing)
def fit(self, X, y=None, c=None):
    """Fit the model using X as training data.

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        If the metric is 'precomputed' X must be a square distance matrix.
        Otherwise it contains a sample per row.
    """
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64)
    random_state = check_random_state(self.random_state)

    if self.early_exaggeration < 1.0:
        raise ValueError("early_exaggeration must be at least 1, but is "
                         "%f" % self.early_exaggeration)

    if self.n_iter < 200:
        raise ValueError("n_iter should be at least 200")

    if self.metric == "precomputed":
        if self.init == 'pca':
            raise ValueError("The parameter init=\"pca\" cannot be used "
                             "with metric=\"precomputed\".")
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square distance matrix")
        distances = X
    else:
        if self.verbose:
            print("[t-SNE] Computing pairwise distances...")
        if self.metric == "euclidean":
            distances = pairwise_distances(X, metric=self.metric, squared=True)
        else:
            distances = pairwise_distances(X, metric=self.metric)

    # Degrees of freedom of the Student's t-distribution. The suggestion
    # alpha = n_components - 1 comes from "Learning a Parametric Embedding
    # by Preserving Local Structure" Laurens van der Maaten, 2009.
    alpha = max(self.n_components - 1.0, 1)
    n_samples = X.shape[0]
    self.training_data_ = X

    P = _joint_probabilities(distances, self.perplexity, self.verbose)
    self.P = deepcopy(P)

    if self.init == 'pca':
        pca = RandomizedPCA(n_components=self.n_components,
                            random_state=random_state)
        X_embedded = pca.fit_transform(X)
    elif self.init == 'random':
        X_embedded = None
    else:
        raise ValueError("Unsupported initialization scheme: %s" % self.init)

    self.embedding_ = self._tsne(P, alpha, n_samples, random_state,
                                 X_embedded=X_embedded, c=c)
def trainset(data, labels):
    pca = RandomizedPCA(n_components=10)
    std = StandardScaler()
    data = np.reshape(data, (data.shape[0], -1))
    data = pca.fit_transform(data)
    data = std.fit_transform(data)
    knn = KNeighborsClassifier()
    knn.fit(data, labels)
    return pca, std, knn
def visualize():
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(data)
    df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1], "label": labels})
    colors = ["red", "yellow"]
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    pl.legend()
    pl.show()
def HSV_PCA(image_paths, hue_bins=180, sat_bins=256, val_bins=256):
    hsv_hists = HSV_hists(image_paths, hue_bins, sat_bins, val_bins)

    pca = RandomizedPCA(n_components=3)
    hue_pca = pca.fit_transform(np.log(hsv_hists[0]))
    sat_pca = pca.fit_transform(np.log(hsv_hists[1]))
    val_pca = pca.fit_transform(np.log(hsv_hists[2]))

    hsv_df = pd.DataFrame(data=np.hstack((hue_pca, sat_pca, val_pca)))

    h_cols = ["HuePC" + str(i) for i in range(1, 4)]
    s_cols = ["SatPC" + str(i) for i in range(1, 4)]
    v_cols = ["ValPC" + str(i) for i in range(1, 4)]
    hsv_df.columns = h_cols + s_cols + v_cols

    df_res = pd.concat([pd.DataFrame({'image_paths': image_paths}), hsv_df],
                       axis=1)
    return df_res
def get_pca(data, num_components=2):
    """
    Perform a PCA transformation

    Parameters
    ----------
    data: Values to transform
    num_components: Number of dimensions to keep after the transform
    """
    pca = RandomizedPCA(n_components=num_components, whiten=False)
    data = pca.fit_transform(data)
    return data, pca.explained_variance_ratio_
def get_input_pca(imgs, labels, pca=None):
    I = np.rollaxis(imgs, 2)
    I = np.reshape(I, (I.shape[0], -1))
    if not pca:
        pca = RandomizedPCA(n_components=None, copy=False,
                            iterated_power=3, whiten=False)
        I = pca.fit_transform(I)
    else:
        # Reuse a previously fitted PCA so train and test data share the
        # same projection.
        I = pca.transform(I)
    L = np.ravel(labels)
    return I, L, pca
def RPCA(model_data, components=None, transform_data=None):
    t0 = time()
    rpca = RandomizedPCA(n_components=components)
    if transform_data is None:
        projection = rpca.fit_transform(model_data)
    else:
        rpca.fit(model_data)
        projection = rpca.transform(transform_data)
    print "Randomized PCA Explained Variance: ", rpca.explained_variance_ratio_
    print "Randomized PCA Time: %0.3f" % (time() - t0)
    return projection
def PlotPCA(self):
    pca = RandomizedPCA(n_components=1)
    print shape(self.fmri_train)
    pca.fit(self.fmri_train)
    print shape(pca.components_)
    trainingVector = pca.fit_transform(self.fmri_train)
    plt.plot(pca.explained_variance_ratio_)
    plt.show()
    #print pca.get_params()
    print shape(trainingVector)
    io.mmwrite('fmri_train_240samples_1components.out', trainingVector,
               field='real', precision=25)
def pca_LG(train, test):
    y = []
    x_train, y_train, x_test, y_test = split_data(train, test)
    pca = RandomizedPCA(n_components=500)
    x_train = pca.fit_transform(x_train)
    x_test = pca.transform(x_test)
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    y = lr.predict(x_test)
    #print(lr.score(x_train,y_train))
    return format_y(y)
def pca_knn(train, test):
    y = []
    x_train, y_train, x_test, y_test = split_data(train, test)
    pca = RandomizedPCA(n_components=2)
    x_train = pca.fit_transform(x_train)
    x_test = pca.transform(x_test)
    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)
    y = knn.predict(x_test)
    #print(knn.score(x_test,y_test))
    return format_y(y)
def test_do_pca(self):
    pca_res = do_pca(self.dup_data, 3)
    for datum in pca_res.reshape(-1, 1):
        self.assertAlmostEqual(datum[0], 0.)
    pca_res = do_pca(self.data, 2).reshape(1, -1)[0]
    expected_pca = PCA(n_components=2)
    expected_res = expected_pca.fit_transform(self.data).reshape(1, -1)[0]
    for expected, actual in zip(expected_res, pca_res):
        self.assertAlmostEqual(expected, actual)
def maybeReduceDimensionality(self, img_data):
    """Dimensional reduction of 3D image matrix (numpy array)."""
    # Iterating through an n-dimensional array produces slices along
    # the last axis. This is equivalent to data[i,:,:] in this case
    img_data = img_data[::self.n_slices]

    if self.reduction is None:
        """No Reduction"""
        return img_data

    elif self.reduction == "H":
        """Histogram"""
        from sklearn import preprocessing
        img_data = np.asarray(img_data, dtype=float).flat
        min_max_scaler = preprocessing.MinMaxScaler()
        scaled_data = min_max_scaler.fit_transform(img_data)
        hist = np.histogram(scaled_data,
                            bins=self.reduction_dict["H"]["value"],
                            range=None, normed=False, weights=None,
                            density=None)[0]
        return hist.reshape(1, hist.shape[0])

    elif self.reduction == "P":
        """Slice-wise (randomized) Principal Component Analysis"""
        from sklearn.preprocessing import normalize
        from sklearn.decomposition import RandomizedPCA
        proj_data = []
        for img_slice in img_data:
            norm_data = normalize(img_slice)
            shaped_data = np.reshape(norm_data, norm_data.size)
            # shaped_data.shape
            rpca = RandomizedPCA(
                n_components=self.reduction_dict["P"]["value"],
                random_state=0)
            proj_slice = rpca.fit_transform(norm_data)
            # plt.imshow(proj_data)
            # feat_data = rpca.inverse_transform(proj_data)
            # plt.imshow(feat_data)
            # plt.imshow(norm_data)
            proj_data.append(proj_slice)
        return proj_data
def pca(data, ncomp=100, whiten=False):
    pt4 = time.time()
    print 'import and normalization took time {0}'.format(pt4 - pt0)
    if whiten == True:
        # if data needs to be pca whitened, whiten data
        pca = RandomizedPCA(n_components=ncomp, whiten=True)  # create pca object to pca whiten features
        X = pca.fit_transform(data)
    else:
        X = data  # else return data as is
    pt5 = time.time()
    print 'array cast and pca whitening took time {0}'.format(pt5 - pt2)
    print 'total time taken {0}'.format(pt5 - pt0)
    return X
def plot_for_2d(data, y):
    print "Reducing dimension to 2D for visualization...."
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(data)
    df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1],
                       "label": np.where(y == 1, "Sphere", "cube")})
    colors = ["red", "yellow"]
    print "Displaying plot...."
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    pl.show()
    print "Done."
def reduce_dimensions(data, n, random_state=None):
    """
    Reduces the input data's dimension to 'n'.

    Args:
        data: An M x N matrix, where M is the number of samples and N is the
            number of features. The dimensions will be reduced from N to n.
        n: The new number of dimensions

    Returns:
        data: An M x n reduced dimension matrix.
    """
    pca = RandomizedPCA(n_components=n, random_state=random_state)
    return pca.fit_transform(data)
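# Minimal usage sketch for reduce_dimensions (illustrative only; the random
# 1000x50 matrix and the target of 10 dimensions below are assumptions, not
# part of the original code).
if __name__ == "__main__":
    import numpy as np

    X_demo = np.random.RandomState(0).rand(1000, 50)  # 1000 samples, 50 features
    X_demo_reduced = reduce_dimensions(X_demo, 10, random_state=0)
    print(X_demo_reduced.shape)  # expected: (1000, 10)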
def pcaPic(data, label):
    n_components = 100
    print(data.shape)
    print("train pca!!")
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(data)
    X_train_pca = pca.fit_transform(data)
    y_train = label

    print("Fitting the classifier to the training set")
    param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
                  'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]}
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
    clf = clf.fit(X_train_pca, y_train)
    return pca, clf
def compute_pca(raw_data):
    # randomly order the data
    # seed(0)
    print('shuffling data...')
    shuffle(raw_data)

    # pull out the features and the labels
    print('pulling out data to run PCA...')
    data = np.array([cd for (cd, _y, f) in raw_data])

    print('finding principal components...')
    pca = RandomizedPCA(n_components=N_COMPONENTS, random_state=0)
    X = pca.fit_transform(data)

    return raw_data, data, pca, X
def preprocess_data():
    datasets = sio.loadmat('../multi_data/Hyper_01_Urban.mat')
    hypercube = datasets['Hypercube']
    datasets = sio.loadmat('../multi_data/Hyper_01_Urban_GroundTruth.mat')
    ground_truth = datasets['Ground_Truth']
    del datasets

    hypercube_1D = np.reshape(hypercube, (-1, hypercube.shape[2]))
    rpca = RandomizedPCA(n_components=10, whiten=True)
    hypercube_1D_reduced = rpca.fit_transform(hypercube_1D)
    hypercube_reduced = np.reshape(hypercube_1D_reduced,
                                   (hypercube.shape[0], hypercube.shape[1], -1))
    print rpca.explained_variance_ratio_.sum()

    window_sz = 5
    window_pad = 2
    dataset_matrix_size = ((hypercube_reduced.shape[0] - window_pad) *
                           (hypercube_reduced.shape[1] - window_pad),
                           window_sz, window_sz, hypercube_reduced.shape[2])
    dataset_matrix = np.zeros(dataset_matrix_size)
    label_vector = np.zeros((dataset_matrix.shape[0], ))

    data_index = 0
    for r in range(hypercube_reduced.shape[0]):
        if r < window_pad or r > hypercube_reduced.shape[0] - window_pad - 1:
            continue
        for c in range(hypercube_reduced.shape[1]):
            if c < window_pad or c > hypercube_reduced.shape[1] - window_pad - 1:
                continue
            patch = hypercube_reduced[r - window_pad:r + window_pad + 1,
                                      c - window_pad:c + window_pad + 1]
            dataset_matrix[data_index, :, :, :] = patch
            label_vector[data_index] = ground_truth[r, c]
            data_index = data_index + 1

    dataset_matrix_r = dataset_matrix[label_vector > 0, :, :, :]
    label_vector_r = label_vector[label_vector > 0]

    rand_perm = np.random.permutation(label_vector_r.shape[0])
    dataset_matrix_r = dataset_matrix_r[rand_perm, :, :, :]
    label_vector_r = label_vector_r[rand_perm]
    label_vector_r = label_vector_r - 1.0

    return dataset_matrix, label_vector, dataset_matrix_r, label_vector_r
def c_random_pca():
    pca_2 = RandomizedPCA(n_components=2)
    X_pca = pca_2.fit_transform(iris.data)
    print(X_pca.shape)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=iris.target, edgecolors="none")
    plt.show()
    # Percentage of variance explained by each of the selected components.
    # If all components are stored, the sum of explained variances is equal
    # to 1.0.
    print(pca_2.explained_variance_ratio_.sum())
    # Principal axes in feature space, representing the directions of
    # maximum variance in the data
    print(pca_2.components_)
def pca(all_corr, pc_start, pc_end):
    pca_components = pc_end - pc_start
    pca = RandomizedPCA(n_components=pca_components, whiten=False)
    print 'reducing dimensions to ' + str(pca_components) + ' PCA components'
    pc_idx = range(pc_start, pc_end)
    pca_xform = pca.fit_transform(all_corr)
    all_corr_pca = pca_xform[:, pc_idx]  # do not whiten PCA-space data
    eig = pca.components_
    variances = pca.explained_variance_ratio_
    eigenmaps = np.zeros([pca_components, masky * maskx])
    eigenmaps[:] = np.nan
    eigenmaps[:, pushmask] = eig
    eigenmaps_img = eigenmaps.reshape(pca_components, masky, maskx)
    return eigenmaps_img, all_corr_pca, variances
def test_non_square_infomax():
    """Test non-square infomax."""
    from sklearn.decomposition import RandomizedPCA
    rng = np.random.RandomState(0)

    n_samples = 200
    # Generate two sources:
    t = np.linspace(0, 100, n_samples)
    s1 = np.sin(t)
    s2 = np.ceil(np.sin(np.pi * t))
    s = np.c_[s1, s2].T
    center_and_norm(s)
    s1, s2 = s

    # Mixing matrix
    n_observed = 6
    mixing = rng.randn(n_observed, 2)
    for add_noise in (False, True):
        m = np.dot(mixing, s)
        if add_noise:
            m += 0.1 * rng.randn(n_observed, n_samples)
        center_and_norm(m)
        pca = RandomizedPCA(n_components=2, whiten=True, random_state=rng)
        m = m.T
        m = pca.fit_transform(m)
        # we need extended since input signals are sub-gaussian
        unmixing_ = infomax(m, random_state=rng, extended=True)
        s_ = np.dot(unmixing_, m.T)
        # Check that the mixing model described in the docstring holds:
        mixing_ = linalg.pinv(unmixing_.T)
        assert_almost_equal(m, s_.T.dot(mixing_))

        center_and_norm(s_)
        s1_, s2_ = s_
        # Check to see if the sources have been estimated
        # in the wrong order
        if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
            s2_, s1_ = s_
        s1_ *= np.sign(np.dot(s1_, s1))
        s2_ *= np.sign(np.dot(s2_, s2))

        # Check that we have estimated the original sources
        if not add_noise:
            assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2)
            assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2)
def get_pca_data_batch(imgs, hight=resize_hight, width=resize_width):
    """Resize and grayscale a batch of images, flatten them, and return
    whitened randomized-PCA features."""
    newsize = (hight, width)
    rImgs = [lib_cv2.resize(e, newsize) for e in imgs]
    rImgs = [lib_cv2.cvtColor(e, lib_cv2.COLOR_BGR2GRAY) for e in rImgs]
    rImgs = [e.ravel() for e in rImgs]
    pca = RandomizedPCA(n_components=200, whiten=True)
    pImgs = pca.fit_transform(rImgs)
    return pImgs