def tryLinearDiscriminantAnalysis(goFast):
    """Score an LDA classifier on RandomizedPCA projections of the dt1 data.

    Every combination of the RandomizedPCA parameter grid is fitted on the
    training split. LDA is then trained on the projected training data and
    its accuracy on the projected validation data is printed. Nothing is
    returned.

    Args:
        goFast: if truthy, read the "dt1_1500.*.svm" files, otherwise the
            "dt1.*.svm" files.
    """
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file

    stem = "dt1_1500" if goFast else "dt1"
    load_options = {"n_features": 253659, "zero_based": True}
    training_data, training_labels = load_svmlight_file(
        stem + ".trn.svm", **load_options)
    validation_data, validation_labels = load_svmlight_file(
        stem + ".vld.svm", **load_options)
    # The test split is loaded but not used further down.
    testing_data, testing_labels = load_svmlight_file(
        stem + ".tst.svm", **load_options)

    from sklearn.lda import LDA
    from sklearn.metrics import accuracy_score
    from sklearn.grid_search import ParameterGrid
    from sklearn.decomposition import RandomizedPCA

    search_space = [{"n_components": [10, 45, 70, 100],
                     "iterated_power": [2, 3, 4],
                     "whiten": [True]}]
    for rpca_settings in ParameterGrid(search_space):
        projector = RandomizedPCA(**rpca_settings)
        projector.fit(training_data, training_labels)
        projected_training = projector.transform(training_data, training_labels)
        projected_validation = projector.transform(validation_data, validation_labels)

        classifier = LDA()
        classifier.fit(projected_training, training_labels)
        predicted = classifier.predict(projected_validation)
        print("Score = " + str(accuracy_score(validation_labels, predicted)))
def getPrincipleComponents(xtr, xte, n_components=50):
    """Project train and test samples onto a PCA basis fitted on the train set.

    Args:
        xtr: training samples; converted with ``np.array``.
        xte: test samples; converted with ``np.array``.
        n_components: number of components kept by RandomizedPCA.

    Returns:
        Tuple ``(projected_train, projected_test)``.
    """
    train_matrix = np.array(xtr)
    test_matrix = np.array(xte)
    reducer = RandomizedPCA(n_components=n_components)
    reducer.fit(train_matrix)
    projected_train = reducer.transform(train_matrix)
    projected_test = reducer.transform(test_matrix)
    return projected_train, projected_test
def SVM(X, y):
    """Train and score an RBF SVM on PCA ("eigenfaces") features.

    The data is split into train and test parts, projected onto 150
    whitened RandomizedPCA components fitted on the training part,
    histogram-equalized and scaled per sample, and then fed to a grid
    search over ``C`` and ``gamma``. Train and test scores are printed.

    Args:
        X: sample matrix, one row per sample.
        y: labels matching the rows of ``X``.
    """
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=TRAIN_TEST_SPLIT_RATIO)
    print(len(X_train))

    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    n_components = 150
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    # preprocessing.scale returns the scaled array instead of modifying its
    # argument, so the result has to be assigned; previously it was dropped
    # and the scaling silently had no effect.
    X_train_pca = preprocessing.scale(equalize_hist(X_train_pca) * 1.0, axis=1)
    X_test_pca = preprocessing.scale(equalize_hist(X_test_pca) * 1.0, axis=1)

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    classifier2 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    classifier2.fit(X_train_pca, y_train)

    print('TRAIN SCORE', classifier2.score(X_train_pca, y_train))
    print('TEST SCORE', classifier2.score(X_test_pca, y_test))
def SVM(X_data, y_data):
    """Train and score an RBF SVM on PCA features of preprocessed samples.

    The samples are histogram-equalized, max-normalized and scaled per
    sample, split into train and test parts, projected onto 120 whitened
    RandomizedPCA components fitted on the training part, and classified
    with a grid-searched SVM. Train and test scores are printed.

    Args:
        X_data: sample matrix, one row per sample.
        y_data: labels matching the rows of ``X_data``.
    """
    X_data = equalize_hist(X_data)
    # normalize() and scale() return new arrays; their results used to be
    # discarded, which made both steps no-ops.
    X_data = preprocessing.normalize(X_data, norm='max')
    X_data = preprocessing.scale(X_data, axis=1)

    # divide our data set into a training set and a test set
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X_data, y_data, test_size=TRAIN_TEST_SPLIT_RATIO)

    n_components = 120
    print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    classifier = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    classifier.fit(X_train_pca, y_train)

    # Report the component count actually used (the banner used to say 150).
    print("====== PCA %d ========" % n_components)
    print('TRAIN SCORE', classifier.score(X_train_pca, y_train))
    print('TEST SCORE', classifier.score(X_test_pca, y_test))
def SVM(X_train, y_train, X_test):
    """Predict test labels with an RBF SVM trained on PCA features.

    Both sets are histogram-equalized and projected onto 147 un-whitened
    RandomizedPCA components fitted on the training set. A grid search
    over ``C`` and ``gamma`` is fitted on the training projection.

    Args:
        X_train: training samples, one row per sample.
        y_train: labels of the training samples.
        X_test: samples to classify.

    Returns:
        List with the predicted label of every test sample.
    """
    print("SVM with PCA of rbf, writening all on, no normalize")
    # NOTE(review): the results of these two calls are discarded; normalize
    # presumably works on a copy by default, so they likely have no effect
    # (which would match the "no normalize" banner) - confirm.
    preprocessing.normalize(X_train, 'max')
    preprocessing.normalize(X_test, 'max')

    X_train = equalize_hist(X_train)
    X_test = equalize_hist(X_test)

    n_components = 147
    print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
    projector = RandomizedPCA(n_components=n_components, whiten=False)
    projector.fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = projector.transform(X_train)
    X_test_pca = projector.transform(X_test)
    print("done ")

    search_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    search = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), search_grid)
    search.fit(X_train_pca, y_train)
    predictions = search.predict(X_test_pca)
    return list(predictions)
def do_nbnn(train_folder, test_folder):
    """Run NBNN classification after optional patch preprocessing.

    Patches are loaded from the two folders and then, depending on the
    module-level ``options``, clipped at zero (``options.relu``),
    standardized (``options.scale``) and projected with RandomizedPCA
    using ``options.pca`` components. The scaler and the PCA are fitted on
    the training patches only and applied to both sets.

    Args:
        train_folder: location handed to ``load_patches`` for the training set.
        test_folder: location handed to ``load_patches`` for the test set.
    """
    # Use the arguments: the function used to read a global ``args`` and
    # silently ignored both of its parameters.
    train = load_patches(train_folder)
    test = load_patches(test_folder)

    if options.relu:
        get_logger().info("Applying RELU")
        for dataset in (train, test):
            for class_data in dataset:
                class_data.patches = class_data.patches.clip(min=0)

    if options.scale:
        get_logger().info("Applying standardization")
        scaler = StandardScaler(copy=False)
        scaler.fit(np.vstack([t.patches for t in train]))
        for dataset in (train, test):
            for class_data in dataset:
                class_data.patches = scaler.transform(class_data.patches)

    if options.pca:
        get_logger().info("Calculating PCA")
        pca = RandomizedPCA(n_components=options.pca)
        pca.fit(np.vstack([t.patches for t in train]))
        get_logger().info("Keeping " + str(pca.explained_variance_ratio_.sum()) +
                          " variance (" + str(options.pca) +
                          ") components\nApplying PCA")
        for dataset in (train, test):
            for class_data in dataset:
                class_data.patches = pca.transform(class_data.patches)

    nbnn(train, test, NN_Engine())
def main():
    """Train a random forest on raw + PCA features and write a submission.

    Reads ``data/train.csv`` (label in the first column) and
    ``data/test.csv``, holds out 20% of the training rows, fits a
    40-component RandomizedPCA and a 1000-tree random forest on the rest,
    prints the hold-out score and writes the test predictions to
    ``data/submission.csv`` with an ``ImageId,Label`` header.
    """
    # Create the training & test sets, skipping the header row with [1:].
    # The paths are passed directly so numpy opens and closes the files;
    # the handles created with open() were never closed before.
    dataset = genfromtxt('data/train.csv', delimiter=',', dtype='u1')[1:]
    target = dataset[:, 0]
    train = dataset[:, 1:]
    test = genfromtxt('data/test.csv', delimiter=',', dtype='u1')[1:]

    # build crossvalidation training set
    train_train, train_test, target_train, target_test = cross_validation.train_test_split(
        train, target, test_size=0.2, random_state=0)
    print(train_train.shape)
    print(train_test.shape)

    # PCA
    pca = RandomizedPCA(n_components=40)
    pca.fit(train_train)

    # create and train the random forest on the raw pixels plus PCA features
    rf = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    rf.fit(hstack((train_train, pca.transform(train_train))), target_train)
    score = rf.score(hstack((train_test, pca.transform(train_test))), target_test)
    # Two spaces on purpose: identical to the former `print a, b` output.
    print("crossval score is:  " + str(score))

    output = rf.predict(hstack((test, pca.transform(test))))
    # Number the rows from the actual prediction count instead of a
    # hard-coded 28000, so other test-set sizes do not break column_stack.
    labelid = np.array(range(1, len(output) + 1))
    savetxt('data/submission.csv', np.column_stack((labelid, output)),
            delimiter=',', header="ImageId,Label", fmt='%u', comments='')
def pca_data(test_x, train_x, params):
    """Whiten and project both sets with a PCA fitted on the training set.

    Args:
        test_x: test samples.
        train_x: training samples; the PCA is fitted on these.
        params: mapping whose ``'components'`` entry gives the number of
            components (converted with ``int``).

    Returns:
        Tuple ``(pca_test_x, pca_train_x)`` - note the test set comes first.
    """
    print('pcaing data ...')
    n_components = int(params['components'])
    model = RandomizedPCA(n_components, whiten=True)
    model.fit(train_x)
    pca_train_x = model.transform(train_x)
    pca_test_x = model.transform(test_x)
    return pca_test_x, pca_train_x
def test_sparse_randomized_pca_inverse():
    """Check that RandomizedPCA can approximately invert its projection of sparse data."""

    def fit_expecting_deprecation(data, **extra_params):
        # Fitting must raise exactly one warning, a DeprecationWarning.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always", DeprecationWarning)
            model = RandomizedPCA(n_components=2, random_state=0, **extra_params).fit(data)
            assert_equal(len(caught), 1)
            assert_equal(caught[0].category, DeprecationWarning)
        return model

    rng = np.random.RandomState(0)
    n, p = 50, 3
    dense = rng.randn(n, p)  # spherical data
    dense[:, 1] *= 0.00001  # make middle component relatively small
    # no large means because the sparse version of randomized pca does not do
    # centering to avoid breaking the sparsity
    X = csr_matrix(dense)

    # the original data must be recoverable from the transformed signal
    # (since the data is almost of rank n_components)
    pca = fit_expecting_deprecation(X)
    projected = pca.transform(X)
    restored = pca.inverse_transform(projected)
    assert_almost_equal(X.todense(), restored, decimal=2)

    # same as above with whitening (approximate reconstruction)
    pca = fit_expecting_deprecation(X, whiten=True)
    projected = pca.transform(X)
    restored = pca.inverse_transform(projected)
    absolute_error = np.abs(X.todense() - restored)
    relative_max_delta = (absolute_error / np.abs(X).mean()).max()
    # XXX: this does not seem to work as expected:
    assert_almost_equal(relative_max_delta, 0.91, decimal=2)
def LogisticRegressionPCA(X, y):
    """Train and score logistic regression on PCA ("eigenfaces") features.

    The data is split into train and test parts, projected onto 147
    whitened RandomizedPCA components fitted on the training part, and a
    logistic regression is fitted on the projected training data. Train
    and test scores are printed.

    Args:
        X: sample matrix, one row per sample.
        y: labels matching the rows of ``X``.
    """
    # divide our data set into a training set and a test set
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=TRAIN_TEST_SPLIT_RATIO)

    # get randomized PCA model
    num_components = 147
    print("Extracting the top %d eigenfaces from %d faces" % (num_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=num_components, whiten=True).fit(X_train)

    # use the PCA model on our training set and test set.
    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    logistic_regression = linear_model.LogisticRegression(C=1e5)
    # Fit on the projected training split only. The model used to be fitted
    # on the full raw X, y, which leaked the test samples into training and
    # never used the PCA features at all.
    logistic_regression.fit(X_train_pca, y_train)

    # print the performance of logistic regression
    print("====== Logistic Regression with PCA ========")
    print('TRAIN SCORE', logistic_regression.score(X_train_pca, y_train))
    print('TEST SCORE', logistic_regression.score(X_test_pca, y_test))
def pca_estimator(data, targets, estimator, components_number=DEFAULT_COMPONENTS_NUMBER,
                  folds_number=DEFAULT_FOLDS_NUMBER):
    """Cross-validate an estimator on whitened PCA features.

    For every fold a RandomizedPCA is fitted on the fold's training part,
    both parts are projected, and the estimator is fitted and scored.

    Args:
        data: sample array, indexable by the fold index arrays.
        targets: label array, indexable by the fold index arrays.
        estimator: object with ``fit`` (returning the fitted model) and
            ``score``.
        components_number: number of PCA components.
        folds_number: number of folds.

    Returns:
        Tuple ``(mean score, 2 * standard deviation of the scores,
        elapsed seconds)``.
    """
    folds = KFold(len(targets), n_folds=folds_number)
    # One slot per fold, filled with the score reached on that fold.
    fold_scores = np.zeros(folds_number)
    started = time()
    for fold_index, (train_idx, test_idx) in enumerate(folds):
        x_train, x_test = data[train_idx], data[test_idx]
        y_train, y_test = targets[train_idx], targets[test_idx]

        projector = RandomizedPCA(n_components=components_number, whiten=True)
        projector.fit(x_train)
        x_train_pca = projector.transform(x_train)
        x_test_pca = projector.transform(x_test)

        fitted = estimator.fit(x_train_pca, y_train)
        fold_scores[fold_index] = fitted.score(x_test_pca, y_test)
    elapsed = time() - started
    return fold_scores.mean(), fold_scores.std() * 2, elapsed
def rpca(train_X, test_X, n):
    """Fit RandomizedPCA on a sparse training matrix and project both sets.

    Both inputs are converted with ``toarray()`` before use. The elapsed
    wall-clock time is printed.

    Args:
        train_X: training matrix exposing ``toarray()``.
        test_X: test matrix exposing ``toarray()``.
        n: number of components to keep.

    Returns:
        Tuple ``(fitted pca, projected training data, projected test data)``.
    """
    start_time = time.time()
    # Densify the training matrix once; it was converted twice before,
    # doubling the conversion work for no benefit.
    train_dense = train_X.toarray()
    pca = RandomizedPCA(n_components=n)
    pca.fit(train_dense)
    train_X_pca = pca.transform(train_dense)
    test_X_pca = pca.transform(test_X.toarray())
    print("--- %s seconds ---" % (time.time() - start_time))
    return pca, train_X_pca, test_X_pca
class Cluster(object):
    """Novelty detector built from binned feature vectors.

    Raw sessions are cut into overlapping bins and turned into feature
    vectors by the ``dsp`` module, reduced with RandomizedPCA and modelled
    with a ``OneClassSVM``.
    """

    def __init__(self, name):
        """Create an empty cluster; ``name`` is used for the pickle file name."""
        self.name = name
        self.raw_dataset = []   # raw sessions, consumed by extract_features()
        self.dataset = []       # extracted feature vectors
        self.dataset_red = []   # dimension-reduced feature vectors

    def get_featurevec(self, data):
        '''Takes in data in the form of an array of EmoPackets, and
        outputs a list of feature vectors, one per bin.'''
        step = int(dsp.SAMPLE_RATE * dsp.STAGGER)
        size = int(dsp.BIN_SIZE * dsp.SAMPLE_RATE)
        # Floor division keeps the bin count an integer on Python 3 as well;
        # with "/" range() would receive a float there.
        num_bins = len(data) // step - int(dsp.BIN_SIZE / dsp.STAGGER) + 1
        return [dsp.get_features(data[i * step:i * step + size])
                for i in range(num_bins)]

    def add_data(self, raw):
        '''Extracts feature vectors from a list of EmoPackets and appends
        them to the dataset. Does not retrain; call reduce_dim() and
        train() afterwards.'''
        self.dataset.extend(self.get_featurevec(raw))

    def extract_features(self):
        '''Does feature extraction for all of the datasets.'''
        self.dataset = []
        for sess in self.raw_dataset:
            self.dataset.extend(self.get_featurevec(sess))

    def reduce_dim(self, NDIM=5):
        '''Reduces the dimension of the extracted feature vectors.'''
        X = np.array(self.dataset)
        self.pca = RandomizedPCA(n_components=NDIM).fit(X)
        self.dataset_red = self.pca.transform(X)

    def train(self):
        '''Trains the classifier.'''
        self.svm = OneClassSVM()
        self.svm.fit(self.dataset_red)

    def is_novel(self, pt):
        '''Says whether or not the bin is novel. Expects an array of
        EmoPackets. The reduced point is added to the training data and the
        classifier is retrained before the prediction is returned.'''
        # Use the ``pt`` argument; the code used to read an undefined name
        # ``data`` and raised NameError on every call.
        # ndmin=2 turns a single feature vector into a one-row matrix.
        features = np.array(self.get_featurevec(pt)[0], ndmin=2)
        X = self.pca.transform(features)
        ans = self.svm.predict(X)
        # dataset_red is a NumPy array after reduce_dim(), which has no
        # append(); stack the new row instead.
        if len(self.dataset_red):
            self.dataset_red = np.vstack([self.dataset_red, X])
        else:
            self.dataset_red = X
        self.train()
        return ans

    def save(self):
        '''Saves this classifier to a data directory next to this module.'''
        this_dir, _ = os.path.split(__file__)
        data_path = os.path.join(this_dir, "data", self.name + '.pkl')
        # The context manager closes the file even if pickling fails.
        with open(data_path, "wb") as dumpfile:
            pickle.dump(self, dumpfile, pickle.HIGHEST_PROTOCOL)
def reduce_dim(self, NDIM=5):
    """Fit a PCA on the neutral data and project every data set with it.

    The fitted model is stored in ``self.pca``, the projected neutral data
    in ``self.neutral_red`` and the projection of every entry of
    ``self.labelled`` in ``self.labelled_red`` under the same key. The
    explained variance ratio of the fitted model is printed.

    Args:
        NDIM: number of components to keep.
    """
    neutral_matrix = np.array(self.neutral)
    model = RandomizedPCA(n_components=NDIM)
    model.fit(neutral_matrix)
    print(model.explained_variance_ratio_)
    self.pca = model
    self.neutral_red = model.transform(neutral_matrix)
    for label in self.labelled:
        labelled_matrix = np.array(self.labelled[label])
        self.labelled_red[label] = model.transform(labelled_matrix)
def compute_pca(self):
    """Replace the feature matrices with their 200-component whitened PCA projection.

    The PCA is fitted on ``self.x_train``. ``self.x_test`` is projected
    when ``self.do_submission`` is truthy and ``self.x_validate`` when
    ``self.do_validation()`` returns a truthy value.
    """
    pca_count = 200
    pca = RandomizedPCA(pca_count, copy=False, whiten=True)
    # With copy=False, fit() may overwrite (center) its input in place, so a
    # later transform() of the same array would subtract the mean a second
    # time. fit_transform() is the documented way to project the training
    # data in this mode.
    self.x_train = pca.fit_transform(self.x_train)
    if self.do_submission:
        self.x_test = pca.transform(self.x_test)
    if self.do_validation():
        self.x_validate = pca.transform(self.x_validate)
def face_rec():
    """Identify probe faces by nearest neighbour in eigenface space.

    Gallery images are flattened into a fixed-size matrix, a whitened
    10-component RandomizedPCA is fitted on it, and every probe image is
    compared (Euclidean distance in PCA space) with every loaded gallery
    image. The closest gallery file and its distance are printed per probe.

    Returns:
        List of ``(distance, gallery_path)`` tuples for the last probe
        image, or an empty list when there are no probe images.
    """
    IMG_RES = 100 * 100  # img resolution
    NUM_EIGENFACES = 10  # images per train person
    NUM_TRAINIMAGES = 110  # total images in training set

    # loading training set from the gallery folder
    folders = glob.glob('/home/pi/New/SelfieLibrary/cropped/gallery')

    # X holds the flattened images, y the file each row came from.
    # NOTE(review): int8 cannot hold 0..255 pixel values - confirm what
    # prepare_image returns. Rows beyond the loaded images stay zero and
    # still take part in the PCA fit.
    X = np.zeros([NUM_TRAINIMAGES, IMG_RES], dtype='int8')
    y = []

    # Populate training array with flattened images from the gallery folders
    c = 0
    for folder in folders:
        for face in glob.glob(folder + '/*'):
            X[c, :] = prepare_image(face)
            y.append(face)
            c = c + 1

    # perform principal component analysis on the images
    pca = RandomizedPCA(n_components=NUM_EIGENFACES, whiten=True).fit(X)
    X_pca = pca.transform(X)

    # load test faces (usually one), located in the probe folder
    test_faces = glob.glob('/home/pi/New/SelfieLibrary/cropped/probe/*')
    probes = np.zeros([len(test_faces), IMG_RES], dtype='int8')
    for i, face in enumerate(test_faces):
        probes[i, :] = prepare_image(face)

    # Start with an empty result so the function still returns a list when
    # no probe image exists (it used to fail on an unbound name).
    distances = []
    # run through test images (usually one)
    for ref_pca in pca.transform(probes):
        # Euclidean distance from the probe to each loaded gallery image;
        # only the first c rows of X_pca belong to real images.
        distances = [
            (math.sqrt(sum([diff ** 2 for diff in (ref_pca - known_pca)])), y[i])
            for i, known_pca in enumerate(X_pca[:c])
        ]
        # Find the best match once instead of calling min() three times.
        best_dist, found_ID = min(distances)
        print(float(best_dist))
        print("Identified (result: " + str(found_ID) + " - dist - " + str(best_dist) + ")")
    return distances
def tryDenseOperator(goFast, operatorClass, parameterGrid):
    """Grid-search a dense estimator on RandomizedPCA projections of the dt1 data.

    For every RandomizedPCA parameter combination the training and
    validation sets are projected. For every estimator parameter
    combination an ``operatorClass`` instance is fitted on the projected
    training data and its validation accuracy printed. Combinations that
    raise are reported and skipped. The best score and parameters are
    printed whenever they improve and once more at the end.

    Args:
        goFast: if truthy, read the "dt1_1500.*.svm" files, otherwise the
            "dt1.*.svm" files.
        operatorClass: estimator class, instantiated with each parameter set.
        parameterGrid: grid specification handed to ``ParameterGrid``.
    """
    bestScore = 0
    bestRpcaParams = None
    bestOperatorParams = None

    from sklearn.datasets import dump_svmlight_file, load_svmlight_file
    stem = "dt1_1500" if goFast else "dt1"
    training_data, training_labels = load_svmlight_file(
        stem + ".trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file(
        stem + ".vld.svm", n_features=253659, zero_based=True)
    # The test split is loaded but not used further down.
    testing_data, testing_labels = load_svmlight_file(
        stem + ".tst.svm", n_features=253659, zero_based=True)

    from sklearn.metrics import accuracy_score
    from sklearn.grid_search import ParameterGrid
    from sklearn.decomposition import RandomizedPCA

    rpcaDataGrid = [{"n_components": [10, 45, 70, 100],
                     "iterated_power": [1, 2, 3, 4],
                     "whiten": [False, True]}]

    for rpca_parameter_set in ParameterGrid(rpcaDataGrid):
        # "except Exception" instead of a bare "except": invalid parameter
        # combinations are still skipped, but KeyboardInterrupt/SystemExit
        # now stop the search instead of being swallowed.
        try:
            rpcaOperator = RandomizedPCA(**rpca_parameter_set)
            rpcaOperator.fit(training_data, training_labels)
            new_training_data = rpcaOperator.transform(training_data, training_labels)
            new_validation_data = rpcaOperator.transform(validation_data, validation_labels)
            for dense_operator_parameter_set in ParameterGrid(parameterGrid):
                try:
                    denseOperator = operatorClass(**dense_operator_parameter_set)
                    denseOperator.fit(new_training_data, training_labels)
                    score = accuracy_score(validation_labels,
                                           denseOperator.predict(new_validation_data))
                    print("Score = " + str(score))
                    if score > bestScore:
                        bestScore = score
                        bestRpcaParams = rpca_parameter_set
                        bestOperatorParams = dense_operator_parameter_set
                        print("***New best score: " + str(bestScore))
                        print("***RPCA params: " + str(bestRpcaParams))
                        print("***Operator params: " + str(bestOperatorParams))
                except Exception:
                    print("Illegal combination skipped")
                    print(sys.exc_info()[:2])
        except Exception:
            print("Illegal combination skipped.")
            print(sys.exc_info()[:2])

    print("***New best score: " + str(bestScore))
    print("***RPCA params: " + str(bestRpcaParams))
    print("***Operator params: " + str(bestOperatorParams))
def test(n_components):
    """Evaluate an RBF SVM on ``n_components`` whitened PCA features.

    Uses the module-level ``X_train``/``X_test``/``y_train``/``y_test``
    split. The classification report and the confusion matrix of the test
    predictions are printed.

    Args:
        n_components: number of PCA components to keep.
    """
    projector = RandomizedPCA(n_components=n_components, whiten=True)
    projector.fit(X_train)
    X_train_pca = projector.transform(X_train)
    X_test_pca = projector.transform(X_test)

    search_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    search = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), search_grid)
    search = search.fit(X_train_pca, y_train)

    y_pred = search.predict(X_test_pca)
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
def run_stuff(n_components):
    """Run the eigenfaces + SVM pipeline for a given number of components.

    Works on the module-level train/test split. Progress, timings, the
    best grid-search estimator, the classification report and the
    confusion matrix are printed.

    Args:
        n_components: number of eigenfaces (PCA components) to extract.

    Returns:
        Tuple ``(y_pred, y_test, target_names, X_test, h, w, eigenfaces)``.
    """
    # --- PCA (eigenfaces) on the face dataset, treated as unlabeled data:
    # --- unsupervised feature extraction / dimensionality reduction
    print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
    started = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    print("done in %0.3fs" % (time() - started))

    print("Explaining variance:")
    print(np.round(pca.explained_variance_ratio_[:10], decimals=3))

    eigenfaces = pca.components_.reshape((n_components, h, w))

    print("Projecting the input data on the eigenfaces orthonormal basis")
    started = time()
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done in %0.3fs" % (time() - started))

    # --- Train a SVM classification model
    print("Fitting the classifier to the training set")
    started = time()
    search_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    # sklearn 0.16 and older spell this class_weight value 'auto'
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), search_grid)
    clf = clf.fit(X_train_pca, y_train)
    print("done in %0.3fs" % (time() - started))
    print("Best estimator found by grid search:")
    print(clf.best_estimator_)

    # --- Quantitative evaluation of the model quality on the test set
    print("Predicting the people names on the testing set")
    started = time()
    y_pred = clf.predict(X_test_pca)
    print("done in %0.3fs" % (time() - started))

    print("Results for " + str(n_components) + " components:")
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

    return y_pred, y_test, target_names, X_test, h, w, eigenfaces
def perform_rec():
    """Identify the faces in ``test_faces/`` against those in ``train_faces/``.

    Training images are read from the sub-folders of ``train_faces/``,
    flattened, and used to fit a whitened 10-component RandomizedPCA. Each
    test image is matched to the training image with the smallest
    Euclidean distance in PCA space, and the match is printed.

    Returns:
        Dict with the ID (``'ident'``) and distance (``'dist'``) of the best
        match for the last test image; both are ``None`` when there are no
        test images.
    """
    num_files = sum([len(files) for r, d, files in os.walk('train_faces/')])
    IMG_RES = 92 * 112  # img resolution
    NUM_EIGENFACES = 10  # images per train person
    # NOTE(review): one file is excluded from the count; presumably a
    # non-image file lives under train_faces/ - confirm.
    NUM_TRAINIMAGES = num_files - 1  # total images in training set

    # loading training set from folder train_faces
    folders = glob.glob('train_faces/*')

    # X holds the flattened images, y the ID of the person on each image
    X = np.zeros([NUM_TRAINIMAGES, IMG_RES], dtype='int8')
    y = []

    # Populate training array with flattened images from subfolders of
    # train_faces and names
    c = 0
    for folder in folders:
        for face in glob.glob(folder + '/*'):
            X[c, :] = prepare_image(face)
            y.append(ID_from_filename(face))
            c = c + 1

    # perform principal component analysis on the images
    pca = RandomizedPCA(n_components=NUM_EIGENFACES, whiten=True).fit(X)
    X_pca = pca.transform(X)

    # load test faces (usually one), located in folder test_faces
    test_faces = glob.glob('test_faces/*')
    probes = np.zeros([len(test_faces), IMG_RES], dtype='int8')
    for i, face in enumerate(test_faces):
        probes[i, :] = prepare_image(face)

    # Defaults so the function still returns a result when test_faces/ is
    # empty (it used to fail on unbound names).
    found_ID = None
    best_dist = None
    for ref_pca in pca.transform(probes):
        # Euclidean distance from the test image to each known image; only
        # the first c rows have a label in y.
        distances = [
            (math.sqrt(sum([diff ** 2 for diff in (ref_pca - known_pca)])), y[i])
            for i, known_pca in enumerate(X_pca[:c])
        ]
        # Find the best match once instead of calling min() repeatedly.
        best_dist, found_ID = min(distances)
        print("Identified (result: " + str(found_ID) + " - dist - " + str(best_dist) + ")")
    return {'ident': found_ID, 'dist': best_dist}
def return_pca_transformed_data(train_features, test_features):
    """Project both feature sets onto 11 whitened principal components.

    The PCA is fitted on the training features only. The share of the
    retained variance explained by the first component is printed.

    Args:
        train_features: training samples; the PCA is fitted on these.
        test_features: test samples.

    Returns:
        Tuple ``(features_train_pca, features_test_pca)``.
    """
    pca = RandomizedPCA(n_components=11, whiten=True).fit(train_features)
    # (the unused ``means = pca.mean_`` local was dropped)

    # Variance summed over the 11 kept components only.
    total = sum(pca.explained_variance_)
    first_share = (float(pca.explained_variance_[0]) / total) * 100
    print("PCA trained")
    # Spacing is identical to the former `print a, b, c` output.
    print("The first component expains:  %s %% of the variance" % first_share)

    # Training and testing data transformed by PCA
    features_train_pca = pca.transform(train_features)
    features_test_pca = pca.transform(test_features)
    return features_train_pca, features_test_pca
def pcaFaces(X, y, n_components, h, w):
    """Split the faces, fit eigenfaces on the training part and project both parts.

    Args:
        X: flattened face images, one row per image.
        y: labels matching the rows of ``X``.
        n_components: number of eigenfaces to extract.
        h: image height used to reshape the components.
        w: image width used to reshape the components.

    Returns:
        Tuple ``(X_train_pca, X_test_pca, X_test, y_train, y_test,
        eigenfaces)`` where ``eigenfaces`` has shape
        ``(n_components, h, w)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
    t0 = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    print("done in %0.3fs" % (time() - t0))

    print("Projecting the input data on the eigenfaces orthonormal basis")
    # Restart the timer so the figure below covers the projection only; it
    # used to include the PCA fit time as well.
    t0 = time()
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    eigenfaces = pca.components_.reshape((n_components, h, w))
    print("done in %0.3fs" % (time() - t0))

    return X_train_pca, X_test_pca, X_test, y_train, y_test, eigenfaces
def pca(X_train, X_test, n):
    """Extract ``n`` whitened principal components from both data sets.

    The model is fitted on ``X_train`` with a fixed random seed and the
    time taken by the fit and by the projection is printed.

    Returns:
        Tuple ``(X_train_pca, X_test_pca)``.
    """
    print("Extracting %d principle components from %d features" % (n, X_train.shape[1]))
    started = time()
    model = RandomizedPCA(n_components=n, whiten=True, random_state=47)
    model = model.fit(X_train)
    print("done in %0.3fs" % (time() - started))

    print("Transforming the input data")
    started = time()
    projected_train = model.transform(X_train)
    projected_test = model.transform(X_test)
    print("done in %0.3fs" % (time() - started))
    return projected_train, projected_test
def preprocess(cross_validation_tuple, preprocess_correlation=False, preprocess_scaling=False):
    """Optionally standardize and decorrelate the features of a split.

    Both transformations are fitted on the training features only and
    applied to the training and the testing features.

    Args:
        cross_validation_tuple: object exposing ``X_training``,
            ``X_testing``, ``Y_training`` and ``Y_testing``.
        preprocess_correlation: if truthy, apply a whitened PCA that keeps
            enough components to explain 99% of the variance.
        preprocess_scaling: if truthy, standardize the features first.

    Returns:
        A ``Cross_validation_split`` with the transformed features and the
        unchanged targets.
    """
    X = cross_validation_tuple.X_training
    X_test = cross_validation_tuple.X_testing

    if preprocess_scaling:
        scaler = preprocessing.StandardScaler().fit(cross_validation_tuple.X_training)
        X = scaler.transform(X)
        X_test = scaler.transform(X_test)

    if preprocess_correlation:
        # RandomizedPCA only accepts an integer component count; a fraction
        # between 0 and 1 ("explain this share of the variance") is a
        # feature of the exact PCA estimator.
        from sklearn.decomposition import PCA
        pca = PCA(n_components=0.99, whiten=True)
        pca.fit(X)
        # n_components_ is the number actually selected; n_components would
        # just echo the requested 0.99.
        print("PCA component keeping {}".format(pca.n_components_))
        X = pca.transform(X)
        X_test = pca.transform(X_test)

    return Cross_validation_split(X, X_test, cross_validation_tuple.Y_training,
                                  cross_validation_tuple.Y_testing)
def test_explained_variance():
    # Check that PCA and RandomizedPCA report matching explained variance
    # ratios and that the reported variances match the empirical variance
    # of the projected data.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80

    X = rng.randn(n_samples, n_features)

    exact = PCA(n_components=2).fit(X)
    randomized = RandomizedPCA(n_components=2, random_state=rng).fit(X)
    assert_array_almost_equal(exact.explained_variance_ratio_,
                              randomized.explained_variance_ratio_, 1)

    # compare to empirical variances
    exact_projection = exact.transform(X)
    assert_array_almost_equal(exact.explained_variance_,
                              np.var(exact_projection, axis=0))

    randomized_projection = randomized.transform(X)
    assert_array_almost_equal(randomized.explained_variance_,
                              np.var(randomized_projection, axis=0),
                              decimal=1)

    # Same with correlated data
    X = datasets.make_classification(n_samples, n_features,
                                     n_informative=n_features-2,
                                     random_state=rng)[0]

    exact = PCA(n_components=2).fit(X)
    randomized = RandomizedPCA(n_components=2, random_state=rng).fit(X)
    assert_array_almost_equal(exact.explained_variance_ratio_,
                              randomized.explained_variance_ratio_, 5)
def dimentionality_reduction(train_x , test_x):
    """Reduce both data sets to 10 dimensions with RandomizedPCA.

    The model is fitted on ``train_x`` and then applied to ``test_x``.

    Returns:
        Tuple ``(reduced train_x, reduced test_x)``.
    """
    print("Dimentionality reduction to 10D on training and test data....")
    reducer = RandomizedPCA(n_components=10)
    reduced_train = reducer.fit_transform(train_x)
    reduced_test = reducer.transform(test_x)
    print("Done.")
    return reduced_train, reduced_test
def fit(self):
    """Train the TF-IDF -> PCA -> SVC pipeline on the word-cluster samples.

    A bag-of-words count vector is built per document, weighted with
    TF-IDF, reduced to 20 whitened PCA components and handed to
    ``train_svc``. The fitted pieces and both index maps are stored on
    ``self``.
    """
    wordids_map = NameToIndex()
    labs_map = NameToIndex()
    wordscount = self._word_cluster.get_words_count()
    print("start compute_tfidf ...")

    # Compute the bag-of-words model of the documents
    docs = self._word_cluster.get_samples()
    bow = []
    labs = []
    for key, words in docs.iteritems():
        counts = numpy.zeros(wordscount).tolist()
        for word in words:
            counts[wordids_map.map(word)] += 1
        bow.append(counts)
        # the first element of the document key is used as the label
        labs.append(labs_map.map(key[0]))
    labs = numpy.array(labs)

    tfidf = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
    weighted = tfidf.fit_transform(bow)
    datas = numpy.array(weighted.toarray())
    print("compute_tfidf done")

    pca = RandomizedPCA(n_components=20, whiten=True).fit(datas)
    reduced = pca.transform(datas)
    svc = train_svc(numpy.array(labs_map.names), labs, reduced)

    self._tfidf = tfidf
    self._svc = svc
    self._labs_map = labs_map
    self._wordids_map = wordids_map
    self._pca = pca
def test_explained_variance():
    """Check that RandomizedPCA reports the same explained variance as PCA,
    on dense data and on sparse data, and that it matches the empirical
    variance of the projection."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80

    X = rng.randn(n_samples, n_features)

    exact = PCA(n_components=2).fit(X)
    randomized = RandomizedPCA(n_components=2, random_state=42).fit(X)
    assert_array_almost_equal(exact.explained_variance_,
                              randomized.explained_variance_, 1)
    assert_array_almost_equal(exact.explained_variance_ratio_,
                              randomized.explained_variance_ratio_, 3)

    # compare to empirical variances
    exact_projection = exact.transform(X)
    assert_array_almost_equal(exact.explained_variance_,
                              np.var(exact_projection, axis=0))

    randomized_projection = randomized.transform(X)
    assert_array_almost_equal(randomized.explained_variance_,
                              np.var(randomized_projection, axis=0))

    # Compare with RandomizedPCA using sparse data
    X = csr_matrix(X)
    randomized = assert_warns(DeprecationWarning, randomized.fit, X)
    assert_array_almost_equal(exact.explained_variance_,
                              randomized.explained_variance_, 1)
    assert_array_almost_equal(exact.explained_variance_ratio_,
                              randomized.explained_variance_ratio_, 3)
def _eliminate_features(X_test, X_train, attribute_count, y_train):
    """Reduce both sets to ``attribute_count`` features via PCA followed by RFE.

    A RandomizedPCA keeping ``attribute_count + 10`` components is fitted
    on the training data. Recursive feature elimination with an SVC then
    selects ``attribute_count`` of the components. The selection is
    applied row by row.

    Returns:
        Tuple ``(X_test, X_train)`` - lists holding one transformed result
        per sample, test set first.
    """
    print("Eliminating features until %d has been reached" % attribute_count)
    reducer = RandomizedPCA(n_components=attribute_count + 10)
    reducer.fit(X_train)
    X_train = reducer.transform(to_float(X_train))
    print("Finished pca")

    estimator = SVC(**SVC_parameters)
    selector = RFE(estimator, n_features_to_select=attribute_count, step=0.1)
    fitted_selector = selector.fit(X_train, y_train)
    print("Finished rfe")

    # Reduce the feature matrices to contain just the selected features
    X_train = [fitted_selector.transform(row) for row in X_train]
    projected_test = reducer.transform(to_float(X_test))
    X_test = [fitted_selector.transform(row) for row in projected_test]
    return X_test, X_train
def main():
    """Train a one-vs-rest linear SVM on PCA features of the images in ``images/``.

    The label of an image is the part of its file name before the first
    underscore. About 70% of the images (random mask) are used for
    training and the rest for evaluation. A crosstab of actual versus
    predicted labels is printed.
    """
    img_dir = 'images/'
    images = [img_dir + file_name for file_name in os.listdir(img_dir)]
    labels = [path.split('/')[-1].split('_')[0] for path in images]

    # Number the labels in order of first appearance.
    label2ids = {}
    for label in labels:
        label2ids.setdefault(label, len(label2ids))
    y = np.array([label2ids[label] for label in labels])

    data = np.array([flatten_image(img_to_matrix(path)) for path in images])

    # training samples
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    is_test = ~is_train
    train_X, train_y = data[is_train], y[is_train]

    # training a classifier
    pca = RandomizedPCA(n_components=5)
    train_X = pca.fit_transform(train_X)
    multi_svm = OneVsRestClassifier(LinearSVC())
    multi_svm.fit(train_X, train_y)

    # evaluating the model
    test_X, test_y = data[is_test], y[is_test]
    test_X = pca.transform(test_X)
    predicted = multi_svm.predict(test_X)
    print(pd.crosstab(test_y, predicted, rownames=['Actual'], colnames=['Predicted']))
# NOTE(review): ``component`` and ``montage`` are defined before this
# fragment; this call presumably closes a loop that starts above - confirm.
montage.addResult(component)

# Show the mean face (rescaled to 0..255) and the montage of components.
mean = pca.mean_.reshape((62,47))
mean = exposure.rescale_intensity(mean, out_range=(0,255)).astype("uint8")
cv2.imshow("Mean", mean)
cv2.imshow("components", montage.montage)
# The function is spelled cv2.waitKey (as used at the end of this script);
# cv2.WaitKey does not exist and raised AttributeError here.
cv2.waitKey(0)

# train a classifier on the eigenfaces representation
print("[INFO] training classifier...")
model = SVC(kernel="rbf", C=10.0, gamma=0.001, random_state=84)
model.fit(trainData, training.target)

# evaluate the model
print("[INFO] evaluating model...")
predictions = model.predict(pca.transform(testing.data))
print(classification_report(testing.target, predictions))

# loop over the desired number of samples
for i in np.random.randint(0, high=len(testing.data), size=(args["sample_size"],)):
    # grab the face and classify it
    face = testing.data[i].reshape((62, 47)).astype("uint8")
    prediction = model.predict(pca.transform(testing.data[i].reshape(1, -1)))

    # resize the face to make it more visible, then display the face and
    # the prediction
    print("[INFO] Prediction: {}, Actual: {}".format(prediction[0], testing.target[i]))
    face = imutils.resize(face, width=face.shape[1] * 2, inter=cv2.INTER_CUBIC)
    cv2.imshow("Face", face)
    cv2.waitKey(0)
__author__ = 'pglebow'
from sklearn.decomposition import RandomizedPCA
from sklearn.neighbors import KNeighborsClassifier
# Fit a 5-component PCA on the training features and project both sets.
# train_x, test_x and train_y are expected to be defined before this fragment.
pca = RandomizedPCA(n_components=5)
train_x = pca.fit_transform(train_x)
test_x = pca.transform(test_x)
# Show the first five projected training rows; example output:
print train_x[:5]
#array([[ 12614.55016475, -9156.62662224, -7649.37090539, -3230.94749506,
#          2495.71170459],
#       [ 16111.39363837,  -259.55063579,   699.60464599,  3058.59026495,
#         -1552.34714653],
#       [ 15019.71069584, -6403.86621428,  1968.44401114,  2896.76676466,
#         -2157.76499726],
#       [ 13410.53053415, -1658.3751377 ,   261.26829049,  1991.33404567,
#          -486.60683822],
#       [ 12717.28773107, -1544.27233216, -1279.70167969,   503.33658729,
#           -38.00244617]])
# Train a k-nearest-neighbours classifier (default settings) on the
# projected training data.
knn = KNeighborsClassifier()
knn.fit(train_x, train_y)
ax1.set_xlabel("Silhouette coefficient") ax1.set_ylabel("Cluster label") # The vertical line for average silhoutte score of all the values ax1.axvline(x=silhouette_avg, color="red", linestyle="--") ax1.set_yticks([]) # Clear the yaxis labels / ticks ax1.set_xticks([0, 0.02, 0.04, 0.06, 0.08, .1]) plt.show() # fig.savefig('silhouette50_%s'%n_clusters, dpi=700) # # 2nd Plot showing the actual clusters formed pca = RandomizedPCA(n_components=2) reduced_data = pca.fit_transform(X.toarray()) print "PCA done" centroids = pca.transform(clusterer.cluster_centers_) for i in range(n_clusters): col = cm.spectral(float(i) / n_clusters) ax2.plot(reduced_data[np.argwhere(cluster_labels==i), 0], reduced_data[np.argwhere(cluster_labels==i), 1], \ '.', markersize=4, alpha=0.6,color=col) ax2.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10) ax2.set_xticks([]) ax2.set_yticks([]) ax2.set_xticklabels([]) ax2.set_xlabel("1st PCA component")
def pca(n_components=150):
    """Run the eigenfaces pipeline and print a classification report.

    Fits a whitened ``RandomizedPCA`` on ``X_train``, projects ``X_train``
    and ``X_test`` onto it, grid-searches an RBF ``SVC`` on the projected
    training data, then prints the classification report and confusion
    matrix for the test split.

    The data is not passed in: the function reads ``X_train``, ``X_test``,
    ``y_train``, ``y_test``, ``h``, ``w``, ``target_names`` and ``n_classes``
    from the enclosing module.

    Parameters
    ----------
    n_components : int
        Number of principal components (eigenfaces) to keep.

    Returns
    -------
    None
        Results are only printed.
    """
    print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
    t0 = time()
    # Local name differs from the function name so the function is not shadowed.
    pca_model = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    print("done in %0.3fs" % (time() - t0))

    # Not used further in this function; kept from the original so that the
    # components are still reshaped to (n_components, h, w) at this point.
    eigenfaces = pca_model.components_.reshape((n_components, h, w))

    print("Projecting the input data on the eigenfaces orthonormal basis")
    t0 = time()
    X_train_pca = pca_model.transform(X_train)
    X_test_pca = pca_model.transform(X_test)
    print("done in %0.3fs" % (time() - t0))

    ###########################################################################
    # Train a SVM classification model
    print("Fitting the classifier to the training set")
    t0 = time()
    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    clf = clf.fit(X_train_pca, y_train)
    print("done in %0.3fs" % (time() - t0))
    print("Best estimator found by grid search:")
    print(clf.best_estimator_)

    ###########################################################################
    # Quantitative evaluation of the model quality on the test set
    print("Predicting the people names on the testing set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    print("done in %0.3fs" % (time() - t0))
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))


###############################################################################
# Qualitative evaluation of the predictions using matplotlib

#def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
#    """Helper function to plot a gallery of portraits"""
#    pl.figure(figsize=(1.8 * n_col, 2.4 * n_row))
#    pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
#    for i in range(n_row * n_col):
#        pl.subplot(n_row, n_col, i + 1)
#        pl.imshow(images[i].reshape((h, w)), cmap=pl.cm.gray)
#        pl.title(titles[i], size=12)
#        pl.xticks(())
#        pl.yticks(())


# plot the result of the prediction on a portion of the test set
def title(y_pred, y_test, target_names, i):
    """Build the two-line plot caption for sample ``i``.

    Each name is looked up in ``target_names`` by the class index stored in
    ``y_pred[i]`` / ``y_test[i]``; only the text after the last space (or the
    whole name when it has no space) is shown.

    Returns
    -------
    str
        ``'predicted: <name>\\ntrue: <name>'``.
    """
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
# Eigenfaces step: the face dataset is treated as unlabeled and reduced with
# PCA (unsupervised feature extraction / dimensionality reduction).
n_components = 150

print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train.shape[0]))
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True)
pca.fit(X_train)
print("done in %0.3fs" % (time() - t0))

# One (h, w) image per principal component
eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

# SVM classification model, tuned by grid search over C and gamma
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

# Quantitative evaluation of the model quality on the test set
# Fill feature_array with one "snp_cov" feature vector per result row,
# chromosome by chromosome; `ii` is the running row index and carries over
# between chromosomes.
for cc in CHROMOSOMES:
    # NOTE(review): the statement is assembled by string concatenation from
    # `chrtable` and `condition`; make sure neither comes from untrusted input.
    select_stmt = "select * from %s" % chrtable + condition
    curs.execute(select_stmt)
    for rr in ResultIter(curs):
        # print(rr)
        cs = CoverageSqlite(cc, rr, args)
        # cs.print_range("snp_cov", subrange, h)
        feature_array[ii, :] = cs.feature_vector(subrange, "snp_cov")
        ii += 1

# X = feature_array - np.mean(feature_array, axis = 0)

from sklearn.decomposition import RandomizedPCA
# import sklearn

# Randomized PCA: report how much variance the 3 components explain
pca = RandomizedPCA(n_components=3, iterated_power=7)
pca.fit(feature_array)
print(pca.explained_variance_ratio_)
Y = pca.transform(feature_array)

from sklearn.decomposition import FastICA
# import sklearn

# FastICA reuses the names `pca` and `Y`, so the plots below show the ICA
# projection; the PCA projection computed above is overwritten here.
pca = FastICA(n_components=3)
pca.fit(feature_array)
# print(pca.explained_variance_ratio_)
Y = pca.transform(feature_array)

import matplotlib.pyplot as plt

# Component 0 against component 1
fig = plt.figure(figsize=(5, 5))
plt.plot(Y[:, 0], Y[:, 1], 'b.')
plt.show()

# Component 0 against component 2
fig = plt.figure(figsize=(5, 5))
plt.plot(Y[:, 0], Y[:, 2], 'r.')
# Load every image file; the label is the last character of the file-name
# component just before the extension.
for i, f in enumerate(files):
    print("%s of %s" % (i, len(files)))
    data.append(get_image_data(f))
    labels.append(int(f.split(".")[-2][-1]))
print("done.")

pca = RandomizedPCA(n_components=10)
std_scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.1)

# The two progress messages were attached to the wrong steps in the original
# (PCA was announced as "scaling", the scaler as "transforming").
print("transforming data...")
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
print("done.")

print("scaling data...")
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)
print("done.")

print("training model...")
clf = KNeighborsClassifier(n_neighbors=33)
clf.fit(X_train, y_train)
print("done")

print("=" * 20)
print(clf)
print("Confusion Matrix")
def build_SVC(face_profile_data, face_profile_name_index, face_dim):
    """
    Build the SVM classification model from the face_profile_data matrix
    (numOfFace X numOfPixel) and the face_profile_name_index array.

    The data is split 75/25 (fixed random_state=42), a whitened RandomizedPCA
    with 150 components is fitted on the training part, and an RBF SVC is
    trained on the projected training data. The error rate on the held-out
    part is printed.

    Parameters
    ----------
    face_profile_data : ndarray (number_of_images_in_face_profiles, width * height of the image)
        One flattened face image per row

    face_profile_name_index : ndarray
        The name corresponding to the face profile is encoded in its index

    face_dim : tuple (int, int)
        The dimension (h, w) each face row is reshaped to

    Returns
    -------
    clf : sklearn SVC
        The trained SVM classification model

    pca : sklearn RandomizedPCA
        The fitted PCA holding the top 150 eigenvectors extracted using
        approximated Singular Value Decomposition of the data
    """
    X = face_profile_data
    y = face_profile_name_index

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    n_components = 150  # maximum number of components to keep

    print("\nExtracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))

    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    # Not used further in this function; kept from the original so that the
    # components are still reshaped to face_dim at this point.
    eigenfaces = pca.components_.reshape(
        (n_components, face_dim[0], face_dim[1]))

    # This portion of the code is used if the data is scarce, it uses the number
    # of inputs as the number of features
    # pca = RandomizedPCA(n_components=None, whiten=True).fit(X_train)
    # eigenfaces = pca.components_.reshape((pca.components_.shape[0], face_dim[0], face_dim[1]))

    print("\nProjecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Train a SVM classification model
    print("\nFitting the classifier to the training set")

    # The hyper-parameters below were found earlier with:
    #   param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    #                 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]}
    #   clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    # Best Estimator found using Radial Basis Function Kernel:
    clf = SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
              decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
              max_iter=-1, probability=False, random_state=None, shrinking=True,
              tol=0.001, verbose=False)
    # Train_pca with Alex Test Error Rate: 0.088424437299
    # Train_pca with Alex Test Recognition Rate: 0.911575562701

    clf = clf.fit(X_train_pca, y_train)
    # print("\nBest estimator found by grid search:")
    # print(clf.best_estimator_)

    ###########################################################################
    # Quantitative evaluation of the model quality on the test set
    print("\nPredicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    print("\nPrediction took %s per sample on average" % ((time() - t0) / y_pred.shape[0]))

    # print "predicated names: ", y_pred
    # print "actual names: ", y_test
    error_rate = errorRate(y_pred, y_test)
    print("\nTest Error Rate: %0.4f %%" % (error_rate * 100))
    print("Test Recognition Rate: %0.4f %%" % ((1.0 - error_rate) * 100))

    return clf, pca
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled dataset):
# unsupervised feature extraction / dimensionality reduction
n_components = 150

print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))

# The original referred to `pcs` here and below, a typo for `pca`.
eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("Done in %0.3fs" % (time() - t0))

# Train a SVM classification model
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# NOTE(review): class_weight='auto' is the spelling for scikit-learn 0.16 and
# earlier; later releases expect 'balanced' — confirm the installed version.
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
clf = clf.fit(X_train_pca, y_train)
gridspec_kw=dict(hspace=0.1, wspace=0.1)) for i, ax in enumerate(axes.flat): ax.imshow(pca.components_[i].reshape(62, 47), cmap='bone') plt.show() plt.plot(np.cumsum(pca.explained_variance_ratio_)) plt.xlabel('number of components') plt.ylabel('cumulative explained variance') plt.show() # compute the components and projected faces pca = RandomizedPCA(150).fit(faces.data) components = pca.transform(faces.data) projected = pca.inverse_transform(components) # plot the results fig,ax=plt.subplots(2,10,figsize=(10,2.5),subplot_kw={'xticks':[],'yticks':[]},\ gridspec_kw=dict(hspace=0.1,wspace=0.1)) for i in range(10): ax[0, i].imshow(faces.data[i].reshape(62, 47), cmap='binary_r') ax[1, i].imshow(projected[i].reshape(62, 47), cmap='binary_r') ax[0, 0].set_ylabel('full-dim\ninput') ax[1, 0].set_ylabel('150-dim\nreconstruction') plt.show()
n_components = 150 # #component in PCA cv2.destroyAllWindows() pca = RandomizedPCA(n_components=n_components, whiten=True) param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) testing_data = [] for i in range(len(images)): testing_data.append(images[i].flatten()) pca = pca.fit(testing_data) transformed = pca.transform(testing_data) # if lda is done than #component = 80 #lda = LinearDiscriminantAnalysis(n_components=80) #transformed = lda.fit(transformed, labels).transform(transformed) clf.fit(transformed, labels) directory2 = 'yalefaces_5' # test directory name image_paths = [ os.path.join(directory2, filename) for filename in os.listdir(directory2) ] j = 0 for image_path in image_paths: pred_image_pil = Image.open(image_path).convert('L') pred_image = np.array(pred_image_pil, 'uint8') faces = faceCascade.detectMultiScale(pred_image) if (len(faces) == 0):
def SVM(X, y, component_counts=(10, 50, 100, 120, 135, 150, 165, 180, 200, 400)):
    """Compare a degree-2 polynomial SVM with PCA + RBF SVM at several PCA sizes.

    The data is split once into a training and a test set
    (``TRAIN_TEST_SPLIT_RATIO``). A polynomial SVC is trained on the raw
    features first. Then, for every entry of ``component_counts``, a whitened
    ``RandomizedPCA`` is fitted on the training set, both sets are projected,
    and an RBF SVC is grid-searched over ``C`` and ``gamma`` on the
    projection. Train and test scores are printed for every model.

    Parameters
    ----------
    X : array-like
        Feature matrix, one sample per row.
    y : array-like
        Labels for the rows of ``X``.
    component_counts : iterable of int, optional
        PCA sizes to evaluate, in order. The default reproduces the ten sizes
        that used to be written out one after another.

    Returns
    -------
    None
        Results are only printed.
    """
    # divide our data set into a training set and a test set
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=TRAIN_TEST_SPLIT_RATIO)

    # Baseline: polynomial kernel on the raw (un-projected) features
    classifier_poly2 = svm.SVC(kernel='poly', degree=2)
    classifier_poly2.fit(X_train, y_train)
    print("======= poly degree=2 ========")
    print('TRAIN SCORE', classifier_poly2.score(X_train, y_train))
    print('TEST SCORE', classifier_poly2.score(X_test, y_test))

    # The same search grid is used for every PCA size
    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }

    for n_components in component_counts:
        print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
        pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

        print("Projecting the input data on the eigenfaces orthonormal basis")
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
        print("done ")

        classifier = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
        classifier.fit(X_train_pca, y_train)

        print("====== PCA %d ========" % n_components)
        print('TRAIN SCORE', classifier.score(X_train_pca, y_train))
        print('TEST SCORE', classifier.score(X_test_pca, y_test))
# In[6]:

# Centre the reference frame (mean removal only, no variance scaling) and fit
# a 15-component PCA on it.
scaler = StandardScaler(with_std=False)
scaled = scaler.fit_transform(df)

pca = RandomizedPCA(n_components=15, random_state=1)
pca.fit(scaled)
# Total share of variance captured by the 15 components (notebook cell output)
pca.explained_variance_ratio_.cumsum()[-1]


# In[7]:

# Project the training features with the scaler/PCA fitted above
df_fuzzy = pd.read_csv('features_fuzzy_train.csv', usecols=columns, dtype=np.float32)
scaled = scaler.transform(df_fuzzy)
X = pca.transform(scaled)


# In[8]:

# Same projection for the test features
df_fuzzy_test = pd.read_csv('features_fuzzy_test.csv', usecols=columns, dtype=np.float32)
scaled = scaler.transform(df_fuzzy_test)
X_test = pca.transform(scaled)


# In[9]:

# Only the first 5 of the 15 components are copied into the result frames
for i in range(5):
    column_name = 'pca_fuzzy_%d' % i
    df_res_train[column_name] = X[:, i]
    df_res_test[column_name] = X_test[:, i]
def test_SVM(face_profile_data, face_profile_name_index, face_dim, face_profile_names):
    """
    Testing: Build the SVM classification model from the face_profile_data
    matrix (numOfFace X numOfPixel) and the face_profile_name_index array.

    The data is split 75/25 (fixed random_state=42), a whitened RandomizedPCA
    with 150 components is fitted on the training part, and an RBF SVC is
    trained on the projected training data. The error rate on the held-out
    part is printed.

    Parameters
    ----------
    face_profile_data : ndarray (number_of_images_in_face_profiles, width * height of the image)
        One flattened face image per row

    face_profile_name_index : ndarray
        The name corresponding to the face profile is encoded in its index

    face_dim : tuple (int, int)
        The dimension (h, w) each face row is reshaped to

    face_profile_names : ndarray
        The names corresponding to the face profiles. Only referenced by the
        commented-out inspection code at the end of the function.

    Returns
    -------
    clf : sklearn SVC
        The trained SVM classification model

    pca : sklearn RandomizedPCA
        The fitted PCA holding the top 150 eigenvectors extracted using
        approximated Singular Value Decomposition of the data
    """
    X = face_profile_data
    y = face_profile_name_index

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    n_components = 150  # maximum number of components to keep

    print("\nExtracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))

    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    # Only referenced by the commented-out gallery code below; kept from the
    # original so that the components are still reshaped to face_dim here.
    eigenfaces = pca.components_.reshape(
        (n_components, face_dim[0], face_dim[1]))

    # This portion of the code is used if the data is scarce, it uses the number
    # of inputs as the number of features
    # pca = RandomizedPCA(n_components=None, whiten=True).fit(X_train)
    # eigenfaces = pca.components_.reshape((pca.components_.shape[0], face_dim[0], face_dim[1]))

    print("\nProjecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Train a SVM classification model
    print("\nFitting the classifier to the training set")

    # Results recorded from earlier experiments:
    #   param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    #                 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]}
    # clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    # Train_pca Test Error Rate:  0.0670016750419
    # Train_pca Test Recognition Rate:  0.932998324958

    # clf = SVC(kernel='linear', C=1)
    # 2452 samples from 38 people are loaded
    # Extracting the top 150 eigenfaces from 1839 faces
    # Extracting the top 150 eigenfaces from 1790 faces
    # Train_pca Test Error Rate:  0.0904522613065
    # Train_pca Test Recognition Rate:  0.909547738693

    # clf = SVC(kernel='poly')
    # Train_pca Test Error Rate:  0.201005025126
    # Train_pca Test Recognition Rate:  0.798994974874

    # clf = SVC(kernel='sigmoid')
    # Train_pca Test Error Rate:  0.985318107667
    # Train_pca Test Recognition Rate:  0.0146818923328

    # clf = SVC(kernel='rbf').fit(X_train, y_train)
    # Train_pca Test Error Rate:  0.0619765494137
    # Train_pca Test Recognition Rate:  0.938023450586

    # Best Estimator found using Radial Basis Function Kernel:
    clf = SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
              decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
              max_iter=-1, probability=False, random_state=None, shrinking=True,
              tol=0.001, verbose=False)
    # Train_pca with Alex Test Error Rate:  0.088424437299
    # Train_pca with Alex Test Recognition Rate:  0.911575562701

    clf = clf.fit(X_train_pca, y_train)
    # print("\nBest estimator found by grid search:")
    # print(clf.best_estimator_)

    ###########################################################################
    # Quantitative evaluation of the model quality on the test set
    print("\nPredicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    print("\nPrediction took %0.8f second per sample on average" % ((time() - t0) / y_pred.shape[0]))

    # print "predicated names: ", y_pred
    # print "actual names: ", y_test
    error_rate = errorRate(y_pred, y_test)
    print("\nTest Error Rate: %0.4f %%" % (error_rate * 100))
    print("Test Recognition Rate: %0.4f %%" % ((1.0 - error_rate) * 100))

    ###########################################################################
    # Testing
    # X_test_pic1 = X_test[0]
    # X_test_pic1_for_display = np.reshape(X_test_pic1, face_dim)
    # t0 = time()
    # pic1_pred_name = predict(clf, pca, X_test_pic1, face_profile_names)
    # print("\nPrediction took %0.3fs" % (time() - t0))
    # print "\nPredicated result for picture_1 name: ", pic1_pred_name
    # for i in range(1,3): print ("\n")

    # Display the picture
    # plt.figure(1)
    # plt.title(pic1_pred_name)
    # plt.subplot(111)
    # plt.imshow(X_test_pic1_for_display)
    # plt.show()

    ###########################################################################
    # Qualitative evaluation of the predictions using matplotlib
    # import matplotlib.pyplot as plt
    # def plot_gallery(images, titles, face_dim, n_row=3, n_col=4):
    #     """Helper function to plot a gallery of portraits"""
    #     plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    #     plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    #     for i in range(n_row * n_col):
    #         plt.subplot(n_row, n_col, i + 1)
    #         plt.imshow(images[i].reshape(face_dim), cmap=plt.cm.gray)
    #         plt.title(titles[i], size=12)
    #         plt.xticks(())
    #         plt.yticks(())

    # # plot the result of the prediction on a portion of the test set
    # def title(y_pred, y_test, face_profile_names, i):
    #     pred_name = face_profile_names[y_pred[i]].rsplit(' ', 1)[-1]
    #     true_name = face_profile_names[y_test[i]].rsplit(' ', 1)[-1]
    #     return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)

    # prediction_titles = [title(y_pred, y_test, face_profile_names, i)
    #                      for i in range(y_pred.shape[0])]
    # plot_gallery(X_test, prediction_titles, face_dim)

    # # plot the gallery of the most significative eigenfaces
    # eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
    # plot_gallery(eigenfaces, eigenface_titles, face_dim)
    # plt.show()

    return clf, pca
import numpy as np
from scipy.cluster.vq import kmeans
from scipy.spatial.distance import cdist, pdist
from sklearn import datasets
from sklearn.decomposition import RandomizedPCA
from matplotlib import pyplot as plt
from matplotlib import cm

##### data #####
# Digits dataset; `t` holds the target labels
data = datasets.load_digits()
t = data['target']

# Reduce the samples to 2 dimensions with PCA
pca = RandomizedPCA(n_components=2).fit(data['data'])
X = pca.transform(data['data'])

console = []

##### cluster data into K=1..20 clusters #####
K_MAX = 20
KK = range(1, K_MAX + 1)

# kmeans() returns a (codebook, distortion) pair for every k
KM = [kmeans(X, n_clusters) for n_clusters in KK]
centroids = [pair[0] for pair in KM]

# Distance from every sample to every centroid, one matrix per k
D_k = [cdist(X, codebook, 'euclidean') for codebook in centroids]
cIdx = [distances.argmin(axis=1) for distances in D_k]  # nearest centroid index
dist = [distances.min(axis=1) for distances in D_k]     # distance to that centroid

tot_withinss = [sum(nearest ** 2) for nearest in dist]  # Total within-cluster sum of squares
totss = sum(pdist(X) ** 2) / X.shape[0]                 # The total sum of squares
betweenss = totss - tot_withinss                        # The between-cluster sum of squares
def dimention_reduction(financial_data, n_components=2):
    """Project ``financial_data`` onto its leading principal components.

    A ``RandomizedPCA`` is fitted on ``financial_data`` and the same data is
    then transformed with it.

    Parameters
    ----------
    financial_data : array-like
        Samples to reduce, one per row.
    n_components : int, optional
        Number of components to keep. Defaults to 2, the value that used to
        be hard-coded.

    Returns
    -------
    The transformed data as returned by ``RandomizedPCA.transform``.
    """
    reducer = RandomizedPCA(n_components=n_components)
    reducer.fit(financial_data)
    return reducer.transform(financial_data)
class RemoveRealtorTestModel(object):
    """Wrap a regression model with optional PCA, log and ratio features.

    Columns whose name contains ``'dvar'`` can be compressed with a
    ``RandomizedPCA``; columns whose name contains ``'price'`` or ``'taxes'``
    can be log-transformed; pairs of columns can be replaced by their ratio.
    The configuration is set with :meth:`init_final` and used by :meth:`fit`
    and :meth:`predict`; :meth:`cross_validate_model` takes its own settings.
    """

    def __init__(self):
        self.features = None
        self.pca_model = None
        self.scorespriced = None
        self.scores = None
        self.components = None
        self.log = None
        self.model = None
        self.divided_position = None

    def get_data(self, fquery, lookback, dataget=' select * from final_table;', target=None):
        """Load the data and split it at the 2014-10-01 hold-out cutoff.

        Parameters
        ----------
        fquery : str
            Path of a SQL file; ``'xxxxx'`` in it is replaced by ``lookback``.
        lookback : object
            Value substituted into the SQL text (converted with ``str``).
        dataget : str
            Statement whose result is read into the data frame.
        target : str or None
            Name of the target column. The original returned two names that
            were never assigned (a guaranteed NameError); with ``target=None``
            the two y values are now ``None``.
            NOTE(review): confirm which column is the intended target.

        Returns
        -------
        dftest, dftest_y, dfhold, dfhold_y
            Rows listed before / after the cutoff and their target columns.
            Rows listed exactly at the cutoff are in neither part.
        """
        # `with` closes the file handle the original leaked
        with open(fquery) as query_file:
            q = ' '.join(query_file.readlines())
        q = q.replace('\n', ' ')
        q = q.replace('xxxxx', str(lookback))
        engine = create_engine('postgresql://user@localhost:5432/mydb')
        q += dataget
        # NOTE(review): `q` is assembled above but only `dataget` is executed,
        # exactly as in the original. Confirm whether `q` was meant to run.
        df = pd.read_sql_query(dataget, con=engine)

        hold_out_cutoff = pd.Timestamp('2014-10-01')
        dftest = df[df.listdate < hold_out_cutoff].reset_index(drop=True)
        dfhold = df[df.listdate > hold_out_cutoff].reset_index(drop=True)

        dftest_y = dftest[target] if target is not None else None
        dfhold_y = dfhold[target] if target is not None else None
        return dftest, dftest_y, dfhold, dfhold_y

    def init_final(self, features, components, log, divided_position, model):
        """Store the configuration used by :meth:`fit` and :meth:`predict`."""
        self.features = features
        self.components = components
        self.log = log
        self.divided_position = divided_position
        self.model = model
        return None

    def fit(self, dfX, dfy):
        """Build the training frame from ``dfX`` and fit ``self.model``.

        ``dfX`` has its index reset in place. When ``self.components > 0`` the
        ``'dvar'`` columns are replaced by their PCA projection; when
        ``self.log`` is truthy the price/taxes columns and ``dfy`` are
        log-transformed; each ``(f1, f2)`` in ``self.divided_position`` is
        replaced by the ratio column ``f1_f2``.
        """
        start = time.time()
        dfX.reset_index(inplace=True, drop=True)
        boolvars = [col for col in dfX.columns.values if 'dvar' in col]

        if self.components > 0:
            self.pca_model = RandomizedPCA(n_components=self.components)
            self.pca_model.fit(dfX[boolvars].values)
            dftrain_bool = pd.DataFrame(self.pca_model.transform(dfX[boolvars].values))
            dftrain = pd.concat([dfX[self.features], dftrain_bool], axis=1)
        else:
            self.pca_model = None
            # copy: the helpers below modify the frame in place
            dftrain = dfX[self.features].copy()

        if self.log:
            dftrain = self._log_feature(dftrain)
            dfy = np.log(dfy.values)
            self.log = True
        else:
            self.log = False

        if self.divided_position is not None:
            # The original referenced undefined train_X/test_X here
            for i, j in self.divided_position:
                dftrain = self._divide_two_features(dftrain, i, j)

        self.model.fit(dftrain, dfy)
        print(self.model.get_params)
        print('model fit {} homes in {}'.format(dfX.shape[0], time.time() - start))
        return None

    def predict(self, dfX, gettree):
        """Predict with the fitted model, mirroring the preparation in :meth:`fit`.

        Parameters
        ----------
        dfX : DataFrame
            Rows to predict; its index is reset in place.
        gettree : bool
            When truthy, also collect the prediction of every estimator in
            ``self.model.estimators_``, concatenated into one flat array.

        Returns
        -------
        point_estimates, tree_estimates
            ``tree_estimates`` is ``0`` when ``gettree`` is falsy. Both are
            passed through ``np.exp`` when ``self.log`` is truthy.
        """
        start = time.time()
        dfX.reset_index(inplace=True, drop=True)
        boolvars = [col for col in dfX.columns.values if 'dvar' in col]

        if self.pca_model is not None:
            df_bool = pd.DataFrame(self.pca_model.transform(dfX[boolvars].values))
            dftest = pd.concat([dfX[self.features], df_bool], axis=1)
        else:
            dftest = dfX[self.features].copy()

        if self.log:
            dftest = self._log_feature(dftest)

        if self.divided_position is not None:
            # Same ratio features as fit(), so the model sees the same columns
            for i, j in self.divided_position:
                dftest = self._divide_two_features(dftest, i, j)

        point_estimates = self.model.predict(dftest)

        if gettree:
            # The original concatenated 1-D arrays with axis=1, which current
            # NumPy rejects; a flat concatenation is used instead.
            per_tree = [est.predict(dftest) for est in self.model.estimators_]
            tree_estimates = np.concatenate([np.array([])] + per_tree)
        else:
            tree_estimates = 0

        print('model predict (2x) {} in {}'.format(dfX.shape[0], time.time() - start))

        if self.log:
            return np.exp(point_estimates), np.exp(tree_estimates)
        else:
            return point_estimates, tree_estimates

    def df_time_iterator(self, df, cv, splittype):
        """Yield ``cv`` pairs of (train index, test index) split by date.

        ``splittype`` is ``'chunk'`` (windows of roughly equal length) or
        ``'forward'`` (training window grows with each step). A random offset
        of -8..7 days, drawn once, is applied to some of the boundaries.

        Raises
        ------
        ValueError
            For any other ``splittype``, when the first pair is requested.
        """
        minlistdate = df.listdate.min()
        maxlistdate = df.statusupdate.max()
        dt = ((maxlistdate - minlistdate).days / cv)
        c = 1
        dayrandom = np.random.randint(0, 16) - 8
        while c < cv + 1:
            # chunk does equal train/test splits
            if splittype == 'chunk':
                train = df[((df.listdate >= (minlistdate + timedelta(days=dt * (c - 1) + dayrandom))) &
                            (df.statuschangedate <= (minlistdate + timedelta(days=dt * (c)))))].index
                test = df[((df.listdate > (minlistdate + timedelta(days=dt * (c) - dayrandom))) &
                           (df.statuschangedate <= (minlistdate + timedelta(days=dt * (c + 1)))))].index
            # forward does train/test splits which grow in time
            elif splittype == 'forward':
                train = df[df.statuschangedate <= minlistdate + timedelta(days=dt * (c) - dayrandom)].index
                test = df[df.listdate > minlistdate + timedelta(days=dt * (c))].index
            else:
                # previously surfaced as an unbound-variable error at the yield
                raise ValueError("splittype must be 'chunk' or 'forward', got %r" % (splittype,))
            c += 1
            yield train, test

    def cross_validate_model(self, dfX, dfy, features, components, log, divided_position, model, cv):
        """Run shuffled K-fold cross-validation of ``model`` and print the scores.

        For every fold the same optional steps as in :meth:`fit` are applied
        (PCA on the ``'dvar'`` columns, ratio features, log transform). Three
        metrics are averaged over the folds, once on all test rows and once
        on rows whose prediction lies strictly between 100000 and 230000.
        ``dfX`` and ``dfy`` have their index reset in place.
        """
        kf = KFold(dfX.shape[0], n_folds=cv, shuffle=True)
        dfX.reset_index(inplace=True, drop=True)
        dfy.reset_index(inplace=True, drop=True)

        mets = [metrics.median_absolute_error, metrics.r2_score, self.percent_difference]
        scores = np.zeros(len(mets))
        scorespriced = np.zeros(len(mets))

        boolvars = [col for col in dfX.columns.values if 'dvar' in col]
        if components > 0:
            features = features + boolvars
        dfX = dfX[features]

        for train_index, test_index in kf:
            train_X, train_y = dfX.loc[train_index, :].copy(), dfy.loc[train_index].copy()
            test_X, test_y = dfX.loc[test_index, :].copy(), dfy.loc[test_index].copy()

            if components > 0:
                train_X, test_X = self._pca_dummies(components, boolvars, train_X, test_X)

            if divided_position is not None:
                for i, j in divided_position:
                    test_X = self._divide_two_features(test_X, i, j)
                    train_X = self._divide_two_features(train_X, i, j)

            self.features = train_X.columns.values

            if log:
                train_X = self._log_feature(train_X)
                train_y = np.log(train_y.values)
                test_X = self._log_feature(test_X)
                test_y = np.log(test_y.values)

            model.fit(train_X, train_y)
            ypred = model.predict(test_X)

            if log:
                # score in the original (non-log) units
                test_y = np.exp(test_y)
                ypred = np.exp(ypred)

            mask = (ypred > 100000) & (ypred < 230000)
            for i, m in enumerate(mets):
                scores[i] += m(test_y, ypred)
                scorespriced[i] += m(test_y[mask], ypred[mask])

        scores = scores / float(cv)
        scorespriced = scorespriced / float(cv)
        self.a_cved_model = model

        print('-' * 40)
        print('-' * 40)
        print(model.get_params)
        print('-' * 40)
        self._display_scoring_metrics(zip(mets, scores), 'full')
        print('-' * 40)
        self._display_scoring_metrics(zip(mets, scorespriced), 'priced')
        return None

    def scorer(self, ytrue, ypred):
        """Print the three metrics for all rows and for the 100000-230000 band."""
        mask = (ypred > 100000) & (ypred < 230000)
        # was self._percent_difference, which does not exist on this class
        mets = [metrics.median_absolute_error, metrics.r2_score, self.percent_difference]
        scores = np.zeros(len(mets))
        scorespriced = np.zeros(len(mets))
        for i, m in enumerate(mets):
            scores[i] += m(ytrue, ypred)
            scorespriced[i] += m(ytrue[mask], ypred[mask])

        print('full_size {}'.format(len(ytrue)))
        self._display_scoring_metrics(zip(mets, scores), 'full')
        print('-' * 40)
        print('full_size {}'.format(np.sum(mask)))
        self._display_scoring_metrics(zip(mets, scorespriced), 'priced')
        return None

    def percent_difference(self, ytest, ypred):
        """Return the mean absolute percentage difference of ``ypred`` from ``ytest``."""
        return np.mean(abs((ytest - ypred) / ytest) * 100.)

    def tree_importance(self, model, threshold):
        """Rank ``self.features`` by the importance ``model`` assigns to them.

        Returns
        -------
        fimport : DataFrame
            Columns ``feature``, ``importance``, ``std`` (standard deviation
            of the importance across ``model.estimators_``) and ``cumimport``
            (running total), sorted by descending importance.
        selected : list
            Features whose ``cumimport`` is below ``threshold``.
        """
        fimport = pd.DataFrame(list(zip(self.features, model.feature_importances_)),
                               columns=['feature', 'importance'])

        # Attach the per-tree spread BEFORE sorting: a bare array is assigned
        # by position, so doing it after the sort (as the original did) pairs
        # each std with the wrong feature.
        fvar = [est.feature_importances_ for est in model.estimators_]
        fimport['std'] = np.array(fvar).std(axis=0)

        # DataFrame.sort was replaced by sort_values in later pandas releases
        sort_by = getattr(fimport, 'sort_values', None)
        if sort_by is None:
            sort_by = fimport.sort
        fimport = sort_by('importance', ascending=False)

        fimport['cumimport'] = fimport['importance'].cumsum().values
        return fimport, fimport[fimport['cumimport'] < threshold]['feature'].tolist()

    def _pca_dummies(self, components, boolvars, dftrain, dftest):
        """Replace the ``boolvars`` columns of both frames by PCA columns.

        The PCA is fitted on ``dftrain`` only and stored in ``self.pca_model``.
        Both frames are modified in place (index reset, columns dropped); the
        returned frames carry the new columns ``pca0`` .. ``pca<components-1>``.
        """
        dftrain.reset_index(inplace=True, drop=True)
        dftest.reset_index(inplace=True, drop=True)

        self.pca_model = RandomizedPCA(n_components=components)
        self.pca_model.fit(dftrain[boolvars].values)
        dftrain_bool = pd.DataFrame(self.pca_model.transform(dftrain[boolvars].values))
        dftest_bool = pd.DataFrame(self.pca_model.transform(dftest[boolvars].values))

        dftrain.drop(boolvars, axis=1, inplace=True)
        dftest.drop(boolvars, axis=1, inplace=True)

        features = dftrain.columns.tolist() + ['pca' + str(i) for i in range(components)]
        dftrain = pd.concat([dftrain, dftrain_bool], axis=1, ignore_index=True)
        dftest = pd.concat([dftest, dftest_bool], axis=1, ignore_index=True)
        dftrain.columns = features
        dftest.columns = features
        return dftrain, dftest

    def _display_scoring_metrics(self, met_scores, label):
        """Print one ``'<label> <metric name>: <score>'`` line per pair."""
        for met, score in met_scores:
            # __name__ is available on Python 2 and 3 (func_name is Python 2 only)
            print(label + ' {}: {}'.format(met.__name__, np.round(score, 3)))

    def _log_feature(self, df):
        """Log-transform, in place, every column named like a price or tax."""
        for feature in [col for col in df.columns.values
                        if ('price' in str(col)) or ('taxes' in str(col))]:
            df[feature] = np.log(df[feature].values)
        return df

    def _divide_two_features(self, df, f1, f2):
        """Replace columns ``f1`` and ``f2`` by their ratio ``f1_f2`` (in place)."""
        df[f1 + '_' + f2] = df[f1] / df[f2]
        df.drop([f1, f2], axis=1, inplace=True)
        return df
def _report_predictions(acc, y_test, y_pred, target_names, n_classes):
    """Print the accuracy, classification report and confusion matrix."""
    print("accuracy = %s" % acc)
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))


def _majority_vote_accuracy(y_preds, y_test):
    """Return the percent accuracy of a per-sample majority vote over y_preds."""
    # np.bincount only accepts integer input, hence the explicit cast.
    votes = np.array(y_preds, dtype=np.int32)
    num_samples = votes.shape[1]
    majority_pred = np.array(
        [np.bincount(votes[:, sample]).argmax()
         for sample in range(num_samples)],
        dtype=np.int32)
    return np.sum(majority_pred == y_test) * 100.0 / num_samples


def _evaluate_combinations(assemble, scores, y_test, target_names, n_classes):
    """Score every metric-learner combination and their majority vote.

    ``assemble`` is called with one combination and must return
    ``(accuracy, y_pred)``. Accuracies are appended to ``scores`` in place;
    the list of labels describing each appended score is returned.
    """
    ml_strs = []
    y_preds = []
    for ml in list_mls(['lmnn', 'lsml', 'rca', 'lfda', 'ldml']):
        if len(ml) == 0:
            continue
        print(ml)
        acc, y_pred = assemble(ml)
        y_preds.append(y_pred)
        scores.append(acc)
        ml_strs.append(getStr(ml))
        _report_predictions(acc, y_test, y_pred, target_names, n_classes)
    # Without any prediction there is nothing to vote on.
    if y_preds:
        scores.append(_majority_vote_accuracy(y_preds, y_test))
        ml_strs.append('all')
    return ml_strs


def main(opt_list, arg_list, runall=False):
    """Benchmark metric-learning ensembles, an SVM and LDA on LFW eigenfaces.

    Args:
        opt_list: ``"serial"`` to use ``assemble_series`` or ``"parallel"``
            to use ``assemble_parallel``; any other value skips the
            metric-learning stage.
        arg_list: container of the serial modes to run when ``runall`` is
            set; recognised entries are ``'soft_unw'``, ``'soft_wei'``,
            ``'hard_wei'`` and ``'hard_unw'``.
        runall: when true, every combination returned by ``list_mls`` is
            evaluated and the collected scores are returned.

    Returns:
        ``(ml_strs, accuracies)`` when ``runall`` is true, where ``ml_strs``
        is a comma-separated string of labels and ``accuracies`` maps each
        mode to its scores; ``None`` otherwise.
    """
    accuracies = {
        'soft_unw': [],
        'soft_wei': [],
        'hard_wei': [],
        'hard_unw': [],
        'svm': 0,
        'lda': 0
    }
    # Bound up front so the final bookkeeping works even if no mode ran.
    ml_strs = []

    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

    ###########################################################################
    # Download the data, if not already on disk and load it as numpy arrays
    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
    n_samples = lfw_people.images.shape[0]

    # for machine learning we use the data directly (as relative pixel
    # positions info is ignored by this model)
    X = lfw_people.data
    n_features = X.shape[1]

    # the label to predict is the id of the person
    y = lfw_people.target
    target_names = lfw_people.target_names
    n_classes = target_names.shape[0]

    print("Total dataset size:")
    print("n_samples: %d" % n_samples)
    print("n_features: %d" % n_features)
    print("n_classes: %d" % n_classes)

    ###########################################################################
    # Split into a training and testing set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)

    ###########################################################################
    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    n_components = 150
    print("Extracting the top %d eigenfaces from %d faces"
          % (n_components, X_train.shape[0]))
    t0 = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    print("done in %0.3fs" % (time() - t0))

    print("Projecting the input data on the eigenfaces orthonormal basis")
    t0 = time()
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done in %0.3fs" % (time() - t0))

    # (accuracies key, voting scheme, weighting scheme), in evaluation order.
    serial_modes = (
        ('soft_unw', 'soft', 'unweighted'),
        ('soft_wei', 'soft', 'weighted'),
        ('hard_wei', 'hard', 'weighted'),
        ('hard_unw', 'hard', 'unweighted'),
    )

    # Compare by value: identity (`is`) against a string literal only
    # succeeds when the caller's string happens to be interned.
    if opt_list == "serial":
        if not runall:
            acc, y_pred = assemble_series(
                X_train_pca, y_train, X_test_pca, y_test,
                ['lmnn', 'lsml', 'rca', 'ldml', 'lfda'], 'soft', "weighted")
            _report_predictions(acc, y_test, y_pred, target_names, n_classes)
        else:
            for key, voting, weighting in serial_modes:
                if key not in arg_list:
                    continue

                # Defaults bind the current mode (no late-binding surprise).
                def assemble(ml, voting=voting, weighting=weighting):
                    return assemble_series(X_train_pca, y_train, X_test_pca,
                                           y_test, ml, voting, weighting)

                ml_strs = _evaluate_combinations(
                    assemble, accuracies[key], y_test, target_names,
                    n_classes)
                cleanCachedMls()

    if opt_list == "parallel":
        # TODO: Opt for the parallel thread implementation.
        if not runall:
            a = time()
            acc, y_pred = assemble_parallel(X_train_pca, y_train,
                                            X_test_pca, y_test, 'hard')
            _report_predictions(acc, y_test, y_pred, target_names, n_classes)
            b = time()
            print("Total time taken for all this: {0}".format(b - a))
        else:
            # NOTE(review): the combination is not forwarded to
            # assemble_parallel, so every iteration evaluates the same
            # ensemble -- confirm against assemble_parallel's signature.
            def assemble(ml):
                return assemble_parallel(X_train_pca, y_train, X_test_pca,
                                         y_test, 'hard')

            # 'hard' is not among the predefined keys; create it on demand.
            ml_strs = _evaluate_combinations(
                assemble, accuracies.setdefault('hard', []), y_test,
                target_names, n_classes)

    ###########################################################################
    print("Without the LMNN structure")
    # Train a SVM classification model
    print("Fitting the classifier to the training set")
    t0 = time()
    param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
                  'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
    clf = GridSearchCV(SVC(kernel='rbf'), param_grid)
    clf.fit(X_train_pca, y_train)
    print("done in %0.3fs" % (time() - t0))
    print("Best estimator found by grid search:")
    print(clf.best_estimator_)

    ###########################################################################
    # Quantitative evaluation of the model quality on the test set
    print("Predicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    svm_acc = 100.0 * np.sum(y_pred == y_test) / len(y_test)
    print("accuracy = %s" % svm_acc)
    print("done in %0.3fs" % (time() - t0))
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

    # Same evaluation with a plain LDA classifier (no grid search involved).
    print("Fitting the classifier to the training set")
    t0 = time()
    clf = LDA()
    clf.fit(X_train_pca, y_train)
    print("done in %0.3fs" % (time() - t0))

    ###########################################################################
    # Quantitative evaluation of the model quality on the test set
    print("Predicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    lda_acc = 100.0 * np.sum(y_pred == y_test) / len(y_test)
    print("accuracy = %s" % lda_acc)
    print("done in %0.3fs" % (time() - t0))
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

    if runall:
        accuracies['svm'] = svm_acc
        accuracies['lda'] = lda_acc
        ml_strs.append('svm')
        ml_strs.append('lda')
        return ", ".join(ml_strs), accuracies
    return None
n_components = 150 #組成元素的數量 print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])) t0 = time() #print(t0) pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) #隨機隆維 print("done in %0.3fs" % (time() - t0)) eigenfaces = pca.components_.reshape( (n_components, h, w)) #從人臉提取一些特徵點,叫做eigenface print("Projecting te input data on the eigenfaces orthonormal basis") t0 = time() #print(t0) X_train_pca = pca.transform(X_train) #針對X_train執行降維動作 X_test_pca = pca.transform(X_test) #針對X_test執行降維動作 print("done in %0.3fs" % (time() - t0)) #=========================================================================================== # Train a SCM classification model print("fitting the classifier to the training set") t0 = time() param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } #嚐試不同參數。共30種組合 # C:float,optional(default=1.0), Penalty parameter C of the error term 對錯誤進行懲罰,權重 # gamma:float, optional(default=0.0) Kernel coefficient for 'rbf", "poly" and "sigmoid" # If gamma is 0.0 then 1/n_features will be used instead 多少特徵點會被使用將有一個比例
class MKSHomogenizationModel(BaseEstimator):

    """
    The `MKSHomogenizationModel` takes in microstructures and their
    associated macroscopic property, and creates a low dimensional structure
    property linkage. The `MKSHomogenizationModel` model is designed to
    integrate with dimensionality reduction techniques and predictive models.

    Attributes:
        degree: Degree of the polynomial used by `property_linker`.
        n_components: Number of components used by `dimension_reducer`.
        dimension_reducer: Instance of a dimensionality reduction class.
        property_linker: Instance of class that maps materials property to the
            microstructures.
        correlations: spatial correlations to be computed
        basis: instance of a basis class
        reduced_fit_data: Low dimensionality representation of spatial
            correlations used to fit the model.
        reduced_predict_data: Low dimensionality representation of spatial
            correlations predicted by the model.

    Below is an example of using MKSHomogenizationModel to predict (or
    classify) the type of microstructure using PCA and Logistic Regression.

    >>> n_states = 3
    >>> domain = [-1, 1]

    >>> from pymks.bases import LegendreBasis
    >>> leg_basis = LegendreBasis(n_states=n_states, domain=domain)
    >>> from sklearn.decomposition import PCA
    >>> from sklearn.linear_model import LogisticRegression
    >>> reducer = PCA(n_components=3)
    >>> linker = LogisticRegression()
    >>> model = MKSHomogenizationModel(
    ...     basis=leg_basis, dimension_reducer=reducer, property_linker=linker)
    >>> from pymks.datasets import make_cahn_hilliard
    >>> X0, X1 = make_cahn_hilliard(n_samples=50)
    >>> y0 = np.zeros(X0.shape[0])
    >>> y1 = np.ones(X1.shape[0])

    >>> X = np.concatenate((X0, X1))
    >>> y = np.concatenate((y0, y1))

    >>> model.fit(X, y)

    >>> X0_test, X1_test = make_cahn_hilliard(n_samples=3)
    >>> y0_test = model.predict(X0_test)
    >>> y1_test = model.predict(X1_test)
    >>> assert np.allclose(y0_test, [0, 0, 0])
    >>> assert np.allclose(y1_test, [1, 1, 1])
    """

    def __init__(self, basis=None, dimension_reducer=None, n_components=None,
                 property_linker=None, degree=1, correlations=None,
                 compute_correlations=True):
        """
        Create an instance of a `MKSHomogenizationModel`.

        Args:
            basis (class, optional): an instance of a bases class.
            dimension_reducer (class, optional): an instance of a
                dimensionality reduction class with a fit_transform method.
            property_linker (class, optional): an instance for a machine
                learning class with fit and predict methods.
            n_components (int, optional): number of components kept by the
                dimension_reducer
            degree (int, optional): degree of the polynomial used by
                property_linker.
            correlations (list, optional): list of spatial correlations to
                compute, default is the autocorrelation with the first local
                state and all of its cross correlations. For example if basis
                has n_states=3, correlation would be [(0, 0), (0, 1), (0, 2)]
            compute_correlations (boolean, optional): If false spatial
                correlations will not be calculated as part of the fit and
                predict methods. The spatial correlations can be passed as `X`
                to both methods, default is True.

        Raises:
            RuntimeError: if `dimension_reducer` lacks `fit_transform` or
                `transform`, or `property_linker` lacks `fit` or `predict`.
        """

        self.basis = basis
        self.dimension_reducer = dimension_reducer
        if self.dimension_reducer is None:
            self.dimension_reducer = RandomizedPCA()
        # Fall back to the reducer's own setting, then to 2 components.
        if n_components is None:
            n_components = self.dimension_reducer.n_components
        if n_components is None:
            n_components = 2
        if property_linker is None:
            property_linker = LinearRegression()
        if (correlations is None and basis is not None and
                compute_correlations is True):
            correlations = [(0, l) for l in range(basis.n_states)]
        self._linker = Pipeline([('poly', PolynomialFeatures(degree=degree)),
                                 ('connector', property_linker)])
        # Validate both collaborators now that the pipeline exists.
        self._check_methods()
        # The assignments below go through the property setters, which push
        # the values into the reducer and the pipeline.
        self.degree = degree
        self.n_components = n_components
        self.property_linker = property_linker
        self.correlations = correlations
        self._fit = False
        self.compute_correlations = compute_correlations
        self.reduced_fit_data = None
        self.reduced_predict_data = None

    @property
    def n_components(self):
        """Number of components used by the dimension_reducer."""
        return self._n_components

    @n_components.setter
    def n_components(self, value):
        """Setter for the number of components used by the
        dimension_reducer.
        """
        self._n_components = value
        self.dimension_reducer.n_components = value

    @property
    def degree(self):
        """Polynomial degree used by the property_linker pipeline."""
        return self._degree

    @degree.setter
    def degree(self, value):
        """Setter for the polynomial degree for property_linker.
        """
        self._degree = value
        self._linker.set_params(poly__degree=value)

    @property
    def property_linker(self):
        """Estimator used as the final step of the linker pipeline."""
        return self._property_linker

    @property_linker.setter
    def property_linker(self, prop_linker):
        """Setter for the property_linker class.
        """
        self._property_linker = prop_linker
        self._linker.set_params(connector=prop_linker)

    def _check_methods(self):
        """
        Helper function to check that the dimensionality reduction and
        property linking objects have the appropriate methods.

        Raises:
            RuntimeError: if a required method is missing.
        """
        if not callable(getattr(self.dimension_reducer, "fit_transform", None)):
            raise RuntimeError(
                "dimension_reducer does not have fit_transform() method.")
        if not callable(getattr(self.dimension_reducer, "transform", None)):
            raise RuntimeError(
                "dimension_reducer does not have transform() method.")
        # The pipeline delegates fit/predict to the property_linker.
        if not callable(getattr(self._linker, "fit", None)):
            raise RuntimeError("property_linker does not have fit() method.")
        if not callable(getattr(self._linker, "predict", None)):
            raise RuntimeError(
                "property_linker does not have predict() method.")

    def fit(self, X, y, reduce_labels=None,
            periodic_axes=None, confidence_index=None, size=None):
        """
        Fits data by calculating 2-point statistics from X, performing
        dimension reduction using dimension_reducer, and fitting the reduced
        data with the property_linker.

        Args:
            X (ND array): The microstructures or spatial correlations, a
                `(n_samples, n_x, ...)` shaped array where `n_samples` is the
                number of samples and `n_x` is the spatial discretization.
            y (1D array): The material property associated with `X`.
            reduce_labels (1D array, optional): label for X used during the
                fit_transform method for the `dimension_reducer`.
            periodic_axes (list, optional): axes that are periodic. (0, 2)
                would indicate that axes x and z are periodic in a 3D
                microstructure.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.
            size (tuple, optional): when given (and correlations are
                computed), `X` is reshaped to `(n_samples,) + size` before
                the spatial correlations are calculated.

        Example

        >>> from sklearn.decomposition import PCA
        >>> from sklearn.linear_model import LinearRegression
        >>> from pymks.bases import PrimitiveBasis
        >>> from pymks.stats import correlate

        >>> reducer = PCA(n_components=2)
        >>> linker = LinearRegression()
        >>> prim_basis = PrimitiveBasis(n_states=2, domain=[0, 1])
        >>> correlations = [(0, 0), (1, 1), (0, 1)]
        >>> model = MKSHomogenizationModel(prim_basis,
        ...                                dimension_reducer=reducer,
        ...                                property_linker=linker,
        ...                                correlations=correlations)
        >>> np.random.seed(99)
        >>> X = np.random.randint(2, size=(3, 15))
        >>> y = np.array([1, 2, 3])
        >>> model.fit(X, y)
        >>> X_ = prim_basis.discretize(X)
        >>> X_stats = correlate(X_)
        >>> X_reshaped = X_stats.reshape((X_stats.shape[0], X_stats[0].size))
        >>> X_pca = reducer.fit_transform(X_reshaped - np.mean(X_reshaped,
        ...                               axis=1)[:, None])
        >>> assert np.allclose(model.reduced_fit_data, X_pca)

        Now let's use the same method with spatial correlations instead of
        microstructures.

        >>> from sklearn.decomposition import PCA
        >>> from sklearn.linear_model import LinearRegression
        >>> from pymks.bases import PrimitiveBasis
        >>> from pymks.stats import correlate

        >>> reducer = PCA(n_components=2)
        >>> linker = LinearRegression()
        >>> prim_basis = PrimitiveBasis(n_states=2, domain=[0, 1])
        >>> correlations = [(0, 0), (1, 1), (0, 1)]
        >>> model = MKSHomogenizationModel(dimension_reducer=reducer,
        ...                                property_linker=linker,
        ...                                compute_correlations=False)
        >>> np.random.seed(99)
        >>> X = np.random.randint(2, size=(3, 15))
        >>> y = np.array([1, 2, 3])
        >>> X_ = prim_basis.discretize(X)
        >>> X_stats = correlate(X_, correlations=correlations)
        >>> model.fit(X_stats, y)
        >>> X_reshaped = X_stats.reshape((X_stats.shape[0], X_stats[0].size))
        >>> X_pca = reducer.fit_transform(X_reshaped - np.mean(X_reshaped,
        ...                               axis=1)[:, None])
        >>> assert np.allclose(model.reduced_fit_data, X_pca)


        """
        if self.compute_correlations is True:
            if periodic_axes is None:
                periodic_axes = []
            if size is not None:
                new_shape = (X.shape[0], ) + size
                X = X.reshape(new_shape)
            X = self._correlate(X, periodic_axes, confidence_index)
        X_reshape = self._reduce_shape(X)
        X_reduced = self.dimension_reducer.fit_transform(
            X_reshape, reduce_labels)
        self._linker.fit(X_reduced, y)
        self.reduced_fit_data = X_reduced
        self._fit = True

    def predict(self, X, periodic_axes=None, confidence_index=None):
        """Predicts macroscopic property for the microstructures `X`.

        Args:
            X (ND array): The microstructure, an `(n_samples, n_x, ...)`
                shaped array where `n_samples` is the number of samples and
                `n_x` is the spatial discretization.
            periodic_axes (list, optional): axes that are periodic. (0, 2)
                would indicate that axes x and z are periodic in a 3D
                microstructure.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Returns:
            The predicted macroscopic property for `X`.

        Raises:
            RuntimeError: if called before `fit()`.

        Example

        >>> from sklearn.manifold import LocallyLinearEmbedding
        >>> from sklearn.linear_model import BayesianRidge
        >>> from pymks.bases import PrimitiveBasis
        >>> np.random.seed(99)
        >>> X = np.random.randint(2, size=(50, 100))
        >>> y = np.random.random(50)
        >>> reducer = LocallyLinearEmbedding()
        >>> linker = BayesianRidge()
        >>> prim_basis = PrimitiveBasis(2, domain=[0, 1])
        >>> model = MKSHomogenizationModel(prim_basis, n_components=2,
        ...                                dimension_reducer=reducer,
        ...                                property_linker=linker)
        >>> model.fit(X, y)
        >>> X_test = np.random.randint(2, size=(1, 100))

        Predict with microstructures

        >>> y_pred = model.predict(X_test)

        Predict with spatial correlations

        >>> from pymks.stats import correlate
        >>> model.compute_correlations = False
        >>> X_ = prim_basis.discretize(X_test)
        >>> X_corr = correlate(X_, correlations=[(0, 0), (0, 1)])
        >>> y_pred_stats = model.predict(X_corr)
        >>> assert y_pred_stats == y_pred

        """
        if not self._fit:
            raise RuntimeError('fit() method must be run before predict().')
        if self.compute_correlations is True:
            if periodic_axes is None:
                periodic_axes = []
            X = self._correlate(X, periodic_axes, confidence_index)
        X_reshape = self._reduce_shape(X)
        X_reduced = self.dimension_reducer.transform(X_reshape)
        self.reduced_predict_data = X_reduced
        return self._linker.predict(X_reduced)

    def _correlate(self, X, periodic_axes, confidence_index):
        """
        Helper function used to calculate 2-point statistics from `X` and
        reshape them appropriately for fit and predict methods.

        Args:
            X (ND array): The microstructure, an `(n_samples, n_x, ...)`
                shaped array where `n_samples` is the number of samples and
                `n_x` is the spatial discretization.
            periodic_axes (list, optional): axes that are periodic. (0, 2)
                would indicate that axes x and z are periodic in a 3D
                microstructure.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Returns:
            Spatial correlations for each sample formatted with dimensions
            (n_samples, n_features).

        Raises:
            AttributeError: if the model has no basis.

        Example

        >>> from sklearn.manifold import Isomap
        >>> from sklearn.linear_model import ARDRegression
        >>> from pymks.bases import PrimitiveBasis
        >>> reducer = Isomap()
        >>> linker = ARDRegression()
        >>> prim_basis = PrimitiveBasis(2, [0, 1])
        >>> model = MKSHomogenizationModel(prim_basis,
        ...                                dimension_reducer=reducer,
        ...                                property_linker=linker)
        >>> X = np.array([[0, 1],
        ...               [1, 0]])
        >>> X_stats = model._correlate(X, [], None)
        >>> X_test = np.array([[[ 0, 0],
        ...                     [0.5, 0]],
        ...                    [[0, 1,],
        ...                     [0.5, 0]]])
        >>> assert np.allclose(X_test, X_stats)
        """
        if self.basis is None:
            raise AttributeError('basis must be specified')
        X_ = self.basis.discretize(X)
        X_stats = correlate(X_, periodic_axes=periodic_axes,
                            confidence_index=confidence_index,
                            correlations=self.correlations)
        return X_stats

    def _reduce_shape(self, X_stats):
        """
        Helper function used to reshape 2-point statistics appropriately for
        fit and predict methods.

        Args:
            `X_stats`: The discretized microstructure function, an
                `(n_samples, n_x, ..., n_states)` shaped array
                Where `n_samples` is the number of samples, `n_x` is the
                spatial discretization, and n_states is the number of local
                states.

        Returns:
            Spatial correlations for each sample formatted with dimensions
            (n_samples, n_features), with each sample's mean subtracted.

        Example
        >>> X_stats = np.zeros((2, 2, 2, 2))
        >>> X_stats[1] = 3.
        >>> X_stats[..., 1] = 1.
        >>> X_results = np.array([[-.5,  .5, -.5,  .5, -.5,  .5, -.5, 0.5],
        ...                       [1., -1.,  1., -1.,  1., -1.,  1., -1.]])
        >>> from pymks import PrimitiveBasis
        >>> prim_basis = PrimitiveBasis(2)
        >>> model = MKSHomogenizationModel(prim_basis)
        >>> assert np.allclose(X_results, model._reduce_shape(X_stats))
        """
        # Flatten everything but the sample axis, then centre each row.
        X_reshaped = X_stats.reshape((X_stats.shape[0], X_stats[0].size))
        return X_reshaped - np.mean(X_reshaped, axis=1)[:, None]

    def score(self, X, y, periodic_axes=None, confidence_index=None):
        """
        The score function for the MKSHomogenizationModel. It formats the
        data and uses the score method from the property_linker.

        Args:
            X (ND array): The microstructure, an `(n_samples, n_x, ...)`
                shaped array where `n_samples` is the number of samples and
                `n_x` is the spatial discretization. When
                `compute_correlations` is False, `X` is taken to already be
                the spatial correlations, as in `fit` and `predict`.
            y (1D array): The material property associated with `X`.
            periodic_axes (list, optional): axes that are periodic. (0, 2)
                would indicate that axes x and z are periodic in a 3D
                microstructure.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Returns:
             Score for MKSHomogenizationModel from the selected
             property_linker.

        Raises:
            RuntimeError: if the property_linker has no score() method.
        """
        if periodic_axes is None:
            periodic_axes = []
        if not callable(getattr(self._linker, "score", None)):
            raise RuntimeError("property_linker does not have score() method.")
        # Mirror fit()/predict(): only correlate raw microstructures.
        if self.compute_correlations is True:
            X = self._correlate(X, periodic_axes, confidence_index)
        X_reshaped = self._reduce_shape(X)
        X_reduced = self.dimension_reducer.transform(X_reshaped)
        return self._linker.score(X_reduced, y)
except IndexError: print "Please specify trainingfile.csv testingfile.csv NumComponents" sys.exit(1) traindf = pandas.read_csv(trainfile) testdf = pandas.read_csv(testfile) columns = ["tfidfpca_%s" % x for x in xrange(ncomponents)] trainCleanEssay = traindf.essay.str.decode('mac-roman') testCleanEssay = testdf.essay.str.decode('mac-roman') vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english") trainvec = vectorizer.fit_transform(trainCleanEssay) testvec = vectorizer.transform(testCleanEssay) pca = RandomizedPCA(n_components=ncomponents) pca.fit(trainvec) trainpca = pca.transform(trainvec) trainpcadf = pandas.DataFrame(trainpca, columns=columns) testpca = pca.transform(testvec) testpcadf = pandas.DataFrame(testpca, columns=columns) traindf = traindf.combine_first(trainpcadf) testdf = testdf.combine_first(testpcadf) nf = lambda x: os.path.splitext(os.path.basename(x))[0] + "_tfidf.csv" traindf.to_csv(nf(trainfile)) testdf.to_csv(nf(testfile)) print "+".join(columns)
### using our testing script. Check the tester.py script in the final project ### folder for details on the evaluation method, especially the test_classifier ### function. Because of the small size of the dataset, the script uses ### stratified shuffle split cross validation. For more info: ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html # Example starting point. Try investigating other evaluation techniques! from sklearn.cross_validation import train_test_split features_train, features_test, labels_train, labels_test = \ train_test_split(features, labels, test_size=0.3, random_state=42) #use pca from sklearn.decomposition import RandomizedPCA n_components = 1 pca = RandomizedPCA(n_components=n_components, whiten=True).fit(features_train) pca_train = pca.transform(features_train) pca_test = pca.transform(features_test) #use pca train data # clf.fit(pca_train,labels_train) # print clf.score(pca_test,labels_test) clf.fit(features_train, labels_train) print clf.score(features_test, labels_test) ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. dump_classifier_and_data(clf, my_dataset, features_list)
pre_dispatch=1))]) print '\nGridSearchCV finished\n', clf pca.fit(finance_features) print '\npca.explained_variance_ratio_', pca.explained_variance_ratio_, '\n' print 'CLF', clf #print '\nBest estimator:', clf.best_estimator_ # extraction of components to plot them print '\npca.explained_variance_ratio_', pca.explained_variance_ratio_, '\n' financial_pc1 = pca.components_[0] financial_pc2 = pca.components_[1] transformed_data = pca.transform(features) for ii, jj in zip(transformed_data, features): plt.scatter(financial_pc1[0] * ii[0], financial_pc1[1] * ii[0], color='r') plt.scatter(financial_pc2[0] * ii[0], financial_pc2[1] * ii[0], color='c') plt.scatter(jj[0], jj[1], color="b") plt.xlabel("bonus") plt.ylabel("long-term incentive") plt.show() clf.fit(features, labels) ######################################################################## ### Task 5: Tune your classifier to achieve better than .3 precision and recall ### using our testing script. ### Because of the small size of the dataset, the script uses stratified
# and an array with ID of the people on each image y X = np.zeros([NUM_TRAINIMAGES, IMG_RES], dtype='int8') y = [] # Populate training array with flattened imags from subfolders of train_faces and names c = 0 for x, folder in enumerate(folders): train_faces = glob.glob(folder + '/*') for i, face in enumerate(train_faces): X[c, :] = prepare_image(face) y.append(ID_from_filename(face)) c = c + 1 # perform principal component analysis on the images pca = RandomizedPCA(n_components=NUM_EIGENFACES, whiten=True).fit(X) X_pca = pca.transform(X) while 1: r = "" # load test faces (usually one), located in folder test_faces test_faces = glob.glob('test_faces/*') # Create an array with flattened images X X = np.zeros([len(test_faces), IMG_RES], dtype='int8') # Populate test array with flattened imags from subfolders of train_faces for i, face in enumerate(test_faces): img = cv2.imread(face) if img is not None: X[i, :] = prepare_image(face)
def feature_reduction_pca(full_features, n_components=8, whiten=True):
    """Project ``full_features`` onto its leading principal components.

    A ``RandomizedPCA`` is fitted on ``full_features`` and the same data is
    then transformed with it.

    Args:
        full_features: feature matrix to fit the PCA on and to project.
        n_components: number of components to keep (default 8, the value
            that used to be hard-coded).
        whiten: forwarded to ``RandomizedPCA`` (default True, as before).

    Returns:
        The transformed feature matrix returned by ``pca.transform``.
    """
    pca = RandomizedPCA(n_components=n_components,
                        whiten=whiten).fit(full_features)
    return pca.transform(full_features)
# Fragment of an eigenfaces + SVM script. The Chinese strings below are
# runtime output and are intentionally left untouched.

# Hold out 25% of the samples for testing (no fixed random_state, so the
# split differs between runs).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

n_components = 150
print("extracting the top %d eigenfaces from the %d faces" % (n_components, x_train.shape[0]))
start_time = time()
# Fit the PCA on the training split only.
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(x_train)
print("运行 %0.3fs" % (time() - start_time))  # message: elapsed time

# Reshape each component back to image dimensions (h, w come from outside
# this fragment).
eigenfaces = pca.components_.reshape((n_components, h, w))

print("将输入数据降维")  # message: reducing the dimensionality of the input data
start_time = time()
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)
print("运行 %0.3fs" % (time() - start_time))  # message: elapsed time

print("分类数据集的拟合")  # message: fitting the classifier to the dataset
start_time = time()
# 5 values of C x 6 values of gamma = 30 candidate settings.
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight="balanced"), param_grid)
clf = clf.fit(x_train_pca, y_train)
print("运行 %0.3fs" % (time() - start_time))  # message: elapsed time
print("grid search 最佳估计:")  # message: best estimator found by grid search
print(clf.best_estimator_)
n_components = 10 cv2.destroyAllWindows() pca = RandomizedPCA(n_components=n_components, whiten=True) param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid) testing_data = [] for i in range(len(images)): testing_data.append(images[i].flatten()) pca = pca.fit(testing_data) transformed = pca.transform(testing_data) clf.fit(transformed, labels) image_paths = [ os.path.join(directory, filename) for filename in os.listdir(directory) if filename.endswith('sad') ] for image_path in image_paths: pred_image_pil = Image.open(image_path).convert('L') pred_image = np.array(pred_image_pil, 'uint8') faces = faceCascade.detectMultiScale(pred_image) for (x, y, w, h) in faces: X_test = pca.transform( np.array(pred_image[y:y + col, x:x + row]).flatten()) mynbr = clf.predict(X_test) nbr_act = int(
def _leading_fraction(items):
    """Return the leading PERCENTAGE_DATA_SET_TO_USE share of ``items``.

    The cut-off is truncated to an int because slice bounds must be integers,
    while ``len(items) * PERCENTAGE_DATA_SET_TO_USE`` is a float whenever the
    percentage is fractional.
    """
    return items[:int(len(items) * PERCENTAGE_DATA_SET_TO_USE)]


def _build_dataset(samples, targets):
    """Pack samples and their labels into a one-of-many ClassificationDataSet.

    ``targets`` is indexed by sample position, so it may be longer than
    ``samples`` but must not be shorter.
    """
    dataset = ClassificationDataSet(N_COMPONENTS, target=1, nb_classes=2)
    for i in range(len(samples)):
        dataset.addSample(samples[i], targets[i])
    dataset._convertToOneOfMany()
    return dataset


def _build_trainer(network, dataset):
    """Create the BackpropTrainer with the hyper-parameters used by main()."""
    return BackpropTrainer(
        network,
        dataset=dataset,
        momentum=0.1,
        learningrate=0.01,
        verbose=True)


def main():
    """Train or load the PCA + neural-network model and optionally test it.

    CLI arguments allowed:
        --display_graphs        Displays graphs
        --retrain               Trains a new model
        --cross-validate        Runs cross validation to fine tune the model
                                (this function itself does not inspect
                                that flag)
        --test=validation_set   Tests the latest trained model against the
                                validation set
        --test=test_set         Tests the latest trained model against the
                                test set

    Side effects: rebinds the module globals ``trainer`` and ``classifier``.
    """
    global trainer, classifier

    (inputs_train, targets_train,
     inputs_valid, targets_valid,
     inputs_test, targets_test) = load_parsed_data()

    if '--display_graphs' in sys.argv:
        # NOTE(review): this binds a local that nothing in main() reads;
        # presumably a module-level flag was intended -- confirm.
        display_graphs = True

    print('using {} percent of all data in corpus'.format(PERCENTAGE_DATA_SET_TO_USE*100))
    print('using {} most common words as features'.format(NUM_FEATURES))

    if not trained_model_exists() or '--retrain' in sys.argv:
        train_features, valid_features, test_features = extract_features(
            _leading_fraction(inputs_train),
            _leading_fraction(targets_train),
            _leading_fraction(inputs_valid),
            _leading_fraction(targets_valid),
            _leading_fraction(inputs_test),
            _leading_fraction(targets_test)
        )
        save_features(train_features, valid_features, test_features)

        pca = RandomizedPCA(n_components=N_COMPONENTS, whiten=False).fit(train_features)
        save_pca(pca)
        print("Saved PCA")

        X_train = pca.transform(train_features)
        X_valid = pca.transform(valid_features)
        pca = None  # drop the reference so the model can be collected
        print("Created PCAd features")

        # Validation samples must be paired with the validation labels
        # (the previous code used targets_test here).
        valid_data = _build_dataset(X_valid, targets_valid)
        X_valid = None

        train_data = _build_dataset(X_train, targets_train)
        X_train = None

        classifier = buildNetwork(
            train_data.indim,
            N_HIDDEN,
            train_data.outdim,
            outclass=SoftmaxLayer)
        trainer = _build_trainer(classifier, train_data)

        train_model(train_data, valid_data)
        save_model(classifier)
        train_data = None
        valid_data = None
    else:
        train_features, valid_features, test_features = load_features()

        pca = load_pca()
        X_train = pca.transform(train_features)
        pca = None
        print("Created PCAd features")

        train_data = _build_dataset(X_train, targets_train)
        X_train = None

        classifier = load_trained_model()
        trainer = _build_trainer(classifier, train_data)

    if '--test=validation_set' in sys.argv:
        print("Running against validation set")
        pca = load_pca()
        X_valid = pca.transform(valid_features)
        pca = None

        # Same label fix as above: validation features, validation labels.
        valid_data = _build_dataset(X_valid, targets_valid)
        X_valid = None

        make_prediction(valid_data)

    if '--test=test_set' in sys.argv:
        print("Running against test set")
        pca = load_pca()
        X_test = pca.transform(test_features)
        pca = None

        test_data = _build_dataset(X_test, targets_test)

        y_pred = trainer.testOnClassData(dataset=test_data)
        plot_precision_and_recall(y_pred, _leading_fraction(targets_test))
        X_test = None

        make_prediction(test_data)
def main():
    """Sweep the PCA size on the LFW faces and print one report per size.

    For every entry of ``N_COMPONENTS`` a whitened ``RandomizedPCA`` is
    fitted on the training split, an RBF ``SVC`` is grid-searched on the
    projected data, and the classification report for the test split is
    printed. The function returns right after the sweep, so the plotting
    code at the bottom is never executed.
    """
    # Display progress logs on stdout.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

    # Download the data if it is not on disk yet and load it as numpy arrays.
    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

    # The image array shape gives the sample count and the plot dimensions.
    n_samples, h, w = lfw_people.images.shape
    np.random.seed(42)

    # The model works on the flat pixel data; relative pixel positions are
    # ignored.
    X = lfw_people.data
    n_features = X.shape[1]

    # The label to predict is the id of the person.
    y = lfw_people.target
    target_names = lfw_people.target_names
    n_classes = target_names.shape[0]

    print("Total dataset size:")
    print("n_samples: %d" % n_samples)
    print("n_features: %d" % n_features)
    print("n_classes: %d" % n_classes)

    # Split into a training and a testing set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)

    def xprint(*args, **kwargs):
        """Silenced stand-in for print(); arguments are still evaluated."""
        pass

    N_COMPONENTS = [10, 15, 25, 50, 100, 250]
    for n_components in N_COMPONENTS:
        # Compute a PCA (eigenfaces) on the face dataset, treated as
        # unlabeled data: unsupervised feature extraction.
        xprint("Extracting the top %d eigenfaces from %d faces"
               % (n_components, X_train.shape[0]))
        started = time()
        projector = RandomizedPCA(n_components=n_components, whiten=True)
        projector.fit(X_train)
        xprint("done in %0.3fs" % (time() - started))

        eigenfaces = projector.components_.reshape((n_components, h, w))

        xprint("Projecting the input data on the eigenfaces orthonormal basis")
        started = time()
        X_train_pca = projector.transform(X_train)
        X_test_pca = projector.transform(X_test)
        xprint("done in %0.3fs" % (time() - started))

        # Train an SVM classification model.
        xprint("Fitting the classifier to the training set")
        started = time()
        param_grid = {
            'C': [1e3, 5e3, 1e4, 5e4, 1e5],
            'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
        }
        # For sklearn version 0.16 or prior, the class_weight parameter
        # value is 'auto'.
        search = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                              param_grid)
        search.fit(X_train_pca, y_train)
        xprint("done in %0.3fs" % (time() - started))
        xprint("Best estimator found by grid search:")
        xprint(search.best_estimator_)

        # Quantitative evaluation of the model quality on the test set.
        xprint("Predicting the people names on the testing set")
        started = time()
        y_pred = search.predict(X_test_pca)
        xprint("done in %0.3fs" % (time() - started))

        print(n_components,
              classification_report(y_test, y_pred, target_names=target_names))

    return

    # ------------------------------------------------------------------
    # Unreachable: the return above disables the qualitative evaluation
    # with matplotlib that follows.
    # ------------------------------------------------------------------

    def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
        """Helper function to plot a gallery of portraits"""
        pl.figure(figsize=(1.8 * n_col, 2.4 * n_row))
        pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
        for cell in range(n_row * n_col):
            pl.subplot(n_row, n_col, cell + 1)
            pl.imshow(images[cell].reshape((h, w)), cmap=pl.cm.gray)
            pl.title(titles[cell], size=12)
            pl.xticks(())
            pl.yticks(())

    # Plot the result of the prediction on a portion of the test set.
    def title(y_pred, y_test, target_names, i):
        """Build the caption for sample i from the last word of each name."""
        pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
        true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
        return 'predicted: %s\ntrue: %s' % (pred_name, true_name)

    prediction_titles = []
    for i in range(y_pred.shape[0]):
        prediction_titles.append(title(y_pred, y_test, target_names, i))

    plot_gallery(X_test, prediction_titles, h, w)

    # Plot the gallery of the most significant eigenfaces.
    eigenface_titles = []
    for i in range(eigenfaces.shape[0]):
        eigenface_titles.append("eigenface %d" % i)
    plot_gallery(eigenfaces, eigenface_titles, h, w)

    pl.show()
# Reduce the training images (`data`, built earlier in the script) to five
# principal components.
pca = RandomizedPCA(n_components=5)
train_x = pca.fit_transform(data)

# Labels: the first 23 rows are labelled 1 and the following 33 rows 0.
# NOTE(review): these counts are hard-coded -- confirm they match `data`.
ones = np.ones(23)
zeros = np.zeros(33)
train_y = np.concatenate((ones, zeros))

knn = KNeighborsClassifier()
knn.fit(train_x, train_y)

# Test images
# The closing quote of this path literal was missing, which made the whole
# script a syntax error.
img_dir_test = "image_classification/test/"
images_test = [img_dir_test + f for f in os.listdir(img_dir_test)]

# Convert each test image to a flat feature vector.
data_test = []
for image in images_test:
    img = img_to_matrix(image)
    img = flatten_image(img)
    data_test.append(img)

# Project the test images with the PCA fitted on the training data.
test_x = pca.transform(data_test)

# The two expressions below discard their results when run as a plain
# script; they only display something in an interactive session.
knn.predict(test_x)
pd.crosstab(train_y, knn.predict(train_x), rownames=['Act'], colnames=['Predicted'])