Example #1
def tryLinearDiscriminantAnalysis(goFast):
  from sklearn.datasets import dump_svmlight_file, load_svmlight_file
  if goFast:
    training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
  else:
    training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)

  from sklearn.lda import LDA
  from sklearn.metrics import accuracy_score
  from sklearn.grid_search import ParameterGrid
  from sklearn.decomposition import RandomizedPCA

  rpcaDataGrid = [{"n_components": [10,45,70,100],
                    "iterated_power": [2, 3, 4],
                    "whiten": [True]}]

  for rpca_parameter_set in ParameterGrid(rpcaDataGrid):
    rpcaOperator = RandomizedPCA(**rpca_parameter_set)
    rpcaOperator.fit(training_data,training_labels)
    new_training_data = rpcaOperator.transform(training_data)
    new_validation_data = rpcaOperator.transform(validation_data)
    ldaOperator = LDA()
    ldaOperator.fit(new_training_data,training_labels)
    print "Score = " + str(accuracy_score(validation_labels,ldaOperator.predict(new_validation_data)))
def getPrincipleComponents(xtr, xte, n_components=50):
    # imports added so this helper runs standalone
    import numpy as np
    from sklearn.decomposition import RandomizedPCA
    train = np.array(xtr)
    test = np.array(xte)
    pca = RandomizedPCA(n_components=n_components).fit(train)
    xtrain = pca.transform(train)
    xtest = pca.transform(test)
    return xtrain, xtest
Example #3
def SVM(X, y):

	X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=TRAIN_TEST_SPLIT_RATIO)
	print(len(X_train))

    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
	n_components = 150
	pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)


	print("Projecting the input data on the eigenfaces orthonormal basis")
	X_train_pca = pca.transform(X_train)
	X_test_pca = pca.transform(X_test)
	print("done ")

	X_train_pca = equalize_hist(X_train_pca)
	# scale() returns a new array; assign the result so the scaling takes effect
	X_train_pca = preprocessing.scale(X_train_pca * 1.0, axis=1)
	X_test_pca = equalize_hist(X_test_pca)
	X_test_pca = preprocessing.scale(X_test_pca * 1.0, axis=1)

    # classifier = svm.SVC(kernel='poly', degree = 3)
    # classifier.fit(X_train, y_train)
    # # print("======",3,"========")
    # print('TRAIN SCORE', classifier.score(X_train, y_train))
    # print('TEST SCORE', classifier.score(X_test, y_test))


	param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
	classifier2 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
	classifier2.fit(X_train_pca, y_train)
	# print("======",3,"========")
	print('TRAIN SCORE', classifier2.score(X_train_pca, y_train))
	print('TEST SCORE', classifier2.score(X_test_pca, y_test))
Example #4
def SVM(X_data, y_data):

	X_data = equalize_hist(X_data)
	# normalize() and scale() return new arrays; assign the results so they take effect
	X_data = preprocessing.normalize(X_data, 'max')
	X_data = preprocessing.scale(X_data, axis=1)
	# preprocessing.normalize(X_data, 'max')
	# X_data = equalize_hist(X_data)

	# divide our data set into a training set and a test set
	X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_data, y_data, test_size=TRAIN_TEST_SPLIT_RATIO)

	n_components = 120

	print("Extracting the top %d eigenfaces from %d faces"
		% (n_components, X_train.shape[0]))
	pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

	print("Projecting the input data on the eigenfaces orthonormal basis")
	X_train_pca = pca.transform(X_train)
	X_test_pca = pca.transform(X_test)
	print("done ")

	param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
	'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
	classifier = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
	classifier.fit(X_train_pca, y_train)



	print("====== PCA 150 ========")
	print('TRAIN SCORE', classifier.score(X_train_pca, y_train))
	print('TEST SCORE', classifier.score(X_test_pca, y_test))
Example #5
def SVM(X_train, y_train, X_test):
    print("SVM with PCA of rbf, writening all on, no normalize")
    preprocessing.normalize(X_train, 'max')
    preprocessing.normalize(X_test, 'max')
    #preprocessing.robust_scale(X, axis=1, with_centering = True) #bad
    X_train = equalize_hist(X_train)
    X_test = equalize_hist(X_test)
    '''X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=TRAIN_TEST_SPLIT_RATIO)'''

    n_components = 147

    print("Extracting the top %d eigenfaces from %d faces"
          % (n_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=n_components, whiten=False).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
    classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    classifier13.fit(X_train_pca, y_train)
    return list(classifier13.predict(X_test_pca))
Example #6
def do_nbnn(train_folder, test_folder):
    train = load_patches(train_folder)
    test = load_patches(test_folder)
    if options.relu:
        get_logger().info("Applying RELU")
        for class_data in train:
            class_data.patches = class_data.patches.clip(min=0)
        for class_data in test:
            class_data.patches = class_data.patches.clip(min=0)
    if options.scale:
        get_logger().info("Applying standardization")
        scaler = StandardScaler(copy=False)
        scaler.fit(np.vstack([t.patches for t in train]))
        for class_data in train:
            class_data.patches = scaler.transform(class_data.patches)
        for class_data in test:
            class_data.patches = scaler.transform(class_data.patches)
    if options.pca:
        get_logger().info("Calculating PCA")
        pca = RandomizedPCA(n_components=options.pca)
        pca.fit(np.vstack([t.patches for t in train]))
        #for class_data in train:
            #get_logger().info("Fitting class " + class_data.name)
            #pca.partial_fit(class_data.patches)
        get_logger().info("Keeping " + str(pca.explained_variance_ratio_.sum()) + " variance (" + str(options.pca) +
             ") components\nApplying PCA")
        for class_data in train:
            class_data.patches = pca.transform(class_data.patches)
        for class_data in test:
            class_data.patches = pca.transform(class_data.patches)
    nbnn(train, test, NN_Engine())
def main():
    #create the training & test sets, skipping the header row with [1:]
    dataset = genfromtxt(open('data/train.csv','r'), delimiter=',', dtype='u1')[1:]    
    # slice the label column and pixel columns directly from the array so the splits keep .shape
    target = dataset[:, 0]
    train = dataset[:, 1:]
    test = genfromtxt(open('data/test.csv','r'), delimiter=',', dtype='u1')[1:]

    #build crossvalidation training set
    train_train, train_test, target_train, target_test = cross_validation.train_test_split(train, target, test_size=0.2, random_state=0)
    print train_train.shape
    print train_test.shape

    #PCA
    pca = RandomizedPCA(n_components=40)
    pca.fit(train_train)
    
    #create and train the random forest
    rf = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    rf.fit(hstack((train_train, pca.transform(train_train))), target_train)
    print "crossval score is: ", rf.score(hstack((train_test, pca.transform(train_test))), target_test)

    labelid = np.array(range(1,28001))

    output = rf.predict(hstack((test, pca.transform(test))))
    savetxt('data/submission.csv', np.column_stack((labelid, output)), delimiter=',', header="ImageId,Label", fmt='%u', comments='')
def pca_data(test_x, train_x, params):
    print 'pcaing data ...'
    components = int(params['components'])
    pca = RandomizedPCA(components, whiten=True).fit(train_x)
    pca_train_x = pca.transform(train_x)
    pca_test_x  = pca.transform(test_x)
    return pca_test_x, pca_train_x
Example #9
def test_sparse_randomized_pca_inverse():
    """Test that RandomizedPCA is inversible on sparse data"""
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= 0.00001  # make middle component relatively small
    # no large means because the sparse version of randomized pca does not do
    # centering to avoid breaking the sparsity
    X = csr_matrix(X)

    # same check that we can find the original data from the transformed signal
    # (since the data is almost of rank n_components)
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DeprecationWarning)
        pca = RandomizedPCA(n_components=2, random_state=0).fit(X)
        assert_equal(len(w), 1)
        assert_equal(w[0].category, DeprecationWarning)

    Y = pca.transform(X)

    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X.todense(), Y_inverse, decimal=2)

    # same as above with whitening (approximate reconstruction)
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DeprecationWarning)
        pca = RandomizedPCA(n_components=2, whiten=True, random_state=0).fit(X)
        assert_equal(len(w), 1)
        assert_equal(w[0].category, DeprecationWarning)

    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    relative_max_delta = (np.abs(X.todense() - Y_inverse) / np.abs(X).mean()).max()
    # XXX: this does not seem to work as expected:
    assert_almost_equal(relative_max_delta, 0.91, decimal=2)
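
The DeprecationWarnings asserted above reflect that RandomizedPCA was later folded into PCA and eventually removed from scikit-learn. A minimal sketch of the present-day equivalent, assuming a dense NumPy array X (PCA generally expects dense input; TruncatedSVD is the usual choice for sparse matrices):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(50, 3)

# svd_solver='randomized' is the modern counterpart of RandomizedPCA
pca = PCA(n_components=2, svd_solver='randomized', whiten=True, random_state=0)
Y = pca.fit_transform(X)
X_back = pca.inverse_transform(Y)  # approximate reconstruction, as in the test above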
def LogisticRegressionPCA(X, y):

	# divide our data set into a training set and a test set
	X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    									X, y, test_size=TRAIN_TEST_SPLIT_RATIO)

	# get randomized PCA model
	num_components = 147
	print("Extracting the top %d eigenfaces from %d faces"
          % (num_components, X_train.shape[0]))
	pca = RandomizedPCA(n_components=num_components, whiten=True).fit(X_train)

    # use the PCA model on our training set and test set.
	print("Projecting the input data on the eigenfaces orthonormal basis")
	X_train_pca = pca.transform(X_train)
	X_test_pca = pca.transform(X_test)
	print("done ")

	h = .02  # step size in the mesh

	logistic_regression = linear_model.LogisticRegression(C=1e5)

	# fit logistic regression on the PCA-projected training data
	logistic_regression.fit(X_train_pca, y_train)

	# print the performance of logistic regression
	print("====== Logistic Regression with PCA ========")
	print('TRAIN SCORE', logistic_regression.score(X_train_pca, y_train))
	print('TEST SCORE', logistic_regression.score(X_test_pca, y_test))
Example #11
File: PCA.py Project: himl/boson
def pca_estimator(data, targets, estimator, components_number=DEFAULT_COMPONENTS_NUMBER,
                  folds_number=DEFAULT_FOLDS_NUMBER):

    kf = KFold(len(targets), n_folds=folds_number)

    # 'scores' is a NumPy array indexed by fold number; each value is the fraction of
    # correctly predicted samples in that fold's test split.
    scores = np.zeros(folds_number)

    start = time()

    index = 0
    for train, test in kf:
        x_train, x_test, y_train, y_test = data[train], data[test], targets[train], targets[test]

        pca = RandomizedPCA(n_components=components_number, whiten=True).fit(x_train)
        x_train_pca = pca.transform(x_train)
        x_test_pca = pca.transform(x_test)

        clf = estimator.fit(x_train_pca, y_train)
        scores[index] = clf.score(x_test_pca, y_test)
        index += 1
        # print("Iteration %d from %d has done! Score: %f" % (index, folds_number,
        #                                                     scores[index - 1]))
    finish = time()

    return scores.mean(), scores.std() * 2, (finish - start)
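
A minimal usage sketch for pca_estimator, assuming data is an (n_samples, n_features) NumPy array and targets the matching label array; the SVC settings here are only illustrative:

from sklearn.svm import SVC

mean_score, confidence_interval, elapsed = pca_estimator(
    data, targets, SVC(kernel='rbf', C=1000.0, gamma=0.001),
    components_number=150, folds_number=5)
print("%.3f +/- %.3f in %.1fs" % (mean_score, confidence_interval, elapsed))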
Example #12
def rpca(train_X, test_X, n):
	start_time = time.time()
	pca = RandomizedPCA(n_components=n)
	pca.fit(train_X.toarray())
	train_X_pca = pca.transform(train_X.toarray())
	test_X_pca = pca.transform(test_X.toarray())
	print("--- %s seconds ---" % (time.time() - start_time))
	return pca, train_X_pca, test_X_pca
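
Calling .toarray() as above densifies the whole sparse matrix, which can exhaust memory for wide inputs. A sketch of an alternative using TruncatedSVD, which accepts scipy sparse matrices directly (note that, unlike PCA, it does not center the data):

import time
from sklearn.decomposition import TruncatedSVD

def truncated_svd(train_X, test_X, n):
    start_time = time.time()
    svd = TruncatedSVD(n_components=n)
    train_X_svd = svd.fit_transform(train_X)   # no .toarray() needed
    test_X_svd = svd.transform(test_X)
    print("--- %s seconds ---" % (time.time() - start_time))
    return svd, train_X_svd, test_X_svd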
Example #13
class Cluster(object):

    def __init__(self, name):
        self.name = name
        self.raw_dataset = []
        self.dataset = []
        self.dataset_red = []
    
    def get_featurevec(self, data):
        '''Takes in data in the form of an array of EmoPackets, and outputs
           a list of feature vectors.'''
        # CHECKED, all good :)
        num_bins = (len(data)/int(dsp.SAMPLE_RATE*dsp.STAGGER) -
                    int(dsp.BIN_SIZE / dsp.STAGGER) + 1)
        size = int(dsp.BIN_SIZE*dsp.SAMPLE_RATE)
        starts = int(dsp.SAMPLE_RATE*dsp.STAGGER)
        points = []
        for i in range(num_bins):
            points.append(dsp.get_features(data[i*starts:i*starts+size]))
        return points

    def add_data(self, raw):
        '''Allows the addition of new data. Will retrain upon addition.
            Expects a list of EmoPackets.'''
        self.dataset.extend(self.get_featurevec(raw))

    def extract_features(self):
        '''Does feature extraction for all of the datasets.'''
        self.dataset = []
        for sess in self.raw_dataset:
            self.dataset.extend(self.get_featurevec(sess))

    def reduce_dim(self, NDIM=5):
        '''Reduces the dimension of the extracted feature vectors.'''
        X = np.array(self.dataset)
        self.pca = RandomizedPCA(n_components=NDIM).fit(X)
        self.dataset_red = self.pca.transform(X)
        
    def train(self):
        '''Trains the classifier.'''
        self.svm = OneClassSVM()
        self.svm.fit(self.dataset_red)

    def is_novel(self, pt):
        '''Says whether or not the bin is novel. Expects an array of EmoPackets.'''
        X = self.pca.transform(np.array(self.get_featurevec(pt)[0]))
        ans = self.svm.predict(X)
        # dataset_red is an ndarray after transform, so stack rather than append
        self.dataset_red = np.vstack([self.dataset_red, X])
        self.train()
        return ans
                    
    def save(self):
        '''Saves this classifier to a data directory.'''
        this_dir, this_filename = os.path.split(__file__)
        DATA_PATH = os.path.join(this_dir, "data", self.name+'.pkl')
        dumpfile = open(DATA_PATH, "wb")
        pickle.dump(self, dumpfile, pickle.HIGHEST_PROTOCOL)
        dumpfile.close()
Example #14
    def reduce_dim(self, NDIM=5):
        '''Reduces the dimension of the extracted feature vectors.'''
        X = np.array(self.neutral)
        pca = RandomizedPCA(n_components=NDIM).fit(X)
        print pca.explained_variance_ratio_
        self.pca = pca
        self.neutral_red = pca.transform(X)
        for label in self.labelled:
            X = np.array(self.labelled[label])
            self.labelled_red[label] = pca.transform(X)
Example #15
    def compute_pca(self):
#        print 'We have ', self.x.shape[1], 'features. Reducing dimensionality.'
        pca_count = 200
        pca = RandomizedPCA(pca_count, copy = False, whiten=True)
        pca.fit(self.x_train)
        self.x_train = pca.transform(self.x_train)
        if self.do_submission:
            self.x_test = pca.transform(self.x_test)

        if self.do_validation():
            self.x_validate = pca.transform(self.x_validate)
Example #16
def face_rec():
    IMG_RES = 100 * 100 # img resolution
    NUM_EIGENFACES = 10 # images per train person
    NUM_TRAINIMAGES = 110 # total images in training set

    #loading training set from folder train_faces
    folders = glob.glob('/home/pi/New/SelfieLibrary/cropped/gallery')
 
    # Create an array with flattened images X
    # and an array with ID of the people on each image y
    X = np.zeros([NUM_TRAINIMAGES, IMG_RES], dtype='int8')
    y = []

    # Populate training array with flattened images from subfolders of train_faces and names
    c = 0
    for x, folder in enumerate(folders):
        train_faces = glob.glob(folder + '/*')
    
        for i, face in enumerate(train_faces):
            X[c,:] = prepare_image(face)
            y.append(face)
            c = c + 1
            
    # perform principal component analysis on the images
    pca = RandomizedPCA(n_components=NUM_EIGENFACES, whiten=True).fit(X)
    X_pca = pca.transform(X)

    # load test faces (usually one), located in folder test_faces
    test_faces = glob.glob('/home/pi/New/SelfieLibrary/cropped/probe/*')

    # Create an array with flattened images X
    X = np.zeros([len(test_faces), IMG_RES], dtype='int8')
 
    # Populate test array with flattened images from test_faces
    for i, face in enumerate(test_faces):
        X[i,:] = prepare_image(face)
 
    # run through test images (usually one)
    for j, ref_pca in enumerate(pca.transform(X)):
        distances = []
    # Calculate Euclidean distance from test image to each of the known images and save distances
        for i, test_pca in enumerate(X_pca):
            if i<c:
                dist = math.sqrt(sum([diff**2 for diff in (ref_pca - test_pca)]))
                distances.append((dist, y[i]))
        
        found_ID = min(distances)[1]
        print float(min(distances)[0])
        
        
        print "Identified (result: "+ str(found_ID) +" - dist - " + str(min(distances)[0])  + ")"
        return distances
Example #17
def tryDenseOperator(goFast, operatorClass, parameterGrid):
  bestScore = 0
  bestRpcaParams = None
  bestOperatorParams = None

  import sys
  from sklearn.datasets import dump_svmlight_file, load_svmlight_file
  if goFast:
    training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
  else:
    training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)

  from sklearn.metrics import accuracy_score
  from sklearn.grid_search import ParameterGrid
  from sklearn.decomposition import RandomizedPCA

  rpcaDataGrid = [{"n_components": [10,45,70,100],
                    "iterated_power": [1, 2, 3, 4],
                    "whiten": [False, True]}]

  for rpca_parameter_set in ParameterGrid(rpcaDataGrid):
    try:
      rpcaOperator = RandomizedPCA(**rpca_parameter_set)
      rpcaOperator.fit(training_data,training_labels)
      new_training_data = rpcaOperator.transform(training_data)
      new_validation_data = rpcaOperator.transform(validation_data)
      for dense_operator_parameter_set in ParameterGrid(parameterGrid):
        try:
          denseOperator = operatorClass(**dense_operator_parameter_set)
          denseOperator.fit(new_training_data,training_labels)
          score = accuracy_score(validation_labels,denseOperator.predict(new_validation_data))
          print "Score = " + str(score)
          if score > bestScore:
            bestScore = score
            bestRpcaParams = rpca_parameter_set
            bestOperatorParams = dense_operator_parameter_set
            print "***New best score: " + str(bestScore)
            print "***RPCA params: " + str(bestRpcaParams)
            print "***Operator params: " + str(bestOperatorParams)
        except:
          print "Illegal combination skipped"
          print sys.exc_info()[:2]
    except:
      print "Illegal combination skipped."
      print sys.exc_info()[:2]

  print "***New best score: " + str(bestScore)
  print "***RPCA params: " + str(bestRpcaParams)
  print "***Operator params: " + str(bestOperatorParams)
Example #18
def test(n_components):
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    param_grid = {
             'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
              }
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
    clf = clf.fit(X_train_pca, y_train)
    y_pred = clf.predict(X_test_pca)
    print classification_report(y_test, y_pred, target_names=target_names)
    print confusion_matrix(y_test, y_pred, labels=range(n_classes))
Example #19
def run_stuff(n_components):

    ###############################################################################
    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    #n_components = 10

    print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
    t0 = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    print "done in %0.3fs" % (time() - t0)
    print "Explaining variance:"
    print np.round(pca.explained_variance_ratio_[:10], decimals=3)

    eigenfaces = pca.components_.reshape((n_components, h, w))

    print "Projecting the input data on the eigenfaces orthonormal basis"
    t0 = time()
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print "done in %0.3fs" % (time() - t0)


    ###############################################################################
    # Train a SVM classification model

    print "Fitting the classifier to the training set"
    t0 = time()
    param_grid = {
            'C': [1e3, 5e3, 1e4, 5e4, 1e5],
            'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
            }
    # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    clf = clf.fit(X_train_pca, y_train)
    print "done in %0.3fs" % (time() - t0)
    print "Best estimator found by grid search:"
    print clf.best_estimator_


    ###############################################################################
    # Quantitative evaluation of the model quality on the test set

    print "Predicting the people names on the testing set"
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    print "done in %0.3fs" % (time() - t0)
    print "Results for " + str(n_components) + " components:"
    print classification_report(y_test, y_pred, target_names=target_names)
    print confusion_matrix(y_test, y_pred, labels=range(n_classes))
    return y_pred, y_test, target_names, X_test, h, w, eigenfaces
Example #20
def perform_rec():

    num_files = sum([len(files) for r, d, files in os.walk('train_faces/')])
    IMG_RES = 92 * 112 # img resolution
    NUM_EIGENFACES = 10 # images per train person
    NUM_TRAINIMAGES = num_files - 1 # total images in training set

    #loading training set from folder train_faces
    folders = glob.glob('train_faces/*')
     
    # Create an array with flattened images X
    # and an array with ID of the people on each image y
    X = np.zeros([NUM_TRAINIMAGES, IMG_RES], dtype='int8')
    y = []

    # Populate training array with flattened images from subfolders of train_faces and names
    c = 0
    for x, folder in enumerate(folders):
        train_faces = glob.glob(folder + '/*')
        for i, face in enumerate(train_faces):
            X[c,:] = prepare_image(face)
            y.append(ID_from_filename(face))
            c = c + 1

    # perform principal component analysis on the images
    pca = RandomizedPCA(n_components=NUM_EIGENFACES, whiten=True).fit(X)
    X_pca = pca.transform(X)

    # load test faces (usually one), located in folder test_faces
    test_faces = glob.glob('test_faces/*')

    # Create an array with flattened images X
    X = np.zeros([len(test_faces), IMG_RES], dtype='int8')
     
    # Populate test array with flattened images from test_faces
    for i, face in enumerate(test_faces):
        X[i,:] = prepare_image(face)
    
    for j, ref_pca in enumerate(pca.transform(X)):
        distances = []
        # Calculate Euclidean distance from test image to each of the known images and save distances
        for i, test_pca in enumerate(X_pca):
            dist = math.sqrt(sum([diff**2 for diff in (ref_pca - test_pca)]))
            distances.append((dist, y[i]))
     
        found_ID = min(distances)[1]

        print "Identified (result: "+ str(found_ID) +" - dist - " + str(min(distances)[0])  + ")"

    return {'ident':found_ID, 'dist':min(distances)[0] }
Example #21
def return_pca_transformed_data(train_features, test_features):
    # Principal Component Analysis: find the optimum principal components and transform the
    # training and testing data accordingly for the other classifiers used downstream in the
    # pipeline.
    pca = RandomizedPCA(n_components=11, whiten=True).fit(train_features)
    # Array containing means of individual features for the entire dataset.
    means = pca.mean_
    total = sum(pca.explained_variance_)
    print "PCA trained"
    print "The first component expains: ", (float(pca.explained_variance_[0]) / total) * 100, "% of the variance"
    # Training and Testing data transformed by PCA
    features_train_pca = pca.transform(train_features)
    features_test_pca = pca.transform(test_features)
    return features_train_pca, features_test_pca
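
The comment above mentions pipelining PCA with other classifiers; a sketch of making that explicit with scikit-learn's Pipeline, so the PCA step and the classifier are fit together on the training data only (the classifier choice and parameter grid are illustrative, and train_features, train_labels, test_features are assumed to exist as in the function above):

from sklearn.pipeline import Pipeline
from sklearn.decomposition import RandomizedPCA
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

# chain PCA and the classifier so both are tuned and fit on the training split only
pipe = Pipeline([('pca', RandomizedPCA(whiten=True)),
                 ('svc', SVC(kernel='rbf'))])
param_grid = {'pca__n_components': [11, 50, 100],
              'svc__C': [1e3, 1e4],
              'svc__gamma': [0.001, 0.01]}
grid = GridSearchCV(pipe, param_grid)
grid.fit(train_features, train_labels)
predictions = grid.predict(test_features)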
Example #22
def pcaFaces(X,y,n_components,h,w):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
    # insert code here
    t0 = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    print "done in %0.3fs" % (time() - t0)
    print "Projecting the input data on the eigenfaces orthonormal basis"
    # insert code here
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    eigenfaces = pca.components_.reshape((n_components, h, w))
    print "done in %0.3fs" % (time() - t0)
    return X_train_pca,X_test_pca,X_test,y_train,y_test,eigenfaces
def pca(X_train, X_test, n):
    """Use PCA to perform unsupervised feature extraction."""

    print "Extracting %d principle components from %d features" % (n, X_train.shape[1])
    t0 = time()
    pca = RandomizedPCA(n_components=n, whiten=True, random_state=47).fit(X_train)
    print "done in %0.3fs" % (time() - t0)

    print "Transforming the input data"
    t0 = time()
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print "done in %0.3fs" % (time() - t0)

    return X_train_pca, X_test_pca
def preprocess(cross_validation_tuple, preprocess_correlation=False, preprocess_scaling=False):

    X = cross_validation_tuple.X_training
    X_test = cross_validation_tuple.X_testing
    if preprocess_scaling:
        scaler = preprocessing.StandardScaler().fit(cross_validation_tuple.X_training)
        X = scaler.transform(X)
        X_test = scaler.transform(X_test)
    if preprocess_correlation:
        from sklearn.decomposition import PCA
        # a fractional n_components keeps enough components to explain that share of the
        # variance; this is supported by PCA, whereas RandomizedPCA expects an integer count
        pca = PCA(n_components=0.99, whiten=True)
        pca.fit(X)
        print("PCA components kept: {}".format(pca.components_.shape[0]))
        X = pca.transform(X)
        X_test = pca.transform(X_test)
    return Cross_validation_split(X, X_test, cross_validation_tuple.Y_training, cross_validation_tuple.Y_testing)
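
Since RandomizedPCA itself only accepts an integer n_components, one way to approximate the "explain 99% of the variance" intent with it is to fit a generous number of components and keep the smallest count whose cumulative explained variance ratio reaches the target. A sketch under that assumption (the 200-component budget and helper name are illustrative):

import numpy as np
from sklearn.decomposition import RandomizedPCA

def n_components_for_variance(X, target=0.99, max_components=200):
    # fit with a generous component budget, then find the smallest count whose
    # cumulative explained variance ratio reaches the target fraction
    probe = RandomizedPCA(n_components=min(max_components, X.shape[1])).fit(X)
    cumulative = np.cumsum(probe.explained_variance_ratio_)
    return int(np.searchsorted(cumulative, target) + 1)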
def test_explained_variance():
    # Check that PCA output has unit-variance
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2).fit(X)
    rpca = RandomizedPCA(n_components=2, random_state=rng).fit(X)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 1)

    # compare to empirical variances
    X_pca = pca.transform(X)
    assert_array_almost_equal(pca.explained_variance_,
                              np.var(X_pca, axis=0))

    X_rpca = rpca.transform(X)
    assert_array_almost_equal(rpca.explained_variance_, np.var(X_rpca, axis=0),
                              decimal=1)

    # Same with correlated data
    X = datasets.make_classification(n_samples, n_features,
                                     n_informative=n_features-2,
                                     random_state=rng)[0]

    pca = PCA(n_components=2).fit(X)
    rpca = RandomizedPCA(n_components=2, random_state=rng).fit(X)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 5)
def dimentionality_reduction(train_x , test_x):
	print "Dimentionality reduction to 10D on training and test data...."
	pca = RandomizedPCA(n_components=10)
	train_x = pca.fit_transform(train_x)
	test_x = pca.transform(test_x)
	print "Done."
	return train_x , test_x
Example #27
File: odr.py Project: caoym/odr
    def fit(self):

        wordids_map = NameToIndex()
        labs_map = NameToIndex()

        wordscount = self._word_cluster.get_words_count()
        print "start compute_tfidf ..."
        # compute the bag-of-words model for the documents
        docs = self._word_cluster.get_samples()
        count =0
        bow = []
        labs = []

        for k,v in docs.iteritems():
            vec = numpy.zeros(wordscount).tolist()
            for i in v:
                vec[wordids_map.map(i)]+=1
            bow.append(vec)
            labs.append(labs_map.map(k[0]))

        labs = numpy.array(labs)

        tfidf = TfidfTransformer(smooth_idf=True, sublinear_tf=True,use_idf=True)
        datas = numpy.array(tfidf.fit_transform(bow).toarray())

        print "compute_tfidf done"
        pca = RandomizedPCA(n_components=20, whiten=True).fit(datas)
        svc = train_svc(numpy.array(labs_map.names), labs, pca.transform(datas))

        self._tfidf = tfidf
        self._svc = svc
        self._labs_map = labs_map
        self._wordids_map = wordids_map
        self._pca = pca
Example #28
def test_explained_variance():
    """Check that PCA output has unit-variance"""
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2).fit(X)
    rpca = RandomizedPCA(n_components=2, random_state=42).fit(X)
    assert_array_almost_equal(pca.explained_variance_,
                              rpca.explained_variance_, 1)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 3)

    # compare to empirical variances
    X_pca = pca.transform(X)
    assert_array_almost_equal(pca.explained_variance_,
                              np.var(X_pca, axis=0))

    X_rpca = rpca.transform(X)
    assert_array_almost_equal(rpca.explained_variance_,
                              np.var(X_rpca, axis=0))

    # Compare with RandomizedPCA using sparse data
    X = csr_matrix(X)
    rpca = assert_warns(DeprecationWarning, rpca.fit, X)
    assert_array_almost_equal(pca.explained_variance_,
                              rpca.explained_variance_, 1)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 3)
def _eliminate_features(X_test, X_train, attribute_count, y_train):
    print "Eliminating features until %d has been reached" % attribute_count

    pca = RandomizedPCA(n_components=attribute_count+10).fit(X_train)
    X_train = pca.transform(to_float(X_train))
    print "Finished pca"

    clf = SVC(**SVC_parameters)
    rfe = RFE(clf, n_features_to_select=attribute_count, step=0.1)
    fit = rfe.fit(X_train, y_train)
    print "Finished rfe"

    # Reduce the feature matrices to contain just the selected features
    X_train = fit.transform(X_train)
    X_test = fit.transform(pca.transform(to_float(X_test)))
    return X_test, X_train
Example #30
def main():
    img_dir = 'images/'
    images = [img_dir + f for f in os.listdir(img_dir)]
    labels = [f.split('/')[-1].split('_')[0] for f in images]
    label2ids = {v: i for i, v in enumerate(sorted(set(labels),
                                                   key=labels.index))}
    y = np.array([label2ids[l] for l in labels])

    data = []
    for image_file in images:
        img = img_to_matrix(image_file)
        img = flatten_image(img)
        data.append(img)
    data = np.array(data)

    # training samples
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    train_X, train_y = data[is_train], y[is_train]

    # training a classifier
    pca = RandomizedPCA(n_components=5)
    train_X = pca.fit_transform(train_X)
    multi_svm = OneVsRestClassifier(LinearSVC())
    multi_svm.fit(train_X, train_y)

    # evaluating the model
    test_X, test_y = data[is_train == False], y[is_train == False]
    test_X = pca.transform(test_X)
    print pd.crosstab(test_y, multi_svm.predict(test_X),
                      rownames=['Actual'], colnames=['Predicted'])
Example #31
        montage.addResult(component)

    mean = pca.mean_.reshape((62,47))
    mean = exposure.rescale_intensity(mean, out_range=(0,255)).astype("uint8")
    cv2.imshow("Mean", mean)
    cv2.imshow("components", montage.montage)
    cv2.waitKey(0)


# train a classifier on the eigenfaces representation
print("[INFO] training classifier...")
model = SVC(kernel="rbf", C=10.0, gamma=0.001, random_state=84)
model.fit(trainData, training.target)

# evaluate the model
print("[INFO] evaluating model...")
predictions = model.predict(pca.transform(testing.data))
print(classification_report(testing.target, predictions))

# loop over the desired number of samples
for i in np.random.randint(0, high=len(testing.data), size=(args["sample_size"],)):
    # grab the face and classify it
    face = testing.data[i].reshape((62, 47)).astype("uint8")
    prediction = model.predict(pca.transform(testing.data[i].reshape(1, -1)))

    # resize the face to make it more visible, then display the face and the prediction
    print("[INFO] Prediction: {}, Actual: {}".format(prediction[0], testing.target[i]))
    face = imutils.resize(face, width=face.shape[1] * 2, inter=cv2.INTER_CUBIC)
    cv2.imshow("Face", face)
    cv2.waitKey(0)
__author__ = 'pglebow'
from sklearn.decomposition import RandomizedPCA
from sklearn.neighbors import KNeighborsClassifier

pca = RandomizedPCA(n_components=5)
train_x = pca.fit_transform(train_x)
test_x = pca.transform(test_x)

print train_x[:5]
#array([[ 12614.55016475,  -9156.62662224,  -7649.37090539,  -3230.94749506,
#          2495.71170459],
#       [ 16111.39363837,   -259.55063579,    699.60464599,   3058.59026495,
#         -1552.34714653],
#       [ 15019.71069584,  -6403.86621428,   1968.44401114,   2896.76676466,
#         -2157.76499726],
#       [ 13410.53053415,  -1658.3751377 ,    261.26829049,   1991.33404567,
#          -486.60683822],
#       [ 12717.28773107,  -1544.27233216,  -1279.70167969,    503.33658729,
#           -38.00244617]])

knn = KNeighborsClassifier()
knn.fit(train_x, train_y)
Example #33
        ax1.set_xlabel("Silhouette coefficient")
        ax1.set_ylabel("Cluster label")

        # The vertical line for the average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([0, 0.02, 0.04, 0.06, 0.08, .1])
        plt.show()
        #       fig.savefig('silhouette50_%s'%n_clusters, dpi=700)
        #    # 2nd Plot showing the actual clusters formed

        pca = RandomizedPCA(n_components=2)
        reduced_data = pca.fit_transform(X.toarray())
        print "PCA done"
        centroids = pca.transform(clusterer.cluster_centers_)
        for i in range(n_clusters):
            col = cm.spectral(float(i) / n_clusters)
            ax2.plot(reduced_data[np.argwhere(cluster_labels==i), 0], reduced_data[np.argwhere(cluster_labels==i), 1], \
                     '.', markersize=4, alpha=0.6,color=col)
            ax2.scatter(centroids[:, 0],
                        centroids[:, 1],
                        marker='x',
                        s=169,
                        linewidths=3,
                        color='w',
                        zorder=10)
            ax2.set_xticks([])
            ax2.set_yticks([])
            ax2.set_xticklabels([])
            ax2.set_xlabel("1st PCA component")
Example #34
def pca(n_components=150):
    n_components = n_components

    print "Extracting the top %d eigenfaces from %d faces" % (n_components,
                                                              X_train.shape[0])
    t0 = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    print "done in %0.3fs" % (time() - t0)

    eigenfaces = pca.components_.reshape((n_components, h, w))

    print "Projecting the input data on the eigenfaces orthonormal basis"
    t0 = time()
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print "done in %0.3fs" % (time() - t0)

    ###############################################################################
    # Train a SVM classification model

    print "Fitting the classifier to the training set"
    t0 = time()
    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    clf = clf.fit(X_train_pca, y_train)
    print "done in %0.3fs" % (time() - t0)
    print "Best estimator found by grid search:"
    print clf.best_estimator_

    ###############################################################################
    # Quantitative evaluation of the model quality on the test set

    print "Predicting the people names on the testing set"
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    print "done in %0.3fs" % (time() - t0)

    print classification_report(y_test, y_pred, target_names=target_names)
    print confusion_matrix(y_test, y_pred, labels=range(n_classes))

    ###############################################################################
    # Qualitative evaluation of the predictions using matplotlib

    #def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    #    """Helper function to plot a gallery of portraits"""
    #    pl.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    #    pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    #    for i in range(n_row * n_col):
    #        pl.subplot(n_row, n_col, i + 1)
    #        pl.imshow(images[i].reshape((h, w)), cmap=pl.cm.gray)
    #        pl.title(titles[i], size=12)
    #        pl.xticks(())
    #        pl.yticks(())

    # plot the result of the prediction on a portion of the test set

    def title(y_pred, y_test, target_names, i):
        pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
        true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
        return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)
Example #35
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150

print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train.shape[0]))
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))

eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

# Train a SVM classification model
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

# Quantitative evaluation of the model quality on the test set
    for cc in CHROMOSOMES:
        curs.execute("select * from %s" % chrtable + condition)
        for rr in ResultIter(curs):
            # print(rr)
            cs = CoverageSqlite(cc, rr, args)
            # cs.print_range("snp_cov", subrange, h)
            feature_array[ii, :] = cs.feature_vector(subrange, "snp_cov")
            ii += 1

# X = feature_array - np.mean(feature_array, axis = 0)
from sklearn.decomposition import RandomizedPCA  # import sklearn
pca = RandomizedPCA(n_components=3, iterated_power=7)
pca.fit(feature_array)
print(pca.explained_variance_ratio_)
Y = pca.transform(feature_array)

from sklearn.decomposition import FastICA  # import sklearn
pca = FastICA(n_components=3)
pca.fit(feature_array)
# print(pca.explained_variance_ratio_)
Y = pca.transform(feature_array)

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(5, 5))
plt.plot(Y[:, 0], Y[:, 1], 'b.')
plt.show()

fig = plt.figure(figsize=(5, 5))
plt.plot(Y[:, 0], Y[:, 2], 'r.')
Example #37
for i, f in enumerate(files):
    print i, "of", len(files)
    data.append(get_image_data(f))
    labels.append(int(f.split(".")[-2][-1]))
print "done."

pca = RandomizedPCA(n_components=10)
std_scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(data,
                                                    labels,
                                                    test_size=0.1)

print "scaling data..."
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
print "done."

print "transforming data..."
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)
print "done."

print "training model..."
clf = KNeighborsClassifier(n_neighbors=33)
clf.fit(X_train, y_train)
print "done"
print "=" * 20
print clf

print "Confusion Matrix"
Example #38
def build_SVC(face_profile_data, face_profile_name_index, face_dim):
    """
    Build the SVM classification model using the face_profile_data matrix (numOfFace x numOfPixel) and the face_profile_name_index array; face_dim is a tuple (h, w) giving the dimension of each image. Returns the trained SVM classification model and the fitted PCA.
    Parameters
    ----------
    face_profile_data : ndarray (number_of_images_in_face_profiles, width * height of the image)
        The matrix of flattened face images, one image per row

    face_profile_name_index : ndarray
        The name corresponding to the face profile is encoded in its index

    face_dim : tuple (int, int)
        The dimension of the face data is reshaped to

    Returns
    -------
    clf : sklearn classifier object
        The trained SVM classification model

    pca : sklearn PCA object
        The pca that contains the top 150 eigenvectors extracted using approximated Singular Value Decomposition of the data

    """

    X = face_profile_data
    y = face_profile_name_index

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)

    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    n_components = 150  # maximum number of components to keep

    print("\nExtracting the top %d eigenfaces from %d faces" %
          (n_components, X_train.shape[0]))

    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    eigenfaces = pca.components_.reshape(
        (n_components, face_dim[0], face_dim[1]))

    # This portion of the code is used if the data is scarce; it uses the number
    # of inputs as the number of features
    # pca = RandomizedPCA(n_components=None, whiten=True).fit(X_train)
    # eigenfaces = pca.components_.reshape((pca.components_.shape[0], face_dim[0], face_dim[1]))

    print("\nProjecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Train a SVM classification model

    print("\nFitting the classifier to the training set")
    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    # clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)

    # Best Estimator found using Radial Basis Function Kernel:
    clf = SVC(C=1000.0,
              cache_size=200,
              class_weight='balanced',
              coef0=0.0,
              decision_function_shape=None,
              degree=3,
              gamma=0.0001,
              kernel='rbf',
              max_iter=-1,
              probability=False,
              random_state=None,
              shrinking=True,
              tol=0.001,
              verbose=False)
    # Train_pca with Alex Test Error Rate:  0.088424437299
    # Train_pca with Alex Test Recognition Rate:  0.911575562701

    clf = clf.fit(X_train_pca, y_train)
    # print("\nBest estimator found by grid search:")
    # print(clf.best_estimator_)

    ###############################################################################
    # Quantitative evaluation of the model quality on the test set
    print("\nPredicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    print("\nPrediction took %s per sample on average" %
          ((time() - t0) / y_pred.shape[0] * 1.0))

    # print "predicated names: ", y_pred
    # print "actual names: ", y_test
    error_rate = errorRate(y_pred, y_test)
    print("\nTest Error Rate: %0.4f %%" % (error_rate * 100))
    print("Test Recognition Rate: %0.4f %%" % ((1.0 - error_rate) * 100))

    return clf, pca
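
A minimal usage sketch for build_SVC, assuming face_profile_data is an (n_images, h * w) array of flattened faces, face_profile_name_index the matching label indices, and face_dim the (h, w) image shape; new_face below is an illustrative flattened query image:

clf, pca = build_SVC(face_profile_data, face_profile_name_index, face_dim)

# project a new flattened face onto the eigenface basis and classify it
new_face_pca = pca.transform(new_face.reshape(1, -1))
predicted_index = clf.predict(new_face_pca)[0]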
Example #39
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled dataset)
# unsupervised feature extraction/dimensionality reduction

n_components = 150
print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))

eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")

t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("Done in %0.3fs" % (time() - t0))

# Train a SVM classification model

print("Fitting the classifier to the training set")

t0 = time()
param_grid = {
            'C': [1e3, 5e3, 1e4, 5e4, 1e5],
            'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
             }

clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
clf = clf.fit(X_train_pca, y_train)
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))

for i, ax in enumerate(axes.flat):
    ax.imshow(pca.components_[i].reshape(62, 47), cmap='bone')

plt.show()

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

# compute the components and projected faces

pca = RandomizedPCA(150).fit(faces.data)
components = pca.transform(faces.data)
projected = pca.inverse_transform(components)

# plot the results

fig,ax=plt.subplots(2,10,figsize=(10,2.5),subplot_kw={'xticks':[],'yticks':[]},\
gridspec_kw=dict(hspace=0.1,wspace=0.1))

for i in range(10):
    ax[0, i].imshow(faces.data[i].reshape(62, 47), cmap='binary_r')
    ax[1, i].imshow(projected[i].reshape(62, 47), cmap='binary_r')

ax[0, 0].set_ylabel('full-dim\ninput')
ax[1, 0].set_ylabel('150-dim\nreconstruction')

plt.show()
Example #41
n_components = 150  # number of components for PCA
cv2.destroyAllWindows()
pca = RandomizedPCA(n_components=n_components, whiten=True)

param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)

testing_data = []
for i in range(len(images)):
    testing_data.append(images[i].flatten())
pca = pca.fit(testing_data)

transformed = pca.transform(testing_data)
# if lda is done than #component = 80
#lda = LinearDiscriminantAnalysis(n_components=80)
#transformed = lda.fit(transformed, labels).transform(transformed)

clf.fit(transformed, labels)
directory2 = 'yalefaces_5'  # test directory name
image_paths = [
    os.path.join(directory2, filename) for filename in os.listdir(directory2)
]
j = 0
for image_path in image_paths:
    pred_image_pil = Image.open(image_path).convert('L')
    pred_image = np.array(pred_image_pil, 'uint8')
    faces = faceCascade.detectMultiScale(pred_image)
    if (len(faces) == 0):
Example #42
def SVM(X, y):

    # divide our data set into a training set and a test set
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=TRAIN_TEST_SPLIT_RATIO)

    classifier_poly2 = svm.SVC(kernel='poly', degree=2)
    classifier_poly2.fit(X_train, y_train)
    print("======= poly degree=2 ========")
    print('TRAIN SCORE', classifier_poly2.score(X_train, y_train))
    print('TEST SCORE', classifier_poly2.score(X_test, y_test))

    n_components = 10

    print("Extracting the top %d eigenfaces from %d faces" %
          (n_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    classifier11 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                                param_grid)
    classifier11.fit(X_train_pca, y_train)

    print("====== PCA 10 ========")
    print('TRAIN SCORE', classifier11.score(X_train_pca, y_train))
    print('TEST SCORE', classifier11.score(X_test_pca, y_test))

    n_components = 50

    print("Extracting the top %d eigenfaces from %d faces" %
          (n_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    classifier12 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                                param_grid)
    classifier12.fit(X_train_pca, y_train)

    print("====== PCA 50 ========")
    print('TRAIN SCORE', classifier12.score(X_train_pca, y_train))
    print('TEST SCORE', classifier12.score(X_test_pca, y_test))

    n_components = 100

    print("Extracting the top %d eigenfaces from %d faces" %
          (n_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                                param_grid)
    classifier13.fit(X_train_pca, y_train)

    print("====== PCA 100 ========")
    print('TRAIN SCORE', classifier13.score(X_train_pca, y_train))
    print('TEST SCORE', classifier13.score(X_test_pca, y_test))

    n_components = 120

    print("Extracting the top %d eigenfaces from %d faces" %
          (n_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                                param_grid)
    classifier13.fit(X_train_pca, y_train)

    print("====== PCA 120 ========")
    print('TRAIN SCORE', classifier13.score(X_train_pca, y_train))
    print('TEST SCORE', classifier13.score(X_test_pca, y_test))

    # The same PCA + grid-searched RBF SVM experiment, repeated for a range of
    # component counts (previously copy-pasted once per setting).
    for n_components in [135, 150, 165, 180, 200, 400]:
        print("Extracting the top %d eigenfaces from %d faces" %
              (n_components, X_train.shape[0]))
        pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

        print("Projecting the input data on the eigenfaces orthonormal basis")
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
        print("done ")

        param_grid = {
            'C': [1e3, 5e3, 1e4, 5e4, 1e5],
            'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
        }
        classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                                    param_grid)
        classifier13.fit(X_train_pca, y_train)

        print("====== PCA %d ========" % n_components)
        print('TRAIN SCORE', classifier13.score(X_train_pca, y_train))
        print('TEST SCORE', classifier13.score(X_test_pca, y_test))
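    # Hedged sketch (assumed, not part of the original function): the sweep
    # above only prints each setting's scores; recording them makes it easy
    # to read off the best-performing component count programmatically.
    results = {}
    for n_components in [120, 135, 150, 165, 180, 200, 400]:
        pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
        clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                           {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
                            'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]})
        clf.fit(pca.transform(X_train), y_train)
        results[n_components] = clf.score(pca.transform(X_test), y_test)
    best_n = max(results, key=results.get)
    print('Best n_components', best_n, 'with test score', results[best_n])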
# In[6]:

scaler = StandardScaler(with_std=False)
scaled = scaler.fit_transform(df)

pca = RandomizedPCA(n_components=15, random_state=1)
pca.fit(scaled)

pca.explained_variance_ratio_.cumsum()[-1]


# In[7]:

df_fuzzy = pd.read_csv('features_fuzzy_train.csv', usecols=columns, dtype=np.float32)
scaled = scaler.transform(df_fuzzy)
X = pca.transform(scaled)


# In[8]:

df_fuzzy_test = pd.read_csv('features_fuzzy_test.csv', usecols=columns, dtype=np.float32)
scaled = scaler.transform(df_fuzzy_test)
X_test = pca.transform(scaled)


# In[9]:

for i in range(5):
    df_res_train['pca_fuzzy_%d' % i] = X[:, i]
    df_res_test['pca_fuzzy_%d' % i] = X_test[:, i]
Example #44
0
def test_SVM(face_profile_data, face_profile_name_index, face_dim,
             face_profile_names):
    """
    Testing: Build the SVM classification modle using the face_profile_data matrix (numOfFace X numOfPixel) and face_profile_name_index array, face_dim is a tuple of the dimension of each image(h,w) Returns the SVM classification modle
    Parameters
    ----------
    face_profile_data : ndarray (number_of_images_in_face_profiles, width * height of the image)
        The pca that contains the top eigenvectors extracted using approximated Singular Value Decomposition of the data

    face_profile_name_index : ndarray
        The name corresponding to the face profile is encoded in its index

    face_dim : tuple (int, int)
        The dimension of the face data is reshaped to

    face_profile_names: ndarray
        The names corresponding to the face profiles
    Returns
    -------
    clf : theano object
        The trained SVM classification model

    pca : theano ojbect
        The pca that contains the top 150 eigenvectors extracted using approximated Singular Value Decomposition of the data

    """
    X = face_profile_data
    y = face_profile_name_index

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)

    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    n_components = 150  # maximum number of components to keep

    print("\nExtracting the top %d eigenfaces from %d faces" %
          (n_components, X_train.shape[0]))

    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    eigenfaces = pca.components_.reshape(
        (n_components, face_dim[0], face_dim[1]))

    # This portion of the code is used if the data is scarce; it uses the number
    # of inputs as the number of features
    # pca = RandomizedPCA(n_components=None, whiten=True).fit(X_train)
    # eigenfaces = pca.components_.reshape((pca.components_.shape[0], face_dim[0], face_dim[1]))

    print("\nProjecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Train a SVM classification model

    print("\nFitting the classifier to the training set")
    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    # clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    # Train_pca Test Error Rate:  0.0670016750419
    # Train_pca Test Recognition Rate:  0.932998324958

    # clf = SVC(kernel='linear', C=1)
    # 2452  samples from  38  people are loaded
    # Extracting the top 150 eigenfaces from 1839 faces
    # Extracting the top 150 eigenfaces from 1790 faces
    # Train_pca Test Error Rate:  0.0904522613065
    # Train_pca Test Recognition Rate:  0.909547738693

    # clf = SVC(kernel='poly')
    # Train_pca Test Error Rate:  0.201005025126
    # Train_pca Test Recognition Rate:  0.798994974874

    # clf = SVC(kernel='sigmoid')
    # Train_pca Test Error Rate:  0.985318107667
    # Train_pca Test Recognition Rate:  0.0146818923328

    # clf = SVC(kernel='rbf').fit(X_train, y_train)
    # Train_pca Test Error Rate:  0.0619765494137
    # Train_pca Test Recognition Rate:  0.938023450586

    # Best Estimator found using Radial Basis Function Kernel:
    clf = SVC(C=1000.0,
              cache_size=200,
              class_weight='balanced',
              coef0=0.0,
              decision_function_shape=None,
              degree=3,
              gamma=0.0001,
              kernel='rbf',
              max_iter=-1,
              probability=False,
              random_state=None,
              shrinking=True,
              tol=0.001,
              verbose=False)
    # Train_pca with Alex Test Error Rate:  0.088424437299
    # Train_pca with Alex Test Recognition Rate:  0.911575562701

    clf = clf.fit(X_train_pca, y_train)
    # print("\nBest estimator found by grid search:")
    # print(clf.best_estimator_)

    ###############################################################################
    # Quantitative evaluation of the model quality on the test set
    print("\nPredicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    print("\nPrediction took %0.8f second per sample on average" %
          ((time() - t0) / y_pred.shape[0] * 1.0))

    # print "predicated names: ", y_pred
    # print "actual names: ", y_test
    error_rate = errorRate(y_pred, y_test)
    print("\nTest Error Rate: %0.4f %%" % (error_rate * 100))
    print("Test Recognition Rate: %0.4f %%" % ((1.0 - error_rate) * 100))

    ###############################################################################
    # Testing

    # X_test_pic1 = X_test[0]
    # X_test_pic1_for_display = np.reshape(X_test_pic1, face_dim)

    # t0 = time()
    # pic1_pred_name = predict(clf, pca, X_test_pic1, face_profile_names)
    # print("\nPrediction took %0.3fs" % (time() - t0))
    # print "\nPredicated result for picture_1 name: ", pic1_pred_name
    # for i in range(1,3): print ("\n")

    # Display the picture
    # plt.figure(1)
    # plt.title(pic1_pred_name)
    # plt.subplot(111)
    # plt.imshow(X_test_pic1_for_display)
    # plt.show()

    ###############################################################################
    # Qualitative evaluation of the predictions using matplotlib
    # import matplotlib.pyplot as plt

    # def plot_gallery(images, titles, face_dim, n_row=3, n_col=4):
    #     """Helper function to plot a gallery of portraits"""
    #     plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    #     plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    #     for i in range(n_row * n_col):
    #         plt.subplot(n_row, n_col, i + 1)
    #         plt.imshow(images[i].reshape(face_dim), cmap=plt.cm.gray)
    #         plt.title(titles[i], size=12)
    #         plt.xticks(())
    #         plt.yticks(())

    # # plot the result of the prediction on a portion of the test set

    # def title(y_pred, y_test, face_profile_names, i):
    #     pred_name = face_profile_names[y_pred[i]].rsplit(' ', 1)[-1]
    #     true_name = face_profile_names[y_test[i]].rsplit(' ', 1)[-1]
    #     return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)

    # prediction_titles = [title(y_pred, y_test, face_profile_names, i)
    #                      for i in range(y_pred.shape[0])]

    # plot_gallery(X_test, prediction_titles, face_dim)

    # # plot the gallery of the most significative eigenfaces

    # eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
    # plot_gallery(eigenfaces, eigenface_titles, face_dim)

    # plt.show()

    return clf, pca
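A minimal usage sketch for test_SVM (not from the original project): the loader function and directory below are hypothetical placeholders, included only to show the expected shapes of the arguments described in the docstring.

# Hypothetical usage sketch -- load_face_profiles and "../face_profiles/" are
# assumed names, not part of this codebase.
# face_data   : (n_images, h * w) array of flattened face images
# name_index  : (n_images,) integer label for each image
# names       : array of person names indexed by label
face_data, name_index, names = load_face_profiles("../face_profiles/")
face_dim = (50, 50)  # assumed (h, w) of each face image
clf, pca = test_SVM(face_data, name_index, face_dim, names)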
Example #45
0
import numpy as np
from scipy.cluster.vq import kmeans
from scipy.spatial.distance import cdist, pdist
from sklearn import datasets
from sklearn.decomposition import RandomizedPCA
from matplotlib import pyplot as plt
from matplotlib import cm

##### data #####
# load digits dataset
data = datasets.load_digits()
t = data['target']

# perform PCA dimensionality reduction
pca = RandomizedPCA(n_components=2).fit(data['data'])
X = pca.transform(data['data'])
console = []
##### cluster data into K=1..20 clusters #####
K_MAX = 20
KK = range(1, K_MAX + 1)

KM = [kmeans(X, k) for k in KK]
centroids = [cent for (cent, var) in KM]
D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
cIdx = [np.argmin(D, axis=1) for D in D_k]
dist = [np.min(D, axis=1) for D in D_k]

tot_withinss = [sum(d**2) for d in dist]  # Total within-cluster sum of squares
totss = sum(pdist(X)**2) / X.shape[0]  # The total sum of squares
betweenss = totss - tot_withinss  # The between-cluster sum of squares
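The excerpt ends with the sum-of-squares bookkeeping; a hedged sketch of the elbow plot this kind of analysis typically feeds into follows. Here kIdx, the highlighted elbow index, is an illustrative assumption rather than a value computed by the snippet.

# Hedged continuation (not part of the excerpt): plot the percentage of
# variance explained by the clustering against K and mark an assumed elbow.
kIdx = 9  # illustrative choice of elbow index, not computed here
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(KK, betweenss / totss * 100, 'b*-')
ax.plot(KK[kIdx], betweenss[kIdx] / totss * 100, marker='o', markersize=12,
        markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
ax.set_ylim((0, 100))
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Percentage of variance explained (%)')
plt.title('Elbow for KMeans clustering')
plt.show()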
Example #46
0
def dimention_reduction(financial_data):
    pca = RandomizedPCA(n_components=2)
    pca = pca.fit(financial_data)
    trans_financial_data = pca.transform(financial_data)
    return trans_financial_data
Example #47
0
class RemoveRealtorTestModel(object):
    def  __init__(self):
        self.features = None
        self.pca_model = None
        self.scorespriced = None
        self.scores = None
        self.features = None
        self.components = None
        self.log = None
        self.model = None        
        self.divided_position = None 

    def get_data(self, fquery, lookback, dataget=' select * from final_table;'):
        '''
        '''        
        f = open(fquery)
        q = f.readlines()
        q = ' '.join(q)
        q = q.replace('\n',' ')
        q = q.replace('xxxxx',str(lookback))
        engine = create_engine('postgresql://user@localhost:5432/mydb')
        q += dataget
        df = pd.read_sql_query(dataget,con=engine)
        hold_out_cutoff = pd.datetime(2014,10,1)
        dftest = df[df.listdate < hold_out_cutoff].reset_index(drop=True)
        dfhold = df[df.listdate > hold_out_cutoff].reset_index(drop=True)
        dftest.reset_index(inplace=True, drop=True)
        dfhold.reset_index(inplace=True, drop=True)
        return dftest, dftest_y,  dfhold, dfhold_y

    def init_final(self, features, components, log, divided_position, model):
        '''
        '''        
        self.features = features
        self.components = components
        self.log = log
        self.divided_position = divided_position
        self.model = model
        return None

    def fit(self, dfX, dfy):
        '''
        '''        
        start = time.time()
        dfX.reset_index(inplace=True,drop=True)
        boolvars = [col for col in dfX.columns.values if 'dvar' in col]
        if self.components > 0:
            self.pca_model = RandomizedPCA(n_components=self.components)
            self.pca_model.fit(dfX[boolvars].values)
            dftrain_bool = pd.DataFrame(self.pca_model.transform(dfX[boolvars].values))
            dftrain = pd.concat([dfX[self.features], dftrain_bool],axis=1)
        else:
            self.pca_model=None
            dftrain = dfX[self.features]
            
        if self.log:
            dftrain = self._log_feature(dftrain)
            dfy = np.log(dfy.values)
            self.log = True
        else:
            self.log = False

        if self.divided_position is not None:
            for i, j in self.divided_position:
                test_X = self._divide_two_features(test_X, i, j)
                train_X = self._divide_two_features(train_X, i, j)            
            
        self.model.fit(dftrain, dfy)
        print self.model.get_params
        print 'model fit {} homes in {}'.format(dfX.shape[0] ,time.time()-start)
        return None
    
    def predict(self, dfX, gettree):
        '''
        '''        
        start = time.time()
        dfX.reset_index(inplace=True,drop=True)
        boolvars = [col for col in dfX.columns.values if 'dvar' in col]

        if self.pca_model is not None:
            df_bool = pd.DataFrame(self.pca_model.transform(dfX[boolvars].values))            
            dftest = pd.concat([dfX[self.features], df_bool], axis=1)
        else:
            dftest = dfX[self.features]

        if self.log:
            dftest = self._log_feature(dftest)

        point_estimates = self.model.predict(dftest)
        tree_estimates = np.array([])
        
        if gettree:
            for est in self.model.estimators_:
                tree_estimates = np.concatenate([tree_estimates, est.predict(dftest)], axis=1)
        else:
            tree_estimates = 0
            
        print 'model predict (2x) {} in {}'.format(dfX.shape[0] ,time.time()-start)
        
        if self.log:
            return np.exp(point_estimates), np.exp(tree_estimates)
        else:
            return point_estimates, tree_estimates
            
    def df_time_iterator(self, df, cv, splittype):
        '''
        '''
        minlistdate = df.listdate.min()
        maxlistdate = df.statusupdate.max()
        dt = ((maxlistdate-minlistdate).days/cv)
        c = 1
        dayrandom = np.random.randint(0,16)-8
        while c < cv+1:
            #chunk does equal train/test splits
            if splittype == 'chunk':
                train = df[((df.listdate >= (minlistdate+timedelta(days=dt*(c-1)+dayrandom))) &
                           (df.statuschangedate <= (minlistdate+timedelta(days=dt*(c)))))].index

                test = df[((df.listdate > (minlistdate+timedelta(days=dt*(c)-dayrandom))) &
                          (df.statuschangedate <= (minlistdate+timedelta(days=dt*(c+1)))))].index
            
            #forward does train/test splits which grow in time
            elif splittype == 'forward':
                train = df[df.statuschangedate <= minlistdate+timedelta(days=dt*(c)-dayrandom)].index
                test = df[df.listdate > minlistdate+timedelta(days=dt*(c))].index
            c += 1
            yield train, test

    def cross_validate_model(self, dfX, dfy, features, components, log, divided_position, model, cv):
        '''
        '''        
        kf = KFold(dfX.shape[0], n_folds=cv, shuffle=True)
        dfX.reset_index(inplace=True, drop=True)
        dfy.reset_index(inplace=True, drop=True)
        mets = [metrics.median_absolute_error, metrics.r2_score, self.percent_difference]
        scores = np.zeros(len(mets))
        scorespriced = np.zeros(len(mets))
        boolvars = [col for col in dfX.columns.values if 'dvar' in col]
        if components>0:         
            features = features+boolvars 
        dfX = dfX[features]

        for train_index, test_index in kf:

            train_X, train_y = dfX.loc[train_index,:].copy(), dfy.loc[train_index].copy()
            test_X, test_y = dfX.loc[test_index,:].copy(), dfy.loc[test_index].copy()

            if components>0:
                train_X, test_X = self._pca_dummies(components, boolvars, train_X, test_X)

            if divided_position is not None:
                for i, j in divided_position:
                    test_X = self._divide_two_features(test_X, i, j)
                    train_X = self._divide_two_features(train_X, i, j)
                    
            self.features = train_X.columns.values
            
            if log:
                train_X = self._log_feature(train_X)
                train_y = np.log(train_y.values)
                test_X = self._log_feature(test_X)
                test_y = np.log(test_y.values)
            

            model.fit(train_X, train_y)

            ypred = model.predict(test_X)

            if log:
                test_y = np.exp(test_y)
                ypred = np.exp(ypred) 

            mask = (ypred>100000) & (ypred<230000)

            for i, m in enumerate(mets):
                scores[i] += m(test_y, ypred)
                scorespriced[i] +=  m(test_y[mask], ypred[mask])
        scores = scores/float(cv)
        scorespriced = scorespriced/float(cv)
        self.a_cved_model = model
        print ''.join(['-']*40)
        print ''.join(['-']*40)        
        print model.get_params
        print ''.join(['-']*40)       
        self._display_scoring_metrics(zip(mets,scores), 'full')
        print ''.join(['-']*40)
        self._display_scoring_metrics(zip(mets,scorespriced), 'priced')

        return None

    
    def scorer(self, ytrue, ypred):
        '''
        '''        
        mask = (ypred>100000) & (ypred<230000)
        mets = [metrics.median_absolute_error, metrics.r2_score, self._percent_difference]
        scores = np.zeros(len(mets))     
        scorespriced = np.zeros(len(mets))
        for i, m in enumerate(mets):
            scores[i] += m(ytrue, ypred)
            scorespriced[i] +=  m(ytrue[mask], ypred[mask])
        print 'full_size {}'.format(len(ytrue))
        self._display_scoring_metrics(zip(mets,scores), 'full')
        print ''.join(['-']*40)        
        print 'full_size {}'.format(np.sum(mask))        
        self._display_scoring_metrics(zip(mets,scorespriced), 'priced')           
        return None
        
    def percent_difference(self, ytest, ypred):
        '''
        '''        
        return np.mean(abs((ytest-ypred)/ytest)*100.)        

    
    def tree_importance(self, model, threshold):
        '''
        '''
        fimport = pd.DataFrame(zip(self.features, model.feature_importances_), columns=['feature','importance']).sort('importance', ascending=False)
        fvar=[]
        for est in model.estimators_:
            fvar.append(est.feature_importances_)
        fimport['std'] = np.array(fvar).std(axis=0)
        fimport['cumimport'] = fimport['importance'].cumsum().values
        return fimport, fimport[fimport['cumimport'] < threshold]['feature'].tolist()
    
    def _pca_dummies(self, components, boolvars, dftrain, dftest):
        '''
        '''
        dftrain.reset_index(inplace=True, drop=True)
        dftest.reset_index(inplace=True, drop=True)        
        self.pca_model = RandomizedPCA(n_components=components)
        self.pca_model.fit(dftrain[boolvars].values)

        dftrain_bool = pd.DataFrame(self.pca_model.transform(dftrain[boolvars].values))
        dftest_bool = pd.DataFrame(self.pca_model.transform(dftest[boolvars].values))
        
        dftrain.drop(boolvars, axis=1,inplace=True)
        dftest.drop(boolvars, axis=1,inplace=True)
        features = dftrain.columns.tolist()+['pca'+str(i) for i in range(components)]        
        dftrain = pd.concat([dftrain, dftrain_bool], axis=1, ignore_index=True)
        dftest = pd.concat([dftest, dftest_bool], axis=1, ignore_index=True)

        dftrain.columns = features
        dftest.columns = features  
        return dftrain, dftest

    def _display_scoring_metrics(self, met_scores, label):
        '''
        '''        
        for met, score in met_scores:
            print label+' {}: {}'.format(met.func_name, np.round(score,3))

    def _log_feature(self, df):
        '''
        '''
        for feature in [col for col in df.columns.values if (('price' in str(col)) |  ('taxes' in str(col)))]:
            df[feature] = np.log(df[feature].values)
        return df

    def _divide_two_features(self, df, f1, f2):
        '''
        '''
        df[f1+'_'+f2] = df[f1]/df[f2]
        df.drop([f1,f2], axis=1, inplace=True)
        return df
def main(opt_list, arg_list, runall=False):
    """ Pass either only_ml, ml_svm, or only_svm"""
    accuracies = {
            'soft_unw': [],
            'soft_wei': [],
            'hard_wei': [],
            'hard_unw': [],
            'svm': 0,
            'lda': 0
            }
    #print(__doc__)

    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')


    ###############################################################################
    # Download the data, if not already on disk and load it as numpy arrays

    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

    # introspect the images arrays to find the shapes (for plotting)
    n_samples, h, w = lfw_people.images.shape

    # for machine learning we use the data directly (as relative pixel
    # position info is ignored by this model)
    X = lfw_people.data
    n_features = X.shape[1]

    # the label to predict is the id of the person
    y = lfw_people.target
    target_names = lfw_people.target_names
    n_classes = target_names.shape[0]

    print("Total dataset size:")
    print("n_samples: %d" % n_samples)
    print("n_features: %d" % n_features)
    print("n_classes: %d" % n_classes)


    ###############################################################################
    # Split into a training set and a test set using a stratified k fold

    # split into a training and testing set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)


    ###############################################################################
    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    n_components = 150

    print("Extracting the top %d eigenfaces from %d faces"
          % (n_components, X_train.shape[0]))
    t0 = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    print("done in %0.3fs" % (time() - t0))

    eigenfaces = pca.components_.reshape((n_components, h, w))
    print("Projecting the input data on the eigenfaces orthonormal basis")
    t0 = time()
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done in %0.3fs" % (time() - t0))

    if opt_list is "serial":
        if not runall:
            a = time()
            acc, y_pred = assemble_series(X_train_pca, y_train, X_test_pca, y_test, ['lmnn', 'lsml', 'rca', 'ldml', 'lfda'], 'soft', "weighted")
            print("accuracy = %s",acc)
            print(classification_report(y_test, y_pred, target_names=target_names))
            print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
            b = time()

        else:
            if 'soft_unw' in arg_list:
                mls = list_mls(['lmnn', 'lsml', 'rca', 'lfda', 'ldml'])
                ml_strs = []
                y_preds = []
                for ml in mls:
                    if len(ml) == 0:
                        continue
                    print(ml)
                    acc, y_pred = assemble_series(X_train_pca, y_train, X_test_pca, y_test, ml, 'soft', 'unweighted')
                    y_preds.append(y_pred)
                    accuracies['soft_unw'].append(acc)
                    ml_strs.append(getStr(ml))
                    print("accuracy = %s",acc)
                    print(classification_report(y_test, y_pred, target_names=target_names))
                    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
                y_preds = np.array(y_preds)
                num_samples = y_preds.shape[1]
                majority_pred = np.zeros(num_samples)
                
                for sample in xrange(y_preds.shape[1]):
                    majority_pred[sample] = np.bincount(y_preds[:,sample]).argmax()
                majority_pred= np.array(majority_pred, dtype=np.int32)
                c = np.sum(majority_pred == y_test)
                accuracy = c * 100.0 / num_samples
                accuracies['soft_unw'].append(accuracy)
                ml_strs.append('all')
                cleanCachedMls()
            if 'soft_wei' in arg_list:
                mls = list_mls(['lmnn', 'lsml', 'rca', 'lfda', 'ldml'])
                ml_strs = []
                y_preds = []
                for ml in mls:
                    if len(ml) == 0:
                        continue
                    print(ml)
                    acc, y_pred = assemble_series(X_train_pca, y_train, X_test_pca, y_test, ml, 'soft', 'weighted')
                    y_preds.append(y_pred)
                    accuracies['soft_wei'].append(acc)
                    ml_strs.append(getStr(ml))
                    print("accuracy = %s",acc)
                    print(classification_report(y_test, y_pred, target_names=target_names))
                    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
                y_preds = np.array(y_preds)
                num_samples = y_preds.shape[1]
                majority_pred = np.zeros(num_samples)
                
                for sample in xrange(y_preds.shape[1]):
                    majority_pred[sample] = np.bincount(y_preds[:,sample]).argmax()
                majority_pred= np.array(majority_pred, dtype=np.int32)
                c = np.sum(majority_pred == y_test)
                accuracy = c * 100.0 / num_samples
                accuracies['soft_wei'].append(accuracy)
                ml_strs.append('all')

                cleanCachedMls()
            if 'hard_wei' in arg_list:
                mls = list_mls(['lmnn', 'lsml', 'rca', 'lfda', 'ldml'])
                ml_strs = []
                y_preds = []
                for ml in mls:
                    if len(ml) == 0:
                        continue
                    print(ml)
                    acc, y_pred = assemble_series(X_train_pca, y_train, X_test_pca, y_test, ml, 'hard', 'weighted')
                    y_preds.append(y_pred)
                    accuracies['hard_wei'].append(acc)
                    ml_strs.append(getStr(ml))
                    print("accuracy = %s",acc)
                    print(classification_report(y_test, y_pred, target_names=target_names))
                    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
                y_preds = np.array(y_preds, dtype=np.int32)
                num_samples = y_preds.shape[1]
                majority_pred = np.zeros(num_samples)
                
                for sample in xrange(y_preds.shape[1]):
                    majority_pred[sample] = np.bincount(y_preds[:,sample]).argmax()
                majority_pred= np.array(majority_pred, dtype=np.int32)
                c = np.sum(majority_pred == y_test)
                accuracy = c * 100.0 / num_samples
                accuracies['hard_wei'].append(accuracy)
                ml_strs.append('all')
                cleanCachedMls()
            if 'hard_unw' in arg_list:
                mls = list_mls(['lmnn', 'lsml', 'rca', 'lfda', 'ldml'])
                ml_strs = []
                y_preds = []
                for ml in mls:
                    if len(ml) == 0:
                        continue
                    print(ml)
                    acc, y_pred = assemble_series(X_train_pca, y_train, X_test_pca, y_test, ml, 'hard', 'unweighted')
                    y_preds.append(y_pred)
                    accuracies['hard_unw'].append(acc)
                    ml_strs.append(getStr(ml))
                    print("accuracy = %s",acc)
                    print(classification_report(y_test, y_pred, target_names=target_names))
                    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
                y_preds = np.array(y_preds, dtype=np.int32)
                num_samples = y_preds.shape[1]
                majority_pred = np.zeros(num_samples)
                
                for sample in xrange(y_preds.shape[1]):
                    majority_pred[sample] = np.bincount(y_preds[:,sample]).argmax()
                majority_pred= np.array(majority_pred, dtype=np.int32)
                c = np.sum(majority_pred == y_test)
                accuracy = c * 100.0 / num_samples
                accuracies['hard_unw'].append(accuracy)
                ml_strs.append('all')
                cleanCachedMls()
    if opt_list is "parallel":
        """ TODO:  Opt for the parallel thread implementation. """
        if not runall:
            a = time()
            acc, y_pred = assemble_parallel(X_train_pca, y_train, X_test_pca, y_test, 'hard')
            print("accuracy = %s",acc)
            print(classification_report(y_test, y_pred, target_names=target_names))
            print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
            b = time()
            print("Total time taken for all this: {0}".format(b-a))

        else:
            mls = list_mls(['lmnn', 'lsml', 'rca', 'lfda', 'ldml'])
            ml_strs = []
            y_preds = []
            for ml in mls:
                if len(ml) == 0:
                    continue
                print(ml)
                acc, y_pred = assemble_parallel(X_train_pca, y_train, X_test_pca, y_test, 'hard')
                accuracies['hard'].append(acc)
                y_preds.append(y_pred)
                ml_strs.append(getStr(ml))
                print("accuracy = %s", acc)
                print(classification_report(y_test, y_pred, target_names=target_names))
                print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

            y_preds = np.array(y_preds)
            num_samples = y_preds.shape[1]
            majority_pred = np.zeros(num_samples)
            
            for sample in xrange(y_preds.shape[1]):
                majority_pred[sample] = np.bincount(y_preds[:,sample]).argmax()
            majority_pred= np.array(majority_pred, dtype=np.int32)
            c = np.sum(majority_pred == y_test)
            accuracy = c * 100.0 / num_samples
            accuracies['hard'].append(accuracy)
            ml_strs.append('all')


    ###############################################################################
    print("Without the LMNN structure")
    # Train a SVM classification model

    print("Fitting the classifier to the training set")
    t0 = time()
    param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
                  'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
    clf = GridSearchCV(SVC(kernel='rbf'), param_grid)
    clf.fit(X_train_pca, y_train)
    print("done in %0.3fs" % (time() - t0))
    print("Best estimator found by grid search:")
    print(clf.best_estimator_)

    ###############################################################################
    # Quantitative evaluation of the model quality on the test set

    print("Predicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    acc = 100.0*sum(y_pred == y_test) / len(y_test)
    print("accuracy = %s",acc)
    print("done in %0.3fs" % (time() - t0))

    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

    print("Fitting the classifier to the training set")
    t0 = time()
    clf = LDA()
    clf.fit(X_train_pca, y_train)
    print("done in %0.3fs" % (time() - t0))
    print("Best estimator found by grid search:")

    ###############################################################################
    # Quantitative evaluation of the model quality on the test set

    print("Predicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    acc1 = 100.0*sum(y_pred == y_test) / len(y_test)
    print("accuracy = %s",acc1)
    print("done in %0.3fs" % (time() - t0))

    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

    if runall:
        accuracies['svm'] = acc
        accuracies['lda'] = acc1
        ml_strs.append('svm')
        ml_strs.append('lda')
        ml_strs = ", ".join(ml_strs)
        return ml_strs, accuracies
n_components = 150  # number of components to keep

print("Extracting the top %d eigenfaces from %d faces" %
      (n_components, X_train.shape[0]))
t0 = time()
#print(t0)
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)  # randomized dimensionality reduction
print("done in %0.3fs" % (time() - t0))

eigenfaces = pca.components_.reshape(
    (n_components, h, w))  # extract characteristic components from the faces, known as eigenfaces

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
#print(t0)
X_train_pca = pca.transform(X_train)  # reduce the dimensionality of X_train
X_test_pca = pca.transform(X_test)  # reduce the dimensionality of X_test
print("done in %0.3fs" % (time() - t0))
#===========================================================================================
# Train an SVM classification model

print("fitting the classifier to the training set")
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}  # try different parameter settings: 5 x 6 = 30 combinations in total
# C: float, optional (default=1.0), penalty parameter C of the error term (how strongly errors are penalized)
# gamma: float, optional (default=0.0), kernel coefficient for 'rbf', 'poly' and 'sigmoid';
# if gamma is 0.0 then 1/n_features will be used instead (i.e. the fraction of features effectively used)
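The excerpt is cut off right after the parameter grid is defined. As a hedged illustration only (it is not part of the original snippet, and X_train_pca, y_train, SVC, GridSearchCV and t0 are assumed from the surrounding code), the step these comments describe would typically look like this:

# Illustrative continuation (assumed): exhaustively try the 5 x 6 = 30
# C/gamma combinations with an RBF-kernel SVM and report the best estimator.
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)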
Example #50
0
class MKSHomogenizationModel(BaseEstimator):
    """
    The `MKSHomogenizationModel` takes in microstructures and their
    associated macroscopic property, and creates a low dimensional structure
    property linkage. The `MKSHomogenizationModel` model is designed to
    integrate with dimensionality reduction techniques and predictive models.

    Attributes:
        degree: Degree of the polynomial used by
            `property_linker`.
        n_components: Number of components used by `dimension_reducer`.
        dimension_reducer: Instance of a dimensionality reduction class.
        property_linker: Instance of class that maps materials property to the
            microstructures.
        correlations: spatial correlations to be computed
        basis: instance of a basis class
        reduced_fit_data: Low dimensionality representation of spatial
            correlations used to fit the model.
        reduced_predict_data: Low dimensionality representation of spatial
            correlations predicted by the model.

    Below is an example of using MKSHomogenizationModel to predict (or
    classify) the type of microstructure using PCA and Logistic Regression.

    >>> n_states = 3
    >>> domain = [-1, 1]

    >>> from pymks.bases import LegendreBasis
    >>> leg_basis = LegendreBasis(n_states=n_states, domain=domain)
    >>> from sklearn.decomposition import PCA
    >>> from sklearn.linear_model import LogisticRegression
    >>> reducer = PCA(n_components=3)
    >>> linker = LogisticRegression()
    >>> model = MKSHomogenizationModel(
    ...     basis=leg_basis, dimension_reducer=reducer, property_linker=linker)
    >>> from pymks.datasets import make_cahn_hilliard
    >>> X0, X1 = make_cahn_hilliard(n_samples=50)
    >>> y0 = np.zeros(X0.shape[0])
    >>> y1 = np.ones(X1.shape[0])

    >>> X = np.concatenate((X0, X1))
    >>> y = np.concatenate((y0, y1))

    >>> model.fit(X, y)

    >>> X0_test, X1_test = make_cahn_hilliard(n_samples=3)
    >>> y0_test = model.predict(X0_test)
    >>> y1_test = model.predict(X1_test)
    >>> assert np.allclose(y0_test, [0, 0, 0])
    >>> assert np.allclose(y1_test, [1, 1, 1])
    """
    def __init__(self,
                 basis=None,
                 dimension_reducer=None,
                 n_components=None,
                 property_linker=None,
                 degree=1,
                 correlations=None,
                 compute_correlations=True):
        """
        Create an instance of a `MKSHomogenizationModel`.

        Args:
            basis (class, optional): an instance of a bases class.
            dimension_reducer (class, optional): an instance of a
                dimensionality reduction class with a fit_transform method.
            property_linker (class, optional): an instance for a machine
                learning class with fit and predict methods.
            n_components (int, optional): number of components kept by the
                dimension_reducer
            degree (int, optional): degree of the polynomial used by
                property_linker.
            correlations (list, optional): list of spatial correlations to
                compute, default is the autocorrelation with the first local
                state and all of its cross correlations. For example if basis
                has n_states=3, correlation would be [(0, 0), (0, 1), (0, 2)]
            compute_correlations (boolean, optional): If false spatial
                correlations will not be calculated as part of the fit and
                predict methods. The spatial correlations can be passed as `X`
                to both methods, default is True.
        """

        self.basis = basis
        self.dimension_reducer = dimension_reducer
        if self.dimension_reducer is None:
            self.dimension_reducer = RandomizedPCA()
        if n_components is None:
            n_components = self.dimension_reducer.n_components
        if n_components is None:
            n_components = 2
        if property_linker is None:
            property_linker = LinearRegression()
        if correlations is None and basis is not None:
            if compute_correlations is True:
                correlations = [(0, l) for l in range(basis.n_states)]
        self._linker = Pipeline([('poly', PolynomialFeatures(degree=degree)),
                                 ('connector', property_linker)])
        self.degree = degree
        self.n_components = n_components
        self.property_linker = property_linker
        self.correlations = correlations
        self._check_methods()  # validate that the reducer and linker expose the needed methods
        self._fit = False
        self.compute_correlations = compute_correlations
        self.reduced_fit_data = None
        self.reduced_predict_data = None

    @property
    def n_components(self):
        return self._n_components

    @n_components.setter
    def n_components(self, value):
        """Setter for the number of components using by the dimension_reducer
        """
        self._n_components = value
        self.dimension_reducer.n_components = value

    @property
    def degree(self):
        return self._degree

    @degree.setter
    def degree(self, value):
        """Setter for the polynomial degree for property_linker.
        """
        self._degree = value
        self._linker.set_params(poly__degree=value)

    @property
    def property_linker(self):
        return self._property_linker

    @property_linker.setter
    def property_linker(self, prop_linker):
        """Setter for the property_linker class.
        """
        self._property_linker = prop_linker
        self._linker.set_params(connector=prop_linker)

    def _check_methods(self):
        """
        Helper function to check that the dimensionality reduction and
        property linking methods have the appropriate methods.
        """
        if not callable(getattr(self.dimension_reducer, "fit_transform",
                                None)):
            raise RuntimeError(
                "dimension_reducer does not have fit_transform() method.")
        if not callable(getattr(self.dimension_reducer, "transform", None)):
            raise RuntimeError(
                "dimension_reducer does not have transform() method.")
        if not callable(getattr(self._property_linker, "fit", None)):
            raise RuntimeError("property_linker does not have fit() method.")
        if not callable(getattr(self._property_linker, "predict", None)):
            raise RuntimeError(
                "property_linker does not have predict() method.")

    def fit(self,
            X,
            y,
            reduce_labels=None,
            periodic_axes=None,
            confidence_index=None,
            size=None):
        """
        Fits data by calculating 2-point statistics from X, performing
        dimension reduction using dimension_reducer, and fitting the reduced
        data with the property_linker.

        Args:
            X (ND array): The microstructures or spatial correlations, a
                `(n_samples, n_x, ...)` shaped array where `n_samples` is the
                number of samples and `n_x` is the spatial discretization.
            y (1D array): The material property associated with `X`.
            reduce_labels (1D array, optional): labels for X used during the
                fit_transform method for the `dimension_reducer`.
            periodic_axes (list, optional): axes that are periodic. (0, 2)
                would indicate that axes x and z are periodic in a 3D
                microstructure.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Example

        >>> from sklearn.decomposition import PCA
        >>> from sklearn.linear_model import LinearRegression
        >>> from pymks.bases import PrimitiveBasis
        >>> from pymks.stats import correlate

        >>> reducer = PCA(n_components=2)
        >>> linker = LinearRegression()
        >>> prim_basis = PrimitiveBasis(n_states=2, domain=[0, 1])
        >>> correlations = [(0, 0), (1, 1), (0, 1)]
        >>> model = MKSHomogenizationModel(prim_basis,
        ...                                dimension_reducer=reducer,
        ...                                property_linker=linker,
        ...                                correlations=correlations)
        >>> np.random.seed(99)
        >>> X = np.random.randint(2, size=(3, 15))
        >>> y = np.array([1, 2, 3])
        >>> model.fit(X, y)
        >>> X_ = prim_basis.discretize(X)
        >>> X_stats = correlate(X_)
        >>> X_reshaped = X_stats.reshape((X_stats.shape[0], X_stats[0].size))
        >>> X_pca = reducer.fit_transform(X_reshaped - np.mean(X_reshaped,
        ...                               axis=1)[:, None])
        >>> assert np.allclose(model.reduced_fit_data, X_pca)

        Now let's use the same method with spatial correlations instead of
        microstructures.

        >>> from sklearn.decomposition import PCA
        >>> from sklearn.linear_model import LinearRegression
        >>> from pymks.bases import PrimitiveBasis
        >>> from pymks.stats import correlate

        >>> reducer = PCA(n_components=2)
        >>> linker = LinearRegression()
        >>> prim_basis = PrimitiveBasis(n_states=2, domain=[0, 1])
        >>> correlations = [(0, 0), (1, 1), (0, 1)]
        >>> model = MKSHomogenizationModel(dimension_reducer=reducer,
        ...                                property_linker=linker,
        ...                                compute_correlations=False)
        >>> np.random.seed(99)
        >>> X = np.random.randint(2, size=(3, 15))
        >>> y = np.array([1, 2, 3])
        >>> X_ = prim_basis.discretize(X)
        >>> X_stats = correlate(X_, correlations=correlations)
        >>> model.fit(X_stats, y)
        >>> X_reshaped = X_stats.reshape((X_stats.shape[0], X_stats[0].size))
        >>> X_pca = reducer.fit_transform(X_reshaped - np.mean(X_reshaped,
        ...                               axis=1)[:, None])
        >>> assert np.allclose(model.reduced_fit_data, X_pca)


        """
        if self.compute_correlations is True:
            if periodic_axes is None:
                periodic_axes = []
            if size is not None:
                new_shape = (X.shape[0], ) + size
                X = X.reshape(new_shape)
            X = self._correlate(X, periodic_axes, confidence_index)
        X_reshape = self._reduce_shape(X)
        X_reduced = self.dimension_reducer.fit_transform(
            X_reshape, reduce_labels)
        self._linker.fit(X_reduced, y)
        self.reduced_fit_data = X_reduced
        self._fit = True

    def predict(self, X, periodic_axes=None, confidence_index=None):
        """Predicts macroscopic property for the microstructures `X`.

        Args:
            X (ND array): The microstructure, an `(n_samples, n_x, ...)`
                shaped array where `n_samples` is the number of samples and
                `n_x` is the spatial discretization.
            periodic_axes (list, optional): axes that are periodic. (0, 2)
                would indicate that axes x and z are periodic in a 3D
                microstructure.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Returns:
            The predicted macroscopic property for `X`.

        Example

        >>> from sklearn.manifold import LocallyLinearEmbedding
        >>> from sklearn.linear_model import BayesianRidge
        >>> from pymks.bases import PrimitiveBasis
        >>> np.random.seed(99)
        >>> X = np.random.randint(2, size=(50, 100))
        >>> y = np.random.random(50)
        >>> reducer = LocallyLinearEmbedding()
        >>> linker = BayesianRidge()
        >>> prim_basis = PrimitiveBasis(2, domain=[0, 1])
        >>> model = MKSHomogenizationModel(prim_basis, n_components=2,
        ...                                dimension_reducer=reducer,
        ...                                property_linker=linker)
        >>> model.fit(X, y)
        >>> X_test = np.random.randint(2, size=(1, 100))

        Predict with microstructures

        >>> y_pred = model.predict(X_test)

        Predict with spatial correlations

        >>> from pymks.stats import correlate
        >>> model.compute_correlations = False
        >>> X_ = prim_basis.discretize(X_test)
        >>> X_corr = correlate(X_, correlations=[(0, 0), (0, 1)])
        >>> y_pred_stats = model.predict(X_corr)
        >>> assert y_pred_stats == y_pred

        """
        if not self._fit:
            raise RuntimeError('fit() method must be run before predict().')
        if self.compute_correlations is True:
            if periodic_axes is None:
                periodic_axes = []
            X = self._correlate(X, periodic_axes, confidence_index)
        X_reshape = self._reduce_shape(X)
        X_reduced = self.dimension_reducer.transform(X_reshape)
        self.reduced_predict_data = X_reduced
        return self._linker.predict(X_reduced)

    def _correlate(self, X, periodic_axes, confidence_index):
        """
        Helper function used to calculated 2-point statistics from `X` and
        reshape them appropriately for fit and predict methods.

        Args:
            X (ND array): The microstructure, an `(n_samples, n_x, ...)`
                shaped array where `n_samples` is the number of samples and
                `n_x` is the spatial discretization.
            periodic_axes (list, optional): axes that are periodic. (0, 2)
                would indicate that axes x and z are periodic in a 3D
                microstructure.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Returns:
            Spatial correlations for each sample formatted with dimensions
            (n_samples, n_features).

        Example

        >>> from sklearn.manifold import Isomap
        >>> from sklearn.linear_model import ARDRegression
        >>> from pymks.bases import PrimitiveBasis
        >>> reducer = Isomap()
        >>> linker = ARDRegression()
        >>> prim_basis = PrimitiveBasis(2, [0, 1])
        >>> model = MKSHomogenizationModel(prim_basis, reducer, linker)
        >>> X = np.array([[0, 1],
        ...               [1, 0]])
        >>> X_stats = model._correlate(X, [], None)
        >>> X_test = np.array([[[ 0, 0],
        ...                     [0.5, 0]],
        ...                    [[0, 1,],
        ...                     [0.5, 0]]])
        >>> assert np.allclose(X_test, X_stats)
        """
        if self.basis is None:
            raise AttributeError('basis must be specified')
        X_ = self.basis.discretize(X)
        X_stats = correlate(X_,
                            periodic_axes=periodic_axes,
                            confidence_index=confidence_index,
                            correlations=self.correlations)
        return X_stats

    def _reduce_shape(self, X_stats):
        """
        Helper function used to reshape 2-point statistics appropriately for
        fit and predict methods.

        Args:
            `X_stats`: The discretized microstructure function, an
                `(n_samples, n_x, ..., n_states)` shaped array
                where `n_samples` is the number of samples, `n_x` is the
                spatial discretization, and n_states is the number of local
                states.

        Returns:
            Spatial correlations for each sample formatted with dimensions
            (n_samples, n_features).

        Example
        >>> X_stats = np.zeros((2, 2, 2, 2))
        >>> X_stats[1] = 3.
        >>> X_stats[..., 1] = 1.
        >>> X_results = np.array([[-.5, .5, -.5, .5, -.5, .5, -.5,  0.5],
        ...                       [1., -1., 1., -1., 1., -1., 1., -1.]])
        >>> from pymks import PrimitiveBasis
        >>> prim_basis = PrimitiveBasis(2)
        >>> model = MKSHomogenizationModel(prim_basis)
        >>> assert np.allclose(X_results, model._reduce_shape(X_stats))
        """
        X_reshaped = X_stats.reshape((X_stats.shape[0], X_stats[0].size))
        return X_reshaped - np.mean(X_reshaped, axis=1)[:, None]

    def score(self, X, y, periodic_axes=None, confidence_index=None):
        """
        The score function for the MKSHomogenizationModel. It formats the
        data and uses the score method from the property_linker.

        Args:
            X (ND array): The microstructure, an `(n_samples, n_x, ...)`
                shaped array where `n_samples` is the number of samples and
                `n_x` is the spatial discretization.
            y (1D array): The material property associated with `X`.
            periodic_axes (list, optional): axes that are periodic. (0, 2)
                would indicate that axes x and z are periodic in a 3D
                microstructure.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Returns:
             Score for MKSHomogenizationModel from the selected
             property_linker.
        """
        if periodic_axes is None:
            periodic_axes = []
        if not callable(getattr(self._linker, "score", None)):
            raise RuntimeError("property_linker does not have score() method.")
        X_corr = self._correlate(X, periodic_axes, confidence_index)
        X_reshaped = self._reduce_shape(X_corr)
        X_reduced = self.dimension_reducer.transform(X_reshaped)
        return self._linker.score(X_reduced, y)
Example #51
0
except IndexError:
    print "Please specify trainingfile.csv testingfile.csv NumComponents"
    sys.exit(1)

traindf = pandas.read_csv(trainfile)
testdf = pandas.read_csv(testfile)
columns = ["tfidfpca_%s" % x for x in xrange(ncomponents)]

trainCleanEssay = traindf.essay.str.decode('mac-roman')
testCleanEssay = testdf.essay.str.decode('mac-roman')

vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
trainvec = vectorizer.fit_transform(trainCleanEssay)
testvec = vectorizer.transform(testCleanEssay)

pca = RandomizedPCA(n_components=ncomponents)
pca.fit(trainvec)
trainpca = pca.transform(trainvec)
trainpcadf = pandas.DataFrame(trainpca, columns=columns)
testpca = pca.transform(testvec)
testpcadf = pandas.DataFrame(testpca, columns=columns)

traindf = traindf.combine_first(trainpcadf)
testdf = testdf.combine_first(testpcadf)

nf = lambda x: os.path.splitext(os.path.basename(x))[0] + "_tfidf.csv"
traindf.to_csv(nf(trainfile))
testdf.to_csv(nf(testfile))

print "+".join(columns)
Example #52
0
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

#use pca
from sklearn.decomposition import RandomizedPCA
n_components = 1
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(features_train)
pca_train = pca.transform(features_train)
pca_test = pca.transform(features_test)

#use pca train data
# clf.fit(pca_train,labels_train)
# print clf.score(pca_test,labels_test)

clf.fit(features_train, labels_train)
print clf.score(features_test, labels_test)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
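The comments at the top of this example note that tester.py evaluates with stratified shuffle split cross validation. Purely as a hedged sketch of that evaluation style (using the same deprecated sklearn.cross_validation module as the snippet; clf, features and labels are assumed from the surrounding code, and the fold count is illustrative):

# Hedged sketch (assumed, not part of the original script): average the score
# of clf over many stratified shuffle splits of the small dataset.
from sklearn.cross_validation import StratifiedShuffleSplit
cv = StratifiedShuffleSplit(labels, n_iter=100, test_size=0.3, random_state=42)
scores = []
for train_idx, test_idx in cv:
    clf.fit([features[i] for i in train_idx], [labels[i] for i in train_idx])
    scores.append(clf.score([features[i] for i in test_idx],
                            [labels[i] for i in test_idx]))
print sum(scores) / len(scores)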
Example #53
0
                                        pre_dispatch=1))])
print '\nGridSearchCV finished\n', clf

pca.fit(finance_features)
print '\npca.explained_variance_ratio_', pca.explained_variance_ratio_, '\n'

print 'CLF', clf

#print '\nBest estimator:', clf.best_estimator_

# extraction of components to plot them
print '\npca.explained_variance_ratio_', pca.explained_variance_ratio_, '\n'
financial_pc1 = pca.components_[0]
financial_pc2 = pca.components_[1]

transformed_data = pca.transform(features)
for ii, jj in zip(transformed_data, features):
    plt.scatter(financial_pc1[0] * ii[0], financial_pc1[1] * ii[0], color='r')
    plt.scatter(financial_pc2[0] * ii[0], financial_pc2[1] * ii[0], color='c')
    plt.scatter(jj[0], jj[1], color="b")

plt.xlabel("bonus")
plt.ylabel("long-term incentive")
plt.show()

clf.fit(features, labels)

########################################################################
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
Example #54
0
# and an array with ID of the people on each image y
X = np.zeros([NUM_TRAINIMAGES, IMG_RES], dtype='int8')
y = []

# Populate training array with flattened images from subfolders of train_faces and names
c = 0
for x, folder in enumerate(folders):
    train_faces = glob.glob(folder + '/*')
    for i, face in enumerate(train_faces):
        X[c, :] = prepare_image(face)
        y.append(ID_from_filename(face))
        c = c + 1

# perform principal component analysis on the images
pca = RandomizedPCA(n_components=NUM_EIGENFACES, whiten=True).fit(X)
X_pca = pca.transform(X)

while 1:
    r = ""

    # load test faces (usually one), located in folder test_faces
    test_faces = glob.glob('test_faces/*')

    # Create an array with flattened images X
    X = np.zeros([len(test_faces), IMG_RES], dtype='int8')

    # Populate test array with flattened images from test_faces
    for i, face in enumerate(test_faces):
        img = cv2.imread(face)
        if img is not None:
            X[i, :] = prepare_image(face)
def feature_reduction_pca(full_features):
    pca = RandomizedPCA(n_components=8, whiten=True).fit(full_features)
    return pca.transform(full_features)
Example #56
0
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

n_components = 150

print("extracting the top %d eigenfaces from the %d faces" %
      (n_components, x_train.shape[0]))

start_time = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(x_train)
print("运行 %0.3fs" % (time() - start_time))

eigenfaces = pca.components_.reshape((n_components, h, w))

print("将输入数据降维")
start_time = time()
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)
print("运行 %0.3fs" % (time() - start_time))

print("分类数据集的拟合")
start_time = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight="balanced"), param_grid)
clf = clf.fit(x_train_pca, y_train)
print("运行 %0.3fs" % (time() - start_time))

print("grid search 最佳估计:")
print(clf.best_estimator_)
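
# Hedged continuation (not in the original snippet): the tuned model would
# typically be evaluated on the held-out, PCA-projected test data.
from sklearn.metrics import classification_report
y_pred = clf.predict(x_test_pca)
print(classification_report(y_test, y_pred))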
Example #57
0
n_components = 10
cv2.destroyAllWindows()
pca = RandomizedPCA(n_components=n_components, whiten=True)

param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)

testing_data = []
for i in range(len(images)):
    testing_data.append(images[i].flatten())
pca = pca.fit(testing_data)

transformed = pca.transform(testing_data)
clf.fit(transformed, labels)

image_paths = [
    os.path.join(directory, filename) for filename in os.listdir(directory)
    if filename.endswith('sad')
]
for image_path in image_paths:
    pred_image_pil = Image.open(image_path).convert('L')
    pred_image = np.array(pred_image_pil, 'uint8')
    faces = faceCascade.detectMultiScale(pred_image)
    for (x, y, w, h) in faces:
        X_test = pca.transform(
            np.array(pred_image[y:y + col, x:x + row]).flatten())
        mynbr = clf.predict(X_test)
        nbr_act = int(
Example #58
0
def main():
    """
    CLI Arguments allowed:
        --display_graphs       Displays graphs
        --retrain              Trains a new model
        --cross-validate       Runs cross validation to fine tune the model
        --test=validation_set  Tests the latest trained model against the validation set
        --test=test_set        Tests the latets trained model against the test set
    """

    global trainer, classifier
    inputs_train, targets_train, inputs_valid, targets_valid, inputs_test, targets_test = load_parsed_data()

    if '--display_graphs' in sys.argv:
        display_graphs = True

    print('using {} percent of all data in corpus'.format(PERCENTAGE_DATA_SET_TO_USE*100))
    print('using {} most common words as features'.format(NUM_FEATURES))

    if not trained_model_exists() or '--retrain' in sys.argv:
        train_features, valid_features, test_features = extract_features(
            inputs_train[:int(len(inputs_train)*PERCENTAGE_DATA_SET_TO_USE)],
            targets_train[:int(len(targets_train)*PERCENTAGE_DATA_SET_TO_USE)],
            inputs_valid[:int(len(inputs_valid)*PERCENTAGE_DATA_SET_TO_USE)],
            targets_valid[:int(len(targets_valid)*PERCENTAGE_DATA_SET_TO_USE)],
            inputs_test[:int(len(inputs_test)*PERCENTAGE_DATA_SET_TO_USE)],
            targets_test[:int(len(targets_test)*PERCENTAGE_DATA_SET_TO_USE)]
        )

        save_features(train_features, valid_features, test_features)
        pca = RandomizedPCA(n_components=N_COMPONENTS, whiten=False).fit(train_features)
        save_pca(pca)
        print ("Saved PCA")

        X_train = pca.transform(train_features)
        X_valid = pca.transform(valid_features)
        pca = None
        print ("Created PCAd features")

        valid_data = ClassificationDataSet(N_COMPONENTS, target=1, nb_classes=2)
        for i in range(len(X_valid)):
            valid_data.addSample(X_valid[i], targets_valid[i])
        valid_data._convertToOneOfMany()
        X_valid = None

        train_data = ClassificationDataSet(N_COMPONENTS, target=1, nb_classes=2)
        for i in range(len(X_train)):
            train_data.addSample( X_train[i], targets_train[i])
        train_data._convertToOneOfMany()
        X_train = None

        classifier = buildNetwork( train_data.indim, N_HIDDEN, train_data.outdim, outclass=SoftmaxLayer)
        trainer = BackpropTrainer( classifier, dataset=train_data, momentum=0.1, learningrate=0.01 , verbose=True)
        train_model(train_data, valid_data)

        save_model(classifier)
        train_data = None
        valid_data = None

    else:
        train_features, valid_features, test_features = load_features()
        pca = load_pca()
        X_train = pca.transform(train_features)

        pca = None
        print ("Created PCAd features")

        train_data = ClassificationDataSet(N_COMPONENTS, target=1, nb_classes=2)
        for i in range(len(X_train)):
            train_data.addSample( X_train[i], targets_train[i])
        train_data._convertToOneOfMany()
        X_train = None

        classifier = load_trained_model()
        trainer = BackpropTrainer( classifier, dataset=train_data, momentum=0.1, learningrate=0.01 , verbose=True)


    if '--test=validation_set' in sys.argv:
        print ("Running against validation set")
        pca = load_pca()
        X_valid = pca.transform(valid_features)
        pca = None
        valid_data = ClassificationDataSet(N_COMPONENTS, target=1, nb_classes=2)
        for i in range(len(X_valid)):
            valid_data.addSample(X_valid[i], targets_valid[i])
        valid_data._convertToOneOfMany()
        X_valid = None

        make_prediction(valid_data)


    if '--test=test_set' in sys.argv:
        print ("Running against test set")
        pca = load_pca()
        X_test = pca.transform(test_features)
        pca = None
        test_data = ClassificationDataSet(N_COMPONENTS, target=1, nb_classes=2)
        for i in range(len(X_test)):
            test_data.addSample( X_test[i], targets_test[i])
        test_data._convertToOneOfMany()
        y_pred = trainer.testOnClassData(dataset=test_data)
        plot_precision_and_recall(y_pred, targets_test[:int(len(targets_test) * PERCENTAGE_DATA_SET_TO_USE)])
        X_test = None

        make_prediction(test_data)
Example #59
0
def main():
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

    # Download the data, if not already on disk and load it as numpy arrays
    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

    # introspect the images arrays to find the shapes (for plotting)
    n_samples, h, w = lfw_people.images.shape
    np.random.seed(42)

    # for machine learning we use the data directly (as relative pixel
    # position info is ignored by this model)
    X = lfw_people.data
    n_features = X.shape[1]

    # the label to predict is the id of the person
    y = lfw_people.target
    target_names = lfw_people.target_names
    n_classes = target_names.shape[0]

    print("Total dataset size:")
    print("n_samples: %d" % n_samples)
    print("n_features: %d" % n_features)
    print("n_classes: %d" % n_classes)

    # Split into a training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)

    def xprint(*args, **kwargs):
        pass

    N_COMPONENTS = [10, 15, 25, 50, 100, 250]
    for n_components in N_COMPONENTS:
        # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
        # dataset): unsupervised feature extraction / dimensionality reduction

        xprint("Extracting the top %d eigenfaces from %d faces" %
               (n_components, X_train.shape[0]))
        t0 = time()
        pca = RandomizedPCA(n_components=n_components,
                            whiten=True).fit(X_train)
        xprint("done in %0.3fs" % (time() - t0))

        eigenfaces = pca.components_.reshape((n_components, h, w))

        xprint("Projecting the input data on the eigenfaces orthonormal basis")
        t0 = time()
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
        xprint("done in %0.3fs" % (time() - t0))

        # Train a SVM classification model

        xprint("Fitting the classifier to the training set")
        t0 = time()
        param_grid = {
            'C': [1e3, 5e3, 1e4, 5e4, 1e5],
            'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
        }
        # for sklearn version 0.16 or prior,
        # the class_weight parameter value is 'auto'
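        # Hedged sketch (assumption, not in the original): the value could be
        # picked from the installed version instead of hard-coding it, e.g.
        #   import sklearn
        #   from distutils.version import LooseVersion
        #   cw = 'auto' if LooseVersion(sklearn.__version__) < LooseVersion('0.17') else 'balanced'
        #   clf = GridSearchCV(SVC(kernel='rbf', class_weight=cw), param_grid)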
        clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                           param_grid)
        clf = clf.fit(X_train_pca, y_train)
        xprint("done in %0.3fs" % (time() - t0))
        xprint("Best estimator found by grid search:")
        xprint(clf.best_estimator_)

        # Quantitative evaluation of the model quality on the test set

        xprint("Predicting the people names on the testing set")
        t0 = time()
        y_pred = clf.predict(X_test_pca)
        xprint("done in %0.3fs" % (time() - t0))

        print(n_components,
              classification_report(y_test, y_pred, target_names=target_names))

    return

    # Qualitative evaluation of the predictions using matplotlib

    def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
        """Helper function to plot a gallery of portraits"""
        pl.figure(figsize=(1.8 * n_col, 2.4 * n_row))
        pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
        for i in range(n_row * n_col):
            pl.subplot(n_row, n_col, i + 1)
            pl.imshow(images[i].reshape((h, w)), cmap=pl.cm.gray)
            pl.title(titles[i], size=12)
            pl.xticks(())
            pl.yticks(())

    # plot the result of the prediction on a portion of the test set

    def title(y_pred, y_test, target_names, i):
        pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
        true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
        return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)

    prediction_titles = [
        title(y_pred, y_test, target_names, i) for i in range(y_pred.shape[0])
    ]

    plot_gallery(X_test, prediction_titles, h, w)

    # plot the gallery of the most significant eigenfaces

    eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
    plot_gallery(eigenfaces, eigenface_titles, h, w)

    pl.show()
Example #60
0

pca = RandomizedPCA(n_components=5)
train_x = pca.fit_transform(data)

ones = np.ones(23)
zeros = np.zeros(33)

train_y = np.concatenate((ones,zeros))


knn = KNeighborsClassifier()
knn.fit(train_x, train_y)

# Test images
img_dir_test = "image_classification/test/"

images_test = [img_dir_test+ f for f in os.listdir(img_dir_test)]

data_test = []
for image in images_test:
    img = img_to_matrix(image)
    img = flatten_image(img)
    data_test.append(img)

test_x = pca.transform(data_test)

knn.predict(test_x)

pd.crosstab(train_y, knn.predict(train_x), rownames=['Actual'], colnames=['Predicted'])