def get_olivetti_faces():
    """Load the Olivetti faces and return them as ``(data, target)``.

    The pixel matrix is cast to ``float32`` and the label vector to
    ``int32``; both casts are also stored back on the fetched dataset
    object before its fields are returned.
    """
    dataset = fetch_olivetti_faces()
    pixels = dataset.data.astype(np.float32)
    labels = dataset.target.astype(np.int32)
    dataset.data = pixels
    dataset.target = labels
    return dataset.data, dataset.target
def load_data(train_num, train_repeat):
    """Split the Olivetti face images into stratified train/test sets.

    Args:
        train_num: Number out of 10 that determines the training share;
            the held-out fraction is ``(10 - train_num) / 10``.
        train_repeat: When greater than 1, every training sample (and its
            label) is repeated this many times.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    held_out_fraction = (10. - train_num) / 10
    faces = fetch_olivetti_faces()
    images, labels = faces.images, faces.target

    # Fixed seed and stratification keep the split reproducible and
    # balanced across the label values.
    split = train_test_split(images, labels,
                             test_size=held_out_fraction,
                             random_state=3,
                             stratify=labels)
    X_train, X_test, y_train, y_test = split

    if train_repeat > 1:
        X_train = X_train.repeat(train_repeat, axis=0)
        y_train = y_train.repeat(train_repeat)

    return X_train, y_train, X_test, y_test
def main():
    """Reduce the Olivetti face images to a low-dimensional representation.

    The first 100 images are used as training data and the remaining
    images as test data; both are handed to ``reduceDim``.
    """
    nComponents = 50

    # Import a dataset for testing
    images = data.fetch_olivetti_faces().images
    trainData, testData = images[:100, :, :], images[100:, :, :]

    # Produce a low dimensional representation
    lowDimTrainData, lowDimTestData = reduceDim(trainData, testData,
                                                nComponents)
def _dump_olivetti_batches(out_dir, samples, labels):
    """Write ``batches.p`` plus one sample file and one label file per batch."""
    # Tolerate an existing directory so the export can be re-run.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    batch_ids = list(range(len(samples)))
    with open(os.path.join(out_dir, "batches.p"), "wb") as fh:
        m.dump(array(batch_ids), fh)

    for batch_id, sample, label in zip(batch_ids, samples, labels):
        suffix = str(batch_id)
        with open(os.path.join(out_dir, "bow_batch_" + suffix) + ".p",
                  "wb") as fh:
            m.dump(sample, fh)
        with open(os.path.join(out_dir, "class_indices_batch_" + suffix)
                  + ".p", "wb") as fh:
            m.dump(label, fh)


def data_processing_olivetti():
    """
    Python function for importing the Olivetti data set.

    Samples are walked in dataset order in groups of ten: the first seven
    of each group go to the training set and the last three to the test
    set.  Every sample becomes its own batch (a ``(1, n_features)`` array
    plus a one-element label array) and is serialised with ``m.dump``
    under ``output/train/bag_of_words`` and ``output/test/bag_of_words``.

    Unlike the previous version, every output file is closed after
    writing and an already existing output directory is reused instead of
    raising.
    """
    dataset = fetch_olivetti_faces()
    faces = dataset.data
    class_indices = dataset['target']

    train_set, train_class_indices = [], []
    test_set, test_class_indices = [], []

    for i, class_index in enumerate(class_indices):
        sample = faces[i].reshape((1, len(faces[i])))
        label = array([class_index])
        # Position inside the current group of ten decides the split.
        if i % 10 <= 6:
            train_set.append(sample)
            train_class_indices.append(label)
        else:
            test_set.append(sample)
            test_class_indices.append(label)

    _dump_olivetti_batches("output/train/bag_of_words",
                           train_set, train_class_indices)
    _dump_olivetti_batches("output/test/bag_of_words",
                           test_set, test_class_indices)
def get_data(dataset_name):
    """Return the feature matrix of the dataset called *dataset_name*.

    Supported names are ``'lfw_people'``, ``'20newsgroups'``,
    ``'olivetti_faces'``, ``'rcv1'``, ``'CIFAR'``, ``'SVHN'``,
    ``'low rank matrix'``, ``'uncorrelated matrix'`` and
    ``'big sparse matrix'``; any other name is forwarded to
    ``fetch_mldata``.

    Returns:
        The data matrix, or ``None`` when the CIFAR/SVHN folder check
        asks for the dataset to be skipped.

    The removed ``np.int`` alias (gone since NumPy 1.24) is replaced by
    the builtin ``int``, and the random sparse indices are cast to an
    integer dtype explicitly instead of being passed as floats.
    """
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
              for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        # NOTE(review): this branch compares against 0 while the CIFAR
        # branch compares against "skip" - confirm which value
        # handle_missing_dataset really returns.
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3)
              for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        X = make_low_rank_matrix(n_samples=500, n_features=int(1e4),
                                 effective_rank=100, tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        # Indices must be integers; truncate the uniform draws explicitly.
        row = np.random.uniform(0, small_size, sparsity).astype(np.intp)
        col = np.random.uniform(0, small_size, sparsity).astype(np.intp)
        X = sp.sparse.csr_matrix((data, (row, col)),
                                 shape=(size, small_size))
        del data
        del row
        del col
    else:
        # NOTE(review): fetch_mldata no longer exists in recent
        # scikit-learn releases; this fallback needs a replacement loader.
        X = fetch_mldata(dataset_name).data
    return X
def task4():
    """Find, for each of 10 principal components, the most influenced face.

    The shuffled Olivetti faces are projected onto 10 components.  For
    every sample/component pair an influence score is computed with the
    ``cos`` helper, and for each component the index of the sample with
    the largest score is collected, printed and passed to
    ``write_answer_4``.

    Uses ``range`` and ``print(...)`` so the function runs on Python 3 as
    well as Python 2.
    """
    data = fetch_olivetti_faces(shuffle=True, random_state=0).data
    model = RandomizedPCA(n_components=10)
    model.fit(data)
    data_new = model.transform(data)
    n_samples, n_comp = data_new.shape

    mean_components = [data_new[:, j].mean() for j in range(n_comp)]

    influence = np.zeros((n_samples, n_comp))
    for i in range(n_samples):
        for j in range(n_comp):
            influence[i, j] = cos(data_new[i, :], mean_components,
                                  np.abs(data_new[i, j]),
                                  mean_components[j])

    res = [np.argmax(influence[:, j]) for j in range(n_comp)]
    print(res)
    write_answer_4(res)
def __init__(self, batch_size, max_patches=50, patch_size=(20, 20), images_num=None, rng=None):
    """Build a patch dataset from the Olivetti face images.

    Args:
        batch_size: Stored as both the train and the test batch size.
        max_patches: Number of patches extracted from every image.
        patch_size: ``(height, width)`` of each patch.
        images_num: Number of images to use; all images when ``None``.
        rng: Random state for patch extraction; ``RandomState(12)`` when
            ``None``.

    Each patch is flattened into one row, labelled with the target of the
    image it came from and one-hot encoded with ``one_hot_encode``.  The
    train and test attributes reference the same arrays, so the in-place
    normalisation at the end is visible through both.
    """
    from sklearn import datasets as sklearn_datasets
    from sklearn.feature_extraction.image import extract_patches_2d

    self._train_batch_size = batch_size
    self._test_batch_size = batch_size

    if rng is None:
        rng = np.random.RandomState(12)
    faces = sklearn_datasets.fetch_olivetti_faces()
    if images_num is None:
        images_num = faces.images.shape[0]

    patch_len = patch_size[0]*patch_size[1]
    x_v = np.zeros((max_patches * images_num, patch_len))
    classes = np.zeros((max_patches * images_num,))

    for img_id, img in enumerate(faces.images):
        if img_id >= images_num:
            break
        first = img_id * max_patches
        last = (img_id+1) * max_patches
        patches = extract_patches_2d(img, patch_size,
                                     max_patches=max_patches,
                                     random_state=rng)
        x_v[first:last, :] = patches.reshape((max_patches, patch_len))
        classes[first:last] = faces.target[img_id]

    y_v = one_hot_encode(classes)
    test_prop = x_v.shape[0]/5

    self._xt_v = x_v
    self._yt_v = y_v
    self._x_v = x_v
    self._y_v = y_v
    self._i = 0

    # Standardise every pixel column in place, then scale by 0.1.
    self._x_v -= np.mean(self._x_v, axis=0)
    self._x_v /= np.std(self._x_v, axis=0)
    self._x_v *= 0.1
def init_features(self):
    """Make sure PCA feature coefficients exist, seeding Redis if needed.

    If ``self.feature_coef_`` is unset it is first looked up in Redis
    under ``"feature_coef"``.  When nothing is stored either, a PCA with
    ``self._n_components`` components is fitted on the Olivetti faces and
    the projection matrix, the projected features, the raw faces and one
    ``picture:<i>`` record per face are written to Redis.

    ``range`` and ``ndarray.tobytes`` replace the Python-2-only ``xrange``
    and the removed ``ndarray.tostring``; the bytes written are the same.
    """
    if self.feature_coef_ is not None:
        return

    # NOTE(review): the coefficients are written below with hmset (a
    # hash) but read here with get (a string) - confirm this lookup
    # works against a populated database.
    self.feature_coef_ = self.redis.get("feature_coef")
    if self.feature_coef_ is not None:
        return

    pca = PCA(self._n_components)
    test_faces = fetch_olivetti_faces()
    features = np.array(pca.fit_transform(test_faces.data),
                        dtype=np.float32)

    self.redis.set("name:0", "olivetti_faces")
    self.redis.set("name_id:olivetti_faces", 0)

    feature_coef = np.array(pca.components_.T, np.float64)
    dim1, dim2 = feature_coef.shape
    self.redis.hmset("feature_coef",
                     {"dim1":dim1, "dim2":dim2,
                      "data":feature_coef.tobytes()})

    test_features = [f.tobytes() for f in features]
    self.redis.rpush("features", *test_features)

    test_face_data = [np.array(f, dtype=np.float32).tobytes()
                      for f in test_faces.data]
    self.redis.rpush("faces", *test_face_data)

    for i in range(len(test_faces.data)):
        self.redis.hmset("picture:%d" % (i),
                         {"name_id":0, "pic_path":DUMMY_PATH})
    self.redis.set("last_pic_id", len(test_faces.data) - 1)
def OnlineLearningTest01():
    """Learn a dictionary of face patches online with MiniBatchKMeans.

    The Olivetti images are cycled over six times.  From every image 50
    random 20x20 patches are extracted; after each 10 images the buffered
    patches are standardised and fed to ``partial_fit``.  The 81 cluster
    centres are then shown in a 9x9 grid.

    Fixes over the previous version: ``print`` is called as a function,
    the y ticks are hidden (``xticks`` was called twice), the title
    reports the number of patches actually extracted, and the dead
    ``index = 1`` assignment is gone.
    """
    import time
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn import datasets
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.image import extract_patches_2d

    faces = datasets.fetch_olivetti_faces()

    print("Learning the dictionary...")
    rng = np.random.RandomState(0)
    kmeans = MiniBatchKMeans(n_clusters=81, random_state=rng, verbose=True)
    patch_size = (20, 20)
    n_passes = 6
    max_patches = 50
    n_images = n_passes * len(faces.images)

    buffer = []
    t0 = time.time()

    # Online learning
    index = 0
    for _ in range(n_passes):
        for img in faces.images:
            data = extract_patches_2d(img, patch_size,
                                      max_patches=max_patches,
                                      random_state=rng)
            data = np.reshape(data, (len(data), -1))
            buffer.append(data)
            index += 1
            if index % 10 == 0:
                # Merge the buffered patch arrays into one matrix.
                data = np.concatenate(buffer, axis=0)
                # Standardise before learning.
                data -= np.mean(data, axis=0)
                data /= np.std(data, axis=0)
                # Each chunk is learned with a partial_fit call.
                kmeans.partial_fit(data)
                buffer = []
            if index % 100 == 0:
                print("Partial fit of %4i out of %i" % (index, n_images))

    dt = time.time() - t0
    print("done in %.2fs. " % dt)

    # plot result
    plt.figure(figsize=(4.2, 4))
    for i, patch in enumerate(kmeans.cluster_centers_):
        plt.subplot(9, 9, i + 1)
        plt.imshow(patch.reshape(patch_size), cmap=plt.cm.gray,
                   interpolation="nearest")
        plt.xticks(())
        plt.yticks(())

    plt.suptitle('Patches of faces\nTrain time %.1fs on %d patches'
                 % (dt, max_patches * n_images), fontsize=16)
    plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
    plt.show()
from numpy.random import RandomState import matplotlib.pyplot as plt import numpy as np from sklearn.datasets import fetch_olivetti_faces from sklearn import decomposition from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.cross_validation import ShuffleSplit # -- Prepare data and define utility functions --------------------------------- image_shape = (64, 64) rng = RandomState(0) # Load faces data dataset = fetch_olivetti_faces(data_home='/tmp/',shuffle=True, random_state=rng) faces = dataset.data n_samples, n_features = faces.shape # global centering faces_centered = faces - faces.mean(axis=0, dtype=np.float64) print "Dataset consists of %d faces" % n_samples print "********************************" def plot_gallery(title, images,n_col,n_row): plt.figure(figsize=(2. * n_col, 2.26 * n_row)) plt.suptitle(title, size=16) for i, comp in enumerate(images):
def get_olive():
    """Fetch the Olivetti faces and return ``(data, target)``."""
    bunch = datasets.fetch_olivetti_faces()
    features, labels = bunch.data, bunch.target
    return features, labels
from sklearn.datasets import fetch_olivetti_faces
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

# Fit a PCA on the Olivetti faces and keep its basis, mean and rank.
data, target = fetch_olivetti_faces(return_X_y=True)
extractor = PCA(n_components=0.7)
per_face_features = extractor.fit_transform(data, target)
basis = extractor.components_
mean_face = extractor.mean_
rank = extractor.n_components_

# Norm of every basis vector.
vector_length = np.linalg.norm(basis, axis=1, ord=2)
print("the length of each vector in the basis is:", vector_length)

# Pairwise inner products between the basis vectors.
covariance_matrix = np.zeros(shape=(rank, rank))
for i, left in enumerate(basis):
    for j, right in enumerate(basis):
        covariance_matrix[i, j] = np.dot(left, right)
print("the inner product of each vector with another one in the basis is",
      covariance_matrix)

# To verify that each value in the coordinate vector equals the inner
# product of the basis vector and (X - X_mean), only the first sample is
# tested.
centered_first = data[0] - mean_face
difference_vector = [
    per_face_features[0, i] - np.dot(centered_first, basis[i])
    for i in range(rank)
]
print(
    "so the difference in result between this formula and the standard one in PCA is:",
    difference_vector)
import matplotlib.pyplot as plt
from sklearn import decomposition
from sklearn.datasets import fetch_olivetti_faces
from numpy.random import RandomState

n_row, n_col = 2, 3
n_components = n_row * n_col
image_shape = (64, 64)

dataset = fetch_olivetti_faces(shuffle=True, random_state=RandomState(0))
faces = dataset.data


def plot_gallery(title, images, n_col=n_col, n_row=n_row):
    """Show *images* in an ``n_row`` x ``n_col`` grid of grey-scale plots.

    Every image is reshaped to ``image_shape`` and drawn with a colour
    range symmetric around zero.
    """
    plt.figure(figsize=(2. * n_col, 2.26 * n_row))
    plt.suptitle(title, size=16)
    for slot, component in enumerate(images, 1):
        plt.subplot(n_row, n_col, slot)
        limit = max(component.max(), -component.min())
        picture = component.reshape(image_shape)
        plt.imshow(picture, cmap=plt.cm.gray, interpolation='nearest',
                   vmin=-limit, vmax=limit)
        plt.xticks(())
        plt.yticks(())
    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)


# (title, estimator) pairs to compare.
estimators = [
    ('Eigenfaces - PCA using randomized SVD',
     decomposition.PCA(n_components=6, whiten=True)),
    ('Non-negative components - NMF',
     decomposition.NMF(n_components=6, init='nndsvda', tol=5e-3)),
]
# __FILENAME__ = download_data
# (File-name marker; kept as a comment because executing it as an
# assignment would raise NameError on the undefined name.)
"""
Run this script to make sure data is cached in the appropriate
place on your computer.  The data are only a few megabytes,
but conference wireless is often not very reliable...
"""
import os
import sys
from sklearn import datasets

#------------------------------------------------------------
# Faces data: this will be stored in the scikit_learn_data
#  sub-directory of your home folder
faces = datasets.fetch_olivetti_faces()
print("Successfully fetched olivetti faces data")

#------------------------------------------------------------
# SDSS galaxy data: this will be stored in notebooks/datasets/data
# The path must be extended before the local packages can be imported.
sys.path.append(os.path.abspath('notebooks'))
from datasets import fetch_sdss_galaxy_mags
colors = fetch_sdss_galaxy_mags()
print("Successfully fetched SDSS galaxy data")

#------------------------------------------------------------
# SDSS filters & vega spectrum: stored in notebooks/figures/downloads
from figures.sdss_filters import fetch_filter, fetch_vega_spectrum
spectrum = fetch_vega_spectrum()
print("Successfully fetched vega spectrum")
#### Template-3 | FACE CLASSIFICATION ######################################### ############################################################################### import sklearn as sk from scipy.stats import sem import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import fetch_olivetti_faces from sklearn.svm import SVCfrom sklearn.svm import SVC from sklearn.cross_validation import train_test_split from sklearn.cross_validation import cross_val_score, KFold #### Get Data faces = fetch_olivetti_faces() ## 400 images of 40 person print(faces.DESCR) #### Data exploration print("faces dataset:",faces.keys()) print("images data dim:",faces.images.shape) print("numerical data dim:",faces.data.shape) print("Labels:",faces.target.shape) #### Data pre-processing - Data is b/w 0 to 1 so no scaling/transformation required print(np.max(faces.data)) print(np.min(faces.data)) print(np.mean(faces.data)) #### Print images def print_faces(images, target, top_n):
import numpy as np
import matplotlib.pylab as plt
import pylab
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_olivetti_faces

faces = fetch_olivetti_faces().data
print(faces.shape)  # there are 400 faces each of them is of 64x64=4096 pixels

fig = plt.figure(figsize=(5,5))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1,
                    hspace=0.05, wspace=0.05)

# plot 25 random faces
j = 1
np.random.seed(0)
for i in np.random.choice(range(faces.shape[0]), 25):
    face_image = np.reshape(faces[i,:], (64,64))
    ax = fig.add_subplot(5, 5, j, xticks=[], yticks=[])
    ax.imshow(face_image, cmap=plt.cm.bone, interpolation='nearest')
    j += 1
#plt.show()

# Standardise every pixel, then project onto the principal components.
n_comp =64
scaling_step = ('scaling', StandardScaler())
pca_step = ('pca', PCA(n_components=n_comp))
pipeline = Pipeline([scaling_step, pca_step])
faces_proj = pipeline.fit_transform(faces)
print(faces_proj.shape)  # (400, 64)

# Per-pixel mean and standard deviation learned by the scaler.
mean_face = np.reshape(pipeline.named_steps['scaling'].mean_, (64,64))
sd_face = np.reshape(np.sqrt(pipeline.named_steps['scaling'].var_), (64,64))

# Cumulative explained variance ratio of the components.
pylab.figure(figsize=(8, 6))
pylab.plot(np.cumsum(pipeline.named_steps['pca'].explained_variance_ratio_),
           linewidth=2)
pylab.grid()
pylab.axis('tight')
pylab.xlabel('n_components')
pylab.ylabel('cumulative explained_variance_ratio_')
pylab.show()

pylab.figure(figsize=(10,5))
from sklearn.datasets import fetch_olivetti_faces
from sklearn.datasets import fetch_lfw_people
from sklearn.datasets import get_data_home


def _prefetch_face_datasets():
    """Fetch both face datasets and report the data directory."""
    fetch_olivetti_faces()

    print("Loading Labeled Faces Data (~200MB)")
    fetch_lfw_people(min_faces_per_person=70, resize=0.4)
    print("=> Success!")

    data_dir = get_data_home()
    print("Data saved in %s" % data_dir)


if __name__ == "__main__":
    _prefetch_face_datasets()
from sklearn.datasets import fetch_olivetti_faces

OUTPUT_BASE_DIR = "/neurospin/brainomics/2014_pca_struct/Olivetti_faces"
if not os.path.exists(OUTPUT_BASE_DIR):
    os.makedirs(OUTPUT_BASE_DIR)

# Output files all live directly under the base directory.
(OUTPUT_DATASET_FILE,
 OUTPUT_TARGET_FILE,
 OUTPUT_VAR_FILE,
 OUTPUT_IMAGE_FILE) = tuple(
    os.path.join(OUTPUT_BASE_DIR, basename)
    for basename in ("X.npy", "y.npy", "pixel_var.png", "example.png"))

IM_SHAPE = (64, 64)

###############################################################################
# Load faces data
dataset = fetch_olivetti_faces()
faces = dataset.data
shape = faces.shape
n, p = shape

# global centering: subtract the per-pixel mean over all faces
pixel_mean = faces.mean(axis=0)
faces_centered_global = faces - pixel_mean

# local centering: subtract every face's own mean
local_centering = faces_centered_global.mean(axis=1).reshape(n, -1)
faces_centered_local = faces_centered_global - local_centering

print("Dataset shape: {s}".format(s=shape))

# Load ground truth (useful for cross validation)
y = dataset.target
# In[10]:

# Random line plot with the y ticks and the x tick labels hidden.
ax = plt.axes()
ax.plot(np.random.rand(50))
ax.yaxis.set_major_locator(plt.NullLocator())
ax.xaxis.set_major_formatter(plt.NullFormatter())


# In[11]:

from sklearn.datasets import fetch_olivetti_faces
faces = fetch_olivetti_faces().images

# 5x5 grid of faces without ticks or spacing between the panels.
fig, ax = plt.subplots(5, 5, figsize=(5, 5))
fig.subplots_adjust(hspace=0, wspace=0)

for i in range(5):
    for j in range(5):
        panel = ax[i, j]
        panel.xaxis.set_major_locator(plt.NullLocator())
        panel.yaxis.set_major_locator(plt.NullLocator())
        panel.imshow(faces[10*i+j], cmap='bone')


# In[18]:

from matplotlib.ticker import MultipleLocator, FormatStrFormatter
t = np.linspace(0, 100, 100)
s = 9.8*np.power(t,2)/2
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import numpy as np


# Next, we will retrieve the Olivetti faces dataset.

# In[2]:


from sklearn.datasets import fetch_olivetti_faces, fetch_lfw_people
from ipywidgets import interact
get_ipython().run_line_magic('matplotlib', 'inline')
image_shape = (64, 64)
# Load faces data
# data_home is passed by keyword: recent scikit-learn releases no longer
# accept it positionally.
dataset = fetch_olivetti_faces(data_home='./')
faces = dataset.data


# ### Advice for testing numerical algorithms
# Before we begin this week's assignment, here is some advice for writing
# functions that work with numerical data. It is useful for finding bugs in
# your implementation.
#
# Testing machine learning algorithms (or numerical algorithms in general)
# is sometimes really hard, because the answer depends on the dataset and
# you will never be able to test your algorithm on all the datasets
# in the world. Nevertheless, here are some tips to help you identify bugs
# in your implementations.
#
# #### 1. Test on small dataset
# Test your algorithms on a small dataset: datasets of size 1 or 2 will
# sometimes suffice. This is useful because you can (if necessary) compute
# the answers by hand and compare them with the answers produced by the
# program you wrote. In fact, these small datasets can even have special numbers,
def faces_decomposition():
    """Fit several decomposition estimators on the Olivetti faces and plot them.

    The shuffled faces are centred globally (per pixel) and locally (per
    image).  Each estimator in the list is fitted on either the centred
    or the raw data, its components (or cluster centres) are plotted as a
    gallery, and estimators that expose a non-scalar ``noise_variance_``
    additionally get a pixelwise-variance plot.
    """
    import logging
    from numpy.random import RandomState  # seeded random number generator
    import matplotlib.pyplot as plt
    from time import time
    from sklearn.datasets import fetch_olivetti_faces
    from sklearn.cluster import MiniBatchKMeans
    from sklearn import decomposition

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    n_row, n_col = 2, 3
    n_components = n_row * n_col
    image_shape = (64, 64)
    rng = RandomState(0)

    # Load the dataset
    dataset = fetch_olivetti_faces(shuffle=True, random_state=rng)
    faces = dataset.data
    n_samples, n_features = faces.shape

    # Global centring first, then per-image (local) centring.
    faces_centered = faces - faces.mean(axis=0)
    faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1)
    print("dataset consits of %d faces" % n_samples)  # number of samples

    def plot_gallery(title, images, n_col=n_col, n_row=n_row):
        """Draw *images* as a grid with a colour range symmetric around 0."""
        plt.figure(figsize=(2. * n_col, 2.26 * n_row))
        plt.suptitle(title, size=16)
        for position, component in enumerate(images, 1):
            plt.subplot(n_row, n_col, position)
            limit = max(component.max(), -component.min())
            plt.imshow(component.reshape(image_shape),
                       cmap=plt.cm.gray,
                       interpolation='nearest',
                       vmin=-limit, vmax=limit)
            plt.xticks(())
            plt.yticks(())
        plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)

    # (name, estimator, fit-on-centred-data) triples.
    estimators = [
        ('Eigenfaces - PCA using randomized SVD',
         decomposition.PCA(n_components=n_components,
                           svd_solver='randomized', whiten=True),
         True),
        ('Non-negative components - NMF',
         decomposition.NMF(n_components=n_components, init='nndsvda',
                           tol=5e-3),
         False),
        ('Independent components - FastICA',
         decomposition.FastICA(n_components=n_components, whiten=True),
         True),
        ('Sparse comp. - MiniBatchSparsePCA',
         decomposition.MiniBatchSparsePCA(n_components=n_components,
                                          alpha=0.8, n_iter=100,
                                          batch_size=3, random_state=rng),
         True),
        ('MiniBatchDictionaryLearning',
         decomposition.MiniBatchDictionaryLearning(n_components=15,
                                                   alpha=0.1, n_iter=50,
                                                   batch_size=3,
                                                   random_state=rng),
         True),
        ('Cluster centers - MiniBatchKMeans',
         MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20,
                         max_iter=50, random_state=rng),
         True),
        ('Factor Analysis components - FA',
         decomposition.FactorAnalysis(n_components=n_components,
                                      max_iter=2),
         True),
    ]

    # #########################################################################
    # Plot a sample of the input data
    plot_gallery("First centered Olivetti faces",
                 faces_centered[:n_components])

    # #########################################################################
    # Do the estimation and plot it
    for name, estimator, center in estimators:
        print("Extracting the top %d %s..." % (n_components, name))
        t0 = time()
        data = faces_centered if center else faces
        estimator.fit(data)
        train_time = (time() - t0)
        print("done in %0.3fs" % train_time)

        if hasattr(estimator, 'cluster_centers_'):
            components_ = estimator.cluster_centers_
        else:
            components_ = estimator.components_

        # Plot an image representing the pixelwise variance provided by
        # the estimator, e.g. its noise_variance_ attribute.  The
        # Eigenfaces estimator, via the PCA decomposition, also provides a
        # scalar noise_variance_ (the mean of pixelwise variance) that
        # cannot be displayed as an image, so it is skipped.
        has_noise = hasattr(estimator, 'noise_variance_')
        if has_noise and estimator.noise_variance_.ndim > 0:
            plot_gallery("Pixelwise variance",
                         estimator.noise_variance_.reshape(1, -1),
                         n_col=1, n_row=1)

        plot_gallery('%s - Train time %.1fs' % (name, train_time),
                     components_[:n_components])

    plt.show()
def _stack_images(filenames):
    """Load every file with ImageToMatrix, flatten it and join along axis 0."""
    flattened = [ImageToMatrix(name).ravel() for name in filenames]
    return np.concatenate(flattened, axis=0)


def _block_labels(n_classes, per_class):
    """Return float labels 1..n_classes, each repeated per_class times."""
    return np.repeat(np.arange(1, n_classes + 1, dtype=float), per_class)


def load_data(type=0, train_kfold=0):
    """Load one of five face datasets, shuffled and optionally split.

    Args:
        type: Dataset selector - 0: yale, 1: ar, 2: orl, 3: olivetti,
            4: imm.  Converted with ``int``.
        train_kfold: When ``<= 0`` the shuffled ``(x, y)`` is returned.
            Otherwise the first ``round(len(y) / train_kfold)`` shuffled
            samples form the test set and the rest the training set.

    Returns:
        ``(x, y)`` or ``(train_x, train_y, test_x, test_y)``.

    Raises:
        ValueError: If ``type`` is not one of 0-4 (previously this ended
            in an unbound-variable error).

    Other changes: file names without a dot in the ORL folders are
    skipped instead of raising ``IndexError``, images are concatenated in
    one call rather than one by one, and the test-set size is forced to
    an ``int`` so slicing also works where ``round`` returns a float.
    """
    import os

    kind = int(type)
    if kind == 0:
        root = 'C:\\Users\\hasee\\Desktop\\yale\\%d\\s%d.bmp'
        x = _stack_images([root % (j + 1, i + 1)
                           for j in range(15) for i in range(11)])
        y = _block_labels(15, 11)
    elif kind == 1:
        folder = 'C:\\Users\\hasee\\Desktop\\AR\\'
        x = _stack_images([folder + entry for entry in os.listdir(folder)])
        y = _block_labels(120, 14)
    elif kind == 2:
        folder = 'C:\\Users\\hasee\\Desktop\\ORL\\'
        image_files = []
        for entry in os.listdir(folder):
            subject_dir = folder + entry
            for fname in os.listdir(subject_dir):
                parts = fname.split(".")
                # Same test as before (second dot-separated field is
                # 'bmp'), but safe for names without any dot.
                if len(parts) > 1 and parts[1] == 'bmp':
                    image_files.append(subject_dir + "\\" + fname)
        x = _stack_images(image_files)
        y = _block_labels(40, 10)
    elif kind == 3:
        from sklearn.datasets import fetch_olivetti_faces
        data = fetch_olivetti_faces()
        x = data.images.reshape((len(data.images), -1))
        y = data.target
    elif kind == 4:
        root = 'C:\\Users\\hasee\\Desktop\\crop2\\%d.jpg'
        x = _stack_images([root % i for i in range(240)])
        y = _block_labels(40, 6)
    else:
        raise ValueError("unknown dataset type: %r" % (type,))

    # One random permutation drives both the shuffle and the split.
    index = np.array(random.sample(range(len(y)), len(y)))
    if int(train_kfold) <= 0:
        return x[index], y[index]

    n_test = int(round(len(y) / int(train_kfold)))
    test_index = index[0:n_test]
    train_index = index[n_test::]
    train_x = x[train_index]
    train_y = y[train_index]
    test_x = x[test_index]
    test_y = y[test_index]
    return train_x, train_y, test_x, test_y
if __name__ == "__main__":

    # Overview:
    #   Olivetti dataset
    #   Split into test and training
    #   extract keypoints and compute sift features on training images
    #   cluster sift features into a visual dictionary of size V
    #   represent each image as visual words histogram
    #   apply tf-idf (need text data)
    #   fit LDA topic model on bags of visual words
    #   given test data transform test image into tf_idf vector
    #   use cosine similarity for image retrieval
    #   display top-K images

    # Load the faces datasets
    data = fetch_olivetti_faces(shuffle=True, random_state=0)
    targets = data.target

    # One flattened row per image; labels below 30 train, the rest test.
    data = data.images.reshape((len(data.images), -1))
    data_train = data[targets < 30]
    data_test = data[targets >= 30]
    num_train_images = data_train.shape[0]

    # show mean training image
    mean_train_image = np.mean(data_train, axis=0).reshape(64, 64)
    plt.figure()
    plt.imshow(mean_train_image)
    plt.title('Olivetti Dataset (Mean Training Image)')
    plt.show()

    # show random selection of images
    rnd_idx = np.arange(num_train_images)
def train_images2(self):
    """Fetch the Olivetti faces and add the first 41 samples to ``self.all_data``.

    The fetched dataset is kept on ``self.x``.
    """
    self.x = datasets.fetch_olivetti_faces()
    for sample_idx in range(41):
        features = self.x.data[sample_idx]
        label = self.x.target[sample_idx]
        self.all_data.addSample(features, label)
from sklearn.utils import as_float_array
from sklearn import linear_model
import matplotlib.pyplot as plt
import numpy as np
from SPCA import SPCA
from SLR import SLR

#########################################################################################################

if __name__ == "__main__":

    dataset = fetch_olivetti_faces()
    data = dataset.data
    labels = dataset.target

    # Single-argument print(...) calls run on Python 2 and 3 alike and
    # emit the same text as the former print statements.
    print('dataset data dimensions :  %s' % (data.shape,))
    print('dataset labels dimensions :  %s' % (labels.shape,))

    # TODO print eigenfaces normally!!!!
    #n_samples, h, w = dataset.images.shape
    #faces_images = dataset.images
    #print_faces(faces_images, labels, 20)

    # split dataset for training and evaluation
    test_percent = 0.3
    features_train, features_test, labels_train, labels_test = train_test_split(
        data, labels, test_size=test_percent)
#
# Example visualisation of the Olivetti dataset
#
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces

olivetti = fetch_olivetti_faces(
    data_home="./data",
    shuffle=False,
    random_state=0,
    download_if_missing=True,
)

# Visualise the first image
first_image = olivetti.data[0].reshape(64,64)
plt.imshow(first_image, cmap='gray')
plt.show()
def _is_olivetti_faces_not_available():
    """Return True when the Olivetti faces cannot be loaded without downloading.

    The dataset is requested with ``download_if_missing=False``; an
    ``IOError`` from that call is reported as "not available".
    """
    try:
        datasets.fetch_olivetti_faces(download_if_missing=False)
    except IOError:
        return True
    return False
import tensorflow as tf from sklearn.datasets import fetch_olivetti_faces # Set random seed for reproducibility np.random.seed(1000) nb_epochs = 600 batch_size = 50 code_length = 256 width = 32 height = 32 if __name__ == '__main__': # Load the dataset faces = fetch_olivetti_faces(shuffle=True, random_state=1000) X_train = faces['images'] # Create graph graph = tf.Graph() with graph.as_default(): input_images_xl = tf.placeholder(tf.float32, shape=(None, X_train.shape[1], X_train.shape[2], 1)) input_noisy_images_xl = tf.placeholder(tf.float32, shape=(None, X_train.shape[1], X_train.shape[2], 1)) input_images = tf.image.resize_images( input_images_xl, (width, height),
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer the
# model_selection module and fall back only on very old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn import svm
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

olivetti = datasets.fetch_olivetti_faces()
X, y = olivetti.data, olivetti.target

# Bare expressions below only display a value in an interactive session.
X.shape
y.shape

# Look at two pairs of neighbouring samples and their labels.
plt.imshow(X[0].reshape(64, 64), cmap=plt.cm.gray_r)
plt.imshow(X[1].reshape(64, 64), cmap=plt.cm.gray_r)
y[0:2]

plt.imshow(X[200].reshape(64, 64), cmap=plt.cm.gray_r)
plt.imshow(X[201].reshape(64, 64), cmap=plt.cm.gray_r)
y[200:202]

# Try SVM
clf = svm.SVC()
clf.fit(X,y)
cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean()

# Try Logistic Regression
logistic = LogisticRegression()
cross_val_score(logistic, X, y, cv=5, scoring='accuracy').mean()
def get_olivetti_data():
    """Return the Olivetti face images and their targets.

    The dataset is cached under the relative directory ``'olivetti'``.
    ``data_home`` is passed by keyword because recent scikit-learn
    releases reject positional arguments to ``fetch_olivetti_faces``.

    Returns:
        ``(images, target)`` as provided by the fetched dataset.
    """
    olivetti_path = 'olivetti'
    face_data = fetch_olivetti_faces(data_home=olivetti_path)
    return face_data.images, face_data.target
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVR
from sklearn.datasets import fetch_olivetti_faces
from sklearn.utils.validation import check_random_state

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

# Load the faces datasets
data = fetch_olivetti_faces()
targets = data.target

# split the data: one flattened row per image
n_images = len(data.images)
data = data.images.reshape((n_images, -1))
train = data[targets < 30]
test = data[targets >= 30]  # Test on independent people

# Test on a subset of people
n_faces = 5
rng = check_random_state(4)
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]

n_pixels = data.shape[1]
# Upper half of the faces
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets, cluster
from sklearn.feature_extraction.image import grid_to_graph

faces = datasets.fetch_olivetti_faces(shuffle=True)
print(faces.keys())

images = faces.images
target = faces.target

# One flattened row per image.
X = np.reshape(images, (len(images), -1))

# Pixel-grid connectivity for the feature agglomeration.
connectivity = grid_to_graph(*images[0].shape)
agglo = cluster.FeatureAgglomeration(connectivity=connectivity,
                                     n_clusters=32)
agglo.fit(X)

# Reduce, then map back to the original feature space.
X_reduced = agglo.transform(X)
X_restored = agglo.inverse_transform(X_reduced)
images_restored = np.reshape(X_restored, images.shape)

plt.figure(1, figsize=(4, 3.5))
plt.clf()
plt.subplots_adjust(left=0.01, right=0.99, bottom=0.01, top=0.91)

# First row of the grid: the first four original images.
for i in range(4):
    original = images[i]
    plt.subplot(3, 4, i + 1)
    plt.imshow(original, cmap=plt.cm.gray, interpolation='nearest')
partial-fit. This is because the number of patches that they represent has become too low, and it is better to choose a random new cluster. """ print(__doc__) import time import pylab as pl import numpy as np from sklearn import datasets from sklearn.cluster import MiniBatchKMeans from sklearn.feature_extraction.image import extract_patches_2d faces = datasets.fetch_olivetti_faces() ############################################################################### # Learn the dictionary of images print('Learning the dictionary... ') rng = np.random.RandomState(0) kmeans = MiniBatchKMeans(n_clusters=81, random_state=rng, verbose=True) patch_size = (20, 20) buffer = [] index = 1 t0 = time.time() # The online learning part: cycle over the whole dataset 4 times index = 0
from openpyxl.workbook import Workbook from mpl_toolkits.mplot3d import axes3d import sklearn.datasets as dt from sklearn.neighbors import KernelDensity from sklearn.model_selection import GridSearchCV seed = 11 rand_state = 11 # Define the color maps for plots color_map = plt.cm.get_cmap('RdYlBu') color_map_discrete = matplotlib.colors.LinearSegmentedColormap.from_list( "", ["red", "cyan", "magenta", "blue"]) # Fetch the dataset and store in X faces = dt.fetch_olivetti_faces() X = faces.data # Fit a kernel density model using GridSearchCV to determine the best parameter for bandwidth bandwidth_params = {'bandwidth': np.arange(0.01, 1, 0.05)} grid_search = GridSearchCV(KernelDensity(), bandwidth_params) grid_search.fit(X) kde = grid_search.best_estimator_ # Generate/sample 8 new faces from this dataset new_faces = kde.sample(8, random_state=rand_state) # Show a sample of 8 original face images and 8 generated faces derived from the faces dataset fig, ax = plt.subplots(nrows=2, ncols=8, figsize=(18, 6),
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.decomposition import PCA

from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

faces_all = fetch_olivetti_faces()
K = 20  # face photos of the person with label 20

pca3 = PCA(n_components=2)  # analyse 2 principal components
X3 = faces_all.data[faces_all.target==K]
W3 = pca3.fit_transform(X3)  # reduce X3 based on the analysis above
X32 = pca3.inverse_transform(W3)  # map back to the original dimension (effectively emphasising the principal components)

face_mean = pca3.mean_.reshape(64, 64)  # mean face image
face_p1 = pca3.components_[0].reshape(64, 64)
face_p2 = pca3.components_[1].reshape(64, 64)

N = 2  # 2 rows
M = 5  # 5 columns
fig = plt.figure(figsize=(10,5))
# top margin 1, bottom margin 0; hspace=0, wspace=0.05 is the spacing between pictures
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)
for i in range(N):
    for j in range(M):
        k = i *M +j
        ax = fig.add_subplot(N,M,k+1)
        if k < 5:
            w = 1.5*(k-5)
        else:
            w = 1.5*(k-4)
        # various combinations reflecting the principal components
        # adding or subtracting the face_p1 component changes the direction of the face
def getImgAsMat(index):
    """Return the Olivetti face image selected by ``index`` as a ``np.mat``.

    The dataset is fetched anew on every call through
    ``datasets.fetch_olivetti_faces()``; ``index`` is applied to its
    ``images`` array and the result is wrapped with ``np.mat``.
    """
    olivetti = datasets.fetch_olivetti_faces()
    selected = olivetti.images[index]
    return np.mat(selected)
def load_faces():
    """Print a two-line banner, then return the shuffled Olivetti face data.

    Returns the ``data`` attribute of ``fetch_olivetti_faces(shuffle=True)``.
    """
    for banner_line in ("Loading Olivetti face dataset",
                        "-----------------------------"):
        print(banner_line)

    # scikit-learn is imported at call time, after the banner is printed.
    from sklearn.datasets import fetch_olivetti_faces

    olivetti = fetch_olivetti_faces(shuffle=True)
    return olivetti.data
else: second_component_features.append(i + 1) max_correlations.append((i, max_corr[0], max_corr[1][0])) plot_iris(np.array(components_centered).transpose(), target, target_names) write_answer_3(first_component_features, second_component_features) # 4 def write_answer_4(list_pc): with open("pca_answer4.txt", "a") as fout: fout.write(" ".join([str(num) for num in list_pc])) data = fetch_olivetti_faces(shuffle=True, random_state=0).data image_shape = (64, 64) d = 10 model = RandomizedPCA(n_components=d) model.fit(data) faces_transformed = model.transform(data) def center_features(matrix): matrix_t = matrix.transpose() means = [np.mean(col) for col in matrix_t] matrix_t_centered = [[item - col[1] for item in col[0]] for col in zip(matrix_t, means)] return np.array(matrix_t_centered).transpose()
def get_data():
    """Fetch the Olivetti faces and return the pair ``(data, target)``."""
    bunch = datasets.fetch_olivetti_faces()
    # Alternative dataset that was tried here before: datasets.load_iris()
    return bunch.data, bunch.target
vmin=-vmax, vmax=vmax) plt.xticks(()) plt.yticks(()) plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.) # # Display progress logs on stdout logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') n_components = 10 image_shape = (64, 64) rng = RandomState(0) # # ############################################################################### # Load faces data dataset = fetch_olivetti_faces(shuffle=True, random_state=rng) faces = dataset.data n_samples, n_features = faces.shape # global centering faces_centered = faces - faces.mean(axis=0) # local centering faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1) print("Dataset consists of %d faces" % n_samples) print("Preprocess the face data such that max norm of each image is less than 1") X = faces_centered
def face_completion_Test01():
    """Complete the lower half of Olivetti faces with several regressors.

    Fetches the Olivetti faces, flattens each image to a row vector and
    trains every estimator on the subjects whose target id is below 30 to
    predict the second half of the pixel vector from the first half. Five
    randomly drawn faces of the remaining subjects are then completed and
    shown next to the true faces in a single matplotlib figure.

    Progress messages are printed to stdout; nothing is returned.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import fetch_olivetti_faces
    from sklearn.utils.validation import check_random_state
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import RidgeCV

    # Load the faces dataset.
    bunch = fetch_olivetti_faces()
    targets = bunch.target

    # Flatten every image into one row vector. The original author noted
    # this is equivalent to the dataset's ``data`` attribute (400 x 4096).
    data = bunch.images.reshape((len(bunch.images), -1))

    train = data[targets < 30]
    test = data[targets >= 30]  # faces of subjects not used for training

    n_faces = 5
    rng = check_random_state(4)
    # Draw n_faces random row indices from the held-out faces.
    face_ids = rng.randint(test.shape[0], size=(n_faces, ))
    test = test[face_ids, :]

    n_pixels = data.shape[1]
    # np.ceil / np.floor return floats, which NumPy does not accept as slice
    # bounds, so ceil(n/2) and floor(n/2) are computed with integer division.
    upper_stop = (n_pixels + 1) // 2
    lower_start = n_pixels // 2
    X_train = train[:, :upper_stop]    # upper half of the faces
    Y_train = train[:, lower_start:]   # lower half of the faces
    X_test = test[:, :upper_stop]
    Y_test = test[:, lower_start:]

    # Completion predicts pixel values, so these are (multi-output)
    # regressors rather than classifiers.
    ESTIMATORS = {
        "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32,
                                           random_state=0),
        "k-nn": KNeighborsRegressor(),
        "Linear regression": LinearRegression(),
        "Ridge": RidgeCV(),
    }

    # Fit each estimator and predict the lower halves of the test faces.
    print("start fiting and predicting")
    y_test_predict = dict()
    for name, estimator in ESTIMATORS.items():
        estimator.fit(X_train, Y_train)
        y_test_predict[name] = estimator.predict(X_test)

    print("start plotting")
    image_shape = (64, 64)
    n_cols = 1 + len(ESTIMATORS)
    plt.figure(figsize=(2.0 * n_cols, 2.26 * n_faces))
    plt.suptitle("Face completion with multi-output estimators GoGoGo",
                 size=16)

    def show_face(position, pixels, title):
        # Column titles are only drawn on the first row (title is None
        # for every other row).
        title_kwargs = {} if title is None else {"title": title}
        axes = plt.subplot(n_faces, n_cols, position, **title_kwargs)
        axes.axis("off")
        axes.imshow(pixels.reshape(image_shape), cmap=plt.cm.gray,
                    interpolation="nearest")

    for i in range(n_faces):
        first_row = i == 0

        # Column 1: the true face, reassembled from both halves.
        true_face = np.hstack((X_test[i], Y_test[i]))
        show_face(i * n_cols + 1, true_face,
                  "true faces" if first_row else None)

        # Remaining columns: one completed face per estimator, sorted by name.
        for j, est in enumerate(sorted(ESTIMATORS)):
            completed_face = np.hstack((X_test[i], y_test_predict[est][i]))
            show_face(i * n_cols + 2 + j, completed_face,
                      est if first_row else None)

    plt.show()
# -*- coding: utf-8 -*-
#[email protected]
"""
对原始图片进行可视化
"""
# The module docstring above (Chinese for "visualize the original images") is
# echoed to stdout by the next statement, so it is runtime output and is
# kept exactly as written.
print(__doc__)

from time import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_olivetti_faces
from sklearn import decomposition as dcp

plot_grid = (3, 3)
image_shape = (64, 64)

# Fetch (or load from the local "data" directory) the shuffled faces.
dataset = fetch_olivetti_faces(data_home="data", shuffle=True)
faces = dataset.data
n_samples, n_features = faces.shape

# Show nine consecutive faces, starting at row 66, on a 3 x 3 grid.
plt.figure(1)
for itr in range(9):
    plt.subplot(3, 3, itr + 1)
    face_pixels = faces[itr + 66]
    plt.imshow(np.reshape(face_pixels, image_shape))
plt.show()
import pylab
import pickle
import numpy
import theano
import theano.tensor as T
from theano.tensor.signal import downsample
from theano.tensor.nnet import conv
from logistic_sgd import LogisticRegression, load_data
from mlp import HiddenLayer
from convolutional_mlp import LeNetConvPoolLayer
from sklearn import datasets

# Load the saved model (four layer objects pickled together).
# NOTE: unpickling executes arbitrary code; only load a weight.pkl that was
# produced by a trusted training run.
# The context manager closes the file handle, which the former
# ``pickle.load(open(...))`` call left open.
with open('weight.pkl', 'rb') as weight_file:
    layer0, layer1, layer2, layer3 = pickle.load(weight_file)

# Take the first face of the shuffled dataset and reshape it into the 4-D
# array expected by the convolution below (two leading singleton axes).
face = datasets.fetch_olivetti_faces(shuffle=True)
x = face.data[0, :]
x = x.reshape(1, 1, 64, 64)

# Rebuild the first layer symbolically: convolution with the stored filters,
# 2x2 max pooling, then tanh of the pooled output plus the stored bias.
# NOTE(review): the name ``input`` shadows the builtin; it is kept because
# code past this excerpt may still refer to it.
input = T.tensor4(name='input')
conv_out = conv.conv2d(input, filters=layer0.params[0])
pooled_out = downsample.max_pool_2d(
    input=conv_out,
    ds=(2, 2),
    ignore_border=True
)
output = T.tanh(pooled_out + layer0.params[1].dimshuffle('x', 0, 'x', 'x'))

# Compile the graph and run it on the selected face.
f = theano.function([input], output)
filtered_img = f(x)

pylab.gray()
pylab.subplot(1, 3, 1)
def dimension_comprasion():
    """Plot decision-tree test accuracy against the number of PCA features.

    Every Olivetti face is labelled 1 if its index is listed in
    ``olivetti_glasses.txt`` and 0 otherwise. The data is split 80/20 with
    stratification, both parts are projected onto the first ``n`` columns of
    the matrix ``V`` returned by ``load_pca_or_generate`` and a
    ``DecisionTree`` is fitted for each ``n`` in 5, 10, ..., 70.

    Fit time, tree diagnostics and scores are printed for every ``n``; the
    test scores are plotted and saved to ``docs/dimensions_test.eps``.
    Nothing is returned.
    """
    np.set_printoptions(threshold=np.inf, precision=1)
    olivetti = datasets.fetch_olivetti_faces()
    glasses = np.genfromtxt('olivetti_glasses.txt', delimiter=',').astype(int)

    # Build the label vector: does the photo show a person wearing glasses?
    y = np.zeros(olivetti.data.shape[0]).astype(int)
    y[glasses] = 1

    X_train, X_test, y_train, y_test = train_test_split(
        olivetti.data, y, test_size=0.2, stratify=y, random_state=0)

    # Only the projection matrix is needed; the first return value is unused.
    _, V = load_pca_or_generate(X_train)

    ##
    # Classification experiments
    ##
    dimensions = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70]
    test_accuracies = []

    for n in dimensions:
        projection = V[:, :n]
        X_train_pca = X_train.dot(projection)
        X_test_pca = X_test.dot(projection)

        dt = DecisionTree(impurity="impurity_entropy")
        t1 = time.time()
        dt.fit(X_train_pca, y_train)
        t2 = time.time()
        print("Time: ", t2 - t1)
        print(dt.tree_)
        print(dt.tree_.shape)
        print(np.sum(dt.tree_[:, DecisionTree.COL_CHILD_LEFT] == 0.0))

        predictions = dt.predict(X_test_pca[:10, :])
        print(predictions)

        print("Dimension:", n)
        print("Wynik klasyfikacji dla zbioru uczącego:",
              dt.score(X_train_pca, y_train))

        # Score the test set once and reuse the value for list and report.
        test_score = dt.score(X_test_pca, y_test)
        test_accuracies.append(test_score)
        print("Wynik klasyfikacji dla zbioru testowego:", test_score)
        print("Wynik klasyfikacji dla zbioru testowego (custom):",
              np.sum(y_test == dt.predict(X_test_pca)) / y_test.size)

    plt.figure()
    plt.plot(dimensions, test_accuracies)
    plt.title("Dokładność testowa dla liczby użytych cech")
    plt.xlabel("Lizba użytych cech")
    plt.ylabel("Dokładność testowy")
    plt.savefig("docs/dimensions_test.eps")
from sparse_filtering import SparseFiltering

from sklearn.feature_extraction.image import extract_patches_2d
from sklearn.datasets import fetch_olivetti_faces

# Side length of the square patches that features are learned for.
patch_width = 16
# Number of random patches extracted from each image.
n_patches = 25
# Number of features to learn.
n_features = 64
# Upper bound on evaluations of the objective function.
maxfun = 200
# L-BFGS prints information every `iprint` function evaluations;
# -1 disables the output.
iprint = 10

###############################################################################
# Load faces data, normalize faces, and convert to 2d structures
dataset = fetch_olivetti_faces(shuffle=True)
faces = dataset.data
n_samples, _ = faces.shape

# Global centering: subtract the per-pixel mean over all faces.
faces_centered = faces - faces.mean(axis=0)
# Local centering: subtract each face's own mean, kept as a column so it
# broadcasts across the pixels of that face.
faces_centered -= faces_centered.mean(axis=1, keepdims=True)
# Back to 64*64 pixel images.
faces_centered = faces_centered.reshape(n_samples, 64, 64)

print("Dataset consists of %d faces" % n_samples)

###############################################################################
def loadData():
    """Fetch the Olivetti faces; return the dataset object and its targets.

    The first element of the returned pair is the object returned by
    ``fetch_olivetti_faces`` itself (not only its pixel array); the second
    is that object's ``target`` attribute.
    """
    olivetti = fetch_olivetti_faces()
    return olivetti, olivetti.target
def test2(self):
    """Fit FastICA on the centered Olivetti faces and plot its components.

    Loads the shuffled faces with a fixed random state, centers them
    globally (per pixel) and locally (per face), shows the first centered
    faces through ``self.plotGallery``, then fits every entry of the
    estimator list, prints its training time and plots the extracted
    components. Ends with ``plt.show()``; nothing is returned.
    """
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    n_row, n_col = 2, 3
    n_components = n_row * n_col
    rng = RandomState(0)

    ###########################################################################
    # Load faces data
    dataset = fetch_olivetti_faces(shuffle=True, random_state=rng)
    faces = dataset.data
    n_samples, n_features = faces.shape

    # global centering
    faces_centered = faces - faces.mean(axis=0)
    # This was a Python 2 print statement whose %d was never interpolated;
    # it is now a print() call that fills in the rank and the shape.
    print('faces_centered has %d dimensions: %s'
          % (faces_centered.ndim, faces_centered.shape))

    # local centering
    faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1)

    print("Dataset consists of %d faces" % n_samples)
    print("each face has %d features" % n_features)

    # List of (display name, estimator, whether to fit on the centered data).
    estimators = [
        ('Independent components - FastICA',
         decomposition.FastICA(n_components=n_components, whiten=True),
         True),
    ]

    ###########################################################################
    # Plot a sample of the input data
    self.plotGallery("First centered Olivetti faces",
                     faces_centered[:n_components])

    ###########################################################################
    # Do the estimation and plot it
    for name, estimator, center in estimators:
        print("Extracting the top %d %s..." % (n_components, name))
        t0 = time()
        data = faces_centered if center else faces
        estimator.fit(data)
        train_time = (time() - t0)
        print("done in %0.3fs" % train_time)

        # Clustering estimators expose centers instead of components.
        if hasattr(estimator, 'cluster_centers_'):
            components_ = estimator.cluster_centers_
        else:
            components_ = estimator.components_

        if hasattr(estimator, 'noise_variance_'):
            self.plotGallery("Pixelwise variance",
                             estimator.noise_variance_.reshape(1, -1),
                             n_col=1, n_row=1)
        self.plotGallery('%s - Train time %.1fs' % (name, train_time),
                         components_[:n_components])

    plt.show()
from sklearn.datasets import fetch_olivetti_faces from sklearn.cluster import MiniBatchKMeans from sklearn import decomposition # Display progress logs on stdout logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') n_row, n_col = 2, 3 n_components = n_row * n_col image_shape = (64, 64) rng = RandomState(0) ############################################################################### # Load faces data dataset = fetch_olivetti_faces(shuffle=True, random_state=rng) faces = dataset.data n_samples, n_features = faces.shape # global centering faces_centered = faces - faces.mean(axis=0) # local centering faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1) print("Dataset consists of %d faces" % n_samples) ############################################################################### def plot_gallery(title, images, n_col=n_col, n_row=n_row):
def sample_size_comparison():
    """Compare train/test error of ``DecisionTree`` over ``min_node_examples``.

    Every Olivetti face is labelled 1 if its index is listed in
    ``olivetti_glasses.txt`` and 0 otherwise. The data is split 80/20 with
    stratification and projected onto the first 50 columns of the matrix
    ``V`` returned by ``load_pca_or_generate``. A tree is then fitted for
    each ``min_node_examples`` value in 0.10, 0.09, ..., 0.01.

    Fit times and errors are printed, and both error curves are saved to
    ``docs/min_node_vals_test.eps``. Nothing is returned.
    """
    np.set_printoptions(threshold=np.inf, precision=1)
    olivetti = datasets.fetch_olivetti_faces()
    glasses = np.genfromtxt('olivetti_glasses.txt', delimiter=',').astype(int)

    # Build the label vector: does the photo show a person wearing glasses?
    y = np.zeros(olivetti.data.shape[0]).astype(int)
    y[glasses] = 1

    X_train, X_test, y_train, y_test = train_test_split(
        olivetti.data, y, test_size=0.2, stratify=y, random_state=0)

    # Only the projection matrix is needed; the first return value is unused.
    _, V = load_pca_or_generate(X_train)

    n = 50
    projection = V[:, :n]
    X_train_pca = X_train.dot(projection)
    X_test_pca = X_test.dot(projection)

    min_node_vals = np.arange(0.10, 0, -0.01)
    errors_train = np.zeros(min_node_vals.size)
    errors_test = np.zeros(min_node_vals.size)
    for i, min_node_examples in enumerate(min_node_vals):
        dt = DecisionTree(impurity="impurity_entropy",
                          min_node_examples=min_node_examples)
        t1 = time.time()
        dt.fit(X_train_pca, y_train)
        t2 = time.time()
        print("time:", t2 - t1)
        print('min node examples: ', min_node_examples)
        errors_train[i] = 1 - dt.score(X_train_pca, y_train)
        errors_test[i] = 1 - dt.score(X_test_pca, y_test)

    np.set_printoptions(threshold=np.inf, precision=5)
    best_index = np.argmin(errors_test)
    # Report the winning parameter value itself. The old message printed the
    # array index under the label "BEST DEPTH", although no depth is swept.
    print('BEST MIN NODE EXAMPLES:', str(min_node_vals[best_index]),
          " WITH TEST ACCURACY:", 1 - errors_test[best_index])
    print('ERRORS TEST: ', errors_test)
    print('ERRORS TRAIN: ', errors_train)

    plt.figure()
    plt.plot(min_node_vals, errors_train, marker='o', label="train errors")
    plt.plot(min_node_vals, errors_test, marker='o', label="test errors")
    # Reversed x axis: largest min_node_examples on the left.
    plt.xlim(np.max(min_node_vals), np.min(min_node_vals))
    plt.legend()
    plt.title("Procentowa zawartość przykładów w węźle")
    plt.savefig("docs/min_node_vals_test.eps")
#(4) Report accuracy as percent correct
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.decomposition import PCA
from sklearn import preprocessing
from pybrain.datasets import SupervisedDataSet
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure.modules import SoftmaxLayer

# # # # LOAD DATA # # # #
# Fetch the dataset and pull out the flattened images and their labels.
oFace = datasets.fetch_olivetti_faces()
X = oFace.data
y = oFace.target
# Number of distinct labels and total number of observations.
nFaces = len(np.unique(y))
nObs = len(y)

# # # # SET PARAMETERS # # # #
# Fraction of the data used for training; the remainder is for testing.
perTrain = 0.85
perTest = 1-perTrain
# Fraction of variance to retain in the PCA dimensionality reduction.
dimRed = 0.99

#=====================DATA PREPROCESSING=======================================
# # # Perform PCA to reduce dimensionality of data (in space) # # #
pcInit = PCA()
pcaOne = pcInit.fit(X)
# Fraction of variance explained by each component.
eigNorm = pcaOne.explained_variance_ratio_

# Positions whose cumulative explained variance is still below dimRed; the
# last of them is kept (IndexError if there is none, as before).
# NOTE(review): nComps is therefore a 0-based index, not a component count -
# confirm this is what the code using nComps expects.
nComps = np.flatnonzero(np.cumsum(eigNorm) < dimRed)[-1:]
nComps = int(nComps[0])

print( "Reduced Dims from %d to %d" % (X.shape[1],nComps) )
def load_faces():
    """Return ``(NATURAL, X)`` where ``X`` is the Olivetti faces dataset.

    ``X.data`` is converted to float64 before the dataset object is
    returned together with the module-level ``NATURAL`` marker.
    """
    X = datasets.fetch_olivetti_faces()
    # Convert the values with astype(). Assigning to ``.dtype`` (as the code
    # did before) only relabels the existing bytes, so pairs of 32-bit floats
    # were read as one 64-bit float: garbage values and half the columns.
    X.data = X.data.astype('float64')
    return (NATURAL, X)
def pruning_comparison_exhaustive():
    """Compare train/test error of exhaustive-subtree pruning over penalties.

    Every Olivetti face is labelled 1 if its index is listed in
    ``olivetti_glasses.txt`` and 0 otherwise. The data is split 80/20 with
    stratification and projected onto the first 50 columns of the matrix
    ``V`` returned by ``load_pca_or_generate``. A ``DecisionTree`` with
    ``pruning='exhaustive_subtrees'`` is then fitted for each penalty in
    ``np.arange(0.015, 0.0, -0.0025)``.

    Fit times and errors are printed, and both error curves are saved to
    ``docs/pruning_exhaustive.eps``. Nothing is returned.
    """
    np.set_printoptions(threshold=np.inf, precision=1)
    olivetti = datasets.fetch_olivetti_faces()
    glasses = np.genfromtxt('olivetti_glasses.txt', delimiter=',').astype(int)

    # Build the label vector: does the photo show a person wearing glasses?
    y = np.zeros(olivetti.data.shape[0]).astype(int)
    y[glasses] = 1

    X_train, X_test, y_train, y_test = train_test_split(
        olivetti.data, y, test_size=0.2, stratify=y, random_state=0)

    # Only the projection matrix is needed; the first return value is unused.
    _, V = load_pca_or_generate(X_train)

    n = 50
    projection = V[:, :n]
    X_train_pca = X_train.dot(projection)
    X_test_pca = X_test.dot(projection)

    # The unpruned tree that used to be fitted here was never used, so that
    # (costly) fit has been dropped.
    pentalties = np.arange(0.015, 0.0, -0.0025)
    errors_train = np.zeros(pentalties.size)
    errors_test = np.zeros(pentalties.size)
    for i, penalty in enumerate(pentalties):
        print('penalty', penalty)
        dt = DecisionTree(impurity="impurity_entropy",
                          pruning='exhaustive_subtrees', penalty=penalty)
        t1 = time.time()
        dt.fit(X_train_pca, y_train)
        t2 = time.time()
        print('time:', t2 - t1)
        errors_train[i] = 1 - dt.score(X_train_pca, y_train)
        errors_test[i] = 1 - dt.score(X_test_pca, y_test)

    np.set_printoptions(threshold=np.inf, precision=5)
    best_penalty_index = np.argmin(errors_test)
    print('BEST PENALTY:', str(pentalties[best_penalty_index]),
          " WITH TEST ACCURACY:", 1 - errors_test[best_penalty_index])
    print('ERRORS TEST: ', errors_test)
    print('ERRORS TRAIN: ', errors_train)

    plt.figure()
    plt.plot(pentalties, errors_train, color='black', marker='o',
             label="train")
    plt.plot(pentalties, errors_test, color='red', marker='o', label="test")
    plt.legend()
    plt.xlabel("penalty")
    # Reversed x axis: largest penalty on the left.
    plt.xlim(np.max(pentalties), np.min(pentalties))
    plt.title("Pruning - exhaustive subtrees")
    plt.savefig("docs/pruning_exhaustive.eps")
def main():
    """Fit the custom ``DecisionTree`` and sklearn's tree on the glasses task.

    Every Olivetti face is labelled 1 if its index is listed in
    ``olivetti_glasses.txt`` and 0 otherwise. The data is split 80/20 with
    stratification and projected onto the first 50 columns of the matrix
    ``V`` returned by ``load_pca_or_generate``. The function then:

    1. fits ``DecisionTree(impurity="impurity_entropy")`` and prints the fit
       time, the tree array, its shape, the number of rows whose left-child
       column equals 0, ten sample predictions and the train/test scores;
    2. fits ``DecisionTreeClassifier`` with the entropy criterion and prints
       its fit time, node count and train/test scores.

    Nothing is returned.
    """
    np.set_printoptions(threshold=np.inf, precision=1)
    olivetti = datasets.fetch_olivetti_faces()
    glasses = np.genfromtxt('olivetti_glasses.txt', delimiter=',').astype(int)

    # Build the label vector: does the photo show a person wearing glasses?
    y = np.zeros(olivetti.data.shape[0]).astype(int)
    y[glasses] = 1

    X_train, X_test, y_train, y_test = train_test_split(
        olivetti.data, y, test_size=0.2, stratify=y, random_state=0)

    # Only the projection matrix is needed; the first return value is unused.
    _, V = load_pca_or_generate(X_train)

    ##
    # Classification experiments
    ##
    n = 50
    projection = V[:, :n]
    X_train_pca = X_train.dot(projection)
    X_test_pca = X_test.dot(projection)

    dt = DecisionTree(impurity="impurity_entropy")
    t1 = time.time()
    dt.fit(X_train_pca, y_train)
    t2 = time.time()
    print("Time: ", t2 - t1)
    print(dt.tree_)
    print(dt.tree_.shape)
    print(np.sum(dt.tree_[:, DecisionTree.COL_CHILD_LEFT] == 0.0))

    predictions = dt.predict(X_test_pca[:10, :])
    print(predictions)

    print("Wynik klasyfikacji dla zbioru uczącego:",
          dt.score(X_train_pca, y_train))
    print("Wynik klasyfikacji dla zbioru testowego:",
          dt.score(X_test_pca, y_test))
    print("Wynik klasyfikacji dla zbioru testowego (custom):",
          np.sum(y_test == dt.predict(X_test_pca)) / y_test.size)

    # NOTE: the commented-out experiments that used to sit here (tree depth
    # sweep, min_node_examples sweep, greedy and exhaustive pruning penalty
    # sweeps, SVC baselines) were removed as dead code; recover them from
    # version control if they are needed again.

    #
    # Tree from sklearn, for comparison
    #
    sklearn_tree = DecisionTreeClassifier(min_samples_split=2,
                                          criterion='entropy',
                                          random_state=0)
    t1 = time.time()
    sklearn_tree.fit(X_train_pca, y_train)
    t2 = time.time()
    print("Sklearn time", t2 - t1)
    print("Node count", sklearn_tree.tree_.node_count)
    print("Train, test", sklearn_tree.score(X_train_pca, y_train),
          sklearn_tree.score(X_test_pca, y_test))
The code below also illustrates how the construction and the computation of the predictions can be parallelized within multiple jobs. """ print(__doc__) from time import time import pylab as pl from sklearn.datasets import fetch_olivetti_faces from sklearn.ensemble import ExtraTreesClassifier # Number of cores to use to perform parallel fitting of the forest model n_jobs = 1 # Load the faces dataset data = fetch_olivetti_faces() X = data.images.reshape((len(data.images), -1)) y = data.target mask = y < 5 # Limit to 5 classes X = X[mask] y = y[mask] # Build a forest and compute the pixel importances print("Fitting ExtraTreesClassifier on faces data with %d cores..." % n_jobs) t0 = time() forest = ExtraTreesClassifier(n_estimators=1000, max_features=128, compute_importances=True, n_jobs=n_jobs, random_state=0)
import numpy as np import cv2 from matplotlib import pyplot as plt from matplotlib.pyplot import figure from sklearn.datasets import fetch_olivetti_faces import scipy from scipy import fftpack import time from pprint import pprint from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import accuracy_score from sklearn.model_selection import cross_val_score df = fetch_olivetti_faces() def plot_3(data, num_photo): fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2,3, figsize = (15,6)) ax4.imshow(data[num_photo[0]], cmap=plt.cm.gray) ax5.imshow(data[num_photo[1]], cmap=plt.cm.gray) ax1.imshow(df.images[num_photo[0]], cmap=plt.cm.gray) ax2.imshow(df.images[num_photo[1]], cmap=plt.cm.gray) plt.show() def plot_3_hist(data, num_photo): fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2,3, figsize = (15,6)) ax4.hist(data[num_photo[0]])
from numpy.random import RandomState

import matplotlib.pyplot as plt

from sklearn.datasets import fetch_olivetti_faces
from sklearn import decomposition

n_row, n_col = 2, 3
n_components = n_row * n_col
image_shape = (64, 64)

###############################################################################
# Load faces data
dataset = fetch_olivetti_faces(shuffle=True, random_state=RandomState(0))
faces = dataset.data


###############################################################################
def plot_gallery(title, images, n_col=n_col, n_row=n_row):
    """Draw ``images`` as a grid of gray-scale tiles under a common title.

    Each entry of ``images`` is reshaped to ``image_shape`` and drawn in its
    own subplot of an ``n_row`` x ``n_col`` grid, with a color range that is
    symmetric around zero and without axis ticks.
    """
    fig_size = (2. * n_col, 2.26 * n_row)
    plt.figure(figsize=fig_size)
    plt.suptitle(title, size=16)

    for slot, comp in enumerate(images, start=1):
        plt.subplot(n_row, n_col, slot)
        # Largest magnitude in the tile; used as +/- limit of the color map.
        limit = max(comp.max(), -comp.min())
        tile = comp.reshape(image_shape)
        plt.imshow(tile, cmap=plt.cm.gray, interpolation='nearest',
                   vmin=-limit, vmax=limit)
        plt.xticks(())
        plt.yticks(())

    plt.subplots_adjust(left=0.01, bottom=0.05, right=0.99, top=0.94,
                        wspace=0.04, hspace=0.)
import matplotlib.pyplot as plt import numpy as np import time import logging from sklearn.datasets import fetch_olivetti_faces logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') data_home = 'face_dientity/' logging.info('Start to load dataset') faces = fetch_olivetti_faces(data_home=data_home) logging.info('Done with load dataset') #400*4096有400张图,每张图有4096个特征 X = faces.data #训练数据集 y = faces.target #类别目标索引 targets = np.unique(faces.target) #给图片命名 target_names = np.array(["c%d" % t for t in targets]) #人物个数 n_targets = target_names.shape[0] #总图片 高 宽 n_samples, h, w = faces.images.shape print('Sample count: {}\nTarget count: {}'.format(n_samples, n_targets)) print('Image size: {}x{}\nDataset shape: {}\n'.format(w, h, X.shape)) #images是二维数据,每一行是图片数据 def plot_gallery(images, titles, h, w, n_row=2, n_col=5): """显示图片阵列""" plt.figure(figsize=(2 * n_col, 2.2 * n_row), dpi=144)
def get_faces():
    """Return ``[images, target]`` of the Olivetti faces dataset as a list."""
    olivetti = fetch_olivetti_faces()
    images, labels = olivetti.images, olivetti.target
    return [images, labels]
def plot_multioutput_face_completion():
    """Complete the lower half of Olivetti faces with multi-output regressors.

    Trains each estimator on the faces of subjects with target id below 30
    to predict the second half of the flattened pixel vector from the first
    half, completes five randomly drawn faces of the remaining subjects and
    shows true and completed faces in one matplotlib figure. Nothing is
    returned.
    """
    # Load the faces datasets
    bunch = fetch_olivetti_faces()
    targets = bunch.target

    # One flattened row vector per image.
    data = bunch.images.reshape((len(bunch.images), -1))
    train = data[targets < 30]
    test = data[targets >= 30]  # Test on independent people

    # Test on a subset of people
    n_faces = 5
    rng = check_random_state(4)
    face_ids = rng.randint(test.shape[0], size=(n_faces, ))
    test = test[face_ids, :]

    n_pixels = data.shape[1]
    # np.ceil / np.floor return floats, which NumPy does not accept as slice
    # bounds, so ceil(n/2) and floor(n/2) are computed with integer division.
    upper_stop = (n_pixels + 1) // 2
    lower_start = n_pixels // 2
    X_train = train[:, :upper_stop]   # Upper half of the faces
    y_train = train[:, lower_start:]  # Lower half of the faces
    X_test = test[:, :upper_stop]
    y_test = test[:, lower_start:]

    # Fit estimators
    ESTIMATORS = {
        "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32,
                                           random_state=0),
        "K-nn": KNeighborsRegressor(),
        "Linear regression": LinearRegression(),
        "Ridge": RidgeCV(),
    }

    y_test_predict = dict()
    for name, estimator in ESTIMATORS.items():
        estimator.fit(X_train, y_train)
        y_test_predict[name] = estimator.predict(X_test)

    # Plot the completed faces
    image_shape = (64, 64)

    n_cols = 1 + len(ESTIMATORS)
    plt.figure(figsize=(2. * n_cols, 2.26 * n_faces))
    plt.suptitle("Face completion with multi-output estimators", size=16)

    for i in range(n_faces):
        # Column titles are only drawn on the first row of the grid.
        with_titles = i == 0

        true_face = np.hstack((X_test[i], y_test[i]))
        title_kwargs = {"title": "true faces"} if with_titles else {}
        sub = plt.subplot(n_faces, n_cols, i * n_cols + 1, **title_kwargs)
        sub.axis("off")
        sub.imshow(true_face.reshape(image_shape),
                   cmap=plt.cm.gray,
                   interpolation="nearest")

        for j, est in enumerate(sorted(ESTIMATORS)):
            completed_face = np.hstack((X_test[i], y_test_predict[est][i]))
            title_kwargs = {"title": est} if with_titles else {}
            sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j,
                              **title_kwargs)
            sub.axis("off")
            sub.imshow(completed_face.reshape(image_shape),
                       cmap=plt.cm.gray,
                       interpolation="nearest")

    plt.show()