def preprocess_uwash_depth_dataset(attribs):
    pipeline = preprocessing.Pipeline()

    # rgbd  : (num_examples, 72, 72, 4)
    # labels: (num_examples, 1)
    pipeline.items.append(
        hdf5_data_preprocessors.ExtractRawUWashData(
            attribs["raw_data_folder"],
            data_labels=("rgbd_patches", "patch_labels")))

    pipeline.items.append(
        hdf5_data_preprocessors.PerChannelGlobalContrastNormalizePatches(
            data_to_normalize_key='rgbd_patches',
            normalized_data_key='normalized_rgbd_patches',
            batch_size=100))

    # this extracts a valid set and test set
    pipeline.items.append(
        hdf5_data_preprocessors.SplitData(
            data_to_split_key=('rgbd_patches', 'patch_labels'),
            sets=attribs["sets"],
            patch_shape=attribs["patch_shape"],
            num_patches_per_set=attribs["num_patches_per_set"]))

    pipeline.items.append(hdf5_data_preprocessors.MakeC01B())

    # now let's actually make a new dataset and run it through the pipeline
    hdf5_dataset = h5py.File(attribs["output_filepath"])
    pipeline.apply(hdf5_dataset)
def preprocess_nyu_depth_dataset(attribs):
    pipeline = preprocessing.Pipeline()

    # rgbd  : (1449, 640, 480, 4)
    # labels: (1449, 640, 480)
    pipeline.items.append(
        hdf5_data_preprocessors.ExtractRawNYUData(
            attribs["raw_filepath"],
            data_labels=("rgbd", "labels")))

    # add the steps necessary to generate data for
    # valid, test and training datasets
    for i in range(len(attribs["sets"])):
        which_set = attribs["sets"][i]
        num_patches = attribs["num_patches_per_set"][i]

        # labels for the hdf5 file
        patch_label = which_set + "_patches"
        patch_labels = (patch_label, which_set + "_patch_labels")

        pipeline.items.append(
            hdf5_data_preprocessors.ExtractPatches(
                patch_shape=attribs["patch_shape"],
                patch_labels=patch_labels,
                patch_source_labels=("rgbd", "labels"),
                num_patches=num_patches))

    pipeline.items.append(hdf5_data_preprocessors.MakeC01B())

    # now let's actually make a new dataset and run it through the pipeline
    hdf5_dataset = h5py.File(attribs["output_filepath"])
    pipeline.apply(hdf5_dataset)
def test1():
    path_org = '/Tmp/gulcehrc/imagenet_256x256_filtered.h5'
    path = '/Tmp/gulcehrc/imagenetTemp.h5'
    train = Imagenet(which_set='train',
                     path=path,
                     path_org=path_org,
                     size_of_receptive_field=(8, 8),
                     center=True,
                     scale=True,
                     start=0,
                     stop=1000,
                     imageShape=(256, 256),
                     mode='a',
                     axes=('b', 0, 1, 'c'),
                     preprocessor=None)

    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.GlobalContrastNormalizationPyTables())
    pipeline.items.append(
        preprocessing.LeCunLCN((256, 256), channels=[0], kernel_size=7))

    # apply preprocessing to train
    train.apply_preprocessor(pipeline, can_fit=True)
    train.view_shape()

    # testing
    batch_size = 10
    num_batches = 1
    mode = SequentialSubsetIterator
    targets1 = []
    targets2 = []
def main():
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))
    del supplement

    print("Preparing output directory...")
    patch_dir = data_dir + '/stl10_patches_8x8'
    serial.mkdir(patch_dir)
    README = open(patch_dir + '/README', 'w')
    README.write(textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 8x8 approximately whitened, contrast-normalized
    patches drawn uniformly at random from a downsampled (to 32x32)
    version of the STL-10 train and unlabeled datasets.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_stl10_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))
    README.close()

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(8, 8),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)
    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
def randomize_datasets(self, datasets):
    center_shift = np.array(self._window_shape) / 2. - 0.5
    tform_center = skimage.transform.SimilarityTransform(
        translation=-center_shift)
    tform_uncenter = skimage.transform.SimilarityTransform(
        translation=center_shift)

    if self._preprocess is not None:
        pipeline = preprocessing.Pipeline()
        # window the rotations to get rid of the uniform background
        if self._central_window_shape is not None:
            print 'adding window'
            pipeline.items.append(CentralWindow(self._central_window_shape))

        for item in self._preprocess:
            pipeline.items.append(item)

    im_shape = (self._window_shape[0], self._window_shape[1], 1)

    for d_idx, dataset in enumerate(datasets):
        data = self._original[dataset]

        # randomly window data
        print data.shape
        arr = np.empty((data.shape[0], self._window_shape[0],
                        self._window_shape[1], data.shape[3]),
                       dtype=np.float32)

        for idx, example in enumerate(data):
            scale_x = np.random.uniform(1 - self._scale_diff,
                                        1 + self._scale_diff)
            scale_y = np.random.uniform(1 - self._scale_diff,
                                        1 + self._scale_diff)
            translation_x = np.random.uniform(1 - self._translation,
                                              1 + self._translation)
            translation_y = np.random.uniform(1 - self._translation,
                                              1 + self._translation)
            shear = np.random.uniform(0. - self._shear, 0. + self._shear)
            rotation = np.random.uniform(0, 360)

            tform = AffineTransform(scale=(scale_x, scale_y),
                                    rotation=np.deg2rad(rotation),
                                    translation=(translation_x, translation_y),
                                    shear=shear)
            tform = tform_center + tform + tform_uncenter
            img = warp(example, tform, output_shape=self._window_shape)
            arr[idx] = img

        dataset.set_topological_view(arr, axes=dataset.view_converter.axes)

        # assumes self._randomize is in order of [train, valid/test]
        if self._preprocess is not None:
            can_fit = True
            if d_idx == 1:
                can_fit = False
            dataset.apply_preprocessor(preprocessor=pipeline, can_fit=can_fit)
def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')

    print('Loading CIFAR-100 train dataset...')
    data = CIFAR100(which_set='train')

    print("Preparing output directory...")
    patch_dir = data_dir + '/cifar100/cifar100_patches'
    serial.mkdir(patch_dir)
    README = open(patch_dir + '/README', 'w')
    README.write(textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 6x6 approximately whitened, contrast-normalized
    patches drawn uniformly at random from the CIFAR-100 train set.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_cifar100_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))
    README.close()

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(6, 6),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)
    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
def get_pipeline(img_shape, patch_size, batch_size):
    pipeline = preprocessing.Pipeline()
    conf = get_config()
    if conf['preprocessing']['remove_mean']:
        pipeline.items.append(preprocessing.RemoveMean())
    if conf['preprocessing']['gcn']:
        pipeline.items.append(
            preprocessing.GlobalContrastNormalization(batch_size=batch_size))
    if conf['preprocessing']['lcn']:
        # LCN requires an uneven (odd) kernel size
        lcn_patch_size = patch_size + 1 - (patch_size % 2)
        pipeline.items.append(
            preprocessing.LeCunLCN(img_shape, kernel_size=lcn_patch_size))
    return pipeline
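# Usage sketch (illustrative only): one way the pipeline returned by get_pipeline
# above might be applied to a pylearn2 dataset. The dataset choice, shapes, and
# parameter values here are assumptions for the example, not taken from the
# snippet, and get_config() is assumed to be importable from the same module.
def example_apply_pipeline():
    from pylearn2.datasets import cifar10  # hypothetical dataset choice

    trainset = cifar10.CIFAR10(which_set="train")
    testset = cifar10.CIFAR10(which_set="test")

    pipeline = get_pipeline(img_shape=(32, 32), patch_size=8, batch_size=5000)

    # fit any learned statistics on the training set only...
    trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
    # ...and reuse them unchanged on the test set
    testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
    return trainset, testset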
def get_processed_dataset():
    train_path = 'pp_cifar10_train.pkl'
    test_path = 'pp_cifar10_test.pkl'

    if os.path.exists(train_path) and os.path.exists(test_path) and not new_params:
        print 'loading preprocessed data'
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print 'loading raw data...'

        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.ExtractPatchesWithPosition(
                patch_shape=patch_shape,
                patches_per_image=patches_per_image))
        pipeline.items.append(
            preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                      use_std=True))
        pipeline.items.append(
            preprocessing.PCA(num_components=num_components,
                              keep_var_fraction=keep_var_fraction))
        pipeline.items.append(
            preprocessing.ExtractPatchPairs(
                patches_per_image=patches_per_image,
                num_images=train_size,
                input_width=input_width))

        trainset = cifar10.CIFAR10(which_set="train", start=start, stop=stop)
        testset = cifar10.CIFAR10(which_set="test")

        trainset.preprocessor = pipeline
        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)

        # the pkl-ing is having issues, the dataset is maybe too big.
        serial.save(train_path, trainset)
        serial.save(test_path, testset)

        # these paths will be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
def main():
    train = cifar10.CIFAR10(which_set="train", center=True)

    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                                  sqrt_bias=0.0,
                                                  use_std=True))
    pipeline.items.append(preprocessing.PCA(num_components=512))

    test = cifar10.CIFAR10(which_set="test")

    train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
    test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

    serial.save('cifar10_preprocessed_train.pkl', train)
    serial.save('cifar10_preprocessed_test.pkl', test)
def get_dataset_cifar10():
    """
    The original pipeline on cifar10 from pylearn2. Please refer to
    pylearn2/scripts/train_example/make_dataset.py for details.
    """
    train_path = 'cifar10_preprocessed_train.pkl'
    test_path = 'cifar10_preprocessed_test.pkl'

    if os.path.exists(train_path) and \
            os.path.exists(test_path):
        print 'loading preprocessed data'
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print 'loading raw data...'
        trainset = cifar10.CIFAR10(which_set="train")
        testset = cifar10.CIFAR10(which_set="test")

        print 'preprocessing data...'
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.ExtractPatches(patch_shape=(8, 8),
                                         num_patches=150000))
        pipeline.items.append(preprocessing.GlobalContrastNormalization())
        pipeline.items.append(preprocessing.ZCA())

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        trainset.use_design_loc('train_design.npy')

        testset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        testset.use_design_loc('test_design.npy')

        print 'saving preprocessed data...'
        serial.save('cifar10_preprocessed_train.pkl', trainset)
        serial.save('cifar10_preprocessed_test.pkl', testset)

    # this path will be used for visualizing weights after training is done
    trainset.yaml_src = '!pkl: "%s"' % train_path
    testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
def process_data():
    # pre-process unsupervised data
    if not os.path.exists(DATA_DIR + 'preprocess.pkl') \
            or not os.path.exists(DATA_DIR + 'unsup_prep_data.pkl') \
            or not os.path.exists(DATA_DIR + 'sup_prep_data.pkl'):
        unsup_data = black_box_dataset.BlackBoxDataset('extra')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.Standardize(global_mean=False, global_std=False))
        # pipeline.items.append(preprocessing.ZCA(filter_bias=.1))
        unsup_data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        serial.save(DATA_DIR + 'preprocess.pkl', pipeline)

        # serial.save raises pickling errors here, so pickle the dataset directly instead
        # serial.save(DATA_DIR+'unsup_prep_data.pkl', unsup_data)
        out = open(DATA_DIR + 'unsup_prep_data.pkl', 'w')
        pickle.dump(unsup_data, out)
        out.close()

        # process supervised training data
        sup_data = []
        which_data = ['train'] * 3 + ['public_test']
        starts = [0, 800, None, None]
        stops = [800, 1000, None, None]
        fits = [False, False, False, False]
        for curstr, start, stop, fit in zip(which_data, starts, stops, fits):
            sup_data.append(
                black_box_dataset.BlackBoxDataset(which_set=curstr,
                                                  start=start,
                                                  stop=stop,
                                                  preprocessor=pipeline,
                                                  fit_preprocessor=fit))
        serial.save(DATA_DIR + 'sup_prep_data.pkl', sup_data)
    else:
        pipeline = serial.load(DATA_DIR + 'preprocess.pkl')
        # unsup_data = serial.load(DATA_DIR+'unsup_prep_data.pkl')
        unsup_data = pickle.load(open(DATA_DIR + 'unsup_prep_data.pkl', 'r'))
        sup_data = serial.load(DATA_DIR + 'sup_prep_data.pkl')

    return unsup_data, sup_data
def generate_patches():
    datasets = OrderedDict()
    datasets['train'] = GenderWrite.gwdata.GWData(which_set='train', start=1, stop=201)
    datasets['valid'] = GenderWrite.gwdata.GWData(which_set='train', start=201, stop=283)
    datasets['test'] = GenderWrite.gwdata.GWData(which_set='test')
    datasets['tottrain'] = GenderWrite.gwdata.GWData(which_set='train')

    # preprocess patches
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.GlobalContrastNormalization())
    pipeline.items.append(preprocessing.ZCA())

    for dstr, dset in datasets.iteritems():
        print dstr
        # only fit on train data
        trainbool = dstr == 'train' or dstr == 'tottrain'
        dset.apply_preprocessor(preprocessor=pipeline, can_fit=trainbool)
        # save
        dset.use_design_loc(DATA_DIR + dstr + '_design.npy')
        serial.save(DATA_DIR + 'gw_preprocessed_' + dstr + '.pkl', dset)
def load_preprocessor(self, preprocess_array):
    if preprocess_array is None:
        self.preprocessor = None
        return None

    preprocess_list = []
    for preprocess_id in preprocess_array:
        row = self.db.executeSQL(
            """
            SELECT preprocess_class
            FROM hps3.preprocess
            WHERE preprocess_id = %s
            """, (preprocess_id, ), self.db.FETCH_ONE)
        if not row or row is None:
            raise HPSData("No preprocess for preprocess_id=" +
                          str(preprocess_id))
        preprocess_class = row[0]
        fn = getattr(self, 'get_preprocess_' + preprocess_class)
        preprocess_list.append(fn(preprocess_id))

    if len(preprocess_list) > 1:
        preprocessor = pp.Pipeline(preprocess_list)
    else:
        preprocessor = preprocess_list[0]
    self.preprocessor = preprocessor
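# Illustrative sketch of the dispatch convention used by load_preprocessor above:
# each preprocess_class name resolves via getattr to a 'get_preprocess_<class>'
# factory method that builds the actual preprocessor object. The table and column
# names below (hps3.preprocess_standardize, global_mean, global_std) are
# assumptions for the example, not taken from the snippet.
def get_preprocess_standardize(self, preprocess_id):
    # hypothetical per-class parameter table; only the dispatch pattern is the point
    row = self.db.executeSQL(
        """
        SELECT global_mean, global_std
        FROM hps3.preprocess_standardize
        WHERE preprocess_id = %s
        """, (preprocess_id, ), self.db.FETCH_ONE)
    global_mean, global_std = row
    return pp.Standardize(global_mean=global_mean, global_std=global_std)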
def get_dataset(which_data, tot=False):
    train_path = DATA_DIR + 'train' + which_data + '_preprocessed.pkl'
    valid_path = DATA_DIR + 'valid' + which_data + '_preprocessed.pkl'
    tottrain_path = DATA_DIR + 'tottrain' + which_data + '_preprocessed.pkl'
    test_path = DATA_DIR + 'test' + which_data + '_preprocessed.pkl'

    if os.path.exists(train_path) and os.path.exists(valid_path) and os.path.exists(test_path):
        print 'loading preprocessed data'
        trainset = serial.load(train_path)
        validset = serial.load(valid_path)
        if tot:
            tottrainset = serial.load(tottrain_path)
        testset = serial.load(test_path)
    else:
        print 'loading raw data...'
        trainset = Whales(which_set="train", which_data=which_data, start=0, stop=56671)
        validset = Whales(which_set="train", which_data=which_data, start=56671, stop=66671)
        tottrainset = Whales(which_set="train", which_data=which_data)
        testset = Whales(which_set="test", which_data=which_data)

        print 'preprocessing data...'
        pipeline = preprocessing.Pipeline()
        if which_data == 'melspectrum':
            pipeline.items.append(
                preprocessing.Standardize(global_mean=True, global_std=True))
            # ZCA = zero-phase component analysis
            # very similar to PCA, but preserves the look of the original image better
            pipeline.items.append(preprocessing.ZCA())
        else:
            # global_mean/std=False for per-feature standardization
            pipeline.items.append(
                preprocessing.Standardize(global_mean=False, global_std=False))

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # this uses numpy format for storage instead of pickle, for memory reasons
        trainset.use_design_loc(DATA_DIR + 'train_' + which_data + '_design.npy')

        # note the can_fit=False: no sharing between train and test data
        validset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        validset.use_design_loc(DATA_DIR + 'valid_' + which_data + '_design.npy')

        tottrainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        tottrainset.use_design_loc(DATA_DIR + 'tottrain_' + which_data + '_design.npy')

        # note the can_fit=False: no sharing between train and test data
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc(DATA_DIR + 'test_' + which_data + '_design.npy')

        # this path can be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        validset.yaml_src = '!pkl: "%s"' % valid_path
        tottrainset.yaml_src = '!pkl: "%s"' % tottrain_path
        testset.yaml_src = '!pkl: "%s"' % test_path

        print 'saving preprocessed data...'
        serial.save(DATA_DIR + 'train' + which_data + '_preprocessed.pkl', trainset)
        serial.save(DATA_DIR + 'valid' + which_data + '_preprocessed.pkl', validset)
        serial.save(DATA_DIR + 'tottrain' + which_data + '_preprocessed.pkl', tottrainset)
        serial.save(DATA_DIR + 'test' + which_data + '_preprocessed.pkl', testset)

    if tot:
        return tottrainset, validset, testset
    else:
        return trainset, validset, testset
from pylearn2.utils import serial
from pylearn2.datasets import preprocessing
from pylearn2.datasets.tfd import TFD

train = TFD(which_set='train')

preprocessor = preprocessing.Pipeline()
preprocessor.items.append(preprocessing.GlobalContrastNormalization())
preprocessor.items.append(preprocessing.ZCA())
preprocessor.apply(train, can_fit=True)

serial.save('tfd_gcn_whitener.pkl', preprocessor)
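# A minimal follow-up sketch: reloading the fitted preprocessor saved above and
# applying it to another TFD split with can_fit=False, so the ZCA whitening
# matrix learned on 'train' is reused rather than refit (GCN is per-example and
# has no fitted state). The choice of the 'valid' split and the output filename
# are illustrative assumptions.
from pylearn2.utils import serial
from pylearn2.datasets.tfd import TFD

preprocessor = serial.load('tfd_gcn_whitener.pkl')
valid = TFD(which_set='valid')
preprocessor.apply(valid, can_fit=False)
serial.save('tfd_valid_gcn_whitened.pkl', valid)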
def generate(opc):
    """
    Summary (Generates a dataset with the chosen transformation).

    Parameters
    ----------
    opc: string
        Only two options, shifts or rotations.
    """
    dim = 19  # outer square
    # A bigger image is used to avoid empty pixels in the borders.
    reg = 13  # inner square
    total = 20000  # Number of training examples

    im1 = numpy.zeros((total, reg, reg, 1), dtype='float32')
    im2 = numpy.zeros((total, reg, reg, 1), dtype='float32')
    Y = numpy.zeros((total, 1), dtype='uint8')
    rng = make_np_rng(9001, [1, 2, 3], which_method="uniform")

    transformation = opc

    if transformation == 'shifts':
        # Shifts
        # only shifts between [-3, +3] pixels
        shifts = list(itertools.product(range(-3, 4), range(-3, 4)))
        t = 0
        while t < total:
            x = rng.uniform(0, 1, (dim, dim))
            x = numpy.ceil(x * 255)
            im_x = x[3:16, 3:16][:, :, None]
            ind = rng.randint(0, len(shifts))
            Y[t] = ind
            txy = shifts[ind]
            tx, ty = txy
            im_y = x[(3 + tx):(16 + tx), (3 + ty):(16 + ty)][:, :, None]
            im1[t, :] = im_x
            im2[t, :] = im_y
            t += 1
    else:
        assert transformation == 'rotations'
        # Rotations
        import Image
        # import cv2
        angs = numpy.linspace(0, 359, 90)
        t = 0
        while t < total:
            x = rng.uniform(0, 1, (dim, dim))
            x = numpy.ceil(x * 255)
            im_x = x[3:16, 3:16][:, :, None]
            ind = rng.randint(0, len(angs))
            Y[t] = ind
            ang = angs[ind]
            y = numpy.asarray(Image.fromarray(x).rotate(ang))
            # scale = 1
            # M1 = cv2.getRotationMatrix2D((dim/2, dim/2), ang, scale)
            # y = cv2.warpAffine(x, M1, (dim, dim))
            im_y = y[3:16, 3:16][:, :, None]
            im1[t, :] = im_x
            im2[t, :] = im_y
            t += 1

    view_converter = dense_design_matrix.DefaultViewConverter((reg, reg, 1))
    design_X = view_converter.topo_view_to_design_mat(im1)
    design_Y = view_converter.topo_view_to_design_mat(im2)

    # Normalize data:
    pipeline = preprocessing.Pipeline()
    gcn = preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                    use_std=True)
    pipeline.items.append(gcn)
    XY = numpy.concatenate((design_X, design_Y), 0)
    XY_ImP = dense_design_matrix.DenseDesignMatrix(X=XY)
    XY_ImP.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    X1 = XY_ImP.X[0:design_X.shape[0], :]
    X2 = XY_ImP.X[design_X.shape[0]:, :]

    # As a Conv2DSpace
    topo_X1 = view_converter.design_mat_to_topo_view(X1)
    topo_X2 = view_converter.design_mat_to_topo_view(X2)
    axes = ('b', 0, 1, 'c')
    data_specs = (CompositeSpace(
        [Conv2DSpace((reg, reg), num_channels=1, axes=axes),
         Conv2DSpace((reg, reg), num_channels=1, axes=axes),
         VectorSpace(1)]),
        ('featuresX', 'featuresY', 'targets'))
    train = VectorSpacesDataset((topo_X1, topo_X2, Y), data_specs=data_specs)

    # As a VectorSpace
    # data_specs = (CompositeSpace(
    #     [VectorSpace(reg * reg),
    #      VectorSpace(reg * reg),
    #      VectorSpace(1)]),
    #     ('featuresX', 'featuresY', 'targets'))
    # train = VectorSpacesDataset(data=(X1, X2, Y), data_specs=data_specs)

    import os
    save_path = os.path.dirname(os.path.realpath(__file__))
    serial.save(os.path.join(save_path, 'train_preprocessed.pkl'), train)
def get_dataset(tot=False, preprocessor='normal'):
    if not os.path.exists(DATA_DIR + 'train.npy') or \
            not os.path.exists(DATA_DIR + 'test.npy') or \
            not os.path.exists(DATA_DIR + 'targets.npy'):
        initial_read()

    train_path = DATA_DIR + 'train_' + preprocessor + '_preprocessed.pkl'
    valid_path = DATA_DIR + 'valid_' + preprocessor + '_preprocessed.pkl'
    tottrain_path = DATA_DIR + 'tottrain_' + preprocessor + '_preprocessed.pkl'
    test_path = DATA_DIR + 'test_' + preprocessor + '_preprocessed.pkl'

    if os.path.exists(train_path) and os.path.exists(valid_path) and os.path.exists(test_path):
        print 'loading preprocessed data'
        trainset = serial.load(train_path)
        validset = serial.load(valid_path)
        if tot:
            tottrainset = serial.load(tottrain_path)
        testset = serial.load(test_path)
    else:
        print 'loading raw data...'
        trainset = Digits(which_set='train', start=0, stop=34000)
        validset = Digits(which_set='train', start=34000, stop=42000)
        tottrainset = Digits(which_set='train')
        testset = Digits(which_set='test')

        print 'preprocessing data...'
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
        if preprocessor != 'nozca':
            # ZCA = zero-phase component analysis
            # very similar to PCA, but preserves the look of the original image better
            pipeline.items.append(preprocessing.ZCA())

        # note the can_fit=False's: no sharing between train and valid data
        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        validset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        tottrainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)

        if preprocessor not in ('normal', 'nozca'):
            for data in (trainset, validset, tottrainset, testset):
                for ii in range(data.X.shape[0]):
                    # normalize to [0,1]
                    dmax = np.max(data.X[ii, :])
                    dmin = np.min(data.X[ii, :])
                    dnorm = (data.X[ii, :] - dmin) / (dmax - dmin)
                    # and convert to PIL image
                    img = Image.fromarray(dnorm.reshape(28, 28) * 255.).convert('L')

                    # apply preprocessor
                    if preprocessor == 'rotate':
                        rot = rng.randint(-40, 41)
                        img = img.rotate(rot, Image.BILINEAR)
                    elif preprocessor == 'emboss':
                        img = emboss(img)
                    elif preprocessor == 'hshear':
                        # coef = 0 means unsheared
                        coef = -1 + np.random.rand() * 2
                        # note: image is moved with (coef/2)*28 to center it after shearing
                        img = img.transform((28, 28), Image.AFFINE,
                                            (1, coef, -(coef / 2) * 28, 0, 1, 0),
                                            Image.BILINEAR)
                    elif preprocessor == 'vshear':
                        coef = -1 + np.random.rand() * 2
                        img = img.transform((28, 28), Image.AFFINE,
                                            (1, 0, 0, coef, 1, -(coef / 2) * 28),
                                            Image.BILINEAR)
                    elif preprocessor == 'patch':
                        # negative values are not possible in PIL, so do a zoom only transform then
                        x1 = np.random.randint(0, 5)
                        y1 = np.random.randint(0, 5)
                        x2 = np.random.randint(0, 5)
                        y2 = np.random.randint(0, 5)
                        img = img.transform((28, 28), Image.EXTENT,
                                            (x1, y1, 28 - x2, 28 - y2),
                                            Image.BILINEAR)

                    # convert back to numpy array
                    data.X[ii, :] = np.array(img.getdata()) / 255.

                    if preprocessor == 'noisy':
                        # add noise
                        data.X[ii, :] += np.random.randn(28 * 28) * 0.1
                        # bound between [0,1]
                        data.X[ii, :] = np.minimum(np.ones(28 * 28),
                                                   np.maximum(np.zeros(28 * 28),
                                                              data.X[ii, :]))

        # this uses numpy format for storage instead of pickle, for memory reasons
        trainset.use_design_loc(DATA_DIR + 'train_' + preprocessor + '_design.npy')
        validset.use_design_loc(DATA_DIR + 'valid_' + preprocessor + '_design.npy')
        tottrainset.use_design_loc(DATA_DIR + 'tottrain_' + preprocessor + '_design.npy')
        testset.use_design_loc(DATA_DIR + 'test_' + preprocessor + '_design.npy')

        # this path can be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        validset.yaml_src = '!pkl: "%s"' % valid_path
        tottrainset.yaml_src = '!pkl: "%s"' % tottrain_path
        testset.yaml_src = '!pkl: "%s"' % test_path

        print 'saving preprocessed data...'
        serial.save(train_path, trainset)
        serial.save(valid_path, validset)
        serial.save(tottrain_path, tottrainset)
        serial.save(test_path, testset)

    if tot:
        return tottrainset, validset, testset
    else:
        return trainset, validset, testset
def test_works():
    load = True
    if load == False:
        ddmTrain = FacialKeypoint(which_set='train', start=0, stop=6000)
        ddmValid = FacialKeypoint(which_set='train', start=6000, stop=7049)
        # valid can_fit = false
        pipeline = preprocessing.Pipeline()
        stndrdz = preprocessing.Standardize()
        stndrdz.apply(ddmTrain, can_fit=True)
        # doubt, how about can_fit = False?
        stndrdz.apply(ddmValid, can_fit=False)
        GCN = preprocessing.GlobalContrastNormalization()
        GCN.apply(ddmTrain, can_fit=True)
        GCN.apply(ddmValid, can_fit=False)

        pcklFile = open('kpd.pkl', 'wb')
        obj = (ddmTrain, ddmValid)
        pickle.dump(obj, pcklFile)
        pcklFile.close()
        return
    else:
        pcklFile = open('kpd.pkl', 'rb')
        (ddmTrain, ddmValid) = pickle.load(pcklFile)
        pcklFile.close()

    # creating layers
    # 2 convolutional rectified layers, border mode valid
    layer1 = ConvRectifiedLinear(layer_name='convRect1',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)
    layer2 = ConvRectifiedLinear(layer_name='convRect2',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)

    # Rectified linear units
    layer3 = RectifiedLinear(dim=3000, sparse_init=15, layer_name='RectLin3')

    # multisoftmax
    n_groups = 30
    n_classes = 98
    irange = 0
    layer_name = 'multisoftmax'
    layerMS = MultiSoftmax(n_groups=n_groups,
                           irange=0.05,
                           n_classes=n_classes,
                           layer_name=layer_name)

    # setting up MLP
    MLPerc = MLP(batch_size=8,
                 input_space=Conv2DSpace(shape=[96, 96], num_channels=1),
                 layers=[layer1, layer2, layer3, layerMS])

    # mlp_cost
    missing_target_value = -1
    mlp_cost = MLPCost(cost_type='default',
                       missing_target_value=missing_target_value)

    # algorithm
    # learning rate, momentum, batch size, monitoring dataset, cost, termination criteria
    term_crit = MonitorBased(prop_decrease=0.00001,
                             N=30,
                             channel_name='validation_objective')
    kpSGD = KeypointSGD(learning_rate=0.001,
                        init_momentum=0.5,
                        monitoring_dataset={'validation': ddmValid,
                                            'training': ddmTrain},
                        batch_size=8,
                        batches_per_iter=750,
                        termination_criterion=term_crit,
                        train_iteration_mode='random_uniform',
                        cost=mlp_cost)

    # train extension
    train_ext = ExponentialDecayOverEpoch(decay_factor=0.998, min_lr_scale=0.01)

    # train object
    train = Train(dataset=ddmTrain,
                  save_path='kpd_model2.pkl',
                  save_freq=1,
                  model=MLPerc,
                  algorithm=kpSGD,
                  extensions=[train_ext,
                              MonitorBasedSaveBest(channel_name='validation_objective',
                                                   save_path='kpd_best.pkl'),
                              MomentumAdjustor(start=1,
                                               saturate=20,
                                               final_momentum=.9)])
    train.main_loop()
    train.save()
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_cifar100_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """)
    README.close()

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(8, 8),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)
    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
def get_data(tot=True, flatgrey=False):
    tottrain_path = DATA_DIR + 'gz_preprocessed_tottrain' + str(SUBMODEL) + '_64x.pkl'
    test_path = DATA_DIR + 'gz_preprocessed_test' + str(SUBMODEL) + '_64x.pkl'

    if os.path.exists(test_path):
        print 'loading preprocessed data'
        datasets = OrderedDict()
        # datasets['train'] = serial.load(train_path)
        # datasets['valid'] = serial.load(valid_path)
        if tot:
            datasets['tottrain'] = serial.load(tottrain_path)
        datasets['test'] = serial.load(test_path)

        if tot:
            return datasets['tottrain'], datasets['test']
        else:
            return datasets['train'], datasets['test']
    else:
        print 'preprocessing data...'
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(preprocessing.GlobalContrastNormalization(use_std=True))
        pipeline.items.append(preprocessing.ZCA())

        # print 'traindata'
        # data = GalaxyZoo.gzdeepdata.GZData(which_set='training', start=0, stop=39999)
        # data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # # this path can be used for visualizing weights after training is done
        # data.yaml_src = '!pkl: "%s"' % data
        # # save
        # data.use_design_loc(DATA_DIR+'train_design' + str(SUBMODEL) + '.npy')
        # serial.save(DATA_DIR+'gz_preprocessed_train'+str(SUBMODEL) + '.pkl', data)

        # print 'validdata'
        # data = GalaxyZoo.gzdeepdata.GZData(which_set='training', start=40000, stop=61577)
        # data.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        # # this path can be used for visualizing weights after training is done
        # data.yaml_src = '!pkl: "%s"' % data
        # # save
        # data.use_design_loc(DATA_DIR+'valid_design' + str(SUBMODEL) + '.npy')
        # serial.save(DATA_DIR+'gz_preprocessed_valid'+str(SUBMODEL) + '.pkl', data)

        print 'tottraindata'
        data = GalaxyZoo.gzdeepdata.GZData(which_set='training', flatgrey=flatgrey)
        data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # this path can be used for visualizing weights after training is done
        data.yaml_src = '!pkl: "%s"' % data
        # save
        data.use_design_loc(DATA_DIR + 'tottrain_design' + str(SUBMODEL) + '_64x.npy')
        serial.save(DATA_DIR + 'gz_preprocessed_tottrain' + str(SUBMODEL) + '_64x.pkl', data)

        print 'testdata'
        data = GalaxyZoo.gzdeepdata.GZData(which_set='test', flatgrey=flatgrey)
        data.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        # this path can be used for visualizing weights after training is done
        data.yaml_src = '!pkl: "%s"' % data
        # save
        data.use_design_loc(DATA_DIR + 'test_design' + str(SUBMODEL) + '_64x.npy')
        serial.save(DATA_DIR + 'gz_preprocessed_test' + str(SUBMODEL) + '_64x.pkl', data)

        print 'Finished, now re-run for running model on GPU'
        return None, None