def test_mean_keep_dimensions(self):
    """Apply ``RemoveMean(axis=1)`` to CIFAR-10 and check the result is finite.

    Loads the CIFAR-10 training set, runs a ``RemoveMean`` preprocessor built
    with ``axis=1`` over it (fitting allowed), and asserts that ``isfinite``
    accepts the resulting design matrix.
    """
    dataset = cifar10.CIFAR10(which_set="train")
    dataset.apply_preprocessor(RemoveMean(axis=1), can_fit=True)
    assert isfinite(dataset.get_design_matrix())
def get_processed_dataset():
    """Return ``(trainset, testset)`` for CIFAR-10 patch-pair experiments.

    If both cache pickles exist and the module-level ``new_params`` flag is
    falsy, the two datasets are loaded from disk.  Otherwise the raw CIFAR-10
    data is loaded, a four-stage pipeline (patch extraction with position,
    global contrast normalization, PCA, patch-pair extraction) is applied to
    the training set only, and both datasets are pickled.

    The test set is saved and returned without any preprocessing.

    Reads these module-level settings: ``new_params``, ``patch_shape``,
    ``patches_per_image``, ``num_components``, ``keep_var_fraction``,
    ``train_size``, ``input_width``, ``start`` and ``stop``.
    """
    train_path = 'pp_cifar10_train.pkl'
    test_path = 'pp_cifar10_test.pkl'

    cache_usable = (os.path.exists(train_path)
                    and os.path.exists(test_path)
                    and not new_params)

    if cache_usable:
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print('loading raw data...')
        pipeline = preprocessing.Pipeline()
        stages = (
            preprocessing.ExtractPatchesWithPosition(
                patch_shape=patch_shape,
                patches_per_image=patches_per_image),
            preprocessing.GlobalContrastNormalization(
                sqrt_bias=10., use_std=True),
            preprocessing.PCA(
                num_components=num_components,
                keep_var_fraction=keep_var_fraction),
            preprocessing.ExtractPatchPairs(
                patches_per_image=patches_per_image,
                num_images=train_size,
                input_width=input_width),
        )
        for stage in stages:
            pipeline.items.append(stage)

        trainset = cifar10.CIFAR10(which_set="train", start=start, stop=stop)
        testset = cifar10.CIFAR10(which_set="test")

        # Keep the pipeline reachable from the dataset object itself.
        trainset.preprocessor = pipeline
        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)

        # the pkl-ing is having issues, the dataset is maybe too big.
        for path, dataset in ((train_path, trainset), (test_path, testset)):
            serial.save(path, dataset)

    # this path will be used for visualizing weights after training is done
    yaml_template = '!pkl: "%s"'
    trainset.yaml_src = yaml_template % train_path
    testset.yaml_src = yaml_template % test_path

    return trainset, testset
def main():
    """Preprocess CIFAR-10 with GCN followed by 512-component PCA and save it.

    The pipeline is fitted on the training set and then applied, without
    refitting, to the test set.  Both datasets are pickled into the current
    working directory.
    """
    train = cifar10.CIFAR10(which_set="train", center=True)

    contrast_norm = preprocessing.GlobalContrastNormalization(
        subtract_mean=False, sqrt_bias=0.0, use_std=True)
    projection = preprocessing.PCA(num_components=512)

    pipeline = preprocessing.Pipeline()
    for stage in (contrast_norm, projection):
        pipeline.items.append(stage)

    # NOTE(review): unlike the training set, the test set is built without
    # center=True -- confirm this asymmetry is intended.
    test = cifar10.CIFAR10(which_set="test")

    for dataset, may_fit in ((train, True), (test, False)):
        dataset.apply_preprocessor(preprocessor=pipeline, can_fit=may_fit)

    serial.save('cifar10_preprocessed_train.pkl', train)
    serial.save('cifar10_preprocessed_test.pkl', test)
def get_dataset_cifar10():
    """
    The original pipeline on cifar10 from pylearn2. Please refer to
    pylearn2/scripts/train_example/make_dataset.py for details.

    Returns
    -------
    (trainset, testset)
        Loaded from ``cifar10_preprocessed_train.pkl`` /
        ``cifar10_preprocessed_test.pkl`` when both files exist; otherwise
        built from the raw CIFAR-10 data, preprocessed (8x8 patch
        extraction, global contrast normalization, ZCA) and pickled to
        those paths.  Both datasets carry a ``yaml_src`` attribute pointing
        at their pickle.
    """
    train_path = 'cifar10_preprocessed_train.pkl'
    test_path = 'cifar10_preprocessed_test.pkl'

    if os.path.exists(train_path) and os.path.exists(test_path):
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print('loading raw data...')
        trainset = cifar10.CIFAR10(which_set="train")
        testset = cifar10.CIFAR10(which_set="test")

        print('preprocessing data...')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.ExtractPatches(patch_shape=(8, 8),
                                         num_patches=150000))
        pipeline.items.append(preprocessing.GlobalContrastNormalization())
        pipeline.items.append(preprocessing.ZCA())

        # Fit the pipeline on the training data only ...
        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        trainset.use_design_loc('train_design.npy')

        # ... and reuse the fitted pipeline for the test data, so that no
        # statistics are re-estimated from the test set.
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc('test_design.npy')

        print('saving preprocessed data...')
        serial.save(train_path, trainset)
        serial.save(test_path, testset)

    # this path will be used for visualizing weights after training is done
    trainset.yaml_src = '!pkl: "%s"' % train_path
    testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
def get_dataset_cifar10():
    """Return the one-hot CIFAR-10 ``(trainset, testset)`` pair, cached on disk.

    When both ``cifar10_train.pkl`` and ``cifar10_test.pkl`` exist they are
    loaded; otherwise the raw datasets are built with ``one_hot=True`` and
    pickled to those paths.  Each returned dataset has its ``yaml_src``
    attribute set to a ``!pkl:`` tag naming its pickle.
    """
    train_path = 'cifar10_train.pkl'
    test_path = 'cifar10_test.pkl'

    have_cache = os.path.exists(train_path) and os.path.exists(test_path)

    if not have_cache:
        print('loading raw data...')
        trainset = cifar10.CIFAR10(which_set="train", one_hot=True)
        testset = cifar10.CIFAR10(which_set="test", one_hot=True)
        serial.save(train_path, trainset)
        serial.save(test_path, testset)
    else:
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)

    # this path will be used for visualizing weights after training is done
    tag = '!pkl: "%s"'
    trainset.yaml_src = tag % train_path
    testset.yaml_src = tag % test_path

    return trainset, testset
def main():
    """Train a denoising autoencoder on the preprocessed CIFAR-10 training set.

    Fetches the datasets through ``get_processed_dataset()``, builds a
    ``DenoisingAutoencoder`` whose visible layer matches the preprocessed
    design matrix, and trains it with SGD for ``MAX_EPOCHS_UNSUPERVISED``
    epochs, saving the model to ``testrun.pkl`` after every epoch.

    Reads the module-level settings ``nhid`` and ``MAX_EPOCHS_UNSUPERVISED``.
    """
    # Only the trainset is processed by this function.
    print('getting preprocessed data to train model')
    pp_trainset, testset = get_processed_dataset()

    # remember to change here when changing datasets
    print('loading unprocessed data for input displays')
    # NOTE(review): the raw training set is loaded but not used further in
    # this function.
    trainset = cifar10.CIFAR10(which_set="train")

    # The model is trained and monitored on pp_trainset, so the visible layer
    # must be sized from the preprocessed design matrix, not the raw images.
    dmat = pp_trainset.get_design_matrix()
    nvis = dmat.shape[1]

    model = DenoisingAutoencoder(
        corruptor=BinomialCorruptor(corruption_level=0.5),
        nhid=nhid,
        nvis=nvis,
        act_enc='sigmoid',
        act_dec='sigmoid',
        irange=.01)

    algorithm = SGD(
        learning_rate=0.1,
        cost=MeanSquaredReconstructionError(),
        batch_size=1000,
        monitoring_batches=10,
        monitoring_dataset=pp_trainset,
        termination_criterion=EpochCounter(
            max_epochs=MAX_EPOCHS_UNSUPERVISED),
        update_callbacks=None)

    extensions = None

    trainer = Train(model=model,
                    algorithm=algorithm,
                    save_path='testrun.pkl',
                    save_freq=1,
                    extensions=extensions,
                    dataset=pp_trainset)
    trainer.main_loop()
# # This is also a common use case because often you will want to preprocess # your data once and then train several models on the preprocessed data. # We'll need the serial module to save the dataset from pylearn2.utils import serial # Our raw dataset will be the CIFAR10 image dataset from pylearn2.datasets import cifar10 # We'll need the preprocessing module to preprocess the dataset from pylearn2.datasets import preprocessing if __name__ == "__main__": # Our raw training set is 32x32 color images train = cifar10.CIFAR10(which_set="train") # We'd like to do several operations on them, so we'll set up a pipeline to # do so. pipeline = preprocessing.Pipeline() # First we want to pull out small patches of the images, since it's easier # to train an RBM on these pipeline.items.append( preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=150000) ) # Next we contrast normalize the patches. The default arguments use the # same "regularization" parameters as those used in Adam Coates, Honglak # Lee, and Andrew Ng's paper "An Analysis of Single-Layer Networks in # Unsupervised Feature Learning"
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

# Script: extract 2,000,000 8x8 patches from CIFAR-10, contrast-normalize and
# ZCA-process them, then store the train and test results under
# /data/lisatmp/goodfeli.

train = cifar10.CIFAR10(which_set="train")

pipeline = preprocessing.Pipeline()
for stage in (
        preprocessing.ExtractPatches(patch_shape=(8, 8),
                                     num_patches=2000000),
        preprocessing.GlobalContrastNormalization(),
        preprocessing.ZCA()):
    pipeline.items.append(stage)

test = cifar10.CIFAR10(which_set="test")

# Fit on the training set, then apply the fitted pipeline to the test set.
train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

# Register the design-matrix locations before pickling either dataset.
train.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_train_2M_design.npy')
test.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_test_2M_design.npy')

serial.save('/data/lisatmp/goodfeli/cifar10_preprocessed_train_2M.pkl',
            train)
serial.save('/data/lisatmp/goodfeli/cifar10_preprocessed_test_2M.pkl',
            test)
# end def lcn_2d if __name__ == '__main__': from pylearn2.datasets import cifar10 import matplotlib.pylab as plt from classification import load_initial_data from fileop import loadfile import copy flag_cifar10 = False flag_covmat = False if flag_cifar10: img_shape = (32, 32, 3) train = cifar10.CIFAR10(which_set="train", one_hot=True) test = cifar10.CIFAR10(which_set="test", one_hot=True) X = train.X X_test = test.X else: # use moth data for test img_shape = (28, 28, 3) config = loadfile('config.yaml') X, _, X_test, _ = \ load_initial_data(data_path=config['data_path'], target_width=config['target_width'], target_height=config['target_height'], flag_rescale=config['flag_rescale'], flag_multiscale=config['flag_multiscale'], detect_width_list=config['detect_width_list'],
from kaggle_dataset import kaggle_cifar10 from pylearn2.datasets.preprocessing import Pipeline, ZCA from pylearn2.datasets.preprocessing import GlobalContrastNormalization from pylearn2.space import Conv2DSpace from pylearn2.train import Train from pylearn2.train_extensions import best_params, window_flip from pylearn2.utils import serial trn = kaggle_cifar10('train', one_hot=True, datapath='/home/kkastner/kaggle_data/kaggle-cifar10', max_count=40000, axes=('c', 0, 1, 'b')) tst = cifar10.CIFAR10('test', toronto_prepro=False, one_hot=True, axes=('c', 0, 1, 'b')) in_space = Conv2DSpace(shape=(32, 32), num_channels=3, axes=('c', 0, 1, 'b')) l1 = maxout.MaxoutConvC01B(layer_name='l1', pad=4, tied_b=1, W_lr_scale=.05, b_lr_scale=.05, num_channels=96, num_pieces=2, kernel_shape=(8, 8), pool_shape=(4, 4),
# Replicate the preprocessing described in Kai Yu's paper
# "Improving LCC with Local Tangents".
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

train = cifar10.CIFAR10(which_set="train", center=True)

# Two stages: contrast normalization (no mean subtraction, zero sqrt bias,
# divide by the standard deviation) followed by a 512-component PCA.
pipeline = preprocessing.Pipeline()
for stage in (
        preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                                  sqrt_bias=0.0,
                                                  use_std=True),
        preprocessing.PCA(num_components=512)):
    pipeline.items.append(stage)

# NOTE(review): the test set is built without center=True, unlike the
# training set -- confirm this asymmetry is intended.
test = cifar10.CIFAR10(which_set="test")

# Fit on train, then reuse the fitted pipeline on test.
train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

serial.save('cifar10_preprocessed_train.pkl', train)
serial.save('cifar10_preprocessed_test.pkl', test)
import numpy as np
from pylearn2.utils import serial
from pylearn2.utils import string_utils
from pylearn2.datasets import preprocessing
from pylearn2.datasets import cifar10
import pylearn2.pca as pca

# Script: fit a PCA preprocessor on the CIFAR-10 training set, apply it to
# both the training and the test set, and store the results in output_dir.

output_dir = string_utils.preprocess(
    '/u/kruegers/repo/current/pylearn2/pylearn2/datasets/cifar10')

print("Preparing output directory...")
serial.mkdir(output_dir)

print('Loading CIFAR-10 train and test datasets...')
trainset = cifar10.CIFAR10(which_set='train')
testset = cifar10.CIFAR10(which_set='test')

print("Learning the preprocessor")
preprocessor = pca.PCA()

print("Preprocessing the unsupervised train data...")
trainset.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

print('Saving the unsupervised train data')
trainset.use_design_loc(output_dir + '/train.npy')
serial.save(output_dir + '/train.pkl', trainset)

print("Preprocessing the test data...")
# Reuse the preprocessor fitted on the training set; do not refit on test.
testset.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

print("Saving the test data")
testset.use_design_loc(output_dir + '/test.npy')
# Mirror the train branch: the dataset is only written out once it is
# pickled, so the test set needs its own serial.save call.
serial.save(output_dir + '/test.pkl', testset)
def main():
    """Train a denoising autoencoder on patch pairs, then plot example pairs.

    Steps:

    1. Fetch the preprocessed training set from ``get_processed_dataset()``
       and load the raw CIFAR-10 training set for display purposes.
    2. Build a ``DenoisingAutoencoder`` sized from the preprocessed design
       matrix and train it with SGD for ``MAX_EPOCHS_UNSUPERVISED`` epochs,
       saving to ``run.pkl`` after every epoch.
    3. Pick ``num2plot`` random examples, outline their two patches on the
       raw images, and decode the patch / displacement encoding stored in
       the preprocessed design matrix back into image space.
    4. Draw inputs and decoded outputs side by side, write the figure to
       ``cifar_ppd.png`` and display it.

    Relies on module-level settings (``nhid``, ``learning_rate``,
    ``MAX_EPOCHS_UNSUPERVISED``, ``num2plot``, ``patch_width``,
    ``input_width``, ``input_dim``, ``num_channels``) and on helpers defined
    elsewhere (``add_outlines``, ``flat_to_2D``, ``expand_p1``,
    ``expand_p2``, ``insertpatches``).
    """
    # Only the trainset is processed by this function.
    print('getting preprocessed data for training model')
    pp_trainset, testset = get_processed_dataset()

    # remember to change here when changing datasets
    print('loading unprocessed data for input displays')
    trainset = cifar10.CIFAR10(which_set="train")

    # The visible layer is sized from the data the model is trained on.
    dmat = pp_trainset.get_design_matrix()
    nvis = dmat.shape[1]

    model = DenoisingAutoencoder(
        corruptor=BinomialCorruptor(corruption_level=0.3),
        nhid=nhid,
        nvis=nvis,
        act_enc='sigmoid',
        act_dec='sigmoid',
        irange=.01)

    algorithm = SGD(
        learning_rate=learning_rate,
        cost=MeanSquaredReconstructionError(),
        batch_size=100,
        monitoring_batches=10,
        monitoring_dataset=pp_trainset,
        termination_criterion=EpochCounter(
            max_epochs=MAX_EPOCHS_UNSUPERVISED),
        update_callbacks=None)

    extensions = None

    trainer = Train(model=model,
                    algorithm=algorithm,
                    save_path='run.pkl',
                    save_freq=1,
                    extensions=extensions,
                    dataset=pp_trainset)
    trainer.main_loop()

    ####################
    # Plot and Save:

    # choose random patch-pairs to plot
    stamps = pp_trainset.stamps
    num_examples = stamps.shape[0]
    to_plot = np.random.randint(0, num_examples, num2plot)

    # use to_plot indices to extract data
    stamps_data = stamps[to_plot]
    # Column 0 of a stamp is used as the index of the source image.
    image_numbers = stamps[to_plot, 0].astype(int)
    X = trainset.X
    images_data = trainset.get_topological_view(X[image_numbers])
    p1x = stamps_data[:, 1]
    p1y = stamps_data[:, 2]
    p2x = stamps_data[:, 3]
    p2y = stamps_data[:, 4]

    # For input ppd's, once we've identified the patches, we just outline
    # them and draw an arrow for d
    # This might mess with original trainset (I dunno), in which case, we
    # should make a copy
    add_outlines(images_data, p1x, p1y, patch_width)
    add_outlines(images_data, p2x, p2y, patch_width)

    ##################################################
    # translating outputs back into things we can plot
    # NOTE(review): Xout is read from the preprocessed dataset itself, not
    # from the trained model -- confirm that this is what should be plotted.
    dataset = pp_trainset
    Xout = dataset.X.astype('float32')
    max_stamp = input_width - patch_width
    d_size = (2 * max_stamp + 1)**input_dim

    # displacement: the last d_size columns, decoded via argmax
    d_enc = Xout[:, -d_size:]
    d_out_flat = np.argmax(d_enc, axis=1)
    d_shape = [2 * max_stamp + 1, 2 * max_stamp + 1]  # assumed 2D
    d_out = flat_to_2D(d_out_flat, d_shape)

    # patches: everything before the displacement columns, split in half
    vc = dataset.view_converter
    p_enc = Xout[:, :len(Xout.T) - d_size]
    # Floor division: p_size is used as a slice index and must be an int.
    p_size = p_enc.shape[1] // 2
    p1_enc = p_enc[:, :p_size]
    p2_enc = p_enc[:, p_size:]
    p1_enc = vc.design_mat_to_topo_view(p1_enc)
    p2_enc = vc.design_mat_to_topo_view(p2_enc)

    # Undo the contrast normalization (second pipeline stage).
    pp = dataset.preprocessor
    gcn = pp.items[1]
    means = gcn.means
    normalizers = gcn.normalizers

    # Target shape (num_examples, 1, ..., 1): one singleton axis per input
    # dimension, plus one more when num_channels != 1.
    toshape = (num_examples,) + (1,) * input_dim
    if num_channels != 1:
        toshape += (1,)

    # When the number of patches and patch-pairs differs, this breaks.
    # I need to match up normalizers/means with their corresponding patches
    # undoing the PCA might be breaking too, but without errors...
    normalizers1 = expand_p1(normalizers)
    normalizers2 = expand_p2(normalizers)
    means1 = expand_p1(means)
    means2 = expand_p2(means)

    p1_enc *= normalizers1.reshape(toshape)
    p1_enc += means1.reshape(toshape)
    p2_enc *= normalizers2.reshape(toshape)
    p2_enc += means2.reshape(toshape)

    # Now, we pull off the same examples from the data to compare to dAE
    # inputs in plots
    outputs = copy.deepcopy(images_data)
    insertpatches(outputs, p1_enc[to_plot], p1x, p1y, patch_width)
    insertpatches(outputs, p2_enc[to_plot], p2x, p2y, patch_width)

    plt.figure()

    for i in range(num2plot):
        # Inputs
        plt.subplot(num2plot, 2, 2 * i + 1)
        plt.imshow(images_data[i], cmap=cm.Greys_r)
        print(stamps_data[i])
        # Arrow starts half a patch width in from the first patch's stamp
        # position and uses stamp columns 6 and 5 as its offsets.
        a = (stamps_data[i, 2] + patch_width / 2,
             stamps_data[i, 1] + patch_width / 2,
             stamps_data[i, 6],
             stamps_data[i, 5])
        plt.arrow(a[0], a[1], a[2], a[3],
                  head_width=1.0, head_length=0.6)

        # Outputs
        plt.subplot(num2plot, 2, 2 * i + 2)
        plt.imshow(outputs[i], cmap=cm.Greys_r)
        plt.arrow(a[0], a[1],
                  d_out[to_plot[i], 1], d_out[to_plot[i], 0],
                  head_width=1.0, head_length=0.6)

    # Save before showing: once show() returns the figure may already have
    # been torn down, in which case savefig would write an empty image.
    savestr = 'cifar_ppd.png'
    plt.savefig(savestr)
    plt.show()