def setUp(self):
    """Load the train and test sets; check for nan and inf."""
    skip_if_no_data()
    self.train_set = CIFAR100(which_set='train')
    self.test_set = CIFAR100(which_set='test')
    assert not np.any(np.isnan(self.train_set.X))
    assert not np.any(np.isinf(self.train_set.X))
    assert not np.any(np.isnan(self.test_set.X))
    assert not np.any(np.isinf(self.test_set.X))
def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100')

    print('Loading CIFAR-100 train dataset...')
    train = CIFAR100(which_set='train', gcn=55.)

    print("Preparing output directory...")
    output_dir = data_dir + '/pylearn2_gcn_whitened'
    serial.mkdir(output_dir)
    README = open(output_dir + '/README', 'w')

    README.write(textwrap.dedent("""
        The .pkl files in this directory may be opened in python using
        cPickle, pickle, or pylearn2.serial.load.

        train.pkl and test.pkl each contain a pylearn2 Dataset object
        defining a labeled dataset of a 32x32 contrast normalized,
        approximately whitened version of the CIFAR-100 dataset.
        train.pkl contains labeled train examples.
        test.pkl contains labeled test examples.

        preprocessor.pkl contains a pylearn2 ZCA object that was used to
        approximately whiten the images. You may want to use this object
        later to preprocess other images.

        They were created with the pylearn2 script
        make_cifar100_gcn_whitened.py.

        All other files in this directory, including this README, were
        created by the same script and are necessary for the other files
        to function correctly.
        """))
    README.close()

    print("Learning the preprocessor "
          "and preprocessing the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    train.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the training data')
    train.use_design_loc(output_dir + '/train.npy')
    serial.save(output_dir + '/train.pkl', train)

    print("Loading the test data")
    test = CIFAR100(which_set='test', gcn=55.)

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir + '/test.npy')
    serial.save(output_dir + '/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
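A hedged usage sketch, not part of the original script: the README above says the saved ZCA object may be reused to preprocess other images. One plausible way to do so, with the path and choice of dataset purely illustrative:

from pylearn2.utils import serial
from pylearn2.datasets.cifar100 import CIFAR100

# Illustrative path; serial.load expands ${PYLEARN2_DATA_PATH}.
output_dir = '${PYLEARN2_DATA_PATH}/cifar100/pylearn2_gcn_whitened'
preprocessor = serial.load(output_dir + '/preprocessor.pkl')
other = CIFAR100(which_set='test', gcn=55.)
# can_fit=False reuses the whitening statistics learned on the train set.
other.apply_preprocessor(preprocessor=preprocessor, can_fit=False)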
def get_labels_and_fold_indices(cifar10, cifar100, stl10):
    assert stl10 or cifar10 or cifar100
    assert stl10 + cifar10 + cifar100 == 1

    if stl10:
        print 'loading entire stl-10 train set just to get the labels and folds'
        stl10 = serial.load("${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/train.pkl")
        train_y = stl10.y

        fold_indices = stl10.fold_indices
    elif cifar10 or cifar100:
        if cifar10:
            print 'loading entire cifar10 train set just to get the labels'
            cifar = CIFAR10(which_set='train')
        else:
            assert cifar100
            print 'loading entire cifar100 train set just to get the labels'
            cifar = CIFAR100(which_set='train')
            cifar.y = cifar.y_fine
        train_y = cifar.y
        assert train_y is not None

        fold_indices = np.zeros((5, 40000), dtype='uint16')
        # mimic matlab format of stl10
        idx_list = np.cast['uint16'](np.arange(1, 50001))
        for i in xrange(5):
            mask = idx_list < i * 10000 + 1
            mask += idx_list >= (i + 1) * 10000 + 1
            fold_indices[i, :] = idx_list[mask]
        assert fold_indices.min() == 1
        assert fold_indices.max() == 50000

    return train_y, fold_indices
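A hedged sketch of how the returned labels and matlab-style fold indices might be consumed; the variable names here are illustrative:

train_y, fold_indices = get_labels_and_fold_indices(cifar10=0, cifar100=1, stl10=0)
fold = 0
# fold_indices is 1-based (stl10 matlab convention); shift to numpy indexing.
train_idx = fold_indices[fold, :].astype('int32') - 1
fold_labels = train_y[train_idx]
assert fold_labels.shape[0] == 40000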
def test_iterator(self):
    """
    Tests that batches returned by an iterator with topological
    data_specs are the same as the ones returned by calling
    get_topological_view on the dataset with the corresponding order
    """
    batch_size = 100
    b01c_X = self.test_set.X[0:batch_size, :]
    b01c_topo = self.test_set.get_topological_view(b01c_X)
    b01c_b01c_it = self.test_set.iterator(
        mode='sequential',
        batch_size=batch_size,
        data_specs=(Conv2DSpace(shape=(32, 32),
                                num_channels=3,
                                axes=('b', 0, 1, 'c')),
                    'features'))
    b01c_b01c = b01c_b01c_it.next()
    assert np.all(b01c_topo == b01c_b01c)

    c01b_test = CIFAR100(which_set='test', axes=('c', 0, 1, 'b'))
    c01b_X = c01b_test.X[0:batch_size, :]
    c01b_topo = c01b_test.get_topological_view(c01b_X)
    c01b_c01b_it = c01b_test.iterator(
        mode='sequential',
        batch_size=batch_size,
        data_specs=(Conv2DSpace(shape=(32, 32),
                                num_channels=3,
                                axes=('c', 0, 1, 'b')),
                    'features'))
    c01b_c01b = c01b_c01b_it.next()
    assert np.all(c01b_topo == c01b_c01b)

    # Also check that samples from iterators with the same data_specs
    # with Conv2DSpace do not depend on the axes of the dataset
    b01c_c01b_it = self.test_set.iterator(
        mode='sequential',
        batch_size=batch_size,
        data_specs=(Conv2DSpace(shape=(32, 32),
                                num_channels=3,
                                axes=('c', 0, 1, 'b')),
                    'features'))
    b01c_c01b = b01c_c01b_it.next()
    assert np.all(b01c_c01b == c01b_c01b)

    c01b_b01c_it = c01b_test.iterator(
        mode='sequential',
        batch_size=batch_size,
        data_specs=(Conv2DSpace(shape=(32, 32),
                                num_channels=3,
                                axes=('b', 0, 1, 'c')),
                    'features'))
    c01b_b01c = c01b_b01c_it.next()
    assert np.all(c01b_b01c == b01c_b01c)
def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')

    print('Loading CIFAR-100 train dataset...')
    data = CIFAR100(which_set='train')

    print("Preparing output directory...")
    patch_dir = data_dir + '/cifar100/cifar100_patches'
    serial.mkdir(patch_dir)
    README = open(patch_dir + '/README', 'w')

    README.write(textwrap.dedent("""
        The .pkl files in this directory may be opened in python using
        cPickle, pickle, or pylearn2.serial.load.

        data.pkl contains a pylearn2 Dataset object defining an unlabeled
        dataset of 2 million 6x6 approximately whitened, contrast-normalized
        patches drawn uniformly at random from the CIFAR-100 train set.

        preprocessor.pkl contains a pylearn2 Pipeline object that was used
        to extract the patches and approximately whiten / contrast normalize
        them. This object is necessary when extracting features for
        supervised learning or test set classification, because the
        extracted features must be computed using inputs that have been
        whitened with the ZCA matrix learned and stored by this Pipeline.

        They were created with the pylearn2 script make_cifar100_patches.py.

        All other files in this directory, including this README, were
        created by the same script and are necessary for the other files
        to function correctly.
        """))
    README.close()

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(6, 6),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                  use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')
    serial.save(patch_dir + '/data.pkl', data)
    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
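The README above says the saved Pipeline is needed when computing features later. A hedged sketch of one way that might look, reapplying the fitted normalization and whitening while skipping patch extraction; `new_X` is stand-in data and the slicing of pipeline.items is an assumption, not the library's documented API:

import numpy as np
from pylearn2.utils import serial
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix

pipeline = serial.load(patch_dir + '/preprocessor.pkl')
# Stand-in 6x6x3 patches; real inputs would come from feature extraction.
new_X = np.random.uniform(0., 1., (100, 6 * 6 * 3)).astype('float32')
new_data = DenseDesignMatrix(X=new_X)
# items[0] is ExtractPatches; only the fitted GCN and ZCA are reapplied.
for item in pipeline.items[1:]:
    new_data.apply_preprocessor(preprocessor=item, can_fit=False)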
def get_labels():
    cifar100 = CIFAR100(which_set="train")
    train_y = cifar100.y_coarse
    assert train_y.shape == (50000, )
    # Zero out every coarse label except the six classes of interest.
    for i in xrange(50000):
        if train_y[i] not in [3, 4, 6, 7, 11, 12]:
            train_y[i] = 0
    tlc = TL_Challenge(which_set='train')
    test_y = tlc.y_coarse
    return train_y, test_y
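The relabeling loop above can also be written in vectorized numpy; a hedged equivalent sketch:

import numpy as np

# Keep the six coarse classes of interest, zero out everything else.
keep = np.in1d(train_y, [3, 4, 6, 7, 11, 12])
train_y = np.where(keep, train_y, 0)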
def get_test_labels(cifar10, cifar100, stl10):
    assert cifar10 + cifar100 + stl10 == 1
    if stl10:
        print 'loading entire stl-10 test set just to get the labels'
        stl10 = serial.load("${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/test.pkl")
        return stl10.y
    if cifar10:
        print 'loading entire cifar10 test set just to get the labels'
        cifar10 = CIFAR10(which_set='test')
        return np.asarray(cifar10.y)
    if cifar100:
        print 'loading entire cifar100 test set just to get the fine labels'
        cifar100 = CIFAR100(which_set='test')
        return np.asarray(cifar100.y_fine)
    assert False
def test_topo_c01b(self):
    """
    Tests that a topological batch with axes ('c', 0, 1, 'b')
    can be dimshuffled back to match the standard ('b', 0, 1, 'c')
    format.
    """
    batch_size = 100
    c01b_test = CIFAR100(which_set='test', axes=('c', 0, 1, 'b'))
    c01b_X = c01b_test.X[0:batch_size, :]
    c01b = c01b_test.get_topological_view(c01b_X)
    assert c01b.shape == (3, 32, 32, batch_size)
    b01c = c01b.transpose(3, 1, 2, 0)
    b01c_X = self.test_set.X[0:batch_size, :]
    assert c01b_X.shape == b01c_X.shape
    assert np.all(c01b_X == b01c_X)
    b01c_direct = self.test_set.get_topological_view(b01c_X)
    assert b01c_direct.shape == b01c.shape
    assert np.all(b01c_direct == b01c)
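A minimal numpy-only sketch of the axis bookkeeping the test above checks: transposing with (3, 1, 2, 0) maps ('c', 0, 1, 'b') data to ('b', 0, 1, 'c') order:

import numpy as np

c01b = np.arange(3 * 32 * 32 * 100).reshape(3, 32, 32, 100)
b01c = c01b.transpose(3, 1, 2, 0)
assert b01c.shape == (100, 32, 32, 3)
# A given (channel, row, col, example) entry keeps its value.
assert c01b[2, 5, 7, 9] == b01c[9, 5, 7, 2]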
def get_ddm_cifar100(self, ddm_id):
    row = self.db.executeSQL("""
        SELECT which_set, center, gcn, toronto_prepro, axes,
               start, stop, one_hot
        FROM hps3.ddm_cifar100
        WHERE ddm_id = %s
        """, (ddm_id,), self.db.FETCH_ONE)
    if not row:
        raise HPSData("No cifar100 ddm for ddm_id=" + str(ddm_id))
    (which_set, center, gcn, toronto_prepro, axes_char,
     start, stop, one_hot) = row
    axes = self.get_axes(axes_char)
    return CIFAR100(which_set=which_set,
                    center=center,
                    gcn=gcn,
                    toronto_prepro=toronto_prepro,
                    axes=axes,
                    start=start,
                    stop=stop,
                    one_hot=one_hot)
import numpy as np
from pylearn2.datasets.cifar100 import CIFAR100

print 'loading cifar100...'
y = CIFAR100(which_set='train').y_fine

print 'loading full'
X = np.load('/data/lisatmp2/goodfeli/bet_the_farm.npy')

print 'loading restricted'
Z = np.load('/data/lisatmp2/goodfeli/hack.npy')

# Z contains only the examples whose fine label is 0 or 1, so it is
# indexed with a separate counter.
idx = 0
for i in xrange(50000):
    if y[i] < 2:
        cur_X = X[i, :, :, :]
        cur_Z = Z[idx, :, :, :]
        diffs = cur_X - cur_Z
        max_diff = np.abs(diffs).max()
        print i, '\t', max_diff
        idx += 1
import sys

import numpy as np

from pylearn2.datasets.cifar100 import CIFAR100
# NOTE: TheanoSGDClassifier is assumed to be imported from its defining
# module; that import is not shown in this snippet.

f = open(sys.argv[2])
l = f.readlines()
f.close()
s = '\n'.join(l)
args = eval(s)

clf = TheanoSGDClassifier(100, **args)

X = np.load(sys.argv[1])
if len(X.shape) == 4:
    X = X.reshape(X.shape[0], X.shape[1] * X.shape[2] * X.shape[3])
y = CIFAR100(which_set="train").y_fine.astype(int)
print 'fit'
clf.fit(X, y)
del X
del y

y = np.asarray(CIFAR100(which_set="test").y_fine).astype(int)
print 'loading test data'
X = np.load(sys.argv[3])
X = X.reshape(X.shape[0], X.shape[1] * X.shape[2] * X.shape[3])
print 'evaluating svm'
yhat = clf.predict(X)
print (yhat == y).mean()
        (-1, 3, 32, 32))
    valid_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., valid_set.X), 1.),
        (-1, 3, 32, 32))
    test_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., test_set.X), 1.),
        (-1, 3, 32, 32))
elif args.dataset == 'cifar100':
    print('Loading CIFAR-100 dataset...')
    from pylearn2.datasets.cifar100 import CIFAR100
    pylearn_path = os.environ['PYLEARN2_DATA_PATH']
    path = os.path.join(pylearn_path, 'cifar100', 'cifar-100-python')
    if not os.path.exists(path):
        cmd = subprocess.call('scripts/download_cifar100.sh')
    train_set = CIFAR100(which_set="train", start=0, stop=45000)
    valid_set = CIFAR100(which_set="train", start=45000, stop=50000)
    test_set = CIFAR100(which_set="test")
    classes = 100
    save_path = "../weights/cifar100-w1a1.npz"
    print("save_path = " + str(save_path))
    train_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., train_set.X), 1.),
        (-1, 3, 32, 32))
    valid_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., valid_set.X), 1.),
        (-1, 3, 32, 32))
    test_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., test_set.X), 1.),
        (-1, 3, 32, 32))
""" This script makes a dataset of 32x32 approximately whitened CIFAR-10 images. """ from __future__ import print_function from pylearn2.utils import serial from pylearn2.datasets import preprocessing from pylearn2.utils import string_utils as string from pylearn2.datasets.cifar100 import CIFAR100 data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/cifar100') print('Loading CIFAR-100 train dataset...') train = CIFAR100(which_set='train') print("Preparing output directory...") output_dir = data_dir + '/whitened' serial.mkdir(output_dir) README = open(output_dir + '/README', 'w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. train.pkl, and test.pkl each contain a pylearn2 Dataset object defining a labeled dataset of an approximately whitened version of the CIFAR-100 dataset. train.pkl contains labeled train examples. test.pkl contains labeled test examples.
parser = argparse.ArgumentParser(description='Testing Script')
parser.add_argument('--dataset', '-d', default="cifar10",
                    help='dataset to use: cifar10, cifar100, mnist')
parser.add_argument('--model', '-m', default="cnv",
                    help='model to use: resnet, lenet, inception, cnv')
args = parser.parse_args()

batch_size = 10000

if args.dataset == 'cifar10':
    print('Loading CIFAR-10 dataset...')
    from pylearn2.datasets.cifar10 import CIFAR10
    test_set = CIFAR10(which_set="test", start=0, stop=batch_size)
    classes = 10
    test_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., test_set.X), 1.),
        (-1, 3, 32, 32))
elif args.dataset == 'cifar100':
    print('Loading CIFAR-100 dataset...')
    from pylearn2.datasets.cifar100 import CIFAR100
    test_set = CIFAR100(which_set="test", start=0, stop=batch_size)
    classes = 100
    test_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., test_set.X), 1.),
        (-1, 3, 32, 32))
elif args.dataset == 'mnist':
    print('Loading MNIST dataset...')
    from pylearn2.datasets.mnist import MNIST
    test_set = MNIST(which_set="test", start=0, stop=batch_size)
    classes = 10
    test_set.X = 2 * test_set.X.reshape(-1, 1, 28, 28) - 1.
    # flatten targets
    test_set.y = np.hstack(test_set.y)
    # one hot
    test_set.y = np.float32(np.eye(classes)[test_set.y])
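A quick sanity sketch, not part of the original script, of the 2./255 rescaling used in all three branches: pixel values in [0, 255] map onto [-1, 1]:

import numpy as np

x = np.array([0., 127.5, 255.])
scaled = np.subtract(np.multiply(2. / 255., x), 1.)
assert np.allclose(scaled, [-1., 0., 1.])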
This script is intended to reproduce the preprocessing used by Adam
Coates et al. in their work from the first half of 2011 on the
CIFAR-10 and STL-10 datasets.
"""
from __future__ import print_function

from pylearn2.utils import serial
from pylearn2.datasets import preprocessing
from pylearn2.datasets.cifar100 import CIFAR100
from pylearn2.utils import string_utils as string

data_dir = string.preprocess('${PYLEARN2_DATA_PATH}')

print('Loading CIFAR-100 train dataset...')
data = CIFAR100(which_set='train')

print("Preparing output directory...")
patch_dir = data_dir + '/cifar100/cifar100_patches_8x8'
serial.mkdir(patch_dir)
README = open(patch_dir + '/README', 'w')

README.write("""
The .pkl files in this directory may be opened in python using
cPickle, pickle, or pylearn2.serial.load.

data.pkl contains a pylearn2 Dataset object defining an unlabeled
dataset of 2 million 8x8 approximately whitened, contrast-normalized
patches drawn uniformly at random from the CIFAR-100 train set.

preprocessor.pkl contains a pylearn2 Pipeline object that was used
""" This script makes a dataset of 32x32 contrast normalized, approximately whitened CIFAR-100 images. """ from pylearn2.utils import serial from pylearn2.datasets import preprocessing from pylearn2.utils import string_utils from pylearn2.datasets.cifar100 import CIFAR100 data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100') print 'Loading CIFAR-100 train dataset...' train = CIFAR100(which_set = 'train', gcn = 55.) print "Preparing output directory..." output_dir = data_dir + '/pylearn2_gcn_whitened' serial.mkdir( output_dir ) README = open(output_dir + '/README','w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. train.pkl, and test.pkl each contain a pylearn2 Dataset object defining a labeled dataset of a 32x32 contrast normalized, approximately whitened version of the CIFAR-100 dataset. train.pkl contains labeled train examples. test.pkl contains labeled test examples.
from pylearn2.utils import serial
from pylearn2.datasets.cifar100 import CIFAR100
from pylearn2.datasets.tl_challenge import TL_Challenge
from pylearn2.datasets import preprocessing
import os
import numpy as np

goodfeli_tmp = os.environ['GOODFELI_TMP']

train = CIFAR100(which_set="train")
aug = TL_Challenge(which_set="unlabeled")
aug2 = TL_Challenge(which_set="train")
train.set_design_matrix(np.concatenate((train.X, aug.X, aug2.X), axis=0))
del aug
del aug2

pipeline = preprocessing.Pipeline()
pipeline.items.append(
    preprocessing.ExtractPatches(patch_shape=(6, 6), num_patches=2000000))
pipeline.items.append(preprocessing.GlobalContrastNormalization())
pipeline.items.append(preprocessing.ZCA())

train.apply_preprocessor(preprocessor=pipeline, can_fit=True)

train.use_design_loc(goodfeli_tmp + '/tl_challenge_patches_2M_6x6_design.npy')
serial.save(goodfeli_tmp + '/tl_challenge_patches_2M_6x6.pkl', train)