def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar10')

    print('Loading CIFAR-10 train dataset...')
    train = CIFAR10(which_set='train')

    print("Preparing output directory...")
    output_dir = data_dir + '/pylearn2_whitened'
    serial.mkdir(output_dir)
    README = open(output_dir + '/README', 'w')

    README.write(textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    train.pkl and test.pkl each contain a pylearn2 Dataset object
    defining a labeled dataset of a 32x32 approximately whitened version
    of the CIFAR-10 dataset. train.pkl contains labeled train examples.
    test.pkl contains labeled test examples.

    preprocessor.pkl contains a pylearn2 ZCA object that was used to
    approximately whiten the images. You may want to use this object
    later to preprocess other images.

    They were created with the pylearn2 script make_cifar10_whitened.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))
    README.close()

    print("Learning the preprocessor and preprocessing "
          "the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    train.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the unsupervised data')
    train.use_design_loc(output_dir + '/train.npy')
    serial.save(output_dir + '/train.pkl', train)

    print("Loading the test data")
    test = CIFAR10(which_set='test')

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir + '/test.npy')
    serial.save(output_dir + '/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
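# Because preprocessor.pkl stores the fitted ZCA object, it can be reloaded
# later to whiten new images without refitting -- a minimal sketch, assuming
# the output directory written by the script above already exists (the
# apply_preprocessor/can_fit=False pattern is the same one used on the test
# set in main()):
from pylearn2.utils import serial
from pylearn2.datasets.cifar10 import CIFAR10

# serial.load expands ${PYLEARN2_DATA_PATH} just like string_utils.preprocess
preprocessor = serial.load(
    '${PYLEARN2_DATA_PATH}/cifar10/pylearn2_whitened/preprocessor.pkl')

# can_fit=False reuses the stored whitening matrix instead of refitting it
new_data = CIFAR10(which_set='test')
new_data.apply_preprocessor(preprocessor=preprocessor, can_fit=False)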
def get_labels_and_fold_indices(cifar10, cifar100, stl10):
    assert stl10 or cifar10 or cifar100
    assert stl10 + cifar10 + cifar100 == 1

    if stl10:
        print 'loading entire stl-10 train set just to get the labels and folds'
        stl10 = serial.load("${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/train.pkl")
        train_y = stl10.y

        fold_indices = stl10.fold_indices
    elif cifar10 or cifar100:
        if cifar10:
            print 'loading entire cifar10 train set just to get the labels'
            cifar = CIFAR10(which_set='train')
        else:
            assert cifar100
            print 'loading entire cifar100 train set just to get the labels'
            cifar = CIFAR100(which_set='train')
            cifar.y = cifar.y_fine
        train_y = cifar.y
        assert train_y is not None

        fold_indices = np.zeros((5, 40000), dtype='uint16')
        # mimic matlab format of stl10
        idx_list = np.cast['uint16'](np.arange(1, 50001))
        for i in xrange(5):
            mask = idx_list < i * 10000 + 1
            mask += idx_list >= (i + 1) * 10000 + 1
            fold_indices[i, :] = idx_list[mask]
        assert fold_indices.min() == 1
        assert fold_indices.max() == 50000

    return train_y, fold_indices
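# The mask arithmetic above holds out one contiguous block of 10000
# 1-indexed examples per fold. A standalone numpy sketch of that invariant,
# mirroring the loop above (toy check, not part of the original script):
import numpy as np

fold_indices = np.zeros((5, 40000), dtype='uint16')
idx_list = np.arange(1, 50001, dtype='uint16')
for i in range(5):
    # keep everything outside the i-th block of 10000 examples
    mask = (idx_list < i * 10000 + 1) | (idx_list >= (i + 1) * 10000 + 1)
    fold_indices[i, :] = idx_list[mask]

assert fold_indices.shape == (5, 40000)
# fold 0 holds out examples 1..10000, so its smallest kept index is 10001
assert fold_indices[0].min() == 10001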
def get_test_set(self):
    return CIFAR10(which_set='test', center=self.center,
                   rescale=self.rescale, gcn=self.gcn,
                   one_hot=self.one_hot,
                   toronto_prepro=self.toronto_prepro,
                   axes=self.axes)
def get_valid(ds, limit_size=-1, fold=0):
    if ds == 'mnist':
        data = MNIST('train', start=50000, stop=60000)
        return data.X[:limit_size]
    elif ds == 'tfd':
        data = TFD('valid', fold=fold, scale=True)
        return data.X
    elif ds == 'cifar10':
        data = CIFAR10(which_set='train', start=4000, stop=50000, gcn=55.)
        return data.X[:limit_size]
    else:
        raise ValueError("Unknown dataset: {}".format(ds))
def get_test_labels(cifar10, stl10):
    assert cifar10 or stl10
    assert not (cifar10 and stl10)

    if stl10:
        print 'loading entire stl-10 test set just to get the labels'
        stl10 = serial.load("${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/test.pkl")
        return stl10.y
    if cifar10:
        print 'loading entire cifar10 test set just to get the labels'
        cifar10 = CIFAR10(which_set='test')
        return np.asarray(cifar10.y)
def test_iterator(self):
    # Tests that batches returned by an iterator with topological
    # data_specs are the same as the ones returned by calling
    # get_topological_view on the dataset with the corresponding order
    batch_size = 100
    b01c_X = self.test.X[0:batch_size, :]
    b01c_topo = self.test.get_topological_view(b01c_X)
    b01c_b01c_it = self.test.iterator(
        mode='sequential',
        batch_size=batch_size,
        data_specs=(Conv2DSpace(shape=(32, 32),
                                num_channels=3,
                                axes=('b', 0, 1, 'c')),
                    'features'))
    b01c_b01c = b01c_b01c_it.next()
    assert np.all(b01c_topo == b01c_b01c)

    c01b_test = CIFAR10(which_set='test', axes=('c', 0, 1, 'b'))
    c01b_X = c01b_test.X[0:batch_size, :]
    c01b_topo = c01b_test.get_topological_view(c01b_X)
    c01b_c01b_it = c01b_test.iterator(
        mode='sequential',
        batch_size=batch_size,
        data_specs=(Conv2DSpace(shape=(32, 32),
                                num_channels=3,
                                axes=('c', 0, 1, 'b')),
                    'features'))
    c01b_c01b = c01b_c01b_it.next()
    assert np.all(c01b_topo == c01b_c01b)

    # Also check that samples from iterators with the same data_specs
    # with Conv2DSpace do not depend on the axes of the dataset
    b01c_c01b_it = self.test.iterator(
        mode='sequential',
        batch_size=batch_size,
        data_specs=(Conv2DSpace(shape=(32, 32),
                                num_channels=3,
                                axes=('c', 0, 1, 'b')),
                    'features'))
    b01c_c01b = b01c_c01b_it.next()
    assert np.all(b01c_c01b == c01b_c01b)

    c01b_b01c_it = c01b_test.iterator(
        mode='sequential',
        batch_size=batch_size,
        data_specs=(Conv2DSpace(shape=(32, 32),
                                num_channels=3,
                                axes=('b', 0, 1, 'c')),
                    'features'))
    c01b_b01c = c01b_b01c_it.next()
    assert np.all(c01b_b01c == b01c_b01c)
def get_test_labels(cifar10, cifar100, stl10):
    assert cifar10 + cifar100 + stl10 == 1

    if stl10:
        print 'loading entire stl-10 test set just to get the labels'
        stl10 = serial.load("${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/test.pkl")
        return stl10.y
    if cifar10:
        print 'loading entire cifar10 test set just to get the labels'
        cifar10 = CIFAR10(which_set='test')
        return np.asarray(cifar10.y)
    if cifar100:
        print 'loading entire cifar100 test set just to get the fine labels'
        cifar100 = CIFAR100(which_set='test')
        return np.asarray(cifar100.y_fine)
    assert False
def test_topo_c01b(self):
    """
    Tests that a topological batch with axes ('c', 0, 1, 'b') can be
    dimshuffled back to match the standard ('b', 0, 1, 'c') format.
    """
    batch_size = 100
    c01b_test = CIFAR10(which_set='test', axes=('c', 0, 1, 'b'))
    c01b_X = c01b_test.X[0:batch_size, :]
    c01b = c01b_test.get_topological_view(c01b_X)
    assert c01b.shape == (3, 32, 32, batch_size)
    b01c = c01b.transpose(3, 1, 2, 0)
    b01c_X = self.test.X[0:batch_size, :]
    assert c01b_X.shape == b01c_X.shape
    assert np.all(c01b_X == b01c_X)
    b01c_direct = self.test.get_topological_view(b01c_X)
    assert b01c_direct.shape == b01c.shape
    assert np.all(b01c_direct == b01c)
def _get_dataset(dataset, no_imgs):
    """
    Dump a machine learning dataset into a C source file and create a
    header file with the array declaration and a few macro definitions.
    """
    dataset = dataset.lower()
    if dataset not in dataset_factory['name']:
        print('Dataset is not in the factory')
        sys.exit()
    if dataset == 'mnist':
        print('Reading MNIST dataset from pylearn2 database')
        data = MNIST(which_set='test')
        if no_imgs > data.X.shape[0]:
            cprint('Only {:d} images are available in this dataset. '
                   'Dumping only those many'.format(data.X.shape[0]), 'red')
        data_x = data.X[0:min(no_imgs, data.X.shape[0])]
        data_y = data.y[0:min(no_imgs, data.y.shape[0])]
        data_x = np.uint8(data_x * 255)  # conversion from 0 -> 0.99 to 0 -> 255
    elif dataset == 'cifar10':
        print('Reading CIFAR-10 dataset from pylearn2 database')
        data = CIFAR10(which_set='test')
        if no_imgs > data.X.shape[0]:
            cprint('Only {:d} images are available in this dataset. '
                   'Dumping only those many'.format(data.X.shape[0]), 'red')
        data_x = data.X[0:min(no_imgs, data.X.shape[0])]
        data_y = data.y[0:min(no_imgs, data.y.shape[0])]
        data_x = np.uint8(data_x)  # already in the range 0 -> 255
    elif dataset == 'svhn':
        cprint('Not supported', 'red')
        return
    return data_x, data_y
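# A usage sketch for the helper above, assuming 'cifar10' is registered in
# dataset_factory['name'] and np is imported as in the surrounding module:
data_x, data_y = _get_dataset('cifar10', 100)
# first 100 test images as raw uint8 pixels, plus their labels
assert data_x.dtype == np.uint8 and data_x.shape[0] == 100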
def setUp(self):
    skip_if_no_data()
    self.test = CIFAR10(which_set='test')
def test_topo(self):
    """Tests that a topological batch has 4 dimensions"""
    train = CIFAR10(which_set='train')
    topo = train.get_batch_topo(1)
    assert topo.ndim == 4
model = serial.load(model_path)
model.set_dtype('float32')

stl10 = model.dataset_yaml_src.find('stl10') != -1
cifar10 = model.dataset_yaml_src.find('cifar10') != -1
cifar100 = model.dataset_yaml_src.find('cifar100') != -1
if cifar100:
    cifar10 = False
assert int(cifar10) + int(cifar100) + int(stl10) == 1

print 'loading dataset'
if cifar10:
    print 'CIFAR10 detected'
    dataset = CIFAR10(which_set="train")
elif cifar100:
    print 'CIFAR100 detected'
    dataset = CIFAR100(which_set='train')
elif stl10:
    print 'STL10 detected'
    dataset = serial.load(
        '${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/train.pkl')
X = dataset.get_design_matrix()[batch_start:batch_start + batch_size, :]

size = np.sqrt(model.nvis / 3)

if cifar10 or cifar100:
    pv1 = make_viewer((X - 127.5) / 127.5, is_color=True, rescale=False)
elif stl10:
    pv1 = make_viewer(X / 127.5, is_color=True, rescale=False)
def main():
    # BN parameters
    batch_size = 200
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    # "Glorot" means we are using the coefficients from Glorot's paper
    W_LR_scale = "Glorot"
    print("W_LR_scale = " + str(W_LR_scale))

    # Training parameters
    num_epochs = 500
    print("num_epochs = " + str(num_epochs))

    # Decaying LR
    LR_start = 0.01
    print("LR_start = " + str(LR_start))
    LR_fin = 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    train_set_size = 45000
    print("train_set_size = " + str(train_set_size))
    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))

    print('\nLoading CIFAR-10 dataset...')
    train_set = CIFAR10(which_set="train", start=0, stop=train_set_size)
    valid_set = CIFAR10(which_set="train", start=train_set_size, stop=50000)
    test_set = CIFAR10(which_set="test")

    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    train_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., train_set.X), 1.), (-1, 3, 32, 32))
    valid_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., valid_set.X), 1.), (-1, 3, 32, 32))
    test_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., test_set.X), 1.), (-1, 3, 32, 32))

    # flatten targets
    train_set.y = np.hstack(train_set.y)
    valid_set.y = np.hstack(valid_set.y)
    test_set.y = np.hstack(test_set.y)

    if oneHot:
        # Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.
    else:
        train_set.y = np.int32(train_set.y)
        valid_set.y = np.int32(valid_set.y)
        test_set.y = np.int32(test_set.y)

    print('\nBuilding the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    if oneHot:
        target = T.matrix('targets')
    else:
        target = T.ivector('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    cnn = buildCNN(dataType='cifar10', networkType='cifar10',
                   oneHot=oneHot, input=input, epsilon=epsilon,
                   alpha=alpha, activation=activation, binary=binary,
                   stochastic=stochastic, H=H, W_LR_scale=W_LR_scale)

    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    # squared hinge loss
    if oneHot:
        loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))
    else:
        loss = LO.categorical_crossentropy(train_output, target)
        loss = loss.mean()

    # W updates
    W = lasagne.layers.get_all_params(cnn, binary=True)
    W_grads = binary_net.compute_grads(loss, cnn)
    updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W,
                                   learning_rate=LR)
    updates = binary_net.clipping_scaling(updates, cnn)

    # other parameters updates
    params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
    updates = OrderedDict(updates.items() + lasagne.updates.adam(
        loss_or_grads=loss, params=params, learning_rate=LR).items())

    test_output = lasagne.layers.get_output(cnn, deterministic=True)
    if oneHot:
        test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
        test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                                T.argmax(target, axis=1)),
                          dtype=theano.config.floatX)
    else:
        test_loss = LO.categorical_crossentropy(test_output, target)
        test_loss = test_loss.mean()
        test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                                T.argmax(target)),
                          dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch
    # (by giving the updates dictionary) and returning the corresponding
    # training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')
    binary_net.train(train_fn, val_fn, cnn, batch_size,
                     LR_start, LR_decay, num_epochs,
                     train_set.X, train_set.y,
                     valid_set.X, valid_set.y,
                     test_set.X, test_set.y,
                     shuffle_parts=shuffle_parts)
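# With +/-1 one-hot targets, the squared hinge loss above penalizes any
# class score whose margin against its +/-1 target falls below 1. A
# standalone numpy sketch of the same formula (toy numbers, not the Theano
# graph compiled above):
import numpy as np

# one example, 10 classes: target class 3 encoded as +1, the rest as -1
target = 2 * np.float32(np.eye(10)[3]) - 1.
output = np.full(10, -2.0, dtype=np.float32)
output[3] = 2.0  # confident, correct scores

# squared hinge: mean(max(0, 1 - t * o)^2); zero once every margin >= 1
loss = np.mean(np.square(np.maximum(0., 1. - target * output)))
assert loss == 0.0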
def load_dataset(dataset):
    if dataset == "CIFAR-10":
        print('Loading CIFAR-10 dataset...')
        train_set_size = 45000
        train_set = CIFAR10(which_set="train", start=0, stop=train_set_size)
        valid_set = CIFAR10(which_set="train", start=train_set_size,
                            stop=50000)
        test_set = CIFAR10(which_set="test")

        train_set.X = np.transpose(
            np.reshape(np.subtract(np.multiply(2. / 255., train_set.X), 1.),
                       (-1, 3, 32, 32)), (0, 2, 3, 1))
        valid_set.X = np.transpose(
            np.reshape(np.subtract(np.multiply(2. / 255., valid_set.X), 1.),
                       (-1, 3, 32, 32)), (0, 2, 3, 1))
        test_set.X = np.transpose(
            np.reshape(np.subtract(np.multiply(2. / 255., test_set.X), 1.),
                       (-1, 3, 32, 32)), (0, 2, 3, 1))

        # flatten targets
        train_set.y = np.hstack(train_set.y)
        valid_set.y = np.hstack(valid_set.y)
        test_set.y = np.hstack(test_set.y)

        # Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.

        # enlarge train data set by mirroring
        x_train_flip = train_set.X[:, :, ::-1, :]
        y_train_flip = train_set.y
        train_set.X = np.concatenate((train_set.X, x_train_flip), axis=0)
        train_set.y = np.concatenate((train_set.y, y_train_flip), axis=0)

    elif dataset == "MNIST":
        print('Loading MNIST dataset...')
        train_set_size = 50000
        train_set = MNIST(which_set="train", start=0, stop=train_set_size)
        valid_set = MNIST(which_set="train", start=train_set_size,
                          stop=60000)
        test_set = MNIST(which_set="test")

        train_set.X = np.transpose(
            np.reshape(np.subtract(np.multiply(2. / 255., train_set.X), 1.),
                       (-1, 1, 28, 28)), (0, 2, 3, 1))
        valid_set.X = np.transpose(
            np.reshape(np.subtract(np.multiply(2. / 255., valid_set.X), 1.),
                       (-1, 1, 28, 28)), (0, 2, 3, 1))
        test_set.X = np.transpose(
            np.reshape(np.subtract(np.multiply(2. / 255., test_set.X), 1.),
                       (-1, 1, 28, 28)), (0, 2, 3, 1))

        # flatten targets
        train_set.y = np.hstack(train_set.y)
        valid_set.y = np.hstack(valid_set.y)
        test_set.y = np.hstack(test_set.y)

        # Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.

        # enlarge train data set by mirroring
        x_train_flip = train_set.X[:, :, ::-1, :]
        y_train_flip = train_set.y
        train_set.X = np.concatenate((train_set.X, x_train_flip), axis=0)
        train_set.y = np.concatenate((train_set.y, y_train_flip), axis=0)
    else:
        print("wrong dataset given")

    return train_set, valid_set, test_set
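# In the b01c layout produced above, axis 2 is image width, so
# X[:, :, ::-1, :] is a horizontal flip. A small numpy check of that, and
# of the training-set doubling, using a dummy array (standalone sketch):
import numpy as np

X = np.arange(2 * 4 * 4 * 3, dtype=np.float32).reshape(2, 4, 4, 3)  # b01c
X_flip = X[:, :, ::-1, :]  # reverse the width axis only

# each row of each image is mirrored left-to-right
assert np.all(X_flip[0, 0, 0, :] == X[0, 0, -1, :])

X_aug = np.concatenate((X, X_flip), axis=0)
assert X_aug.shape[0] == 2 * X.shape[0]  # training set doubles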
ignore, model_path, data_override = sys.argv
model = serial.load(model_path)

# Get access to the intermediate layers of the augmented DBM
if hasattr(model, 'super_dbm'):
    model = model.super_dbm

if hasattr(model, 'dataset_yaml_src'):
    dataset = yaml_parse.load(model.dataset_yaml_src)
else:
    from pylearn2.datasets.cifar10 import CIFAR10
    dataset = CIFAR10(which_set='test', gcn=55.)

rng = np.random.RandomState([2012, 10, 24])
if data_override == 'binary_noise':
    dataset.X = rng.uniform(0., 1., dataset.X.shape) > 0.5
elif data_override == 'gaussian_noise':
    dataset.X = rng.randn(*dataset.X.shape).astype(dataset.X.dtype)

batch_size = 25
model.set_batch_size(batch_size)

perc = .99
num_examples = 50000

num_layers = len(model.hidden_layers)
num_filters = []
act_record = []
""" This script makes a dataset of 32x32 contrast normalized, approximately whitened CIFAR-10 images. """ from pylearn2.utils import serial from pylearn2.datasets import preprocessing from pylearn2.utils import string_utils from pylearn2.datasets.cifar10 import CIFAR10 data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar10') print 'Loading CIFAR-10 train dataset...' train = CIFAR10(which_set='train', gcn=55.) print "Preparing output directory..." output_dir = data_dir + '/pylearn2_gcn_whitened' serial.mkdir(output_dir) README = open(output_dir + '/README', 'w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. train.pkl, and test.pkl each contain a pylearn2 Dataset object defining a labeled dataset of a 32x32 contrast normalized, approximately whitened version of the STL-10 dataset. train.pkl contains labeled train examples. test.pkl contains labeled test examples.
num_epochs = 10
print("num_epochs = " + str(num_epochs))

# Decaying LR
LR_start = 0.001
print("LR_start = " + str(LR_start))
LR_fin = 0.0000003
print("LR_fin = " + str(LR_fin))
LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
print("LR_decay = " + str(LR_decay))

shuffle_parts = 1
print("shuffle_parts = " + str(shuffle_parts))

if args.dataset == 'cifar10':
    print('Loading CIFAR-10 dataset...')
    from pylearn2.datasets.cifar10 import CIFAR10
    train_set = CIFAR10(which_set="train", start=0, stop=45000)
    valid_set = CIFAR10(which_set="train", start=45000, stop=50000)
    test_set = CIFAR10(which_set="test")
    classes = 10
    save_path = "../weights/cifar10-w1a1.npz"
    print("save_path = " + str(save_path))

    train_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., train_set.X), 1.),
        (-1, 3, 32, 32))
    valid_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., valid_set.X), 1.),
        (-1, 3, 32, 32))
    test_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., test_set.X), 1.),
        (-1, 3, 32, 32))
    return self.predict_fn_(X)


import sys

f = open(sys.argv[2])
l = f.readlines()
f.close()
s = '\n'.join(l)
args = eval(s)

clf = TheanoSGDClassifier(10, **args)
from pylearn2.datasets.cifar10 import CIFAR10
X = np.load(sys.argv[1])
X = X.reshape(X.shape[0], X.shape[1] * X.shape[2] * X.shape[3])
y = CIFAR10(which_set="train").y.astype(int)
print 'fit'
clf.fit(X, y)
del X
del y

y = np.asarray(CIFAR10(which_set="test").y).astype(int)
print 'loading test data'
X = np.load(sys.argv[3])
X = X.reshape(X.shape[0], X.shape[1] * X.shape[2] * X.shape[3])
print 'evaluating svm'
yhat = clf.predict(X)
print (yhat == y).mean()
def __init__(self, which_set, center=False, rescale=False, gcn=None,
             one_hot=False, start=None, stop=None, axes=('b', 0, 1, 'c'),
             toronto_prepro=False, preprocessor=None):
    # note: there is no such thing as the cifar10 validation set;
    # pylearn1 defined one but really it should be user-configurable
    # (as it is here)
    self.axes = axes

    # we define here:
    dtype = 'uint8'
    ntrain = 50000
    nvalid = 0  # artefact, we won't use it
    ntest = 10000

    # we also expose the following details:
    self.img_shape = (3, 32, 32)
    self.img_size = np.prod(self.img_shape)
    self.n_classes = 10
    self.label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                        'dog', 'frog', 'horse', 'ship', 'truck']

    # load the pre-dumped design matrices rather than the raw batch files
    if which_set == 'train':
        X = np.load(os.environ['PYLEARN2_DATA_PATH'] +
                    '/cifar10/train_X.npy')
        y = np.load(os.environ['PYLEARN2_DATA_PATH'] +
                    '/cifar10/train_y.npy')
        X = np.cast['float32'](X)
        y = np.cast['float32'](y)
    elif which_set == 'test':
        X = np.load(os.environ['PYLEARN2_DATA_PATH'] +
                    '/cifar10/test_X.npy')
        y = np.load(os.environ['PYLEARN2_DATA_PATH'] +
                    '/cifar10/test_y.npy')
        X = np.cast['float32'](X)
        y = np.cast['float32'](y)

    if which_set == 'test':
        assert X.shape[0] == 10000

    if isinstance(y, list):
        y = np.asarray(y)

    if center:
        X -= 127.5
    self.center = center

    if rescale:
        X /= 127.5
    self.rescale = rescale

    if toronto_prepro:
        assert not center
        assert not gcn
        X = X / 255.
        if which_set == 'test':
            other = CIFAR10(which_set='train')
            oX = other.X
            oX /= 255.
            X = X - oX.mean(axis=0)
        else:
            X = X - X.mean(axis=0)
    self.toronto_prepro = toronto_prepro

    self.gcn = gcn
    if gcn is not None:
        gcn = float(gcn)
        X = global_contrast_normalize(X, scale=gcn)

    if start is not None:
        # This needs to come after the prepro so that it doesn't change
        # the pixel means computed above for toronto_prepro
        assert start >= 0
        assert stop > start
        assert stop <= X.shape[0]
        X = X[start:stop, :]
        y = y[start:stop]
        assert X.shape[0] == y.shape[0]

    view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                              axes)

    if which_set == 'train':
        length = X.shape[0]

        def search_right_label(desired_label, i):
            for idx in xrange(i, length):
                if y[idx] == desired_label:
                    return idx

        def swap_ele(index, i):
            x_tmp = X[i]
            X[i] = X[index]
            X[index] = x_tmp
            y_tmp = y[i]
            y[i] = y[index]
            y[index] = y_tmp

        # reorder the train set so the labels cycle 0, 1, ..., 9, 0, 1, ...
        desired_label = 0
        for i in xrange(length):
            desired_label = i % 10
            if y[i] != desired_label:
                index = search_right_label(desired_label, i)
                swap_ele(index, i)
        for i in xrange(length - 100, length):
            print y[i]

    self.one_hot = one_hot
    if one_hot:
        one_hot = np.zeros((y.shape[0], 10), dtype='float32')
        for i in xrange(y.shape[0]):
            one_hot[i, y[i]] = 1.
        y = one_hot

    super(My_CIFAR10, self).__init__(X=X, y=y,
                                     view_converter=view_converter)

    assert not np.any(np.isnan(self.X))

    if preprocessor:
        preprocessor.apply(self)
from pylearn2.datasets.cifar10 import CIFAR10
import time
import warnings
from theano.printing import Print
import numpy as np
from galatea.dbm.inpaint.super_dbm import SuperDBM
from galatea.dbm.inpaint.super_dbm import GaussianConvolutionalVisLayer
from galatea.dbm.inpaint.super_dbm import ConvMaxPool
from galatea.dbm.inpaint.super_dbm import Softmax
from pylearn2.utils import serial
from theano.gof.op import get_debug_values
from theano.printing import min_informative_str

dataset = CIFAR10(which_set='train', one_hot=True, gcn=55.)

rng = np.random.RandomState([2012, 07, 24])
irange = .05
nvis = 784
nclass = 10
nhid = 500
mf_iter = 10
batch_size = 100
lr = .001
momentum = 1. / 20.

from pylearn2.utils import sharedX
import theano.tensor as T

X = dataset.X
y = dataset.y
def loadDataset(type, oneHot=True):
    if type == 'TCDTIMIT':
        nbClasses = 39

        # get the database
        # If it's small (lipspeakers) -> generate X_train, y_train etc here
        # otherwise we need to load and generate each speaker separately
        # in the training loop
        dataset = "TCDTIMIT"
        root_dir = os.path.join(
            os.path.expanduser('~/TCDTIMIT/lipreading/' + dataset))
        results_dir = root_dir + "/results/CNN_binaryNet"
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)

        database_binaryDir = root_dir + '/binary'
        datasetType = "lipspeakers"  # "lipspeakers" or "volunteers"
        ##############################################

        if datasetType == "lipspeakers":
            # only lipspeakers are small enough to fit in CPU RAM,
            # so generate X_train etc here
            loadPerSpeaker = False
            storeProcessed = True
            processedDir = database_binaryDir + "_allLipspeakersProcessed"

            # TODO: prepLip_all can be used to generate a pkl containing
            # all the lipspeaker data. Not sure if this still works, so
            # use with care!
            pkl_path = processedDir + os.sep + datasetType + "_oneHot" + ".pkl"
            if not os.path.exists(pkl_path):
                print("dataset not yet processed. Processing...")
                preprocessLipreading.prepLip_all(
                    data_path=database_binaryDir, store_path=pkl_path,
                    trainFraction=0.7, validFraction=0.1, testFraction=0.2,
                    nbClasses=nbClasses, onehot=oneHot, type=datasetType,
                    verbose=True)
            X_train, y_train, X_val, y_val, X_test, y_test = \
                general_tools.unpickle(pkl_path)
            dtypeX = 'float32'
            dtypeY = 'int32'
            X_train = X_train.astype(dtypeX)
            y_train = y_train.astype(dtypeY)
            X_val = X_val.astype(dtypeX)
            y_val = y_val.astype(dtypeY)
            X_test = X_test.astype(dtypeX)
            y_test = y_test.astype(dtypeY)

            if oneHot:
                # Onehot the targets
                y_train = np.float32(np.eye(nbClasses)[y_train])
                y_val = np.float32(np.eye(nbClasses)[y_val])
                y_test = np.float32(np.eye(nbClasses)[y_test])

                # for hinge loss
                y_train = 2 * y_train - 1.
                y_val = 2 * y_val - 1.
                y_test = 2 * y_test - 1.

    else:  # cifar10
        nbClasses = 10
        train_set_size = 45000
        print("train_set_size = " + str(train_set_size))
        train_set = CIFAR10(which_set="train", start=0, stop=train_set_size)
        valid_set = CIFAR10(which_set="train", start=train_set_size,
                            stop=50000)
        test_set = CIFAR10(which_set="test")

        # bc01 format
        # Inputs in the range [-1,+1]
        train_set.X = np.reshape(
            np.subtract(np.multiply(2. / 255., train_set.X), 1.),
            (-1, 3, 32, 32))
        valid_set.X = np.reshape(
            np.subtract(np.multiply(2. / 255., valid_set.X), 1.),
            (-1, 3, 32, 32))
        test_set.X = np.reshape(
            np.subtract(np.multiply(2. / 255., test_set.X), 1.),
            (-1, 3, 32, 32))

        # flatten targets
        train_set.y = np.hstack(train_set.y)
        valid_set.y = np.hstack(valid_set.y)
        test_set.y = np.hstack(test_set.y)

        # Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.

        X_train = train_set.X
        y_train = train_set.y
        X_val = valid_set.X
        y_val = valid_set.y
        X_test = test_set.X
        y_test = test_set.y

    return X_train, y_train, X_val, y_val, X_test, y_test
def load_dataset(which_set, dataset_types):
    # we need to have at least 2 types otherwise this func is useless
    assert len(dataset_types) > 1

    print "loading.. ", which_set
    if which_set == 'test':
        start_set = 0
        stop_set = 10000
    elif which_set == 'valid':
        which_set = 'train'
        start_set = 40000
        stop_set = 50000
    else:  # train
        start_set = 0
        stop_set = 40000

    n_classes = 10
    data = []
    for prepro in dataset_types:
        if prepro == 'gcn':
            print "LOADING GCN..."
            input_data = CIFAR10(which_set=which_set,
                                 start=start_set, stop=stop_set,
                                 gcn=55., axes=['b', 0, 1, 'c'])
            data.append(input_data.get_topological_view())
        if prepro == 'toronto':
            print "LOADING TOR..."
            input_data = CIFAR10(which_set=which_set,
                                 start=start_set, stop=stop_set,
                                 axes=['b', 0, 1, 'c'], toronto_prepro=1)
            data.append(input_data.get_topological_view())
        if prepro == 'zca':
            print "LOADING ZCA..."
            data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar10')
            input_data = ZCA_Dataset(
                preprocessed_dataset=serial.load(
                    data_dir + "/pylearn2_gcn_whitened/" + which_set + ".pkl"),
                preprocessor=serial.load(
                    data_dir + "/pylearn2_gcn_whitened/preprocessor.pkl"),
                start=start_set, stop=stop_set, axes=['b', 0, 1, 'c'])
            data.append(input_data.get_topological_view())

    target_data = OneHotFormatter(n_classes).format(input_data.y,
                                                    mode="concatenate")
    data.append(target_data)

    data_source = []
    for i in range(len(dataset_types)):
        data_source.append('features' + str(i))
    data_source.append('targets')

    ################################## DEFINE SPACES ##################################
    spaces = []
    # add input spaces as b01c
    for i in range(0, len(dataset_types)):
        spaces.append(Conv2DSpace(shape=(32, 32), num_channels=3,
                                  axes=('b', 0, 1, 'c')))
    # add output space
    spaces.append(VectorSpace(n_classes))

    set = VectorSpacesDataset(tuple(data),
                              (CompositeSpace(spaces), tuple(data_source)))

    return set
exit(-1)

top_dir = os.environ['CRAFT_BNN_ROOT']
params_dir = top_dir + '/params'

# BinaryOut
activation = hardware_net.SignTheano
print("activation = sign(x)")

no_bias = True
print("no_bias = " + str(no_bias))

# BinaryConnect
H = 1.

print('Loading CIFAR-10 dataset...')
test_set = CIFAR10(which_set="test")
print("Test set size = " + str(len(test_set.X)))
test_instances = 10000
print("Using instances 0 .. " + str(test_instances))

# bc01 format
# Inputs in the range [-1,+1]
test_set.X = np.reshape(
    np.subtract(np.multiply(2. / 255., test_set.X), 1.), (-1, 3, 32, 32))

# flatten targets
test_set.y = np.hstack(test_set.y)

# Onehot the targets
test_set.y = np.float32(np.eye(10)[test_set.y])

# for hinge loss
test_set.y = 2 * test_set.y - 1.
from pylearn2.datasets.cifar10 import CIFAR10
from pylearn2.gui.patch_viewer import PatchViewer

dataset = CIFAR10(which_set='test')

pv = PatchViewer((10, 1), (32, 32), is_color=True)

T, y = dataset.get_batch_topo(10, include_labels=True)

for i in xrange(10):
    print dataset.label_names[y[i]]
    pv.add_patch(dataset.adjust_for_viewer(T[i, :, :, :]), rescale=False)

pv.show()
def get_test_data():
    print 'loading entire cifar-10 dataset'
    cifar10 = CIFAR10(which_set='test')
    return cifar10
def load_dataset(dataset, train_set_size, val_stop):
    if dataset == "CIFAR-10":
        print('Loading CIFAR-10 dataset...')
        train_set = CIFAR10(which_set="train", start=0, stop=train_set_size)
        valid_set = CIFAR10(which_set="train", start=train_set_size,
                            stop=val_stop)  # 50000
        test_set = CIFAR10(which_set="test")

        train_set.X = np.transpose(np.reshape(train_set.X, (-1, 3, 32, 32)),
                                   (0, 2, 3, 1))
        valid_set.X = np.transpose(np.reshape(valid_set.X, (-1, 3, 32, 32)),
                                   (0, 2, 3, 1))
        test_set.X = np.transpose(np.reshape(test_set.X, (-1, 3, 32, 32)),
                                  (0, 2, 3, 1))

        # flatten targets
        train_set.y = np.hstack(train_set.y)
        valid_set.y = np.hstack(valid_set.y)
        test_set.y = np.hstack(test_set.y)

        # Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.

        # enlarge train data set by mirroring
        x_train_flip = train_set.X[:, :, ::-1, :]
        y_train_flip = train_set.y
        train_set.X = np.concatenate((train_set.X, x_train_flip), axis=0)
        train_set.y = np.concatenate((train_set.y, y_train_flip), axis=0)

    elif dataset == "MNIST":
        print('Loading MNIST dataset...')
        # These images are between 0 and 1, stored as a 1D array per image
        train_set = MNIST(which_set="train", start=0, stop=train_set_size,
                          shuffle=False)
        valid_set = MNIST(which_set="train", start=train_set_size,
                          stop=val_stop, shuffle=False)  # 60000
        test_set = MNIST(which_set="test", shuffle=False)

        # -1 means infer the number of images; reshape to
        # num_images x 1 x 28 x 28 (one grey channel), then transpose
        # to num_images x 28 x 28 x 1
        train_set.X = np.transpose(np.reshape(train_set.X, (-1, 1, 28, 28)),
                                   (0, 2, 3, 1))
        valid_set.X = np.transpose(np.reshape(valid_set.X, (-1, 1, 28, 28)),
                                   (0, 2, 3, 1))
        test_set.X = np.transpose(np.reshape(test_set.X, (-1, 1, 28, 28)),
                                  (0, 2, 3, 1))

        # flatten targets: from an array of one-element arrays to a single
        # array of labels for all pictures: [1, 2, 2, 8, 8, 3, ...]
        train_set.y = np.hstack(train_set.y)
        valid_set.y = np.hstack(valid_set.y)
        test_set.y = np.hstack(test_set.y)

        # Onehot the targets: each label becomes an array with a single 1
        # and the rest 0
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.

        # enlarge train data set by mirroring
        x_train_flip = train_set.X[:, :, ::-1, :]
        y_train_flip = train_set.y
        train_set.X = np.concatenate((train_set.X, x_train_flip), axis=0)
        train_set.y = np.concatenate((train_set.y, y_train_flip), axis=0)
    else:
        print("wrong dataset given")

    return train_set, valid_set, test_set
print "Loading dataset" from pylearn2.datasets.cifar10 import CIFAR10 dataset = CIFAR10(which_set='train', axes=('c', 0, 1, 'b')) print "Building graph" rows = 10 cols = 10 m = rows * cols from pylearn2.space import Conv2DSpace space = Conv2DSpace([32, 32], num_channels=3, axes=('c', 0, 1, 'b')) X = space.make_batch_theano() from galatea.maxout import GCN_C01B2 gcn = GCN_C01B2() gcn.set_input_space(space) normed = gcn.fprop(X) from galatea.maxout import OnlineWhitener whitener = OnlineWhitener() whitener.set_input_space(gcn.get_output_space()) white = whitener.fprop(normed) assert white.ndim == 4
""" This script makes a dataset of 32x32 approximately whitened CIFAR-10 images. """ from pylearn2.utils import serial from pylearn2.datasets import preprocessing from pylearn2.utils import string_utils import numpy as np from pylearn2.datasets.cifar10 import CIFAR10 data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar10') print 'Loading CIFAR-10 train dataset...' train = CIFAR10(which_set='train') print "Preparing output directory..." output_dir = data_dir + '/pylearn2_whitened' serial.mkdir(output_dir) README = open(output_dir + '/README', 'w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. train.pkl, and test.pkl each contain a pylearn2 Dataset object defining a labeled dataset of a 32x32 approximately whitened version of the STL-10 dataset. train.pkl contains labeled train examples. test.pkl contains labeled test examples.
print("N = " + str(N) + " Num_States = " + str(pow(2, N) + 1)) th = 3. #the nonlinearity parameter of state transfer probability print("tanh = " + str(th)) # Decaying LR LR_start = 0.01 #0.01 print("LR_start = " + str(LR_start)) LR_fin = 0.00003 # 0.0000003 print("LR_fin = " + str(LR_fin)) LR_decay = (LR_fin / LR_start)**(1. / num_epochs) print("LR_decay = " + str(LR_decay)) print('Loading CIFAR10 dataset...') train_set_size = 45000 train_set = CIFAR10(which_set="train", start=0, stop=train_set_size) valid_set = CIFAR10(which_set="train", start=train_set_size, stop=50000) test_set = CIFAR10(which_set="test") # bc01 format # Inputs in the range [-1,+1] # print("Inputs in the range [-1,+1]") ''' train_set.X = train_set.X.reshape(-1,3,32,32) valid_set.X = valid_set.X.reshape(-1,3,32,32) test_set.X = test_set.X.reshape(-1,3,32,32) ''' train_set.X = np.reshape( np.subtract(np.multiply(2. / 255, train_set.X), 1.), (-1, 3, 32, 32)) valid_set.X = np.reshape( np.subtract(np.multiply(2. / 255, valid_set.X), 1.), (-1, 3, 32, 32))
from pylearn2.utils import serial
from pylearn2.datasets.cifar10 import CIFAR10
import numpy as np
import sys
from pylearn2.models.svm import DenseMulticlassSVM

print 'Loading labels'
y = CIFAR10(which_set='train', one_hot=True).y

ignore, features_path = sys.argv

print 'Loading features'
X = serial.load(features_path)
X = X.astype('float64')  # avoid duplicating memory in sklearn


def train(X, y):
    print 'Training SVM...'
    return DenseMulticlassSVM(C=1., kernel='linear').fit(
        X, np.argmax(y, axis=1))


X_train = X[0:40000, :]
y_train = y[0:40000, :]

W = train(X_train, y_train)


def acc(W, X, y):
    y_hat = W.predict(X)
    y = np.argmax(y, axis=1)
    return (y_hat == y).mean()
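# The accuracy helper can then score the held-out CIFAR-10 examples; a
# sketch, assuming the feature matrix covers all 50000 train examples (the
# 40000/10000 split mirrors the slicing used for the SVM fit above):
X_valid = X[40000:, :]
y_valid = y[40000:, :]
print acc(W, X_valid, y_valid)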