def build_cnn_model(image_height, image_width, n_kernel, batch_size, learning_rate, rng):
    print('... building the model')

    # symbolic minibatch of rasterized images and its integer label vector
    x = T.matrix('x', dtype=theano.config.floatX)
    y = T.ivector('y')

    # reshape the flat input to a 4D tensor: (batch, channel, height, width)
    layer_1_input = x.reshape((batch_size, 1, image_height, image_width))

    layer_1 = LeNetConvPoolLayer(rng, input=layer_1_input,
                                 image_shape=(batch_size, 1, image_height, image_width),
                                 filter_shape=(n_kernel[0], 1, 7, 7),
                                 poolsize=(2, 2))
    layer_2 = LeNetConvPoolLayer(rng, input=layer_1.output,
                                 image_shape=(batch_size, n_kernel[0], 57, 77),
                                 filter_shape=(n_kernel[1], n_kernel[0], 6, 6),
                                 poolsize=(2, 2))
    layer_3 = LeNetConvPoolLayer(rng, input=layer_2.output,
                                 image_shape=(batch_size, n_kernel[1], 26, 36),
                                 filter_shape=(n_kernel[2], n_kernel[1], 5, 5),
                                 poolsize=(2, 2))
    layer_4 = HiddenLayer(rng, input=layer_3.output.flatten(2),
                          n_in=n_kernel[2] * 11 * 16, n_out=batch_size,
                          activation=T.tanh)
    layer_5 = LogisticRegression(input=layer_4.output, input_dim=batch_size, output_dim=12)

    cost = layer_5.negative_log_likelihood(y)
    error = layer_5.errors(y)

    # parameters of all five layers, trained jointly with plain SGD
    params = layer_5.params + layer_4.params + layer_3.params + layer_2.params + layer_1.params
    grads = T.grad(cost, params)
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    # the compiled functions take the minibatch tensors directly (no index/givens)
    train_model = theano.function([x, y], cost, updates=updates)
    validation_model = theano.function([x, y], error)
    return train_model, validation_model
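# A minimal driver sketch for build_cnn_model (an addition, not part of the original
# code). It assumes the caller already has numpy arrays: float32 rows of length
# image_height * image_width and int32 labels. The default 121x161 input size is one
# choice consistent with the hard-coded 57x77 / 26x36 / 11x16 shapes above; the
# n_kernel values are purely illustrative.
def run_cnn_epochs(train_x, train_y, valid_x, valid_y, n_epochs=10,
                   image_height=121, image_width=161, n_kernel=(20, 50, 50),
                   batch_size=20, learning_rate=0.1):
    rng = numpy.random.RandomState(1234)
    train_model, validation_model = build_cnn_model(
        image_height, image_width, n_kernel, batch_size, learning_rate, rng)
    n_train_batches = train_x.shape[0] // batch_size
    n_valid_batches = valid_x.shape[0] // batch_size
    for epoch in range(n_epochs):
        # one pass of SGD over the training minibatches
        for i in range(n_train_batches):
            train_model(train_x[i * batch_size:(i + 1) * batch_size],
                        train_y[i * batch_size:(i + 1) * batch_size])
        # zero-one error on the validation minibatches
        errors = [validation_model(valid_x[i * batch_size:(i + 1) * batch_size],
                                   valid_y[i * batch_size:(i + 1) * batch_size])
                  for i in range(n_valid_batches)]
        print('epoch %i, validation error %f %%' % (epoch + 1, numpy.mean(errors) * 100.))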
class LRTest: def __init__(self): import theano import util from theano import tensor as T from logistic_regression import LogisticRegression self.index = T.iscalar('index') self.BATCH_SIZE = 100 self.LEARNING_RATE = 0.12 self.dataSets = util.loadMnistData("mnist.pkl.gz") self.x = T.dmatrix('x') self.y = T.ivector('y') self.index = T.iscalar('index') self.classifier = LogisticRegression(input=self.x, nIn=28 * 28, nOut=10) self.cost = self.classifier.negativeLogLikelihood(self.y) self.gW = T.grad(cost=self.cost, wrt=self.classifier.W) self.gB = T.grad(cost=self.cost, wrt=self.classifier.b) self.trainSet, self.validSet, self.testSet = self.dataSets self.nTrainSet, self.nValidSet, self.nTestSet = map(self.numBatches, self.dataSets) updates = [ (self.classifier.W, self.classifier.W - self.LEARNING_RATE * self.gW), (self.classifier.b, self.classifier.b - self.LEARNING_RATE * self.gB) ] def makeGivens(data): return { self.x: data[0][self.index * self.BATCH_SIZE:(self.index + 1) * self.BATCH_SIZE], self.y: data[1][self.index * self.BATCH_SIZE:(self.index + 1) * self.BATCH_SIZE] } self.testModel = theano.function( inputs=[self.index], outputs=self.classifier.errors(self.y), givens=makeGivens(self.dataSets[2]) ) self.validationModel = theano.function( inputs=[self.index], outputs=self.classifier.errors(self.y), givens=makeGivens(self.dataSets[1]) ) self.trainModel = theano.function( inputs=[self.index], outputs=self.cost, updates=updates, givens=makeGivens(self.dataSets[0]) ) def numBatches(self, dataSet): return dataSet[0].get_value(borrow=True).shape[0] / self.BATCH_SIZE def printValid(self, epoch, batchIndex, loss): return 'epoch %i, minibatch %i/%i, validation error %f %%' % ( epoch, batchIndex + 1, self.nTrainSet, loss * 100. ) def printTestScore(self, epoch, batchIndex, score): return ( ' epoch %i, minibatch %i/%i, test error of' ' best model %f %%' ) % ( epoch, batchIndex + 1, self.nTrainSet, score * 100. ) def resultString(self, best, test): return ('Optimization complete with best validation score of %f %%,' 'with test performance %f %%') % (best * 100., test * 100.) def train(self): import numpy patience = 5000 patienceIncrease = 2 MAX_EPOCH = 1000 improveThresh = 0.995 validationFreq = min(self.nTrainSet, patience / 2) bestValidationLoss = numpy.inf epoch = 0 done = False testLoss = 0 while (epoch < MAX_EPOCH) and not done: epoch += 1 for batchIndex in xrange(self.nTrainSet): avgCost = self.trainModel(batchIndex) iter = (epoch - 1) * self.nTrainSet + batchIndex if (iter + 1) % validationFreq == 0: loss = numpy.mean(map(self.validationModel, xrange(self.nValidSet))) if loss < bestValidationLoss: if loss < bestValidationLoss * improveThresh: patience = max(patience, iter * patienceIncrease) bestValidationLoss = loss testLoss = numpy.mean(map(self.testModel, xrange(self.nTestSet))) yield epoch,batchIndex,loss, testLoss, bestValidationLoss if patience <= iter: done = True break def doTrain(self): for epoch,batchIndex, loss,testScore,bestScore in self.train(): str = self.printValid(epoch,batchIndex,loss) str += self.printTestScore(epoch,batchIndex,testScore) print(str) print(self.resultString(bestScore,testScore))
def fit(n_windows, win_width, rand_state, data_set, data_labels, filename="LR_weights.pkl"): # Permuting data rng = np.random.RandomState(8000) indices = rng.permutation(len(data_set)) data_set = np.array(data_set) data_labels = np.array(data_labels) data_set, data_labels = data_set[indices], data_labels[indices] print str(len(data_set)) + " all samples" train_len = int(len(data_set) * 9.0 / 10.0) valid_len = len(data_set) - train_len print "Train: " + str(train_len) print "Validate: " + str(valid_len) # Splitting fs train_dir = fs.File("LR_training.hdf5", "a") train_data = train_dir.create_dataset("LR_train_data", shape=((train_len + 1) * n_windows, 41, 41), dtype="i") train_labels = train_dir.create_dataset("LR_train_labels", shape=((train_len + 1) * n_windows,), dtype="i") valid_dir = fs.File("LR_validating.hdf5", "a") valid_data = valid_dir.create_dataset("LR_valid_data", shape=((valid_len + 1) * n_windows, 41, 41), dtype="i") valid_labels = valid_dir.create_dataset("LR_valid_labels", shape=((valid_len + 1) * n_windows,), dtype="i") counter = 0 next_counter = 0 for iter, data_sample in enumerate(data_set): if iter % 10000 == 0: print iter windows = WinExt.get_windows(data_sample, n_windows, win_width, rand_state) for window in windows: # First windows part for training # Second part for validation if iter < train_len: train_data[counter] = window train_labels[counter] = data_labels[iter] counter += 1 else: valid_data[next_counter] = window valid_labels[next_counter] = data_labels[iter] next_counter += 1 # Setting real length train_len = counter valid_len = next_counter print "Size of train is " + str(train_len) print "Size of valid is " + str(valid_len) print "Extracting has finished its work..." batch_size = 500 if train_len % batch_size != 0: # if the last batch is not full, just don't use the remainder whole = (train_len / batch_size) * batch_size train_len = whole if valid_len % batch_size != 0: whole = (valid_len / batch_size) * batch_size valid_len = whole n_train_batches = train_len / batch_size n_valid_batches = valid_len / batch_size data_tr = theano.shared( np.asarray(np.zeros((batch_size, 41, 41), dtype=np.int), dtype=theano.config.floatX), borrow=True ) labels_tr = theano.shared(np.asarray(np.zeros(batch_size, dtype=np.int), dtype="int32"), borrow=True) data_val = theano.shared( np.asarray(np.zeros((batch_size, 41, 41), dtype=np.int), dtype=theano.config.floatX), borrow=True ) labels_val = theano.shared(np.asarray(np.zeros(batch_size, dtype=np.int), dtype="int32"), borrow=True) print "Building logistic regression classifier..." x = T.dtensor3("x") # dtensor3 for 3d array y = T.ivector("y") # the labels are presented as 1D vector of [int] labels rng = np.random.RandomState(8000) classifier = LogisticRegression(input=x.flatten(2), n_in=41 * 41, n_out=2) cost = classifier.negative_log_likelihood(y) learning_rate = 0.03 # 0.3 / float(n_train_batches) g_W = T.grad(cost=cost, wrt=classifier.W) g_b = T.grad(cost=cost, wrt=classifier.b) # start-snippet-3 # specify how to update the parameters of the model as a list of # (variable, update expression) pairs. updates = [(classifier.W, classifier.W - learning_rate * g_W), (classifier.b, classifier.b - learning_rate * g_b)] validate_model = theano.function(inputs=[], outputs=classifier.errors(y), givens={x: data_val, y: labels_val}) # indices - for random shuffle train_model = theano.function( inputs=[], outputs=classifier.errors(y), updates=updates, givens={x: data_tr, y: labels_tr} ) print "Training..." 
# GDM with batches epoch = 0 n_epochs = 30 min_error = 100.0 errors = [] indices = rng.permutation(train_len) while epoch < n_epochs: print "================= " + str(epoch + 1) + " epoch =============== " for minibatch_index in range(n_train_batches): if minibatch_index % 50 == 0: print str(minibatch_index) + " batch" data_tr.set_value( np.array([train_data[indices[minibatch_index * batch_size + i]] for i in range(batch_size)]), borrow=True, ) labels_tr.set_value( np.array([train_labels[indices[minibatch_index * batch_size + i]] for i in range(batch_size)]), borrow=True, ) train_model() # compute zero-one loss on validation set validation_losses = [] for i in range(n_valid_batches): data_val.set_value(np.array(valid_data[i * batch_size : (i + 1) * batch_size]), borrow=True) labels_val.set_value(np.array(valid_labels[i * batch_size : (i + 1) * batch_size]), borrow=True) validation_losses.append(validate_model()) this_validation_loss = np.mean(validation_losses) * 100 errors.append(this_validation_loss) if this_validation_loss < min_error: print str(this_validation_loss) + "% error" min_error = this_validation_loss save_parameters(classifier, filename) epoch += 1 print "Shuffling..." indices = rng.permutation(train_len) show_errors(errors, "LogReg: 4 windows, h=41") # Cleaning data train_dir.clear() valid_dir.clear() train_dir.close() valid_dir.close()
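# Neither save_parameters nor show_errors is defined in this section. A plausible
# save_parameters consistent with how it is called above (purely an assumption, not
# the original helper) would simply pickle the classifier's current weights:
import cPickle

def save_parameters(classifier, filename):
    with open(filename, "wb") as f:
        cPickle.dump((classifier.W.get_value(borrow=True),
                      classifier.b.get_value(borrow=True)),
                     f, protocol=cPickle.HIGHEST_PROTOCOL)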
def evaluate_convnet(learning_rate=0.1, n_epochs=1, dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = ConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2) ) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (nkerns[0], nkerns[1], 4, 4) layer1 = ConvPoolLayer( rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2) ) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh ) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size] } ) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size] } ) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] train_model = theano.function( [index], cost, # This is the negative-log-likelihood of the Logisitc Regression layer updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size] } ) # end-snippet-1 ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def train_CNN_mini_batch(learning_rate, n_epochs, num_kernels, batch_size, filter_size, is_multi_scale, num_of_classes, height, width, use_interpolation, use_hidden_layer): train_set_x_by_1, train_set_y, valid_set_x_by_1, valid_set_y, test_set_x_by_1, test_set_y, train_set_x_by_2, \ train_set_x_by_4, valid_set_x_by_2, valid_set_x_by_4, test_set_x_by_2, test_set_x_by_4 \ = load_processed_img_data() n_train_batches = train_set_x_by_1.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x_by_1.get_value(borrow=True).shape[0] n_test_batches = test_set_x_by_1.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size index = theano.tensor.lscalar() x_by_1 = theano.tensor.ftensor4('x_by_1') x_by_2 = theano.tensor.ftensor4('x_by_2') x_by_4 = theano.tensor.ftensor4('x_by_4') y = theano.tensor.ivector('y') print '... initialize the model' cnn_dir = 'models/CNN_' if is_multi_scale is True: cnn_dir += 'M_' else: cnn_dir += 'S_' if use_hidden_layer is True: cnn_dir += 'H_' else: cnn_dir += 'L_' if use_interpolation is True: cnn_dir += 'I_' else: cnn_dir += 'N_' cnn_dir = cnn_dir + str(num_kernels[0]) + '_' + str(num_kernels[1]) + '_' + str(num_kernels[2]) + '_' + str( batch_size) + '_' curr_date = str(datetime.date.today()) curr_date = curr_date.replace('-', '_') cnn_dir = cnn_dir + curr_date + str(time.strftime('_%H_%M_%S')) print 'CNN model is ', cnn_dir if not os.path.exists(cnn_dir): os.makedirs(cnn_dir) class Logger(object): def __init__(self): self.terminal = sys.stdout self.log = open(cnn_dir + '/log.txt', 'w') def write(self, message): self.terminal.write(message) self.log.write(message) sys.stdout = Logger() layer0 = CNN_Layer( name='Layer_0', W=None, b=None, filter_shape=(num_kernels[0], 3, filter_size, filter_size), ) layer1 = CNN_Layer( name='Layer_1', W=None, b=None, filter_shape=(num_kernels[1], num_kernels[0], filter_size, filter_size), ) layer2 = CNN_Layer( name='Layer_2', W=None, b=None, filter_shape=(num_kernels[2], num_kernels[1], filter_size, filter_size), ) layer3 = HiddenLayer( name='Layer_3', W=None, b=None, n_in=num_kernels[2] * 3 if is_multi_scale is True else num_kernels[2], n_out=num_kernels[2] * 4 if is_multi_scale is True else num_kernels[2] * 2, activation=theano.tensor.tanh ) if is_multi_scale and use_hidden_layer: layer4_in = num_kernels[2] * 4 elif is_multi_scale and not use_hidden_layer: layer4_in = num_kernels[2] * 3 elif not is_multi_scale and use_hidden_layer: layer4_in = num_kernels[2] * 2 else: layer4_in = num_kernels[2] layer4 = LogisticRegression( name='Layer_4', W=None, b=None, n_in=layer4_in, n_out=num_of_classes, ) forward_propagation( layer0=layer0, layer1=layer1, layer2=layer2, layer3=layer3, layer4=layer4, x_by_1=x_by_1, x_by_2=x_by_2, x_by_4=x_by_4, num_kernels=num_kernels, batch_size=batch_size, filter_size=filter_size, is_multi_scale=is_multi_scale, height=height, width=width, use_interpolation=use_interpolation, use_hidden_layer=use_hidden_layer ) if use_hidden_layer is True: L2_norm = (layer4.W ** 2).sum() + (layer3.W ** 2).sum() + (layer2.W ** 2).sum() + (layer1.W ** 2).sum() + ( layer0.W ** 2).sum() else: L2_norm = (layer4.W ** 2).sum() + (layer2.W ** 2).sum() + (layer1.W ** 2).sum() + (layer0.W ** 2).sum() regularization = 0.00001 cost = layer4.negative_log_likelihood(y) + (regularization * L2_norm) if is_multi_scale is True: test_model = theano.function( [index], layer4.errors(y), givens={ x_by_1: test_set_x_by_1[index * batch_size: (index + 1) * batch_size], x_by_2: 
test_set_x_by_2[index * batch_size: (index + 1) * batch_size], x_by_4: test_set_x_by_4[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size * height * width: (index + 1) * batch_size * height * width] } ) else: test_model = theano.function( [index], layer4.errors(y), givens={ x_by_1: test_set_x_by_1[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size * height * width: (index + 1) * batch_size * height * width] } ) if is_multi_scale is True: validate_model = theano.function( [index], layer4.errors(y), givens={ x_by_1: valid_set_x_by_1[index * batch_size: (index + 1) * batch_size], x_by_2: valid_set_x_by_2[index * batch_size: (index + 1) * batch_size], x_by_4: valid_set_x_by_4[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size * height * width: (index + 1) * batch_size * height * width] } ) else: validate_model = theano.function( [index], layer4.errors(y), givens={ x_by_1: valid_set_x_by_1[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size * height * width: (index + 1) * batch_size * height * width] } ) if use_hidden_layer is True: params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params else: params = layer4.params + layer2.params + layer1.params + layer0.params grads = theano.tensor.grad(cost, params) updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] if is_multi_scale is True: train_model = theano.function( [index], cost, updates=updates, givens={ x_by_1: train_set_x_by_1[index * batch_size: (index + 1) * batch_size], x_by_2: train_set_x_by_2[index * batch_size: (index + 1) * batch_size], x_by_4: train_set_x_by_4[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size * width * height: (index + 1) * batch_size * width * height] } ) else: train_model = theano.function( [index], cost, updates=updates, givens={ x_by_1: train_set_x_by_1[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size * width * height: (index + 1) * batch_size * width * height] } ) print '... training the model' patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is found improvement_threshold = 0.995 # a relative improvement of this much is considered significant validation_frequency = min(n_train_batches, patience / 2) best_layer_0_W = numpy.zeros_like(layer0.W.get_value()) best_layer_0_b = numpy.zeros_like(layer0.b.get_value()) best_layer_1_W = numpy.zeros_like(layer1.W.get_value()) best_layer_1_b = numpy.zeros_like(layer1.b.get_value()) best_layer_2_W = numpy.zeros_like(layer2.W.get_value()) best_layer_2_b = numpy.zeros_like(layer2.b.get_value()) best_layer_3_W = numpy.zeros_like(layer3.W.get_value()) best_layer_3_b = numpy.zeros_like(layer3.b.get_value()) best_layer_4_W = numpy.zeros_like(layer4.W.get_value()) best_layer_4_b = numpy.zeros_like(layer4.b.get_value()) best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch += 1 for mini_batch_index in xrange(n_train_batches): start = time.clock() iter = (epoch - 1) * n_train_batches + mini_batch_index cost_ij = train_model(mini_batch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, mini-batch %i/%i, validation error %f %%' % (epoch, mini_batch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: # improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # save best filters best_layer_0_W = layer0.W.get_value() best_layer_0_b = layer0.b.get_value() best_layer_1_W = layer1.W.get_value() best_layer_1_b = layer1.b.get_value() best_layer_2_W = layer2.W.get_value() best_layer_2_b = layer2.b.get_value() best_layer_3_W = layer3.W.get_value() best_layer_3_b = layer3.b.get_value() best_layer_4_W = layer4.W.get_value() best_layer_4_b = layer4.b.get_value() # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, mini-batch %i/%i, test error of ' 'best model %f %%') % (epoch, mini_batch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break print 'training @ iter = %d, time taken = %f' % (iter, (time.clock() - start)) end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) if not os.path.exists(cnn_dir + '/params'): os.makedirs(cnn_dir + '/params') numpy.save(cnn_dir + '/params/layer_0_W.npy', best_layer_0_W) numpy.save(cnn_dir + '/params/layer_0_b.npy', best_layer_0_b) numpy.save(cnn_dir + '/params/layer_1_W.npy', best_layer_1_W) numpy.save(cnn_dir + '/params/layer_1_b.npy', best_layer_1_b) numpy.save(cnn_dir + '/params/layer_2_W.npy', best_layer_2_W) numpy.save(cnn_dir + '/params/layer_2_b.npy', best_layer_2_b) numpy.save(cnn_dir + '/params/layer_3_W.npy', best_layer_3_W) numpy.save(cnn_dir + '/params/layer_3_b.npy', best_layer_3_b) numpy.save(cnn_dir + '/params/layer_4_W.npy', best_layer_4_W) numpy.save(cnn_dir + '/params/layer_4_b.npy', best_layer_4_b) numpy.save(cnn_dir + '/params/filer_kernels.npy', num_kernels) numpy.save(cnn_dir + '/params/filter_size.npy', filter_size) return cnn_dir
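# A small sketch (an assumption, not from the original code) of how the parameter
# files written by train_CNN_mini_batch could be reloaded later, e.g. to rebuild the
# layers via the W/b constructor arguments they accept above. The file names match
# exactly what the function saves, including 'filer_kernels.npy'.
def load_cnn_params(cnn_dir):
    params = {}
    for layer in range(5):
        params['layer_%d_W' % layer] = numpy.load(cnn_dir + '/params/layer_%d_W.npy' % layer)
        params['layer_%d_b' % layer] = numpy.load(cnn_dir + '/params/layer_%d_b.npy' % layer)
    params['num_kernels'] = numpy.load(cnn_dir + '/params/filer_kernels.npy')
    params['filter_size'] = numpy.load(cnn_dir + '/params/filter_size.npy')
    return params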
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', n_kernels=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type n_kernels: list of ints :param n_kernels: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches //= batch_size n_valid_batches //= batch_size n_test_batches //= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print('... building the model') # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, n_kernels[0], 12, 12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(n_kernels[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, n_kernels[1], 4, 4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, n_kernels[0], 12, 12), filter_shape=(n_kernels[1], n_kernels[0], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, n_kernels[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=n_kernels[1] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-1 ############### # TRAIN MODEL # ############### print('... training') # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 1 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in range(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print('training @ iter = ', iter) cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) print((' patience %i') % (patience)) if patience <= iter: done_looping = True break end_time = timeit.default_timer() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print( ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
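# Standard script entry point for the tutorial-style function above (a small addition;
# it assumes load_data and the layer classes imported by this snippet are in scope).
if __name__ == '__main__':
    evaluate_lenet5()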
class RRNN(object): """Recurrent ReLU Neural Network """ def __init__( self, numpy_rng, theano_rng=None, n_ins=N_FEATURES * N_FRAMES, relu_layers_sizes=[1024, 1024, 1024], recurrent_connections=[2], # layer(s), can only be i^t -> i^{t+1} n_outs=62 * 3, rho=0.9, eps=1.E-6): """ TODO """ self.relu_layers = [] self.params = [] self.n_layers = len(relu_layers_sizes) self._rho = rho # ``momentum'' for adadelta self._eps = eps # epsilon for adadelta self._accugrads = [] # for adadelta self._accudeltas = [] # for adadelta self.n_outs = n_outs assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2**30)) self.x = T.fmatrix('x') self.y = T.ivector('y') for i in xrange(self.n_layers): if i == 0: input_size = n_ins else: input_size = relu_layers_sizes[i - 1] if i == 0: layer_input = self.x else: layer_input = self.relu_layers[-1].output if i in recurrent_connections: inputr_size = relu_layers_sizes[i] previous_output = T.fmatrix('previous_output') relu_layer = RecurrentReLU(rng=numpy_rng, input=layer_input, in_stack=previous_output, n_in=input_size, n_in_stack=inputr_size, n_out=inputr_size) #relu_layer.in_stack = relu_layer.output # TODO TODO TODO self.params.extend(relu_layer.params) self._accugrads.extend([ shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[0], ), dtype='float32'), name='accugrad_b', borrow=True), shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]), dtype='float32'), name='accugrad_Ws', borrow=True) ]) self._accudeltas.extend([ shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[0], ), dtype='float32'), name='accudelta_b', borrow=True), shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]), dtype='float32'), name='accudelta_Ws', borrow=True) ]) else: relu_layer = ReLU(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=relu_layers_sizes[i]) self.params.extend(relu_layer.params) self._accugrads.extend([ shared(value=numpy.zeros( (input_size, relu_layers_sizes[i]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[i], ), dtype='float32'), name='accugrad_b', borrow=True) ]) self._accudeltas.extend([ shared(value=numpy.zeros( (input_size, relu_layers_sizes[i]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[i], ), dtype='float32'), name='accudelta_b', borrow=True) ]) self.relu_layers.append(relu_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression(input=self.relu_layers[-1].output, n_in=relu_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) self._accugrads.extend([ shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accugrad_b', borrow=True) ]) self._accudeltas.extend([ shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accudelta_b', borrow=True) ]) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) self.finetune_cost_sum = self.logLayer.negative_log_likelihood_sum( self.y) # compute the gradients with 
respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) def get_SGD_trainer(self): """ Returns a plain SGD minibatch trainer with learning rate as param. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') learning_rate = T.fscalar('lr') # learning rate to use cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for param, gparam in zip(self.params, gparams): updates[param] = param - gparam * learning_rate train_fn = theano.function(inputs=[ theano.Param(batch_x), theano.Param(batch_y), theano.Param(learning_rate) ], outputs=cost, updates=updates, givens={ self.x: batch_x, self.y: batch_y }) return train_fn def get_adadelta_trainer(self): """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, accudelta, param, gparam in zip(self._accugrads, self._accudeltas, self.params, gparams): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = -T.sqrt( (accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad train_fn = theano.function( inputs=[theano.Param(batch_x), theano.Param(batch_y)], outputs=cost, updates=updates, givens={ self.x: batch_x, self.y: batch_y }) return train_fn def get_adagrad_trainer(self): """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') learning_rate = T.fscalar('lr') # learning rate to use cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, param, gparam in zip(self._accugrads, self.params, gparams): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = accugrad + gparam * gparam dx = -(learning_rate / T.sqrt(agrad + self._eps)) * gparam updates[param] = param + dx updates[accugrad] = agrad train_fn = theano.function(inputs=[ theano.Param(batch_x), theano.Param(batch_y), theano.Param(learning_rate) ], outputs=cost, updates=updates, givens={ self.x: batch_x, self.y: batch_y }) return train_fn def score_classif(self, given_set): """ Returns functions to get current classification scores. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') score = theano.function( inputs=[theano.Param(batch_x), theano.Param(batch_y)], outputs=self.errors, givens={ self.x: batch_x, self.y: batch_y }) # Create a function that scans the entire set given as input def scoref(): return [score(batch_x, batch_y) for batch_x, batch_y in given_set] return scoref
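# A minimal sketch (an assumption, not part of the original) of how the RRNN trainers
# above might be driven. `train_set_iterator` and `valid_set_iterator` are hypothetical
# iterables yielding (batch_x, batch_y) numpy pairs (float32 features, int32 labels),
# matching what score_classif expects. recurrent_connections=[] keeps the sketch
# runnable, since the recurrent in_stack wiring is still marked TODO above.
def train_rrnn(train_set_iterator, valid_set_iterator, n_epochs=20):
    numpy_rng = numpy.random.RandomState(123)
    nnet = RRNN(numpy_rng=numpy_rng, recurrent_connections=[])
    train_fn = nnet.get_adadelta_trainer()  # adadelta: no learning rate argument needed
    valid_scorer = nnet.score_classif(valid_set_iterator)
    for epoch in xrange(n_epochs):
        costs = [train_fn(batch_x, batch_y) for batch_x, batch_y in train_set_iterator]
        print 'epoch %d, mean train cost %f, valid error %f' % (
            epoch, numpy.mean(costs), numpy.mean(valid_scorer()))
    return nnet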
class DBN(object): """Deep Belief Network A deep belief network is obtained by stacking several RBMs on top of each other. The hidden layer of the RBM at layer `i` becomes the input of the RBM at layer `i+1`. The first layer RBM gets as input the input of the network, and the hidden layer of the last RBM represents the output. When used for classification, the DBN is treated as a MLP, by adding a logistic regression layer on top. """ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. 
self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) def pretraining_functions(self, train_set_x, batch_size, k): '''Generates a list of functions, for performing one step of gradient descent at a given layer. The function will require as input the minibatch index, and to train an RBM you just need to iterate, calling the corresponding function on all minibatch indexes. :type train_set_x: theano.tensor.TensorType :param train_set_x: Shared var. that contains all datapoints used for training the RBM :type batch_size: int :param batch_size: size of a [mini]batch :param k: number of Gibbs steps to do in CD-k / PCD-k ''' # index to a [mini]batch index = T.lscalar('index') # index to a minibatch learning_rate = T.scalar('lr') # learning rate to use # number of batches n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` batch_end = batch_begin + batch_size pretrain_fns = [] for rbm in self.rbm_layers: # get the cost and the updates list # using CD-k here (persisent=None) for training each RBM. 
# TODO: change cost function to reconstruction error cost, updates = rbm.get_cost_updates(learning_rate, persistent=None, k=k) # compile the theano function fn = theano.function( inputs=[index, theano.Param(learning_rate, default=0.1)], outputs=cost, updates=updates, givens={self.x: train_set_x[batch_begin:batch_end]}) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def build_finetune_functions(self, datasets, batch_size): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type datasets: list of pairs of theano.tensor.TensorType :param datasets: It is a list that contain all the datasets; the has to contain three pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano variables, one for the datapoints, the other for the labels :type batch_size: int :param batch_size: size of a minibatch ''' (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x, test_set_y) = datasets[2] # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_valid_batches /= batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_test_batches /= batch_size index = T.lscalar('index') # index to a [mini]batch learning_rate = T.scalar('lr') # learning rate to used # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = [] for param, gparam in zip(self.params, gparams): updates.append( (param, param - gparam * T.cast(learning_rate, dtype=theano.config.floatX))) train_fn = theano.function( inputs=[index, theano.Param(learning_rate, default=0.1)], outputs=self.finetune_cost, updates=updates, givens={ self.x: train_set_x[index * batch_size:(index + 1) * batch_size], self.y: train_set_y[index * batch_size:(index + 1) * batch_size] }) test_score_i = theano.function( [index], self.errors, givens={ self.x: test_set_x[index * batch_size:(index + 1) * batch_size], self.y: test_set_y[index * batch_size:(index + 1) * batch_size] }) valid_score_i = theano.function( [index], self.errors, givens={ self.x: valid_set_x[index * batch_size:(index + 1) * batch_size], self.y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in xrange(n_valid_batches)] # Create a function that scans the entire test set def test_score(): return [test_score_i(i) for i in xrange(n_test_batches)] return train_fn, valid_score, test_score
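# A condensed pretrain-then-finetune driver for the DBN above (a sketch following the
# usual tutorial flow, not code from this file). `datasets` is the load_data-style list
# of (x, y) shared-variable pairs used elsewhere in this section; the layer sizes and
# epoch counts are illustrative.
def run_dbn(datasets, batch_size=10, k=1, pretraining_epochs=10, pretrain_lr=0.01,
            training_epochs=100):
    train_set_x, train_set_y = datasets[0]
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    dbn = DBN(numpy_rng=numpy.random.RandomState(123), n_ins=28 * 28,
              hidden_layers_sizes=[1000, 1000, 1000], n_outs=10)

    # greedy layer-wise pretraining of the RBMs
    pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size, k=k)
    for i in xrange(dbn.n_layers):
        for epoch in xrange(pretraining_epochs):
            costs = [pretraining_fns[i](index=batch_index, lr=pretrain_lr)
                     for batch_index in xrange(n_train_batches)]
            print 'Pre-training layer %i, epoch %d, cost %f' % (i, epoch, numpy.mean(costs))

    # supervised fine-tuning of the whole MLP (train_fn uses its default lr=0.1)
    train_fn, valid_score, test_score = dbn.build_finetune_functions(datasets=datasets,
                                                                     batch_size=batch_size)
    for epoch in xrange(training_epochs):
        for minibatch_index in xrange(n_train_batches):
            train_fn(minibatch_index)
        print 'epoch %i, validation error %f %%' % (epoch, numpy.mean(valid_score()) * 100.)
    return dbn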
class StackedDenoisingAutoencoder:
    def __init__(self, numpyRng, theanoRng=None, nIn=28 * 28,
                 hiddenLayerSizes=[500, 500], nOut=10):
        self.nLayers = len(hiddenLayerSizes)
        if not theanoRng:
            theanoRng = theano.tensor.shared_randomstreams.RandomStreams(numpyRng.randint(2 ** 30))
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        def makeSigmoidLayer(lastLayer, lastLayerSize, size):
            return Layer(rng=numpyRng, input=lastLayer, nIn=lastLayerSize,
                         nOut=size, activation=T.nnet.sigmoid)

        def makeDALayer(lastLayer, lastLayerSize, size, sigmoidLayer):
            return DenoisingAutoEncoder(
                numpyRng=numpyRng, theanoRng=theanoRng, input=lastLayer,
                nVisible=lastLayerSize, nHidden=size,
                W=sigmoidLayer.W, bHidden=sigmoidLayer.b)

        def makeLayers(lastLayer, lastInputSize, nextLayerSizes):
            if nextLayerSizes:
                newList = list(nextLayerSizes)
                size = newList.pop()
                sigmoidLayer = makeSigmoidLayer(lastLayer, lastInputSize, size)
                daLayer = makeDALayer(lastLayer, lastInputSize, size, sigmoidLayer)
                yield (sigmoidLayer, daLayer)
                for layer in makeLayers(sigmoidLayer.output, size, newList):
                    yield layer

        self.sigmoidLayers, self.dALayers = zip(*makeLayers(self.x, nIn, reversed(hiddenLayerSizes)))
        print "created sda with layer shapes below."
        for da in self.dALayers:
            print "layersize:", da.W.get_value().shape
        self.logLayer = LogisticRegression(self.sigmoidLayers[-1].output, hiddenLayerSizes[-1], nOut)
        # all trainable parameters: every sigmoid layer's plus the logistic layer's
        self.params = [param for layer in self.sigmoidLayers
                       for param in layer.params] + self.logLayer.params
        self.fineTuneCost = self.logLayer.negativeLogLikelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

    def pretrainingFunctions(self, trainSetX, batchSize):
        index = T.lscalar("index")
        corruptionLevel = T.scalar('corruption')
        learningRate = T.scalar("learning")
        batchBegin = batchSize * index
        batchEnd = batchBegin + batchSize
        for dA in self.dALayers:
            cost, updates = dA.costFunctionAndUpdates(corruptionLevel, learningRate)
            f = theano.function(
                inputs=[
                    index,
                    theano.Param(corruptionLevel, default=0.2),
                    theano.Param(learningRate, default=0.1)
                ],
                outputs=cost,
                updates=updates,
                givens={self.x: trainSetX[batchBegin:batchEnd]},
            )
            yield f

    def pretrainingFunctionsWithOptimizer(self, trainSetX, batchSize, optimizer):
        """With an external optimizer: optimizer(params, grads) must return an updates list."""
        index = T.lscalar("index")
        corruptionLevel = T.scalar('corruption')
        learningRate = T.scalar("learning")
        batchBegin = batchSize * index
        batchEnd = batchBegin + batchSize
        for dA in self.dALayers:
            # cost, updates = dA.costFunctionAndUpdates(corruptionLevel, learningRate)
            cost, param, grads = dA.costParamGrads(corruptionLevel)
            updates = optimizer(param, grads)
            f = theano.function(
                inputs=[
                    index,
                    theano.Param(corruptionLevel, default=0.2),
                ],
                outputs=cost,
                updates=updates,
                givens={self.x: trainSetX[batchBegin:batchEnd]},
            )
            yield f

    def fineTuneFunctions(self, datasets, batchSize, learningRate):
        index = T.lscalar('i')
        trainSetX, trainSetY = datasets[0]
        validSetX, validSetY = datasets[1]
        testSetX, testSetY = datasets[2]
        gparams = T.grad(self.fineTuneCost, self.params)
        updates = [(param, param - gparam * learningRate)
                   for param, gparam in zip(self.params, gparams)]

        def makeGivens(x, y):
            return {self.x: x[index * batchSize:(index + 1) * batchSize],
                    self.y: y[index * batchSize:(index + 1) * batchSize]}

        trainer = theano.function(
            inputs=[index],
            outputs=self.fineTuneCost,
            updates=updates,
            givens=makeGivens(trainSetX, trainSetY),
            name='train'
        )
        testScoreI = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens=makeGivens(testSetX, testSetY),
            name='test'
        )
        validScoreI = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens=makeGivens(validSetX, validSetY),
            name='valid'
        )

        def validationScore():
            return [validScoreI(i)
                    for i in xrange(validSetX.get_value(borrow=True).shape[0] / batchSize)]

        def testScore():
            return [testScoreI(i)
                    for i in xrange(testSetX.get_value(borrow=True).shape[0] / batchSize)]

        return trainer, validationScore, testScore

    def preTrain(self, data, batchSize=20, preLearningRate=0.1, corruptionLevels=(.1, .2, .3)):
        import numpy, util
        preTrainer = list(self.pretrainingFunctions(data, batchSize=batchSize))
        assert len(corruptionLevels) == len(preTrainer), "given corruption levels do not correspond to the layers!!!"
        for i, (trainer, corruptionLevel) in enumerate(zip(preTrainer, corruptionLevels)):
            for epoch in xrange(15):
                print 'Pre-training layer %i, epoch %d start' % (i, epoch)
                trainScores = [trainer(batchIndex, corruptionLevel, preLearningRate)
                               for batchIndex in xrange(data.get_value(borrow=True).shape[0] / batchSize)]
                print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), numpy.mean(trainScores)
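# One possible `optimizer` callable for pretrainingFunctionsWithOptimizer above (a
# sketch, not from the original code): it must accept (params, grads) and return a
# Theano updates list; here it is plain SGD with a fixed step size.
def sgd_optimizer(params, grads, learning_rate=0.1):
    return [(param, param - learning_rate * grad)
            for param, grad in zip(params, grads)]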
class StackedDenoisingAutoEncoder(object): """ Stacked Denoising Auto-Encoder A stacked denoising autoencoder is obtained by stacking several denoising autoencoder. The hidden layer of the denoising autoencoder at layer `i` becomes the input of the layer `i+1`. """ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10, corruption_levels=[0.1, 0.1]): """ This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given, one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network :type corruption_levels: list of float :param corruption_levels: amount of corruption to use for each layer """ self.sigmoid_layers = [] self.da_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data # the data is presented as rasterized images self.x = T.matrix('x') # the labels are presented as 1D vector of int labels self.y = T.ivector('y') # SDA is an MLP, for which all weights of intermediate layers # are shared with a different denoising autoencoders. # We will first construct the SDA as a deep multilayer perceptron, # and when constructing each sigmoidal layer we also construct a # denoising autoencoder that shares weights with that layer # During pretraining we will train these autoencoders (which will # lead to chainging the weights of the MLP as well) # During finetunining we will finish training the SDA by doing # stochastich gradient descent on the MLP for i in range(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden units of # the layer below or the input size if we are on the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the hidden # layer below or the input of the SDA if you are on the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question ... 
# but we are going to only declare that the parameters of the # sigmoid_layers are parameters of the StackedDA # the visible biases in the DA are parameters of those DA # but not the SDA self.params.extend(sigmoid_layer.params) # construct a denoising autoencoder that shared weights with this # layer da_layer = DenoisingAutoEncoder(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, bhid=sigmoid_layer.b) self.da_layers.append(da_layer) # we now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) # construct a function that implements one step of finetunining # compute the cost for second phase of training # defined as the negative log likelihood self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y. self.errors = self.logLayer.errors(self.y) def pretraining_functions(self, train_x, batch_size): """ Generates a list of functions, each of them implementting one step in training the DA corresponding to the layer with same index. The function will require as input the minibatch index, and to train a DA you just need to iterate, calling the corresponding function on all minibatch indexes. :type train_x: theano.tensor.TensorType :param train_x: shared variable that contains all datapoints used for training the DA :type batch_size: int :param batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during training for any of the DA layers """ # index to a minibatch index = T.lscalar('index') # % of corruption to use corruption_level = T.scalar('corruption') # learning rate to use learning_rate = T.scalar('lr') # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` batch_end = batch_begin + batch_size pretrain_fns = [] for da in self.da_layers: # get the cost and the updates list cost, updates = da.get_cost_updates(corruption_level, learning_rate) # compile the theano function fn = theano.function( inputs=[ index, theano.In(corruption_level, value=0.2), theano.In(learning_rate, value=0.1) ], outputs=cost, updates=updates, givens={self.x: train_x[batch_begin:batch_end]}) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def build_finetune_functions(self, datasets, batch_size, learning_rate): """ Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set. 
        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: a list that contains all the datasets; it has to
                         contain three pairs, `train`, `valid`, `test`, in this
                         order, where each pair is formed of two theano
                         variables, one for the datapoints, the other for the
                         labels

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float
        :param learning_rate: learning rate used during the finetune stage
        """
        (train_x, train_y) = datasets[0]
        (valid_x, valid_y) = datasets[1]
        (test_x, test_y) = datasets[2]

        # compute number of minibatches for validation and testing
        n_valid_batches = valid_x.get_value(borrow=True).shape[0]
        n_valid_batches //= batch_size
        n_test_batches = test_x.get_value(borrow=True).shape[0]
        n_test_batches //= batch_size

        # index to a minibatch
        index = T.lscalar('index')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = [(param, param - gparam * learning_rate)
                   for param, gparam in zip(self.params, gparams)]

        givens = {
            self.x: train_x[index * batch_size:(index + 1) * batch_size],
            self.y: train_y[index * batch_size:(index + 1) * batch_size]
        }
        train_fn = theano.function(inputs=[index],
                                   outputs=self.finetune_cost,
                                   updates=updates,
                                   givens=givens,
                                   name='train')

        givens = {
            self.x: test_x[index * batch_size:(index + 1) * batch_size],
            self.y: test_y[index * batch_size:(index + 1) * batch_size]
        }
        test_score_i = theano.function([index], self.errors,
                                       givens=givens, name='test')

        givens = {
            self.x: valid_x[index * batch_size:(index + 1) * batch_size],
            self.y: valid_y[index * batch_size:(index + 1) * batch_size]
        }
        valid_score_i = theano.function([index], self.errors,
                                        givens=givens, name='valid')

        # create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in range(n_valid_batches)]

        # create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in range(n_test_batches)]

        return train_fn, valid_score, test_score
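# The following is a hypothetical driver for the class above, not part of it:
# a minimal sketch of layer-wise pretraining, assuming a load_data() helper
# that returns (data, label) shared-variable pairs as in the other snippets,
# and placeholder hyper-parameters (batch size, epochs, corruption levels).
import numpy

datasets = load_data('mnist.pkl.gz')
train_x, _ = datasets[0]
batch_size = 20
n_train_batches = train_x.get_value(borrow=True).shape[0] // batch_size

sda = StackedDenoisingAutoEncoder(numpy_rng=numpy.random.RandomState(89677),
                                  n_ins=28 * 28,
                                  hidden_layers_sizes=[1000, 1000, 1000],
                                  n_outs=10)

# one compiled function per DA layer; the named inputs come from the
# theano.In defaults declared in pretraining_functions
pretrain_fns = sda.pretraining_functions(train_x=train_x, batch_size=batch_size)
corruption_levels = [0.1, 0.2, 0.3]
for i, fn in enumerate(pretrain_fns):
    for epoch in range(15):
        costs = [fn(index=b, corruption=corruption_levels[i], lr=0.001)
                 for b in range(n_train_batches)]
        print('pretraining layer %i, epoch %i, cost %f'
              % (i, epoch, numpy.mean(costs)))

# supervised finetuning then reuses the usual early-stopping loop with the
# three functions returned by build_finetune_functions
train_fn, valid_score, test_score = sda.build_finetune_functions(
    datasets=datasets, batch_size=batch_size, learning_rate=0.1)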
def optimize_lenet(learning_rate=0.01, n_epochs=200, dataset='data/mnist.pkl.gz',
                   batch_size=500, n_hidden=500, nkerns=[20, 50],
                   rng=np.random.RandomState(23455)):
    print '... load training set'
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    # minibatch index
    index = T.lscalar()
    # data symbol
    x = T.matrix('x')
    # label symbol
    y = T.ivector('y')

    print '... building the model'

    # Reshape the rasterized (batch_size, 28*28) matrix into a 4D tensor so it
    # is consistent with what LeNetConvPoolLayer expects.
    # The added 1 is the number of channels; the images are grayscale, so
    # there is a single channel.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # layer0
    # nkerns[0] = 20 filters
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # layer1
    # nkerns[1] = 50 filters
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # layer2_input
    # layer1 outputs a 4D tensor of 50 channels of 4x4 pixel images, which
    # cannot be fed to the multilayer perceptron as is; flatten it from
    # (batch_size, 50, 4, 4) to (batch_size, 4*4*50 = 800)
    layer2_input = layer1.output.flatten(2)

    # layer2
    # hidden layer with 500 units
    # n_in equals the size of the input vector built as layer2_input
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=n_hidden,
                         activation=T.tanh)

    # layer3
    # takes the 500 hidden units as input and outputs 10 classes
    layer3 = LogisticRegression(input=layer2.output, n_in=n_hidden, n_out=10)

    # cost (an ordinary multilayer perceptron needs a regularization term, but
    # the CNN structure itself already acts as a regularizer)
    cost = layer3.negative_log_likelihood(y)

    # test model
    # computes layer3.errors on the slice selected through givens from the input index
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # validation model
    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # parameters to differentiate with respect to
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # gradients of the cost function with respect to the parameters
    grads = T.grad(cost, params)

    # parameter updates
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    # train model
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # optimize
    print "train model ..."
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    fp1 = open('log/lenet_validation_error.txt', 'w')
    fp2 = open('log/lenet_test_error.txt', 'w')

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                # pass each validation index to the function that computes the
                # validation error rate and collect the results in a list
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                # average them into a single score
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f ' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                fp1.write("%d\t%f\n" % (epoch, this_validation_loss * 100))

                if this_validation_loss < best_validation_loss:
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # pass each test index to the function that computes the
                    # test error rate and collect the results in a list
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    # average them into a single score
                    test_score = np.mean(test_losses)
                    print('epoch %i, minibatch %i/%i, test error %f ' %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))
                    fp2.write("%d\t%f\n" % (epoch, test_score * 100))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('optimization complete. Best validation score of %f obtained at '
           'iteration %i, with test performance %f') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('This code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    fp1.close()
    fp2.close()

    import cPickle
    cPickle.dump(layer0, open("model/lenet_layer0.pkl", "wb"))
    cPickle.dump(layer1, open("model/lenet_layer1.pkl", "wb"))
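# A minimal, hypothetical counterpart to the cPickle.dump calls above: reload
# the two pickled convolutional layers and inspect the learned filters. It
# assumes LeNetConvPoolLayer stores its weights in Theano shared variables W
# and b, as in the deeplearning.net tutorial code these snippets follow.
import cPickle

with open("model/lenet_layer0.pkl", "rb") as f:
    saved_layer0 = cPickle.load(f)
with open("model/lenet_layer1.pkl", "rb") as f:
    saved_layer1 = cPickle.load(f)

# shape (nkerns[0], 1, 5, 5) for layer0, (nkerns[1], nkerns[0], 5, 5) for layer1
W0 = saved_layer0.W.get_value(borrow=True)
W1 = saved_layer1.W.get_value(borrow=True)
print "layer0 filters:", W0.shape, "layer1 filters:", W1.shape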
class SRNN(object): """Stacking ReLU Neural Network """ def __init__(self, numpy_rng, theano_rng=None, n_ins=N_FEATURES * N_FRAMES, relu_layers_sizes=[1024, 1024, 1024], n_outs=62 * 3, rho=0.90, eps=1.E-6): """ TODO WRITEME """ self.relu_layers = [] self.params = [] self.n_layers = len(relu_layers_sizes) self._rho = rho # ``momentum'' for adadelta self._eps = eps # epsilon for adadelta self._accugrads = [] # for adadelta self._accudeltas = [] # for adadelta self.n_outs = n_outs assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) self.x = T.fmatrix('x') self.y = T.ivector('y') self.p_y_in = T.fmatrix('p_y') input_relu_layer = StackReLU(rng=numpy_rng, input=self.x, in_stack=self.p_y_in, n_in=n_ins, n_in_stack=n_outs, n_out=relu_layers_sizes[0]) self.relu_layers.append(input_relu_layer) self.params.extend(input_relu_layer.params) self._accugrads.extend([shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[0], ), dtype='float32'), name='accugrad_b', borrow=True), shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]), dtype='float32'), name='accugrad_Ws', borrow=True)]) self._accudeltas.extend([shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[0], ), dtype='float32'), name='accudelta_b', borrow=True), shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]), dtype='float32'), name='accudelta_Ws', borrow=True)]) for i in xrange(1, self.n_layers): input_size = relu_layers_sizes[i-1] layer_input = self.relu_layers[-1].output relu_layer = ReLU(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=relu_layers_sizes[i]) self.relu_layers.append(relu_layer) self.params.extend(relu_layer.params) self._accugrads.extend([shared(value=numpy.zeros((input_size, relu_layers_sizes[i]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[i], ), dtype='float32'), name='accugrad_b', borrow=True)]) self._accudeltas.extend([shared(value=numpy.zeros((input_size, relu_layers_sizes[i]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[i], ), dtype='float32'), name='accudelta_b', borrow=True)]) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.relu_layers[-1].output, n_in=relu_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) self._accugrads.extend([shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accugrad_b', borrow=True)]) self._accudeltas.extend([shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accudelta_b', borrow=True)]) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) self.finetune_cost_sum = self.logLayer.negative_log_likelihood_sum(self.y) self.p_y_out = self.logLayer.p_y_given_x # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) def get_SGD_trainer(self): """ 
Returns a plain SGD minibatch trainer with learning rate as param. FIXME TODO """ # TODO return -1 def get_stacked_adadelta_trainer(self): """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params, that works on stacks, that is first a classification step to get the output probabilities, and then a step taking these outputs into account. """ batch_x = T.fmatrix('batch_x') batch_p_y = T.fmatrix('batch_p_y') batch_y = T.ivector('batch_y') first_pass = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_p_y)], outputs=self.p_y_out, givens={self.x: batch_x, self.p_y_in: batch_p_y}) cost = self.finetune_cost_sum gparams = T.grad(cost, self.params) updates = OrderedDict() for accugrad, accudelta, param, gparam in zip(self._accugrads, self._accudeltas, self.params, gparams): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_p_y), theano.Param(batch_y)], outputs=cost, updates=updates, givens={self.x: batch_x, self.p_y_in: batch_p_y, self.y: batch_y}) return first_pass, train_fn def get_bptt_adadelta_trainer(self): """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params. """ cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, accudelta, param, gparam in zip(self._accugrads, self._accudeltas, self.params, gparams): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad #p_y_given_x_init = shared(numpy.asarray(numpy.random.uniform((1, self.n_outs)), dtype='float32')) p_y_given_x_init = T.zeros((1, self.n_outs)) + 1./self.n_outs def one_step(x_t, p_y): self.x = x_t self.p_y_in = p_y return [x_t, self.p_y_out] batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') #batch_p_y = T.fmatrix('batch_p_y') [x, p_y_out], _ = theano.scan(lambda x_t, p_y_g_x_m1, *_: one_step(x_t, p_y_g_x_m1), sequences=batch_x[:-1], outputs_info=[None, p_y_init],) train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y)], outputs=cost, updates=updates, givens={self.y: batch_y, self.p_y_in: T.concatenate([p_y_init, p_y_out], axis=0), self.x: batch_x }) return train_fn def get_adagrad_trainer(self): """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate. FIXME TODO """ # TODO return -1 def score_stacked_classif(self, given_set): """ Returns functions to get current stacked-based (not RNN) classification scores. 
""" batch_x = T.fmatrix('batch_x') batch_p_y = T.fmatrix('batch_p_y') batch_y = T.ivector('batch_y') p_y_init = T.zeros((batch_x.shape[0], self.n_outs)) + 1./self.n_outs # TODO try = 0 first_pass = theano.function(inputs=[theano.Param(batch_x)], outputs=self.p_y_out, givens={self.x: batch_x, self.p_y_in: p_y_init}) score = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_p_y), theano.Param(batch_y)], outputs=self.errors, givens={self.x: batch_x, self.p_y_in: batch_p_y, self.y: batch_y}) # Create a function that scans the entire set given as input def scoref(): return [score(batch_x, first_pass(batch_x), batch_y) for batch_x, batch_y in given_set] return scoref def score_rnn_classif(self, given_set): """ Returns functions to get current RNN classification scores. """ # TODO batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') score = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y)], outputs=self.errors, givens={self.x: batch_x, self.y: batch_y}) # Create a function that scans the entire set given as input def scoref(): return [score(batch_x, batch_y) for batch_x, batch_y in given_set] return scoref def score_rnn_PER(self, given_set): """ Returns functions to get reccurrent PER. FIXME TODO""" # TODO return -1
class SdA(object):
    """ stacked autoencoder """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        # define the layer lists we need
        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        # number of hidden layers
        self.n_layers = len(hidden_layers_sizes)
        # there must be at least one hidden layer
        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # image data
        self.x = T.matrix('x')
        # integer class labels
        self.y = T.ivector('y')

        for i in xrange(self.n_layers):
            if i == 0:
                # the first hidden layer has as many inputs as there are units
                # in the input layer
                input_size = n_ins
            else:
                # every following hidden layer has as many inputs as there are
                # units in the previous hidden layer
                input_size = hidden_layers_sizes[i - 1]

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            # hidden layer
            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add it to the list of hidden layers
            self.sigmoid_layers.append(sigmoid_layer)
            # W and b of the hidden layer
            self.params.extend(sigmoid_layer.params)

            # autoencoder that shares W and b with this hidden layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            # add it to the list of autoencoders
            self.dA_layers.append(dA_layer)

        # the logistic layer takes the output of the last sigmoid layer as its
        # input, so n_in is the last entry of hidden_layers_sizes and the
        # number of output units is n_outs
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        # W and b of the LogisticRegression layer
        self.params.extend(self.logLayer.params)

        # no regularization term is needed here
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # use the errors of the LogisticRegression layer
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size):
        """ layer-wise training of each autoencoder """
        # minibatch index
        index = T.lscalar('index')
        # corruption level
        corruption_level = T.scalar('corruption')
        # learning rate
        learning_rate = T.scalar('lr')

        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        # list of pretraining functions
        pretrain_functions = []
        for dA in self.dA_layers:
            cost, updates = dA.get_cost_updates(corruption_level, learning_rate)
            fn = theano.function(
                inputs=[
                    index,
                    # theano.Param lets the caller pass values by the tensor's name
                    theano.Param(corruption_level, default=0.2),
                    theano.Param(learning_rate, default=0.1)
                ],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[batch_begin:batch_end]})
            # for each layer, append the function that computes the autoencoder
            # cost and applies the parameter updates
            pretrain_functions.append(fn)

        return pretrain_functions

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        """ finetuning of the whole network """
        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]

        # number of minibatches used to evaluate the whole sets
        n_valid_batches = valid_set_x.get_value(
            borrow=True).shape[0] / batch_size
        n_test_batches = test_set_x.get_value(
            borrow=True).shape[0] / batch_size

        index = T.lscalar('index')

        # gradients of the finetuning cost with respect to all parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # updates for the parameters of the whole network
        updates = [(param, param - gparam * learning_rate)
                   for param, gparam in zip(self.params, gparams)]

        train_model = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='train')

        # error score function for test minibatch index i
        test_score_i = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: test_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: test_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='test')
        # error score function for validation minibatch index i
        valid_score_i = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='validate')

        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_model, valid_score, test_score
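# A hypothetical finetuning driver for the SdA class above; load_data, the
# batch size and the epoch count are placeholders, and the patience-based
# early stopping used elsewhere in this file is omitted to keep the sketch short.
import numpy

datasets = load_data("mnist.pkl.gz")
train_set_x, _ = datasets[0]
batch_size = 20
n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

sda = SdA(numpy_rng=numpy.random.RandomState(89677))
train_model, valid_score, test_score = sda.build_finetune_functions(
    datasets=datasets, batch_size=batch_size, learning_rate=0.1)

best = numpy.inf
for epoch in xrange(36):
    for minibatch_index in xrange(n_train_batches):
        train_model(minibatch_index)
    this_loss = numpy.mean(valid_score())
    print('epoch %i, validation error %f %%' % (epoch, this_loss * 100.))
    if this_loss < best:
        best = this_loss
        print('    test error %f %%' % (numpy.mean(test_score()) * 100.))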
def main():
    rng = np.random.RandomState(23455)

    datasets = load_data()
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    batch_size = 500
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    nkerns = [20, 50]

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels

    # reshape expects a tuple of dimensions
    layer0_input = x.reshape((batch_size, 1, 28, 28))
    layer0 = LeNetConvPoolLayer(rng, layer0_input,
                                filter_shape=(nkerns[0], 1, 5, 5),
                                image_shape=(batch_size, 1, 28, 28),
                                poolsize=(2, 2))
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                poolsize=(2, 2))

    layer2_input = layer1.output.flatten(2)
    layer2 = HiddenLayer(rng, layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500)
    layer3 = LogisticRegression(layer2.output, n_in=500, n_out=10)

    cost = layer3.negative_log_likelihood(y)

    test_model = theano.function(
        [index], layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        })
    validate_model = theano.function(
        [index], layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        })

    params = layer3.params + layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)

    learning_rate = 0.1
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]
    train_model = theano.function(
        [index], cost, updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        })

    print "Start training..."
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    n_epochs = 200
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    test_score = 0.
epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in range(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print('training @ iter = ', iter) cost_ij = train_model(minibatch_index) # NOQA if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in range(n_valid_batches)] this_validation_loss = np.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: # improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss # test it on the test set test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = np.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break
def evaluate_model(learning_rate=0.001, n_epochs=100, nkerns=[16, 40, 50, 60], batch_size=20): """ Network for classification of MNIST database :type learning_rate: float :param learning_rate: this is the initial learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type nkerns: list of ints :param nkerns: number of kernels on each layer :type batch_size: int :param batch_size: the batch size for training """ print("Evaluating model") rng = numpy.random.RandomState(23455) # loading the data datasets = load_test_data() valid_set_x, valid_set_y = datasets[0] test_set_x, test_set_y = datasets[1] # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_valid_batches //= batch_size n_test_batches //= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels loaded_params = numpy.load('../saved_models/model.npy') layer4_W, layer4_b, layer3_W, layer3_b, layer2_W, layer2_b, layer1_W, layer1_b, layer0_W, layer0_b = loaded_params ###################### # BUILD ACTUAL MODEL # ###################### print('Building the model...') chosen_height = 64 chosen_width = 64 # Reshape matrix of rasterized images of shape (batch_size, 32 * 32) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (32, 32) is the size of MNIST images. layer0_input = x.reshape((batch_size, 3, chosen_height, chosen_width)) # Construct the first convolutional pooling layer: # filtering does not reduce the layer size because we use padding # maxpooling reduces the size to (32/2, 32/2) = (16, 16) # 4D output tensor is thus of shape (batch_size, nkerns[0], 16, 16) layer0 = MyConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 3, chosen_height, chosen_width), p1=2, p2=2, filter_shape=(nkerns[0], 3, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer: # filtering does not reduce the layer size because we use padding # maxpooling reduces the size to (16/2, 16/2) = (8, 8) # 4D output tensor is thus of shape (batch_size, nkerns[1], 5, 5) layer1 = MyConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], chosen_height / 2, chosen_width / 2), p1=2, p2=2, filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # Construct the third convolutional pooling layer # filtering does not reduce the layer size because we use padding # maxpooling reduces the size to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[2], 4, 4) layer2 = MyConvPoolLayer(rng, input=layer1.output, image_shape=(batch_size, nkerns[1], chosen_height / 4, chosen_width / 4), p1=2, p2=2, filter_shape=(nkerns[2], nkerns[1], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[2] * 4 * 4), # or (500, 20 * 4 * 4) = (500, 320) with the default values. 
layer3_input = layer2.output.flatten(2) # construct a fully-connected sigmoidal layer layer3 = HiddenLayer(rng, input=layer3_input, n_in=nkerns[2] * (chosen_height / 8) * (chosen_width / 8), n_out=800, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer4 = LogisticRegression(input=layer3.output, n_in=800, n_out=6) cost = layer4.negative_log_likelihood(y) predicted_output = layer4.y_pred # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer4.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer4.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params #Loading the model # f = file('../saved_models/model317.save.npy', 'r') # params = cPickle.load(f) # print(params) # f.close() # # layer4.params, layer3.params, layer2.params, layer1.params, layer0.params = params # # layer4.W, layer4.b = layer4.params # # layer3.W, layer3.b = layer3.params # # layer2.W, layer2.b = layer2.params # # layer1.W, layer1.b = layer1.params # # layer0.W, layer0.b = layer0.params # layer4.W, layer4.b, layer3.W, layer3.b, layer2.W, layer2.b, layer1.W, layer1.b, layer0.W, layer0.b = params # layer4.params = [layer4.W, layer4.b] # layer3.params = [layer3.W, layer3.b] # layer2.params = [layer2.W, layer2.b] # layer1.params = [layer1.W, layer1.b] # layer0.params = [layer0.W, layer0.b] # x = cPickle.load(f) # layer4.params = [layer4.W, layer4.b] # layer3.params = [layer3.W, layer3.b] # layer2.params = [layer2.W, layer2.b] # layer1.params = [layer1.W, layer1.b] # layer0.params = [layer0.W, layer0.b] # test it on the test set test_losses = [test_model(i) for i in range(n_test_batches)] validation_losses = [validate_model(i) for i in range(n_valid_batches)] test_score = numpy.mean(test_losses) validation_score = numpy.mean(validation_losses) print((' Validation error is %f %%') % (validation_score * 100.)) print((' Test error is %f %%') % (test_score * 100.))
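# A hypothetical counterpart of the numpy.load('../saved_models/model.npy')
# call above: one way such a file could be produced during training, saving the
# parameters in the same order they are unpacked (layer4 W, b, ..., layer0 W, b).
# It assumes each layer keeps its weights in Theano shared variables W and b.
import numpy

def save_params(path, layers):
    # layers is expected as [layer4, layer3, layer2, layer1, layer0]
    values = []
    for layer in layers:
        values.append(layer.W.get_value(borrow=True))
        values.append(layer.b.get_value(borrow=True))
    # the arrays have different shapes, so store them as a 1D object array
    numpy.save(path, numpy.asarray(values, dtype=object))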
def stochastic_gradient_descent_mnist( learning_rate=0.13, n_epochs=1000, path='/home/tao/Projects/machine-learning/data/mnist.pkl.gz', batch_size=600): datasets = load_data(path) train_set_data, train_set_label = datasets[0] validation_set_data, validation_set_label = datasets[1] test_set_data, test_set_label = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_data.get_value( borrow=True).shape[0] // batch_size n_valid_batches = validation_set_data.get_value( borrow=True).shape[0] // batch_size n_test_batches = test_set_data.get_value( borrow=True).shape[0] // batch_size print('... building the model') index = T.lscalar() # index to a [mini]batch data = T.matrix('x') # data, presented as rasterized images label = T.ivector('y') # labels, presented as 1D vector of [int] labels classifier = LogisticRegression(input=data, input_dim=28 * 28, output_dim=10) objective_function = classifier.negative_log_likelihood(label) # testing model test_model = theano.function( inputs=[index], outputs=classifier.errors(label), givens={ data: test_set_data[index * batch_size:(index + 1) * batch_size], label: test_set_label[index * batch_size:(index + 1) * batch_size] }) # validation model validate_model = theano.function( inputs=[index], outputs=classifier.errors(label), givens={ data: validation_set_data[index * batch_size:(index + 1) * batch_size], label: validation_set_label[index * batch_size:(index + 1) * batch_size] }) # gradients g_W = T.grad(cost=objective_function, wrt=classifier.W) g_b = T.grad(cost=objective_function, wrt=classifier.b) # update rule updates = [(classifier.W, classifier.W - learning_rate * g_W), (classifier.b, classifier.b - learning_rate * g_b)] # training model train_model = theano.function( inputs=[index], outputs=objective_function, updates=updates, givens={ data: train_set_data[index * batch_size:(index + 1) * batch_size], label: train_set_label[index * batch_size:(index + 1) * batch_size] }) print('... training the model') # early-stopping parameters patience = 5000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is found improvement_threshold = 0.995 # a relative improvement of this much is considered significant # go through this many minibatche before checking the network on the validation set; in this case we check every epoch validation_frequency = min(n_train_batches, patience // 2) best_validation_loss = numpy.inf test_score = 0. 
start_time = timeit.default_timer() done_looping = False epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in range(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] # grammar sugar this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss # test it on the test set test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) print(( ' epoch %i, minibatch %i/%i, test error of best model %f %%' ) % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) with open('best_model.pkl', 'wb') as f: pickle.dump(classifier, f) if patience <= iter: done_looping = True break end_time = timeit.default_timer() print( 'Optimization complete with best validation score of %f %%, with test performance %f %%' % (best_validation_loss * 100., test_score * 100.)) print('The code run for %d epochs, with %f epochs/sec' % (epoch, 1. * epoch / (end_time - start_time))) print( ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' % ((end_time - start_time))), file=sys.stderr)
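# A minimal, hypothetical prediction helper for the classifier pickled above.
# It assumes the LogisticRegression class stores its symbolic input as
# `classifier.input` and exposes `y_pred`, as the deeplearning.net tutorial
# version does; if not, the compiled function below would need adjusting.
def predict(path='/home/tao/Projects/machine-learning/data/mnist.pkl.gz'):
    with open('best_model.pkl', 'rb') as f:
        classifier = pickle.load(f)

    # compile a predictor from the stored symbolic graph
    predict_model = theano.function(
        inputs=[classifier.input],
        outputs=classifier.y_pred)

    datasets = load_data(path)
    test_set_data, test_set_label = datasets[2]
    predicted = predict_model(test_set_data.get_value()[:10])
    print('Predicted labels for the first 10 test examples: %s' % predicted)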
class CNN(object): ''' Convolutional Neural Network with 2 convolutional pooling layers The default parameters are for the MNIST dataset NOTE: Dataset is required to be 28x28 images with three sub data sets ''' def __init__(self, datasets, batch_size=500, nkerns=[20, 50], img_size=(28, 28), learning_rate=0.1): train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] self.batch_size = batch_size # compute number of minibatches for training, validation and testing self.n_train_batches = train_set_x.get_value(borrow=True).shape[0] self.n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] self.n_test_batches = test_set_x.get_value(borrow=True).shape[0] self.n_train_batches /= batch_size self.n_valid_batches /= batch_size self.n_test_batches /= batch_size # allocate symbolic variables for the data self.index = T.lscalar() # index to a [mini]batch self.x = T.matrix('x') self.y = T.ivector('y') rng = np.random.RandomState(23455) layer0_input = self.x.reshape( (batch_size, 1, img_size[0], img_size[1])) # Create the two convolutional layers that also perform downsampling using maxpooling self.layer0 = ConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, img_size[0], img_size[1]), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) self.layer1 = ConvPoolLayer(rng, input=self.layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) layer2_input = self.layer1.output.flatten(2) # Create the hidden layer of the MLP self.layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh) # Create the logistic regression layer for classifiying the results self.layer3 = LogisticRegression(input=self.layer2.output, n_in=500, n_out=10) self.cost = self.layer3.negative_log_likelihood(self.y) self.params = self.layer3.params + self.layer2.params + self.layer1.params + self.layer0.params self.grads = T.grad(self.cost, self.params) # Update list for the paramters to be used when training the model updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(self.params, self.grads)] # This function updates the model parameters using Stochastic Gradient Descent self.train_model = th.function( [self.index], self. 
            cost,  # This is the negative-log-likelihood of the Logistic Regression layer
            updates=updates,
            givens={
                self.x: train_set_x[self.index * batch_size:(self.index + 1) * batch_size],
                self.y: train_set_y[self.index * batch_size:(self.index + 1) * batch_size]
            })

        # These are Theano functions for testing performance on our test and validation datasets
        self.test_model = th.function(
            [self.index],
            self.layer3.errors(self.y),
            givens={
                self.x: test_set_x[self.index * batch_size:(self.index + 1) * batch_size],
                self.y: test_set_y[self.index * batch_size:(self.index + 1) * batch_size]
            })

        self.validate_model = th.function(
            [self.index],
            self.layer3.errors(self.y),
            givens={
                self.x: valid_set_x[self.index * batch_size:(self.index + 1) * batch_size],
                self.y: valid_set_y[self.index * batch_size:(self.index + 1) * batch_size]
            })

    def train(self, n_epochs, patience=10000, patience_increase=2,
              improvement_threshold=0.995):
        ''' Train the CNN on the training data for a defined number of epochs '''
        # Setup the variables for training the model
        n_train_batches = self.n_train_batches
        n_valid_batches = self.n_valid_batches
        n_test_batches = self.n_test_batches
        validation_frequency = min(n_train_batches, patience / 2)
        best_validation_loss = np.inf
        best_iter = 0
        best_score = 0.
        epoch = 0
        done_looping = False

        # Train the CNN for a defined number of epochs
        while (epoch < n_epochs) and (not done_looping):
            epoch = epoch + 1
            for minibatch_index in xrange(n_train_batches):
                iter = (epoch - 1) * n_train_batches + minibatch_index
                # Every 100 iterations
                if iter % 100 == 0:
                    print 'Training iteration ', iter
                cost_ij = self.train_model(minibatch_index)

                if (iter + 1) % validation_frequency == 0:
                    # Compute zero-one loss on validation set
                    validation_losses = [
                        self.validate_model(i) for i in xrange(n_valid_batches)
                    ]
                    this_validation_loss = np.mean(validation_losses)
                    print('epoch %i, minibatch %i/%i, validation error %f %%' %
                          (epoch, minibatch_index + 1, n_train_batches,
                           this_validation_loss * 100.))

                    # Check if current validation loss is best so far
                    if this_validation_loss < best_validation_loss:
                        # Improve patience if loss improvement is good enough
                        if this_validation_loss < best_validation_loss * improvement_threshold:
                            patience = max(patience, iter * patience_increase)
                        # Save best validation score and iteration number
                        best_validation_loss = this_validation_loss
                        best_iter = iter

                if patience <= iter:
                    done_looping = True
                    break

        print 'Optimization complete.'
        print('Best validation score of %f %% obtained at iteration %i' %
              (best_validation_loss * 100., best_iter + 1))

    def test(self, set_x, set_y):
        ''' Test data sets and return the test score '''
        # allocate symbolic variables for the data
        n_test_batches = set_x.get_value(borrow=True).shape[0]
        n_test_batches /= self.batch_size

        test_model = th.function(
            inputs=[self.index],
            outputs=self.layer3.errors(self.y),
            givens={
                self.x: set_x[self.index * self.batch_size:(self.index + 1) * self.batch_size],
                self.y: set_y[self.index * self.batch_size:(self.index + 1) * self.batch_size]
            })

        test_losses = [test_model(i) for i in xrange(n_test_batches)]
        test_score = np.mean(test_losses)
        return test_score

    def classify(self, set):
        ''' Return the labels for the given set
        NOTE: The batch size must be the same as the training set '''
        n_test_batches = set.get_value(borrow=True).shape[0]
        n_test_batches /= self.batch_size

        classify_data = th.function(
            inputs=[self.index],  # Input to this function is a mini-batch index
            outputs=self.layer3.y_pred,  # Output the y predictions
            givens={
                self.x: set[self.index * self.batch_size:(self.index + 1) * self.batch_size]
            })

        # Generate labels for the given data
        labels = [classify_data(i) for i in xrange(n_test_batches)]
        return np.array(labels)
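# A short, hypothetical usage sketch for the CNN class above, assuming a
# load_data() helper that returns the usual (train, valid, test) pairs of
# shared variables holding 28x28 rasterized images.
datasets = load_data('mnist.pkl.gz')
cnn = CNN(datasets, batch_size=500, nkerns=[20, 50])
cnn.train(n_epochs=200)

test_set_x, test_set_y = datasets[2]
print 'test error %f %%' % (cnn.test(test_set_x, test_set_y) * 100.)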
class DBN(object): """Deep Belief Network A deep belief network is obtained by stacking several RBMs on top of each other. The hidden layer of the RBM at layer `i` becomes the input of the RBM at layer `i+1`. The first layer RBM gets as input the input of the network, and the hidden layer of the last RBM represents the output. When used for classification, the DBN is treated as a MLP, by adding a logistic regression layer on top. """ def __init__(self, numpy_rng, theano_rng=None, n_ins=N_FEATURES * N_FRAMES, hidden_layers_sizes=[1024, 1024], n_outs=62 * 3, rho=0.90, eps=1.E-6): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type n_layers_sizes: list of ints :param n_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) #self._rho = shared(numpy.cast['float32'](rho), name='rho') # for adadelta #self._eps = shared(numpy.cast['float32'](eps), name='eps') # for adadelta self._rho = rho self._eps = eps self._accugrads = [] # for adadelta self._accudeltas = [] # for adadelta assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.fmatrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. 
self.params.extend(sigmoid_layer.params) self._accugrads.extend([shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((hidden_layers_sizes[i], ), dtype='float32'), name='accugrad_b', borrow=True)]) # TODO self._accudeltas.extend([shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((hidden_layers_sizes[i], ), dtype='float32'), name='accudelta_b', borrow=True)]) # TODO # Construct an RBM that shared weights with this layer if i == 0: rbm_layer = GRBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) else: rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) self._accugrads.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_outs), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accugrad_b', borrow=True)]) # TODO self._accudeltas.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_outs), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accudelta_b', borrow=True)]) # TODO # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) self.finetune_cost_sum = self.logLayer.negative_log_likelihood_sum(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) def pretraining_functions(self, k): batch_x = T.fmatrix('batch_x') learning_rate = T.scalar('lr') # learning rate to use pretrain_fns = [] for rbm in self.rbm_layers: # get the cost and the updates list # using CD-k here (persisent=None) for training each RBM. # TODO: change cost function to reconstruction error #markov_chain = shared(numpy.empty((batch_size, rbm.n_hidden), dtype='float32'), borrow=True) markov_chain = None cost, updates = rbm.get_cost_updates(learning_rate, persistent=markov_chain, k=k) # compile the theano function fn = theano.function(inputs=[batch_x, theano.Param(learning_rate, default=0.1)], outputs=cost, updates=updates, givens={self.x: batch_x}) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def get_SGD_trainer(self): """ Returns a plain SGD minibatch trainer with learning rate as param. 
""" batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') learning_rate = T.fscalar('lr') # learning rate to use cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for param, gparam in zip(self.params, gparams): updates[param] = param - gparam * learning_rate train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y), theano.Param(learning_rate)], outputs=cost, updates=updates, givens={self.x: batch_x, self.y: batch_y}) return train_fn def get_adadelta_trainer(self): """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, accudelta, param, gparam in zip(self._accugrads, self._accudeltas, self.params, gparams): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y)], outputs=cost, updates=updates, givens={self.x: batch_x, self.y: batch_y}) return train_fn def get_adagrad_trainer(self): """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') learning_rate = T.fscalar('lr') # learning rate to use cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, param, gparam in zip(self._accugrads, self.params, gparams): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = accugrad + gparam * gparam dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam updates[param] = param + dx updates[accugrad] = agrad train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y), theano.Param(learning_rate)], outputs=cost, updates=updates, givens={self.x: batch_x, self.y: batch_y}) return train_fn def get_SAG_trainer(self): """ Returns a Stochastic Averaged Gradient (Bach & Moulines 2011) trainer. This is based on Bach 2013 slides: PRavg(theta_n) = Polyak-Ruppert averaging = (1+n)^{-1} * \sum_{k=0}^n theta_k theta_n = theta_{n-1} - gamma [ f'_n(PR_avg(theta_{n-1})) + f''_n(PR_avg( theta_{n-1})) * (theta_{n-1} - PR_avg(theta_{n-1}))] That returns two trainers: one for the first epoch, one for subsequent epochs. We use self._accudeltas to but the Polyak-Ruppert averaging, and self._accugrads for the number of iterations (updates). """ print "UNFINISHED, see TODO in get_SAG_trainer()" sys.exit(-1) batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') learning_rate = T.fscalar('lr') # learning rate to use cost = self.finetune_cost_sum # First trainer: gparams = T.grad(cost, self.params) updates = OrderedDict() for accudelta, accugrad, param, gparam in zip(self._accudeltas, self._accugrads, self.params, gparams): theta = param - gparam * learning_rate updates[accudelta] = (theta + accudelta * accugrad) / (accugrad + 1.) 
updates[param] = theta updates[accugrad] = accugrad + 1. train_fn_init = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y), theano.Param(learning_rate)], outputs=cost, updates=updates, givens={self.x: batch_x, self.y: batch_y}) # Second trainer: gparams = T.grad(cost, self._accudeltas) # TODO recreate the network with # (TODO) self._accudeltas instead of self.params so that we can compute the cost hparams = T.grad(cost, gparams) # compute list of fine-tuning updates updates = OrderedDict() for accudelta, accugrad, param, gparam, hparam in zip(self._accudeltas, self._accugrads, self.params, gparams, hparams): theta = param - learning_rate * (gparam + hparam * (param - accudelta)) updates[accudelta] = (theta + accudelta * accugrad) / (accugrad + 1.) updates[param] = theta updates[accugrad] = accugrad + 1. train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y), theano.Param(learning_rate)], outputs=cost, updates=updates, givens={self.x: batch_x, self.y: batch_y}) return train_fn_init, train_fn def get_SGD_ld_trainer(self): """ Returns an SGD-ld trainer (Schaul et al. 2012). """ print "UNFINISHED, see TODO in get_SGD_ld_trainer()" sys.exit(-1) batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # INIT TODO # compute list of fine-tuning updates updates = OrderedDict() for accugrad, accudelta, accuhess, param, gparam in zip(self._accugrads, self._accudeltas, self._accuhess, self.params, gparams): pass # TODO # TODO # TODO train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y)], outputs=cost, updates=updates, givens={self.x: batch_x, self.y: batch_y}) return train_fn def score_classif(self, given_set): """ Returns functions to get current classification scores. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') score = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y)], outputs=self.errors, givens={self.x: batch_x, self.y: batch_y}) # Create a function that scans the entire set given as input def scoref(): return [score(batch_x, batch_y) for batch_x, batch_y in given_set] return scoref
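# The symbolic Adadelta updates in get_adadelta_trainer above can be hard to
# read; this is the same per-parameter rule written with plain numpy, a sketch
# of Algorithm 1 in Zeiler (2012) using the same roles for rho and eps.
import numpy

def adadelta_step(param, grad, accugrad, accudelta, rho=0.90, eps=1.e-6):
    """One Adadelta update, mirroring the symbolic rule used by the trainer."""
    # running average of squared gradients
    accugrad = rho * accugrad + (1 - rho) * grad * grad
    # scale-free step size derived from the two running averages
    dx = - numpy.sqrt((accudelta + eps) / (accugrad + eps)) * grad
    # running average of squared updates
    accudelta = rho * accudelta + (1 - rho) * dx * dx
    return param + dx, accugrad, accudelta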
class DBN(object): """Deep Belief Network A deep belief network is obtained by stacking several RBMs on top of each other. The hidden layer of the RBM at layer `i` becomes the input of the RBM at layer `i+1`. The first layer RBM gets as input the input of the network, and the hidden layer of the last RBM represents the output. When used for classification, the DBN is treated as a MLP, by adding a logistic regression layer on top. """ def __init__(self, numpy_rng, theano_rng=None, n_ins=N_FEATURES * N_FRAMES, hidden_layers_sizes=[1024, 1024], n_phn=62 * 3, n_spkr=1, rho=0.90, eps=1.E-6): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type n_layers_sizes: list of ints :param n_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) #self._rho = shared(numpy.cast['float32'](rho), name='rho') # for adadelta #self._eps = shared(numpy.cast['float32'](eps), name='eps') # for adadelta self._rho = rho self._eps = eps self._accugrads = [] # for adadelta self._accudeltas = [] # for adadelta assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.fmatrix('x') # the data is presented as rasterized images self.y_phn = T.ivector('y_phn') # the labels are presented as 1D vector # of [int] labels self.y_spkr = T.ivector('y_spkr') # the labels are presented as 1D vector # of [int] labels # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. 
self.params.extend(sigmoid_layer.params) self._accugrads.extend([shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((hidden_layers_sizes[i], ), dtype='float32'), name='accugrad_b', borrow=True)]) # TODO self._accudeltas.extend([shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((hidden_layers_sizes[i], ), dtype='float32'), name='accudelta_b', borrow=True)]) # TODO # Construct an RBM that shared weights with this layer if i == 0: rbm_layer = GRBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) else: rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayerPhn = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_phn) self.params.extend(self.logLayerPhn.params) self._accugrads.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_phn), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((n_phn, ), dtype='float32'), name='accugrad_b', borrow=True)]) # TODO self._accudeltas.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_phn), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((n_phn, ), dtype='float32'), name='accudelta_b', borrow=True)]) # TODO self.logLayerSpkr = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_spkr) self.params.extend(self.logLayerSpkr.params) self._accugrads.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_spkr), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((n_spkr, ), dtype='float32'), name='accugrad_b', borrow=True)]) # TODO self._accudeltas.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_spkr), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((n_spkr, ), dtype='float32'), name='accudelta_b', borrow=True)]) # TODO self.finetune_cost_sum_phn = self.logLayerPhn.negative_log_likelihood_sum(self.y_phn) self.finetune_cost_sum_spkr = self.logLayerSpkr.negative_log_likelihood_sum(self.y_spkr) self.finetune_cost_phn = self.logLayerPhn.negative_log_likelihood(self.y_phn) self.finetune_cost_spkr = self.logLayerSpkr.negative_log_likelihood(self.y_spkr) self.errors_phn = self.logLayerPhn.errors(self.y_phn) self.errors_spkr = self.logLayerSpkr.errors(self.y_spkr) def get_SGD_trainer(self): """ Returns a plain SGD minibatch trainer with learning rate as param. 
""" batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') learning_rate = T.fscalar('lr') # learning rate to use cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for param, gparam in zip(self.params, gparams): updates[param] = param - gparam * learning_rate train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y), theano.Param(learning_rate)], outputs=cost, updates=updates, givens={self.x: batch_x, self.y: batch_y}) return train_fn def get_adadelta_trainer(self): """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params. """ batch_x = T.fmatrix('batch_x') batch_y_phn = T.ivector('batch_y_phn') batch_y_spkr = T.ivector('batch_y_spkr') cost_phn = self.finetune_cost_sum_phn cost_spkr = self.finetune_cost_sum_spkr # compute the gradients with respect to the model parameters gparams_phn = T.grad(cost_phn, self.params[:-2]) gparams_spkr = T.grad(cost_spkr, self.params[:-4] + self.params[-2:]) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, accudelta, param, gparam in zip(self._accugrads[:-2], self._accudeltas[:-2], self.params[:-2], gparams_phn): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad for accugrad, accudelta, param, gparam in zip(self._accugrads[:-4] + self._accugrads[-2:], self._accudeltas[:-4] + self._accudeltas[-2:], self.params[:-4] + self.params[-2:], gparams_spkr): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y_phn), theano.Param(batch_y_spkr)], outputs=(cost_phn, cost_spkr), updates=updates, givens={self.x: batch_x, self.y_phn: batch_y_phn, self.y_spkr: batch_y_spkr}) return train_fn def get_adadelta_trainers(self): """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params. """ batch_x = T.fmatrix('batch_x') batch_y_phn = T.ivector('batch_y_phn') batch_y_spkr = T.ivector('batch_y_spkr') #cost_phn = self.finetune_cost_sum_phn cost_phn = self.finetune_cost_phn #cost_spkr = self.finetune_cost_sum_spkr cost_spkr = self.finetune_cost_spkr # compute the gradients with respect to the model parameters gparams_phn = T.grad(cost_phn, self.params[:-2]) gparams_spkr = T.grad(cost_spkr, self.params[:-4] + self.params[-2:]) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, accudelta, param, gparam in zip(self._accugrads[:-2], self._accudeltas[:-2], self.params[:-2], gparams_phn): # c.f. 
Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad train_fn_phn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y_phn)], outputs=cost_phn, updates=updates, givens={self.x: batch_x, self.y_phn: batch_y_phn}) updates = OrderedDict() for accugrad, accudelta, param, gparam in zip(self._accugrads[:-4] + self._accugrads[-2:], self._accudeltas[:-4] + self._accudeltas[-2:], self.params[:-4] + self.params[-2:], gparams_spkr): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad train_fn_spkr = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y_spkr)], outputs=cost_spkr, updates=updates, #givens={self.x: batch_x[20:24,:], self.y_spkr: batch_y_spkr[20:24]}) givens={self.x: batch_x, self.y_spkr: batch_y_spkr}) return train_fn_phn, train_fn_spkr def train_only_classif(self): batch_x = T.fmatrix('batch_x') batch_y_phn = T.ivector('batch_y_phn') batch_y_spkr = T.ivector('batch_y_spkr') #cost_phn = self.finetune_cost_sum_phn cost_phn = self.finetune_cost_phn #cost_spkr = self.finetune_cost_sum_spkr cost_spkr = self.finetune_cost_spkr # compute the gradients with respect to the model parameters gparams_phn = T.grad(cost_phn, self.params[-4:-2]) gparams_spkr = T.grad(cost_spkr, self.params[-2:]) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, accudelta, param, gparam in zip(self._accugrads[-4:-2], self._accudeltas[-4:-2], self.params[-4:-2], gparams_phn): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad train_fn_phn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y_phn)], outputs=cost_phn, updates=updates, givens={self.x: batch_x, self.y_phn: batch_y_phn}) updates = OrderedDict() for accugrad, accudelta, param, gparam in zip(self._accugrads[-2:], self._accudeltas[-2:], self.params[-2:], gparams_spkr): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad train_fn_spkr = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y_spkr)], outputs=cost_spkr, updates=updates, #givens={self.x: batch_x[20:24,:], self.y_spkr: batch_y_spkr[20:24]}) givens={self.x: batch_x, self.y_spkr: batch_y_spkr}) return train_fn_phn, train_fn_spkr def score_classif(self, given_set): """ Returns functions to get current classification scores. 
""" batch_x = T.fmatrix('batch_x') batch_y_phn = T.ivector('batch_y_phn') batch_y_spkr = T.ivector('batch_y_spkr') score = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y_phn), theano.Param(batch_y_spkr)], outputs=(self.errors_phn, self.errors_spkr), givens={self.x: batch_x, self.y_phn: batch_y_phn, self.y_spkr: batch_y_spkr}) # Create a function that scans the entire set given as input def scoref(): return [score(batch_x, batch_y_phn, batch_y_spkr) for batch_x, batch_y_phn, batch_y_spkr in given_set] return scoref
def test_regression_model_mnist(dataset_name='mnist.pkl.gz', learning_rate=0.13, n_epochs=1000, batch_size=600):
    # Set up the dataset
    dataset = load_data(dataset_name)

    # Split the data into a training, validation and test set
    # (load_data is assumed to return the sets in the order train, valid, test)
    train_data, train_labels = dataset[0]
    validation_data, validation_labels = dataset[1]
    test_data, test_labels = dataset[2]

    # Compute number of minibatches for each set
    n_train_batches = train_data.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = validation_data.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_data.get_value(borrow=True).shape[0] / batch_size

    data_dim = (28, 28)   # The dimension of each image in the dataset
    data_classes = 10     # The number of classes within the data

    # Build the model
    # ---------------

    # Allocate symbolic variables for data
    index = T.lscalar()   # This is the index to a minibatch
    x = T.matrix('x')     # Data (rasterized images)
    y = T.ivector('y')    # Labels (1d vector of ints)

    # Construct logistic regression class
    classifier = LogisticRegression(input=x, n_in=data_dim[0] * data_dim[1], n_out=data_classes)

    # Cost to minimize during training
    cost = classifier.negative_log_likelihood(y)

    # Compile a Theano function that computes mistakes made by the model on a minibatch
    test_model = th.function(inputs=[index],  # This function is for the test data
                             outputs=classifier.errors(y),
                             givens={x: test_data[index * batch_size: (index + 1) * batch_size],
                                     y: test_labels[index * batch_size: (index + 1) * batch_size]})
    validate_model = th.function(inputs=[index],  # This function is for the validation data
                                 outputs=classifier.errors(y),
                                 givens={x: validation_data[index * batch_size: (index + 1) * batch_size],
                                         y: validation_labels[index * batch_size: (index + 1) * batch_size]})

    # Compute the gradient of cost with respect to theta = (W,b)
    grad_W = T.grad(cost=cost, wrt=classifier.W)
    grad_b = T.grad(cost=cost, wrt=classifier.b)

    # Specify how to update model parameters as a list of (variable, update expression) pairs
    updates = [(classifier.W, classifier.W - learning_rate * grad_W),
               (classifier.b, classifier.b - learning_rate * grad_b)]

    # Compile Theano function that returns the cost and updates parameters of model based on update rules
    train_model = th.function(inputs=[index],  # Index in minibatch that defines x with label y
                              outputs=cost,    # Cost/loss associated with x,y
                              updates=updates,
                              givens={x: train_data[index * batch_size: (index + 1) * batch_size],
                                      y: train_labels[index * batch_size: (index + 1) * batch_size]})

    # Train the model
    # ---------------
    # Set up the early-stopping parameters
    patience = 5000                 # Minimum number of examples to examine
    patience_increase = 2           # How much longer to wait once a new best is found
    improvement_threshold = 0.995   # Value of a significant relative improvement
    validation_frequency = min(n_train_batches, patience / 2)  # Number of minibatches before validating
    best_validation_loss = np.inf
    test_score = 0
    start_time = time.clock()

    # Set up the training loop
    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            # Set the iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                # Compute the zero-one loss on the validation set
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch,
                      minibatch_index + 1, n_train_batches, this_validation_loss * 100.))

                # Check if current validation score is the best
                if this_validation_loss < best_validation_loss:
                    # Improve the patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    # Test on test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)
                    print('epoch %i, minibatch %i/%i, test error of best model %f %%' %
                          (epoch, minibatch_index + 1, n_train_batches, test_score * 100.))

            # Stop the loop if we have exhausted our patience
            if patience <= iter:
                done_looping = True
                break

    # The loop has ended so record the time it took
    end_time = time.clock()

    # Print out results and timing information
    print('Optimization complete with best validation score of %f %%, with test performance %f %%' %
          (best_validation_loss * 100., test_score * 100.))
    print 'The code ran for %d epochs with %f epochs/sec' % (epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' % ((end_time - start_time)))
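Each compiled function in this script takes only a minibatch index; `givens` substitutes the matching slice of a shared dataset for the symbolic `x` and `y`. Below is a tiny self-contained illustration of that pattern with toy data; all names in it are hypothetical.

import numpy as np
import theano
import theano.tensor as T

# a small shared "dataset" of 10 rows, 2 columns
shared_data = theano.shared(np.arange(20, dtype=theano.config.floatX).reshape(10, 2))
idx = T.lscalar('idx')
xv = T.matrix('xv')
toy_batch_size = 2
# the function receives only the minibatch index; givens maps xv to a slice of shared_data
f = theano.function([idx], T.sum(xv),
                    givens={xv: shared_data[idx * toy_batch_size: (idx + 1) * toy_batch_size]})
print(f(0))  # sums rows 0 and 1 of the shared data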
class SdA(object): """Stacked denoising auto-encoder class (SdA) A stacked denoising autoencoder model is obtained by stacking several dAs. The hidden layer of the dA at layer `i` becomes the input of the dA at layer `i+1`. The first layer dA gets as input the input of the SdA, and the hidden layer of the last dA represents the output. Note that after pretraining, the SdA is dealt with as a normal MLP, the dAs are only used to initialize the weights. """ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10, corruption_levels=[0.1, 0.1]): """ This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the sdA :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network :type corruption_levels: list of float :param corruption_levels: amount of corruption to use for each layer """ self.sigmoid_layers = [] self.dA_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2**30)) self.theano_rng = theano_rng # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels for i in range(self.n_layers): # n sigmoid layers and n dA layers # size of input is either hidden units of layer below, or input size for first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # input to this layer, is either: # activation of hidden layer below # or input to SDA if you are first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) self.sigmoid_layers.append(sigmoid_layer) self.params.extend(sigmoid_layer.params) dA_layer = DenoisingAutoEncoder(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, bhid=sigmoid_layer.b) self.dA_layers.append(dA_layer) self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) self.errors = self.logLayer.errors(self.y) def pretraining_functions(self, train_set_x, batch_size): ''' Generates a list of functions, each of them implementing one step in training the dA corresponding to the layer with same index. The function will require as input the minibatch index, and to train a dA you just need to iterate, calling the corresponding function on all minibatch indexes. 
:type train_set_x: theano.tensor.TensorType :param train_set_x: Shared variable that contains all datapoints used for training the dA :type batch_size: int :param batch_size: size of a [mini]batch :type learning_rate: float :param learning_rate: learning rate used during training for any of the dA layers ''' # index to a [mini]batch index = T.lscalar('index') # index to a minibatch corruption_level = T.scalar('corruption') learning_rate = T.scalar('lr') batch_begin = index * batch_size batch_end = batch_begin + batch_size pretrain_fns = [] for dA in self.dA_layers: #get cost and updates list cost, updates = dA.get_cost_updates(corruption_level, learning_rate) # compile theano function fn = theano.function( inputs=[ index, theano.In(corruption_level, value=0.2), theano.In(learning_rate, value=0.1) ], outputs=cost, updates=updates, givens={self.x: train_set_x[batch_begin:batch_end]}) pretrain_fns.append(fn) return pretrain_fns def build_finetune_functions(self, datasets, batch_size, learning_rate): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type datasets: list of pairs of theano.tensor.TensorType :param datasets: It is a list that contain all the datasets; the has to contain three pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano variables, one for the datapoints, the other for the labels :type batch_size: int :param batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during finetune stage ''' (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x, test_set_y) = datasets[2] # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_valid_batches //= batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_test_batches //= batch_size index = T.lscalar('index') # index to a [mini]batch # compute gradients with respect to model parameters (backprop happens here??) gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = [(param, param - gparam * learning_rate) for param, gparam in zip(self.params, gparams)] train_fn = theano.function( inputs=[index], outputs=self.finetune_cost, updates=updates, givens={ self.x: train_set_x[index * batch_size:(index + 1) * batch_size], self.y: train_set_y[index * batch_size:(index + 1) * batch_size] }, name='train') test_score_i = theano.function( inputs=[index], outputs=self.errors, givens={ self.x: test_set_x[index * batch_size:(index + 1) * batch_size], self.y: test_set_y[index * batch_size:(index + 1) * batch_size] }, name='test') valid_score_i = theano.function( [index], self.errors, givens={ self.x: valid_set_x[index * batch_size:(index + 1) * batch_size], self.y: valid_set_y[index * batch_size:(index + 1) * batch_size] }, name='valid') # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in range(n_valid_batches)] # Create a function that scans the entire test set def test_score(): return [test_score_i(i) for i in range(n_test_batches)] return train_fn, valid_score, test_score
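A possible driver for the SdA defined above, sketched under the assumption that `datasets` and `train_set_x` are shared variables prepared as in the other scripts in this document; all hyper-parameter values are placeholders.

import numpy

numpy_rng = numpy.random.RandomState(89677)            # illustrative seed
sda = SdA(numpy_rng=numpy_rng, n_ins=28 * 28,
          hidden_layers_sizes=[500, 500], n_outs=10,
          corruption_levels=[0.1, 0.2])

batch_size = 20                                         # placeholder value
# `train_set_x` / `datasets` are assumed to be shared variables prepared elsewhere
pretrain_fns = sda.pretraining_functions(train_set_x=train_set_x, batch_size=batch_size)
n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

# layer-wise pretraining: one denoising autoencoder at a time
for i, pretrain in enumerate(pretrain_fns):
    for epoch in range(15):
        costs = [pretrain(index=bi, corruption=0.2, lr=0.001) for bi in range(n_train_batches)]
        print('pretraining layer %i, epoch %d, cost %f' % (i, epoch, numpy.mean(costs)))

# supervised fine-tuning with the compiled train/validate/test functions
train_fn, valid_score, test_score = sda.build_finetune_functions(
    datasets=datasets, batch_size=batch_size, learning_rate=0.1)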
def optimize_cnn_lenet(learning_rate=0.01, n_epochs=200, dataset='data/mnist.pkl.gz', batch_size=500, n_hidden=500,
                       nkerns=[20, 50], rng=np.random.RandomState(23455)):
    print '... load training set'
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    # minibatch index
    index = T.lscalar()
    # symbolic variable for the data
    x = T.matrix('x')
    # symbolic variable for the labels
    y = T.ivector('y')

    print '... building the model'

    # Reshape the matrix of rasterized images, (batch_size, 28*28), into a 4D tensor so that it is
    # compatible with LeNetConvPoolLayer. The added 1 is the number of channels; the images here
    # are greyscale, so there is a single channel.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # nkerns[0] (the number of filters) is 20
    layer0 = ConvLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28),
                       filter_shape=(nkerns[0], 1, 5, 5))
    layer1 = PoolLayer(layer0.output, poolsize=(2, 2))

    # nkerns[1] (the number of filters) is 50
    layer2 = ConvLayer(rng, input=layer1.output, image_shape=(batch_size, nkerns[0], 12, 12),
                       filter_shape=(nkerns[1], nkerns[0], 5, 5))
    layer3 = PoolLayer(layer2.output, poolsize=(2, 2))

    # The pooled output is a 4D tensor holding 50 channels of 4x4-pixel images, which cannot be fed
    # to the multilayer perceptron as-is, so it is flattened into a 4*4*50 = 800-dimensional vector:
    # from (batch_size, 50, 4, 4) to (batch_size, 800).
    layer4_input = layer3.output.flatten(2)

    # Hidden layer with 500 units; n_in is the size of the flattened input vector built above.
    layer4 = HiddenLayer(rng, input=layer4_input, n_in=nkerns[1] * 4 * 4, n_out=n_hidden, activation=T.tanh)

    # The hidden-layer output has 500 units.
    layer5 = LogisticRegression(input=layer4.output, n_in=n_hidden, n_out=10)

    # cost (an ordinary multilayer perceptron needs an explicit regularization term,
    # but the CNN structure itself already has a regularizing effect)
    cost = layer5.negative_log_likelihood(y)

    # test model: computes the errors on the minibatch selected from the input index via givens
    test_model = theano.function([index], layer5.errors(y),
                                 givens={x: test_set_x[index * batch_size: (index + 1) * batch_size],
                                         y: test_set_y[index * batch_size: (index + 1) * batch_size]})
    # validation model
    validate_model = theano.function([index], layer5.errors(y),
                                     givens={x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                                             y: valid_set_y[index * batch_size: (index + 1) * batch_size]})

    # parameters to differentiate with respect to (the pooling layers have no parameters)
    params = layer5.params + layer4.params + layer2.params + layer0.params
    # gradients of the cost with respect to the parameters
    grads = T.grad(cost, params)
    # parameter updates
    updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)]
    # train model
    train_model = theano.function(inputs=[index], outputs=cost, updates=updates,
                                  givens={x: train_set_x[index * batch_size: (index + 1) * batch_size],
                                          y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    # optimize
    print "train model ..."
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    fp1 = open('log/lenet_validation_error.txt', 'w')
    fp2 = open('log/lenet_test_error.txt', 'w')

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                # pass each validation index to the function that computes the validation error
                # and collect the results as an array
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                # average the losses into a single score
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f ' %
                      (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.))
                fp1.write("%d\t%f\n" % (epoch, this_validation_loss * 100))
                if this_validation_loss < best_validation_loss:
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # pass each test index to the function that computes the test error
                    # and collect the results as an array
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    # average the losses into a single score
                    test_score = np.mean(test_losses)
                    print('epoch %i, minibatch %i/%i, test error %f ' %
                          (epoch, minibatch_index + 1, n_train_batches, test_score * 100.))
                    fp2.write("%d\t%f\n" % (epoch, test_score * 100))
            if patience <= iter:
                done_looping = True
                break

    fp1.close()
    fp2.close()
    end_time = timeit.default_timer()
    print(('optimization complete. Best validation score of %f obtained at iteration %i, '
           'with test performance %f') % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('This code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    import cPickle
    cPickle.dump(layer0, open("model/cnn_layer0.pkl", "wb"))
    cPickle.dump(layer2, open("model/cnn_layer2.pkl", "wb"))
    cPickle.dump(layer4, open("model/cnn_layer4.pkl", "wb"))
    cPickle.dump(layer5, open("model/cnn_layer5.pkl", "wb"))
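The pickled layers can later be restored for inspection or reuse. A hedged sketch follows, assuming the same ConvLayer/PoolLayer/HiddenLayer/LogisticRegression classes are importable at load time and that they expose their weights as the `W` shared variable used above.

import cPickle

# restore the layers saved by optimize_cnn_lenet
layer0 = cPickle.load(open("model/cnn_layer0.pkl", "rb"))
layer2 = cPickle.load(open("model/cnn_layer2.pkl", "rb"))
layer4 = cPickle.load(open("model/cnn_layer4.pkl", "rb"))
layer5 = cPickle.load(open("model/cnn_layer5.pkl", "rb"))

# e.g. inspect the learned first-layer kernels; expected shape (nkerns[0], 1, 5, 5)
print layer0.W.get_value().shape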
def evaluate_model(learning_rate=0.001, n_epochs=100, nkerns=[16, 40, 50, 60], batch_size=20): """ Network for classification of MNIST database :type learning_rate: float :param learning_rate: this is the initial learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type nkerns: list of ints :param nkerns: number of kernels on each layer :type batch_size: int :param batch_size: the batch size for training """ print("Evaluating model") rng = numpy.random.RandomState(23455) # loading the data1 datasets = load_test_data(1) valid_set_x, valid_set_y = datasets[0] test_set_x, test_set_y = datasets[1] # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_valid_batches //= batch_size n_test_batches //= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels loaded_params = numpy.load('../saved_models/model1.npy') layer4_W, layer4_b, layer3_W, layer3_b, layer2_W, layer2_b, layer1_W, layer1_b, layer0_W, layer0_b = loaded_params ###################### # BUILD ACTUAL MODEL # ###################### print('Building the model...') # Reshape matrix of rasterized images of shape (batch_size, 32 * 32) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (32, 32) is the size of MNIST images. layer0_input = x.reshape((batch_size, 1, 64, 88)) # Construct the first convolutional pooling layer: # filtering does not reduce the layer size because we use padding # maxpooling reduces the size to (32/2, 32/2) = (16, 16) # 4D output tensor is thus of shape (batch_size, nkerns[0], 16, 16) layer0 = MyConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 64, 88), p1=2, p2=2, filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2), W=layer0_W, b=layer0_b) # Construct the second convolutional pooling layer: # filtering does not reduce the layer size because we use padding # maxpooling reduces the size to (16/2, 16/2) = (8, 8) # 4D output tensor is thus of shape (batch_size, nkerns[1], 5, 5) layer1 = MyConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 32, 44), p1=2, p2=2, filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2), W=layer1_W, b=layer1_b) # Construct the third convolutional pooling layer # filtering does not reduce the layer size because we use padding # maxpooling reduces the size to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[2], 4, 4) layer2 = MyConvPoolLayer(rng, input=layer1.output, image_shape=(batch_size, nkerns[1], 16, 22), p1=2, p2=2, filter_shape=(nkerns[2], nkerns[1], 5, 5), poolsize=(2, 2), W=layer2_W, b=layer2_b) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[2] * 4 * 4), # or (500, 20 * 4 * 4) = (500, 320) with the default values. 
layer3_input = layer2.output.flatten(2) # construct a fully-connected sigmoidal layer layer3 = HiddenLayer(rng, input=layer3_input, n_in=nkerns[2] * 8 * 11, n_out=800, activation=T.tanh, W=layer3_W, b=layer3_b) # classify the values of the fully-connected sigmoidal layer layer4 = LogisticRegression(input=layer3.output, n_in=800, n_out=6, W=layer4_W, b=layer4_b) cost = layer4.negative_log_likelihood(y) predicted_output = layer4.y_pred # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer4.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) val_model_preds = theano.function( [index], layer4.prediction(), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], }) validate_model = theano.function( [index], layer4.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params val_preds = [val_model_preds(i) for i in range(n_valid_batches)] #print(val_preds) #preds = numpy(val_preds) preds = [] for pred in val_preds: for p in pred: preds.append(p) #preds = val_preds.reshape(valid_set_x.get_value(borrow=True).shape[0]) actual_labels = load_test_data(1, 2) n = len(actual_labels) confusion_matrix = numpy.zeros((6, 6)) for i in range(n): confusion_matrix[int(actual_labels[i])][preds[i]] += 1 print(confusion_matrix) correct = 0.0 for i in range(n): if (preds[i] == int(actual_labels[i])): correct += 1.0 accuracy = correct / n print("Number of correctly classified : ", correct) print("Test accuracy is", accuracy * 100)
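Since the confusion matrix above stores actual labels on the rows and predictions on the columns, per-class precision and recall follow directly. The snippet below is an optional follow-up, not part of the original script, and reuses the `confusion_matrix` computed above.

import numpy

col_sums = confusion_matrix.sum(axis=0)    # how often each class was predicted
row_sums = confusion_matrix.sum(axis=1)    # how often each class actually occurred
diag = numpy.diag(confusion_matrix)        # correct predictions per class
precision = diag / numpy.maximum(col_sums, 1)
recall = diag / numpy.maximum(row_sums, 1)
for c in range(confusion_matrix.shape[0]):
    print('class %d: precision %.3f, recall %.3f' % (c, precision[c], recall[c]))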
def train_CNN_mini_batch(learning_rate, n_epochs, num_kernels, batch_size, filter_size, is_multi_scale, num_of_classes, height, width, use_interpolation, use_hidden_layer): train_set_x_by_1, train_set_y, valid_set_x_by_1, valid_set_y, test_set_x_by_1, test_set_y, train_set_x_by_2, \ train_set_x_by_4, valid_set_x_by_2, valid_set_x_by_4, test_set_x_by_2, test_set_x_by_4 \ = load_processed_img_data() n_train_batches = train_set_x_by_1.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x_by_1.get_value(borrow=True).shape[0] n_test_batches = test_set_x_by_1.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size index = theano.tensor.lscalar() x_by_1 = theano.tensor.ftensor4('x_by_1') x_by_2 = theano.tensor.ftensor4('x_by_2') x_by_4 = theano.tensor.ftensor4('x_by_4') y = theano.tensor.ivector('y') print '... initialize the model' cnn_dir = 'models/CNN_' if is_multi_scale is True: cnn_dir += 'M_' else: cnn_dir += 'S_' if use_hidden_layer is True: cnn_dir += 'H_' else: cnn_dir += 'L_' if use_interpolation is True: cnn_dir += 'I_' else: cnn_dir += 'N_' cnn_dir = cnn_dir + str(num_kernels[0]) + '_' + str( num_kernels[1]) + '_' + str( num_kernels[2]) + '_' + str(batch_size) + '_' curr_date = str(datetime.date.today()) curr_date = curr_date.replace('-', '_') cnn_dir = cnn_dir + curr_date + str(time.strftime('_%H_%M_%S')) print 'CNN model is ', cnn_dir if not os.path.exists(cnn_dir): os.makedirs(cnn_dir) class Logger(object): def __init__(self): self.terminal = sys.stdout self.log = open(cnn_dir + '/log.txt', 'w') def write(self, message): self.terminal.write(message) self.log.write(message) sys.stdout = Logger() layer0 = CNN_Layer( name='Layer_0', W=None, b=None, filter_shape=(num_kernels[0], 3, filter_size, filter_size), ) layer1 = CNN_Layer( name='Layer_1', W=None, b=None, filter_shape=(num_kernels[1], num_kernels[0], filter_size, filter_size), ) layer2 = CNN_Layer( name='Layer_2', W=None, b=None, filter_shape=(num_kernels[2], num_kernels[1], filter_size, filter_size), ) layer3 = HiddenLayer(name='Layer_3', W=None, b=None, n_in=num_kernels[2] * 3 if is_multi_scale is True else num_kernels[2], n_out=num_kernels[2] * 4 if is_multi_scale is True else num_kernels[2] * 2, activation=theano.tensor.tanh) if is_multi_scale and use_hidden_layer: layer4_in = num_kernels[2] * 4 elif is_multi_scale and not use_hidden_layer: layer4_in = num_kernels[2] * 3 elif not is_multi_scale and use_hidden_layer: layer4_in = num_kernels[2] * 2 else: layer4_in = num_kernels[2] layer4 = LogisticRegression( name='Layer_4', W=None, b=None, n_in=layer4_in, n_out=num_of_classes, ) forward_propagation(layer0=layer0, layer1=layer1, layer2=layer2, layer3=layer3, layer4=layer4, x_by_1=x_by_1, x_by_2=x_by_2, x_by_4=x_by_4, num_kernels=num_kernels, batch_size=batch_size, filter_size=filter_size, is_multi_scale=is_multi_scale, height=height, width=width, use_interpolation=use_interpolation, use_hidden_layer=use_hidden_layer) if use_hidden_layer is True: L2_norm = (layer4.W**2).sum() + (layer3.W**2).sum() + ( layer2.W**2).sum() + (layer1.W**2).sum() + (layer0.W**2).sum() else: L2_norm = (layer4.W**2).sum() + (layer2.W**2).sum() + ( layer1.W**2).sum() + (layer0.W**2).sum() regularization = 0.00001 cost = layer4.negative_log_likelihood(y) + (regularization * L2_norm) if is_multi_scale is True: test_model = theano.function( [index], layer4.errors(y), givens={ x_by_1: test_set_x_by_1[index * batch_size:(index + 1) * batch_size], x_by_2: test_set_x_by_2[index * 
batch_size:(index + 1) * batch_size], x_by_4: test_set_x_by_4[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size * height * width:(index + 1) * batch_size * height * width] }) else: test_model = theano.function( [index], layer4.errors(y), givens={ x_by_1: test_set_x_by_1[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size * height * width:(index + 1) * batch_size * height * width] }) if is_multi_scale is True: validate_model = theano.function( [index], layer4.errors(y), givens={ x_by_1: valid_set_x_by_1[index * batch_size:(index + 1) * batch_size], x_by_2: valid_set_x_by_2[index * batch_size:(index + 1) * batch_size], x_by_4: valid_set_x_by_4[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size * height * width:(index + 1) * batch_size * height * width] }) else: validate_model = theano.function( [index], layer4.errors(y), givens={ x_by_1: valid_set_x_by_1[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size * height * width:(index + 1) * batch_size * height * width] }) if use_hidden_layer is True: params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params else: params = layer4.params + layer2.params + layer1.params + layer0.params grads = theano.tensor.grad(cost, params) updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] if is_multi_scale is True: train_model = theano.function( [index], cost, updates=updates, givens={ x_by_1: train_set_x_by_1[index * batch_size:(index + 1) * batch_size], x_by_2: train_set_x_by_2[index * batch_size:(index + 1) * batch_size], x_by_4: train_set_x_by_4[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size * width * height:(index + 1) * batch_size * width * height] }) else: train_model = theano.function( [index], cost, updates=updates, givens={ x_by_1: train_set_x_by_1[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size * width * height:(index + 1) * batch_size * width * height] }) print '... training the model' patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is found improvement_threshold = 0.995 # a relative improvement of this much is considered significant validation_frequency = min(n_train_batches, patience / 2) best_layer_0_W = numpy.zeros_like(layer0.W.get_value()) best_layer_0_b = numpy.zeros_like(layer0.b.get_value()) best_layer_1_W = numpy.zeros_like(layer1.W.get_value()) best_layer_1_b = numpy.zeros_like(layer1.b.get_value()) best_layer_2_W = numpy.zeros_like(layer2.W.get_value()) best_layer_2_b = numpy.zeros_like(layer2.b.get_value()) best_layer_3_W = numpy.zeros_like(layer3.W.get_value()) best_layer_3_b = numpy.zeros_like(layer3.b.get_value()) best_layer_4_W = numpy.zeros_like(layer4.W.get_value()) best_layer_4_b = numpy.zeros_like(layer4.b.get_value()) best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch += 1 for mini_batch_index in xrange(n_train_batches): start = time.clock() iter = (epoch - 1) * n_train_batches + mini_batch_index cost_ij = train_model(mini_batch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, mini-batch %i/%i, validation error %f %%' % (epoch, mini_batch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: # improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # save best filters best_layer_0_W = layer0.W.get_value() best_layer_0_b = layer0.b.get_value() best_layer_1_W = layer1.W.get_value() best_layer_1_b = layer1.b.get_value() best_layer_2_W = layer2.W.get_value() best_layer_2_b = layer2.b.get_value() best_layer_3_W = layer3.W.get_value() best_layer_3_b = layer3.b.get_value() best_layer_4_W = layer4.W.get_value() best_layer_4_b = layer4.b.get_value() # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, mini-batch %i/%i, test error of ' 'best model %f %%') % (epoch, mini_batch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break print 'training @ iter = %d, time taken = %f' % (iter, (time.clock() - start)) end_time = time.clock() print('Optimization complete.') print( 'Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) if not os.path.exists(cnn_dir + '/params'): os.makedirs(cnn_dir + '/params') numpy.save(cnn_dir + '/params/layer_0_W.npy', best_layer_0_W) numpy.save(cnn_dir + '/params/layer_0_b.npy', best_layer_0_b) numpy.save(cnn_dir + '/params/layer_1_W.npy', best_layer_1_W) numpy.save(cnn_dir + '/params/layer_1_b.npy', best_layer_1_b) numpy.save(cnn_dir + '/params/layer_2_W.npy', best_layer_2_W) numpy.save(cnn_dir + '/params/layer_2_b.npy', best_layer_2_b) numpy.save(cnn_dir + '/params/layer_3_W.npy', best_layer_3_W) numpy.save(cnn_dir + '/params/layer_3_b.npy', best_layer_3_b) numpy.save(cnn_dir + '/params/layer_4_W.npy', best_layer_4_W) numpy.save(cnn_dir + '/params/layer_4_b.npy', best_layer_4_b) numpy.save(cnn_dir + '/params/filer_kernels.npy', num_kernels) numpy.save(cnn_dir + '/params/filter_size.npy', filter_size) return cnn_dir
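A hypothetical invocation of the trainer above; every hyper-parameter value here is illustrative rather than taken from the original experiments.

# illustrative call; the returned directory contains the saved best parameters
model_dir = train_CNN_mini_batch(learning_rate=0.001,
                                 n_epochs=100,
                                 num_kernels=[16, 32, 64],
                                 batch_size=10,
                                 filter_size=5,
                                 is_multi_scale=True,
                                 num_of_classes=8,
                                 height=120,
                                 width=160,
                                 use_interpolation=True,
                                 use_hidden_layer=True)
print 'best parameters saved under ' + model_dir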
def evaluate_cifar(learning_rate=0.001, n_epochs=100, dataset_folder='cifar-10-batches-py', nkerns=[16, 20, 20], batch_size=32): """ Network for classification of MNIST database :type learning_rate: float :param learning_rate: this is the initial learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset_folder: string :param dataset_folder: the folder containing the batch files for cifar :type nkerns: list of ints :param nkerns: number of kernels on each layer :type batch_size: int :param batch_size: the batch size for training """ rng = numpy.random.RandomState(23455) # loading the cifar data datasets = load_cifar_data(dataset_folder) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches //= batch_size n_valid_batches //= batch_size n_test_batches //= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print('Building the model...') # Reshape matrix of rasterized images of shape (batch_size, 32 * 32) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (32, 32) is the size of MNIST images. layer0_input = x.reshape((batch_size, 3, 32, 32)) # Construct the first convolutional pooling layer: # filtering does not reduce the layer size because we use padding # maxpooling reduces the size to (32/2, 32/2) = (16, 16) # 4D output tensor is thus of shape (batch_size, nkerns[0], 16, 16) layer0 = MyConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 3, 32, 32), p1=2, p2=2, filter_shape=(nkerns[0], 3, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer: # filtering does not reduce the layer size because we use padding # maxpooling reduces the size to (16/2, 16/2) = (8, 8) # 4D output tensor is thus of shape (batch_size, nkerns[1], 5, 5) layer1 = MyConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 16, 16), p1=2, p2=2, filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # Construct the third convolutional pooling layer # filtering does not reduce the layer size because we use padding # maxpooling reduces the size to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[2], 4, 4) layer2 = MyConvPoolLayer(rng, input=layer1.output, image_shape=(batch_size, nkerns[1], 8, 8), p1=2, p2=2, filter_shape=(nkerns[2], nkerns[1], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[2] * 4 * 4), # or (500, 20 * 4 * 4) = (500, 320) with the default values. 
layer3_input = layer2.output.flatten(2) # construct a fully-connected sigmoidal layer layer3 = HiddenLayer(rng, input=layer3_input, n_in=nkerns[2] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer4 = LogisticRegression(input=layer3.output, n_in=500, n_out=5) # the cost we minimize during training is the NLL of the model cost = layer4.negative_log_likelihood(y) predicted_output = layer4.y_pred # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer4.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer4.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # the learning rate for batch SGD (adaptive learning rate) l_rate = T.scalar('l_rate', dtype=theano.config.floatX) # the momentum SGD momentum = T.scalar('momentum', dtype=theano.config.floatX) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [] for param in params: previous_step = theano.shared(param.get_value() * 0., broadcastable=param.broadcastable) step = momentum * previous_step - l_rate * T.grad(cost, param) updates.append((previous_step, step)) updates.append((param, param + step)) train_model = theano.function( [index, l_rate, momentum], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-1 ############### # TRAIN MODEL # ############### print('Training...') # early-stopping parameters patience = 50000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = timeit.default_timer() epoch = 0 done_looping = False # initializing the adaptive leaning rate adaptive_learning_rate = learning_rate # initializing the momentum momentum = 0.9 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 if epoch % 10 == 0: # decreasing the learning rate after every 10 epochs adaptive_learning_rate = 0.95 * adaptive_learning_rate # increasing the learning rate after every 10 epochs momentum = 1.05 * momentum for minibatch_index in range(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print('training @ iter = ', iter) cost_ij = train_model(minibatch_index, adaptive_learning_rate, momentum) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: # increase the learning rate by small amount (adaptive) adaptive_learning_rate = 1.01 * adaptive_learning_rate #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) else: # decrease the learning rate by small amount (adaptive) adaptive_learning_rate = 0.5 * adaptive_learning_rate if patience <= iter: done_looping = True break end_time = timeit.default_timer() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print( ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
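The `updates` list compiled above implements classical momentum SGD: each parameter keeps a `previous_step` buffer, and the new step combines that buffer with the current gradient. Here is a minimal NumPy sketch of a single update; the helper name is hypothetical.

import numpy as np

def momentum_sgd_step(param, grad, previous_step, l_rate, momentum):
    """One classical-momentum update, mirroring the symbolic updates above.
    Returns the updated parameter and the new step buffer."""
    step = momentum * previous_step - l_rate * grad
    return param + step, step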
class RRNN(object): """Recurrent ReLU Neural Network """ def __init__(self, numpy_rng, theano_rng=None, n_ins=N_FEATURES * N_FRAMES, relu_layers_sizes=[1024, 1024, 1024], recurrent_connections=[2], # layer(s), can only be i^t -> i^{t+1} n_outs=62 * 3, rho=0.9, eps=1.E-6): """ TODO """ self.relu_layers = [] self.params = [] self.n_layers = len(relu_layers_sizes) self._rho = rho # ``momentum'' for adadelta self._eps = eps # epsilon for adadelta self._accugrads = [] # for adadelta self._accudeltas = [] # for adadelta self.n_outs = n_outs assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) self.x = T.fmatrix('x') self.y = T.ivector('y') for i in xrange(self.n_layers): if i == 0: input_size = n_ins else: input_size = relu_layers_sizes[i-1] if i == 0: layer_input = self.x else: layer_input = self.relu_layers[-1].output if i in recurrent_connections: inputr_size = relu_layers_sizes[i] previous_output = T.fmatrix('previous_output') relu_layer = RecurrentReLU(rng=numpy_rng, input=layer_input, in_stack=previous_output, n_in=input_size, n_in_stack=inputr_size, n_out=inputr_size) #relu_layer.in_stack = relu_layer.output # TODO TODO TODO self.params.extend(relu_layer.params) self._accugrads.extend([shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[0], ), dtype='float32'), name='accugrad_b', borrow=True), shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]), dtype='float32'), name='accugrad_Ws', borrow=True)]) self._accudeltas.extend([shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[0], ), dtype='float32'), name='accudelta_b', borrow=True), shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]), dtype='float32'), name='accudelta_Ws', borrow=True)]) else: relu_layer = ReLU(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=relu_layers_sizes[i]) self.params.extend(relu_layer.params) self._accugrads.extend([shared(value=numpy.zeros((input_size, relu_layers_sizes[i]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[i], ), dtype='float32'), name='accugrad_b', borrow=True)]) self._accudeltas.extend([shared(value=numpy.zeros((input_size, relu_layers_sizes[i]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[i], ), dtype='float32'), name='accudelta_b', borrow=True)]) self.relu_layers.append(relu_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.relu_layers[-1].output, n_in=relu_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) self._accugrads.extend([shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accugrad_b', borrow=True)]) self._accudeltas.extend([shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accudelta_b', borrow=True)]) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) self.finetune_cost_sum = self.logLayer.negative_log_likelihood_sum(self.y) # compute the gradients with respect to the 
model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) def get_SGD_trainer(self): """ Returns a plain SGD minibatch trainer with learning rate as param. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') learning_rate = T.fscalar('lr') # learning rate to use cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for param, gparam in zip(self.params, gparams): updates[param] = param - gparam * learning_rate train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y), theano.Param(learning_rate)], outputs=cost, updates=updates, givens={self.x: batch_x, self.y: batch_y}) return train_fn def get_adadelta_trainer(self): """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, accudelta, param, gparam in zip(self._accugrads, self._accudeltas, self.params, gparams): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y)], outputs=cost, updates=updates, givens={self.x: batch_x, self.y: batch_y}) return train_fn def get_adagrad_trainer(self): """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') learning_rate = T.fscalar('lr') # learning rate to use cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, param, gparam in zip(self._accugrads, self.params, gparams): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = accugrad + gparam * gparam dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam updates[param] = param + dx updates[accugrad] = agrad train_fn = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y), theano.Param(learning_rate)], outputs=cost, updates=updates, givens={self.x: batch_x, self.y: batch_y}) return train_fn def score_classif(self, given_set): """ Returns functions to get current classification scores. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') score = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y)], outputs=self.errors, givens={self.x: batch_x, self.y: batch_y}) # Create a function that scans the entire set given as input def scoref(): return [score(batch_x, batch_y) for batch_x, batch_y in given_set] return scoref
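`score_classif` returns a closure over a compiled scoring function, and `given_set` is expected to yield `(batch_x, batch_y)` pairs. A hedged usage sketch follows; the `rrnn` and `validation_batches` names are placeholders.

import numpy

scorer = rrnn.score_classif(validation_batches)   # validation_batches: iterable of (batch_x, batch_y)
batch_error_rates = scorer()                       # one mean error rate per minibatch
print numpy.mean(batch_error_rates)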
def evaluate_model(learning_rate=0.005, n_epochs=50, nkerns=[16, 40, 50, 60], batch_size=32): """ Network for classification :type learning_rate: float :param learning_rate: this is the initial learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type nkerns: list of ints :param nkerns: number of kernels on each layer :type batch_size: int :param batch_size: the batch size for training """ print("Evaluating model") rng = numpy.random.RandomState(23455) # loading the data datasets = load_data(3) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches //= batch_size n_valid_batches //= batch_size n_test_batches //= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print('Building the model...') layer0_input = x.reshape((batch_size, 1, 64, 88)) layer0 = MyConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 64, 88), p1=2, p2=2, filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) layer1 = MyConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 32, 44), p1=2, p2=2, filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) layer2 = MyConvPoolLayer(rng, input=layer1.output, image_shape=(batch_size, nkerns[1], 16, 22), p1=2, p2=2, filter_shape=(nkerns[2], nkerns[1], 5, 5), poolsize=(2, 2)) layer3_input = layer2.output.flatten(2) # construct a fully-connected sigmoidal layer layer3 = HiddenLayer(rng, input=layer3_input, n_in=nkerns[2] * 8 * 11, n_out=800, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer4 = LogisticRegression(input=layer3.output, n_in=800, n_out=6) # the cost we minimize during training is the NLL of the model cost = layer4.negative_log_likelihood(y) predicted_output = layer4.y_pred # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer4.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer4.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # the learning rate for batch SGD (adaptive learning rate) l_rate = T.scalar('l_rate', dtype=theano.config.floatX) adaptive_learning_rate = T.scalar('adaptive_learning_rate', dtype=theano.config.floatX) # the momentum SGD momentum = T.scalar('momentum', dtype=theano.config.floatX) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. 
    # We thus create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = []
    for param, grad in zip(params, grads):
        # one shared "velocity" per parameter for the momentum update
        previous_step = theano.shared(param.get_value() * 0.,
                                      broadcastable=param.broadcastable)
        step = momentum * previous_step - l_rate * grad
        updates.append((previous_step, step))
        updates.append((param, param + step))

    train_model = theano.function(
        [index, l_rate, momentum],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('Training...')

    # early-stopping parameters
    patience = 50000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                   # go through this many minibatches before
                                   # checking the network on the validation
                                   # set; in this case we check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    # initializing the adaptive learning rate
    adaptive_learning_rate = learning_rate
    # initializing the momentum
    momentum = 0.1
    a = 0.0001
    b = 0.3

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        if epoch % 5 == 0:
            # decrease the learning rate after every 5 epochs
            adaptive_learning_rate = 0.95 * adaptive_learning_rate
            # optionally increase the momentum as well
            #momentum = 1.005 * momentum
        for minibatch_index in range(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index, adaptive_learning_rate,
                                  momentum)

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # increase the learning rate by a small amount (adaptive)
                    adaptive_learning_rate += a

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # save the model (named best_params so the symbolic
                    # input variable x is not shadowed)
                    print("Saving model")
                    save_filename = "../saved_models/model3"
                    best_params = numpy.array([
                        layer4.W.get_value(), layer4.b.get_value(),
                        layer3.W.get_value(), layer3.b.get_value(),
                        layer2.W.get_value(), layer2.b.get_value(),
                        layer1.W.get_value(), layer1.b.get_value(),
                        layer0.W.get_value(), layer0.b.get_value()
                    ])
                    numpy.save(save_filename, best_params)
                    # alternative: pickle the parameter values instead
                    # f = open(save_filename, 'wb')
                    # cPickle.dump([param.get_value() for param in params], f,
                    #              protocol=cPickle.HIGHEST_PROTOCOL)

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1,
                           n_train_batches, test_score * 100.))
                else:
                    # decrease the learning rate by a small amount (adaptive)
                    adaptive_learning_rate = adaptive_learning_rate - \
                        (b * adaptive_learning_rate) + (0.01 * a)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)),
          file=sys.stderr)
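# Hedged sketch (an assumption, not part of the original source): the loop
# above stores the best parameters with numpy.save as one object array in the
# order [layer4.W, layer4.b, ..., layer0.W, layer0.b], and numpy.save appends
# the ".npy" suffix to "../saved_models/model3".  Restoring those values into
# a freshly rebuilt network of the same architecture could look like this;
# `layers` is a hypothetical list of the rebuilt layer objects in that order.
def _load_saved_model(layers, filename="../saved_models/model3.npy"):
    saved = numpy.load(filename, allow_pickle=True)  # object array of W/b values
    for layer, W_value, b_value in zip(layers, saved[0::2], saved[1::2]):
        layer.W.set_value(W_value)  # Theano shared variables accept set_value
        layer.b.set_value(b_value)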
class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of
    each other. The hidden layer of the RBM at layer `i` becomes the input
    of the RBM at layer `i+1`. The first layer RBM gets as input the input
    of the network, and the hidden layer of the last RBM represents the
    output. When used for classification, the DBN is treated as an MLP, by
    adding a logistic regression layer on top.
    """

    def __init__(self, numpy_rng, theano_rng=None,
                 n_ins=N_FEATURES * N_FRAMES,
                 hidden_layers_sizes=[1024, 1024],
                 n_outs=62 * 3,
                 rho=0.90, eps=1.E-6):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                          weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers' sizes, must contain
                                    at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        #self._rho = shared(numpy.cast['float32'](rho), name='rho')  # for adadelta
        #self._eps = shared(numpy.cast['float32'](eps), name='eps')  # for adadelta
        self._rho = rho
        self._eps = eps
        self._accugrads = []   # for adadelta
        self._accudeltas = []  # for adadelta

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.fmatrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels

        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM. We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.
        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # It's arguably a philosophical question... but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)
            self._accugrads.extend([
                shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]),
                                         dtype='float32'),
                       name='accugrad_W', borrow=True),
                shared(value=numpy.zeros((hidden_layers_sizes[i], ),
                                         dtype='float32'),
                       name='accugrad_b', borrow=True)
            ])  # TODO
            self._accudeltas.extend([
                shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]),
                                         dtype='float32'),
                       name='accudelta_W', borrow=True),
                shared(value=numpy.zeros((hidden_layers_sizes[i], ),
                                         dtype='float32'),
                       name='accudelta_b', borrow=True)
            ])  # TODO

            # Construct an RBM that shares weights with this layer
            if i == 0:
                rbm_layer = GRBM(numpy_rng=numpy_rng,
                                 theano_rng=theano_rng,
                                 input=layer_input,
                                 n_visible=input_size,
                                 n_hidden=hidden_layers_sizes[i],
                                 W=sigmoid_layer.W,
                                 hbias=sigmoid_layer.b)
            else:
                rbm_layer = RBM(numpy_rng=numpy_rng,
                                theano_rng=theano_rng,
                                input=layer_input,
                                n_visible=input_size,
                                n_hidden=hidden_layers_sizes[i],
                                W=sigmoid_layer.W,
                                hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)
        self._accugrads.extend([
            shared(value=numpy.zeros((hidden_layers_sizes[-1], n_outs),
                                     dtype='float32'),
                   name='accugrad_W', borrow=True),
            shared(value=numpy.zeros((n_outs, ), dtype='float32'),
                   name='accugrad_b', borrow=True)
        ])  # TODO
        self._accudeltas.extend([
            shared(value=numpy.zeros((hidden_layers_sizes[-1], n_outs),
                                     dtype='float32'),
                   name='accudelta_W', borrow=True),
            shared(value=numpy.zeros((n_outs, ), dtype='float32'),
                   name='accudelta_b', borrow=True)
        ])  # TODO

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.finetune_cost_sum = self.logLayer.negative_log_likelihood_sum(
            self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, k):
        batch_x = T.fmatrix('batch_x')
        learning_rate = T.scalar('lr')  # learning rate to use

        pretrain_fns = []
        for rbm in self.rbm_layers:
            # get the cost and the updates list
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            #markov_chain = shared(numpy.empty((batch_size, rbm.n_hidden),
            #                                  dtype='float32'), borrow=True)
            markov_chain = None
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=markov_chain, k=k)

            # compile the theano function
            fn = theano.function(
                inputs=[batch_x, theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: batch_x})

            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def get_SGD_trainer(self):
        """ Returns a plain SGD minibatch trainer with learning rate as param.
""" batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') learning_rate = T.fscalar('lr') # learning rate to use cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for param, gparam in zip(self.params, gparams): updates[param] = param - gparam * learning_rate train_fn = theano.function(inputs=[ theano.Param(batch_x), theano.Param(batch_y), theano.Param(learning_rate) ], outputs=cost, updates=updates, givens={ self.x: batch_x, self.y: batch_y }) return train_fn def get_adadelta_trainer(self): """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, accudelta, param, gparam in zip(self._accugrads, self._accudeltas, self.params, gparams): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam dx = -T.sqrt( (accudelta + self._eps) / (agrad + self._eps)) * gparam updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx updates[param] = param + dx updates[accugrad] = agrad train_fn = theano.function( inputs=[theano.Param(batch_x), theano.Param(batch_y)], outputs=cost, updates=updates, givens={ self.x: batch_x, self.y: batch_y }) return train_fn def get_adagrad_trainer(self): """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate. """ batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') learning_rate = T.fscalar('lr') # learning rate to use cost = self.finetune_cost_sum # compute the gradients with respect to the model parameters gparams = T.grad(cost, self.params) # compute list of fine-tuning updates updates = OrderedDict() for accugrad, param, gparam in zip(self._accugrads, self.params, gparams): # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) agrad = accugrad + gparam * gparam dx = -(learning_rate / T.sqrt(agrad + self._eps)) * gparam updates[param] = param + dx updates[accugrad] = agrad train_fn = theano.function(inputs=[ theano.Param(batch_x), theano.Param(batch_y), theano.Param(learning_rate) ], outputs=cost, updates=updates, givens={ self.x: batch_x, self.y: batch_y }) return train_fn def get_SAG_trainer(self): """ Returns a Stochastic Averaged Gradient (Bach & Moulines 2011) trainer. This is based on Bach 2013 slides: PRavg(theta_n) = Polyak-Ruppert averaging = (1+n)^{-1} * \sum_{k=0}^n theta_k theta_n = theta_{n-1} - gamma [ f'_n(PR_avg(theta_{n-1})) + f''_n(PR_avg( theta_{n-1})) * (theta_{n-1} - PR_avg(theta_{n-1}))] That returns two trainers: one for the first epoch, one for subsequent epochs. We use self._accudeltas to but the Polyak-Ruppert averaging, and self._accugrads for the number of iterations (updates). """ print "UNFINISHED, see TODO in get_SAG_trainer()" sys.exit(-1) batch_x = T.fmatrix('batch_x') batch_y = T.ivector('batch_y') learning_rate = T.fscalar('lr') # learning rate to use cost = self.finetune_cost_sum # First trainer: gparams = T.grad(cost, self.params) updates = OrderedDict() for accudelta, accugrad, param, gparam in zip(self._accudeltas, self._accugrads, self.params, gparams): theta = param - gparam * learning_rate updates[accudelta] = (theta + accudelta * accugrad) / (accugrad + 1.) 
            updates[param] = theta
            updates[accugrad] = accugrad + 1.

        train_fn_init = theano.function(inputs=[theano.Param(batch_x),
                                                theano.Param(batch_y),
                                                theano.Param(learning_rate)],
                                        outputs=cost,
                                        updates=updates,
                                        givens={self.x: batch_x,
                                                self.y: batch_y})

        # Second trainer:
        gparams = T.grad(cost, self._accudeltas)
        # TODO recreate the network with self._accudeltas instead of
        # self.params so that we can compute the cost
        hparams = T.grad(cost, gparams)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accudelta, accugrad, param, gparam, hparam in zip(self._accudeltas,
                self._accugrads, self.params, gparams, hparams):
            theta = param - learning_rate * (gparam
                                             + hparam * (param - accudelta))
            updates[accudelta] = (theta + accudelta * accugrad) / (accugrad + 1.)
            updates[param] = theta
            updates[accugrad] = accugrad + 1.

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y),
                                           theano.Param(learning_rate)],
                                   outputs=cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn_init, train_fn

    def get_SGD_ld_trainer(self):
        """ Returns an SGD-ld trainer (Schaul et al. 2012). """
        print("UNFINISHED, see TODO in get_SGD_ld_trainer()")
        sys.exit(-1)

        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # INIT TODO

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, accudelta, accuhess, param, gparam in zip(
                self._accugrads, self._accudeltas, self._accuhess,
                self.params, gparams):
            pass  # TODO
            # TODO
            # TODO

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y)],
                                   outputs=cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def score_classif(self, given_set):
        """ Returns functions to get current classification scores. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        score = theano.function(inputs=[theano.Param(batch_x),
                                        theano.Param(batch_y)],
                                outputs=self.errors,
                                givens={self.x: batch_x, self.y: batch_y})

        # Create a function that scans the entire set given as input
        def scoref():
            return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

        return scoref
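# Hedged usage sketch (an assumption, not part of the original source): how a
# DBN like the one above is typically driven -- greedy layer-wise RBM
# pre-training followed by supervised fine-tuning.  `train_set` and
# `valid_set` are hypothetical iterables of (float32 matrix, int32 vector)
# minibatch pairs; the constructor defaults rely on N_FEATURES / N_FRAMES
# from the surrounding module.
def _dbn_training_sketch(train_set, valid_set, pretrain_epochs=2,
                         finetune_epochs=10):
    dbn = DBN(numpy_rng=numpy.random.RandomState(123),
              hidden_layers_sizes=[1024, 1024])

    # unsupervised pre-training: one CD-1 trainer per RBM layer
    for layer_index, pretrain_fn in enumerate(dbn.pretraining_functions(k=1)):
        for epoch in xrange(pretrain_epochs):
            costs = [pretrain_fn(batch_x, 0.01) for batch_x, _ in train_set]
            print('layer %i, epoch %i, pre-training cost %f'
                  % (layer_index, epoch, numpy.mean(costs)))

    # supervised fine-tuning with Adadelta (no learning rate to tune)
    train_fn = dbn.get_adadelta_trainer()
    valid_scorer = dbn.score_classif(valid_set)
    for epoch in xrange(finetune_epochs):
        costs = [train_fn(batch_x, batch_y) for batch_x, batch_y in train_set]
        print('epoch %i: cost %f, validation error %f %%'
              % (epoch, numpy.mean(costs), numpy.mean(valid_scorer()) * 100.))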