def evaluate_lenet5(learning_rate=0.01, n_epochs=2000,
                    dataset='file_622.pkl.gz', display_filters=True,
                    nkerns=[32, 48, 64, 128, 256], batch_size=500):
    """ Train and evaluate a deep LeNet-style convolutional network.

    The input matrix is reshaped to single-channel images of size 100x46,
    passed through ten convolutional layers and a fully-connected tanh
    layer, and classified into 5 classes by a logistic regression layer.
    Training is plain minibatch SGD; progress is printed to stdout and the
    total run time to stderr. Nothing is returned.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: number of epochs to run the optimizer (early stopping
                     is disabled, so all epochs are always run)

    :type dataset: string
    :param dataset: path to the dataset, passed unchanged to ``load_data``

    :type display_filters: bool
    :param display_filters: accepted for interface compatibility; not used
                            by this function

    :type nkerns: list of ints
    :param nkerns: number of kernels for each group of convolutional
                   layers; five entries are read (indices 0 to 4). The
                   list is only indexed, never mutated.

    :type batch_size: int
    :param batch_size: number of examples per minibatch; the trailing
                       examples that do not fill a whole minibatch are
                       ignored
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of whole minibatches for training, validation and
    # testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 100 * 46)
    # to a 4D tensor, compatible with the convolutional layers.
    layer0_input = x.reshape((batch_size, 1, 100, 46))

    # First convolutional pooling layer. According to the image_shape
    # handed to the next layer:
    # filtering reduces the image size to (100-3+1, 46-3+1) = (98, 44)
    # maxpooling reduces this further to (98/2, 44/2) = (49, 22)
    layer0 = LeNetConvPoolLayer1(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 100, 46),
        filter_shape=(nkerns[0], 1, 3, 3),
        poolsize=(2, 2)
    )

    # Stack of 3x3 "Conv ReLU" layers. The image_shape values below shrink
    # by 2 per layer (49x22 -> 47x20 -> ... -> 35x8), i.e. they assume that
    # LeNetConvPoolLayer does not downsample even though poolsize is passed
    # -- NOTE(review): confirm against the LeNetConvPoolLayer definition.
    layer0_1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 49, 22),
        filter_shape=(nkerns[1], nkerns[0], 3, 3),
        poolsize=(2, 2)
    )

    layer0_2 = LeNetConvPoolLayer(
        rng,
        input=layer0_1.output,
        image_shape=(batch_size, nkerns[1], 47, 20),
        filter_shape=(nkerns[1], nkerns[1], 3, 3),
        poolsize=(2, 2)
    )

    layer0_3 = LeNetConvPoolLayer(
        rng,
        input=layer0_2.output,
        image_shape=(batch_size, nkerns[1], 45, 18),
        filter_shape=(nkerns[2], nkerns[1], 3, 3),
        poolsize=(2, 2)
    )

    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0_3.output,
        image_shape=(batch_size, nkerns[2], 43, 16),
        filter_shape=(nkerns[2], nkerns[2], 3, 3),
        poolsize=(2, 2)
    )

    layer1_1 = LeNetConvPoolLayer(
        rng,
        input=layer1.output,
        image_shape=(batch_size, nkerns[2], 41, 14),
        filter_shape=(nkerns[3], nkerns[2], 3, 3),
        poolsize=(2, 2)
    )

    layer1_2 = LeNetConvPoolLayer(
        rng,
        input=layer1_1.output,
        image_shape=(batch_size, nkerns[3], 39, 12),
        filter_shape=(nkerns[3], nkerns[3], 3, 3),
        poolsize=(2, 2)
    )

    layer1_3 = LeNetConvPoolLayer(
        rng,
        input=layer1_2.output,
        image_shape=(batch_size, nkerns[3], 37, 10),
        filter_shape=(nkerns[3], nkerns[3], 3, 3),
        poolsize=(2, 2)
    )

    # Last convolutional pooling layer:
    # filtering reduces the image size to (35-5+1, 8-5+1) = (31, 4)
    # maxpooling reduces this further to (31/2, 4/2) = (15, 2)
    # 4D output tensor is thus of shape (batch_size, nkerns[4], 15, 2)
    layer2_1 = LeNetConvPoolLayer1(
        rng,
        input=layer1_3.output,
        image_shape=(batch_size, nkerns[3], 35, 8),
        filter_shape=(nkerns[4], nkerns[3], 5, 5),
        poolsize=(2, 2)
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels). Flattening gives a matrix of shape
    # (batch_size, nkerns[4] * 15 * 2), matching n_in below.
    layer2_input = layer2_1.output.flatten(2)

    # construct a fully-connected tanh layer
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[4] * 15 * 2,
        n_out=800,
        activation=T.tanh
    )

    # classify the values of the fully-connected layer into 5 classes
    layer3 = LogisticRegression(input=layer2.output, n_in=800, n_out=5)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # create a list of all model parameters to be fit by gradient descent.
    # layer2 (the fully-connected layer) was missing from this list, which
    # left its weights frozen at their random initial values.
    params = (layer3.params + layer2.params + layer2_1.params +
              layer1_3.params + layer1_2.params + layer1_1.params +
              layer1.params + layer0_3.params + layer0_2.params +
              layer0_1.params + layer0.params)

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by SGD.
    # The updates list is built by looping over all (param, grad) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                print('training @ iter = ', iteration)
            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience,
                                       iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            # NOTE: early stopping is intentionally left disabled, as in
            # the original code (the check was wrapped in a string
            # literal). `patience` is still tracked so that the check
            #     if patience <= iteration: done_looping = True; break
            # can be re-enabled here.

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)),
          file=sys.stderr)
def evaluate_lenet5(initial_learning_rate=0.1, learning_rate_decay=1,
                    dropout_rates=[0.2, 0.2, 0.2, 0.5], n_epochs=200,
                    dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet with dropout on MNIST dataset

    Two parallel networks sharing the same parameters are built: a
    "dropout" network that is used for training, and a network without
    dropout, whose weights are the dropout network's weights scaled by
    (1 - dropout rate of the layer's input), that is used for validation
    and testing. Progress is printed to stdout and the total run time to
    stderr. Nothing is returned.

    :type initial_learning_rate: float
    :param initial_learning_rate: starting learning rate (factor for the
                                  stochastic gradient)

    :type learning_rate_decay: float
    :param learning_rate_decay: factor the learning rate is multiplied by
                                each time it is decayed (1 means learning
                                rate decay is deactivated)

    :type dropout_rates: list of float
    :param dropout_rates: dropout rate used for each layer (input layer,
                          1st filtered layer, 2nd filtered layer, fully
                          connected layer)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST
                    here), passed unchanged to ``load_data``

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: number of examples per minibatch; the trailing
                       examples that do not fill a whole minibatch are
                       ignored
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of whole minibatches for training, validation and
    # testing. Floor division keeps these integers on Python 2 and 3 alike.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    # shared variable so that the learning rate can be decayed in place
    learning_rate = theano.shared(
        numpy.asarray(initial_learning_rate, dtype=theano.config.floatX))

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))
    layer0_input_dropout = _dropout_from_layer(rng, layer0_input,
                                               dropout_rates[0])

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0_dropout = DropoutLeNetConvPoolLayer(
        rng,
        input=layer0_input_dropout,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2),
        dropout_rate=dropout_rates[1])
    # same parameters, weights scaled by the keep rate of this layer's input
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2),
        W=layer0_dropout.W * (1 - dropout_rates[0]),
        b=layer0_dropout.b)

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1_dropout = DropoutLeNetConvPoolLayer(
        rng,
        input=layer0_dropout.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2),
        dropout_rate=dropout_rates[2])
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2),
        W=layer1_dropout.W * (1 - dropout_rates[1]),
        b=layer1_dropout.b)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_dropout_input = layer1_dropout.output.flatten(2)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected tanh layer
    layer2_dropout = DropoutHiddenLayer(
        rng,
        input=layer2_dropout_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh,
        dropout_rate=dropout_rates[3])
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh,
        W=layer2_dropout.W * (1 - dropout_rates[2]),
        b=layer2_dropout.b)

    # classify the values of the fully-connected layer
    layer3_dropout = LogisticRegression(
        input=layer2_dropout.output,
        n_in=500, n_out=10)
    layer3 = LogisticRegression(
        input=layer2.output,
        n_in=500, n_out=10,
        W=layer3_dropout.W * (1 - dropout_rates[-1]),
        b=layer3_dropout.b)

    # the cost we minimize during training is the NLL of the dropout model
    dropout_cost = layer3_dropout.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    # (the network without dropout is used for evaluation)
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = (layer3_dropout.params + layer2_dropout.params +
              layer1_dropout.params + layer0_dropout.params)

    # create a list of gradients for all model parameters
    grads = T.grad(dropout_cost, params)

    # train_model is a function that updates the model parameters by SGD.
    # The updates list is built by looping over all (param, grad) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        dropout_cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # Theano function to decay the learning rate
    decay_learning_rate = theano.function(
        inputs=[],
        outputs=learning_rate,
        updates={learning_rate: learning_rate * learning_rate_decay})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                # two spaces after '=' reproduce the output of the former
                # Python 2 statement `print 'training @ iter = ', iter`
                print('training @ iter =  %s' % iteration)
            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience,
                                       iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

        # NOTE(review): indentation was lost in the source; the decay is
        # applied once per epoch here -- confirm this matches the intent.
        decay_learning_rate()

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    # written directly so that it works on both Python 2 and Python 3
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
def __init__(self, rng, input, filter_shapes, image_shape, poolsize,
             layer_sizes, dropout_rates, activations):
    """
    Allocate a cnn_mlp (ConvNet followed by MLP) with shared variable
    internal parameters.

    Two parallel stacks are built: ``self.dropout_layers`` (used for
    training) and ``self.layers`` (same parameters, with each weight
    matrix scaled by one minus the dropout rate of that layer's input).

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano.tensor.dtensor4
    :param input: symbolic image tensor, of shape image_shape

    :type filter_shapes: list of (list of length 4)
    :param filter_shapes: list of the filters with their respective
                          properties ((number of kernels, num input
                          feature maps, filter height, filter width), ...)
                          len(filter_shapes) = number of
                          LeNetConvPoolLayer layers

    :type image_shape: tuple or list of length 4
    :param image_shape: (batch size, num input feature maps,
                         image height, image width)

    :type poolsize: tuple or list of length 2
    :param poolsize: the downsampling (pooling) factor (#rows, #cols)

    :type layer_sizes: list of int
    :param layer_sizes: sizes (number of units) of each HiddenLayer;
                        consecutive pairs give (n_in, n_out) of each
                        fully-connected layer, the last pair being the
                        output layer. layer_sizes[0] must equal the
                        flattened size of the last convolutional output.

    :type dropout_rates: list of float
    :param dropout_rates: dropout rate used for each layer (including
                          dropout on the input)

    :type activations: list of theano.function
    :param activations: list of the activation functions to use at each
                        layer
    """

    #######################################
    # Set up all the convolutional layers #
    #######################################
    self.layers = []
    self.dropout_layers = []
    next_layer_input = input.reshape(image_shape)
    next_dropout_layer_input = _dropout_from_layer(
        rng, next_layer_input, p=dropout_rates[0])

    # index of the layer being built; shared by the convolutional and the
    # hidden layer loops to address dropout_rates and activations
    layer_counter = 0
    for filter_shape in filter_shapes:
        next_dropout_layer = DropoutLeNetConvPoolLayer(
            rng=rng,
            input=next_dropout_layer_input,
            image_shape=image_shape,
            filter_shape=filter_shape,
            poolsize=poolsize,
            dropout_rate=dropout_rates[layer_counter + 1],
            activation=activations[layer_counter])
        self.dropout_layers.append(next_dropout_layer)
        next_dropout_layer_input = next_dropout_layer.output

        # Reuse parameters from the dropout layer here
        next_layer = LeNetConvPoolLayer(
            rng=rng,
            input=next_layer_input,
            image_shape=image_shape,
            filter_shape=filter_shape,
            W=next_dropout_layer.W * (1 - dropout_rates[layer_counter]),
            b=next_dropout_layer.b,
            poolsize=poolsize,
            activation=activations[layer_counter])
        self.layers.append(next_layer)
        next_layer_input = next_layer.output

        # shape of this layer's output = input shape of the next layer.
        # Floor division keeps the sizes integral on Python 3 as well.
        image_shape = (
            image_shape[0],
            filter_shape[0],
            (image_shape[2] - filter_shape[2] + 1) // poolsize[0],
            (image_shape[3] - filter_shape[3] + 1) // poolsize[1])

        layer_counter += 1

    ################################
    # Set up all the hidden layers #
    ################################
    # materialised as a list because it is sliced and indexed below
    # (zip returns an iterator on Python 3)
    weight_matrix_sizes = list(zip(layer_sizes, layer_sizes[1:]))
    next_layer_input = next_layer_input.flatten(2)
    next_dropout_layer_input = next_dropout_layer_input.flatten(2)

    assert (
        layer_sizes[0] == numpy.prod(image_shape[1:])
    ), "The dimension of the first hidden layer does not match last convolutional layer size."

    for n_in, n_out in weight_matrix_sizes[:-1]:
        next_dropout_layer = DropoutHiddenLayer(
            rng=rng,
            input=next_dropout_layer_input,
            activation=activations[layer_counter],
            n_in=n_in,
            n_out=n_out,
            dropout_rate=dropout_rates[layer_counter + 1])
        self.dropout_layers.append(next_dropout_layer)
        next_dropout_layer_input = next_dropout_layer.output

        # Reuse the parameters from the dropout layer here
        next_layer = HiddenLayer(
            rng=rng,
            input=next_layer_input,
            activation=activations[layer_counter],
            # scale the weight matrix W with (1-p)
            W=next_dropout_layer.W * (1 - dropout_rates[layer_counter]),
            b=next_dropout_layer.b,
            n_in=n_in,
            n_out=n_out)
        self.layers.append(next_layer)
        next_layer_input = next_layer.output

        layer_counter += 1

    ###########################
    # Set up the output layer #
    ###########################
    n_in, n_out = weight_matrix_sizes[-1]
    dropout_output_layer = LogisticRegression(
        input=next_dropout_layer_input,
        n_in=n_in,
        n_out=n_out)
    self.dropout_layers.append(dropout_output_layer)

    # Again, reuse parameters in the dropout output.
    output_layer = LogisticRegression(
        input=next_layer_input,
        # scale the weight matrix W with (1-p)
        W=dropout_output_layer.W * (1 - dropout_rates[-1]),
        b=dropout_output_layer.b,
        n_in=n_in,
        n_out=n_out)
    self.layers.append(output_layer)

    # Use the negative log likelihood of the logistic regression layer as
    # the objective.
    self.dropout_negative_log_likelihood = self.dropout_layers[
        -1].negative_log_likelihood
    self.dropout_errors = self.dropout_layers[-1].errors

    self.negative_log_likelihood = self.layers[-1].negative_log_likelihood
    self.errors = self.layers[-1].errors

    # Grab all the parameters together (only the dropout layers own
    # parameters; the other layers use scaled views of them).
    self.params = [
        param for layer in self.dropout_layers for param in layer.params
    ]