def __init__(self, rng, input, n_hidden_out, n_out, nkerns, batch_size):
    """Wire up a LeNet-style network: two conv/pool stages, one tanh
    hidden layer, and a logistic-regression classifier on top.

    :param rng: numpy random state used to initialise all layer weights
    :param input: symbolic batch of flattened 28x28 images
    :param n_hidden_out: number of units in the fully-connected layer
    :param n_out: number of output classes
    :param nkerns: number of feature maps for each conv stage
    :param batch_size: rows per minibatch (needed for the 4D reshape)
    """
    # 4D input: (batch, channel, height, width)
    images = input.reshape((batch_size, 1, 28, 28))

    # 28x28 -> conv 5x5 -> 24x24 -> pool 2x2 -> 12x12
    self.layer0 = LeNetConvPoolLayer(rng, input=images,
                                     image_shape=(batch_size, 1, 28, 28),
                                     filter_shape=(nkerns[0], 1, 5, 5),
                                     poolsize=(2, 2))
    # 12x12 -> conv 5x5 -> 8x8 -> pool 2x2 -> 4x4
    self.layer1 = LeNetConvPoolLayer(rng, input=self.layer0.output,
                                     image_shape=(batch_size, nkerns[0], 12, 12),
                                     filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                     poolsize=(2, 2))
    # fully-connected layer over the flattened (batch, nkerns[1]*4*4) maps
    self.layer2 = HiddenLayer(rng, input=self.layer1.output.flatten(2),
                              n_in=nkerns[1] * 4 * 4,
                              n_out=n_hidden_out,
                              activation=T.tanh)
    self.logRegressionLayer = LogisticRegression(input=self.layer2.output,
                                                 n_in=n_hidden_out,
                                                 n_out=n_out)
    # expose the output layer's cost/error callables directly
    # (`negative_log_likehood` (sic) is the project class's spelling)
    self.negative_log_likelihood = (
        self.logRegressionLayer.negative_log_likehood)
    self.errors = self.logRegressionLayer.errors
    # all trainable parameters, in layer order
    self.params = (self.layer0.params + self.layer1.params +
                   self.layer2.params + self.logRegressionLayer.params)
    self.input = input
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10):
    """Build a DBN: a stack of sigmoid layers, each paired with an RBM
    that shares its weights, topped by a logistic-regression layer.

    :param numpy_rng: numpy random state used to draw initial weights
    :param theano_rng: Theano random stream; created from `numpy_rng`
                       when None
    :param n_ins: dimension of the input to the DBN
    :param hidden_layers_sizes: sizes of the intermediate layers; must
                                contain at least one value
    :param n_outs: dimension of the output of the network
    """
    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)
    assert self.n_layers > 0
    if not theano_rng:
        # FIX: original read `theano_rng = theano_rng = ...` (duplicated
        # assignment, harmless but clearly unintended)
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # symbolic inputs: rasterized images and integer labels
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    for i in range(self.n_layers):
        # the first layer reads the raw input; deeper layers read the
        # activation of the layer below (merged the two duplicate
        # `if i == 0` checks from the original)
        if i == 0:
            input_size = n_ins
            layer_input = self.x
        else:
            input_size = hidden_layers_sizes[i - 1]
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)
        # only the sigmoid layers' parameters count as DBN parameters
        self.params.extend(sigmoid_layer.params)

        # RBM sharing weights and hidden biases with the sigmoid layer
        rbm_layer = RBM(numpy_rng=numpy_rng,
                        theano_rng=theano_rng,
                        input=layer_input,
                        n_visible=input_size,
                        n_hidden=hidden_layers_sizes[i],
                        W=sigmoid_layer.W,
                        hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)

    # logistic-regression output layer on top of the last sigmoid layer
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # fine-tuning cost: NLL of the output layer
    # (`negative_log_likehood` (sic) matches the project class's spelling)
    self.finetune_cost = self.logLayer.negative_log_likehood(self.y)
    self.errors = self.logLayer.errors(self.y)
def __init__(self, np_rng, theano_rng=None, n_ins=784,
             hidden_layer_sizes=[500, 500], n_outs=10):
    """Stack sigmoid layers, each tied to a denoising autoencoder that
    shares its weights, and finish with a logistic-regression layer.

    :param np_rng: numpy random state for weight initialisation
    :param theano_rng: Theano random stream; derived from `np_rng` when None
    :param n_ins: dimensionality of the raw input
    :param hidden_layer_sizes: one entry per hidden layer (non-empty)
    :param n_outs: number of output classes
    """
    self.sigmoid_layers = []
    self.dA_layers = []
    self.params = []
    self.n_layers = len(hidden_layer_sizes)
    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(np_rng.randint(2 ** 30))

    # symbolic inputs: rasterized images and integer labels
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    for depth in xrange(self.n_layers):
        # layer 0 consumes the raw input; deeper layers consume the
        # previous sigmoid layer's activation
        if depth == 0:
            fan_in = n_ins
            layer_input = self.x
        else:
            fan_in = hidden_layer_sizes[depth - 1]
            layer_input = self.sigmoid_layers[-1].output
        fan_out = hidden_layer_sizes[depth]

        sigmoid_layer = HiddenLayer(np_rng, layer_input, fan_in, fan_out,
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)

        # autoencoder sharing W and the hidden bias with the sigmoid layer
        dA_layer = AutoEncoder(np_rng, fan_in, fan_out,
                               theano_rng=theano_rng,
                               input=layer_input,
                               W=sigmoid_layer.W,
                               b_hid=sigmoid_layer.b)
        self.dA_layers.append(dA_layer)

    # classifier on top of the deepest sigmoid layer
    self.log_layer = LogisticRegression(self.sigmoid_layers[-1].output,
                                        self.y,
                                        hidden_layer_sizes[-1],
                                        n_outs)
    self.params.extend(self.log_layer.params)

    # fine-tuning objective and error rate come from the output layer
    self.finetune_cost = self.log_layer.negative_log_likelihood()
    self.errors = self.log_layer.errors()
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='emotion', nkerns=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets = Ld.load_share(dataset) if dataset == 'mnist': ishape = (28, 28) # this is the size of MNIST images num_label = 10 elif dataset == 'emotion': ishape = (48, 48) # this is the size of MNIST images num_label = 7 train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, ishape[0], ishape[1])) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) if dataset == 'emotion': layer05 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 22, 22), filter_shape=(nkerns[0], nkerns[0], 3, 3), poolsize=(2, 2)) layer1 = LeNetConvPoolLayer(rng, input=layer05.output, image_shape=(batch_size, nkerns[0], 10, 10), filter_shape=(nkerns[1], nkerns[0], 3, 3), poolsize=(2, 2)) elif dataset == 'mnist': layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the TanhLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=num_label) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index + 1, n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print( (' epoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% 
obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.15, n_epochs=200, dataset='mnist.pkl.gz',
                    nkerns=[20, 20], batch_size=500):
    """ Demonstrates lenet on CIFAR-10 dataset

    :type learning_rate: float
    :param learning_rate: initial learning rate (lowered to 0.1 at epoch 10)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    rng = numpy.random.RandomState(23455)

    def shared_dataset(data_xy, borrow=True):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch
        everytime is needed (the default behaviour if the data is not in
        a shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(numpy.asarray(data_y,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        # GPU storage must be float, but labels are used as int indices,
        # so hand back a casted view of ``shared_y``
        return shared_x, T.cast(shared_y, 'int32')

    # load the five CIFAR-10 training batches plus the test batch
    data_batch_1 = unpickle('cifar-10-batches-py/data_batch_1')
    data_batch_2 = unpickle('cifar-10-batches-py/data_batch_2')
    data_batch_3 = unpickle('cifar-10-batches-py/data_batch_3')
    data_batch_4 = unpickle('cifar-10-batches-py/data_batch_4')
    data_batch_5 = unpickle('cifar-10-batches-py/data_batch_5')
    test = unpickle('cifar-10-batches-py/test_batch')

    X_train = numpy.concatenate((data_batch_1["data"],
                                 data_batch_2["data"],
                                 data_batch_3["data"],
                                 data_batch_4["data"],
                                 data_batch_5["data"]), axis=0)
    y_train = numpy.concatenate((data_batch_1["labels"],
                                 data_batch_2["labels"],
                                 data_batch_3["labels"],
                                 data_batch_4["labels"],
                                 data_batch_5["labels"]))

    test_set = test["data"]
    # FIX: reshape by the test set's own row count; the original used
    # train_set_1.shape[0], which only worked because both happen to be 10000
    Xte_rows = test_set.reshape(test_set.shape[0], 32 * 32 * 3)
    Yte = numpy.asarray(test["labels"])

    Xval_rows = X_train[:7500, :]   # first 7,500 rows for validation
    Yval = y_train[:7500]
    Xtr_rows = X_train[7500:50000, :]  # remaining 42,500 rows for training
    Ytr = y_train[7500:50000]

    # normalise every split with statistics of the *training* data
    mean_train = Xtr_rows.mean(axis=0)
    # FIX: the std was previously computed from the test rows
    # (``Xte_rows.std``) while being named ``stdv_train``
    stdv_train = Xtr_rows.std(axis=0)
    Xtr_rows = (Xtr_rows - mean_train) / stdv_train
    Xval_rows = (Xval_rows - mean_train) / stdv_train
    Xte_rows = (Xte_rows - mean_train) / stdv_train

    # shared variable so the schedule below can lower it during training
    learning_rate = theano.shared(learning_rate)

    train_set = (Xtr_rows, Ytr)
    valid_set = (Xval_rows, Yval)
    test_set = (Xte_rows, Yte)
    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)
    datasets = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
                (test_set_x, test_set_y)]

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 3 * 32 * 32)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 3, 32, 32))

    # First convolutional pooling layer:
    # conv 5x5 -> 28x28, maxpool 2x2 -> 14x14... see image_shape below;
    # output is (batch_size, nkerns[0], 16, 16) per the layer1 input shape
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 3, 32, 32),
        filter_shape=(nkerns[0], 3, 5, 5),
        poolsize=(2, 2)
    )

    # Second convolutional pooling layer; output feature maps are 8x8
    # per the hidden layer's n_in below
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 16, 16),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected layer with ReLU activation
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 8 * 8,
        n_out=500,
        activation=relu
    )

    # classify the values of the fully-connected layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model plus
    # an L2 penalty on the dense layers' weights
    L2_reg = 0.001
    L2_sqr = (
        (layer2.W ** 2).sum() + (layer3.W ** 2).sum()
    )
    cost = layer3.negative_log_likelihood(y) + L2_reg * L2_sqr

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # plain SGD update rule, built automatically for every parameter
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000           # look at this many examples regardless
    patience_increase = 2      # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    epoch_loss_list = []
    epoch_val_list = []

    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        if epoch == 10:
            # lower the learning rate once training has settled
            learning_rate.set_value(0.1)
        if epoch > 3:
            # checkpoint the loss/validation history to disk each epoch
            epoch_loss_np = numpy.reshape(epoch_loss_list,
                                          newshape=(len(epoch_loss_list), 3))
            epoch_val_np = numpy.reshape(epoch_val_list,
                                         newshape=(len(epoch_val_list), 3))
            numpy.savetxt(fname='epoc_cost.csv', X=epoch_loss_np, fmt='%1.3f')
            numpy.savetxt(fname='epoc_val_error.csv', X=epoch_val_np,
                          fmt='%1.3f')

        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index)
            epoch_loss_list.append([iter, epoch, float(cost_ij)])

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                epoch_val_list.append([iter, epoch, this_validation_loss])

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)),
          file=sys.stderr)

    # summarise per-epoch averages of the training cost and validation error
    epoch_loss_np = numpy.reshape(epoch_loss_list,
                                  newshape=(len(epoch_loss_list), 3))
    epoch_val_np = numpy.reshape(epoch_val_list,
                                 newshape=(len(epoch_val_list), 3))
    epoch_loss = pandas.DataFrame({"iter": epoch_loss_np[:, 0],
                                   "epoch": epoch_loss_np[:, 1],
                                   "cost": epoch_loss_np[:, 2]})
    epoch_vall = pandas.DataFrame({"iter": epoch_val_np[:, 0],
                                   "epoch": epoch_val_np[:, 1],
                                   "val_error": epoch_val_np[:, 2]})
    epoc_avg_loss = pandas.DataFrame(
        epoch_loss.groupby(['epoch']).mean()["cost"])
    epoc_avg_val = pandas.DataFrame(
        epoch_vall.groupby(['epoch']).mean()["val_error"])
    epoc_avg_loss = pandas.DataFrame({"epoch": epoc_avg_loss.index.values,
                                      "cost": epoc_avg_loss["cost"]})
    epoc_avg_loss_val = pandas.DataFrame(
        {"epoch": epoc_avg_val.index.values,
         "val_error": epoc_avg_val["val_error"]})
    epoc_avg_loss.plot(kind="line", x="epoch", y="cost")
    plt.show()
    epoc_avg_loss_val.plot(kind='line', x="epoch", y="val_error")
    plt.show()
def __init__(self, n_inp=784, n_out=10, hidden_layer_sizes=[500, 500]): """ This class is made to support a variable number of layers. :param n_inps: int, dimension of the input to the DBN :param n_outs: int, demension of the output of the network :param hidden_layer_sizes: list of ints, intermediate layers size, must contain at least one value """ self.sigmoid_layers = [] self.layers = [] self.params = [] self.n_layers = len(hidden_layer_sizes) assert self.n_layers > 0 #define the grape height, weight, channel = n_inp self.x = tf.placeholder(tf.float32, [None, height, weight, channel]) self.y = tf.placeholder(tf.float32, [None, n_out]) for i in range(self.n_layers): # Construct the sigmoidal layer # the size of the input is either the number of hidden units of the layer # below or the input size if we are on the first layer if i == 0: input_size = height * weight * channel else: input_size = hidden_layer_sizes[i - 1] # the input to this layer is either the activation of the hidden layer below # or the input of the DBN if you are on the first layer if i == 0: layer_input = tf.reshape(self.x, [-1, height * weight * channel]) else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(input=layer_input, n_inp=input_size, n_out=hidden_layer_sizes[i], activation=tf.nn.sigmoid) #add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # Its arguably a philosophical question... but we are going to only # declare that the parameters of the sigmoid_layers are parameters of the DBN. 
# The visible biases in the RBM are parameters of those RBMs, but not of the DBN self.params.extend(sigmoid_layer.params) if i == 0: rbm_layer = GRBM(inp=layer_input, n_visible=input_size, n_hidden=hidden_layer_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) else: rbm_layer = RBM(inp=layer_input, n_visible=input_size, n_hidden=hidden_layer_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.layers.append(rbm_layer) self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_inp=hidden_layer_sizes[-1], n_out=n_out) self.params.extend(self.logLayer.params) #print(self.sigmoid_layers[-1].output) #print(hidden_layer_sizes[-1], n_out) #compute the cost for second phase of training, defined as the cost of the # logistic regression output layer self.finetune_cost = self.logLayer.cost(self.y) #compute the gradients with respect to the model parameters symbolic variable that # points to the number of errors made on the minibatch given by self.x and self.y self.pred = self.logLayer.pred self.accuracy = self.logLayer.accuracy(self.y) """
def test_CNN(learning_rate=0.01, n_epochs=1000, batch_size=20, n_hidden=500): dataset = load_data() train_set_x, train_set_y = dataset[ 0] #tt = train_set_x.get_value(); tt.shape ---(50000, 784) valid_set_x, valid_set_y = dataset[1] test_set_x, test_set_y = dataset[2] n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size print('training set has %i batches' % n_train_batches) print('validate set has %i batches' % n_valid_batches) print('testing set has %i batches' % n_test_batches) #symbolic variables x = T.matrix() y = T.ivector() #lvector: [long int] labels; ivector:[int] labels minibatch_index = T.lscalar() print 'build the model...' rng = numpy.random.RandomState(23455) # transfrom x from (batchsize, 28*28) to (batchsize,feature,28,28)) # I_shape = (28,28),F_shape = (5,5), N_filters_0 = 20 D_features_0 = 1 layer0_input = x.reshape((batch_size, D_features_0, 28, 28)) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, filter_shape=(N_filters_0, D_features_0, 5, 5), image_shape=(batch_size, 1, 28, 28)) #layer0.output: (batch_size, N_filters_0, (28-5+1)/2, (28-5+1)/2) -> 20*20*12*12 N_filters_1 = 50 D_features_1 = N_filters_0 layer1 = LeNetConvPoolLayer(rng, input=layer0.output, filter_shape=(N_filters_1, D_features_1, 5, 5), image_shape=(batch_size, N_filters_0, 12, 12)) # layer1.output: (20,50,4,4) layer2_input = layer1.output.flatten(2) # (20,50,4,4)->(20,(50*4*4)) layer2 = HiddenLayer(rng, layer2_input, n_in=50 * 4 * 4, n_out=500, activation=T.tanh) layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) ########################## cost = layer3.negative_log_likelihood(y) test_model = theano.function( inputs=[minibatch_index], outputs=layer3.errors(y), givens={ x: test_set_x[minibatch_index * batch_size:(minibatch_index + 1) * batch_size], y: test_set_y[minibatch_index * 
batch_size:(minibatch_index + 1) * batch_size] }) valid_model = theano.function( inputs=[minibatch_index], outputs=layer3.errors(y), givens={ x: valid_set_x[minibatch_index * batch_size:(minibatch_index + 1) * batch_size], y: valid_set_y[minibatch_index * batch_size:(minibatch_index + 1) * batch_size] }) params = layer3.params + layer2.params + layer1.params + layer0.params gparams = T.grad(cost, params) updates = [] for par, gpar in zip(params, gparams): updates.append((par, par - learning_rate * gpar)) train_model = theano.function( inputs=[minibatch_index], outputs=[cost], updates=updates, givens={ x: train_set_x[minibatch_index * batch_size:(minibatch_index + 1) * batch_size], y: train_set_y[minibatch_index * batch_size:(minibatch_index + 1) * batch_size] }) #---------------------Train-----------------------# print 'training...' epoch = 0 patience = 10000 patience_increase = 2 validation_frequency = min(n_train_batches, patience / 2) improvement_threshold = 0.995 best_parameters = None min_validation_error = numpy.inf done_looping = False start_time = time.clock() while (epoch < n_epochs) and (not done_looping): epoch += 1 for minibatch_index in xrange(n_train_batches): #cur_batch_train_error,cur_params = train_model(minibatch_index) cur_batch_train_error = train_model(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: #validation_error = numpy.mean([valid_model(idx) for idx in xrange(n_valid_batches)]) validation_losses = [ valid_model(i) for i in xrange(n_valid_batches) ] validation_error = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, validation_error * 100.)) if validation_error < min_validation_error: if validation_error < min_validation_error * improvement_threshold: patience = max(patience, iter * patience_increase) min_validation_error = validation_error #best_parameters = cur_params best_iter = iter 
#test test_error = numpy.mean( [test_model(idx) for idx in xrange(n_test_batches)]) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_error * 100.)) if iter >= patience: done_looping = True break end_time = time.clock() print(('Optimization complete. Best validation score of %f %% ' 'obtained at iteration %i, with test performance %f %%') % (min_validation_error * 100., best_iter + 1, test_error * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def __init__(self, np_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10):
    """This class is made to support a variable number of layers.

    :type np_rng: np.random.RandomState
    :param np_rng: np random number generator used to draw initial weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `rng`

    :type n_ins: int
    :param n_ins: dimension of the input to the DBN

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: intermediate layers size, must contain
                                at least one value

    :type n_outs: int
    :param n_outs: dimension of the output of the network
    """
    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)
    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = MRG_RandomStreams(np_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')    # rasterized images
    self.y = T.ivector('y')   # 1D vector of [int] labels

    # The DBN is an MLP whose intermediate-layer weights are shared with
    # a stack of RBMs.  We build the MLP layer by layer, and alongside
    # every sigmoidal layer we build an RBM that shares its weights.
    # Pretraining trains the RBMs (which also changes the MLP weights);
    # finetuning then finishes training the DBN with SGD on the MLP.
    for depth in range(self.n_layers):
        # first layer reads the raw input; deeper layers read the
        # activation of the layer below
        if depth == 0:
            input_size = n_ins
            layer_input = self.x
        else:
            input_size = hidden_layers_sizes[depth - 1]
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=np_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[depth],
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)
        # only the sigmoid layers' parameters are declared parameters of
        # the DBN; the RBMs' visible biases are not
        self.params.extend(sigmoid_layer.params)

        # RBM sharing weights and hidden biases with this sigmoid layer
        rbm_layer = RBM(np_rng=np_rng,
                        theano_rng=theano_rng,
                        input=layer_input,
                        n_visible=input_size,
                        n_hidden=hidden_layers_sizes[depth],
                        W=sigmoid_layer.W,
                        hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)

    # logistic-regression layer on top of the MLP
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # fine-tuning cost: NLL of the logistic-regression output layer
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    # symbolic error count on the minibatch given by self.x and self.y
    self.errors = self.logLayer.errors(self.y)
def __init__(self, E, U, height, width, filter_hs, conv_non_linear,
             hidden_units, batch_size, non_static, dropout_rates,
             subspace_size=None, activations=[Iden]):
    """Build a sentence ConvNet, optionally conditioned on user embeddings.

    E = word embedding matrix (row 0 is reserved as the zero/padding vector)
    U = user embedding matrix, or None to build a text-only model
    height = sentence length (padded where necessary)
    width = word vector length (300 for word2vec)
    filter_hs = filter window sizes
    hidden_units = [x, y] where x is the number of feature maps (per
        filter window) and y is the penultimate layer
    non_static = if True, the word embeddings are also trained
    subspace_size = if set (and U is given), user embeddings are first
        projected through a sigmoid hidden layer of this size

    NOTE(review): `activations=[Iden]` is a mutable default argument;
    harmless as long as callers never mutate it, but worth confirming.
    """
    rng = np.random.RandomState(3435)
    feature_maps = hidden_units[0]
    self.batch_size = batch_size

    # define model architecture
    self.index = T.lscalar()
    self.x = T.matrix('x')    # minibatch of word-index sequences
    self.y = T.ivector('y')   # labels as 1D vector of ints
    self.Words = theano.shared(value=E, name="Words")
    self.Users = None
    self.u = None
    self.subspace_size = subspace_size
    zero_vec_tensor = T.vector()
    self.zero_vec = np.zeros(width)
    # set_zero resets the padding row Words[0, :] to the given vector
    # (called with zero_vec after each update so padding stays zero)
    self.set_zero = theano.function([zero_vec_tensor],
                                    updates=[(self.Words, T.set_subtensor(self.Words[0,:],zero_vec_tensor))],
                                    allow_input_downcast=True)
    # inputs to the ConvNet go to all convolutional filters:
    # look up embeddings and reshape to (n_sentences, 1, height, width)
    layer0_input = self.Words[T.cast(self.x.flatten(), dtype="int32")].reshape(
        (self.x.shape[0], 1, self.x.shape[1], self.Words.shape[1]))
    self.conv_layers = []
    # outputs of convolutional filters
    layer1_inputs = []
    image_shape = (batch_size, 1, height, width)
    filter_w = width
    # one conv+max-pool layer per filter window size; each pools over the
    # whole feature map, yielding `feature_maps` values per sentence
    for filter_h in filter_hs:
        filter_shape = (feature_maps, 1, filter_h, filter_w)
        pool_size = (height-filter_h+1, width-filter_w+1)
        conv_layer = LeNetConvPoolLayer(rng, input=layer0_input,
                                        image_shape=image_shape,
                                        filter_shape=filter_shape,
                                        poolsize=pool_size,
                                        non_linear=conv_non_linear)
        layer1_input = conv_layer.output.flatten(2)
        self.conv_layers.append(conv_layer)
        layer1_inputs.append(layer1_input)
    # inputs to the MLP: concatenated pooled features of all windows
    layer1_input = T.concatenate(layer1_inputs, 1)
    if U is not None:
        print "Will use user embeddings"
        self.u = T.ivector('u')
        self.Users = theano.shared(value=U, name="Users")
        them_users = self.Users[self.u]
        if self.subspace_size:
            print "and subspace"
            # project user embeddings into a lower-dimensional subspace
            # before concatenating them with the text features
            # set_trace()
            self.subspace = HiddenLayer(rng, them_users, U.shape[1], subspace_size, Sigmoid)
            # debug function exposing the subspace output and text features
            self.peep = theano.function([self.x, self.u],[self.subspace.output,layer1_input],allow_input_downcast=True)
            layer1_input = T.concatenate((layer1_input,T.nnet.sigmoid(self.subspace.output)),1)
            layer_sizes = [feature_maps*len(filter_hs)+subspace_size]
            # layer1_input = T.concatenate((layer1_input,them_users),1)
            # layer_sizes = [feature_maps*len(filter_hs)+U.shape[1]]
        else:
            # concatenate raw user embeddings with the text features
            layer1_input = T.concatenate((layer1_input,them_users),1)
            layer_sizes = [feature_maps*len(filter_hs)+U.shape[1]]
    else:
        print "NO user embeddings"
        layer_sizes = [feature_maps*len(filter_hs)]
    layer_sizes += hidden_units[1:]
    # the superclass (presumably a dropout MLP — confirm against its def)
    # builds the fully-connected classifier on top of layer1_input
    super(ConvNet, self).__init__(rng, input=layer1_input,
                                  layer_sizes=layer_sizes,
                                  activations=activations,
                                  dropout_rates=dropout_rates)
    # add parameters from convolutional layers
    for conv_layer in self.conv_layers:
        self.params += conv_layer.params
    if non_static:
        # if word vectors are allowed to change, add them as model parameters
        self.params += [self.Words]
    if U is not None:
        # if self.subspace_size is None:
        self.params += [self.Users]
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=500):
    """ Validate the model on the MNIST dataset.

    :type learning_rate: float
    :param learning_rate: learning rate (step size) used for gradient descent

    :type n_epochs: int
    :param n_epochs: maximal number of optimization epochs

    :type dataset: string
    :param dataset: name of the dataset

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    # seed the random number generator for reproducibility
    rng = numpy.random.RandomState(23455)
    # load the data
    datasets = load_data(dataset)
    # the MNIST data comes in three splits: train, valid and test
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    # size of each split, and the number of minibatches in each
    # (Python 2 integer division, so these stay ints)
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
    # allocate a symbolic variable for the minibatch index
    index = T.lscalar()
    # x: image input, y: label output
    x = T.matrix('x')
    y = T.ivector('y')
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    # reshape the input into a 4D tensor (batch_size, 1, 28, 28),
    # where 28*28 is the image size
    layer0_input = x.reshape((batch_size, 1, 28, 28))
    # build the first conv-pooling layer
    # filtering reduces the image size to (28-5+1, 28-5+1) = (24, 24)
    # maxpooling reduces it to (24/2, 24/2) = (12, 12)
    # output is a 4D tensor of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )
    # build the second conv-pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces it to (8/2, 8/2) = (4, 4)
    # output is a 4D tensor of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )
    # the hidden layer is fully connected; its input has shape
    # (batch_size, num_pixels)
    layer2_input = layer1.output.flatten(2)
    # build a fully-connected hidden layer with tanh activation
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh
    )
    # classify the output of the fully-connected layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)
    # cost function
    cost = layer3.negative_log_likelihood(y)
    # compute the errors the model makes on the test set
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # validation model
    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # collect all parameters
    params = layer3.params + layer2.params + layer1.params + layer0.params
    # gradients of the cost with respect to the parameters
    grads = T.grad(cost, params)
    # update the parameters with SGD
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]
    # training model
    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-1
    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()
    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)
            if (iter + 1) % validation_frequency == 0:
                # compute the validation loss
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                # if we got the best validation so far
                if this_validation_loss < best_validation_loss:
                    # increase patience
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    # record the best validation_loss
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # test it
                    test_losses = [
                        test_model(i)
                        for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))
            if patience <= iter:
                done_looping = True
                break
    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=None, n_outs=(None, None),
             continuous=False):
    """This class is made to support a variable number of layers.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to draw initial
                weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `rng`

    :type n_ins: int
    :param n_ins: dimension of the input to the DBN

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: intermediate layers size, must contain
                           at least one value

    :type n_outs: tuple of ints
    :param n_outs: dimensions of the sigmoid layers of the network;
                   (n_hidden, n_out) of the MLP top layer

    :type continuous: bool
    :param continuous: if True, the first (data-facing) RBM is a CRBM,
                       which presumably models continuous-valued visible
                       units — confirm against the CRBM definition
    """
    # (None, None) / None act as sentinels for the real defaults,
    # avoiding mutable default arguments
    if n_outs == (None, None):
        n_outs = (10, 10)
    if hidden_layers_sizes is None:
        hidden_layers_sizes = [500, 500]
    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30))

    self.x = T.matrix('x')  # the data is presented as rasterized images
    self.y = T.ivector('y')  # the labels are presented as 1D vector

    # build the sigmoid layers of the MLP; each shares weights with an
    # RBM used for greedy layer-wise pretraining
    for i in range(self.n_layers):
        # the size of the input is either the number of hidden units of
        # the layer below, or the input size for the first layer
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]
        # the input to this layer is either the activation of the hidden
        # layer below, or the DBN input for the first layer
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output
        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)
        # only the sigmoid layers' parameters are parameters of the DBN;
        # the RBM visible biases belong to the RBMs alone
        self.params.extend(sigmoid_layer.params)
        # Construct an RBM that shared weights with this layer
        if continuous and i == 0:
            rbm_layer = CRBM(numpy_rng=numpy_rng,
                             theano_rng=theano_rng,
                             input=layer_input,
                             n_visible=input_size,
                             n_hidden=hidden_layers_sizes[i],
                             W=sigmoid_layer.W,
                             hbias=sigmoid_layer.b)
        else:
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)

    # We now need to add a logistic layer on top of the MLP
    # (here an MLP with n_outs[0] hidden and n_outs[1] output units,
    # rather than a plain LogisticRegression — see commented code below)
    self.topLayer = MLP(
        rng=numpy_rng,
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_hidden=n_outs[0],
        n_out=n_outs[1])

    self.params.extend(self.topLayer.params)

    # self.logLayer = LogisticRegression(
    #     input=self.sigmoid_layers[-1].output,
    #     n_in=hidden_layers_sizes[-1],
    #     n_out=n_outs)
    # self.params.extend(self.logLayer.params)

    # compute the cost for second phase of training, defined as the
    # negative log likelihood of the logistic regression (output) layer
    self.finetune_cost = self.topLayer.negative_log_likelihood(self.y)

    # compute the gradients with respect to the model parameters
    # symbolic variable that points to the number of errors made on the
    # minibatch given by self.x and self.y
    self.errors = self.topLayer.errors(self.y)
def evaluate_lenet5(
    learning_rate=0.1,
    n_epochs=200,
    dataset="mnist.pkl.gz",
    nkerns=[20, 50],
    batch_size=500,
):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :returns: (err_train, err_valid, err_test) lists of error values
              collected during training, for plotting error curves
    """
    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # display some chars:
    display_some(train_set_x, train_set_y.eval(), n=5, title="label=")

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    x = T.matrix("x")  # the data is presented as rasterized images
    y = T.ivector("y")  # the labels are presented as 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print("... building the model")

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2),
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2),
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    # (also returns the predicted labels, used for display below)
    test_model = theano.function(
        [index],
        [layer3.errors(y), layer3.y_pred],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
        },
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
        },
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        inputs=[index],
        outputs=[cost, layer3.errors(y)],
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
        },
    )

    ###############
    # TRAIN MODEL #
    ###############
    print("... training")
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.0
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    # for error_curve plot
    # NOTE(review): cost_train is created but never appended to or
    # returned — presumably a leftover; confirm before relying on it
    cost_train = []  # observe likelihood cost while training
    err_train = []  # observe train err while training
    err_valid = []  # observe valid err while training
    err_test = []  # observe test err while training

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print("training @ iter = ", iter)
            train_outputs = train_model(minibatch_index)
            cost_ij = train_outputs[0]
            err_train.append(train_outputs[1])  # add error_train

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                err_valid.append(this_validation_loss)
                print("epoch %i, minibatch %i/%i, validation error %f %%" % (
                    epoch,
                    minibatch_index + 1,
                    n_train_batches,
                    this_validation_loss * 100.0,
                ))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i)[0] for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    err_test.append(test_score)
                    print((" epoch %i, minibatch %i/%i, test error of "
                           "best model %f %%") % (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        test_score * 100.0,
                    ))
                    """
                    # save the best model
                    with open('../doc/data/best_model.pkl', 'wb') as f:
                        pickle.dump(layer0, layer1, layer2, layer3, f)
                    """

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print("Optimization complete.")
    print("Best validation score of %f %% obtained at iteration %i, "
          "with test performance %f %%" %
          (best_validation_loss * 100.0, best_iter + 1, test_score * 100.0))
    print(
        ("The code for file " +
         os.path.split(__file__)[1] +
         " ran for %.2fm" % ((end_time - start_time) / 60.0)),
        file=sys.stderr,
    )

    # pickle the final (not necessarily best) layers for later reuse
    model = [layer0, layer1, layer2, layer3]
    # save the best model
    with open("../doc/data/best_model.pkl", "wb") as f:
        pickle.dump(model, f)

    test_pred_y = test_model(0)[1]  # predict on first batch_size sampless
    # display some chars using predict
    display_some(test_set_x, test_pred_y, n=5, title="pred=")  # n < batch_size

    return err_train, err_valid, err_test