def train_and_test(args, print_config):
    """Build the CNN sentence classifier, train it with AdaDelta and report errors.

    Two weight-sharing paths are stacked on one embedding layer:

    * a *dropout* path, whose negative log-likelihood is the training cost;
    * a *prediction* path, whose weights are the dropout-path weights scaled
      by ``1 - dropout_rate`` (model averaging) and which is used to compute
      the dev/test error.

    The embedding parameter is excluded from the updates during the first
    ``args.embedding_learning_delay_epochs`` epochs.

    Side effects: prints progress to stdout, calls ``dump_params`` with
    ``args.model_path`` every time the dev error improves, shows or saves
    plots depending on ``print_config`` / ``args.task_signature``, and appends
    one tab-separated result line to ``args.output``.

    :param args: configuration object. Attributes read here: ``corpus_path``,
        ``batch_size``, ``conv_layer_n``, ``filter_widths``, ``nkerns``,
        ``ks``, ``fold_flags``, ``L2_regs``, ``use_L2_reg``, ``dropout_rates``,
        ``embed_dm`` (overwritten when pretrained embeddings are used),
        ``use_pretrained_embedding``, ``conv_activation_unit``, ``norm_w``,
        ``rho``, ``epsilon``, ``embedding_learning_delay_epochs``,
        ``patience``, ``n_epochs``, ``model_path``, ``task_signature`` and
        ``output``.
    :param print_config: mapping of diagnostic name to flag. Keys read here:
        ``"nnl"``, ``"L2_sqr"``, ``"grad_abs_mean"``, ``"activation_hist"``,
        ``"weight_grad_hist"``, ``"activation_tracking"``,
        ``"weight_grad_tracking"`` and ``"error_vs_epoch"``.
    """
    assert args.conv_layer_n == len(args.filter_widths) == len(
        args.nkerns) == (len(args.L2_regs) - 2) == len(
            args.fold_flags) == len(args.ks)

    # \mod{dim, 2^{\sum fold_flags}} == 0
    fold_divisor = 2 ** sum(args.fold_flags)
    assert args.embed_dm % fold_divisor == 0

    ###################
    # get the data    #
    ###################
    datasets = load_data(args.corpus_path)

    train_set_x, train_set_y = datasets[0]
    dev_set_x, dev_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    word2index = datasets[3]
    pretrained_embeddings = datasets[5]

    # Explicit floor division: a trailing partial minibatch is never visited.
    n_train_batches = (
        train_set_x.get_value(borrow=True).shape[0] // args.batch_size)

    possible_labels = set(train_set_y.get_value().tolist())

    if args.use_pretrained_embedding:
        args.embed_dm = pretrained_embeddings.get_value().shape[1]
        # The embedding width just changed, so the folding constraint has to
        # be checked again against the value that is actually used below.
        assert args.embed_dm % fold_divisor == 0

    ###################################
    # Symbolic variable definition    #
    ###################################
    x = T.imatrix('x')  # the word indices matrix
    y = T.ivector('y')  # the sentiment labels

    batch_index = T.iscalar('batch_index')

    rng = np.random.RandomState(1234)

    def batch_slice(shared_data):
        """Return the symbolic slice of `shared_data` for `batch_index`."""
        return shared_data[batch_index * args.batch_size:
                           (batch_index + 1) * args.batch_size]

    ###############################
    # Construction of the network #
    ###############################
    # Layer 1, the embedding layer
    layer1 = WordEmbeddingLayer(
        rng,
        input=x,
        vocab_size=len(word2index),
        embed_dm=args.embed_dm,
        embeddings=(pretrained_embeddings
                    if args.use_pretrained_embedding else None))

    dropout_layers = [layer1]
    layers = [layer1]

    for i in range(args.conv_layer_n):
        fold_flag = args.fold_flags[i]
        dropout_rate = args.dropout_rates[i]

        # for the dropout layer
        # The rate must be the same one that is logged below and that scales
        # the averaged-model weights (1 - dropout_rates[i]); the previous
        # code always used dropout_rates[0] here.
        dpl = DropoutLayer(input=dropout_layers[-1].output,
                           rng=rng,
                           dropout_rate=dropout_rate)
        next_layer_dropout_input = dpl.output
        next_layer_input = layers[-1].output

        # for the conv layer
        filter_shape = (args.nkerns[i],
                        (1 if i == 0 else args.nkerns[i - 1]),
                        1,
                        args.filter_widths[i])

        k = args.ks[i]

        print("For conv layer(%s) %d, filter shape = %r, k = %d, dropout_rate = %f and normalized weight init: %r and fold: %d" % (
            args.conv_activation_unit, i + 2, filter_shape, k,
            dropout_rate, args.norm_w, fold_flag))

        # we have two layers adding to two paths respectively,
        # one for training
        # the other for prediction (averaged model)
        dropout_conv_layer = ConvFoldingPoolLayer(
            rng,
            input=next_layer_dropout_input,
            filter_shape=filter_shape,
            k=k,
            norm_w=args.norm_w,
            fold=fold_flag,
            activation=args.conv_activation_unit)

        # for prediction
        # sharing weight with dropout layer
        conv_layer = ConvFoldingPoolLayer(
            rng,
            input=next_layer_input,
            filter_shape=filter_shape,
            k=k,
            activation=args.conv_activation_unit,
            fold=fold_flag,
            W=dropout_conv_layer.W * (1 - dropout_rate),  # model averaging
            b=dropout_conv_layer.b)

        dropout_layers.append(dropout_conv_layer)
        layers.append(conv_layer)

    # last, the output layer
    # both dropout and without dropout
    # fold_divisor is 1 when nothing is folded, so one expression covers both
    # cases; divisibility is guaranteed by the assertions above.
    n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm // fold_divisor

    print("For output layer, n_in = %d, dropout_rate = %f" % (
        n_in, args.dropout_rates[-1]))

    # NOTE(review): no DropoutLayer is applied to the input of the output
    # layer, yet the averaged weights below are scaled by
    # (1 - dropout_rates[-1]) -- confirm whether that is intended.
    dropout_output_layer = LogisticRegression(
        rng,
        input=dropout_layers[-1].output.flatten(2),
        n_in=n_in,  # divided by 2x(how many times are folded)
        n_out=len(possible_labels)  # one output per label seen in training
    )

    output_layer = LogisticRegression(
        rng,
        input=layers[-1].output.flatten(2),
        n_in=n_in,
        n_out=len(possible_labels),
        # sharing the parameters, don't forget
        W=dropout_output_layer.W * (1 - args.dropout_rates[-1]),
        b=dropout_output_layer.b)

    dropout_layers.append(dropout_output_layer)
    layers.append(output_layer)

    ###############################
    # Error and cost              #
    ###############################
    # cost and error come from different model!
    dropout_cost = dropout_output_layer.nnl(y)
    errors = output_layer.errors(y)

    def prepare_L2_sqr(param_layers, L2_regs):
        """Sum of ``L2_reg / 2 * ||W||^2`` over the layers (embeddings if no W)."""
        assert len(L2_regs) == len(param_layers)
        return T.sum([
            L2_reg / 2 *
            ((layer.W if hasattr(layer, "W") else layer.embeddings) ** 2).sum()
            for L2_reg, layer in zip(L2_regs, param_layers)
        ])

    L2_sqr = prepare_L2_sqr(dropout_layers, args.L2_regs)
    L2_sqr_no_ebd = prepare_L2_sqr(dropout_layers[1:], args.L2_regs[1:])

    if args.use_L2_reg:
        cost = dropout_cost + L2_sqr
        cost_no_ebd = dropout_cost + L2_sqr_no_ebd
    else:
        cost = dropout_cost
        cost_no_ebd = dropout_cost

    ###############################
    # Parameters to be used       #
    ###############################
    print("Delay embedding learning by %d epochs" % (
        args.embedding_learning_delay_epochs))

    print("param_layers: %r" % dropout_layers)
    param_layers = dropout_layers

    ##############################
    # Parameter Update           #
    ##############################
    print("Using AdaDelta with rho = %f and epsilon = %f" % (
        args.rho, args.epsilon))

    params = [param for layer in param_layers for param in layer.params]
    param_shapes = [
        shape for layer in param_layers for shape in layer.param_shapes
    ]

    param_grads = [T.grad(cost, param) for param in params]

    def zero_accumulators(prefix):
        """One zero-initialised shared variable per parameter."""
        return [
            theano.shared(
                value=np.zeros(shape, dtype=theano.config.floatX),
                borrow=True,
                name=prefix + param.name)
            for shape, param in zip(param_shapes, params)
        ]

    # AdaDelta parameter update
    # E[g^2], initialized to zero
    egs = zero_accumulators("Eg:")
    # E[\delta x^2], initialized to zero
    exs = zero_accumulators("Ex:")

    new_egs = [
        args.rho * eg + (1 - args.rho) * g ** 2
        for eg, g in zip(egs, param_grads)
    ]

    delta_x = [
        -(T.sqrt(ex + args.epsilon) / T.sqrt(new_eg + args.epsilon)) * g
        for new_eg, ex, g in zip(new_egs, exs, param_grads)
    ]

    new_exs = [
        args.rho * ex + (1 - args.rho) * (dx ** 2)
        for ex, dx in zip(exs, delta_x)
    ]

    updates = (list(zip(egs, new_egs)) +
               list(zip(exs, new_exs)) +
               [(p, p + dx) for p, dx in zip(params, delta_x)])

    # updates WITHOUT embedding
    # exclude the embedding parameter
    # (assumes the embedding layer contributes exactly one parameter, at
    # index 0 of `params` -- TODO confirm against WordEmbeddingLayer.params)
    updates_no_emb = (list(zip(egs[1:], new_egs[1:])) +
                      list(zip(exs[1:], new_exs[1:])) +
                      [(p, p + dx)
                       for p, dx in zip(params[1:], delta_x[1:])])

    train_data_at_index = {
        x: batch_slice(train_set_x),
    }

    train_data_at_index_with_y = {
        x: batch_slice(train_set_x),
        y: batch_slice(train_set_y)
    }

    def make_train_func(cost, updates):
        """Compile a function that runs one training step on a minibatch."""
        return theano.function(
            inputs=[batch_index],
            outputs=[cost],
            updates=updates,
            givens={
                x: batch_slice(train_set_x),
                y: batch_slice(train_set_y)
            })

    train_model_no_ebd = make_train_func(cost_no_ebd, updates_no_emb)
    train_model = make_train_func(cost, updates)

    def make_error_func(x_val, y_val):
        """Compile a function returning the prediction-path error on a set."""
        return theano.function(
            inputs=[],
            outputs=errors,
            givens={
                x: x_val,
                y: y_val
            },
        )

    dev_error = make_error_func(dev_set_x, dev_set_y)
    test_error = make_error_func(test_set_x, test_set_y)

    #############################
    # Debugging purpose code    #
    #############################
    # PARAMETER TUNING NOTE:
    # some demonstration of the gradient vanishing problem
    if print_config["nnl"]:
        get_nnl = theano.function(
            inputs=[batch_index],
            outputs=dropout_cost,
            givens={
                x: batch_slice(train_set_x),
                y: batch_slice(train_set_y)
            })

    if print_config["L2_sqr"]:
        get_L2_sqr = theano.function(inputs=[], outputs=L2_sqr)
        get_L2_sqr_no_ebd = theano.function(inputs=[], outputs=L2_sqr_no_ebd)

    if print_config["grad_abs_mean"]:
        print_grads = theano.function(
            inputs=[],
            outputs=[
                theano.printing.Print(param.name)(T.mean(T.abs_(param_grad)))
                for param, param_grad in zip(params, param_grads)
            ],
            givens={
                x: train_set_x,
                y: train_set_y
            })

    # Diagnostics cover the conv layers only (embedding and output excluded).
    activations = [l.output for l in dropout_layers[1:-1]]
    weight_grads = [T.grad(cost, l.W) for l in dropout_layers[1:-1]]

    if print_config["activation_hist"]:
        # turn into 1D array
        get_activations = theano.function(
            inputs=[batch_index],
            outputs=[val.flatten(1) for val in activations],
            givens=train_data_at_index)

    if print_config["weight_grad_hist"]:
        # turn into 1D array
        get_weight_grads = theano.function(
            inputs=[batch_index],
            outputs=[val.flatten(1) for val in weight_grads],
            givens=train_data_at_index_with_y)

    if print_config["activation_tracking"]:
        # get the mean and std of activations for each conv layer
        get_activation_mean = theano.function(
            inputs=[batch_index],
            outputs=[T.mean(val) for val in activations],
            givens=train_data_at_index)

        get_activation_std = theano.function(
            inputs=[batch_index],
            outputs=[T.std(val) for val in activations],
            givens=train_data_at_index)

    if print_config["weight_grad_tracking"]:
        # get the mean and std of weight gradients for each conv layer
        get_weight_grad_mean = theano.function(
            inputs=[batch_index],
            outputs=[T.mean(g) for g in weight_grads],
            givens=train_data_at_index_with_y)

        get_weight_grad_std = theano.function(
            inputs=[batch_index],
            outputs=[T.std(g) for g in weight_grads],
            givens=train_data_at_index_with_y)

    # the training loop
    # NOTE: `patience` is updated below but never used to stop training;
    # the loop always runs for args.n_epochs epochs.
    patience = args.patience  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant

    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    best_iter = 0
    # Bound up front so the reporting code after the loop cannot hit a
    # NameError when training stops before the first dev improvement.
    test_error_val = np.inf

    start_time = time.clock()
    epoch = 0

    nnls = []
    L2_sqrs = []

    activation_means = [[] for _ in range(args.conv_layer_n)]
    activation_stds = [[] for _ in range(args.conv_layer_n)]
    weight_grad_means = [[] for _ in range(args.conv_layer_n)]
    weight_grad_stds = [[] for _ in range(args.conv_layer_n)]
    activation_hist_data = [[] for _ in range(args.conv_layer_n)]
    weight_grad_hist_data = [[] for _ in range(args.conv_layer_n)]

    train_errors = []
    dev_errors = []

    first_embedding_epoch = args.embedding_learning_delay_epochs + 1

    try:
        print("validation_frequency = %d" % validation_frequency)
        while epoch < args.n_epochs:
            epoch += 1
            print("At epoch {0}".format(epoch))

            if epoch == first_embedding_epoch:
                print("########################")
                print("Start training embedding")
                print("########################")

            # shuffle the training data
            train_set_x_data = train_set_x.get_value(borrow=True)
            train_set_y_data = train_set_y.get_value(borrow=True)

            permutation = np.random.permutation(train_set_x_data.shape[0])

            train_set_x.set_value(train_set_x_data[permutation])
            train_set_y.set_value(train_set_y_data[permutation])

            for minibatch_index in range(n_train_batches):
                if epoch >= first_embedding_epoch:
                    train_model(minibatch_index)
                else:
                    train_model_no_ebd(minibatch_index)

                iteration = (epoch - 1) * n_train_batches + minibatch_index

                if (iteration + 1) % validation_frequency == 0:
                    dev_error_val = dev_error()

                    print("At epoch %d and minibatch %d. \nDev error %.2f%%\n" % (
                        epoch, minibatch_index, dev_error_val * 100))

                    dev_errors.append(dev_error_val)

                    if dev_error_val < best_validation_loss:
                        best_iter = iteration
                        # improve patience if loss improvement is good enough
                        if dev_error_val < (best_validation_loss *
                                            improvement_threshold):
                            patience = max(patience,
                                           iteration * patience_increase)

                        best_validation_loss = dev_error_val

                        test_error_val = test_error()

                        print(('     epoch %i, minibatch %i/%i, test error of'
                               ' best dev error %f %%') %
                              (epoch, minibatch_index + 1, n_train_batches,
                               test_error_val * 100.))

                        print("Dumping model to %s" % (args.model_path))
                        dump_params(params, args.model_path)

                if ((minibatch_index + 1) % 50 == 0 or
                        minibatch_index == n_train_batches - 1):
                    print("%d / %d minibatches completed" % (
                        minibatch_index + 1, n_train_batches))
                    if print_config["nnl"]:
                        print("`nnl` for the past 50 minibatches is %f" % (
                            np.mean(np.array(nnls))))
                        nnls = []
                    if print_config["L2_sqr"]:
                        print("`L2_sqr`` for the past 50 minibatches is %f" % (
                            np.mean(np.array(L2_sqrs))))
                        L2_sqrs = []

                ##################
                # Plotting stuff #
                ##################
                if print_config["nnl"]:
                    nnls.append(get_nnl(minibatch_index))

                if print_config["L2_sqr"]:
                    if epoch >= first_embedding_epoch:
                        L2_sqrs.append(get_L2_sqr())
                    else:
                        L2_sqrs.append(get_L2_sqr_no_ebd())

                if print_config["activation_tracking"]:
                    layer_means = get_activation_mean(minibatch_index)
                    layer_stds = get_activation_std(minibatch_index)
                    for layer_ms, layer_ss, layer_m, layer_s in zip(
                            activation_means, activation_stds,
                            layer_means, layer_stds):
                        layer_ms.append(layer_m)
                        layer_ss.append(layer_s)

                if print_config["weight_grad_tracking"]:
                    layer_means = get_weight_grad_mean(minibatch_index)
                    layer_stds = get_weight_grad_std(minibatch_index)

                    for layer_ms, layer_ss, layer_m, layer_s in zip(
                            weight_grad_means, weight_grad_stds,
                            layer_means, layer_stds):
                        layer_ms.append(layer_m)
                        layer_ss.append(layer_s)

                if print_config["activation_hist"]:
                    for layer_hist, layer_data in zip(
                            activation_hist_data,
                            get_activations(minibatch_index)):
                        layer_hist += layer_data.tolist()

                if print_config["weight_grad_hist"]:
                    for layer_hist, layer_data in zip(
                            weight_grad_hist_data,
                            get_weight_grads(minibatch_index)):
                        layer_hist += layer_data.tolist()

    except (Exception, KeyboardInterrupt):
        # Deliberate best effort: a failure or a Ctrl-C stops training but the
        # plots and the summary below are still produced from what was
        # collected so far.
        import traceback
        traceback.print_exc(file=sys.stdout)
    finally:
        from plot_util import (plot_hist, plot_track, plot_error_vs_epoch,
                               plt)

        if print_config["activation_tracking"]:
            plot_track(activation_means, activation_stds,
                       "activation_tracking")

        if print_config["weight_grad_tracking"]:
            plot_track(weight_grad_means, weight_grad_stds,
                       "weight_grad_tracking")

        if print_config["activation_hist"]:
            plot_hist(activation_hist_data, "activation_hist")

        if print_config["weight_grad_hist"]:
            plot_hist(weight_grad_hist_data, "weight_grad_hist")

        if print_config["error_vs_epoch"]:
            # Training error is not measured; plot a flat zero placeholder.
            train_errors = [0] * len(dev_errors)
            plot_error_vs_epoch(
                train_errors,
                dev_errors,
                title=('Best dev score: %f %% '
                       ' at iter %i with test error %f %%') %
                (best_validation_loss * 100., best_iter + 1,
                 test_error_val * 100.))

        if not args.task_signature:
            plt.show()
        else:
            plt.savefig("plots/" + args.task_signature + ".png")

    end_time = time.clock()

    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_error_val * 100.))

    # save the result
    with open(args.output, "a") as f:
        f.write("%s\t%f\t%f\n" % (args.task_signature, best_validation_loss,
                                  test_error_val))

    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
dtype = theano.config.floatX) layer3 = LogisticRegression(rng = rng, input = layer2.output.flatten(2), n_in = n_in, n_out = n_out, W = theano.shared(value = W_logreg, name = "W_logreg"), b = theano.shared(value = b_logreg, name = "b_logreg") ) f1 = theano.function(inputs = [x_symbol, y_symbol], outputs = layer3.nnl(y_symbol) ) f2 = theano.function(inputs = [x_symbol, y_symbol], outputs = layer3.errors(y_symbol) ) f3 = theano.function(inputs = [x_symbol], outputs = layer3.p_y_given_x ) f_el = theano.function(inputs = [x_symbol], outputs = layer1.output ) f_cl = theano.function(inputs = [x_symbol], outputs = layer2.output ) #########################
def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=(20, 50), batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    Builds two convolution/pooling layers, a fully-connected tanh layer and
    a logistic-regression output, trains them with minibatch SGD and early
    stopping, and prints the best validation error together with the test
    error measured at that point.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: sequence of two ints
    :param nkerns: number of kernels on each layer (only indexed, never
                   modified, so a list or a tuple both work)

    :type batch_size: int
    :param batch_size: number of examples per minibatch; examples beyond the
                       last full minibatch are not used
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    # (explicit floor division: these are loop counts and must stay integers)
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected layer (tanh activation)
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh
    )

    # classify the values of the fully-connected layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            # global minibatch counter across epochs
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                # two spaces after '=' reproduce the output of the original
                # statement-form print, which inserted its own separator
                print('training @ iter =  %i' % iteration)
            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < (best_validation_loss *
                                               improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            # early stopping: out of patience
            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
def train_and_test(args, print_config): assert args.conv_layer_n == len(args.filter_widths) == len(args.nkerns) == (len(args.L2_regs) - 2) == len(args.fold_flags) == len(args.ks) # \mod{dim, 2^{\sum fold_flags}} == 0 assert args.embed_dm % (2 ** sum(args.fold_flags)) == 0 ################### # get the data # ################### datasets = load_data(args.corpus_path) train_set_x, train_set_y = datasets[0] dev_set_x, dev_set_y = datasets[1] test_set_x, test_set_y = datasets[2] word2index = datasets[3] index2word = datasets[4] pretrained_embeddings = datasets[5] n_train_batches = train_set_x.get_value(borrow=True).shape[0] / args.batch_size n_dev_batches = dev_set_x.get_value(borrow=True).shape[0] / args.dev_test_batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / args.dev_test_batch_size train_sent_len = train_set_x.get_value(borrow=True).shape[1] possible_labels = set(train_set_y.get_value().tolist()) if args.use_pretrained_embedding: args.embed_dm = pretrained_embeddings.get_value().shape[1] ################################### # Symbolic variable definition # ################################### x = T.imatrix('x') # the word indices matrix y = T.ivector('y') # the sentiment labels batch_index = T.iscalar('batch_index') rng = np.random.RandomState(1234) ############################### # Construction of the network # ############################### # Layer 1, the embedding layer layer1 = WordEmbeddingLayer(rng, input = x, vocab_size = len(word2index), embed_dm = args.embed_dm, embeddings = ( pretrained_embeddings if args.use_pretrained_embedding else None ) ) dropout_layers = [layer1] layers = [layer1] for i in xrange(args.conv_layer_n): fold_flag = args.fold_flags[i] # for the dropout layer dpl = DropoutLayer( input = dropout_layers[-1].output, rng = rng, dropout_rate = args.dropout_rates[0] ) next_layer_dropout_input = dpl.output next_layer_input = layers[-1].output # for the conv layer filter_shape = ( args.nkerns[i], (1 if i == 0 else 
args.nkerns[i-1]), 1, args.filter_widths[i] ) k = args.ks[i] print "For conv layer(%s) %d, filter shape = %r, k = %d, dropout_rate = %f and normalized weight init: %r and fold: %d" %( args.conv_activation_unit, i+2, filter_shape, k, args.dropout_rates[i], args.norm_w, fold_flag ) # we have two layers adding to two paths repsectively, # one for training # the other for prediction(averaged model) dropout_conv_layer = ConvFoldingPoolLayer(rng, input = next_layer_dropout_input, filter_shape = filter_shape, k = k, norm_w = args.norm_w, fold = fold_flag, activation = args.conv_activation_unit) # for prediction # sharing weight with dropout layer conv_layer = ConvFoldingPoolLayer(rng, input = next_layer_input, filter_shape = filter_shape, k = k, activation = args.conv_activation_unit, fold = fold_flag, W = dropout_conv_layer.W * (1 - args.dropout_rates[i]), # model averaging b = dropout_conv_layer.b ) dropout_layers.append(dropout_conv_layer) layers.append(conv_layer) # last, the output layer # both dropout and without dropout if sum(args.fold_flags) > 0: n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm / (2**sum(args.fold_flags)) else: n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm print "For output layer, n_in = %d, dropout_rate = %f" %(n_in, args.dropout_rates[-1]) dropout_output_layer = LogisticRegression( rng, input = dropout_layers[-1].output.flatten(2), n_in = n_in, # divided by 2x(how many times are folded) n_out = len(possible_labels) # five sentiment level ) output_layer = LogisticRegression( rng, input = layers[-1].output.flatten(2), n_in = n_in, n_out = len(possible_labels), W = dropout_output_layer.W * (1 - args.dropout_rates[-1]), # sharing the parameters, don't forget b = dropout_output_layer.b ) dropout_layers.append(dropout_output_layer) layers.append(output_layer) ############################### # Error and cost # ############################### # cost and error come from different model! 
dropout_cost = dropout_output_layer.nnl(y) errors = output_layer.errors(y) def prepare_L2_sqr(param_layers, L2_regs): assert len(L2_regs) == len(param_layers) return T.sum([ L2_reg / 2 * ((layer.W if hasattr(layer, "W") else layer.embeddings) ** 2).sum() for L2_reg, layer in zip(L2_regs, param_layers) ]) L2_sqr = prepare_L2_sqr(dropout_layers, args.L2_regs) L2_sqr_no_ebd = prepare_L2_sqr(dropout_layers[1:], args.L2_regs[1:]) if args.use_L2_reg: cost = dropout_cost + L2_sqr cost_no_ebd = dropout_cost + L2_sqr_no_ebd else: cost = dropout_cost cost_no_ebd = dropout_cost ############################### # Parameters to be used # ############################### print "Delay embedding learning by %d epochs" %(args.embedding_learning_delay_epochs) print "param_layers: %r" %dropout_layers param_layers = dropout_layers ############################## # Parameter Update # ############################## print "Using AdaDelta with rho = %f and epsilon = %f" %(args.rho, args.epsilon) params = [param for layer in param_layers for param in layer.params] param_shapes= [param for layer in param_layers for param in layer.param_shapes] param_grads = [T.grad(cost, param) for param in params] # AdaDelta parameter update # E[g^2] # initialized to zero egs = [ theano.shared( value = np.zeros(param_shape, dtype = theano.config.floatX ), borrow = True, name = "Eg:" + param.name ) for param_shape, param in zip(param_shapes, params) ] # E[\delta x^2], initialized to zero exs = [ theano.shared( value = np.zeros(param_shape, dtype = theano.config.floatX ), borrow = True, name = "Ex:" + param.name ) for param_shape, param in zip(param_shapes, params) ] new_egs = [ args.rho * eg + (1 - args.rho) * g ** 2 for eg, g in zip(egs, param_grads) ] delta_x = [ -(T.sqrt(ex + args.epsilon) / T.sqrt(new_eg + args.epsilon)) * g for new_eg, ex, g in zip(new_egs, exs, param_grads) ] new_exs = [ args.rho * ex + (1 - args.rho) * (dx ** 2) for ex, dx in zip(exs, delta_x) ] egs_updates = zip(egs, new_egs) 
exs_updates = zip(exs, new_exs) param_updates = [ (p, p + dx) for dx, g, p in zip(delta_x, param_grads, params) ] updates = egs_updates + exs_updates + param_updates # updates WITHOUT embedding # exclude the embedding parameter egs_updates_no_ebd = zip(egs[1:], new_egs[1:]) exs_updates_no_ebd = zip(exs[1:], new_exs[1:]) param_updates_no_ebd = [ (p, p + dx) for dx, g, p in zip(delta_x, param_grads, params)[1:] ] updates_no_emb = egs_updates_no_ebd + exs_updates_no_ebd + param_updates_no_ebd def make_train_func(cost, updates): return theano.function(inputs = [batch_index], outputs = [cost], updates = updates, givens = { x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size: (batch_index + 1) * args.batch_size] } ) train_model_no_ebd = make_train_func(cost_no_ebd, updates_no_emb) train_model = make_train_func(cost, updates) def make_error_func(x_val, y_val): return theano.function(inputs = [], outputs = errors, givens = { x: x_val, y: y_val }, ) dev_error = make_error_func(dev_set_x, dev_set_y) test_error = make_error_func(test_set_x, test_set_y) ############################# # Debugging purpose code # ############################# # : PARAMETER TUNING NOTE: # some demonstration of the gradient vanishing probelm train_data_at_index = { x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size], } train_data_at_index_with_y = { x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size: (batch_index + 1) * args.batch_size] } if print_config["nnl"]: get_nnl = theano.function( inputs = [batch_index], outputs = dropout_cost, givens = { x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size: (batch_index + 1) * args.batch_size] } ) if print_config["L2_sqr"]: get_L2_sqr = theano.function( inputs = [], outputs = L2_sqr ) 
get_L2_sqr_no_ebd = theano.function( inputs = [], outputs = L2_sqr_no_ebd ) if print_config["grad_abs_mean"]: print_grads = theano.function( inputs = [], outputs = [theano.printing.Print(param.name)( T.mean(T.abs_(param_grad)) ) for param, param_grad in zip(params, param_grads) ], givens = { x: train_set_x, y: train_set_y } ) activations = [ l.output for l in dropout_layers[1:-1] ] weight_grads = [ T.grad(cost, l.W) for l in dropout_layers[1:-1] ] if print_config["activation_hist"]: # turn into 1D array get_activations = theano.function( inputs = [batch_index], outputs = [ val.flatten(1) for val in activations ], givens = train_data_at_index ) if print_config["weight_grad_hist"]: # turn into 1D array get_weight_grads = theano.function( inputs = [batch_index], outputs = [ val.flatten(1) for val in weight_grads ], givens = train_data_at_index_with_y ) if print_config["activation_tracking"]: # get the mean and variance of activations for each conv layer get_activation_mean = theano.function( inputs = [batch_index], outputs = [ T.mean(val) for val in activations ], givens = train_data_at_index ) get_activation_std = theano.function( inputs = [batch_index], outputs = [ T.std(val) for val in activations ], givens = train_data_at_index ) if print_config["weight_grad_tracking"]: # get the mean and variance of activations for each conv layer get_weight_grad_mean = theano.function( inputs = [batch_index], outputs = [ T.mean(g) for g in weight_grads ], givens = train_data_at_index_with_y ) get_weight_grad_std = theano.function( inputs = [batch_index], outputs = [ T.std(g) for g in weight_grads ], givens = train_data_at_index_with_y ) #the training loop patience = args.patience # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) best_validation_loss = np.inf 
best_iter = 0 start_time = time.clock() done_looping = False epoch = 0 nnls = [] L2_sqrs = [] activation_means = [[] for i in xrange(args.conv_layer_n)] activation_stds = [[] for i in xrange(args.conv_layer_n)] weight_grad_means = [[] for i in xrange(args.conv_layer_n)] weight_grad_stds = [[] for i in xrange(args.conv_layer_n)] activation_hist_data = [[] for i in xrange(args.conv_layer_n)] weight_grad_hist_data = [[] for i in xrange(args.conv_layer_n)] train_errors = [] dev_errors = [] try: print "validation_frequency = %d" %validation_frequency while (epoch < args.n_epochs): epoch += 1 print "At epoch {0}".format(epoch) if epoch == (args.embedding_learning_delay_epochs + 1): print "########################" print "Start training embedding" print "########################" # shuffle the training data train_set_x_data = train_set_x.get_value(borrow = True) train_set_y_data = train_set_y.get_value(borrow = True) permutation = np.random.permutation(train_set_x.get_value(borrow=True).shape[0]) train_set_x.set_value(train_set_x_data[permutation]) train_set_y.set_value(train_set_y_data[permutation]) for minibatch_index in xrange(n_train_batches): if epoch >= (args.embedding_learning_delay_epochs + 1): train_cost = train_model(minibatch_index) else: train_cost = train_model_no_ebd(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # train_error_val = np.mean([train_error(i) # for i in xrange(n_train_batches)]) dev_error_val = dev_error() # print "At epoch %d and minibatch %d. \nTrain error %.2f%%\nDev error %.2f%%\n" %( # epoch, # minibatch_index, # train_error_val * 100, # dev_error_val * 100 # ) print "At epoch %d and minibatch %d. 
\nDev error %.2f%%\n" %( epoch, minibatch_index, dev_error_val * 100 ) # train_errors.append(train_error_val) dev_errors.append(dev_error_val) if dev_error_val < best_validation_loss: best_iter = iter #improve patience if loss improvement is good enough if dev_error_val < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = dev_error_val test_error_val = test_error() print( ( ' epoch %i, minibatch %i/%i, test error of' ' best dev error %f %%' ) % ( epoch, minibatch_index + 1, n_train_batches, test_error_val * 100. ) ) print "Dumping model to %s" %(args.model_path) dump_params(params, args.model_path) if (minibatch_index+1) % 50 == 0 or minibatch_index == n_train_batches - 1: print "%d / %d minibatches completed" %(minibatch_index + 1, n_train_batches) if print_config["nnl"]: print "`nnl` for the past 50 minibatches is %f" %(np.mean(np.array(nnls))) nnls = [] if print_config["L2_sqr"]: print "`L2_sqr`` for the past 50 minibatches is %f" %(np.mean(np.array(L2_sqrs))) L2_sqrs = [] ################## # Plotting stuff # ################## if print_config["nnl"]: nnl = get_nnl(minibatch_index) # print "nll for batch %d: %f" %(minibatch_index, nnl) nnls.append(nnl) if print_config["L2_sqr"]: if epoch >= (args.embedding_learning_delay_epochs + 1): L2_sqrs.append(get_L2_sqr()) else: L2_sqrs.append(get_L2_sqr_no_ebd()) if print_config["activation_tracking"]: layer_means = get_activation_mean(minibatch_index) layer_stds = get_activation_std(minibatch_index) for layer_ms, layer_ss, layer_m, layer_s in zip(activation_means, activation_stds, layer_means, layer_stds): layer_ms.append(layer_m) layer_ss.append(layer_s) if print_config["weight_grad_tracking"]: layer_means = get_weight_grad_mean(minibatch_index) layer_stds = get_weight_grad_std(minibatch_index) for layer_ms, layer_ss, layer_m, layer_s in zip(weight_grad_means, weight_grad_stds, layer_means, layer_stds): layer_ms.append(layer_m) 
layer_ss.append(layer_s) if print_config["activation_hist"]: for layer_hist, layer_data in zip(activation_hist_data , get_activations(minibatch_index)): layer_hist += layer_data.tolist() if print_config["weight_grad_hist"]: for layer_hist, layer_data in zip(weight_grad_hist_data , get_weight_grads(minibatch_index)): layer_hist += layer_data.tolist() except: import traceback traceback.print_exc(file = sys.stdout) finally: from plot_util import (plot_hist, plot_track, plot_error_vs_epoch, plt) if print_config["activation_tracking"]: plot_track(activation_means, activation_stds, "activation_tracking") if print_config["weight_grad_tracking"]: plot_track(weight_grad_means, weight_grad_stds, "weight_grad_tracking") if print_config["activation_hist"]: plot_hist(activation_hist_data, "activation_hist") if print_config["weight_grad_hist"]: plot_hist(weight_grad_hist_data, "weight_grad_hist") if print_config["error_vs_epoch"]: train_errors = [0] * len(dev_errors) ax = plot_error_vs_epoch(train_errors, dev_errors, title = ('Best dev score: %f %% ' ' at iter %i with test error %f %%') %( best_validation_loss * 100., best_iter + 1, test_error_val * 100. ) ) if not args.task_signature: plt.show() else: plt.savefig("plots/" + args.task_signature + ".png") end_time = time.clock() print(('Optimization complete. Best validation score of %f %% ' 'obtained at iteration %i, with test performance %f %%') % (best_validation_loss * 100., best_iter + 1, test_error_val * 100.)) # save the result with open(args.output, "a") as f: f.write("%s\t%f\t%f\n" %(args.task_signature, best_validation_loss, test_error_val)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def __init__(self, x, y, batch_size, videos, kernels, pools, n_input,
             n_output, hidden_input, params=None, learning_rate=0.1):
    """Build a conv -> conv -> tanh hidden -> logistic-regression network.

    Two ``ConvLayer`` instances are stacked, their output is flattened to
    a matrix, passed through a tanh ``HiddenLayer`` and classified by a
    two-class ``LogisticRegression``.  Three Theano functions are compiled
    and stored on the instance.

    Args:
        x: symbolic input variable; used as the input of the first conv
            layer and as an input of every compiled function.
        y: symbolic target variable; used for the cost and the error rate.
        batch_size: forwarded to both conv layers.  It is ALSO used as the
            number of hidden units (``n_out`` of the hidden layer and
            ``n_in`` of the classifier).
        videos, kernels, pools, n_input, n_output: indexable objects; item
            0 configures the first conv layer and item 1 the second.  They
            are forwarded positionally to ``ConvLayer`` -- their exact
            meaning is defined there (TODO confirm against ConvLayer).
        hidden_input: ``n_in`` of the hidden layer, i.e. the width of the
            flattened output of the second conv layer.
        params: optional sequence of eight previously trained values.  A
            falsy value (``None`` or empty) builds freshly initialised
            layers.  Otherwise the entries are consumed in the same order
            in which ``self.params`` is assembled below:
            ``[0:2]`` classifier, ``[2:4]`` hidden layer, ``[4:6]`` second
            conv layer, ``[6:8]`` first conv layer (presumably ``W`` then
            ``b`` for each -- verify against the layer classes).
        learning_rate: step size of the plain SGD update.  Defaults to
            0.1, the value that used to be hard-coded.

    Sets:
        self.params: concatenated parameter lists of all four layers.
        self.train_model: ``f(x, y) -> cost``; applies one SGD update.
        self.validate_model: ``f(x, y) -> layer3.errors(y)``.
        self.predict: ``f(x) -> layer3.y_pred``.
    """
    # Fixed seed so that freshly initialised weights are reproducible.
    rng = numpy.random.RandomState(1234)

    # The parenthesised single-argument form prints the same text whether
    # it is parsed as a Python 2 statement or a Python 3 function call.
    print('... building the model')
    sys.stdout.flush()

    if not params:
        # No stored parameters: let every layer initialise itself.
        layer0 = ConvLayer(x, n_input[0], n_output[0], kernels[0],
                           videos[0], pools[0], batch_size, 'L0', rng)

        layer1 = ConvLayer(layer0.output, n_input[1], n_output[1],
                           kernels[1], videos[1], pools[1], batch_size,
                           'L1', rng)

        # Collapse everything after the first axis so the dense layer
        # receives a 2-D matrix.
        layer2_input = layer1.output.flatten(2)

        # Fully connected tanh layer.
        # NOTE(review): the hidden width is tied to ``batch_size``; this
        # looks accidental but is kept, since the shapes of stored
        # parameters (params[0..3]) would depend on it -- confirm.
        layer2 = HiddenLayer(rng, input=layer2_input, n_in=hidden_input,
                             n_out=batch_size, activation=T.tanh)

        # Two-class classifier on top of the hidden layer.
        layer3 = LogisticRegression(input=layer2.output, n_in=batch_size,
                                    n_out=2)
    else:
        # Rebuild the same architecture around the supplied parameters.
        # The extra positional ``True`` is forwarded unchanged; presumably
        # it tells ConvLayer to use the given values -- TODO confirm.
        layer0 = ConvLayer(x, n_input[0], n_output[0], kernels[0],
                           videos[0], pools[0], batch_size, 'L0', rng,
                           True, params[6], params[7])

        layer1 = ConvLayer(layer0.output, n_input[1], n_output[1],
                           kernels[1], videos[1], pools[1], batch_size,
                           'L1', rng, True, params[4], params[5])

        layer2_input = layer1.output.flatten(2)

        layer2 = HiddenLayer(rng, input=layer2_input, n_in=hidden_input,
                             n_out=batch_size, activation=T.tanh,
                             W=params[2], b=params[3])

        layer3 = LogisticRegression(input=layer2.output, n_in=batch_size,
                                    n_out=2, W=params[0], b=params[1])

    # The quantity minimised during training: negative log-likelihood.
    cost = layer3.negative_log_likelihood(y)

    # Top layer first; the ``params`` argument is indexed in this order.
    self.params = (layer3.params + layer2.params +
                   layer1.params + layer0.params)

    # One gradient expression per parameter, in the same order.
    grads = T.grad(cost, self.params)

    # Plain SGD: every parameter moves against its gradient.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(self.params, grads)]

    self.train_model = theano.function([x, y], cost, updates=updates)
    self.validate_model = theano.function(inputs=[x, y],
                                          outputs=layer3.errors(y))
    self.predict = theano.function(inputs=[x], outputs=layer3.y_pred)

    print('... building done')
    sys.stdout.flush()
# Reference implementation built directly from the raw weights and bias.
np_l = LogisticRegression(W, b)

#########################
# THEANO PART
#########################

# Symbolic inputs of the Theano model: feature matrix and label vector.
x_symbol = theano.tensor.dmatrix('x')
y_symbol = theano.tensor.ivector('y')

# The Theano model gets the very same W / b, wrapped as shared variables,
# so both implementations must produce matching numbers.
th_rng = np.random.RandomState(1234)
th_W = theano.shared(value=W, name="W")
th_b = theano.shared(value=b, name="b")
th_l = TheanoLogisticRegression(
    rng=th_rng,
    input=x_symbol,
    n_in=10,
    n_out=5,
    W=th_W,
    b=th_b,
)

# --- negative log-likelihood -------------------------------------------
f1 = theano.function(
    inputs=[x_symbol, y_symbol],
    outputs=th_l.nnl(y_symbol),
)
# Left value comes from the reference model, right value from Theano.
actual, expected = np_l.nnl(x, y), f1(x, y)
assert_matrix_eq(actual, expected, "nnl")

# --- classification error ----------------------------------------------
f2 = theano.function(
    inputs=[x_symbol, y_symbol],
    outputs=th_l.errors(y_symbol),
)
actual, expected = np_l.errors(x, y), f2(x, y)
assert_matrix_eq(actual, expected, "errors")
# Width of the flattened conv output fed to the classifier.
# NOTE(review): the "/ 2" presumably accounts for one folding step, and on
# integer operands it only yields an int under Python 2 division -- confirm.
n_in = filter_shape[0] * k * embed_dm / 2
n_out = 5

# Random classifier weights / bias, cast to Theano's configured float type.
float_type = theano.config.floatX
W_logreg = np.asarray(np.random.rand(n_in, n_out), dtype=float_type)
b_logreg = np.asarray(np.random.rand(n_out), dtype=float_type)

# Logistic-regression layer reading the flattened output of layer2.
flat_features = layer2.output.flatten(2)
shared_W_logreg = theano.shared(value=W_logreg, name="W_logreg")
shared_b_logreg = theano.shared(value=b_logreg, name="b_logreg")
layer3 = LogisticRegression(
    rng=rng,
    input=flat_features,
    n_in=n_in,
    n_out=n_out,
    W=shared_W_logreg,
    b=shared_b_logreg,
)

# Compiled functions that need both the inputs and the labels.
f1 = theano.function(
    inputs=[x_symbol, y_symbol],
    outputs=layer3.nnl(y_symbol),
)
f2 = theano.function(
    inputs=[x_symbol, y_symbol],
    outputs=layer3.errors(y_symbol),
)

# Compiled functions that only need the inputs: class probabilities plus
# the raw outputs of layer1 and layer2.
f3 = theano.function(inputs=[x_symbol], outputs=layer3.p_y_given_x)
f_el = theano.function(inputs=[x_symbol], outputs=layer1.output)
f_cl = theano.function(inputs=[x_symbol], outputs=layer2.output)

#########################
# NUMPY PART            #
#########################


# Empty attribute holder; nothing is defined on it here.
class Params(object):
    pass
# Theano model built around the same W / b as the reference model `np_l`,
# wrapped as shared variables so both sides use identical parameters.
model_rng = np.random.RandomState(1234)
shared_W = theano.shared(value=W, name="W")
shared_b = theano.shared(value=b, name="b")
th_l = TheanoLogisticRegression(
    rng=model_rng,
    input=x_symbol,
    n_in=10,
    n_out=5,
    W=shared_W,
    b=shared_b,
)

# Check 1: negative log-likelihood must agree between both models.
f1 = theano.function(
    inputs=[x_symbol, y_symbol],
    outputs=th_l.nnl(y_symbol),
)
# Reference value first, Theano value second.
actual, expected = np_l.nnl(x, y), f1(x, y)
assert_matrix_eq(actual, expected, "nnl")

# Check 2: the error rate must agree as well.
f2 = theano.function(
    inputs=[x_symbol, y_symbol],
    outputs=th_l.errors(y_symbol),
)
actual, expected = np_l.errors(x, y), f2(x, y)
assert_matrix_eq(actual, expected, "errors")