def main(model='mlp', num_epochs=500):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model.startswith('custom_mlp:'):
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    elif model == 'cnn':
        network = build_cnn(input_var)
    else:
        print("Unrecognized model type %r." % model)
        return

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = T.mean(loss)
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = T.mean(test_loss)
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 128, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 128, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print(" training loss:\t\t{:.6f}".format(train_err / train_batches))
        print(" validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print(" validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, 128, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print(" test accuracy:\t\t{:.2f} %".format(
        test_acc / test_batches * 100))
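
# The training, validation and test loops above rely on an
# iterate_minibatches() helper that is defined elsewhere in the original
# example and is not part of this section. As a reference, a minimal sketch
# (assuming plain NumPy arrays, numpy imported as np at module level, and
# that an incomplete trailing batch is dropped) could look like this:
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    # Yield successive (inputs, targets) mini-batches of size `batchsize`.
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]
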
def main(num_epochs=NUM_EPOCHS):
    print("Building network ...")
    # First, we build the network, starting with an input layer.
    # Recurrent layers expect input of shape
    # (batch size, max sequence length, number of features)
    l_in = lasagne.layers.InputLayer(shape=(N_BATCH, MAX_LENGTH, 2))
    # The network also needs a way to provide a mask for each sequence. We'll
    # use a separate input layer for that. Since the mask only determines
    # which indices are part of the sequence for each batch entry, it is
    # supplied as a matrix of dimensionality (N_BATCH, MAX_LENGTH)
    l_mask = lasagne.layers.InputLayer(shape=(N_BATCH, MAX_LENGTH))
    # We're using a bidirectional network, which means we will combine two
    # RecurrentLayers, one with the backwards=True keyword argument.
    # Setting a value for grad_clipping will clip the gradients in the layer.
    # Setting only_return_final=True makes the layers only return their output
    # for the final time step, which is all we need for this task
    l_forward = lasagne.layers.RecurrentLayer(
        l_in, N_HIDDEN, mask_input=l_mask, grad_clipping=GRAD_CLIP,
        W_in_to_hid=lasagne.init.HeUniform(),
        W_hid_to_hid=lasagne.init.HeUniform(),
        nonlinearity=lasagne.nonlinearities.tanh, only_return_final=True)
    l_backward = lasagne.layers.RecurrentLayer(
        l_in, N_HIDDEN, mask_input=l_mask, grad_clipping=GRAD_CLIP,
        W_in_to_hid=lasagne.init.HeUniform(),
        W_hid_to_hid=lasagne.init.HeUniform(),
        nonlinearity=lasagne.nonlinearities.tanh,
        only_return_final=True, backwards=True)
    # Now, we'll concatenate the outputs to combine them.
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward])
    # Our output layer is a simple dense connection, with 1 output unit
    l_out = lasagne.layers.DenseLayer(
        l_concat, num_units=1, nonlinearity=lasagne.nonlinearities.tanh)

    target_values = T.vector('target_output')

    # lasagne.layers.get_output produces a variable for the output of the net
    network_output = lasagne.layers.get_output(l_out)
    # The network output will have shape (n_batch, 1); let's flatten to get a
    # 1-dimensional vector of predicted values
    predicted_values = T.flatten(network_output)
    # Our cost will be mean-squared error
    cost = T.mean(T.square(predicted_values - target_values))
    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_out)
    # Compute Adagrad updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)
    # Theano functions for training and computing cost
    print("Compiling functions ...")
    start_time = time.time()
    train = theano.function([l_in.input_var, target_values, l_mask.input_var],
                            cost, updates=updates)
    compute_cost = theano.function(
        [l_in.input_var, target_values, l_mask.input_var], cost)
    print("compiling took %f seconds" % (time.time() - start_time))

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val = gen_data()

    print("Training ...")
    try:
        for epoch in range(num_epochs):
            start_time = time.time()
            for _ in range(EPOCH_SIZE):
                X, y, m = gen_data()
                train(X, y, m)
            cost_val = compute_cost(X_val, y_val, mask_val)
            print("Epoch {} validation cost = {}; spent {} seconds".format(
                epoch, cost_val, time.time() - start_time))
    except KeyboardInterrupt:
        pass
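
# gen_data() and the upper-case constants (N_BATCH, MAX_LENGTH, N_HIDDEN,
# GRAD_CLIP, LEARNING_RATE, EPOCH_SIZE, NUM_EPOCHS) are defined elsewhere in
# the original example and are not shown in this section. Purely as an
# illustration of the interface the code above expects, a hypothetical
# generator for a masked-sequence regression task could look like the sketch
# below: it returns a batch of 2-feature sequences padded to MAX_LENGTH, one
# float target per sequence, and the corresponding binary mask. The actual
# task, value ranges, and padding used by the original gen_data() may differ.
def gen_data(min_length=2, max_length=None, n_batch=None):
    max_length = MAX_LENGTH if max_length is None else max_length
    n_batch = N_BATCH if n_batch is None else n_batch
    X = np.zeros((n_batch, max_length, 2), dtype=theano.config.floatX)
    y = np.zeros((n_batch,), dtype=theano.config.floatX)
    mask = np.zeros((n_batch, max_length), dtype=theano.config.floatX)
    for i in range(n_batch):
        length = np.random.randint(min_length, max_length + 1)
        # Feature 0: random values; feature 1: a marker channel selecting
        # which two entries contribute to the target.
        X[i, :length, 0] = np.random.uniform(size=length)
        marked = np.random.choice(length, size=2, replace=False)
        X[i, marked, 1] = 1.0
        # Scale the target into [0, 1] so it stays within the range of the
        # tanh output unit used above.
        y[i] = X[i, marked, 0].sum() / 2.0
        # Mark the valid (non-padded) time steps for this sequence.
        mask[i, :length] = 1.0
    return X, y, mask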