def _pickle_sda(sda_model, directory, filename):
    """ Pickle a model to the file ``filename`` inside ``directory``.

    :type sda_model: SdA
    :param sda_model: the model object to serialize

    :type directory: string
    :param directory: directory the pickle file is written to

    :type filename: string
    :param filename: name of the pickle file
    """
    # The path is joined explicitly instead of os.chdir()-ing into the
    # directory: the working directory of the process is never modified, so a
    # failed open() cannot leave it pointing at the wrong place.
    with open(os.path.join(directory, filename), 'wb') as f:
        cPickle.dump(sda_model, f, protocol=cPickle.HIGHEST_PROTOCOL)


def pretrain(shared_args, private_args):
    """ Pretrain an SdA model for the given number of training epochs, with
    hybrid pretraining on the middle layers, followed by a short fine-tuning
    phase over the full reconstruction.  The model is either initialized from
    scratch, or is reconstructed from a previously pickled model.

    Progress is logged to a time-stamped file created in
    ``shared_args[0]['dir']``.  Nothing is returned.

    :type shared_args: sequence
    :param shared_args: sequence whose first element is the dict containing
    all the arguments common to both models (only ``shared_args[0]`` is read).
    Keys read here: 'dir', 'input', 'offset', 'batch_size', 'corruption',
    'layertype', 'pretrain_lr', 'lr_decay', 'pretraining_epochs', 'momentum',
    'maxnorm', 'loss', 'sparse_init', 'opt_method'.

    :type private_args: dict
    :param private_args: dict containing all the arguments specific to each
    model spawned off this first process.  Keys read here: 'gpu', 'arch' and
    the optional 'restore' (pickle to start from) and 'save' (pickle to write).
    """

    # Import sandbox.cuda to bind the specified GPU to this subprocess
    # then import the remaining theano and model modules.
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(private_args['gpu'])

    import theano
    import theano.tensor as T
    from SdA import SdA

    shared_args_dict = shared_args[0]
    model_dir = shared_args_dict['dir']
    batch_size = shared_args_dict['batch_size']
    momentum = shared_args_dict['momentum']

    # Number of hybrid pretraining epochs per middle layer, and of final
    # fine-tuning epochs.
    n_hybrid_epochs = 20
    n_finetune_epochs = 20

    today = datetime.today()
    day = str(today.date())
    hour = str(today.time())
    arch_list = get_arch_list(private_args)
    corruption_list = [shared_args_dict['corruption'] for _ in arch_list]
    layer_types = parse_layer_type(shared_args_dict['layertype'], len(arch_list))

    output_filename = ("hybrid_pretraining_sda_" + "_".join(layer_types)
                       + private_args['arch'] + "." + day + "." + hour)
    output_file = open(os.path.join(model_dir, output_filename), 'w')

    # Everything below runs under try/finally so the log file is closed on
    # every exit path, including early returns and exceptions.
    try:
        print >> output_file, "Run on " + str(datetime.now())

        # Get the training data sample from the input file
        data_set_file = openFile(str(shared_args_dict['input']), mode='r')
        try:
            datafiles = extract_unlabeled_chunkrange(data_set_file, num_files=30,
                                                     offset=shared_args_dict['offset'])
            if datafiles is None:
                print("No data was returned, exiting.")
                return
            train_set_x = load_data_unlabeled(datafiles)

            # DEBUG: get validation set too
            validation_datafiles = extract_unlabeled_chunkrange(data_set_file, num_files=5,
                                                                offset=shared_args_dict['offset'] + 30)
            if validation_datafiles is None:
                print("No validation data was returned, exiting.")
                return
            valid_set_x = load_data_unlabeled(validation_datafiles)
        finally:
            data_set_file.close()

        # compute number of minibatches for training
        n_train_batches, n_features = train_set_x.get_value(borrow=True).shape
        n_train_batches //= batch_size

        # numpy random generator
        numpy_rng = numpy.random.RandomState(89677)

        # Set the initial value of the learning rate
        learning_rate = theano.shared(numpy.asarray(shared_args_dict['pretrain_lr'],
                                                    dtype=theano.config.floatX))

        # Check if we can restore from a previously trained model,
        # otherwise construct a new SdA
        if 'restore' in private_args:
            print >> output_file, 'Unpickling the model from %s ...' % (private_args['restore'])
            # NOTE: unpickling executes arbitrary code; only restore from
            # pickle files that come from a trusted source.
            with open(os.path.join(model_dir, private_args['restore']), 'rb') as f:
                sda_model = cPickle.load(f)
        else:
            print('... building the model')
            sda_model = SdA(numpy_rng=numpy_rng, n_ins=n_features,
                            hidden_layers_sizes=arch_list,
                            corruption_levels=corruption_list,
                            layer_types=layer_types,
                            loss=shared_args_dict['loss'],
                            n_outs=-1,
                            sparse_init=shared_args_dict['sparse_init'],
                            opt_method=shared_args_dict['opt_method'])

        #########################
        # PRETRAINING THE MODEL #
        #########################
        print('... getting the pretraining functions')
        pretraining_fns = sda_model.pretraining_functions(train_set_x=train_set_x,
                                                          batch_size=batch_size,
                                                          learning_rate=learning_rate,
                                                          method='cm')

        print('... getting the hybrid training functions')
        hybrid_pretraining_fns = sda_model.build_finetune_limited_reconstruction(train_set_x=train_set_x,
                                                                                 batch_size=batch_size,
                                                                                 learning_rate=learning_rate,
                                                                                 method='cm')

        # DEBUG: get full finetuning theano function
        # get the training, validation function for the model
        datasets = (train_set_x, valid_set_x)

        print('... getting the finetuning functions')
        finetune_train_fn, validate_model = sda_model.build_finetune_full_reconstruction(
            datasets=datasets, batch_size=batch_size,
            learning_rate=learning_rate,
            method='cm')

        # DEBUG: should only have n_layers - 2 hybrid pretraining functions
        assert len(hybrid_pretraining_fns) == sda_model.n_layers - 2

        print('... writing meta-data to output file')
        # Shared arguments are applied last, so they win on a key clash.
        metadict = {'n_train_batches': n_train_batches}
        metadict.update(shared_args_dict)
        write_metadata(output_file, metadict)

        print('... pre-training the model')
        start_time = time.clock()

        # Get corruption levels from the SdA.
        corruption_levels = sda_model.corruption_levels

        # Function to decrease the learning rate
        decay_learning_rate = theano.function(inputs=[], outputs=learning_rate,
                                              updates={learning_rate: learning_rate * shared_args_dict['lr_decay']})

        # Function to reset the learning rate
        lr_val = T.scalar('original_lr')
        reset_learning_rate = theano.function(inputs=[lr_val], outputs=learning_rate,
                                              updates={learning_rate: lr_val})

        # Set up functions for max norm regularization
        apply_max_norm_regularization = sda_model.max_norm_regularization()

        for i in xrange(sda_model.n_layers):

            for epoch in xrange(shared_args_dict['pretraining_epochs']):
                # go through the training set
                c = []
                for batch_index in xrange(n_train_batches):
                    c.append(pretraining_fns[i](index=batch_index,
                                                corruption=corruption_levels[i],
                                                momentum=momentum))

                print >> output_file, 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
                print >> output_file, numpy.mean(c)
                print >> output_file, learning_rate.get_value(borrow=True)
                decay_learning_rate()
                apply_max_norm_regularization(norm_limit=shared_args_dict['maxnorm'])

            # Do hybrid pretraining only on the middle layer(s); the hybrid
            # functions are indexed from the second layer, hence i - 1.
            if 0 < i < sda_model.n_layers - 1:
                for h_epoch in xrange(n_hybrid_epochs):
                    hybrid_c = []
                    for batch_index in xrange(n_train_batches):
                        hybrid_c.append(hybrid_pretraining_fns[i - 1](index=batch_index,
                                                                      momentum=momentum))
                    print >> output_file, "Hybrid pre-training on layers %i and below, epoch %d, cost" % (i, h_epoch),
                    print >> output_file, numpy.mean(hybrid_c)

            # Reset the learning rate.  The value is cast to floatX so that it
            # matches the dtype the shared variable was created with.
            reset_learning_rate(numpy.asarray(shared_args_dict['pretrain_lr'],
                                              dtype=theano.config.floatX))

            # NOTE(review): the model is assumed to be checkpointed after each
            # layer; the original formatting was lost -- confirm placement.
            if 'save' in private_args:
                print >> output_file, 'Pickling the model...'
                _pickle_sda(sda_model, model_dir, private_args['save'])

        print('... finetuning with final layer')
        best_validation_loss = numpy.inf
        for f_epoch in xrange(n_finetune_epochs):
            for minibatch_index in xrange(n_train_batches):
                minibatch_avg_cost = finetune_train_fn(minibatch_index, momentum)

                # DEBUG: monitor the training error
                print >> output_file, ('Fine-tuning epoch %i, minibatch %i/%i, training error %f ' %
                                       (f_epoch, minibatch_index + 1, n_train_batches,
                                        minibatch_avg_cost))

                # apply max-norm regularization
                # NOTE(review): assumed to run after every mini-batch update;
                # the original formatting was lost -- confirm placement.
                apply_max_norm_regularization(shared_args_dict['maxnorm'])

            # validate every epoch
            validation_losses = validate_model()
            this_validation_loss = numpy.mean(validation_losses)

            # save best model that achieved this best loss
            if this_validation_loss < best_validation_loss:
                # Record the new best; without this every epoch would count as
                # an improvement over the initial infinity.
                best_validation_loss = this_validation_loss
                # 'save' is optional, exactly as in the pretraining phase.
                if 'save' in private_args:
                    print >> output_file, 'Pickling the model...'
                    _pickle_sda(sda_model, model_dir, private_args['save'])

            # n_train_batches is reported as the mini-batch count: it equals
            # the last mini-batch number and is also defined when there are
            # no mini-batches at all.
            print >> output_file, ('epoch %i, minibatch %i/%i, validation error %f ' %
                                   (f_epoch, n_train_batches, n_train_batches,
                                    this_validation_loss))

        end_time = time.clock()

        print >> output_file, ('The hybrid training code for file ' +
                               os.path.split(__file__)[1] +
                               ' ran for %.2fm' % ((end_time - start_time) / 60.))
    finally:
        output_file.close()
def _report_param_mismatch(output_file, header, pickled_param, fresh_param):
    """ Write the norm and values of a pickled / fresh parameter pair to the log.

    :type output_file: file
    :param output_file: open log file the report is written to

    :type header: string
    :param header: line describing which parameters differ

    :type pickled_param: shared variable
    :param pickled_param: parameter taken from the unpickled model

    :type fresh_param: shared variable
    :param fresh_param: the matching parameter from the in-memory model
    """
    print >> output_file, header
    # Each line pairs a label with the parameter it actually describes, so the
    # "fresh" lines can no longer report the pickled values by mistake.
    for label, param in (("pickled", pickled_param), ("fresh", fresh_param)):
        value = param.get_value()
        print >> output_file, "Norm for " + label + " dA " + param.name + ": "
        print >> output_file, norm(value)
        print >> output_file, "Values for " + label + " dA " + param.name + ": "
        print >> output_file, numpy.array_repr(value)


def test_restrict_norm_SdA(num_epochs=10, pretrain_lr=0.00001, lr_decay=0.98, batch_size=20):
    """ Pretrain an SdA model for the given number of training epochs, applying norm restrictions on the W matrices.
    Try ReLU units, since their weights seem to blow up on this data set.

    After pretraining, the model is pickled to ``options.savefile``, read back,
    and the weights and biases of every dA layer are compared against the
    in-memory model; any mismatch is written to the log file.  The directory,
    input file, corruption level, norm limit and save file are read from the
    module-level ``options`` object.

    :type num_epochs: int
    :param num_epochs: number of epoch to do pretraining

    :type pretrain_lr: float
    :param pretrain_lr: learning rate to be used during pre-training

    :type lr_decay: float
    :param lr_decay: factor the learning rate is multiplied by after each epoch

    :type batch_size: int
    :param batch_size: train in mini-batches of this size
    """

    layer_types = ['ReLU', 'ReLU']
    today = datetime.today()
    day = str(today.date())
    hour = str(today.time())
    output_filename = "test_max_norm_sda_." + '_'.join(layer_types) + day + "." + hour
    # The path is joined explicitly instead of os.chdir()-ing into the output
    # directory, so the working directory of the process is never modified.
    output_file = open(os.path.join(options.dir, output_filename), 'w')

    # Everything below runs under try/finally so the log file is closed on
    # every exit path, including early returns and exceptions.
    try:
        print >> output_file, "Run on " + str(datetime.now())

        # Get the training data sample from the input file
        data_set_file = openFile(str(options.inputfile), mode='r')
        try:
            datafiles = extract_unlabeled_chunkrange(data_set_file, num_files=10)
            if datafiles is None:
                print("No data was returned, exiting.")
                return
            # NOTE(review): presumably restricts the loaded columns to a
            # feature range -- confirm against load_data_unlabeled.
            train_set_x = load_data_unlabeled(datafiles, features=(5, 20))
        finally:
            data_set_file.close()

        # compute number of minibatches for training
        n_train_batches, n_features = train_set_x.get_value(borrow=True).shape
        n_train_batches //= batch_size

        # numpy random generator
        numpy_rng = numpy.random.RandomState(89677)
        print('... building the model')

        # Set the initial value of the learning rate
        learning_rate = theano.shared(numpy.asarray(pretrain_lr,
                                                    dtype=theano.config.floatX))

        # Function to decrease the learning rate
        decay_learning_rate = theano.function(inputs=[], outputs=learning_rate,
                                              updates={learning_rate: learning_rate * lr_decay})

        sda_model = SdA(numpy_rng=numpy_rng, n_ins=n_features,
                        hidden_layers_sizes=[5, 5],
                        corruption_levels=[0.25, 0.25],
                        layer_types=layer_types)

        #########################
        # PRETRAINING THE MODEL #
        #########################
        print('... getting the pretraining functions')
        pretraining_fns = sda_model.pretraining_functions(train_set_x=train_set_x,
                                                          batch_size=batch_size,
                                                          learning_rate=learning_rate)

        print('... getting the max-norm regularization functions')
        max_norm_regularization_fns = sda_model.max_norm_regularization()

        print('... pre-training the model')
        start_time = time.clock()
        ## Pre-train layer-wise
        # NOTE(review): pretraining uses options.corruption, not the 0.25
        # levels the model was constructed with -- confirm this is intended.
        corruption_levels = [float(options.corruption), float(options.corruption)]
        for i in xrange(sda_model.n_layers):

            for epoch in xrange(num_epochs):
                # go through the training set
                c = []
                # Defined up front so the log line below still works when
                # there are no mini-batches.
                scale = None
                for batch_index in xrange(n_train_batches):
                    c.append(pretraining_fns[i](index=batch_index,
                                                corruption=corruption_levels[i]))
                    # regularize weights here
                    # NOTE(review): assumed to run after every mini-batch;
                    # the original formatting was lost -- confirm placement.
                    scale = max_norm_regularization_fns[i](norm_limit=options.norm_limit)
                print >> output_file, 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
                print >> output_file, numpy.mean(c)
                print >> output_file, 'Learning rate '
                print >> output_file, learning_rate.get_value(borrow=True)
                print >> output_file, 'Scale ', scale
                decay_learning_rate()

        end_time = time.clock()

        # Each layer is trained for num_epochs epochs, so that is the number
        # reported (the message previously reported half of it).
        print >> output_file, ('Pretraining time for file ' +
                               os.path.split(__file__)[1] +
                               ' was %.2fm to go through %i epochs' % (((end_time - start_time) / 60.), num_epochs))

        # Pickle the SdA
        print >> output_file, 'Pickling the model...'
        with open(options.savefile, 'wb') as f:
            cPickle.dump(sda_model, f, protocol=cPickle.HIGHEST_PROTOCOL)

        # Unpickle the SdA (the file was written by this process just above)
        print >> output_file, 'Unpickling the model...'
        with open(options.savefile, 'rb') as f:
            pickled_sda = cPickle.load(f)

        # Test that the W-matrices and biases for the dA layers in sda are all close to the W-matrices
        # and biases freshly unpickled
        for i in xrange(pickled_sda.n_layers):
            pickled_dA_params = pickled_sda.dA_layers[i].get_params()
            fresh_dA_params = sda_model.dA_layers[i].get_params()
            if not numpy.allclose(pickled_dA_params[0].get_value(), fresh_dA_params[0].get_value()):
                _report_param_mismatch(output_file,
                                       "numpy says that Ws in layer %i are not close" % (i),
                                       pickled_dA_params[0], fresh_dA_params[0])
            if not numpy.allclose(pickled_dA_params[1].get_value(), fresh_dA_params[1].get_value()):
                _report_param_mismatch(output_file,
                                       "numpy says that the biases in layer %i are not close" % (i),
                                       pickled_dA_params[1], fresh_dA_params[1])
    finally:
        output_file.close()