# -*- coding: utf-8 -*-
# Training scripts for (virtual) adversarial training on MNIST (Python 2 / Theano).
# Third-party imports used below; the repo-local names referenced in these
# functions (load_data, costs, optimizers, FNN_MNIST, make_sure_path_exists,
# load_mnist_for_test, load_mnist_for_validation, mlp, mlp_ss, ADAM) are
# assumed to be importable from this package.
import sys
import cPickle
from collections import OrderedDict

import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams


def train(args):
    print args

    numpy.random.seed(int(args['--seed']))

    if args['--validation']:
        dataset = load_data.load_mnist_for_validation(
            n_v=int(args['--num_validation_samples']))
    else:
        dataset = load_data.load_mnist_full()
    x_train, t_train = dataset[0]
    x_test, t_test = dataset[1]

    layer_sizes = [int(layer_size)
                   for layer_size in args['--layer_sizes'].split('-')]
    model = FNN_MNIST(layer_sizes=layer_sizes)

    x = T.matrix()
    t = T.ivector()

    # Select the training cost.
    if args['--cost_type'] == 'MLE':
        cost = costs.cross_entropy_loss(x=x, t=t,
                                        forward_func=model.forward_train)
    elif args['--cost_type'] == 'L2':
        cost = costs.cross_entropy_loss(x=x, t=t,
                                        forward_func=model.forward_train) \
            + costs.weight_decay(params=model.params,
                                 coeff=float(args['--lamb']))
    elif args['--cost_type'] == 'AT':
        cost = costs.adversarial_training(
            x, t, model.forward_train, 'CE',
            epsilon=float(args['--epsilon']),
            lamb=float(args['--lamb']),
            norm_constraint=args['--norm_constraint'],
            forward_func_for_generating_adversarial_examples=model.forward_no_update_batch_stat)
    elif args['--cost_type'] == 'VAT':
        cost = costs.virtual_adversarial_training(
            x, t, model.forward_train, 'CE',
            epsilon=float(args['--epsilon']),
            norm_constraint=args['--norm_constraint'],
            num_power_iter=int(args['--num_power_iter']),
            forward_func_for_generating_adversarial_examples=model.forward_no_update_batch_stat)
    elif args['--cost_type'] == 'VAT_finite_diff':
        cost = costs.virtual_adversarial_training_finite_diff(
            x, t, model.forward_train, 'CE',
            epsilon=float(args['--epsilon']),
            norm_constraint=args['--norm_constraint'],
            num_power_iter=int(args['--num_power_iter']),
            forward_func_for_generating_adversarial_examples=model.forward_no_update_batch_stat)

    nll = costs.cross_entropy_loss(x=x, t=t, forward_func=model.forward_test)
    error = costs.error(x=x, t=t, forward_func=model.forward_test)

    optimizer = optimizers.ADAM(cost=cost, params=model.params,
                                alpha=float(args['--initial_learning_rate']))

    # Per-mini-batch Theano functions, indexed by batch number.
    index = T.iscalar()
    batch_size = int(args['--batch_size'])
    f_train = theano.function(
        inputs=[index], outputs=cost, updates=optimizer.updates,
        givens={
            x: x_train[batch_size * index:batch_size * (index + 1)],
            t: t_train[batch_size * index:batch_size * (index + 1)]})
    f_nll_train = theano.function(
        inputs=[index], outputs=nll,
        givens={
            x: x_train[batch_size * index:batch_size * (index + 1)],
            t: t_train[batch_size * index:batch_size * (index + 1)]})
    f_nll_test = theano.function(
        inputs=[index], outputs=nll,
        givens={
            x: x_test[batch_size * index:batch_size * (index + 1)],
            t: t_test[batch_size * index:batch_size * (index + 1)]})
    f_error_train = theano.function(
        inputs=[index], outputs=error,
        givens={
            x: x_train[batch_size * index:batch_size * (index + 1)],
            t: t_train[batch_size * index:batch_size * (index + 1)]})
    f_error_test = theano.function(
        inputs=[index], outputs=error,
        givens={
            x: x_test[batch_size * index:batch_size * (index + 1)],
            t: t_test[batch_size * index:batch_size * (index + 1)]})

    # Multiplicative learning-rate decay, applied once per epoch.
    f_lr_decay = theano.function(
        inputs=[], outputs=optimizer.alpha,
        updates={optimizer.alpha:
                 theano.shared(numpy.array(args['--learning_rate_decay']).astype(
                     theano.config.floatX)) * optimizer.alpha})

    # Shuffle the shared training set in place.
    randix = RandomStreams(seed=numpy.random.randint(1234)).permutation(
        n=x_train.shape[0])
    update_permutation = OrderedDict()
    update_permutation[x_train] = x_train[randix]
    update_permutation[t_train] = t_train[randix]
    f_permute_train_set = theano.function(inputs=[], outputs=x_train,
                                          updates=update_permutation)

    statuses = {}
    statuses['nll_train'] = []
    statuses['error_train'] = []
    statuses['nll_test'] = []
    statuses['error_test'] = []

    n_train = x_train.get_value().shape[0]
    n_test = x_test.get_value().shape[0]

    # Evaluate once before training starts (reported as epoch -1).
    sum_nll_train = numpy.sum(numpy.array(
        [f_nll_train(i) for i in xrange(n_train / batch_size)])) * batch_size
    sum_error_train = numpy.sum(numpy.array(
        [f_error_train(i) for i in xrange(n_train / batch_size)]))
    sum_nll_test = numpy.sum(numpy.array(
        [f_nll_test(i) for i in xrange(n_test / batch_size)])) * batch_size
    sum_error_test = numpy.sum(numpy.array(
        [f_error_test(i) for i in xrange(n_test / batch_size)]))
    statuses['nll_train'].append(sum_nll_train / n_train)
    statuses['error_train'].append(sum_error_train)
    statuses['nll_test'].append(sum_nll_test / n_test)
    statuses['error_test'].append(sum_error_test)
    print "[Epoch]", str(-1)
    print "nll_train : ", statuses['nll_train'][-1], \
        "error_train : ", statuses['error_train'][-1], \
        "nll_test : ", statuses['nll_test'][-1], \
        "error_test : ", statuses['error_test'][-1]

    print "training..."
    make_sure_path_exists("./trained_model")
    for epoch in xrange(int(args['--num_epochs'])):
        # Checkpoint the training statistics before each epoch.
        cPickle.dump(
            (statuses, args),
            open('./trained_model/' + 'tmp-' + args['--save_filename'], 'wb'),
            cPickle.HIGHEST_PROTOCOL)

        f_permute_train_set()

        ### update parameters ###
        [f_train(i) for i in xrange(n_train / batch_size)]
        #########################

        sum_nll_train = numpy.sum(numpy.array(
            [f_nll_train(i) for i in xrange(n_train / batch_size)])) * batch_size
        sum_error_train = numpy.sum(numpy.array(
            [f_error_train(i) for i in xrange(n_train / batch_size)]))
        sum_nll_test = numpy.sum(numpy.array(
            [f_nll_test(i) for i in xrange(n_test / batch_size)])) * batch_size
        sum_error_test = numpy.sum(numpy.array(
            [f_error_test(i) for i in xrange(n_test / batch_size)]))
        statuses['nll_train'].append(sum_nll_train / n_train)
        statuses['error_train'].append(sum_error_train)
        statuses['nll_test'].append(sum_nll_test / n_test)
        statuses['error_test'].append(sum_error_test)
        print "[Epoch]", str(epoch)
        print "nll_train : ", statuses['nll_train'][-1], \
            "error_train : ", statuses['error_train'][-1], \
            "nll_test : ", statuses['nll_test'][-1], \
            "error_test : ", statuses['error_test'][-1]

        f_lr_decay()

    ### finetune batch stat ###
    f_finetune = theano.function(
        inputs=[index],
        outputs=model.forward_for_finetuning_batch_stat(x),
        givens={x: x_train[batch_size * index:batch_size * (index + 1)]})
    [f_finetune(i) for i in xrange(n_train / batch_size)]

    sum_nll_train = numpy.sum(numpy.array(
        [f_nll_train(i) for i in xrange(n_train / batch_size)])) * batch_size
    sum_error_train = numpy.sum(numpy.array(
        [f_error_train(i) for i in xrange(n_train / batch_size)]))
    sum_nll_test = numpy.sum(numpy.array(
        [f_nll_test(i) for i in xrange(n_test / batch_size)])) * batch_size
    sum_error_test = numpy.sum(numpy.array(
        [f_error_test(i) for i in xrange(n_test / batch_size)]))
    statuses['nll_train'].append(sum_nll_train / n_train)
    statuses['error_train'].append(sum_error_train)
    statuses['nll_test'].append(sum_nll_test / n_test)
    statuses['error_test'].append(sum_error_test)
    print "[after finetuning]"
    print "nll_train : ", statuses['nll_train'][-1], \
        "error_train : ", statuses['error_train'][-1], \
        "nll_test : ", statuses['nll_test'][-1], \
        "error_test : ", statuses['error_test'][-1]
    ###########################

    make_sure_path_exists("./trained_model")
    cPickle.dump((model, statuses, args),
                 open('./trained_model/' + args['--save_filename'], 'wb'),
                 cPickle.HIGHEST_PROTOCOL)
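# ---------------------------------------------------------------------------
# Example invocation of train() (a minimal sketch). The function expects a
# docopt-style dict whose keys match the flags read above; the values below
# are illustrative assumptions, not settings prescribed by this code, so the
# call itself is left commented out.
example_args = {
    '--seed': '1',
    '--validation': False,
    '--num_validation_samples': '10000',
    '--layer_sizes': '784-1200-1200-10',
    '--cost_type': 'VAT',
    '--lamb': '1.0',
    '--epsilon': '2.0',
    '--norm_constraint': 'L2',
    '--num_power_iter': '1',
    '--initial_learning_rate': '0.002',
    '--learning_rate_decay': '0.9',
    '--batch_size': '100',
    '--num_epochs': '100',
    '--save_filename': 'mnist_vat.pkl',
}
# train(example_args)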
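# ---------------------------------------------------------------------------
# The 'VAT_finite_diff' branch above delegates to
# costs.virtual_adversarial_training_finite_diff. As a rough illustration of
# the underlying idea (not the repo's implementation), the sketch below runs
# the finite-difference power iteration on a toy softmax-regression model,
# where the KL gradient has a closed form; all names and values here are
# illustrative assumptions.
import numpy as np


def _softmax(z):
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)


def _normalize_rows(d, eps=1e-8):
    return d / (np.sqrt((d ** 2).sum(axis=1, keepdims=True)) + eps)


def vat_direction_finite_diff(x, W, b, xi=1e-6, num_power_iter=1,
                              rng=np.random):
    # p(y|x) under the fixed model.
    p = _softmax(x.dot(W) + b)
    # Start from a random unit direction per example.
    d = _normalize_rows(rng.standard_normal(x.shape))
    for _ in range(num_power_iter):
        # For softmax regression, grad_r KL(p(.|x) || p(.|x+r)) = (q - p) W^T
        # with q = p(y|x+r). The gradient vanishes at r = 0, so evaluating it
        # at r = xi*d approximates xi * (H d), and normalizing performs one
        # power-method step on the Hessian H of the KL divergence.
        q = _softmax((x + xi * d).dot(W) + b)
        d = _normalize_rows((q - p).dot(W.T))
    return d  # the virtual adversarial perturbation is epsilon * d


# Toy usage: five random inputs and a random 784 -> 10 linear model;
# epsilon = 2.0 plays the role of the L2 norm constraint above.
_rng = np.random.RandomState(0)
_x = _rng.randn(5, 784)
_W = 0.01 * _rng.randn(784, 10)
_r_adv = 2.0 * vat_direction_finite_diff(_x, _W, np.zeros(10), rng=_rng)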
def train_mlp(
        n_l,                          # Number of labeled samples.
        layer_sizes,                  # Layer sizes, e.g. [784, 1200, 1200, 10]:
                                      # 784 input units, two hidden layers of
                                      # 1200 units each, and 10 output units.
        activations,                  # Specification of activation functions.
        initial_model_learning_rate,  # Initial learning rate of ADAM.
        learning_rate_decay,          # Learning rate decay of ADAM.
        n_epochs,                     # Number of training epochs.
        n_it_batches,                 # Number of mini-batch gradient updates per epoch.
        m_batch_size=100,             # Mini-batch size.
        m_ul_batch_size=250,          # Mini-batch size for the LDS term
                                      # (semi-supervised learning only).
        cost_type='vat',              # Cost type: 'mle' is no regularization,
                                      # 'at' is adversarial training, 'vat' is
                                      # virtual adversarial training (ours).
        lamb=1.0,                     # Balance parameter.
        epsilon=0.05,                 # Norm-constraint parameter.
        num_power_iter=1,             # Number of power-method iterations.
        norm_constraint='L2',         # Norm constraint: 'max' is the
                                      # L-infinity norm, 'L2' is the L2 norm.
        random_seed=1,                # Random seed.
        semi_supervised=False,        # Run the semi-supervised experiment or not.
        n_v=10000,                    # Number of validation samples.
        full_train=False,             # Train on all training samples
                                      # (and evaluate on test samples).
        monitoring_cost_during_training=False  # Monitor the transition of the
                                               # cost during training.
):
    sys.setrecursionlimit(10000)

    # set random stream
    rng = numpy.random.RandomState(random_seed)

    # load mnist dataset
    if full_train and not semi_supervised:
        dataset = load_mnist_for_test()
        train_set_x, train_set_y = dataset[0]
    else:
        dataset = load_mnist_for_validation(n_l=n_l, n_v=n_v, rng=rng)
        train_set_x, train_set_y, ul_train_set_x = dataset[0]
    valid_set_x, valid_set_y = dataset[1]
    n_train_batches = numpy.ceil(
        train_set_x.get_value(borrow=True).shape[0] / numpy.float(m_batch_size))
    n_valid_batches = numpy.ceil(
        valid_set_x.get_value(borrow=True).shape[0] / numpy.float(m_batch_size))

    print '... building the model'

    # define a classifier
    x = T.matrix('x')
    y = T.ivector('y')
    if semi_supervised:
        n_ul_train_batches = numpy.ceil(
            ul_train_set_x.get_value(borrow=True).shape[0] / numpy.float(m_ul_batch_size))
        ul_x = T.matrix('ul_x')
        classifier = mlp_ss.MLP_SS(rng=rng,
                                   input=x,
                                   ul_input=ul_x,
                                   layer_sizes=layer_sizes,
                                   activations=activations,
                                   epsilon=epsilon,
                                   lamb=lamb,
                                   m_batch_size=m_batch_size,
                                   m_ul_batch_size=m_ul_batch_size,
                                   num_power_iter=num_power_iter,
                                   norm_constraint=norm_constraint)
    else:
        classifier = mlp.MLP(rng=rng,
                             input=x,
                             layer_sizes=layer_sizes,
                             activations=activations,
                             epsilon=epsilon,
                             lamb=lamb,
                             m_batch_size=m_batch_size,
                             num_power_iter=num_power_iter,
                             norm_constraint=norm_constraint)

    # define a training cost
    if cost_type == 'mle':
        cost = classifier.cost(y)
    elif cost_type == 'vat':
        cost = classifier.cost_vat(y)
    elif cost_type == 'at':
        cost = classifier.cost_at(y)
    else:
        raise ValueError('cost_type:' + cost_type + ' is not defined')

    # define a schedule of the learning rate
    model_learning_rate = theano.shared(
        numpy.asarray(initial_model_learning_rate, dtype=theano.config.floatX))
    decay_model_learning_rate = theano.function(
        inputs=[], outputs=model_learning_rate,
        updates={model_learning_rate: model_learning_rate * learning_rate_decay})

    updates = OrderedDict()
    updates = ADAM(classifier, cost, model_learning_rate, updates)
    updates.update(classifier.m_v_updates_during_training)

    # define permutation of the train set
    def update_train_ind(x, y, ind):
        upd = OrderedDict()
        upd[x] = x[ind]
        if y is not None:
            upd[y] = y[ind]
        return upd, upd[x][0]

    ind = T.ivector()
    upd_tr_ind, n_x_0 = update_train_ind(train_set_x, train_set_y, ind)
    permute_train_set = theano.function(inputs=[ind], outputs=n_x_0,
                                        updates=upd_tr_ind)

    # compile the optimization function
    index = T.lscalar()
    if semi_supervised:
        upd_ul_tr_ind, n_ul_x_0 = update_train_ind(ul_train_set_x, None, ind)
        permute_ul_train_set = theano.function(inputs=[ind], outputs=n_ul_x_0,
                                               updates=upd_ul_tr_ind)
        ul_index = T.lscalar()
        optimize = theano.function(
            inputs=[index, ul_index], outputs=cost, updates=updates,
            givens={
                x: train_set_x[m_batch_size * index:m_batch_size * (index + 1)],
                y: train_set_y[m_batch_size * index:m_batch_size * (index + 1)],
                ul_x: ul_train_set_x[m_ul_batch_size * ul_index:m_ul_batch_size * (ul_index + 1)]},
            on_unused_input='warn')
    else:
        optimize = theano.function(
            inputs=[index], outputs=cost, updates=updates,
            givens={
                x: train_set_x[m_batch_size * index:m_batch_size * (index + 1)],
                y: train_set_y[m_batch_size * index:m_batch_size * (index + 1)]},
            on_unused_input='warn')

    # compile functions for monitoring error and cost
    training_error = theano.function(
        inputs=[index], outputs=classifier.errors(y),
        givens={
            x: train_set_x[m_batch_size * index:m_batch_size * (index + 1)],
            y: train_set_y[m_batch_size * index:m_batch_size * (index + 1)]})
    validation_error = theano.function(
        inputs=[index], outputs=classifier.errors(y),
        givens={
            x: valid_set_x[m_batch_size * index:m_batch_size * (index + 1)],
            y: valid_set_y[m_batch_size * index:m_batch_size * (index + 1)]})
    train_nll = theano.function(
        inputs=[index], outputs=classifier.neg_log_likelihood(y),
        givens={
            x: train_set_x[m_batch_size * index:m_batch_size * (index + 1)],
            y: train_set_y[m_batch_size * index:m_batch_size * (index + 1)]})
    valid_nll = theano.function(
        inputs=[index], outputs=classifier.neg_log_likelihood(y),
        givens={
            x: valid_set_x[m_batch_size * index:m_batch_size * (index + 1)],
            y: valid_set_y[m_batch_size * index:m_batch_size * (index + 1)]})

    num_power_iter_for_evaluation_LDS = 10  # number of power iterations for evaluating LDS
    train_LDS = theano.function(
        inputs=[index],
        outputs=classifier.LDS(num_power_iter=num_power_iter_for_evaluation_LDS),
        givens={x: train_set_x[m_batch_size * index:m_batch_size * (index + 1)]})
    valid_LDS = theano.function(
        inputs=[index],
        outputs=classifier.LDS(num_power_iter=num_power_iter_for_evaluation_LDS),
        givens={x: valid_set_x[m_batch_size * index:m_batch_size * (index + 1)]})

    print '... training'

    epoch_counter = 0
    l_index = 0
    ul_index = 0
    train_errors = list()
    valid_errors = list()
    train_nlls = list()
    valid_nlls = list()
    train_LDSs = list()
    valid_LDSs = list()
    train_LDSs_std = list()
    valid_LDSs_std = list()

    def monitor_error():
        training_errors = [training_error(i)
                           for i in xrange(numpy.int(numpy.ceil(n_train_batches)))]
        validation_errors = [validation_error(i)
                             for i in xrange(numpy.int(numpy.ceil(n_valid_batches)))]
        this_training_errors = numpy.sum(training_errors)
        this_validation_errors = numpy.sum(validation_errors)
        train_errors.append(this_training_errors)
        valid_errors.append(this_validation_errors)
        print 'epoch:{}, train error {}, valid error {}, learning_rate={}'.format(
            epoch_counter, this_training_errors, this_validation_errors,
            model_learning_rate.get_value(borrow=True))

    def monitor_cost():
        training_LDSs = [train_LDS(i)
                         for i in xrange(numpy.int(numpy.ceil(n_train_batches)))]
        validation_LDSs = [valid_LDS(i)
                           for i in xrange(numpy.int(numpy.ceil(n_valid_batches)))]
        train_LDSs.append(numpy.mean(training_LDSs))
        valid_LDSs.append(numpy.mean(validation_LDSs))
        train_LDSs_std.append(numpy.std(training_LDSs))
        valid_LDSs_std.append(numpy.std(validation_LDSs))
        print 'epoch:' + str(epoch_counter) \
            + ' train_LDS:' + str(train_LDSs[-1]) + ' std:' + str(train_LDSs_std[-1]) \
            + ' valid_LDS:' + str(valid_LDSs[-1]) + ' std:' + str(valid_LDSs_std[-1])
        train_losses = [numpy.mean(train_nll(i))
                        for i in xrange(numpy.int(numpy.ceil(n_train_batches)))]
        train_nlls.append(numpy.mean(train_losses))
        valid_losses = [numpy.mean(valid_nll(i))
                        for i in xrange(numpy.int(numpy.ceil(n_valid_batches)))]
        valid_nlls.append(numpy.mean(valid_losses))
        print 'epoch:' + str(epoch_counter) + ' train neg ll:' + str(train_nlls[-1]) \
            + ' valid neg ll:' + str(valid_nlls[-1])

    while epoch_counter < n_epochs:
        # monitor error and cost in the middle of training
        monitor_error()
        if monitoring_cost_during_training or epoch_counter == 0:
            monitor_cost()

        epoch_counter = epoch_counter + 1

        # parameter updates
        for it in xrange(n_it_batches):
            if semi_supervised:
                optimize(l_index, ul_index)
                ul_index = (ul_index + 1) if (ul_index + 1) < numpy.int(n_ul_train_batches) else 0
            else:
                optimize(l_index)
            l_index = (l_index + 1) if (l_index + 1) < numpy.int(n_train_batches) else 0

        # permute the train set
        rand_ind = numpy.asarray(rng.permutation(train_set_x.get_value().shape[0]),
                                 dtype='int32')
        permute_train_set(rand_ind)
        if semi_supervised:
            ul_rand_ind = numpy.asarray(
                rng.permutation(ul_train_set_x.get_value().shape[0]), dtype='int32')
            permute_ul_train_set(ul_rand_ind)

        decay_model_learning_rate()

    print 'finished training!'

    # finetune batch mean and var for batch normalization
    print 'finetuning batch mean and var for batch normalization...'
    if semi_supervised:
        finetune_batch_mean_and_var = theano.function(
            inputs=[index],
            outputs=classifier.finetuning_N,
            updates=classifier.m_v_updates_for_finetuning,
            givens={ul_x: ul_train_set_x[m_ul_batch_size * index:m_ul_batch_size * (index + 1)]})
        [finetune_batch_mean_and_var(i)
         for i in xrange(numpy.int(numpy.ceil(n_ul_train_batches)))]
    else:
        finetune_batch_mean_and_var = theano.function(
            inputs=[index],
            outputs=classifier.finetuning_N,
            updates=classifier.m_v_updates_for_finetuning,
            givens={x: train_set_x[m_batch_size * index:m_batch_size * (index + 1)]})
        [finetune_batch_mean_and_var(i)
         for i in xrange(numpy.int(numpy.ceil(n_train_batches)))]

    print 'final errors and costs:'
    monitor_error()
    monitor_cost()

    classifier.train_errors = train_errors
    classifier.valid_errors = valid_errors
    classifier.train_LDSs = train_LDSs
    classifier.valid_LDSs = valid_LDSs
    classifier.train_LDSs_std = train_LDSs_std
    classifier.valid_LDSs_std = valid_LDSs_std
    classifier.train_nlls = train_nlls
    classifier.valid_nlls = valid_nlls
    return classifier
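# ---------------------------------------------------------------------------
# Example call of train_mlp() (a minimal sketch). The hyperparameter values
# are illustrative assumptions, and the exact `activations` format is defined
# by the repo-local mlp/mlp_ss modules (one entry per weight layer is assumed
# here), so the call is left commented out:
#
# classifier = train_mlp(n_l=1000,
#                        layer_sizes=[784, 1200, 1200, 10],
#                        activations=[...],  # per the mlp module's format
#                        initial_model_learning_rate=0.002,
#                        learning_rate_decay=0.9,
#                        n_epochs=100,
#                        n_it_batches=500,
#                        cost_type='vat',
#                        epsilon=0.05,
#                        num_power_iter=1,
#                        semi_supervised=True)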