def trainFunction(self):
    trainPrediction = get_output(self.output_layer)
    trainLoss = categorical_crossentropy(trainPrediction, self.target_var).mean()
    params = get_all_params(self.output_layer, trainable=True)
    update = momentum(trainLoss, params, learning_rate=0.001, momentum=0.9)
    trainFunc = theano.function([self.input_var, self.target_var],
                                [trainLoss], updates=update)
    return trainFunc
def __init__(self, feature_size, lr, beta):
    self.beta = beta
    self.input_var = T.matrix('inputs', dtype=floatX)
    target_var = T.vector('targets', dtype=floatX)

    network = self._build_mlp(feature_size, 6)
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.squared_error(prediction, target_var)
    mean_loss = loss.mean()
    var_loss = loss.var()

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = momentum(mean_loss, params, learning_rate=lr, momentum=0.2)

    self.predict_f = theano.function([self.input_var], prediction)
    self.train_f = theano.function([self.input_var, target_var],
                                   [mean_loss, var_loss], updates=updates)
def get_updates(nnet, train_obj, trainable_params, solver=None):
    implemented_solvers = ("sgd", "momentum", "nesterov", "adagrad",
                           "rmsprop", "adadelta", "adam", "adamax")

    if solver not in implemented_solvers:
        nnet.sgd_solver = "adam"
    else:
        nnet.sgd_solver = solver

    if nnet.sgd_solver == "sgd":
        updates = l_updates.sgd(train_obj, trainable_params,
                                learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "momentum":
        updates = l_updates.momentum(train_obj, trainable_params,
                                     learning_rate=Cfg.learning_rate,
                                     momentum=Cfg.momentum)
    elif nnet.sgd_solver == "nesterov":
        updates = l_updates.nesterov_momentum(train_obj, trainable_params,
                                              learning_rate=Cfg.learning_rate,
                                              momentum=Cfg.momentum)
    elif nnet.sgd_solver == "adagrad":
        updates = l_updates.adagrad(train_obj, trainable_params,
                                    learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "rmsprop":
        updates = l_updates.rmsprop(train_obj, trainable_params,
                                    learning_rate=Cfg.learning_rate,
                                    rho=Cfg.rho)
    elif nnet.sgd_solver == "adadelta":
        updates = l_updates.adadelta(train_obj, trainable_params,
                                     learning_rate=Cfg.learning_rate,
                                     rho=Cfg.rho)
    elif nnet.sgd_solver == "adam":
        updates = l_updates.adam(train_obj, trainable_params,
                                 learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "adamax":
        updates = l_updates.adamax(train_obj, trainable_params,
                                   learning_rate=Cfg.learning_rate)

    return updates
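# The dispatcher above reads its hyperparameters from a module-level `Cfg`
# object and records the chosen solver on an `nnet` object. Below is a
# minimal, self-contained sketch of how it might be called; the `Cfg` class,
# the `Net` stand-in, and the toy objective are illustrative assumptions,
# not part of the original code base.
import numpy as np
import theano
import theano.tensor as T
import lasagne.updates as l_updates


class Cfg(object):
    # hypothetical stand-in for the original configuration module
    learning_rate = 0.01
    momentum = 0.9
    rho = 0.95


class Net(object):
    # hypothetical stand-in for the original `nnet` object
    sgd_solver = None


x = T.vector('x')
w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')
toy_loss = T.sum((T.dot(x, w) - 1.0) ** 2)

nnet = Net()
# unknown solver names fall back to "adam"; "momentum" is used as requested here
updates = get_updates(nnet, toy_loss, [w], solver="momentum")
print(nnet.sgd_solver)  # -> "momentum"
print(type(updates))    # OrderedDict mapping shared variables to update expressions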
def __init__(self, *args, **kwargs):
    super(TrainerMixin, self).__init__(*args, **kwargs)
    input_var = tensor.tensor4('inputs')
    target_var = tensor.ivector('targets')
    loss, _ = loss_acc(self.model, input_var, target_var,
                       deterministic=False)

    layers = get_all_layers(self.model)
    decay = regularize_layer_params(layers, l2) * 0.0001
    loss = loss + decay

    params = get_all_params(self.model, trainable=True)
    updates = momentum(loss, params, momentum=0.9,
                       learning_rate=self.learning_rate)
    self.set_training(input_var, target_var, loss, updates)
def trainFunction(self):
    startTime = time.time()
    trainPrediction = get_output(self.sectorNet)
    trainLoss = categorical_crossentropy(trainPrediction, self.targetVar).mean()
    trainACC = T.mean(T.eq(T.argmax(trainPrediction, axis=1), self.targetVar),
                      dtype=theano.config.floatX)
    params = get_all_params(self.sectorNet, trainable=True)
    update = momentum(trainLoss, params, learning_rate=0.001, momentum=0.9)
    trainFunc = theano.function([self.inputVar, self.targetVar],
                                [trainLoss, trainACC], updates=update)
    self.logger.info('Compiled the train function in {:.2f}s.'.format(
        time.time() - startTime))
    return trainFunc
def Train(options, init_params, build_model, DataHandler):
    load = options['load']
    loadHis = options['loadHis']
    saveto = options['saveto']
    loadfrom = options['loadfrom']
    dataset = options['dataset']
    last_n = options['last_n']
    fsize = options['videosize']

    print ">>>init params & build graph"
    tparams = init_params(options)
    cost, preds, inner_state, inps, use_noise = build_model(options, tparams)
    print "build done"

    print ">>>compile cost&updates function"
    start = time.time()
    f = theano.function(inps, [cost, preds], allow_input_downcast=True,
                        on_unused_input='ignore')
    print "cost function ready"

    if options['finetune']:
        updates = momentum(cost, itemlist(tparams), options['lrate'],
                           momentum=options['momentum'])
    else:
        updates = adam(cost, itemlist(tparams), learning_rate=options['lrate'],
                       beta1=0.9, beta2=0.999, epsilon=1e-08)
    print len(itemlist(tparams))
    print "updates ready", len(updates)

    f_update = theano.function(inps, [cost, preds], updates=updates,
                               allow_input_downcast=True,
                               on_unused_input='ignore')
    print "update function ready"
    print "compile finish, use %.1fmin" % ((time.time() - start) / 60)

    print '>>>Optimization'
    # ready dataset
    dh_train = DataHandler(options['dataset'], datatype=0, fps=options['fps'])
    dh_train.SetMode('source')
    dh_valid = DataHandler(options['dataset'], datatype=1, fps=options['fps'])
    dh_valid.SetMode('source')

    train_log = np.empty((0, 4), dtype='float32')
    min_valid_cost = 1e8
    max_valid_acc = 0
    if loadHis and os.path.exists(loadfrom):
        print "load log history from", loadfrom
        train_log = np.load(loadfrom)['train_log']
        min_valid_cost = train_log[:, 2].min()
        max_valid_acc = train_log[:, 3].max()

    train_num = dh_train.batch_num  # should be set to dh_train.batch_num
    for epochidx in xrange(options['max_epochs']):
        use_noise.set_value(1.0)
        dh_train.Reset()
        print 'Epoch ', epochidx
        start = time.time()
        for vidx in xrange(train_num):
            x, mask, y = dh_train.GetSingleVideoFromSource(size=fsize, scale=1)
            x = x.reshape([x.shape[0], x.shape[1], fsize, fsize, 3])
            x = x.transpose([0, 1, 4, 2, 3])
            x = x.reshape([x.shape[0], x.shape[1], -1])
            cost, preds = f_update(x, mask, y)
            acc = ((y.mean(0)).argmax(1) == preds).mean()
            print cost, acc
            if ((vidx + 1) % 100 == 0):
                print "%d/%d, use %.1fmin" % (vidx + 1, dh_train.batch_num,
                                              (time.time() - start) / 60.0)
                start = time.time()

        use_noise.set_value(0.0)
        # compute train error
        dh_train.Reset()
        print ">>train cost"
        tcost, tacc = Predict(options, f, dh_train, verbose=True, train_num=200)
        print "cost: %.3f, acc: %.3f" % (tcost, tacc)
        # compute valid error
        dh_valid.Reset()
        print ">>valid cost"
        vcost, vacc = Predict(options, f, dh_valid, verbose=True)
        print "cost: %.3f, acc: %.3f" % (vcost, vacc)

        print ">>save point:", options['saveto']
        train_log = np.append(train_log,
                              np.array([tcost, tacc, vcost, vacc])[None, ...],
                              axis=0)
        params = unzip(tparams)
        np.savez(saveto, train_log=train_log, options=options, **params)
        if (vcost < min_valid_cost):
            min_valid_cost = vcost
            max_valid_acc = max(max_valid_acc, vacc)
            print ">>save best:", options['bestsaveto']
            np.savez(options['bestsaveto'], train_log=train_log,
                     options=options, **params)
        elif (vacc > max_valid_acc):
            max_valid_acc = vacc
            min_valid_cost = min(min_valid_cost, vcost)
            print ">>save best:", options['bestsaveto']
            np.savez(options['bestsaveto'], train_log=train_log,
                     options=options, **params)
def update(all_grads, all_params, learning_rate):
    """Compute updates from gradients."""
    # `m` is the momentum coefficient, expected to be defined in the
    # enclosing scope (e.g. a module-level constant).
    return momentum(all_grads, all_params, learning_rate, momentum=m)
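# The helper above passes a list of precomputed gradients rather than a loss
# expression; `lasagne.updates.momentum` accepts either, since its first
# argument is `loss_or_grads`. A minimal, self-contained sketch of that usage
# follows; the toy variables and the module-level momentum coefficient `m`
# are illustrative assumptions, not taken from the original project.
import numpy as np
import theano
import theano.tensor as T
from lasagne.updates import momentum

m = 0.9  # assumed module-level momentum coefficient used by `update` above

x = T.vector('x')
w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')
loss = T.sum((T.dot(x, w) - 1.0) ** 2)

# either the scalar loss or an explicit list of gradients may be passed
all_grads = T.grad(loss, wrt=[w])
grad_updates = momentum(all_grads, [w], learning_rate=0.01, momentum=m)

train_fn = theano.function([x], loss, updates=grad_updates)
print(train_fn(np.ones(3, dtype=theano.config.floatX)))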
def main(cf):
    ########
    # DATA #
    ########
    print 'Creating data generators...'
    train_iterator, valid_iterator, test_iterator = create_data_generators(cf)

    ##############################
    # COST, GRADIENT AND UPDATES #
    ##############################
    print 'Building model...'
    cost, accuracy = cf.model.compute_cost(deterministic=False)
    cost_val, accuracy_val = cf.model.compute_cost(deterministic=True)

    params = get_all_params(cf.model.net, trainable=True)
    if cf.algo == 'adam':
        updates = adam(cost, params, cf.learning_rate)
    elif cf.algo == 'sgd':
        updates = sgd(cost, params, cf.learning_rate)
    elif cf.algo == 'momentum':
        updates = momentum(cost, params, cf.learning_rate)
    else:
        raise ValueError('Specified algo does not exist')

    ##############
    # MONITORING #
    ##############
    print 'Creating extensions and compiling functions...',
    train_monitor = TrainMonitor(cf.train_freq_print, cf.model.vars,
                                 [cost, accuracy], updates)

    monitoring_vars = [cost_val, accuracy_val]
    valid_monitor = ValMonitor('Validation', cf.valid_freq_print,
                               cf.model.vars, monitoring_vars, valid_iterator)
    test_monitor = ValMonitor('Test', cf.valid_freq_print, cf.model.vars,
                              monitoring_vars, valid_iterator)

    train_saver = VariableSaver(train_monitor, cf.dump_every_batches,
                                cf.dump_path, 'train')
    valid_saver = VariableSaver(valid_monitor, cf.dump_every_batches,
                                cf.dump_path, 'valid')
    test_saver = VariableSaver(test_monitor, None, cf.dump_path, 'test')

    # Ending conditions
    end_conditions = []
    if hasattr(cf, 'max_iter'):
        end_conditions.append(MaxIteration(cf.max_iter))
    if hasattr(cf, 'max_time'):
        end_conditions.append(MaxTime(cf.max_iter))

    extensions = [valid_monitor, test_monitor,
                  train_saver, valid_saver, test_saver]

    train_m = Trainer(train_monitor, train_iterator, extensions, end_conditions)

    ############
    # TRAINING #
    ############
    train_m.train()
def _prepare(self, X, y, X_valid=None, y_valid=None,
             sample_weight=None, whole_dataset_in_device=True):
    self._stats = []
    self._class_label_encoder = LabelEncoder()
    if self.is_classification is True:
        self._class_label_encoder.fit(y)
        self.classes_ = self._class_label_encoder.classes_
        y = self._class_label_encoder.transform(y).astype(y.dtype)
        self.y_train_transformed = y
        if y_valid is not None:
            y_valid_transformed = self._class_label_encoder.transform(
                y_valid).astype(y_valid.dtype)

    self._l_x_in = layers.InputLayer(shape=(None, X.shape[1]))
    batch_index, X_batch, y_batch, batch_slice = get_theano_batch_variables(
        self.batch_size, y_softmax=self.is_classification)

    if sample_weight is not None:
        t_sample_weight = T.vector('sample_weight')
        sample_weight = sample_weight.astype(theano.config.floatX)
    else:
        t_sample_weight = T.scalar('sample_weight')

    if self.is_classification is True:
        y_dim = len(set(y.flatten().tolist()))
    else:
        y_dim = y.shape[1]

    self._prediction_layer = self._build_model(y_dim)
    self._layers = layers.get_all_layers(self._prediction_layer)
    self._build_prediction_functions(X_batch, self._prediction_layer)

    if self.input_noise_function is None:
        output = layers.get_output(self._prediction_layer, X_batch)
    else:
        X_batch_noisy = self.input_noise_function(X_batch)
        output = layers.get_output(self._prediction_layer, X_batch_noisy)

    if self.is_classification:
        loss = -T.mean(t_sample_weight *
                       T.log(output)[T.arange(y_batch.shape[0]), y_batch])
    else:
        loss = T.mean(t_sample_weight *
                      T.sum((output - y_batch) ** 2, axis=1))
    loss_unreg = loss

    all_params = layers.get_all_params(self._prediction_layer)
    if self._output_softener_coefs is not None:
        all_params.append(self._output_softener_coefs)

    # symbolic regularizable parameters (weights) used for the L1/L2 penalties
    W_params = layers.get_all_params(self._prediction_layer, regularizable=True)

    # regularization
    if self.L1_factor is not None:
        for L1_factor_layer, W in zip(self.L1_factor, W_params):
            loss = loss + L1_factor_layer * T.sum(abs(W))

    if self.L2_factor is not None:
        for L2_factor_layer, W in zip(self.L2_factor, W_params):
            loss = loss + L2_factor_layer * T.sum(W ** 2)

    if self.optimization_method == 'nesterov_momentum':
        gradient_updates = updates.nesterov_momentum(
            loss, all_params,
            learning_rate=self.learning_rate, momentum=self.momentum)
    elif self.optimization_method == 'adadelta':
        # momentum is not needed here
        gradient_updates = updates.adadelta(
            loss, all_params, learning_rate=self.learning_rate)
    elif self.optimization_method == 'adam':
        gradient_updates = updates.adam(
            loss, all_params, learning_rate=self.learning_rate)
    elif self.optimization_method == 'momentum':
        gradient_updates = updates.momentum(
            loss, all_params,
            learning_rate=self.learning_rate, momentum=self.momentum)
    elif self.optimization_method == 'adagrad':
        gradient_updates = updates.adagrad(
            loss, all_params, learning_rate=self.learning_rate)
    elif self.optimization_method == 'rmsprop':
        gradient_updates = updates.rmsprop(
            loss, all_params, learning_rate=self.learning_rate)
    elif self.optimization_method == 'sgd':
        gradient_updates = updates.sgd(
            loss, all_params, learning_rate=self.learning_rate)
    else:
        raise Exception("wrong optimization method")

    nb_batches = X.shape[0] // self.batch_size
    if (X.shape[0] % self.batch_size) != 0:
        nb_batches += 1

    X = X.astype(theano.config.floatX)
    if self.is_classification == True:
        y = y.astype(np.int32)
    else:
        y = y.astype(theano.config.floatX)

    if whole_dataset_in_device == True:
        X_shared = theano.shared(X, borrow=True)
        y_shared = theano.shared(y, borrow=True)
        givens = {
            X_batch: X_shared[batch_slice],
            y_batch: y_shared[batch_slice],
        }
        if sample_weight is not None:
            sample_weight_shared = theano.shared(sample_weight, borrow=True)
            givens[t_sample_weight] = sample_weight_shared[batch_slice]
        else:
            givens[t_sample_weight] = T.as_tensor_variable(
                np.array(1., dtype=theano.config.floatX))
        iter_update_batch = theano.function(
            [batch_index],
            loss,
            updates=gradient_updates,
            givens=givens,
        )
    else:
        if sample_weight is None:
            iter_update_gradients = theano.function(
                [X_batch, y_batch],
                loss,
                updates=gradient_updates,
                givens={t_sample_weight: T.as_tensor_variable(
                    np.array(1., dtype=theano.config.floatX))},
            )

            def iter_update_batch(batch_index):
                sl = slice(batch_index * self.batch_size,
                           (batch_index + 1) * self.batch_size)
                return iter_update_gradients(X[sl], y[sl])
        else:
            iter_update_gradients = theano.function(
                [X_batch, y_batch, t_sample_weight],
                loss,
                updates=gradient_updates,
            )

            def iter_update_batch(batch_index):
                sl = slice(batch_index * self.batch_size,
                           (batch_index + 1) * self.batch_size)
                return iter_update_gradients(X[sl], y[sl], sample_weight[sl])

    self._iter_update_batch = iter_update_batch
    self._get_loss = theano.function(
        [X_batch, y_batch, t_sample_weight], loss_unreg,
        allow_input_downcast=True)

    def iter_update(epoch):
        losses = []
        for i in xrange(nb_batches):
            losses.append(self._iter_update_batch(i))
            # max norm constraint on the weights
            if self.max_norm is not None:
                for max_norm_layer, layer in zip(self.max_norm, self._layers):
                    layer.W = updates.norm_constraint(layer.W, self.max_norm)
        losses = np.array(losses)

        d = OrderedDict()
        d["epoch"] = epoch
        d["loss_train"] = self._get_loss(
            self.X_train, self.y_train_transformed, 1.)
        d["accuracy_train"] = (
            self.predict(self.X_train) == self.y_train).mean()
        if X_valid is not None and y_valid is not None:
            d["loss_valid"] = self._get_loss(X_valid, y_valid_transformed, 1.)
            if self.is_classification == True:
                d["accuracy_valid"] = (
                    self.predict(X_valid) == y_valid).mean()
        if self.verbose > 0:
            if (epoch % self.report_each) == 0:
                print(tabulate([d], headers="keys"))
        self._stats.append(d)
        return d

    def quitter(update_status):
        cur_epoch = len(self._stats) - 1
        if self.patience_nb_epochs > 0:
            # patience heuristic (for early stopping)
            cur_patience_stat = update_status[self.patience_stat]
            if self.cur_best_patience_stat is None:
                self.cur_best_patience_stat = cur_patience_stat
                first_time = True
            else:
                first_time = False

            thresh = self.patience_progression_rate_threshold
            if cur_patience_stat < self.cur_best_patience_stat * thresh or first_time:
                if self.verbose >= 2:
                    fmt = ("--Early stopping-- good, we have a new best value: "
                           "{0}={1}, last best: epoch {2}, value={3}")
                    print(fmt.format(self.patience_stat, cur_patience_stat,
                                     self.cur_best_epoch,
                                     self.cur_best_patience_stat))
                self.cur_best_epoch = cur_epoch
                self.cur_best_patience_stat = cur_patience_stat
                if hasattr(self, "set_state") and hasattr(self, "get_state"):
                    self.cur_best_model = self.get_state()
                else:
                    self.cur_best_model = pickle.dumps(
                        self.__dict__, protocol=pickle.HIGHEST_PROTOCOL)
            if (cur_epoch - self.cur_best_epoch) >= self.patience_nb_epochs:
                finish = True
                if hasattr(self, "set_state") and hasattr(self, "get_state"):
                    self.set_state(self.cur_best_model)
                else:
                    self.__dict__.update(pickle.loads(self.cur_best_model))
                self._stats = self._stats[0:self.cur_best_epoch + 1]
                if self.verbose >= 2:
                    print("out of patience...take the model at epoch {0} and quit".format(
                        self.cur_best_epoch + 1))
            else:
                finish = False
            return finish
        else:
            return False

    def monitor(update_status):
        pass

    def observer(monitor_output):
        pass

    return (iter_update, quitter, monitor, observer)
def Train(options, init_params, build_model, DataHandler):
    load = options['load']
    loadHis = options['loadHis']
    saveto = options['saveto']
    loadfrom = options['loadfrom']
    dataset = options['dataset']
    last_n = options['last_n']

    print ">>>init params & build graph"
    tparams = init_params(options)
    cost, preds, inner_state, inps, use_noise = build_model(options, tparams)
    print "build done"

    print ">>>compile cost&updates function"
    start = time.time()
    f = theano.function(inps, [cost, preds], allow_input_downcast=True,
                        on_unused_input='ignore')
    constraint_params = ParamsFilter(tparams, prefix='recog/saliencyFgbg_w')

    if options['finetune']:
        updates = momentum(cost, itemlist(tparams), options['lrate'],
                           momentum=options['momentum'])
    else:
        updates = adam(cost, itemlist(tparams), learning_rate=options['lrate'],
                       beta1=0.9, beta2=0.999, epsilon=1e-08)
    f_update = theano.function(inps, [cost, preds], updates=updates,
                               allow_input_downcast=True,
                               on_unused_input='ignore')
    print "compile finish, use %.1fmin" % ((time.time() - start) / 60)

    print '>>>Optimization'
    # ready dataset
    dh_train = DataHandler(options['dataset'], datatype=0, fps=options['fps'])
    dh_train.SetMode('single')
    dh_valid = DataHandler(options['dataset'], datatype=1, fps=options['fps'])
    dh_valid.SetMode('single')

    train_log = np.empty((0, 4), dtype='float32')
    min_valid_cost = 1e8
    max_valid_acc = 0
    if load and loadHis and os.path.exists(loadfrom):
        print "load log history from", loadfrom
        train_log = np.load(loadfrom)['train_log']
        min_valid_cost = train_log[:, 2].min()
        max_valid_acc = train_log[:, 3].max()

    train_num = dh_train.batch_num  # should be set to dh_train.batch_num
    for epochidx in xrange(options['max_epochs']):
        use_noise.set_value(1.0)
        dh_train.Reset()
        print 'Epoch ', epochidx
        start = time.time()
        for vidx in xrange(train_num):
            x, mask, y = dh_train.GetSingleVideoFeature()
            # switch the last two feature dims (featureMaps x locations), e.g. 1024*49
            x = x.reshape([x.shape[0], x.shape[1],
                           options['featureMaps'], options['locations']])
            x = x.transpose([0, 1, 3, 2])
            x = x.reshape([x.shape[0], x.shape[1], -1])
            cost, preds = f_update(x, mask, y)
            if math.isnan(cost):
                print "cost is nan, exit"
                exit(-1)
            ParamsConstraint(constraint_params)  # apply the constraint
            if ((vidx + 1) % 100 == 0):
                print "%d/%d, use %.1fmin" % (vidx + 1, dh_train.batch_num,
                                              (time.time() - start) / 60.0)
                start = time.time()

        use_noise.set_value(0.0)
        # compute train error
        dh_train.Reset()
        print ">>train cost"
        tcost, tacc = Predict(options, f, dh_train, verbose=True, train_num=200)
        print "cost: %.3f, acc: %.3f" % (tcost, tacc)
        # compute valid error
        dh_valid.Reset()
        print ">>valid cost"
        vcost, vacc = Predict(options, f, dh_valid, verbose=True)
        print "cost: %.3f, acc: %.3f" % (vcost, vacc)

        print ">>save point:", options['saveto']
        train_log = np.append(train_log,
                              np.array([tcost, tacc, vcost, vacc])[None, ...],
                              axis=0)
        params = unzip(tparams)
        np.savez(saveto, train_log=train_log, options=options, **params)
        if (vcost < min_valid_cost):
            min_valid_cost = vcost
            max_valid_acc = max(max_valid_acc, vacc)
            print ">>save best:", options['bestsaveto']
            np.savez(options['bestsaveto'], train_log=train_log,
                     options=options, **params)
        elif (vacc > max_valid_acc):
            max_valid_acc = vacc
            min_valid_cost = min(min_valid_cost, vcost)
            print ">>save best:", options['bestsaveto']
            np.savez(options['bestsaveto'], train_log=train_log,
                     options=options, **params)
def model_class(ds, paths, param_arch, param_cost, param_updates, param_train):
    # create a log file containing the architecture configuration
    formatter = logging.Formatter('%(message)s')
    logger = logging.getLogger('log_config')
    if 'start_from_epoch' in param_train:
        name_tmp = 'config_from_epoch=%04d.log' % (param_train['start_from_epoch'])
    else:
        name_tmp = 'config.log'
    path_tmp = os.path.join(paths['exp'], name_tmp)
    if not os.path.isfile(path_tmp):
        # to append at the end of the file use: mode='a'
        handler = logging.FileHandler(path_tmp, mode='w')
    else:
        raise Exception('[e] the log file ', name_tmp, ' already exists!')
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    # input dimensions
    dim_desc = ds.descs_train[0].shape[1]
    dim_labels = ds.labels_train[0].shape[0]
    print(dim_labels)

    # architecture definition:
    print("[i] architecture definition... ", end=' ')
    tic = time.time()
    if param_arch['type'] == 0:
        desc, patch_op, cla, net, logger = arch_class_00(
            dim_desc, dim_labels, param_arch, logger)
    elif param_arch['type'] == 1:
        desc, patch_op, cla, net, logger = arch_class_01(
            dim_desc, dim_labels, param_arch, logger)
    elif param_arch['type'] == 2:
        desc, patch_op, cla, net, logger = arch_class_02(
            dim_desc, dim_labels, param_arch, logger)
    else:
        raise Exception('[e] architecture not supported!')
    print("%02.2fs" % (time.time() - tic))

    # cost function definition:
    print("[i] cost function definition... ", end=' ')
    tic = time.time()
    pred = LL.get_output(cla, deterministic=True)  # in case we use dropout
    feat = LL.get_output(net)
    target = T.ivector('target')

    # data term
    if param_cost['cost_func'] == 'cross_entropy':
        if param_arch['non_linearity'] == 'softmax':
            # in the original code *.mean() was used instead of T.mean(*)
            cost_dataterm = T.mean(LO.categorical_crossentropy(pred, target))
        elif param_arch['non_linearity'] == 'log_softmax':
            cost_dataterm = T.mean(
                categorical_crossentropy_logdomain(pred, target))
    elif param_cost['cost_func'] == 'cross_entropy_stable':
        if param_arch['non_linearity'] == 'softmax':
            cost_dataterm = T.mean(
                categorical_crossentropy_stable(pred, target))
        else:
            raise Exception(
                '[e] the chosen cost function is not implemented for the chosen non-linearity!')
    else:
        raise Exception('[e] the chosen cost function is not supported!')

    # classification accuracy
    acc = LO.categorical_accuracy(pred, target).mean()

    # regularization
    cost_reg = param_cost['mu'] * LR.regularize_network_params(cla, LR.l2)

    # cost function
    cost = cost_dataterm + cost_reg

    # get params
    params = LL.get_all_params(cla)

    # gradient definition
    grad = T.grad(cost, params)
    grad_norm = T.nlinalg.norm(T.concatenate([g.flatten() for g in grad]), 2)
    print("%02.2fs" % (time.time() - tic))

    # updates definition:
    print("[i] gradient updates definition... ", end=' ')
    tic = time.time()
    if param_updates['method'] == 'momentum':
        if param_updates.get('learning_rate') is not None:
            learning_rate = param_updates['learning_rate']  # default: 1.0
        else:
            raise Exception('[e] missing learning_rate parameter!')
        if param_updates.get('momentum') is not None:
            momentum = param_updates['momentum']  # default: 0.9
        else:
            raise Exception('[e] missing momentum parameter!')
        updates = LU.momentum(grad, params, learning_rate, momentum)
    elif param_updates['method'] == 'adagrad':
        if param_updates.get('learning_rate') is not None:
            learning_rate = param_updates['learning_rate']  # default: 1.0
        else:
            raise Exception('[e] missing learning_rate parameter!')
        updates = LU.adagrad(grad, params, learning_rate)
    elif param_updates['method'] == 'adadelta':
        if param_updates.get('learning_rate') is not None:
            learning_rate = param_updates['learning_rate']  # default: 1.0
        else:
            raise Exception('[e] missing learning_rate parameter!')
        updates = LU.adadelta(grad, params, learning_rate)
    elif param_updates['method'] == 'adam':
        if param_updates.get('learning_rate') is not None:
            learning_rate = param_updates['learning_rate']  # default: 1e-03
        else:
            raise Exception('[e] missing learning_rate parameter!')
        if param_updates.get('beta1') is not None:
            beta1 = param_updates['beta1']  # default: 0.9
        else:
            raise Exception('[e] missing beta1 parameter!')
        if param_updates.get('beta2') is not None:
            beta2 = param_updates['beta2']  # default: 0.999
        else:
            raise Exception('[e] missing beta2 parameter!')
        if param_updates.get('epsilon') is not None:
            epsilon = param_updates['epsilon']  # default: 1e-08
        else:
            raise Exception('[e] missing epsilon parameter!')
        updates = LU.adam(grad, params, learning_rate, beta1, beta2, epsilon)
    else:
        raise Exception('[e] updates method not supported!')
    print("%02.2fs" % (time.time() - tic))

    # train / test functions:
    funcs = dict()
    print("[i] compiling function 'train'... ", end=' ')
    tic = time.time()
    funcs['train'] = theano.function(
        [desc.input_var, patch_op.input_var, target],
        [cost, cost_dataterm, cost_reg, grad_norm, acc],
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='warn')
    print("%02.2fs" % (time.time() - tic))

    print("[i] compiling function 'fwd'... ", end=' ')
    tic = time.time()
    funcs['fwd'] = theano.function(
        [desc.input_var, patch_op.input_var, target],
        [cost, grad_norm, acc],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print("%02.2fs" % (time.time() - tic))

    print("[i] compiling function 'pred'... ", end=' ')
    tic = time.time()
    funcs['pred'] = theano.function(
        [desc.input_var, patch_op.input_var, target],
        [pred],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print("%02.2fs" % (time.time() - tic))

    print("[i] compiling function 'feat'... ", end=' ')
    tic = time.time()
    funcs['feat'] = theano.function(
        [desc.input_var, patch_op.input_var, target],
        [feat],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print("%02.2fs" % (time.time() - tic))

    # save cost function parameters to a config file
    logger.info('\nCost function parameters:')
    logger.info('  cost function = %s' % param_cost['cost_func'])
    logger.info('  mu            = %e' % param_cost['mu'])

    # save updates parameters to a config file
    logger.info('\nUpdates parameters:')
    logger.info('  method        = %s' % param_updates['method'])
    logger.info('  learning rate = %e' % param_updates['learning_rate'])
    if param_updates['method'] == 'momentum':
        logger.info('  momentum      = %e' % param_updates['momentum'])
    if param_updates['method'] == 'adam':
        logger.info('  beta1         = %e' % param_updates['beta1'])
        logger.info('  beta2         = %e' % param_updates['beta2'])
        logger.info('  epsilon       = %e' % param_updates['epsilon'])

    # save training parameters to a config file
    logger.info('\nTraining parameters:')
    logger.info('  epoch size = %d' % ds.epoch_size)

    return funcs, cla, updates