def build_loss(self):
    """Compile the training, validation and prediction functions.

    The loss is the mean categorical cross-entropy of the clipped output of
    ``self.net['out']`` plus ``self.lambda2`` times the L2 penalty of the
    network parameters.  Training uses the default (non-deterministic)
    forward pass with Nesterov momentum; validation and prediction use the
    deterministic pass.

    Returns:
        ``(train_fn, val_fn, pred_fn)``.  ``train_fn(inputs, targets)``
        returns the training loss and applies the updates,
        ``val_fn(inputs, targets)`` returns ``[loss, accuracy]`` and
        ``pred_fn(inputs)`` returns the unclipped deterministic output.
    """
    out_layer = self.net['out']

    def penalized_loss(probabilities):
        # Mean cross-entropy plus the weighted L2 penalty of the parameters.
        cross_entropy = lasagne.objectives.categorical_crossentropy(
            probabilities, self.target_var)
        l2_penalty = regularization.regularize_network_params(
            out_layer, regularization.l2)
        return cross_entropy.mean() + self.lambda2 * l2_penalty

    # Training graph: outputs are clipped away from exactly 0 and 1 before
    # they reach the cross-entropy.
    train_probs = T.clip(lasagne.layers.get_output(out_layer), 1e-9, 1 - 1e-9)
    train_loss = penalized_loss(train_probs)
    trainable = lasagne.layers.get_all_params(out_layer, trainable=True)
    momentum_updates = lasagne.updates.nesterov_momentum(
        train_loss, trainable,
        learning_rate=self.learning_rate, momentum=0.9)

    # Evaluation graph: same loss on the deterministic forward pass.
    eval_probs = T.clip(
        lasagne.layers.get_output(out_layer, deterministic=True),
        1e-9, 1 - 1e-9)
    eval_loss = penalized_loss(eval_probs)
    hits = T.eq(T.argmax(eval_probs, axis=1), self.target_var)
    eval_acc = T.mean(hits, dtype=theano.config.floatX)

    train_fn = theano.function(
        [self.input_var, self.target_var], train_loss,
        updates=momentum_updates)
    pred_fn = theano.function(
        [self.input_var],
        lasagne.layers.get_output(out_layer, deterministic=True))
    val_fn = theano.function(
        [self.input_var, self.target_var], [eval_loss, eval_acc])
    return train_fn, val_fn, pred_fn
def my_loss(model, predictions, targets, regularization, params):
    """Return the two-sided log-ratio loss on the border-cropped signals.

    ``predictions[0][0]`` and ``targets[0]`` are cropped by
    ``params['left_border']`` at the start and ``params['right_border']`` at
    the end.  The loss is ``|log(sum(t * p) / sum(t))|`` plus the same term
    computed on ``1 - t`` and ``1 - p``.

    Args:
        model: network whose parameters are regularized.
        predictions: predicted values; only element ``[0][0]`` is used.
        targets: target values; only element ``[0]`` is used.
        regularization: when truthy, the L1 penalty of ``model`` scaled by
            ``1e-4`` is added to the loss.
        params: mapping providing ``'left_border'`` and ``'right_border'``.

    Returns:
        The symbolic loss expression.
    """
    left = params['left_border']
    right = params['right_border']
    # A stop of ``-0`` equals ``0`` and would select nothing, so a zero right
    # border is mapped to an open-ended slice instead.
    stop = -right if right != 0 else None
    predictions = predictions[0][0][left:stop]
    targets = targets[0][left:stop]

    negatives = 1 - targets
    positive_term = tensor.abs_(
        tensor.log((targets * predictions).sum() / targets.sum()))
    negative_term = tensor.abs_(
        tensor.log((negatives * (1 - predictions)).sum() / negatives.sum()))
    loss = positive_term + negative_term

    if not regularization:
        return loss
    # Only the L1 penalty takes part in the loss; the L2 penalty that used to
    # be computed here was never added to it.
    return loss + regularize_network_params(model, l1) * 1e-4
def build_loss(self, env, agent, replay_seq_len):
    """Build the regularized n-step Q-learning loss on replayed sessions.

    Args:
        env: experience-replay environment; provides ``batch_size``,
            ``rewards``, ``actions`` and ``is_alive``.
        agent: agent whose ``get_sessions`` yields the Q-value sequence.
        replay_seq_len: session length used for the replay.

    Returns:
        Symbolic scalar: summed element-wise objective divided by the sum of
        ``env.is_alive``, plus ``1e-5`` times the L2 penalty of the layers
        in ``agent.state_variables``.
    """
    # Replay stored sessions through the agent; only the Q-values are kept.
    session_outputs = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )
    _, _, _, _, qvalues_seq = session_outputs

    elementwise_loss = qlearning_n_step.get_elementwise_objective(
        qvalues_seq,
        env.actions[0],
        env.rewards,
        env.is_alive,
        gamma_or_gammas=self.gamma,
        n_steps=self.n_steps,
    )

    # Normalize by the sum of the alive mask rather than the tensor size.
    mean_loss = elementwise_loss.sum() / env.is_alive.sum()

    # Weight decay over the layers listed in the agent's state variables.
    weight_penalty = regularize_network_params(
        agent.state_variables.keys(), l2) * 10 ** -5
    return mean_loss + weight_penalty
def make_training_functions(network, encode_layer, input_var, aug_var,
                            target_var, stack_params, weight_decay):
    """Compile validation, full-training and stack-training functions.

    The loss is the mean squared error between the deterministic network
    output and ``target_var`` plus ``weight_decay`` times the L2 penalty of
    the regularizable parameters.

    Args:
        network: output layer of the network.
        encode_layer: layer whose deterministic output is returned by the
            validation function.
        input_var, aug_var, target_var: inputs of every compiled function.
        stack_params: parameters updated by the stack-training function.
        weight_decay: weight of the L2 penalty.

    Returns:
        ``(val_fn, train_fn, stack_train_fn)``.  ``val_fn`` returns
        ``[loss, encode, output]``; the two training functions return the
        loss and update all trainable parameters or ``stack_params``
        respectively.
    """
    function_inputs = [input_var, aug_var, target_var]

    output = lasagne.layers.get_output(network, deterministic=True)
    reconstruction_error = lasagne.objectives.squared_error(
        output, target_var).mean()
    l2_penalty = regularization.regularize_network_params(
        layer=network,
        penalty=regularization.l2,
        tags={'regularizable': True})
    loss = reconstruction_error + weight_decay * l2_penalty

    # Two update rules share the same loss and hyper-parameters and differ
    # only in which parameters they touch.
    all_params = layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, all_params, learning_rate=0.0001, momentum=0.95)
    stack_updates = lasagne.updates.nesterov_momentum(
        loss, stack_params, learning_rate=0.0001, momentum=0.95)

    encode = lasagne.layers.get_output(encode_layer, deterministic=True)

    val_fn = theano.function(function_inputs, [loss, encode, output])
    train_fn = theano.function(function_inputs, loss, updates=updates)
    stack_train_fn = theano.function(
        function_inputs, loss, updates=stack_updates)
    return val_fn, train_fn, stack_train_fn
def build_loss(self, env):
    """Build the regularized n-step Q-learning loss for replayed sessions.

    Args:
        env: experience-replay environment; provides ``rewards``,
            ``actions`` and ``is_alive``.

    Returns:
        Symbolic scalar: summed element-wise objective divided by the sum of
        ``env.is_alive``, plus ``1e-4`` times the L2 penalty of
        ``self.resolver``.
    """
    session_outputs = self.agent.get_sessions(
        env,
        session_length=self.replay_seq_len,
        batch_size=self.replay_batch_size,
        optimize_experience_replay=True,
    )
    # Only the last of the five returned sequences (the Q-values) is used.
    _, _, _, _, qvalues_seq = session_outputs

    elementwise_loss = qlearning_n_step.get_elementwise_objective(
        qvalues_seq,
        env.actions[0],
        env.rewards,
        env.is_alive,
        n_steps=self.n_steps,
        gamma_or_gammas=self.gamma,
    )

    data_term = elementwise_loss.sum() / env.is_alive.sum()
    weight_penalty = regularize_network_params(self.resolver, l2) * 10 ** -4
    return data_term + weight_penalty
def __init__(self, istrained, name=None, args=None):
    """Build the model and either load trained weights or set up training.

    Args:
        istrained: when truthy, parameters are loaded from
            ``dataset_path + 'plain_cnn.pkl'`` and only ``self.predfn`` is
            compiled; otherwise the training and evaluation functions are
            compiled.
        name: not used by this constructor.
        args: ``(lr, C, momentum)``; unpacked only when ``istrained`` is
            falsy.
    """
    self.istrained = istrained
    self.X = T.tensor4('X')
    self.y = T.ivector('y')
    self.outprob = build_model(self.X)

    if self.istrained:
        # Close the file deterministically instead of leaking the handle.
        # NOTE(review): unpickling runs arbitrary code; only load trusted
        # parameter files.
        with open(dataset_path + 'plain_cnn.pkl', 'r') as param_file:
            params = cPickle.load(param_file)
        layers.set_all_param_values(self.outprob, params)
        self.yFullProb = layers.get_output(self.outprob, deterministic=True)
        self.predfn = makeFunc([self.X, ], [self.yFullProb, ], None)
    else:
        self.lr, self.C, self.momentum = args
        self.params = layers.get_all_params(self.outprob, trainable=True)
        # L2 penalty divided by the parameter count of the network.
        reg = regularization.regularize_network_params(
            self.outprob, regularization.l2)
        reg /= layers.helper.count_params(self.outprob)

        # Training set: default (non-deterministic) forward pass.
        self.yDropProb = layers.get_output(self.outprob)
        trCrossentropy = objectives.categorical_crossentropy(
            self.yDropProb, self.y)
        self.trCost = trCrossentropy.mean() + self.C * reg

        # Validation / test set: deterministic forward pass.
        self.yFullProb = layers.get_output(self.outprob, deterministic=True)
        vateCrossentropy = objectives.categorical_crossentropy(
            self.yFullProb, self.y)
        self.vateCost = vateCrossentropy.mean() + self.C * reg

        # Training function: takes a training batch, returns the training
        # cost and the output probabilities, and applies the updates.
        updatesDict = updates.nesterov_momentum(
            self.trCost, self.params, self.lr, self.momentum)
        self.trainfn = makeFunc(
            [self.X, self.y], [self.trCost, self.yDropProb], updatesDict)

        # Validation / test function: returns the cost and the output
        # probabilities without applying any update.
        self.vatefn = makeFunc(
            [self.X, self.y], [self.vateCost, self.yFullProb], None)
def complieTrainFunction(self):
    """Compile and return the Theano training function.

    The loss is the mean categorical cross-entropy of the non-deterministic
    network output against the flattened targets, plus ``self.weightDecay``
    times the L2 penalty of the network.  Compilation time is logged.

    Returns:
        A function ``trainFunc(inputs, targets)`` that applies the optimizer
        updates and returns ``[trainLoss, trainACC]``.
    """
    self.logger.info(logMessage('+', 'Compiling the Training Function'))
    startTime = time.time()

    trainPrediction = get_output(
        self.outputLayer,
        deterministic=False,
        batch_norm_update_averages=False,
        batch_norm_use_averages=False,
    )

    # TODO: check whether the flattened targets line up with the output rows.
    flatTargets = T.flatten(self.targetVar)
    self.flattenedTargetVar = flatTargets

    crossEntropy = categorical_crossentropy(trainPrediction, flatTargets).mean()
    l2Penalty = regularize_network_params(
        self.outputLayer, lasagne.regularization.l2)
    trainLoss = crossEntropy + self.weightDecay * l2Penalty

    predictedLabels = T.argmax(trainPrediction, axis=1)
    trainACC = T.mean(
        T.eq(predictedLabels, flatTargets), dtype=theano.config.floatX)

    trainableParams = get_all_params(self.outputLayer, trainable=True)
    optimizerUpdates = self.optimizer(
        trainLoss, trainableParams, learning_rate=self.learningRate)
    trainFunc = theano.function(
        [self.inputVar, self.targetVar],
        [trainLoss, trainACC],
        updates=optimizerUpdates)

    elapsed = time.time() - startTime
    self.logger.info(logMessage(
        '+', 'Compiled the Training Function, spent {:.2f}s'.format(elapsed)))
    return trainFunc
def build_loss(self, env):
    """Return the n-step Q-learning loss with L2 weight decay.

    Args:
        env: experience-replay environment; provides ``rewards``,
            ``actions`` and ``is_alive``.

    Returns:
        Symbolic scalar: the element-wise objective summed and divided by
        the sum of ``env.is_alive``, plus ``1e-4`` times the L2 penalty of
        ``self.resolver``.
    """
    replay_kwargs = dict(
        session_length=self.replay_seq_len,
        batch_size=self.replay_batch_size,
        optimize_experience_replay=True,
    )
    # The Q-value sequence is the fifth of the five returned values.
    _, _, _, _, qvalues_seq = self.agent.get_sessions(env, **replay_kwargs)

    scaled_rewards = env.rewards
    per_step_loss = qlearning_n_step.get_elementwise_objective(
        qvalues_seq,
        env.actions[0],
        scaled_rewards,
        env.is_alive,
        n_steps=self.n_steps,
        gamma_or_gammas=self.gamma,
    )

    alive_total = env.is_alive.sum()
    l2_term = regularize_network_params(self.resolver, l2) * 10 ** -4
    return per_step_loss.sum() / alive_total + l2_term
def build_loss(self, env, agent, replay_seq_len):
    """Return the n-step Q-learning loss with L2 weight decay.

    Args:
        env: experience-replay environment; provides ``batch_size``,
            ``rewards``, ``actions`` and ``is_alive``.
        agent: agent replayed through ``get_sessions``.
        replay_seq_len: number of steps in each replayed session.

    Returns:
        Symbolic scalar: the element-wise objective summed and divided by
        the sum of ``env.is_alive``, plus ``1e-5`` times the L2 penalty of
        the layers in ``agent.state_variables``.
    """
    replay_kwargs = dict(
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )
    # Q-values obtained via experience replay (fifth returned value).
    _, _, _, _, qvalues_seq = agent.get_sessions(env, **replay_kwargs)

    scaled_rewards = env.rewards
    per_step_loss = qlearning_n_step.get_elementwise_objective(
        qvalues_seq,
        env.actions[0],
        scaled_rewards,
        env.is_alive,
        gamma_or_gammas=self.gamma,
        n_steps=self.n_steps,
    )

    # Mean over the "alive" fragments only.
    alive_total = env.is_alive.sum()
    # Regularize the weights of the layers tracked as agent state.
    l2_term = regularize_network_params(
        agent.state_variables.keys(), l2) * 10 ** -5
    return per_step_loss.sum() / alive_total + l2_term
def create_iter_funcs_train(l_out, lr, mntm, wd):
    """Compile the per-batch training function for the network at ``l_out``.

    Args:
        l_out: output layer of the network.
        lr: learning rate passed to Nesterov momentum.
        mntm: momentum passed to Nesterov momentum.
        wd: weight of the L2 penalty added to the loss.

    Returns:
        A Theano function ``train_iter(X_batch, y_batch)`` that applies one
        update and returns ``[train_loss, train_acc]``.
    """
    # Graph variables, and the batch placeholders substituted for them
    # through ``givens``.
    X = T.tensor4('X')
    y = T.ivector('y')
    X_batch = T.tensor4('X_batch')
    y_batch = T.ivector('y_batch')

    y_hat = layers.get_output(l_out, X, deterministic=False)

    # Softmax (categorical cross-entropy) loss plus L2 regularization.
    cross_entropy = T.nnet.categorical_crossentropy(y_hat, y)
    l2_penalty = regularize_network_params(l_out, l2)
    train_loss = T.mean(cross_entropy) + wd * l2_penalty

    predicted_labels = y_hat.argmax(axis=1)
    train_acc = T.mean(T.eq(predicted_labels, y))

    trainable = layers.get_all_params(l_out, trainable=True)
    momentum_updates = lasagne.updates.nesterov_momentum(
        train_loss, trainable, lr, mntm)

    return theano.function(
        inputs=[theano.Param(X_batch), theano.Param(y_batch)],
        outputs=[train_loss, train_acc],
        updates=momentum_updates,
        givens={X: X_batch, y: y_batch},
    )
def __build_loss_train__fn__(self):
    """Compile the training, prediction and validation functions.

    Sets:
        self.eta: shared float32 learning rate, initialised to 0.05.
        self.__train_fn__: ``(inputs, targets) -> loss``; applies Nesterov
            momentum updates (momentum 0.9).
        self.__predict_fn__: ``(inputs) -> deterministic network output``.
        self.__val_fn__: ``(inputs, targets) -> [loss, accuracy]``.

    The validation loss and accuracy are built from the deterministic
    forward pass.  They were previously built from the training pass, so
    stochastic layers such as dropout stayed active during validation.
    """
    l2_term = 1e-4 * regularization.regularize_network_params(
        self.net, regularization.l2)

    # Training loss on the default (non-deterministic) forward pass.
    prediction = layers.get_output(self.net)
    loss = objectives.categorical_crossentropy(
        prediction, self.__target_var__).mean() + l2_term

    # Validation metrics on the deterministic forward pass.
    test_prediction = layers.get_output(self.net, deterministic=True)
    val_loss = objectives.categorical_crossentropy(
        test_prediction, self.__target_var__).mean() + l2_term
    val_acc = T.mean(
        T.eq(T.argmax(test_prediction, axis=1), self.__target_var__),
        dtype=theano.config.floatX)

    # Parameter update expressions; the learning rate is a shared variable
    # so it can be changed after compilation.
    params = layers.get_all_params(self.net, trainable=True)
    self.eta = theano.shared(sp.array(sp.float32(0.05), dtype=sp.float32))
    update_rule = updates.nesterov_momentum(
        loss, params, learning_rate=self.eta, momentum=0.9)

    self.__train_fn__ = theano.function(
        [self.__input_var__, self.__target_var__], loss,
        updates=update_rule)
    self.__predict_fn__ = theano.function(
        [self.__input_var__], test_prediction)
    self.__val_fn__ = theano.function(
        [self.__input_var__, self.__target_var__], [val_loss, val_acc])
def define_updates(network, input_var, target_var, weight_var,
                   learning_rate=0.01, momentum=0.9, l2_lambda=1e-5):
    """Compile the training and validation functions for ``network``.

    Args:
        network: output layer of the network.
        input_var, target_var, weight_var: inputs of both compiled functions.
        learning_rate: learning rate for Nesterov momentum.
        momentum: momentum for Nesterov momentum.
        l2_lambda: weight of the L2 penalty handed to ``_score_metrics``.

    Returns:
        ``(train_fn, val_fn)``.  Both return
        ``[loss, l2_loss, acc, target_prediction, prediction]``; ``train_fn``
        uses the default forward pass and applies the updates, ``val_fn``
        uses the deterministic pass.
    """
    function_inputs = [input_var, target_var, weight_var]

    trainable = lasagne.layers.get_all_params(network, trainable=True)
    train_out = lasagne.layers.get_output(network)
    eval_out = lasagne.layers.get_output(network, deterministic=True)
    l2_loss = l2_lambda * regularize_network_params(network, l2)

    # Same metric helper for both passes; each call yields four expressions.
    loss, acc, target_prediction, prediction = _score_metrics(
        train_out, target_var, weight_var, l2_loss)
    t_loss, t_acc, t_target_prediction, t_prediction = _score_metrics(
        eval_out, target_var, weight_var, l2_loss)

    momentum_updates = lasagne.updates.nesterov_momentum(
        loss, trainable, learning_rate=learning_rate, momentum=momentum)

    train_fn = theano.function(
        function_inputs,
        [loss, l2_loss, acc, target_prediction, prediction],
        updates=momentum_updates)
    val_fn = theano.function(
        function_inputs,
        [t_loss, l2_loss, t_acc, t_target_prediction, t_prediction])
    return train_fn, val_fn
def compileValFunction(self):
    """Compile and return the Theano validation function.

    The loss is the mean categorical cross-entropy of the deterministic
    network output against the flattened targets, plus ``self.weightDecay``
    times the L2 penalty of the network.  Compilation time is logged.

    Returns:
        A function ``valFunc(inputs, targets)`` returning
        ``[valLoss, valACC]``; no parameters are updated.
    """
    self.logger.info(logMessage('+', 'Compiling the Validation Function'))
    startTime = time.time()

    valPrediction = get_output(
        self.outputLayer,
        deterministic=True,
        batch_norm_update_averages=False,
        batch_norm_use_averages=False,
    )

    # TODO: check whether the flattened targets line up with the output rows.
    flatTargets = T.flatten(self.targetVar)
    self.flattenedTargetVar = flatTargets

    crossEntropy = categorical_crossentropy(valPrediction, flatTargets).mean()
    l2Penalty = regularize_network_params(
        self.outputLayer, lasagne.regularization.l2)
    valLoss = crossEntropy + self.weightDecay * l2Penalty

    predictedLabels = T.argmax(valPrediction, axis=1)
    valACC = T.mean(
        T.eq(predictedLabels, flatTargets), dtype=theano.config.floatX)

    valFunc = theano.function(
        [self.inputVar, self.targetVar], [valLoss, valACC])

    elapsed = time.time() - startTime
    self.logger.info(logMessage(
        '+', 'Compiled the Validation Function, spent {:.2f}s'.format(elapsed)))
    return valFunc
def __init__(self, C, lr):
    """Build the network and compile its Theano functions.

    Args:
        C: weight of the normalized L2 penalty in the training cost.
        lr: learning rate for Nesterov momentum (momentum 0.9).

    Sets ``flattenfn`` (output of the ``'flatten'`` layer), ``predictfn``
    (output of the ``'out'`` layer), ``scorefn`` (accuracy), ``trainfn``
    (returns ``[cost, accuracy]`` and applies the updates) and
    ``sharedBeta`` (first parameter of the ``'out'`` layer).
    """
    self.C = C
    self.X = T.ftensor4()
    self.Y = T.fmatrix()
    self.net = self._forward()

    flatten_layer = self.net['flatten']
    out_layer = self.net['out']

    # Only the parameters up to the 'flatten' layer are trained and
    # regularized here.
    trainable = layers.get_all_params(flatten_layer, trainable=True)
    net_output = layers.get_output(out_layer)
    flatten_output = layers.get_output(flatten_layer)
    l2_penalty = regularization.regularize_network_params(
        flatten_layer, regularization.l2)
    l2_penalty = l2_penalty / layers.helper.count_params(flatten_layer)

    self.flattenfn = theano.function(
        [self.X], flatten_output, allow_input_downcast=True)
    self.predictfn = theano.function(
        [self.X], net_output, allow_input_downcast=True)

    accuracy = myUtils.basic.accuracy(net_output, self.Y)
    self.scorefn = theano.function(
        [self.X, self.Y], accuracy, allow_input_downcast=True)
    self.sharedBeta = out_layer.get_params()[0]

    cross_entropy = objectives.categorical_crossentropy(net_output, self.Y)
    cost = T.mean(cross_entropy) + C * l2_penalty
    momentum_updates = updates.nesterov_momentum(cost, trainable, lr, 0.9)

    # Train the random parameters.
    self.trainfn = theano.function(
        [self.X, self.Y], [cost, accuracy],
        updates=momentum_updates, allow_input_downcast=True)
def create_iter_funcs_train(l_out, lr, mntm, wd):
    """Return a compiled function that performs one training step.

    Args:
        l_out: output layer of the network.
        lr: learning rate for Nesterov momentum.
        mntm: momentum for Nesterov momentum.
        wd: multiplier of the L2 penalty in the loss.

    Returns:
        Theano function taking ``(X_batch, y_batch)`` and returning
        ``[train_loss, train_acc]`` while updating the trainable parameters.
    """
    X, X_batch = T.tensor4('X'), T.tensor4('X_batch')
    y, y_batch = T.ivector('y'), T.ivector('y_batch')

    y_hat = layers.get_output(l_out, X, deterministic=False)

    # Softmax loss.
    data_loss = T.mean(T.nnet.categorical_crossentropy(y_hat, y))
    # L2 regularization.
    train_loss = data_loss + wd * regularize_network_params(l_out, l2)

    is_correct = T.eq(y_hat.argmax(axis=1), y)
    train_acc = T.mean(is_correct)

    all_params = layers.get_all_params(l_out, trainable=True)
    step = lasagne.updates.nesterov_momentum(
        train_loss, all_params, lr, mntm)

    # The batch placeholders are swapped in for the graph variables.
    substitutions = {X: X_batch, y: y_batch}
    train_iter = theano.function(
        inputs=[theano.Param(X_batch), theano.Param(y_batch)],
        outputs=[train_loss, train_acc],
        updates=step,
        givens=substitutions,
    )
    return train_iter
def _create_loss(self, output_layer, predicted, target, error_threshold,
                 proto_loss_multiplier=1.0):
    """Build the regularized loss, optionally scaled by a cognacy prior.

    Args:
        output_layer: layer whose network parameters are L2-regularized.
        predicted: symbolic predictions.
        target: symbolic targets.
        error_threshold: value added to the negated prediction error inside
            the sigmoid of the cognacy prior.
        proto_loss_multiplier: final multiplier applied to the loss.

    Returns:
        ``(loss, cognacy_prior_factor, target_prediction_error,
        error_threshold)``.

    Raises:
        ValueError: if ``self.output_encoding`` is not ``"phonetic"``,
            ``"embedding"`` or ``"character"``.  Previously this case left
            ``loss`` unbound and failed later with ``UnboundLocalError``.
    """
    # Regularization term.
    reg_term = self.reg_weight * regularize_network_params(output_layer, l2)

    encoding = self.output_encoding
    if encoding == "phonetic" or encoding == "embedding":
        # Binary cross-entropy: multi-label classification.
        elementwise = lasagne.objectives.binary_crossentropy(predicted, target)
    elif encoding == "character":
        # Categorical cross-entropy: single-label classification.
        elementwise = lasagne.objectives.categorical_crossentropy(
            predicted, target)
    else:
        raise ValueError("Unsupported output_encoding: %r" % (encoding,))
    loss = T.sum(elementwise) / self.batch_size + reg_term

    if self.cognacy_prior > 0.0:
        # Multiply the loss by sigmoid(-error + error_threshold): the factor
        # is high for a low prediction error and falls off once the error
        # exceeds the threshold, so more is learned from probable cognates.
        target_prediction_error = T.sum(
            lasagne.objectives.squared_error(predicted, target)
        ) / self.batch_size
        cognacy_prior_factor = utility.sigmoid(
            -target_prediction_error + error_threshold)
        loss *= cognacy_prior_factor
    else:
        cognacy_prior_factor = T.constant(1)
        target_prediction_error = T.constant(0)

    loss *= proto_loss_multiplier
    return loss, cognacy_prior_factor, target_prediction_error, error_threshold
def __init__(self, lr, C, momentum):
    """Build the network and compile the training / evaluation functions.

    Args:
        lr: learning rate for Nesterov momentum.
        C: weight of the normalized L2 penalty in both costs.
        momentum: momentum for Nesterov momentum.

    Sets ``trainfn`` (returns ``[trCost, trEqs]`` and applies the updates),
    ``vatefn`` (returns ``[vateCost, vateEqs]`` without updates) and
    ``yPred`` (deterministic network output).
    """
    self.lr, self.C, self.momentum = lr, C, momentum
    self.X = T.tensor4('X')
    self.y = T.ivector('y')
    self.network = self._build()
    self.params = layers.get_all_params(self.network, trainable=True)

    # L2 penalty divided by the parameter count of the network.
    l2_penalty = regularization.regularize_network_params(
        self.network, regularization.l2)
    l2_penalty = l2_penalty / layers.helper.count_params(self.network)

    # Training set: default (non-deterministic) forward pass.
    train_probs = layers.get_output(self.network)
    self.trEqs = myUtils.basic.eqs(train_probs, self.y)
    train_xent = objectives.categorical_crossentropy(train_probs, self.y)
    self.trCost = train_xent.mean() + C * l2_penalty

    # Validation / test set: deterministic forward pass.
    eval_probs = layers.get_output(self.network, deterministic=True)
    self.vateEqs = myUtils.basic.eqs(eval_probs, self.y)
    eval_xent = objectives.categorical_crossentropy(eval_probs, self.y)
    self.vateCost = eval_xent.mean() + C * l2_penalty
    self.yPred = eval_probs

    # Training function: takes a training batch, returns the training cost
    # and ``trEqs``, and applies the updates.
    momentum_updates = updates.nesterov_momentum(
        self.trCost, self.params, lr, momentum)
    self.trainfn = myUtils.basic.makeFunc(
        [self.X, self.y], [self.trCost, self.trEqs], momentum_updates)

    # Validation / test function: returns the cost and ``vateEqs`` without
    # performing any update.
    self.vatefn = myUtils.basic.makeFunc(
        [self.X, self.y], [self.vateCost, self.vateEqs], None)
def compile_train_predict(self, stochastic_train, stochastic_predict):
    """Compile ``self.train_fn`` and ``self.predict_fn``.

    A network is built on top of the posterior GP mean (reshaped into a
    row) via ``self.extend_network``.  The training loss is the mean
    categorical cross-entropy, plus an L2 penalty when
    ``self.regularize_weight > 0``.

    Args:
        stochastic_train: forwarded to ``extend_network`` for the training
            network; when truthy the labels are repeated ``self.n_samples``
            times to match the predictions.
        stochastic_predict: forwarded to ``extend_network`` for the test
            network; when truthy and ``self.n_samples > 1`` the prediction
            is averaged over axis 0.

    Sets ``gp_hyperparams``, ``train_network``, ``train_fn``,
    ``test_network``, ``copy_params`` and ``predict_fn``.
    """
    # Symbolic inputs and hyper-parameters of the posterior GP.
    input_vars = self.post_gp.data_variables
    gp_hyperparams = self.post_gp.params
    self.gp_hyperparams = gp_hyperparams

    # Turn the 1-d posterior mean (N) into a row (1xN).
    mu = self.post_gp.mean().dimshuffle('x', 0)

    self.train_network = self.extend_network(mu, stochastic_train)
    train_predict = lasagne.layers.get_output(self.train_network)

    # Expected loss: with stochastic training every label is repeated so it
    # lines up with each of the sampled predictions.
    label = T.ivector('label')
    label_rep = label.repeat(self.n_samples) if stochastic_train else label
    loss = categorical_crossentropy(train_predict, label_rep).mean()

    if self.regularize_weight > 0:
        loss += self.regularize_weight * regularize_network_params(
            self.train_network, l2)

    # Parameters to optimize: the network weights, optionally followed by
    # the GP hyper-parameters.
    update_params = lasagne.layers.get_all_params(
        self.train_network, trainable=True)
    if self.update_gp:
        update_params += gp_hyperparams

    grad_loss = theano.grad(
        loss, update_params, consider_constant=input_vars)
    updates = self.optimizer(
        grad_loss, update_params, **self.optimizer_kwargs)
    self.train_fn = theano.function(
        input_vars + [label], loss, updates=updates)

    # Reuse the training network when both modes agree; otherwise build a
    # separate test network and flag that parameters must be copied.
    self.copy_params = stochastic_train != stochastic_predict
    if self.copy_params:
        self.test_network = self.extend_network(mu, stochastic_predict)
    else:
        self.test_network = self.train_network

    # deterministic=True is set for the case that dropout training is used.
    test_predict = lasagne.layers.get_output(
        self.test_network, deterministic=True)
    if stochastic_predict and self.n_samples > 1:
        test_predict = test_predict.mean(axis=0, keepdims=True)
    self.predict_fn = theano.function(input_vars, test_predict)
def train_setup():
    """Build the encoder/decoder network and compile its functions.

    Optionally initialises the parameters from ``config.init_model``.  The
    training objective is the mean squared error plus L1 and L2 penalties
    weighted by ``config.l1_weight`` and ``config.l2_weight``, optimized
    with Adadelta.

    Returns:
        ``(encoding, decoding, train_fn, val_fn)``.  ``train_fn(x, y)``
        returns ``[error, l1_norm, l2_norm]`` and applies the updates;
        ``val_fn(x, y)`` returns the deterministic mean squared error.
    """
    x = T.tensor3('input')
    y = T.matrix('output')
    encoding, decoding = cnn(
        x, config.input_length, config.output_length, config.encoding_length)
    print('Number of Parameters {0}'.format(count_params(decoding)))

    if config.init_model is not None:
        # Arrays are stored as arr_0, arr_1, ... in parameter order.
        with np.load(config.init_model) as f:
            stored = [f['arr_%d' % idx] for idx in range(len(f.files))]
        set_all_param_values(decoding, stored)

    # Training graph (default, non-deterministic forward pass).
    prediction = get_output(decoding)
    error = squared_error(y, prediction).mean()
    l1_norm = config.l1_weight * regularize_network_params(decoding, l1)
    l2_norm = config.l2_weight * regularize_network_params(decoding, l2)
    total_error = error + l1_norm + l2_norm

    trainable = get_all_params(decoding, trainable=True)
    updates = adadelta(
        total_error, trainable,
        config.learning_rate, config.rho, config.eps)
    train_fn = function(
        [x, y], [error, l1_norm, l2_norm],
        updates=updates, allow_input_downcast=True)

    # Validation graph (deterministic forward pass, no penalties).
    val_prediction = get_output(decoding, deterministic=True)
    val_error = squared_error(y, val_prediction).mean()
    val_fn = function([x, y], val_error, allow_input_downcast=True)

    return encoding, decoding, train_fn, val_fn
def train_setup():
    """Build the classification network and compile its functions.

    Optionally initialises the parameters from ``config.init_model``.  The
    training objective is the mean categorical cross-entropy plus L1 and L2
    penalties weighted by ``config.l1_weight`` and ``config.l2_weight``,
    optimized with Adadelta.

    Returns:
        ``(network, train_fn, val_fn)``.  ``train_fn(x, y)`` returns
        ``[ent, l1_norm, l2_norm, prediction]`` and applies the updates;
        ``val_fn(x, y)`` returns ``[val_ent, val_prediction]`` from the
        deterministic forward pass.
    """
    x = T.tensor3('input')
    y = T.lvector('output')
    network = cnn(x, config.input_length, config.output_length)
    print('Number of Parameters {0}'.format(count_params(network)))

    if config.init_model is not None:
        # Arrays are stored as arr_0, arr_1, ... in parameter order.
        with np.load(config.init_model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        # Load into ``network``: the previous target, ``decoding``, is not
        # defined anywhere in this function.
        set_all_param_values(network, param_values)

    # Training graph (default, non-deterministic forward pass).
    prediction = get_output(network)
    ent = categorical_crossentropy(prediction, y).mean()
    l1_norm = config.l1_weight * regularize_network_params(network, l1)
    l2_norm = config.l2_weight * regularize_network_params(network, l2)
    total_error = ent + l1_norm + l2_norm

    params = get_all_params(network, trainable=True)
    updates = adadelta(
        total_error, params,
        config.learning_rate, config.rho, config.eps)
    train_fn = function(
        [x, y], [ent, l1_norm, l2_norm, prediction],
        updates=updates, allow_input_downcast=True)

    # Validation graph (deterministic forward pass, no penalties).
    val_prediction = get_output(network, deterministic=True)
    val_ent = categorical_crossentropy(val_prediction, y).mean()
    val_fn = function(
        [x, y], [val_ent, val_prediction], allow_input_downcast=True)

    return network, train_fn, val_fn
def get_functions(cfg):
    """Build the network described by ``cfg`` and compile its functions.

    The training cost is ``loss(prediction, targets, weights)`` plus
    ``5e-6`` times the sum of the L1 and L2 penalties.  The learning rate
    is a shared float32 scalar that starts at 0 and is changed through the
    returned ``set_lr``.

    Args:
        cfg: configuration passed to ``build_network``; ``cfg['optimiser']``
            selects the update rule.

    Returns:
        ``(train_fn, test_predict_fn, train_predict_fn, save_params,
        load_params, output_shape, set_lr)``.
    """
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    weights_var = T.vector('weights')
    lr = theano.shared(np.float32(0.0000))

    network = build_network(cfg, input_var)
    prediction = lasagne.layers.get_output(network)
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    output_shape = lasagne.layers.get_output_shape(network)

    l2_penalty = regularize_network_params(network, l2)
    l1_penalty = regularize_network_params(network, l1)
    cost = (loss(prediction, target_var, weights_var)
            + 5 * 1e-6 * (l1_penalty + l2_penalty))
    params = lasagne.layers.get_all_params(network, trainable=True)

    def save_params(path):
        """Save the list of trainable shared variables to ``path``."""
        np.savez(path, params)

    def load_params(path):
        """Load parameters written by ``save_params`` into the network."""
        # The archive holds the shared variables themselves under 'arr_0'.
        # The context manager closes the file, which used to stay open.
        with np.load(path) as data:
            param_values = [x.get_value() for x in data['arr_0']]
        lasagne.layers.set_all_param_values(
            network, param_values, trainable=True)

    def set_lr(value):
        """Set the shared learning rate to ``value``."""
        lr.set_value(value)

    updates = get_updates(cost, params, cfg['optimiser'], lr)

    # A validation-accuracy function used to be compiled here but was never
    # returned, so that dead compilation has been dropped.
    train_fn = theano.function(
        [input_var, target_var, weights_var], cost, updates=updates)
    train_predict_fn = theano.function([input_var], prediction)
    test_predict_fn = theano.function([input_var], test_prediction)

    return (train_fn, test_predict_fn, train_predict_fn,
            save_params, load_params, output_shape, set_lr)
def __init__(self, steps=1, num_layers=2, num_units=32, eps=1e-2,
             recurrent=False, nonlinearity=tanh):
    """Build a small regression network and compile its Theano functions.

    Args:
        steps: width of the input matrix.
        num_layers: number of hidden dense layers.
        num_units: units per hidden layer (and of the LSTM, if used).
        eps: weight of the L2 penalty in the training loss.
        recurrent: when truthy, the input is reshaped to
            ``(-1, steps, 1)`` and passed through an LSTM layer first.
        nonlinearity: nonlinearity of the hidden dense layers.

    Sets ``f`` (network output), ``error`` (mean squared error) and
    ``train`` (returns the regularized loss and applies Adam updates).
    """
    self.steps = steps
    self.X = T.fmatrix()
    self.Y = T.fmatrix()

    def network(l):
        # Optional recurrent front end, then the dense stack and a single
        # linear output unit.
        hidden = l
        if recurrent:
            hidden = ReshapeLayer(hidden, shape=(-1, steps, 1))
            hidden = LSTMLayer(hidden, num_units)
        for _ in range(num_layers):
            hidden = DenseLayer(
                hidden, num_units=num_units, nonlinearity=nonlinearity)
        return DenseLayer(hidden, num_units=1, nonlinearity=linear)

    self.network = network

    input_layer = InputLayer(input_var=self.X, shape=(None, steps))
    output_layer = self.network(input_layer)
    self.l_ = output_layer
    self.x_ = get_output(self.l_)
    self.f = theano.function([self.X], self.x_, allow_input_downcast=True)

    l2_penalty = regularize_network_params(output_layer, L2)
    error = squared_error(self.x_, self.Y).mean()
    loss = error + eps * l2_penalty
    adam_updates = adam(loss, get_all_params(output_layer))

    self.error = theano.function(
        [self.X, self.Y], error, allow_input_downcast=True)
    self.train = theano.function(
        [self.X, self.Y], loss,
        updates=adam_updates, allow_input_downcast=True)
def set_network_trainer(input_data,
                        input_mask,
                        target_data,
                        target_mask,
                        network,
                        updater,
                        learning_rate,
                        grad_max_norm=10.,
                        load_updater_params=None):
    """Compile the training function for a masked sequence classifier.

    The prediction cost is the categorical cross-entropy of the flattened
    network output (offset by ``eps``) against the flattened targets,
    multiplied by the flattened target mask and divided by the mask sum.
    Gradients of that cost are rescaled so that their total norm does not
    exceed ``grad_max_norm``.

    Args:
        input_data, input_mask, target_data, target_mask: symbolic inputs
            of the compiled function.
        network: output layer of the network.
        updater: update rule called with ``loss_or_grads``, ``params``,
            ``learning_rate`` and ``load_params_dict``; it must return
            ``(updates, updater_params)``.
        learning_rate: initial value of the shared learning rate.
        grad_max_norm: maximum total gradient norm.
        load_updater_params: forwarded to ``updater`` as
            ``load_params_dict``.

    Returns:
        ``(training_fn, trainer_params)``.  ``training_fn`` returns
        ``[predict_data, predict_idx, train_predict_cost,
        train_regularizer_cost, network_grads_norm]``.
        NOTE(review): the original block compiled ``training_fn`` but never
        returned it; the exact tuple expected by callers should be
        confirmed.
    """
    # Network output and arg-max prediction.
    predict_data = get_output(network, deterministic=False)
    predict_idx = T.argmax(predict_data, axis=-1)

    # Masked prediction cost.
    flat_predictions = T.reshape(
        predict_data, (-1, predict_data.shape[-1])) + eps
    train_predict_cost = categorical_crossentropy(
        predictions=flat_predictions,
        targets=T.flatten(target_data, 1))
    train_predict_cost = train_predict_cost * T.flatten(target_mask, 1)
    train_predict_cost = train_predict_cost.sum() / target_mask.sum()

    # Regularizer cost; it is reported by the function but is not part of
    # the optimized objective.
    train_regularizer_cost = regularize_network_params(network, penalty=l2)

    network_params = get_all_params(network, trainable=True)

    # Gradients with norm clipping.  An earlier gradient that added
    # ``train_regularizer_cost * l2_lambda`` was immediately overwritten and
    # referenced ``l2_lambda``, which is not a parameter of this function,
    # so only the effective computation is kept.
    network_grads = theano.grad(cost=train_predict_cost,
                                wrt=network_params)
    network_grads, network_grads_norm = total_norm_constraint(
        tensor_vars=network_grads,
        max_norm=grad_max_norm,
        return_norm=True)

    # Update rule with a shared learning rate.
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, trainer_params = updater(
        loss_or_grads=network_grads,
        params=network_params,
        learning_rate=train_lr,
        load_params_dict=load_updater_params)

    # Training (update) function.  The outputs list previously contained a
    # stray ``]`` that made the call a syntax error.
    training_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[predict_data,
                 predict_idx,
                 train_predict_cost,
                 train_regularizer_cost,
                 network_grads_norm],
        updates=train_updates,
        allow_input_downcast=True)
    return training_fn, trainer_params
def build_treatment_model(self, n_vars, **kwargs):
    """Build the treatment network and compile its Theano functions.

    Args:
        n_vars: number of input columns.
        **kwargs: must provide ``dense_size`` and ``n_dense_layers``.

    Sets ``self.treatment_output`` (final linear layer) and the compiled
    ``self._train_fn``, ``self._loss_fn`` and ``self._output_fn``.

    Returns:
        The parameter values of the network right after construction.
    """
    input_vars = TT.matrix()
    instrument_vars = TT.matrix()
    targets = TT.vector()

    # Input -> dropout -> double-width dense block with batch norm and
    # dropout.
    net = layers.InputLayer((None, n_vars), input_vars)
    net = layers.DropoutLayer(net, p=0.2)
    net = layers.DenseLayer(
        net, 2 * kwargs['dense_size'],
        nonlinearity=nonlinearities.rectify)
    net = layers.batch_norm(net)
    net = layers.DropoutLayer(net, p=0.2)

    # Remaining dense blocks (batch-normalized, no dropout).
    for _ in xrange(kwargs['n_dense_layers'] - 1):
        net = layers.DenseLayer(
            net, kwargs['dense_size'],
            nonlinearity=nonlinearities.rectify)
        net = layers.batch_norm(net)

    self.treatment_output = layers.DenseLayer(
        net, 1, nonlinearity=nonlinearities.linear)
    init_params = layers.get_all_param_values(self.treatment_output)

    prediction = layers.get_output(
        self.treatment_output, deterministic=False)
    test_prediction = layers.get_output(
        self.treatment_output, deterministic=True)

    l2_cost = regularization.regularize_network_params(
        self.treatment_output, regularization.l2)
    loss = gmm_loss(prediction, targets, instrument_vars) + 1e-4 * l2_cost

    trainable = layers.get_all_params(self.treatment_output, trainable=True)
    param_updates = updates.adadelta(loss, trainable)

    loss_inputs = [input_vars, targets, instrument_vars]
    self._train_fn = theano.function(
        loss_inputs, loss, updates=param_updates)
    self._loss_fn = theano.function(loss_inputs, loss)
    self._output_fn = theano.function([input_vars], test_prediction)
    return init_params
def test_regularize_network_params(self, layers):
    """The penalty is called once for ``W`` of the second and third layer."""
    from lasagne.regularization import regularize_network_params

    _, middle_layer, top_layer = layers
    penalty = Mock(return_value=0)

    regularize_network_params(top_layer, penalty)

    assert penalty.call_count == 2
    for layer in (middle_layer, top_layer):
        penalty.assert_any_call(layer.W)
def test_regularize_network_params(self, layers):
    """Regularizing from the last layer applies the penalty to two weights."""
    from lasagne.regularization import regularize_network_params

    _first, second, third = layers
    penalty = Mock(return_value=0)

    # The returned expression is irrelevant; only the calls are checked.
    regularize_network_params(third, penalty)

    expected_weights = [second.W, third.W]
    assert penalty.call_count == 2
    for weight in expected_weights:
        penalty.assert_any_call(weight)
def build_instrument_model(self, n_vars, **kwargs):
    """Build the instrument network and compile its Theano functions.

    Args:
        n_vars: number of instrument columns.
        **kwargs: must provide ``dense_size`` and ``n_dense_layers``.

    Sets ``self.instrument_output`` (final linear layer) and the compiled
    ``self._instrument_train_fn`` (regularized loss, with updates),
    ``self._instrument_loss_fn`` (unregularized loss) and
    ``self._instrument_output_fn`` (deterministic prediction).

    Returns:
        The parameter values of the network right after construction.
    """
    targets = TT.vector()
    instrument_vars = TT.matrix()

    # Input -> dropout -> tanh dense layers, each followed by dropout.
    net = layers.InputLayer((None, n_vars), instrument_vars)
    net = layers.DropoutLayer(net, p=0.2)
    net = layers.DenseLayer(
        net, kwargs['dense_size'], nonlinearity=nonlinearities.tanh)
    net = layers.DropoutLayer(net, p=0.2)
    for _ in xrange(kwargs['n_dense_layers'] - 1):
        net = layers.DenseLayer(
            net, kwargs['dense_size'], nonlinearity=nonlinearities.tanh)
        net = layers.DropoutLayer(net, p=0.5)

    self.instrument_output = layers.DenseLayer(
        net, 1, nonlinearity=nonlinearities.linear)
    init_params = layers.get_all_param_values(self.instrument_output)

    prediction = layers.get_output(
        self.instrument_output, deterministic=False)
    test_prediction = layers.get_output(
        self.instrument_output, deterministic=True)

    # Flexible here: the endogenous variable can be categorical,
    # continuous, etc.
    l2_cost = regularization.regularize_network_params(
        self.instrument_output, regularization.l2)
    flat_prediction = prediction.flatten()
    flat_targets = targets.flatten()
    loss = objectives.squared_error(
        flat_prediction, flat_targets).mean() + 1e-4 * l2_cost
    # The reported loss omits the weight penalty.
    loss_total = objectives.squared_error(
        flat_prediction, flat_targets).mean()

    trainable = layers.get_all_params(
        self.instrument_output, trainable=True)
    param_updates = updates.adadelta(loss, trainable)

    loss_inputs = [targets, instrument_vars]
    self._instrument_train_fn = theano.function(
        loss_inputs, loss, updates=param_updates)
    self._instrument_loss_fn = theano.function(loss_inputs, loss_total)
    self._instrument_output_fn = theano.function(
        [instrument_vars], test_prediction)
    return init_params
def triplet_loss_iter(embedder, update_params={}):
    """Compile training and validation functions for a triplet loss.

    Args:
        embedder: list of prediction layers; the last one is the final
            embedding layer.
        update_params: not used by the current implementation.

    Returns:
        Dict with ``'train'`` (returns the training loss followed by all
        gradients and applies Adam updates), ``'valid'`` (returns the
        triplet loss, the regularized loss and the failure rate on the
        deterministic pass) and ``'gradnames'`` (names of the gradients).
    """
    alpha = 0.2  # FaceNet alpha
    decay = 0.001
    roles = ('anchor', 'positive', 'negative')

    # One symbolic image batch per triplet role.
    X_triplets = {role: T.tensor4() for role in roles}
    triplet_inputs = [X_triplets[role] for role in roles]

    final_emb_layer = embedder[-1]
    imwrite_architecture(ll.get_all_layers(embedder), './layer_rep.png')

    # ``embedder`` may hold several prediction layers (e.g. a jet
    # architecture); the last one is the end of the network and the
    # embedding that is ultimately used.
    predicted_embeds_train = {
        role: ll.get_output(embedder, X)[-1]
        for role, X in X_triplets.items()}
    predicted_embeds_valid = {
        role: ll.get_output(final_emb_layer, X, deterministic=True)
        for role, X in X_triplets.items()}

    def triplet_distances(pred):
        # Per-sample hinge: d(anchor, positive) - d(anchor, negative) + alpha.
        to_positive = (pred['anchor'] - pred['positive']).norm(2, axis=1)
        to_negative = (pred['anchor'] - pred['negative']).norm(2, axis=1)
        return (to_positive - to_negative + alpha).clip(0, np.inf)

    def triplet_failed(pred):
        # Fraction of triplets whose hinge value exceeds alpha.
        return T.mean(triplet_distances(pred) > alpha)

    def triplet_loss(pred):
        return T.sum(triplet_distances(pred))

    reg = regularize_network_params(final_emb_layer, l2) * decay

    def losses_reg(pred):
        return triplet_loss(pred) + reg

    loss_train = losses_reg(predicted_embeds_train)
    loss_train.name = 'TL'  # prefix used when the gradients are named

    # get_all_params also works with multiple 'roots'.
    all_params = ll.get_all_params(embedder, trainable=True)
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function(
        triplet_inputs, [loss_train] + grads, updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function(
        triplet_inputs,
        [triplet_loss(predicted_embeds_valid),
         losses_reg(predicted_embeds_valid),
         triplet_failed(predicted_embeds_valid)])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train': train_iter,
            'valid': valid_iter,
            'gradnames': [g.name for g in grads]}
def contrastive_loss_iter(embedder, update_params={}):
    """Compile training and validation functions for a contrastive-loss embedder.

    Parameters
    ----------
    embedder : sequence of Lasagne layers
        Output layer(s) of the network. ``embedder[-1]`` is treated as the
        final embedding layer: it alone is regularized and used for
        validation.
    update_params : dict
        Not read by this function (only a commented-out optimizer call
        referenced it).

    Returns
    -------
    dict
        ``'train'``: Theano function of (img1, img2, y) that applies Adam
        updates and returns the regularized loss followed by the gradients.
        ``'valid'``: Theano function of the same inputs returning
        ``[contrastive loss, regularized loss, failed-pair rate]``.
        ``'gradnames'``: names of the gradient variables, in order.
    """
    X_pairs = {
        'img1': T.tensor4(),
        'img2': T.tensor4(),
    }
    # Pair labels: the loss below pulls pairs with y == 1 together and pushes
    # pairs with y == 0 apart, i.e. 1 = match, 0 = non-match.
    y = T.ivector()

    final_emb_layer = embedder[-1]
    all_layers = ll.get_all_layers(embedder)
    imwrite_architecture(all_layers, './layer_rep.png')

    # Training uses the last of all requested outputs; validation asks for the
    # final layer directly with deterministic=True.
    predicted_embeds_train = {k: ll.get_output(embedder, X)[-1]
                              for k, X in X_pairs.items()}
    predicted_embeds_valid = {k: ll.get_output(final_emb_layer, X,
                                               deterministic=True)
                              for k, X in X_pairs.items()}

    margin = 1

    def distance(pred):
        # The small offset keeps the norm away from exactly zero.
        return (pred['img1'] - pred['img2'] + 1e-7).norm(2, axis=1)

    def contrastive_loss(pred):
        dist = distance(pred)
        return T.mean(y * dist + (1 - y) * (margin - dist).clip(0, np.inf))

    def failed_matches(pred):
        # Fraction of matching pairs that ended up farther apart than margin.
        n_matches = T.sum(y)
        n_failed = T.sum(y * (distance(pred) > margin))
        # true_div keeps the ratio fractional even for integer operands.
        return T.switch(T.eq(n_matches, 0), 0,
                        T.true_div(n_failed, n_matches))

    def failed_nonmatches(pred):
        # Fraction of non-matching pairs that ended up closer than margin.
        # The original wrote `1-y*distance`, which parses as
        # `1 - (y*distance)` and therefore never masked by (1 - y).
        n_nonmatches = T.sum(1 - y)
        n_failed = T.sum((1 - y) * (distance(pred) < margin))
        return T.switch(T.eq(n_nonmatches, 0), 0,
                        T.true_div(n_failed, n_nonmatches))

    def failed_pairs(pred):
        return 0.5 * failed_matches(pred) + 0.5 * failed_nonmatches(pred)

    decay = 0.0001
    reg = regularize_network_params(final_emb_layer, l2) * decay

    def losses_reg(pred):
        return contrastive_loss(pred) + reg

    loss_train = losses_reg(predicted_embeds_train)
    loss_train.name = 'CL'  # prefix used when naming the gradients

    all_params = ll.get_all_params(embedder, trainable=True)
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)

    fn_inputs = [X_pairs['img1'], X_pairs['img2'], y]

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function(fn_inputs, [loss_train] + grads,
                                 updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function(fn_inputs, [
        contrastive_loss(predicted_embeds_valid),
        losses_reg(predicted_embeds_valid),
        failed_pairs(predicted_embeds_valid)])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train': train_iter,
            'valid': valid_iter,
            'gradnames': [g.name for g in grads]}
def loss_iter(segmenter, update_params={}):
    """Compile training and validation functions for a multi-output segmenter.

    Parameters
    ----------
    segmenter : sequence of Lasagne layers
        Prediction layer(s). Every prediction contributes to the training
        loss; ``segmenter[-1]`` is the final prediction used for validation
        and for the L2 penalty.
    update_params : dict
        Not read by this function (only a commented-out optimizer call
        referenced it).

    Returns
    -------
    dict
        ``'train'``: Theano function of (X, y, pixel_weights) that applies
        Adam updates and returns the summed regularized loss followed by the
        gradients.
        ``'valid'``: Theano function of the same inputs returning
        ``[loss, regularized loss, precision]`` for the final prediction.
        ``'gradnames'``: names of the gradient variables, in order.
    """
    X = T.tensor4()
    y = T.tensor4()
    pixel_weights = T.tensor3()

    final_pred_layer = segmenter[-1]
    all_layers = ll.get_all_layers(segmenter)
    imwrite_architecture(all_layers, './layer_rep.png')

    # One training output per prediction layer; validation only evaluates the
    # final layer, with deterministic=True.
    predicted_masks_train = ll.get_output(segmenter, X)
    predicted_mask_valid = ll.get_output(final_pred_layer, X,
                                         deterministic=True)

    thresh = 0.5

    def true_pos(pred):
        # Channel-0 pixels above threshold in both prediction and target.
        return T.sum((pred[:, 0, :, :] > thresh) * (y[:, 0, :, :] > thresh))

    def false_pos(pred):
        # Channel-0 pixels predicted above threshold whose target is not.
        # The original subtracted the two indicator tensors, which yields
        # (false positives - false negatives) rather than a count.
        predicted_pos = pred[:, 0, :, :] > thresh
        actual_pos = y[:, 0, :, :] > thresh
        return T.sum(predicted_pos * (1 - actual_pos))

    def precision(pred):
        hits = true_pos(pred)
        # true_div keeps the ratio fractional for integer counts; the value is
        # undefined (0/0) when nothing is predicted positive.
        return T.true_div(hits, hits + false_pos(pred))

    pixel_weights_1d = pixel_weights.flatten(ndim=1)

    def losses(pred):
        # Per-pixel cross-entropy, weighted; the offsets avoid exact zeros.
        return T.mean(crossentropy_flat(pred + 1e-7, y + 1e-7)
                      * pixel_weights_1d)

    decay = 0.0001
    reg = regularize_network_params(final_pred_layer, l2) * decay

    def losses_reg(pred):
        return losses(pred) + reg

    # Note that the penalty is added once per prediction layer.
    loss_train = T.sum([losses_reg(mask) for mask in predicted_masks_train])
    loss_train.name = 'CE'  # prefix used when naming the gradients

    all_params = ll.get_all_params(segmenter, trainable=True)
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)

    prec_valid = precision(predicted_mask_valid)

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function([X, y, pixel_weights],
                                 [loss_train] + grads, updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X, y, pixel_weights],
                                 [losses(predicted_mask_valid),
                                  losses_reg(predicted_mask_valid),
                                  prec_valid])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train': train_iter,
            'valid': valid_iter,
            'gradnames': [g.name for g in grads]}
def initialize(self):
    """Build the symbolic graphs and compile train / predict / validation functions.

    Sets ``prediction``, ``loss``, ``params``, ``updates``, ``train_fn``,
    ``predict_values``, ``test_prediction``, ``test_loss``, ``test_acc`` and
    ``val_fn`` on ``self``. Reads ``self.network``, ``self.input``,
    ``self.target`` and ``self.learning_rate``.
    """
    # --- training graph (stochastic layers active) ---
    self.prediction = lasagne.layers.get_output(self.network)
    loss = lasagne.objectives.categorical_crossentropy(
        self.prediction, self.target)
    self.loss = loss.mean()

    self.params = lasagne.layers.get_all_params(self.network, trainable=True)
    self.updates = lasagne.updates.nesterov_momentum(
        self.loss, self.params,
        learning_rate=self.learning_rate, momentum=0.9)

    # The compiled function returns the un-averaged cross-entropy, while the
    # updates are derived from its mean.
    self.train_fn = theano.function([self.input, self.target], loss,
                                    updates=self.updates,
                                    allow_input_downcast=True)

    # --- inference graph (deterministic pass) ---
    self.test_prediction = lasagne.layers.get_output(self.network,
                                                     deterministic=True)

    # Predicted class index per row. This is taken from the deterministic
    # output so that stochastic layers such as dropout do not perturb
    # predictions; the original used the training-mode output here.
    outputs = T.argmax(self.test_prediction, axis=1)
    self.predict_values = theano.function([self.input], outputs,
                                          allow_input_downcast=True)

    self.test_loss = lasagne.objectives.categorical_crossentropy(
        self.test_prediction, self.target)
    l1_penalty = regularize_network_params(self.network,
                                           lasagne.regularization.l1)
    l2_penalty = regularize_network_params(self.network,
                                           lasagne.regularization.l2)
    # Only the reported validation loss carries the penalties; the training
    # loss above has none.
    self.test_loss = self.test_loss.mean() + (l1_penalty * 1e-4) + l2_penalty

    self.test_acc = T.mean(
        T.eq(T.argmax(self.test_prediction, axis=1), self.target),
        dtype=theano.config.floatX)
    self.val_fn = theano.function([self.input, self.target],
                                  [self.test_loss, self.test_acc],
                                  allow_input_downcast=True)
def _create_network(self):
    """Build the network graph and compile its train and predict functions.

    Returns
    -------
    tuple
        ``(net, train, predict)``. ``net`` is the layer returned by
        ``self._build_network()``. ``train`` takes (inputs, target values,
        actions), applies the optimizer updates and returns
        ``[loss, network_output, target_values, mask]``. ``predict`` maps
        inputs to the network output.
    """
    logger.info("Building network ...")
    net, input_var = self._build_network()

    target_values = T.matrix('target_output')
    actions = T.icol('actions')

    # Mask with a 1 at each row's selected action column and 0 elsewhere.
    row_idx = T.arange(self.batch_size)
    col_idx = actions.reshape((-1, ))
    mask = T.set_subtensor(T.zeros_like(target_values)[row_idx, col_idx], 1)

    # Forward pass; inputs are divided by 255 before entering the network.
    network_output = lasagne.layers.get_output(net, input_var / 255.0)

    # Squared error between the masked output and the targets, plus an
    # optional L2 penalty.
    loss = squared_error(network_output * mask, target_values).mean()
    if self.weight_decay > 0.0:
        loss = loss + regularize_network_params(net, l2) * self.weight_decay

    all_params = lasagne.layers.get_all_params(net, trainable=True)

    # With clip_error set, the optimizer receives norm-constrained gradients
    # instead of the loss expression.
    if self.clip_error:
        raw_grads = theano.gradient.grad(loss, all_params)
        loss_or_grads = [
            lasagne.updates.norm_constraint(g, self.clip_error,
                                            range(g.ndim))
            for g in raw_grads
        ]
    else:
        loss_or_grads = loss
    updates = self.optimizer(loss_or_grads, all_params,
                             learning_rate=self.learning_rate,
                             rho=self.decay_rate)

    logger.info("Compiling functions ...")
    train = theano.function(
        [input_var, target_values, actions],
        [loss, network_output, target_values, mask],
        updates=updates)
    predict = theano.function([input_var], network_output)

    return net, train, predict
def initialize(self):
    """Build the symbolic graphs and compile train / predict / validation functions.

    Reads ``self.network``, ``self.input``, ``self.target`` and
    ``self.learning_rate``; stores the symbolic expressions and the compiled
    functions (``train_fn``, ``predict_values``, ``val_fn``) on ``self``.
    """
    # Training pass: stochastic layers are active.
    self.prediction = lasagne.layers.get_output(self.network)
    loss = lasagne.objectives.categorical_crossentropy(self.prediction,
                                                       self.target)
    self.loss = loss.mean()

    self.params = lasagne.layers.get_all_params(self.network, trainable=True)
    self.updates = lasagne.updates.nesterov_momentum(
        self.loss, self.params,
        learning_rate=self.learning_rate, momentum=0.9)

    # Returns the un-averaged cross-entropy; updates come from its mean.
    self.train_fn = theano.function([self.input, self.target], loss,
                                    updates=self.updates,
                                    allow_input_downcast=True)

    # Deterministic pass, shared by prediction and validation.
    self.test_prediction = lasagne.layers.get_output(self.network,
                                                     deterministic=True)

    # Class predictions come from the deterministic output so that layers
    # such as dropout cannot randomize them; the original compiled this from
    # the training-mode output.
    self.predict_values = theano.function(
        [self.input], T.argmax(self.test_prediction, axis=1),
        allow_input_downcast=True)

    self.test_loss = lasagne.objectives.categorical_crossentropy(
        self.test_prediction, self.target)
    l1_penalty = regularize_network_params(self.network,
                                           lasagne.regularization.l1)
    l2_penalty = regularize_network_params(self.network,
                                           lasagne.regularization.l2)
    # The penalties appear only in the reported validation loss.
    self.test_loss = self.test_loss.mean() + (l1_penalty * 1e-4) + l2_penalty

    self.test_acc = T.mean(
        T.eq(T.argmax(self.test_prediction, axis=1), self.target),
        dtype=theano.config.floatX)
    self.val_fn = theano.function([self.input, self.target],
                                  [self.test_loss, self.test_acc],
                                  allow_input_downcast=True)
def build_network(args, network):
    """Compile weighted-classification Theano functions for ``network``.

    Parameters
    ----------
    args : mapping
        Must provide ``'weight_decay'`` and ``'learning_rate'``.
    network : Lasagne layer
        Output layer of the classifier.

    Returns
    -------
    tuple of dict
        ``{"net": network}`` and a dict of compiled functions:
        ``'tr'`` (train step, returns the regularized weighted loss),
        ``'val'`` / ``'test'`` (the same function: unregularized weighted
        loss), ``'acc'`` (weighted accuracy), ``'out'`` (deterministic
        predictions) and ``"score"`` (column 1 of those predictions).
    """
    X = T.tensor4('X')
    Y = T.ivector('Y')
    W = T.dvector('W')  # per-example weights ("physics weights")

    # Training objective: weight-summed cross-entropy plus an L2 penalty.
    train_out = get_output(network, X)
    per_example = categorical_crossentropy(train_out, Y)
    loss = T.dot(per_example.T, W)
    loss = loss + args['weight_decay'] * regularize_network_params(network, l2)

    # Evaluation objective: deterministic pass, no regularization.
    test_prediction = get_output(network, X, deterministic=True)
    test_loss = T.dot(categorical_crossentropy(test_prediction, Y).T, W)

    # Accuracy as a weighted average, normalized by the total weight.
    hits = categorical_accuracy(test_prediction, Y)
    test_acc = T.dot(hits.T, W) / T.sum(W)

    params = get_all_params(network, trainable=True)
    updates = adam(loss, learning_rate=args['learning_rate'], params=params)

    labelled_inputs = [X, Y, W]
    train_fn = theano.function(labelled_inputs, loss, updates=updates)
    val_fn = theano.function(labelled_inputs, test_loss)
    acc_fn = theano.function(labelled_inputs, test_acc)
    out_fn = theano.function([X], test_prediction)
    score_fn = theano.function([X], test_prediction[:, 1].T)

    functions = {'tr': train_fn, 'val': val_fn, 'test': val_fn,
                 'acc': acc_fn, 'out': out_fn, "score": score_fn}
    return {"net": network}, functions
def build_instrument_model(self, n_vars, **kwargs):
    """Build the instrument regression network and compile its functions.

    Parameters
    ----------
    n_vars : int
        Number of instrument variables (width of the input layer).
    **kwargs
        Must provide ``'dense_size'`` (units per hidden layer) and
        ``'n_dense_layers'`` (number of hidden layers).

    Returns
    -------
    list
        Initial parameter values of the network, as returned by
        ``layers.get_all_param_values``.

    Side effects: sets ``self.instrument_output``,
    ``self._instrument_train_fn``, ``self._instrument_loss_fn`` and
    ``self._instrument_output_fn``.
    """
    targets = TT.vector()
    instrument_vars = TT.matrix()

    hidden_size = kwargs['dense_size']

    # Input -> dropout -> first tanh layer -> dropout.
    net = layers.InputLayer((None, n_vars), instrument_vars)
    net = layers.DropoutLayer(net, p=0.2)
    net = layers.DenseLayer(net, hidden_size,
                            nonlinearity=nonlinearities.tanh)
    net = layers.DropoutLayer(net, p=0.2)

    # Remaining hidden layers, each followed by heavier dropout.
    for _ in xrange(kwargs['n_dense_layers'] - 1):
        net = layers.DenseLayer(net, hidden_size,
                                nonlinearity=nonlinearities.tanh)
        net = layers.DropoutLayer(net, p=0.5)

    # Single linear output unit.
    self.instrument_output = layers.DenseLayer(
        net, 1, nonlinearity=nonlinearities.linear)
    init_params = layers.get_all_param_values(self.instrument_output)

    prediction = layers.get_output(self.instrument_output,
                                   deterministic=False)
    test_prediction = layers.get_output(self.instrument_output,
                                        deterministic=True)

    l2_cost = regularization.regularize_network_params(
        self.instrument_output, regularization.l2)

    # Mean squared error of the stochastic pass; the training loss adds a
    # small L2 penalty on top of it.
    # NOTE(review): the reported loss below also uses the stochastic pass,
    # not `test_prediction` -- confirm this is intended.
    mse = objectives.squared_error(prediction.flatten(),
                                   targets.flatten()).mean()
    loss = mse + 1e-4 * l2_cost
    loss_total = mse

    params = layers.get_all_params(self.instrument_output, trainable=True)
    param_updates = updates.adadelta(loss, params)

    fn_inputs = [targets, instrument_vars]
    self._instrument_train_fn = theano.function(fn_inputs, loss,
                                                updates=param_updates)
    self._instrument_loss_fn = theano.function(fn_inputs, loss_total)
    self._instrument_output_fn = theano.function([instrument_vars],
                                                 test_prediction)
    return init_params
def build(layer_heads, params):
    """Compile per-target Theano functions for a multi-head network.

    Parameters
    ----------
    layer_heads : mapping
        Maps each target name to that head's output layer.
    params : mapping
        Reads ``'targets'`` (iterable of dicts with a ``'name'`` key),
        ``'beta'``, ``'learning_rate'``, ``'epsilon'`` and, optionally,
        ``'class_weight'``.

    Returns
    -------
    tuple
        ``(fns, layer_heads)`` where ``fns[name]`` holds the compiled
        ``'train'``, ``'predict'``, ``'cost'``, ``'acc'`` and ``'transform'``
        functions of that head.
    """
    fns = {}
    x = T.tensor4('input')  # shared by every head

    for target in params['targets']:
        name = target['name']
        head = {}
        fns[name] = head
        out_layer = layer_heads[name]

        y = T.matrix('target')
        out_train = L.get_output(out_layer, inputs=x)
        out_valid = L.get_output(out_layer, inputs=x, deterministic=True)

        # Class-weighted cross-entropy when weights are configured.
        if params.get('class_weight'):
            loss_fn = partial(weighted_cce, weights=params['class_weight'])
        else:
            loss_fn = obj.categorical_crossentropy

        loss = loss_fn(out_train, y).mean()
        loss_vl = loss_fn(out_valid, y).mean()

        weight_decay = reg.regularize_network_params(out_layer, reg.l2)
        weight_decay = weight_decay * params['beta']

        acc_vl = obj.categorical_accuracy(out_valid, y).mean()

        head_updates = updates.adam(
            loss + weight_decay,
            L.get_all_params(out_layer, trainable=True),
            learning_rate=params['learning_rate'],
            epsilon=params['epsilon'])

        # The train function has no outputs; it only applies the updates.
        head['train'] = theano.function(
            [x, y], updates=head_updates, allow_input_downcast=True)
        head['predict'] = theano.function(
            [x], out_valid, allow_input_downcast=True)
        head['cost'] = theano.function(
            [x, y], loss_vl, allow_input_downcast=True)
        head['acc'] = theano.function(
            [x, y], acc_vl, allow_input_downcast=True)

        # Deterministic output of the second-to-last entry of the head's
        # layer list.
        feature_layer = L.get_all_layers(layer_heads[name])[-2]
        head['transform'] = theano.function(
            [x],
            L.get_output(feature_layer, inputs=x, deterministic=True),
            allow_input_downcast=True)

    return fns, layer_heads
def build_network(args, network):
    """Build the weighted-classification graphs and compile their functions.

    Parameters
    ----------
    args : mapping
        Reads ``'weight_decay'`` and ``'learning_rate'``.
    network : Lasagne layer
        Output layer of the classifier.

    Returns
    -------
    tuple of dict
        First ``{"net": network}``, then the compiled functions keyed by
        ``'tr'``, ``'val'``, ``'test'`` (same object as ``'val'``),
        ``'acc'``, ``'out'`` and ``"score"``.
    """
    X = T.tensor4('X')
    Y = T.ivector('Y')
    # Per-example weights ("physics weights"), one per row of the batch.
    W = T.dvector('W')

    def weighted_sum(values):
        # Dot product of a per-example vector with the weights.
        return T.dot(values.T, W)

    # Regularized, weighted cross-entropy of the training pass.
    prediction = get_output(network, X)
    loss = weighted_sum(categorical_crossentropy(prediction, Y))
    weightsl2 = regularize_network_params(network, l2)
    loss = loss + args['weight_decay'] * weightsl2

    # Unregularized, weighted cross-entropy of the deterministic pass.
    test_prediction = get_output(network, X, deterministic=True)
    test_loss = weighted_sum(categorical_crossentropy(test_prediction, Y))

    # Weighted accuracy, normalized by the sum of the weights.
    test_acc = weighted_sum(categorical_accuracy(test_prediction, Y))
    test_acc = test_acc / T.sum(W)

    params = get_all_params(network, trainable=True)
    updates = adam(loss, learning_rate=args['learning_rate'], params=params)

    # Train step returns the training loss and applies the Adam updates.
    train_fn = theano.function([X, Y, W], loss, updates=updates)
    # Evaluation functions share the deterministic graph.
    val_fn = theano.function([X, Y, W], test_loss)
    acc_fn = theano.function([X, Y, W], test_acc)
    out_fn = theano.function([X], test_prediction)
    score_fn = theano.function([X], test_prediction[:, 1].T)

    compiled = {}
    compiled['tr'] = train_fn
    compiled['val'] = val_fn
    compiled['test'] = val_fn
    compiled['acc'] = acc_fn
    compiled['out'] = out_fn
    compiled["score"] = score_fn
    return {"net": network}, compiled
def build_network(args, network):
    """Compile Theano functions for an autoencoder that reconstructs its input.

    Parameters
    ----------
    args : mapping
        Reads ``'weight_decay'`` and ``'learning_rate'``.
    network : Lasagne layer
        Output layer of the autoencoder.

    Returns
    -------
    tuple of dict
        ``{"net": network}`` and the compiled functions: ``'tr'`` (train
        step, returns mean squared error plus L2 penalty), ``'val'`` and
        ``'acc'`` (both the summed squared reconstruction error of the
        deterministic pass), ``'out'`` (deterministic reconstruction) and
        ``"score"`` (one minus the per-example mean squared error, with the
        error capped at 1).
    """
    X = T.tensor4('X')
    thresh = 1.0

    # Training objective: mean reconstruction error plus L2 weight decay.
    prediction = get_output(network, X)
    loss = squared_error(prediction, X).mean()
    weightsl2 = regularize_network_params(network, l2).sum()
    loss += args['weight_decay'] * weightsl2

    # Evaluation: summed reconstruction error of the deterministic pass.
    test_prediction = get_output(network, X, deterministic=True)
    test_loss = squared_error(test_prediction, X).sum()

    # "Accuracy" is just the reconstruction error for this model.
    test_acc = test_loss

    # Per-example mean squared error over all non-batch axes.
    test_score = T.sum(squared_error(test_prediction, X), axis=(1, 2, 3))
    with T.autocast_float_as("float64"):
        test_score = test_score / (T.prod(X.shape[1:]))

    # Cap errors above the threshold at 1 so the final score stays >= 0.
    # The original indexed `test_score` with the 0/1 comparison result before
    # calling nonzero(), which selects elements 0 and 1 instead of the
    # positions that exceed the threshold.
    inds = (test_score > thresh).nonzero()
    test_score = T.set_subtensor(test_score[inds], 1)
    test_score = 1 - test_score

    params = get_all_params(network, trainable=True)
    updates = adam(loss, learning_rate=args['learning_rate'], params=params)

    train_fn = theano.function([X], loss, updates=updates)
    val_fn = theano.function([X], test_loss)
    acc_fn = theano.function([X], test_acc)
    out_fn = theano.function([X], test_prediction)
    score_fn = theano.function([X], test_score)

    return {"net": network}, {'tr': train_fn,
                              'val': val_fn,
                              'acc': acc_fn,
                              'out': out_fn,
                              "score": score_fn}
def objective(layers, loss_function, target, aggregate=aggregate,
              deterministic=False, get_output_kw=None):
    """Return the aggregated loss of a network, regularized during training.

    Parameters
    ----------
    layers : sequence
        Network layers; ``layers[-1]`` is the output layer and ``layers[1]``
        the layer that receives the L1 penalty.
    loss_function : callable
        Called as ``loss_function(network_output, target)``.
    target : symbolic variable
        Target values handed to ``loss_function``.
    aggregate : callable
        Reduces the element-wise losses to the returned value.
    deterministic : bool
        Forwarded to ``get_output``; when true, no penalties are added.
    get_output_kw : dict or None
        Extra keyword arguments for ``lasagne.layers.get_output``.

    The penalty coefficients are the names ``l2`` and ``l1`` resolved from
    the enclosing scope.
    """
    extra_kwargs = {} if get_output_kw is None else get_output_kw

    output_layer = layers[-1]
    first_layer = layers[1]

    network_output = lasagne.layers.get_output(
        output_layer, deterministic=deterministic, **extra_kwargs)

    losses = loss_function(network_output, target)
    if deterministic:
        return aggregate(losses)

    # Training mode: L2 over the whole network, L1 over the first layer only.
    losses = losses + l2 * regularization.regularize_network_params(
        output_layer, regularization.l2)
    losses = losses + l1 * regularization.regularize_layer_params(
        first_layer, regularization.l1)
    return aggregate(losses)
def __build_loss_train__fn__(self):
    """Compile the train, predict and validation functions for ``self.net``.

    Sets ``self.eta`` (shared learning rate, initially 0.05),
    ``self.__train_fn__``, ``self.__predict_fn__`` and ``self.__val_fn__``.
    Reads ``self.net``, ``self.__input_var__`` and ``self.__target_var__``.
    """
    l2_term = 1e-4 * regularization.regularize_network_params(
        self.net, regularization.l2)

    # Training loss: cross-entropy of the stochastic pass plus L2 penalty.
    prediction = layers.get_output(self.net)
    loss = objectives.categorical_crossentropy(prediction,
                                               self.__target_var__)
    loss = loss.mean() + l2_term

    # Validation metrics come from the deterministic pass so that stochastic
    # layers such as dropout do not distort them; the original reported the
    # training-mode loss and accuracy here.
    test_prediction = layers.get_output(self.net, deterministic=True)
    val_loss = objectives.categorical_crossentropy(test_prediction,
                                                   self.__target_var__)
    val_loss = val_loss.mean() + l2_term
    val_acc = T.mean(
        T.eq(T.argmax(test_prediction, axis=1), self.__target_var__),
        dtype=theano.config.floatX)

    # Parameter updates: Nesterov momentum with a shared learning rate that
    # can be changed after compilation.
    params = layers.get_all_params(self.net, trainable=True)
    self.eta = theano.shared(sp.array(sp.float32(0.05), dtype=sp.float32))
    update_rule = updates.nesterov_momentum(loss, params,
                                            learning_rate=self.eta,
                                            momentum=0.9)

    self.__train_fn__ = theano.function(
        [self.__input_var__, self.__target_var__], loss,
        updates=update_rule)
    self.__predict_fn__ = theano.function([self.__input_var__],
                                          test_prediction)
    self.__val_fn__ = theano.function(
        [self.__input_var__, self.__target_var__], [val_loss, val_acc])
def similarity_iter(output_layer, match_layer, update_params, match_layer_w=0):
    """Compile train / validation functions for a two-input similarity network.

    Parameters
    ----------
    output_layer : Lasagne layer
        Layer producing the paired descriptors; indexed as ``x[:, 0, :]`` and
        ``x[:, 1, :]`` for the two images.
    match_layer : Lasagne layer
        Layer producing the match probability. Its layer graph must contain
        input layers named ``'input1'`` and ``'input2'``.
    update_params : dict
        Keyword arguments forwarded to ``nesterov_momentum``.
    match_layer_w : number
        Weight of the matching (binary cross-entropy) term; the contrastive
        term gets ``1 - match_layer_w``.

    Returns
    -------
    dict
        ``'train'``: function of (X1, X2, y) applying the updates and
        returning ``[regularized loss, loss] + gradients``.
        ``'valid'``: function of the same inputs returning the deterministic
        loss. ``'gradnames'``: names of the gradient variables.

    Raises
    ------
    IndexError
        If no layer named ``'input1'`` or ``'input2'`` is found.
    """
    X1 = T.tensor4()
    X2 = T.tensor4()
    y = T.ivector()

    all_layers = ll.get_all_layers(match_layer)
    imwrite_architecture(all_layers, './layer_rep.png')

    # Locate the two input layers by name. A list comprehension is used
    # because `filter(...)[0]` only works where filter returns a list.
    input_1 = [layer for layer in all_layers if layer.name == 'input1'][0]
    input_2 = [layer for layer in all_layers if layer.name == 'input2'][0]

    feed = {input_1: X1, input_2: X2}
    descriptors_train, match_prob_train = ll.get_output(
        [output_layer, match_layer], feed)
    descriptors_eval, match_prob_eval = ll.get_output(
        [output_layer, match_layer], feed, deterministic=True)

    def distance(x):
        # L2 distance between the paired descriptors; the small offset keeps
        # the norm away from exactly zero.
        return (x[:, 0, :] - x[:, 1, :] + 1e-7).norm(2, axis=1)

    margin = 1
    decay = 0  # a zero decay disables the L2 penalty
    reg = regularize_network_params(match_layer, l2) * decay

    def loss(x, z):
        contrastive = T.mean(y * (distance(x))
                             + (1 - y) * (T.maximum(0, margin - distance(x))))
        matching = T.mean(binary_crossentropy(z.T + 1e-7, y))
        return ((1 - match_layer_w) * contrastive / 2
                + match_layer_w * matching)

    def loss_reg(x, z):
        return loss(x, z) + reg

    # Only trainable parameters are optimized; without the tag filter,
    # non-trainable parameters would receive momentum updates as well.
    all_params = ll.get_all_params(match_layer, trainable=True)

    loss_train = loss_reg(descriptors_train, match_prob_train)
    loss_train.name = 'combined_loss'  # prefix used when naming the gradients

    grads = T.grad(loss_train, all_params, add_names=True)
    updates = nesterov_momentum(grads, all_params, **update_params)

    train_iter = theano.function(
        [X1, X2, y],
        [loss_train, loss(descriptors_train, match_prob_train)] + grads,
        updates=updates)
    valid_iter = theano.function(
        [X1, X2, y], loss(descriptors_eval, match_prob_eval))

    return {'train': train_iter,
            'valid': valid_iter,
            'gradnames': [g.name for g in grads]}
def get_train_fn(self, last_only=False):
    """Compile a training step for the classifier ending at ``self.output_layer``.

    Parameters
    ----------
    last_only : bool
        When true, only the output layer's own trainable parameters are
        updated; otherwise all trainable parameters of the network are.

    Returns
    -------
    theano function
        Takes (inputs, integer targets), applies Nesterov-momentum updates
        and returns ``(loss, error)``: the mean cross-entropy without the
        penalty, and the misclassification rate.
    """
    input_var = self.net['input'].input_var
    target_var = T.ivector('targets')

    prediction = lasagne.layers.get_output(self.output_layer)
    loss = categorical_crossentropy(prediction, target_var).mean()

    # Fraction of rows whose arg-max class differs from the target.
    predicted_class = T.argmax(prediction, axis=1)
    error = T.mean(T.neq(predicted_class, target_var),
                   dtype=theano.config.floatX)

    # The L2 penalty is optimized but not part of the reported loss.
    penalty = self.regularizer_amount * regularize_network_params(
        self.output_layer, l2)

    all_params = (
        self.output_layer.get_params(trainable=True)
        if last_only
        else lasagne.layers.get_all_params(self.output_layer, trainable=True)
    )

    updates = nesterov_momentum(loss + penalty, all_params,
                                learning_rate=self.lr)
    return theano.function([input_var, target_var], (loss, error),
                           updates=updates)
def __init__(self, lambda1 = 1e-5, lambda2 = 1e-6):
    """Build the ARE network, its loss and its training function.

    Parameters
    ----------
    lambda1 : float
        Weight of the (subtracted) trace of the encoded-feature Gram matrix.
    lambda2 : float
        Weight of the L2 penalty over the network parameters.
    """
    self.input_var = T.tensor4('inputs')
    self.target_var = T.matrix('targets')

    # Network and the symbolic outputs of its named layers.
    self.are_net = build_ARE(self.input_var, ENCODE_SIZE)
    self.reconstructed = lasagne.layers.get_output(self.are_net)
    self.encode_layer, _ = get_layer_by_name(self.are_net, 'encode')
    self.action_layer, _ = get_layer_by_name(self.are_net, 'action')
    self.encoded_feature = lasagne.layers.get_output(self.encode_layer)
    self.transformed_feature = lasagne.layers.get_output(self.action_layer)

    # Gram matrix of the encoded features and the L2 penalty.
    encoded = self.encoded_feature
    self.XXT = T.dot(encoded, encoded.transpose())
    self.l2_penalty = regularize_network_params(self.are_net, l2)

    # Loss: scaled reconstruction error, minus the trace term, plus L2.
    reconstruction_error = lasagne.objectives.squared_error(
        self.reconstructed, self.target_var)
    self.loss = (1000 * reconstruction_error.mean()
                 - lambda1 * self.XXT.trace()
                 + lambda2 * self.l2_penalty)

    self.params = lasagne.layers.get_all_params(self.are_net, trainable=True)
    self.updates = lasagne.updates.adadelta(self.loss, self.params)
    self.train_fn = theano.function(
        [self.input_var, self.target_var], self.loss,
        updates=self.updates, on_unused_input='warn')

    self.best_err = 999

    # Both action transforms start out as the identity: unit weights and
    # zero biases.
    self.action1_w = np.eye(ENCODE_SIZE, dtype = np.float32)
    self.action1_b = np.zeros(ENCODE_SIZE, dtype = np.float32)
    self.action2_w = np.eye(ENCODE_SIZE, dtype = np.float32)
    self.action2_b = np.zeros(ENCODE_SIZE, dtype = np.float32)
def make_training_functions(network_layers, input_var, target_var,
                            stack_params, weight_decay):
    """Compile validation and training functions for a stacked network.

    Parameters
    ----------
    network_layers : sequence of four layers
        ``(encode_layer, hidden_layer, smth_act_layer, network)``.
    input_var, target_var : symbolic variables
        Inputs of every compiled function.
    stack_params : list
        Parameters updated by the stack-only training function.
    weight_decay : float
        Weight of the L2 penalty over regularizable parameters.

    Returns
    -------
    tuple
        ``(val_fn, train_fn, stack_train_fn)``. ``val_fn`` returns
        ``[loss, encode, hidden, smth_act, output]``; the training functions
        return the loss and update all trainable parameters or only
        ``stack_params`` respectively.
    """
    encode_layer, hidden_layer, smth_act_layer, network = network_layers

    # Every output, including the one used for training, is taken from the
    # deterministic pass.
    output = lasagne.layers.get_output(network, deterministic=True)

    reconstruction = lasagne.objectives.squared_error(output, target_var)
    penalty = regularization.regularize_network_params(
        layer=network,
        penalty=regularization.l2,
        tags={'regularizable': True})
    loss = reconstruction.mean() + weight_decay * penalty

    params = layers.get_all_params(network, trainable=True)

    # Same optimizer settings for the full and the stack-only update.
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.00001, momentum=0.95)
    stack_updates = lasagne.updates.nesterov_momentum(
        loss, stack_params, learning_rate=0.00001, momentum=0.95)

    encode = lasagne.layers.get_output(encode_layer, deterministic=True)
    hidden = lasagne.layers.get_output(hidden_layer, deterministic=True)
    smth_act = lasagne.layers.get_output(smth_act_layer, deterministic=True)

    fn_inputs = [input_var, target_var]
    val_fn = theano.function(fn_inputs,
                             [loss, encode, hidden, smth_act, output])
    train_fn = theano.function(fn_inputs, loss, updates=updates)
    stack_train_fn = theano.function(fn_inputs, loss, updates=stack_updates)

    return val_fn, train_fn, stack_train_fn
def loss_iter(segmenter, update_params={}):
    """Compile training and validation functions for a single-output segmenter.

    Parameters
    ----------
    segmenter : Lasagne layer
        Output layer of the segmentation network.
    update_params : dict
        Keyword arguments forwarded to ``adam``.

    Returns
    -------
    dict
        ``'train'``: function of (X, y, pixel_weights) applying Adam updates
        and returning ``[regularized loss, loss, accuracy] + gradients``.
        ``'valid'``: function of the same inputs returning
        ``[loss, accuracy]`` from the deterministic pass.
        ``'gradnames'``: names of the gradient variables, in order.
    """
    X = T.tensor4()
    y = T.tensor4()
    pixel_weights = T.tensor3()

    all_layers = ll.get_all_layers(segmenter)
    imwrite_architecture(all_layers, './layer_rep.png')

    predicted_mask_train = ll.get_output(segmenter, X)
    predicted_mask_valid = ll.get_output(segmenter, X, deterministic=True)

    def accuracy(pred):
        # Agreement of the arg-max over axis 1 between prediction and target.
        return T.mean(T.eq(T.argmax(pred, axis=1), T.argmax(y, axis=1)))

    pixel_weights_1d = pixel_weights.flatten(ndim=1)

    def losses(pred):
        # Weighted per-pixel cross-entropy; the offsets avoid exact zeros.
        return T.mean(crossentropy_flat(pred + 1e-7, y + 1e-7)
                      * pixel_weights_1d)

    decay = 0.0001
    reg = regularize_network_params(segmenter, l2) * decay

    def losses_reg(pred):
        return losses(pred) + reg

    loss_train = losses_reg(predicted_mask_train)
    loss_train.name = 'combined_loss'  # prefix used when naming the gradients

    # Only trainable parameters are optimized; without the tag filter,
    # non-trainable parameters would receive Adam updates as well.
    all_params = ll.get_all_params(segmenter, trainable=True)
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params, **update_params)

    acc_train = accuracy(predicted_mask_train)
    acc_valid = accuracy(predicted_mask_valid)

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function(
        [X, y, pixel_weights],
        [loss_train, losses(predicted_mask_train), acc_train] + grads,
        updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function(
        [X, y, pixel_weights],
        [losses(predicted_mask_valid), acc_valid])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train': train_iter,
            'valid': valid_iter,
            'gradnames': [g.name for g in grads]}
def _create_network(self):
    """Build the network graph and compile its train and predict functions.

    Returns
    -------
    tuple
        ``(net, train, predict)``. ``train`` takes (inputs, target values),
        applies Adadelta updates and returns ``[cost, new_target_values,
        network_output, per-row mean error, index of each row's largest
        target]``. ``predict`` maps inputs to the network output.
    """
    print("Building network ...")
    net, input_var = self._build_network()

    target_values = T.matrix('target_output')
    # Column holding the largest target value in each row.
    maxQ_idx = target_values.argmax(1)

    # Two complementary masks over (BATCH_SIZE, actionsNum):
    # `keep_output` is 1 everywhere except at the arg-max column, and
    # `keep_target` is 1 only at the arg-max column.
    mask_shape = (BATCH_SIZE, self.actionsNum)
    rows = np.arange(BATCH_SIZE)
    ones_mask = theano.shared(np.ones(mask_shape).astype(np.int32))
    zeros_mask = theano.shared(np.zeros(mask_shape).astype(np.int32))
    keep_output = T.set_subtensor(ones_mask[rows, maxQ_idx], 0)
    keep_target = T.set_subtensor(zeros_mask[rows, maxQ_idx], 1)

    network_output = lasagne.layers.get_output(net)

    # The target is used only at the arg-max column; everywhere else the
    # network's own output is substituted.
    new_target_values = (target_values * keep_target
                         + network_output * keep_output)
    err = squared_error(network_output, new_target_values)

    # Mean squared error plus the L2 penalty scaled by DECAY.
    cost = err.mean() + regularize_network_params(net, l2) * DECAY

    all_params = lasagne.layers.get_all_params(net, trainable=True)
    updates = lasagne.updates.adadelta(cost, all_params)

    print("Compiling functions ...")
    train_outputs = [cost, new_target_values, network_output,
                     err.mean(1), maxQ_idx]
    train = theano.function([input_var, target_values], train_outputs,
                            updates=updates)
    predict = theano.function([input_var], lasagne.layers.get_output(net))

    return net, train, predict
def __init__(self, lambda1 = 0, lambda2 = 0):
    """Build the ARE network, its loss and its training function.

    Parameters
    ----------
    lambda1 : float
        Weight of the L1 penalty over the network parameters.
    lambda2 : float
        Weight of the trace of the summed Gram matrices of the encoded and
        the transformed features.
    """
    self.input_var = T.tensor4('inputs')
    self.target_var = T.matrix('targets')

    # Network and the symbolic outputs of its named layers.
    self.are_net = build_ARE(self.input_var, ENCODE_SIZE)
    self.reconstructed = lasagne.layers.get_output(self.are_net)
    self.encode_layer, _ = get_layer_by_name(self.are_net, 'encode')
    self.action_layer, _ = get_layer_by_name(self.are_net, 'action')
    self.encoded_feature = lasagne.layers.get_output(self.encode_layer)
    self.transformed_feature = lasagne.layers.get_output(self.action_layer)

    self.l1_penalty = regularize_network_params(self.are_net, l1)
    self.loss = lasagne.objectives.squared_error(self.reconstructed,
                                                 self.target_var)
    self.XXT = (T.dot(self.encoded_feature,
                      self.encoded_feature.transpose())
                + T.dot(self.transformed_feature,
                        self.transformed_feature.transpose()))

    # Each penalty enters the loss exactly once. The original followed this
    # with a second assignment that added `lambda1 * l1_penalty` again on top
    # of the already complete loss, doubling the L1 weight.
    self.loss = (self.loss.mean()
                 + lambda1 * self.l1_penalty
                 + lambda2 * self.XXT.trace())

    self.params = lasagne.layers.get_all_params(self.are_net, trainable=True)
    # Shared learning rate, so it can be changed after compilation.
    self.l_r = theano.shared(np.array(0.01, dtype=theano.config.floatX))
    self.updates = lasagne.updates.nesterov_momentum(
        self.loss, self.params, learning_rate=self.l_r, momentum=0.90)
    self.train_fn = theano.function(
        [self.input_var, self.target_var], self.loss,
        updates=self.updates, on_unused_input='warn')

    self.best_err = 999

    # Both action transforms start out as the identity: unit weights and
    # zero biases.
    self.action1_w = np.eye(ENCODE_SIZE, dtype = np.float32)
    self.action1_b = np.zeros(ENCODE_SIZE, dtype = np.float32)
    self.action2_w = np.eye(ENCODE_SIZE, dtype = np.float32)
    self.action2_b = np.zeros(ENCODE_SIZE, dtype = np.float32)
def define_updates(network, input_var, target_var, weight_var):
    """Compile the training and validation functions for ``network``.

    Parameters
    ----------
    network : Lasagne layer
        Output layer of the network.
    input_var, target_var, weight_var : symbolic variables
        Inputs of both compiled functions.

    Returns
    -------
    tuple
        ``(train_fn, val_fn, l_r)``. Both functions return
        ``[loss, l2_loss, acc, dice_score, target_prediction, prediction,
        prediction_binary]``; ``train_fn`` also applies the optimizer
        updates. ``l_r`` is the shared learning-rate variable.

    Raises
    ------
    ValueError
        If ``P.OPTIMIZATION`` is neither ``'nesterov'`` nor ``'adam'``.
    """
    params = lasagne.layers.get_all_params(network, trainable=True)

    out = lasagne.layers.get_output(network)
    test_out = lasagne.layers.get_output(network, deterministic=True)

    l2_loss = P.L2_LAMBDA * regularize_network_params(network, l2)

    train_metrics = score_metrics(out, target_var, weight_var, l2_loss)
    (loss, acc, dice_score, target_prediction, prediction,
     prediction_binary) = train_metrics

    # Validation metrics must come from the deterministic pass. The original
    # unpacked `train_metrics` here, so the validation function silently
    # reported training-mode values.
    val_metrics = score_metrics(test_out, target_var, weight_var, l2_loss)
    (t_loss, t_acc, t_dice_score, t_target_prediction, t_prediction,
     t_prediction_binary) = val_metrics

    # Shared so the learning rate can be changed after compilation.
    l_r = theano.shared(np.array(P.LEARNING_RATE, dtype=theano.config.floatX))

    if P.OPTIMIZATION == 'nesterov':
        updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=l_r, momentum=P.MOMENTUM)
    elif P.OPTIMIZATION == 'adam':
        updates = lasagne.updates.adam(loss, params, learning_rate=l_r)
    else:
        # Fail here with a clear message instead of hitting an undefined
        # `updates` further down.
        raise ValueError(
            "Unsupported P.OPTIMIZATION: %r" % (P.OPTIMIZATION,))

    logging.info("Defining train function")
    train_fn = theano.function(
        [input_var, target_var, weight_var],
        [loss, l2_loss, acc, dice_score, target_prediction, prediction,
         prediction_binary],
        updates=updates)

    logging.info("Defining validation function")
    val_fn = theano.function(
        [input_var, target_var, weight_var],
        [t_loss, l2_loss, t_acc, t_dice_score, t_target_prediction,
         t_prediction, t_prediction_binary])

    return train_fn, val_fn, l_r
def _create_network(self):
    """Assemble the loss for the network and compile train / predict functions.

    Returns
    -------
    tuple
        ``(net, train, predict)``: the layer from ``self._build_network()``,
        a training function of (inputs, target values, actions) returning
        ``[loss, network_output, target_values, mask]`` while applying the
        optimizer updates, and a prediction function of the inputs.
    """
    logger.info("Building network ...")
    net, input_var = self._build_network()

    target_values = T.matrix('target_output')
    actions = T.icol('actions')

    # Start from zeros shaped like the targets and set a 1 at each row's
    # selected action.
    empty_mask = T.zeros_like(target_values)
    selected = empty_mask[T.arange(self.batch_size), actions.reshape((-1,))]
    mask = T.set_subtensor(selected, 1)

    # The network sees its inputs divided by 255.
    scaled_input = input_var / 255.0
    network_output = lasagne.layers.get_output(net, scaled_input)

    # Error is measured on the masked output only.
    masked_output = network_output * mask
    loss = squared_error(masked_output, target_values).mean()

    # Optional L2 weight decay.
    if self.weight_decay > 0.0:
        l2_penalty = regularize_network_params(net, l2)
        loss = loss + l2_penalty * self.weight_decay

    all_params = lasagne.layers.get_all_params(net, trainable=True)

    if not self.clip_error:
        # Let the optimizer differentiate the loss itself.
        updates = self.optimizer(loss, all_params,
                                 learning_rate=self.learning_rate,
                                 rho=self.decay_rate)
    else:
        # Constrain the norm of every gradient before handing them over.
        constrained = []
        for grad in theano.gradient.grad(loss, all_params):
            constrained.append(
                lasagne.updates.norm_constraint(grad, self.clip_error,
                                                range(grad.ndim)))
        updates = self.optimizer(constrained, all_params,
                                 learning_rate=self.learning_rate,
                                 rho=self.decay_rate)

    logger.info("Compiling functions ...")
    train = theano.function([input_var, target_values, actions],
                            [loss, network_output, target_values, mask],
                            updates=updates)
    predict = theano.function([input_var], network_output)

    return net, train, predict
def train(dataset, learn_step=0.005, weight_decay=1e-4, num_epochs=500,
          max_patience=100, data_augmentation=None, savepath=None,
          loadpath=None, early_stop_class=None, batch_size=None,
          resume=False, train_from_0_255=False):
    """Train a U-Net on the ISBI EM stacks and track the best validation Jaccard.

    Args:
        dataset: Name used as a sub-directory of ``savepath``.
        learn_step: Learning rate passed to ``lasagne.updates.adam``.
        weight_decay: L2 penalty coefficient; the penalty is skipped when
            this is not positive.
        num_epochs: Maximum number of epochs.
        max_patience: Number of consecutive non-improving epochs after which
            training stops.
        data_augmentation: Keyword arguments forwarded to the training
            iterator as ``data_augm_kwargs``. ``None`` means no augmentation.
        savepath: Base directory for outputs (required).
        loadpath: Optional directory the output folder is copied to when
            training ends.
        early_stop_class: Class index whose Jaccard drives early stopping;
            ``None`` uses the mean over classes.
        batch_size: Sequence whose first two entries are the train and
            validation batch sizes; defaults to ``[10, 1, 1]``.
        resume: Not used by this function body.
        train_from_0_255: Not used by this function body.

    Raises:
        ValueError: if ``savepath`` is ``None``.
    """
    if data_augmentation is None:
        data_augmentation = {}

    #
    # Prepare load/save directories
    #
    # Parenthesized so the 'unet_' prefix is kept when there is no
    # augmentation (the unparenthesized form produced an empty name).
    exp_name = 'unet_' + ('data_aug' if data_augmentation else '')

    if savepath is None:
        raise ValueError('A saving directory must be specified')

    savepath = os.path.join(savepath, dataset, exp_name)
    print(savepath)

    if not os.path.exists(savepath):
        os.makedirs(savepath)
    else:
        print('\033[93m The following folder already exists {}. '
              'It will be overwritten in a few seconds...\033[0m'.format(
                  savepath))

    print('Saving directory : ' + savepath)
    # Dump the call configuration (every local defined so far).
    with open(os.path.join(savepath, "config.txt"), "w") as f:
        for key, value in locals().items():
            f.write('{} = {}\n'.format(key, value))

    #
    # Define symbolic variables
    #
    input_var = T.tensor4('input_var')
    target_var = T.ivector('target_var')

    #
    # Build dataset iterator
    #
    # Use the resolved sizes so the default ``batch_size=None`` works.
    bs = batch_size if batch_size is not None else [10, 1, 1]

    train_iter = IsbiEmStacksDataset(which_set='train',
                                     batch_size=bs[0],
                                     seq_per_subset=0,
                                     seq_length=0,
                                     data_augm_kwargs=data_augmentation,
                                     return_one_hot=False,
                                     return_01c=False,
                                     overlap=0,
                                     use_threads=True,
                                     shuffle_at_each_epoch=True,
                                     return_list=True,
                                     return_0_255=False)

    val_iter = IsbiEmStacksDataset(which_set='val',
                                   batch_size=bs[1],
                                   seq_per_subset=0,
                                   seq_length=0,
                                   return_one_hot=False,
                                   return_01c=False,
                                   use_threads=True,
                                   shuffle_at_each_epoch=False,
                                   return_list=True,
                                   return_0_255=False)
    test_iter = None

    # The original code pulled one batch here; the call is kept so the
    # iterator state seen by the training loop is unchanged.
    train_iter.next()

    n_batches_train = train_iter.nbatches
    n_batches_val = val_iter.nbatches
    n_batches_test = test_iter.nbatches if test_iter is not None else 0
    n_classes = train_iter.non_void_nclasses
    void_labels = train_iter.void_labels
    nb_in_channels = train_iter.data_shape[0]

    print("Batch. train: %d, val %d, test %d" %
          (n_batches_train, n_batches_val, n_batches_test))
    print("Nb of classes: %d" % (n_classes))
    print("Nb. of input channels: %d" % (nb_in_channels))

    #
    # Build network
    #
    net = build_UNet(n_input_channels=nb_in_channels,
                     num_output_classes=n_classes,
                     base_n_filters=64,
                     do_dropout=False,
                     input_dim=(None, None))
    output_layer = net["output_flattened"]

    #
    # Define and compile theano functions
    #
    print("Defining and compiling training functions")
    prediction = lasagne.layers.get_output(output_layer, input_var)
    loss = crossentropy_metric(prediction, target_var, void_labels)

    if weight_decay > 0:
        weightsl2 = regularize_network_params(output_layer,
                                              lasagne.regularization.l2)
        loss += weight_decay * weightsl2

    params = lasagne.layers.get_all_params(output_layer, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=learn_step)
    train_fn = theano.function([input_var, target_var], loss,
                               updates=updates)

    print("Defining and compiling test functions")
    test_prediction = lasagne.layers.get_output(output_layer, input_var,
                                                deterministic=True)
    test_loss = crossentropy_metric(test_prediction, target_var, void_labels)
    test_acc = accuracy_metric(test_prediction, target_var, void_labels)
    test_jacc = jaccard_metric(test_prediction, target_var, n_classes)

    val_fn = theano.function([input_var, target_var],
                             [test_loss, test_acc, test_jacc])

    #
    # Train
    #
    err_train = []
    err_valid = []
    acc_valid = []
    jacc_valid = []
    patience = 0

    print("Start training")
    for epoch in range(num_epochs):
        start_time = time.time()
        cost_train_tot = 0

        # Train
        print('Training steps ')
        for i in range(n_batches_train):
            print(i)
            X_train_batch, L_train_batch = train_iter.next()
            # Labels are flattened to a vector to match ``target_var``.
            L_train_batch = np.reshape(L_train_batch,
                                       np.prod(L_train_batch.shape))
            cost_train_tot += train_fn(X_train_batch, L_train_batch)

        err_train.append(cost_train_tot / n_batches_train)

        # Validation
        cost_val_tot = 0
        acc_val_tot = 0
        jacc_val_tot = np.zeros((2, n_classes))

        print('Validation steps')
        for i in range(n_batches_val):
            print(i)
            X_val_batch, L_val_batch = val_iter.next()
            L_val_batch = np.reshape(L_val_batch,
                                     np.prod(L_val_batch.shape))

            cost_val, acc_val, jacc_val = val_fn(X_val_batch, L_val_batch)

            acc_val_tot += acc_val
            cost_val_tot += cost_val
            jacc_val_tot += jacc_val

        err_valid.append(cost_val_tot / n_batches_val)
        acc_valid.append(acc_val_tot / n_batches_val)
        # Row 0 is divided by row 1 to get the per-class Jaccard.
        jacc_perclass_valid = jacc_val_tot[0, :] / jacc_val_tot[1, :]
        if early_stop_class is None:
            jacc_valid.append(np.mean(jacc_perclass_valid))
        else:
            jacc_valid.append(jacc_perclass_valid[early_stop_class])

        out_str = "EPOCH %i: Avg epoch training cost train %f, cost val %f" +\
            ", acc val %f, jacc val class 0 % f, jacc val class 1 %f, jacc val %f took %f s"
        out_str = out_str % (epoch,
                             err_train[epoch],
                             err_valid[epoch],
                             acc_valid[epoch],
                             jacc_perclass_valid[0],
                             jacc_perclass_valid[1],
                             jacc_valid[epoch],
                             time.time() - start_time)
        print(out_str)

        with open(os.path.join(savepath, "unet_output.log"), "a") as f:
            f.write(out_str + "\n")

        # Early stopping and saving stuff.
        # The first epoch is the baseline and is saved as the initial best
        # model; every later epoch (including epoch 1, which the previous
        # ``epoch > 1`` test excluded) can replace it.
        if epoch == 0 or jacc_valid[epoch] > best_jacc_val:
            best_jacc_val = jacc_valid[epoch]
            patience = 0
            np.savez(os.path.join(savepath, 'new_unet_model_best.npz'),
                     *lasagne.layers.get_all_param_values(output_layer))
            np.savez(os.path.join(savepath, 'unet_errors_best.npz'),
                     err_valid, err_train, acc_valid, jacc_valid)
        else:
            patience += 1

        np.savez(os.path.join(savepath, 'new_unet_model_last.npz'),
                 *lasagne.layers.get_all_param_values(output_layer))
        np.savez(os.path.join(savepath, 'unet_errors_last.npz'),
                 err_valid, err_train, acc_valid, jacc_valid)

        # Finish training if patience has expired or max nber of epochs
        # reached
        if patience == max_patience or epoch == num_epochs - 1:
            if test_iter is not None:
                # Load best model weights
                with np.load(os.path.join(savepath,
                                          'new_unet_model_best.npz')) as f:
                    param_values = [f['arr_%d' % i]
                                    for i in range(len(f.files))]
                nlayers = len(lasagne.layers.get_all_params(output_layer))
                lasagne.layers.set_all_param_values(output_layer,
                                                    param_values[:nlayers])

                # Test
                cost_test_tot = 0
                acc_test_tot = 0
                jacc_test_tot = np.zeros((2, n_classes))
                for i in range(n_batches_test):
                    X_test_batch, L_test_batch = test_iter.next()
                    L_test_batch = np.reshape(L_test_batch,
                                              np.prod(L_test_batch.shape))

                    cost_test, acc_test, jacc_test = val_fn(X_test_batch,
                                                            L_test_batch)

                    acc_test_tot += acc_test
                    cost_test_tot += cost_test
                    jacc_test_tot += jacc_test

                err_test = cost_test_tot / n_batches_test
                acc_test = acc_test_tot / n_batches_test
                jacc_test_perclass = jacc_test_tot[0, :] / jacc_test_tot[1, :]
                jacc_test = np.mean(jacc_test_perclass)

                out_str = "FINAL MODEL: err test % f, acc test %f, " +\
                    "jacc test class 0 %f, jacc test class 1 %f, jacc test %f"
                out_str = out_str % (err_test,
                                     acc_test,
                                     jacc_test_perclass[0],
                                     jacc_test_perclass[1],
                                     jacc_test)
                print(out_str)

            # Only copy when a destination was actually provided.
            if loadpath is not None and savepath != loadpath:
                print('Copying model and other training files to {}'.format(
                    loadpath))
                copy_tree(savepath, loadpath)

            # End
            return
def train(cf):
    """Train ``cf.net`` with early stopping on the validation Jaccard score.

    The function loads the data iterators, compiles a training and a
    validation Theano function, then alternates training and validation
    epochs. The model is saved whenever the validation Jaccard improves and
    the metric history is written to ``errors.npz`` after each epoch. When
    patience runs out or the last epoch is reached, the saved best model is
    restored and evaluated on the test set (if it is not empty).

    Args:
        cf: Configuration object. The attributes read here are ``dataset``,
            ``train_crop_size``, ``batch_size``, ``net``, ``learning_rate``,
            ``lr_sched_decay``, ``weight_decay``, ``optimizer``,
            ``num_epochs``, ``max_patience``, ``savepath`` and, optionally,
            ``pretrained_model``.
    """
    ###############
    #  load data  #
    ###############

    print('-' * 75)
    print('Loading data')
    # TODO: prepare a public version of the data loader
    train_iter, val_iter, test_iter = load_data(
        cf.dataset,
        train_crop_size=cf.train_crop_size,
        batch_size=cf.batch_size,
        horizontal_flip=True,
    )

    n_classes = train_iter.get_n_classes()
    void_labels = train_iter.get_void_labels()

    print('Number of images : train : {}, val : {}, test : {}'.format(
        train_iter.get_n_samples(),
        val_iter.get_n_samples(),
        test_iter.get_n_samples()))

    ###################
    #   Build model   #
    ###################

    # Build model and display summary
    net = cf.net
    net.summary()

    # Restore
    if hasattr(cf, 'pretrained_model'):
        print('Using a pretrained model : {}'.format(cf.pretrained_model))
        net.restore(cf.pretrained_model)

    # Compile functions
    print('Compilation starts at ' + str(datetime.now()).split('.')[0])
    params = lasagne.layers.get_all_params(net.output_layer, trainable=True)
    # Shared so the scheduler below can decay it between epochs.
    lr_shared = theano.shared(np.array(cf.learning_rate, dtype='float32'))
    lr_decay = np.array(cf.lr_sched_decay, dtype='float32')

    # Create loss and metrics
    for key in ['train', 'valid']:

        # LOSS
        pred = get_output(net.output_layer,
                          deterministic=(key == 'valid'),
                          batch_norm_update_averages=False,
                          batch_norm_use_averages=False)

        loss = crossentropy(pred, net.target_var, void_labels)

        if cf.weight_decay:
            weightsl2 = regularize_network_params(net.output_layer,
                                                  lasagne.regularization.l2)
            loss += cf.weight_decay * weightsl2

        # METRICS
        I, U, acc = theano_metrics(pred, net.target_var, n_classes,
                                   void_labels)

        # COMPILE
        start_time_compilation = time.time()
        if key == 'train':
            updates = cf.optimizer(loss, params, learning_rate=lr_shared)
            train_fn = theano.function([net.input_var, net.target_var],
                                       [loss, I, U, acc], updates=updates)
        else:
            val_fn = theano.function([net.input_var, net.target_var],
                                     [loss, I, U, acc])

        print('{} compilation took {:.3f} seconds'.format(
            key, time.time() - start_time_compilation))

    ###################
    #    Main loops   #
    ###################

    def init_history():
        """Return an empty per-split metric record."""
        return {'loss': [], 'jaccard': [], 'accuracy': []}

    history = {'train': init_history(),
               'val': init_history(),
               'test': init_history()}
    patience = 0
    best_jacc_val = 0
    best_epoch = 0

    if hasattr(cf, 'pretrained_model'):
        print('Validation score before training')
        # Single-argument print call: same output as the former Python 2
        # print statement, and valid syntax under Python 3.
        print(batch_loop(val_iter, val_fn, 0, 'val', {'val': init_history()}))

    # Training main loop
    print('-' * 30)
    print('Training starts at ' + str(datetime.now()).split('.')[0])
    print('-' * 30)

    for epoch in range(cf.num_epochs):

        # Train
        start_time_train = time.time()
        history = batch_loop(train_iter, train_fn, epoch, 'train', history)

        # Validation
        start_time_valid = time.time()
        history = batch_loop(val_iter, val_fn, epoch, 'val', history)

        # Print
        out_str = \
            '\r\x1b[2 Epoch {} took {}+{} sec. ' \
            'loss = {:.5f} | jacc = {:.5f} | acc = {:.5f} || ' \
            'loss = {:.5f} | jacc = {:.5f} | acc = {:.5f}'.format(
                epoch,
                int(start_time_valid - start_time_train),
                int(time.time() - start_time_valid),
                history['train']['loss'][-1],
                history['train']['jaccard'][-1],
                history['train']['accuracy'][-1],
                history['val']['loss'][-1],
                history['val']['jaccard'][-1],
                history['val']['accuracy'][-1])

        # Monitoring jaccard
        if history['val']['jaccard'][-1] > best_jacc_val:
            out_str += ' (BEST)'
            best_jacc_val = history['val']['jaccard'][-1]
            best_epoch = epoch
            patience = 0
            net.save(os.path.join(cf.savepath, 'model.npz'))
        else:
            patience += 1

        print(out_str)

        np.savez(os.path.join(cf.savepath, 'errors.npz'),
                 metrics=history, best_epoch=best_epoch)

        # Learning rate scheduler
        lr_shared.set_value(lr_shared.get_value() * lr_decay)

        # Finish training if patience has expired or max nber of epochs
        # reached
        if patience == cf.max_patience or epoch == cf.num_epochs - 1:
            # Load best model weights
            net.restore(os.path.join(cf.savepath, 'model.npz'))

            # Test
            print('Training ends\nTest')
            if test_iter.get_n_samples() == 0:
                print('No test set')
            else:
                history = batch_loop(test_iter, val_fn, epoch, 'test',
                                     history)

                print('Average cost test = {:.5f} | jacc test = {:.5f} | acc_test = {:.5f} '.format(
                    history['test']['loss'][-1],
                    history['test']['jaccard'][-1],
                    history['test']['accuracy'][-1]))

                np.savez(os.path.join(cf.savepath, 'errors.npz'),
                         metrics=history, best_epoch=best_epoch)

            # Exit
            return
# Start a fresh progress file: the first row records the model's parameter
# count, the second row is the header for the per-epoch values.
OUTPUT = open(progress_filename, 'w')
OUTPUT.write("NUM_PARAMS,"+str(lasagne.layers.count_params(cnn_model['output']))+'\n')
OUTPUT.write("EPOCH,RMSE,MSE\n")
OUTPUT.close()

#multiply our training predictions and visualizations by 1.0
# this makes it so these numbers are part of the theano graph but not changed in value
# in this way, theano doesn't complain at me for unused variables
context_output_train = lasagne.layers.get_output(cnn_model['output'],deterministic=False)
# element 0 is used as the prediction, element 1 as the visualization output
train_prediction = context_output_train[0] * 1.0
visual_predictions_train = context_output_train[1] * 1.0
train_prediction = train_prediction.flatten()
# element-wise squared error (not yet averaged)
train_loss = lasagne.objectives.squared_error(target_vals,train_prediction)

#get our loss and our cost
# cost = mean squared error + lambda * L2 penalty over the network parameters
l2_loss = regularize_network_params(cnn_model['output'],l2)
train_cost = T.mean(train_loss) + l2_loss*l2_regularization_lambda

#then get our parameters and update from lasagne
params = lasagne.layers.get_all_params(cnn_model['output'], trainable=True)
updates = lasagne.updates.adam(train_cost, params, learning_rate=learning_rate)

#then get the outputs for the test and multiply them by 1.0 like above
context_output_test = lasagne.layers.get_output(cnn_model['output'],deterministic=True)
test_predicition = context_output_test[0] * 1.0
visual_predictions_test = context_output_test[1] * 1.0
test_predicition = test_predicition.flatten()
# NOTE(review): unlike train_cost, this stays element-wise here (no mean, no
# L2 term) - confirm the aggregation happens where it is consumed.
test_cost = lasagne.objectives.squared_error(target_vals,test_predicition)

#then define my theano functions for train and test
# NOTE(review): the statement below continues past the visible part of the file.
train_func = theano.function([input_atom,input_bonds,input_atom_index,\
def initialize_network(self):
    """
    :description: sets up everything needed to train the q network and to
        query q values. Steps, in order:

        1. build the q network and the target q network
        2. declare the theano symbolic inputs
        3. allocate the shared buffers that feed those inputs
        4. build the symbolic loss
        5. build the symbolic updates
        6. compile ``self._train`` and ``self._get_q_values``
    """
    batch_size = self.batch_size
    input_shape = self.input_shape
    lasagne.random.set_rng(self.rng)

    # 1. online network, target network, then sync via reset_target_network
    self.l_out = self.build_network(input_shape, self.num_actions, batch_size)
    self.next_l_out = self.build_network(input_shape, self.num_actions, batch_size)
    self.reset_target_network()

    # 2. symbolic inputs
    states = T.tensor4('states')
    actions = T.icol('actions')
    rewards = T.col('rewards')
    next_states = T.tensor4('next_states')
    # terminals flag the end of an episode; they zero out the future
    # q value term Q(s', a') in the target below
    terminals = T.icol('terminals')

    # 3. shared buffers backing the symbolic inputs
    def zero_column(dtype):
        # (batch_size, 1) zeros, broadcastable along the second axis
        return theano.shared(np.zeros((batch_size, 1), dtype=dtype),
                             broadcastable=(False, True))

    self.states_shape = (batch_size, 1) + input_shape
    self.states_shared = theano.shared(
        np.zeros(self.states_shape, dtype=theano.config.floatX))
    self.next_states_shared = theano.shared(
        np.zeros(self.states_shape, dtype=theano.config.floatX))
    self.rewards_shared = zero_column(theano.config.floatX)
    self.actions_shared = zero_column('int32')
    self.terminals_shared = zero_column('int32')

    # 4. symbolic loss
    q_vals = lasagne.layers.get_output(self.l_out, states)
    next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states)

    not_terminal = T.ones_like(terminals) - terminals
    best_next_q = T.max(next_q_vals, axis=1, keepdims=True)
    target = rewards + not_terminal * self.discount * best_next_q

    # pick Q(s, a) for the taken action of each row, as a column vector
    taken_q = q_vals[T.arange(batch_size), actions.reshape((-1,))]
    diff = target - taken_q.reshape((-1, 1))

    # The td error is clipped at 1 in the quadratic term. A plain clip would
    # have zero gradient beyond the threshold, so the magnitude in excess of
    # 1.0 is added back linearly: the loss is quadratic up to 1 and linear
    # after that.
    magnitude = abs(diff)
    quadratic_part = T.minimum(magnitude, 1.0)
    linear_part = magnitude - quadratic_part
    elementwise_loss = 0.5 * quadratic_part ** 2 + linear_part
    loss = (T.mean(elementwise_loss)
            + self.regularization * regularize_network_params(self.l_out, l2))

    # 5. symbolic updates
    params = lasagne.layers.helper.get_all_params(self.l_out)
    updates = self.initialize_updates(self.update_rule, loss, params,
                                      self.learning_rate)

    # 6. compiled functions; inputs come from the shared buffers via givens
    train_givens = {
        states: self.states_shared,
        next_states: self.next_states_shared,
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        terminals: self.terminals_shared
    }
    self._train = theano.function([], [loss, q_vals], updates=updates,
                                  givens=train_givens)
    self._get_q_values = theano.function([], q_vals,
                                         givens={states: self.states_shared})
def test_space_invaders(game_title='SpaceInvaders-v0',
                        n_parallel_games=3,
                        replay_seq_len=2,
                        ):
    """
    Build a small windowed agent, play a pool of games and run ten training
    iterations on a loss that sums several value-based objectives and a2c.

    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """
    # A throwaway environment, only used to read the game metadata
    probe_env = gym.make(game_title)
    probe_env.reset()
    n_actions = probe_env.action_space.n
    observation_shape = (None,) + probe_env.observation_space.shape
    action_names = probe_env.get_action_meanings()
    del probe_env

    # ----- observations -----
    # current-tick image input
    observation_layer = InputLayer(observation_shape, name="images input")
    # move the last axis to position 1 so convolutional layers would work
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # ----- memory: a sliding window over the last frames -----
    window_size = 3
    prev_window_shape = (None, window_size) + tuple(observation_reshape.output_shape[1:])
    prev_window = InputLayer(prev_window_shape, name="previous window state")
    window = WindowAugmentation(observation_reshape, prev_window,
                                name="new window state")
    memory_dict = {window: prev_window}

    # ----- network body -----
    def temporal_max(frames):
        # maximum over the window axis (to avoid flickering)
        return frames.max(axis=1)

    window_max = ExpressionLayer(window, temporal_max,
                                 output_shape=(None,) + window.output_shape[2:])
    nn = DenseLayer(window_max, num_units=50, name='dense0')

    # ----- heads: q values plus the policy/value pair needed by a2c -----
    q_eval = DenseLayer(nn,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")
    policy_eval = DenseLayer(nn,
                             num_units=n_actions,
                             nonlinearity=lasagne.nonlinearities.softmax,
                             name="a2c action probas")
    state_value_eval = DenseLayer(nn,
                                  num_units=1,
                                  nonlinearity=None,
                                  name="a2c state values")

    resolver = ProbabilisticResolver(policy_eval, name="resolver")
    agent = Agent(observation_layer,
                  memory_dict,
                  (q_eval, policy_eval, state_value_eval), resolver)

    # trainable parameters of the whole lasagne graph ending at the resolver
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # ----- one-step function -----
    print('compiling react')
    applier_fun = agent.get_react_function()

    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """ returns (actions, new memory states) for an observation batch;
        with the default prev_memories the memory starts from zeros """
        if prev_memories == 'zeros':
            prev_memories = [np.zeros((batch_size,) + tuple(mem.output_shape[1:]),
                                      dtype='float32')
                             for mem in agent.agent_states]
        outputs = applier_fun(np.array(observation), *prev_memories)
        return outputs[0], outputs[1:]

    # ----- pool of parallel atari sessions -----
    pool = GamePool(game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50)
    print(np.array(action_names)[np.array(action_log)[:3, :5]])

    # ----- experience replay environment (all default parameters) -----
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """ plays new sessions and loads them into env, discarding whatever
        env held before """
        preceding_memory_states = list(pool.prev_memory_states)

        # play
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact(
            step, n_steps=n_steps)

        # load into the replay environment
        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

    # first batch of sessions
    update_pool(env, pool, replay_seq_len)

    # ----- training graph over replayed sessions -----
    _env_states, _observations, _memories, _imagined_actions, estimators = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )
    (q_values_sequence, policy_sequence, value_sequence) = estimators

    # rewards are used unscaled
    scaled_reward_seq = env.rewards

    # 1-step algorithms
    objective_terms = [
        algo.get_elementwise_objective(q_values_sequence,
                                       env.actions[0],
                                       scaled_reward_seq,
                                       env.is_alive,
                                       gamma_or_gammas=0.99, )
        for algo in (qlearning, sarsa)
    ]
    # n-step q-learning for several horizons
    objective_terms += [
        qlearning_n_step.get_elementwise_objective(q_values_sequence,
                                                   env.actions[0],
                                                   scaled_reward_seq,
                                                   env.is_alive,
                                                   gamma_or_gammas=0.99,
                                                   n_steps=n)
        for n in (1, 3, replay_seq_len - 1, replay_seq_len, replay_seq_len + 1, None)
    ]
    # n-step a2c
    objective_terms.append(
        a2c_n_step.get_elementwise_objective(policy_sequence,
                                             value_sequence[:, :, 0],
                                             env.actions[0],
                                             scaled_reward_seq,
                                             env.is_alive,
                                             gamma_or_gammas=0.99,
                                             n_steps=3))
    elwise_mse_loss = sum(objective_terms, 0.)

    # average over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # l2 penalty on the network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10 ** -4

    loss = mse_loss + reg_l2

    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean reward per session
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # ----- compile -----
    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward], updates=updates)
    evaluation_fun = theano.function([], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # ----- training loop -----
    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i,loss %.5f, rewards: %.5f " % (
            epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
def test_memory(game_title='SpaceInvaders-v0',
                n_parallel_games=3,
                replay_seq_len=2,
                ):
    """
    Build an agent that carries every memory type at once (window, stack,
    RNN, GRU, GRU memory layer, two LSTM variants), play a pool of games and
    run ten q-learning training iterations.

    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """
    # A throwaway environment, only used to read the game metadata
    probe_env = gym.make(game_title)
    probe_env.reset()
    n_actions = probe_env.action_space.n
    observation_shape = (None,) + probe_env.observation_space.shape
    action_names = probe_env.get_action_meanings()
    del probe_env

    # ----- observations -----
    observation_layer = InputLayer(observation_shape, name="images input")
    # move the last axis to position 1 so convolutional layers would work
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # ----- window memory -----
    window_size = 3
    prev_window = InputLayer((None, window_size) + tuple(observation_reshape.output_shape[1:]),
                             name="previous window state")
    window = WindowAugmentation(observation_reshape, prev_window,
                                name="new window state")
    # maximum over the window axis (to avoid flickering)
    window_max = ExpressionLayer(window, lambda a: a.max(axis=1),
                                 output_shape=(None,) + window.output_shape[2:])

    # ----- stack memory -----
    stack_w, stack_h = 4, 5
    stack_inputs = DenseLayer(observation_reshape, stack_w, name="prev_stack")
    stack_controls = DenseLayer(observation_reshape, 3,
                                nonlinearity=lasagne.nonlinearities.softmax,
                                name="prev_stack")
    prev_stack = InputLayer((None, stack_h, stack_w),
                            name="previous stack state")
    stack = StackAugmentation(stack_inputs, prev_stack, stack_controls)
    stack_top = lasagne.layers.SliceLayer(stack, 0, 1)

    # ----- RNN cell -----
    prev_rnn = InputLayer((None, 16), name="previous RNN state")
    new_rnn = RNNCell(prev_rnn, observation_reshape)

    # ----- GRU cell -----
    prev_gru = InputLayer((None, 16), name="previous GRUcell state")
    new_gru = GRUCell(prev_gru, observation_reshape)

    # ----- GRU memory layer -----
    prev_gru1 = InputLayer((None, 15), name="previous GRUcell state")
    new_gru1 = GRUMemoryLayer(15, observation_reshape, prev_gru1)

    # ----- LSTM with peepholes -----
    prev_lstm0_cell = InputLayer((None, 13),
                                 name="previous LSTMCell hidden state [with peepholes]")
    prev_lstm0_out = InputLayer((None, 13),
                                name="previous LSTMCell output state [with peepholes]")
    new_lstm0_cell, new_lstm0_out = LSTMCell(prev_lstm0_cell, prev_lstm0_out,
                                             input_or_inputs=observation_reshape,
                                             peepholes=True,
                                             name="newLSTM1 [with peepholes]")

    # ----- LSTM without peepholes -----
    prev_lstm1_cell = InputLayer((None, 14),
                                 name="previous LSTMCell hidden state [no peepholes]")
    prev_lstm1_out = InputLayer((None, 14),
                                name="previous LSTMCell output state [no peepholes]")
    new_lstm1_cell, new_lstm1_out = LSTMCell(prev_lstm1_cell, prev_lstm1_out,
                                             input_or_inputs=observation_reshape,
                                             peepholes=False,
                                             name="newLSTM1 [no peepholes]")

    # new state -> previous state, in the order the memories were built
    memory_dict = OrderedDict([
        (window, prev_window),
        (stack, prev_stack),
        (new_rnn, prev_rnn),
        (new_gru, prev_gru),
        (new_gru1, prev_gru1),
        (new_lstm0_cell, prev_lstm0_cell),
        (new_lstm0_out, prev_lstm0_out),
        (new_lstm1_cell, prev_lstm1_cell),
        (new_lstm1_out, prev_lstm1_out),
    ])

    # ----- concatenate all memories -----
    for layer in (flatten(window_max), stack_top, new_rnn, new_gru, new_gru1):
        print(layer.output_shape)
    all_memory = concat([flatten(window_max), stack_top, new_rnn, new_gru,
                         new_gru1, new_lstm0_out, new_lstm1_out, ])

    # ----- network body and q-value head -----
    nn = DenseLayer(all_memory, num_units=50, name='dense0')
    q_eval = DenseLayer(nn,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    resolver = EpsilonGreedyResolver(q_eval, epsilon=0.1, name="resolver")
    agent = Agent(observation_layer,
                  memory_dict,
                  q_eval, resolver)

    # trainable parameters of the whole lasagne graph ending at the resolver
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # ----- one-step function -----
    print('compiling react')
    applier_fun = agent.get_react_function()

    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """ returns (actions, new memory states) for an observation batch;
        with the default prev_memories every memory starts from zeros """
        if prev_memories == 'zeros':
            prev_memories = [np.zeros((batch_size,) + tuple(mem.output_shape[1:]),
                                      dtype='float32')
                             for mem in agent.agent_states]
        outputs = applier_fun(np.array(observation), *prev_memories)
        return outputs[0], outputs[1:]

    # ----- pool of parallel atari sessions -----
    pool = GamePool(game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50)
    print(np.array(action_names)[np.array(action_log)[:3, :5]])

    # ----- experience replay environment (all default parameters) -----
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """ plays new sessions and loads them into env, discarding whatever
        env held before """
        preceding_memory_states = list(pool.prev_memory_states)

        # play
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact(
            step, n_steps=n_steps)

        # load into the replay environment
        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

    # first batch of sessions
    update_pool(env, pool, replay_seq_len)

    # ----- training graph over replayed sessions -----
    _env_states, _observations, _memories, _imagined_actions, q_values_sequence = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )

    # rewards are used unscaled
    scaled_reward_seq = env.rewards

    elwise_mse_loss = qlearning.get_elementwise_objective(q_values_sequence,
                                                          env.actions[0],
                                                          scaled_reward_seq,
                                                          env.is_alive,
                                                          gamma_or_gammas=0.99, )

    # average over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # l2 penalty on the network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10 ** -4

    loss = mse_loss + reg_l2

    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean reward per session
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # ----- compile -----
    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward], updates=updates)
    evaluation_fun = theano.function([], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # ----- training loop -----
    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i,loss %.5f, rewards: %.5f " % (
            epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
def test_reasoning_value_based(n_parallel_games=25,
                               algo=qlearning,
                               n_steps=1
                               ):
    """
    Train a one-cell recurrent q-value agent on the boolean reasoning
    experiment and require the greedy reward to exceed 2 within 5000 epochs.

    :param n_parallel_games: size of the data batch generated each epoch
    :param algo: training algorithm to use (module providing
        get_elementwise_objective)
    :param n_steps: accepted for signature compatibility; not read by this
        function body
    :raises ValueError: if the loop finishes without reaching the threshold
    """
    # experiment environment with default parameters
    env = experiment.BooleanReasoningEnvironment()

    n_hidden_neurons = 64

    observation_size = (None,) + tuple(env.observation_shapes)
    observation_layer = lasagne.layers.InputLayer(observation_size,
                                                  name="observation_input")
    prev_state_layer = lasagne.layers.InputLayer([None, n_hidden_neurons],
                                                 name="prev_state_input")

    # memory cell (this isn't the same as lasagne recurrent units)
    rnn = RNNCell(prev_state_layer, observation_layer, name="rnn0")

    # linear q-value head on top of the memory
    q_values = lasagne.layers.DenseLayer(rnn, num_units=env.n_actions,
                                         nonlinearity=lasagne.nonlinearities.linear,
                                         name="QEvaluator")

    # epsilon is shared so the exploration rate can be changed while training
    epsilon = theano.shared(np.float32(0.1), name="e-greedy.epsilon")
    resolver = EpsilonGreedyResolver(q_values, epsilon=epsilon, name="resolver")

    agent = Agent(observation_layer,
                  agent_states={rnn: prev_state_layer},
                  policy_estimators=q_values,
                  action_layers=resolver)

    # trainable parameters of the lasagne graph ending at the resolver
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # interaction sequences of length <= 10
    (state_seq,), observation_seq, agent_state, action_seq, qvalues_seq = agent.get_sessions(
        env,
        session_length=10,
        batch_size=env.batch_size,
    )
    hidden_seq = agent_state[rnn]

    # rewards for the actions taken, and the still-active indicator
    rewards_seq = env.get_reward_sequences(state_seq, action_seq)
    is_alive_seq = env.get_whether_alive(observation_seq)

    # gamma: fraction of the reward retained when it arrives one tick later
    gamma = theano.shared(np.float32(0.99), name='q_learning_gamma')

    squarred_Qerror = algo.get_elementwise_objective(
        qvalues_seq,
        action_seq,
        rewards_seq,
        is_alive_seq,
        gamma_or_gammas=gamma)

    # sum over steps, mean over sessions
    mse_Qloss = squarred_Qerror.sum(axis=1).mean()

    # l2 penalty on the network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-3

    total_loss = mse_Qloss + reg_l2

    updates = lasagne.updates.adadelta(total_loss, weights, learning_rate=0.1)

    # sum over steps, mean over sessions
    mean_session_reward = rewards_seq.sum(axis=1).mean()

    train_fun = theano.function([], [total_loss, mean_session_reward],
                                updates=updates)
    compute_mean_session_reward = theano.function([], mean_session_reward)

    score_log = Metrics()

    for epoch in range(5000):

        # exploration rate decays from 1.0 towards 0.05 as epochs go by
        current_epsilon = 0.05 + 0.95 * np.exp(-epoch / 2500.)
        resolver.epsilon.set_value(np.float32(current_epsilon))

        # one training step on a fresh batch
        env.generate_new_data_batch(n_parallel_games)
        train_fun()

        # progress is only evaluated every 100th epoch
        if epoch % 100 != 0:
            continue

        print(epoch),

        # reward under the current epsilon-greedy strategy
        avg_reward_current = compute_mean_session_reward()
        score_log["expected epsilon-greedy reward"][epoch] = avg_reward_current

        # reward under the purely greedy strategy (epsilon forced to 0)
        resolver.epsilon.set_value(0)
        avg_reward_greedy = compute_mean_session_reward()
        score_log["expected greedy reward"][epoch] = avg_reward_greedy

        if avg_reward_greedy > 2:
            print("converged")
            break
    else:
        # the loop ran out of epochs without hitting the threshold
        print("diverged")
        raise ValueError("Algorithm diverged")
def build_update_functions(train_set_x, train_set_y, valid_set_x, valid_set_y,
                           network, y, X, train_MASK, val_MASK, batch_size=32,
                           l2_reg=.0001, learning_rate=.005, momentum=.9):
    """Compile mini-batch training and validation functions with a masked RMSE.

    The element-wise squared error is aggregated with a 0/1 mask so that
    missing targets do not contribute to the loss.  ``y`` must not contain
    NaN: replace missing targets with any finite value (0 or -1, it does not
    matter) and put a 0 at the same position in the mask, 1 everywhere else.

    Parameters
    ----------
    train_set_x, train_set_y, valid_set_x, valid_set_y
        Data containers that are sliced symbolically per mini-batch;
        ``train_set_x`` and ``valid_set_x`` must also provide ``get_value()``
        (presumably Theano shared variables -- confirm against the caller).
    network
        Lasagne output layer of the model.
    y, X
        Symbolic target and input variables of the network.
    train_MASK, val_MASK
        Masks with the same layout as the targets; sliced like the data.
    batch_size
        Number of rows per mini-batch; the last batch may be shorter.
    l2_reg
        Mixing weight: ``loss = (1 - l2_reg) * RMSE + l2_reg * L2 penalty``.
    learning_rate
        Learning rate handed to RMSProp.
    momentum
        Unused (RMSProp is the optimiser); kept for backward compatibility.

    Returns
    -------
    (train_fn, val_fn)
        ``train_fn(index)`` applies one RMSProp update on mini-batch ``index``
        and returns ``[loss, masked_rmse]``.  ``val_fn(index)`` returns
        ``[masked_rmse, predictions]`` computed with a deterministic forward
        pass on validation mini-batch ``index``.
    """
    # ---- training expressions -------------------------------------------
    prediction = get_output(network)
    trainMASK = T.matrix('trainMASK')
    # Masked mean of the squared error, then the square root.  T.sqrt is the
    # module-level op; it is used here instead of a `.sqrt()` tensor method.
    train_mse = aggregate(squared_error(prediction, y),
                          weights=trainMASK, mode='normalized_sum')
    loss_RMSE = T.sqrt(train_mse)

    l2_penalty = regularize_network_params(network, l2)
    loss = (1 - l2_reg) * loss_RMSE + l2_reg * l2_penalty

    params = get_all_params(network, trainable=True)
    updates = rmsprop(loss, params, learning_rate=learning_rate)

    # ---- validation expressions (deterministic forward pass) ------------
    test_prediction = get_output(network, deterministic=True)
    valMASK = T.matrix('valMASK')
    val_mse = aggregate(squared_error(test_prediction, y),
                        weights=valMASK, mode='normalized_sum')
    test_loss = T.sqrt(val_mse)

    # ---- mini-batch slicing ---------------------------------------------
    index = T.lscalar()
    # borrow=True: only the shape is needed, so avoid copying the data.
    train_set_x_size = train_set_x.get_value(borrow=True).shape[0]
    val_set_x_size = valid_set_x.get_value(borrow=True).shape[0]

    def batch_of(data, n_rows):
        # Rows of mini-batch `index`, clipped at the end of the data set.
        return data[index * batch_size:
                    T.minimum((index + 1) * batch_size, n_rows)]

    train_fn = theano.function(
        inputs=[index],
        outputs=[loss, loss_RMSE],
        updates=updates,
        givens={X: batch_of(train_set_x, train_set_x_size),
                y: batch_of(train_set_y, train_set_x_size),
                trainMASK: batch_of(train_MASK, train_set_x_size)})

    # Return the deterministic prediction: it is the one the validation loss
    # is computed from.  The training-mode `prediction` would include the
    # noise of any stochastic layer in the network.
    val_fn = theano.function(
        inputs=[index],
        outputs=[test_loss, test_prediction],
        givens={X: batch_of(valid_set_x, val_set_x_size),
                y: batch_of(valid_set_y, val_set_x_size),
                valMASK: batch_of(val_MASK, val_set_x_size)})

    return train_fn, val_fn
def train(model, batch_size=200, learning_rate=0.1):
    """Train a Lasagne classifier with momentum SGD and early stopping.

    The data comes from ``load_cifar_whitened()``; the test split is also
    used as the validation split.  Training runs for at most 350 epochs, the
    learning rate is multiplied by 0.1 before epochs 200, 250 and 300, and a
    checkpoint is written with ``save_model`` every 50 epochs, at every
    learning-rate step and once more when training ends.  Each time the
    validation error improves, a row is appended to
    ``<model name>_results.csv``.

    Parameters
    ----------
    model
        Zero-argument callable returning a dict of Lasagne layers that
        contains at least the ``'input'`` and ``'output'`` entries.  Its
        ``__name__`` is used to name checkpoint and result files.
    batch_size
        Number of examples per mini-batch; trailing examples that do not
        fill a whole batch are ignored.
    learning_rate
        Initial learning rate.

    Returns
    -------
    None.  Progress is printed to stderr, the summary to stdout.
    """
    np.random.seed(5468)
    net = model()
    output_layer = net['output']
    x = net['input'].input_var
    y = T.ivector('y')

    print("........ building model")
    train_prediction = lasagne.layers.get_output(output_layer, x,
                                                 deterministic=False)
    test_prediction = lasagne.layers.get_output(output_layer, x,
                                                deterministic=True)

    # Training objective: mean cross-entropy plus weighted L2 penalty.
    lamda = 0.001
    l2_penalty = regularize_network_params(output_layer, l2)
    loss = lasagne.objectives.categorical_crossentropy(train_prediction, y)
    loss_train = (lasagne.objectives.aggregate(loss, mode='mean')
                  + lamda * l2_penalty)

    params = lasagne.layers.get_all_params(output_layer, trainable=True)
    lr_theano = T.fscalar()
    updates = lasagne.updates.momentum(loss_train, params,
                                       momentum=np.float32(0.9),
                                       learning_rate=lr_theano)

    # Zero-one error of the deterministic prediction.
    y_pred = T.argmax(test_prediction, axis=1)
    errors = T.mean(T.neq(y_pred, y))

    index = T.iscalar()
    batch = slice(index * batch_size, (index + 1) * batch_size)

    # Load Dataset (the validation set is the test set).
    train_x, train_y, test_x, test_y = load_cifar_whitened()
    valid_x, valid_y = test_x, test_y

    n_train_batches = train_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_x.get_value(borrow=True).shape[0] // batch_size

    train_model = theano.function(inputs=[index, lr_theano],
                                  outputs=[loss_train],
                                  updates=updates,
                                  givens={x: train_x[batch],
                                          y: train_y[batch]})
    validate_model = theano.function(inputs=[index],
                                     outputs=[errors],
                                     givens={x: valid_x[batch],
                                             y: valid_y[batch]})
    test_model = theano.function(inputs=[index],
                                 outputs=[errors],
                                 givens={x: test_x[batch],
                                         y: test_y[batch]})

    print("........ training")
    model_name = model.__name__
    n_epochs = 350
    lr_epochs = [200, 250, 300]
    verbose = True
    lr = learning_rate

    # early-stopping parameters
    patience = 100000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.9995  # relative improvement considered significant
    # Number of mini-batches between two validation passes; with these
    # values the network is checked once per epoch.
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    # Name of the function that called train(), used in the final summary.
    calframe = inspect.getouterframes(inspect.currentframe(), 2)

    while epoch < n_epochs and not done_looping:
        # Checkpoint / learning-rate step happen before the epoch starts.
        if epoch % 50 == 0 or epoch in lr_epochs:
            save_model(output_layer,
                       "{0}_{1}.pklz".format(model_name, epoch))
        if epoch in lr_epochs:
            lr *= 0.1

        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration % 100 == 0) and verbose:
                print('training @ iter = ', iteration, file=sys.stderr)
            # Cast explicitly: the input is an fscalar and `lr` is a Python
            # float, so do not rely on floatX-dependent downcasting rules.
            cost_ij = train_model(minibatch_index, np.float32(lr))

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)

                if verbose:
                    print('epoch %i, loss %f, minibatch %i/%i, validation error %f %%' %
                          (epoch, cost_ij[0], minibatch_index + 1,
                           n_train_batches, this_validation_loss * 100.),
                          file=sys.stderr)

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (this_validation_loss
                            < best_validation_loss * improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = np.mean(test_losses)

                    if verbose:
                        print(('     epoch %i, minibatch %i/%i, test error of '
                               'best model %f %%') %
                              (epoch, minibatch_index + 1, n_train_batches,
                               test_score * 100.),
                              file=sys.stderr)
                        # The context manager closes the file even when the
                        # write fails.
                        with open(model_name + '_results.csv', 'a') as csvfile:
                            csv.writer(csvfile).writerow(
                                [best_validation_loss, epoch, best_iter,
                                 n_train_batches, test_score, model_name,
                                 learning_rate])

            if patience <= iteration or (best_validation_loss == 0.0
                                         and test_score == 0.0):
                done_looping = True
                break

    # The in-loop checkpoint is written *before* an epoch is trained, so the
    # parameters produced by the last epochs would otherwise never be saved.
    save_model(output_layer, "{0}_{1}.pklz".format(model_name, epoch))

    end_time = timeit.default_timer()

    # Print out summary
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The training process for function ' +
           calframe[1][3] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)))
def train(dataset, learning_rate=0.0005, weight_decay=0.001, num_epochs=500,
          max_patience=25, data_augmentation=None, savepath=None,
          loadpath=None, batch_size=None, resume=False):
    """Train the 1D fully-convolutional network with Adam and early stopping.

    The experiment directory is ``<savepath>/<dataset>/<exp_name>``, where
    ``exp_name`` encodes the hyper-parameters.  Per epoch, a line is appended
    to ``fcn1D_output.log``; the parameters and learning curves are stored as
    ``*_best.npz`` when the mean validation Jaccard improves and as
    ``*_last.npz`` otherwise.  When training stops, the experiment directory
    is copied to the matching directory under ``loadpath`` if it differs.

    Parameters
    ----------
    dataset
        Name used as a sub-directory of ``savepath`` / ``loadpath``.
    learning_rate
        Initial Adam step size; multiplied by 0.99 at the start of each epoch.
    weight_decay
        Weight of the L2 penalty; no penalty is added when it is not positive.
    num_epochs
        Maximum number of epochs.
    max_patience
        Stop after this many consecutive epochs without a new best
        validation Jaccard.
    data_augmentation
        Keyword arguments forwarded to the training data iterator as
        ``data_augm_kwargs``; ``None`` means no augmentation (empty dict).
    savepath
        Root directory for outputs (required).
    loadpath
        Root directory the finished experiment is copied to; ``None`` means
        "same as the save directory", i.e. nothing is copied.
    batch_size
        ``[train, valid, test]`` batch sizes; defaults to ``[1024, 1024, 1]``.
    resume
        Currently unused.

    Raises
    ------
    ValueError
        If ``savepath`` is not given.
    """
    if savepath is None:
        raise ValueError('A saving directory must be specified')
    # Fresh containers per call instead of shared mutable defaults.
    if data_augmentation is None:
        data_augmentation = {}
    if batch_size is None:
        batch_size = [1024, 1024, 1]

    # Model hyperparameters
    n_filters = 64
    filter_size = 25
    depth = 8
    block = 'bn_relu_conv'

    # Hyperparameters for the dataset loader
    smooth_or_raw = 'both'  # use both input channels
    shuffle_at_each_epoch = True

    #
    # Prepare load/save directories
    #
    exp_name = 'fcn1D'
    exp_name += '_lrate=' + str(learning_rate)
    exp_name += '_fil=' + str(n_filters)
    exp_name += '_fsizes=' + str(filter_size)
    exp_name += '_depth=' + str(depth)
    exp_name += '_data=' + smooth_or_raw
    exp_name += '_decay=' + str(weight_decay)
    exp_name += '_pat=' + str(max_patience)

    savepath = os.path.join(savepath, dataset, exp_name)
    if loadpath is None:
        # os.path.join(None, ...) would raise; fall back to the save
        # directory so that the final copy step becomes a no-op.
        loadpath = savepath
    else:
        loadpath = os.path.join(loadpath, dataset, exp_name)
    print('Savepath : ')
    print(savepath)
    print('Loadpath : ')
    print(loadpath)

    if not os.path.exists(savepath):
        os.makedirs(savepath)
    else:
        print('\033[93m The following folder already exists {}. '
              'It will be overwritten in a few seconds...\033[0m'.format(
                  savepath))

    print('Saving directory : ' + savepath)
    # Dump every local defined so far as a record of the configuration.
    with open(os.path.join(savepath, "config.txt"), "w") as f:
        for key, value in locals().items():
            f.write('{} = {}\n'.format(key, value))

    #
    # Define symbolic variables
    #
    input_var = T.tensor3('input_var')  # n_example*nb_in_channels*ray_size
    target_var = T.ivector('target_var')  # n_example*ray_size

    # The learning rate is a shared variable so it can be decayed in place.
    learn_step = theano.shared(np.array(learning_rate,
                                        dtype=theano.config.floatX))

    #
    # Build dataset iterator
    #
    if smooth_or_raw == 'both':
        nb_in_channels = 2
        use_threads = False
    else:
        nb_in_channels = 1
        use_threads = True

    train_iter = Cortical6LayersDataset(
        which_set='train',
        smooth_or_raw=smooth_or_raw,
        batch_size=batch_size[0],
        data_augm_kwargs=data_augmentation,
        shuffle_at_each_epoch=shuffle_at_each_epoch,
        return_one_hot=False,
        return_01c=False,
        return_list=False,
        use_threads=use_threads,
        preload=True)

    val_iter = Cortical6LayersDataset(
        which_set='valid',
        smooth_or_raw=smooth_or_raw,
        batch_size=batch_size[1],
        shuffle_at_each_epoch=shuffle_at_each_epoch,
        return_one_hot=False,
        return_01c=False,
        return_list=False,
        use_threads=use_threads,
        preload=True)

    n_batches_train = train_iter.nbatches
    n_batches_val = val_iter.nbatches
    n_classes = train_iter.non_void_nclasses
    void_labels = train_iter.void_labels

    #
    # Build network
    #
    simple_net_output, _ = build_model(input_var,
                                       filter_size=filter_size,
                                       n_filters=n_filters,
                                       depth=depth,
                                       block=block,
                                       nb_in_channels=nb_in_channels,
                                       n_classes=n_classes)

    #
    # Define and compile theano functions
    #
    print("Defining and compiling training functions")
    prediction = lasagne.layers.get_output(simple_net_output[0])
    loss = categorical_crossentropy(prediction, target_var).mean()
    if weight_decay > 0:
        weightsl2 = regularize_network_params(
            simple_net_output, lasagne.regularization.l2)
        loss += weight_decay * weightsl2

    train_acc = accuracy_metric(prediction, target_var, void_labels)

    params = lasagne.layers.get_all_params(simple_net_output, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=learn_step)
    train_fn = theano.function([input_var, target_var], [loss, train_acc],
                               updates=updates)
    print("Done")

    print("Defining and compiling valid functions")
    valid_prediction = lasagne.layers.get_output(simple_net_output[0],
                                                 deterministic=True)
    valid_loss = categorical_crossentropy(valid_prediction, target_var).mean()
    valid_acc = accuracy_metric(valid_prediction, target_var, void_labels)
    valid_jacc = jaccard(valid_prediction, target_var, n_classes)
    valid_fn = theano.function([input_var, target_var],
                               [valid_loss, valid_acc, valid_jacc])
    print("Done")

    #
    # Train loop
    #
    err_train = []
    acc_train = []
    err_valid = []
    acc_valid = []
    jacc_valid = []
    patience = 0
    best_jacc_val = None

    print("Start training")
    for epoch in range(num_epochs):
        # Exponential learning-rate decay, applied at the start of each epoch.
        learn_step.set_value(
            (learn_step.get_value() * 0.99).astype(theano.config.floatX))

        start_time = time.time()

        # ---- training pass ----------------------------------------------
        cost_train_epoch = 0
        acc_train_epoch = 0
        for _ in range(n_batches_train):
            train_batch = train_iter.next()
            X_train_batch = train_batch['data']
            # The target variable is a flat vector of labels.
            L_train_batch = np.reshape(train_batch['labels'],
                                       np.prod(train_batch['labels'].shape))

            cost_train_batch, acc_train_batch = train_fn(X_train_batch,
                                                         L_train_batch)
            cost_train_epoch += cost_train_batch
            acc_train_epoch += acc_train_batch

        err_train.append(cost_train_epoch / n_batches_train)
        acc_train.append(acc_train_epoch / n_batches_train)

        # ---- validation pass --------------------------------------------
        cost_val_epoch = 0
        acc_val_epoch = 0
        # The two rows are accumulated over batches and divided afterwards
        # (row 0 / row 1) to obtain the per-class Jaccard.
        jacc_val_epoch = np.zeros((2, n_classes))
        for _ in range(n_batches_val):
            val_batch = val_iter.next()
            X_val_batch = val_batch['data']
            L_val_batch = np.reshape(val_batch['labels'],
                                     np.prod(val_batch['labels'].shape))

            cost_val_batch, acc_val_batch, jacc_val_batch = valid_fn(
                X_val_batch, L_val_batch)
            cost_val_epoch += cost_val_batch
            acc_val_epoch += acc_val_batch
            jacc_val_epoch += jacc_val_batch

        err_valid.append(cost_val_epoch / n_batches_val)
        acc_valid.append(acc_val_epoch / n_batches_val)
        jacc_perclass_valid = jacc_val_epoch[0, :] / jacc_val_epoch[1, :]
        jacc_valid.append(np.mean(jacc_perclass_valid))

        # ---- report (once per epoch) --------------------------------------
        out_str = ("EPOCH %i: Avg cost train %f, acc train %f" +
                   ", cost val %f, acc val %f, jacc val per class %s, "
                   "jacc val %f took %f s")
        out_str = out_str % (epoch, err_train[epoch],
                             acc_train[epoch],
                             err_valid[epoch],
                             acc_valid[epoch],
                             ['%d: %f' % (i, j)
                              for i, j in enumerate(jacc_perclass_valid)],
                             jacc_valid[epoch],
                             time.time() - start_time)
        print(out_str)

        with open(os.path.join(savepath, "fcn1D_output.log"), "a") as f:
            f.write(out_str + "\n")

        # ---- early stopping and checkpoints -------------------------------
        # The first epoch always counts as the best so far, and every later
        # epoch (including epoch 1) may improve on it.
        if epoch == 0 or jacc_valid[epoch] > best_jacc_val:
            print('saving best (and last) model')
            best_jacc_val = jacc_valid[epoch]
            patience = 0
            np.savez(os.path.join(savepath, 'new_fcn1D_model_best.npz'),
                     *lasagne.layers.get_all_param_values(simple_net_output))
            np.savez(os.path.join(savepath, "fcn1D_errors_best.npz"),
                     err_train=err_train, acc_train=acc_train,
                     err_valid=err_valid, acc_valid=acc_valid,
                     jacc_valid=jacc_valid)
        else:
            patience += 1
            print('saving last model')
            np.savez(os.path.join(savepath, 'new_fcn1D_model_last.npz'),
                     *lasagne.layers.get_all_param_values(simple_net_output))
            np.savez(os.path.join(savepath, "fcn1D_errors_last.npz"),
                     err_train=err_train, acc_train=acc_train,
                     err_valid=err_valid, acc_valid=acc_valid,
                     jacc_valid=jacc_valid)

        # Finish training if patience has expired or max nber of epochs reached
        if patience == max_patience or epoch == num_epochs - 1:
            if savepath != loadpath:
                print('Copying model and other training files to {}'.format(
                    loadpath))
                copy_tree(savepath, loadpath)
            break