def test_aggregate_weighted_mean():
    from lasagne.objectives import aggregate
    x = theano.tensor.matrix('x')
    w = theano.tensor.matrix('w')
    assert theano.gof.graph.is_same_graph(aggregate(x, w), (x * w).mean())
    assert theano.gof.graph.is_same_graph(aggregate(x, w, mode='mean'),
                                          (x * w).mean())
def test_aggregate_invalid():
    from lasagne.objectives import aggregate
    with pytest.raises(ValueError) as exc:
        aggregate(theano.tensor.matrix(), mode='asdf')
    assert 'mode must be' in exc.value.args[0]
    with pytest.raises(ValueError) as exc:
        aggregate(theano.tensor.matrix(), mode='normalized_sum')
    assert 'require weights' in exc.value.args[0]
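Read together, the tests above pin down aggregate's contract: mode is one of 'mean', 'sum', or 'normalized_sum', and 'normalized_sum' requires weights. A small usage sketch, assuming only Lasagne's documented aggregate(loss, weights=None, mode='mean') API; the variable names and data are illustrative:

# Illustrative only: the three aggregation modes side by side.
import numpy as np
import theano
import theano.tensor as T
from lasagne.objectives import aggregate

losses = T.matrix('losses')
weights = T.matrix('weights')

mean_loss = aggregate(losses, mode='mean')                     # losses.mean()
sum_loss = aggregate(losses, mode='sum')                       # losses.sum()
norm_loss = aggregate(losses, weights, mode='normalized_sum')  # (losses * weights).sum() / weights.sum()

f = theano.function([losses, weights], [mean_loss, sum_loss, norm_loss])
print(f(np.ones((2, 3), dtype=theano.config.floatX),
        np.full((2, 3), 0.5, dtype=theano.config.floatX)))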
def objective(layers_, target, **kwargs):
    out_a_layer = layers_['output_a']
    out_b_layer = layers_['output_b']

    # Get the outputs
    out_a, out_b = get_output([out_a_layer, out_b_layer])

    # Get the targets
    gt_a = T.cast(target[:, 0], 'int32')
    gt_b = target[:, 1].reshape((-1, 1))

    # Calculate the multi-task loss: classification head plus regression head.
    cls_loss = aggregate(categorical_crossentropy(out_a, gt_a))
    # The original snippet applied categorical_crossentropy here too, which
    # does not match the continuous target; squared_error is the usual
    # regression loss for this shape.
    reg_loss = aggregate(squared_error(out_b, gt_b))
    loss = cls_loss + reg_loss
    return loss
def objective(layers, loss_function, target, aggregate=aggregate,
              deterministic=False, l1=0, l2=0, tv=0,
              get_output_kw=None):
    if get_output_kw is None:
        get_output_kw = {}
    output_layer = layers[-1]
    network_output = get_output(
        output_layer, deterministic=deterministic, **get_output_kw)
    loss = aggregate(loss_function(network_output, target))
    if l1:
        loss += regularization.regularize_layer_params(
            layers[-2], regularization.l1) * l1
    if l2:
        loss += regularization.regularize_layer_params(
            layers[-2], regularization.l2) * l2
    if tv:
        loss += T.mean(T.abs_(network_output[:, 1:] -
                              network_output[:, :-1])) * tv
    return loss
def test_maxpool_layer():
    l_in1 = InputLayer((None, 2))
    l_in2 = InputLayer((None, 20))
    l_hid = DenseLayer(l_in2, num_units=30, nonlinearity=rectify)
    l_pool = MaxpoolLayer([l_in1, l_hid])
    l_out = DenseLayer(l_pool, num_units=1, nonlinearity=sigmoid)

    bounds = theano.tensor.lmatrix('bounds')
    data = theano.tensor.matrix('data')
    targets = theano.tensor.matrix('targets')

    predictions = get_output(l_out, {l_in1: bounds, l_in2: data})
    loss = categorical_crossentropy(predictions, targets)
    loss = aggregate(loss, mode='mean')
    params = get_all_params(l_out)
    updates_sgd = sgd(loss, params, learning_rate=0.0001)

    train_function = theano.function([bounds, data, targets],
                                     updates=updates_sgd,
                                     allow_input_downcast=True)
    test_bounds = np.array([[0, 3], [3, 5], [5, 7]])
    test_X = np.random.randn(10, 20)
    test_Y = np.array([[0], [1], [0]])
    train_function(test_bounds, test_X, test_Y)
def objective(layers, loss_function, target, aggregate=aggregate,
              deterministic=False, get_output_kw=None):
    if get_output_kw is None:
        get_output_kw = {}
    output_layer = layers[-1]
    network_output = get_output(output_layer, deterministic=deterministic,
                                **get_output_kw)
    losses = loss_function(network_output, target)
    return aggregate(losses)
def objective(layers, loss_function, target, aggregate=aggregate,
              aggregation_weights=None, deterministic=False, l1=0, l2=0,
              get_output_kw=None):
    if get_output_kw is None:
        get_output_kw = {}
    output_layer = layers[-1]
    network_output = get_output(
        output_layer, deterministic=deterministic, **get_output_kw)
    if isfunction(aggregation_weights):
        weights = aggregation_weights(layers)
    else:
        weights = aggregation_weights
    loss = aggregate(loss_function(network_output, target), weights)
    if l1:
        loss += regularization.regularize_layer_params(
            layers.values(), regularization.l1) * l1
    if l2:
        loss += regularization.regularize_layer_params(
            layers.values(), regularization.l2) * l2
    return loss
def objective(layers, loss_function, target, aggregate=aggregate,
              mode='mean', weights=None, deterministic=False,
              l1=0, l2=0, l3=0,
              l3_layers=(),  # tuple default avoids the mutable-default pitfall
              get_output_kw=None):
    if get_output_kw is None:
        get_output_kw = {}
    output_layer = layers[-1]
    network_output = get_output(
        output_layer, deterministic=deterministic, **get_output_kw)
    loss = aggregate(loss_function(network_output, target),
                     weights=weights, mode=mode)
    if l1:
        loss += regularization.regularize_layer_params(
            layers.values(), regularization.l1) * l1
    if l2:
        loss += regularization.regularize_layer_params(
            layers.values(), regularization.l2) * l2
    if l3:
        for layer in l3_layers:
            loss += regularization.regularize_layer_params(
                layer, regularization.l2) * l3
    return loss
def objective(layers, loss_function, target, aggregate=aggregate,
              deterministic=False, l1=0, l2=0, get_output_kw=None):
    """
    Default implementation of the NeuralNet objective.

    :param layers: the underlying layers of the NeuralNetwork
    :param loss_function: the callable loss function to use
    :param target: the expected output
    :param aggregate: the aggregation function to use
    :param deterministic: whether or not to get a deterministic output
    :param l1: optional l1 regularization parameter
    :param l2: optional l2 regularization parameter
    :param get_output_kw: optional kwargs to pass to
                          :meth:`NeuralNetwork.get_output`
    :return: the total calculated loss
    """
    if get_output_kw is None:
        get_output_kw = {}
    output_layer = layers[-1]
    network_output = get_output(
        output_layer, deterministic=deterministic, **get_output_kw)
    loss = aggregate(loss_function(network_output, target))
    if l1:
        loss += regularization.regularize_layer_params(
            layers.values(), regularization.l1) * l1
    if l2:
        loss += regularization.regularize_layer_params(
            layers.values(), regularization.l2) * l2
    return loss
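This is the shape of the default objective used by nolearn-style wrappers. A hedged sketch of how a custom variant is typically plugged in, assuming nolearn's convention that NeuralNet forwards objective_-prefixed keyword arguments to the objective function (the layer list and update settings below are placeholders, not from the source):

# Hypothetical wiring, for illustration only.
from nolearn.lasagne import NeuralNet
from lasagne.updates import nesterov_momentum

net = NeuralNet(
    layers=my_layers,          # assumed: a layer definition list
    objective=objective,       # the function defined above
    objective_l2=1e-4,         # forwarded as objective(..., l2=1e-4)
    update=nesterov_momentum,
    update_learning_rate=0.01,
    max_epochs=10,
)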
def bc_with_ranking(y_prediction, y_true):
    """Combine a ranking loss with the usual log loss."""
    # clip the probabilities to keep the log loss numerically stable
    y_pred_clipped = T.clip(y_prediction, _EPSILON, 1. - _EPSILON)

    # first get the log loss like normal (on the clipped predictions)
    # logloss = aggregate(T.nnet.binary_crossentropy(y_pred, y_true))
    logloss = aggregate(T.nnet.categorical_crossentropy(y_pred_clipped, y_true))
    y_pred = y_pred_clipped[:, 1]

    # next, build a rank loss:
    # translate into the raw scores before the logit
    y_pred_score = T.log(y_pred / (1. - y_pred))

    # determine what the maximum score for a zero outcome is
    max_zerooutcome = T.max(y_pred_score * (y_true < 1.))
    mean_oneoutcome = T.mean(y_pred_score * (y_true > 0.1))
    border = ifelse(T.gt(max_zerooutcome, mean_oneoutcome),
                    mean_oneoutcome, max_zerooutcome)
    # determine how much each score is above or below the border
    rankloss = y_pred_score - border
    # only keep losses for positive outcomes
    rankloss = rankloss * y_true
    # only keep losses where the score is below the border
    rankloss = T.sqr(T.clip(rankloss, -100., 0.))
    # average the loss over just the positive outcomes
    rankloss = T.sum(rankloss) / (T.sum(y_true > 0.1) + 1.)

    # determine what the minimum score for a one outcome is
    min_oneoutcome = T.min(y_pred_score * (y_true > 0.1))
    mean_zerooutcome = T.mean(y_pred_score * (y_true < 1.))
    border = ifelse(T.lt(min_oneoutcome, mean_zerooutcome),
                    mean_zerooutcome, min_oneoutcome)
    # determine how much each score is above or below the border
    rankloss_ = y_pred_score - border
    # only keep losses for negative outcomes
    rankloss_ = rankloss_ * (1. - y_true)
    # only keep losses where the score is above the border
    rankloss_ = T.sqr(T.clip(rankloss_, 0., 100.))
    # average the loss over just the negative outcomes
    rankloss_ = T.sum(rankloss_, axis=0) / (T.sum(y_true < 1.) + 1.)

    # return (rankloss + 1) * logloss  # an alternative to try
    return 0.01 * rankloss_ + 0.01 * rankloss + logloss
def build_model(self, train_set, test_set, validation_set=None):
    super(CAE, self).build_model(train_set, test_set, validation_set)

    y_train = get_output(self.model, self.sym_x)
    loss = aggregate(squared_error(y_train, self.sym_x), mode='mean')
    loss += 1e-4 * lasagne.regularization.regularize_network_params(
        self.model, lasagne.regularization.l2)

    y_test = get_output(self.model, self.sym_x, deterministic=True)
    loss_test = aggregate(squared_error(y_test, self.sym_x), mode='mean')

    grads_collect = T.grad(loss, self.trainable_model_params)
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    clip_grad, max_norm = 1, 5
    mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
    mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = adam(mgrads, self.trainable_model_params, self.sym_lr,
                   sym_beta1, sym_beta2)

    # Training function
    x_batch = self.sh_train_x[self.batch_slice]
    givens = {self.sym_x: x_batch}
    inputs = [self.sym_index, self.sym_batchsize, self.sym_lr,
              sym_beta1, sym_beta2]
    outputs = [loss]
    f_train = theano.function(inputs=inputs, outputs=outputs,
                              givens=givens, updates=updates)

    # Validation and test function
    givens = {self.sym_x: self.sh_test_x}
    f_test = theano.function(inputs=[], outputs=[loss_test], givens=givens)

    self.train_args['inputs']['batchsize'] = 128
    self.train_args['inputs']['learningrate'] = 1e-3
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 1e-6
    self.train_args['outputs']['loss'] = '%0.6f'
    self.test_args['outputs']['loss_test'] = '%0.6f'

    return f_train, f_test, None, self.train_args, self.test_args, self.validate_args
def _get_loss_function(self):
    if self._loss is None:
        if self._regression:
            self._loss = squared_error
        else:
            self._loss = categorical_crossentropy
    return aggregate(self._loss(self._get_output(), self.t_label),
                     mode='mean')
def __call__(self, layers, loss_function, target, aggregate=T.mean, **kwargs):
    output_layer = layers[-1]
    network_output = get_output(output_layer, **kwargs)
    # Negated so that maximising AUC corresponds to minimising the objective.
    return -aggregate(self.auc_error(network_output[:, 1], target))
def compute_cost(rnn_outputs, forward_probabilities, backward_pointers,
                 x_end, y_end, label):
    def backward_step(backlinks, position):
        new_position = backlinks[position]
        return new_position, position

    initial_state = T.argmax(
        forward_probabilities[x_end - 1, y_end - 2:y_end]) + y_end - 2
    results, _ = theano.scan(fn=backward_step,
                             sequences=backward_pointers[0:x_end, :],
                             outputs_info=[initial_state, None],
                             go_backwards=True)
    alignment = label[results[1][::-1]]
    return aggregate(categorical_crossentropy(rnn_outputs[0:x_end], alignment),
                     mode='sum')
def __init__(self, num_features, num_layers, num_nodes, dropout,
             learning_rate, weight_decay, verbose=False):
    self.verbose = verbose
    self.input_var = T.matrix('inputs')
    self.target_var = T.ivector('targets')
    self.num_features = num_features
    self.num_layers = num_layers
    self.num_nodes = num_nodes
    self.dropout = dropout

    self.network = self.build_network()
    self.prediction = lasagne.layers.get_output(self.network,
                                                deterministic=True)
    self.predict_function = theano.function([self.input_var],
                                            self.prediction,
                                            allow_input_downcast=True)
    self.loss = categorical_crossentropy(self.prediction, self.target_var)
    self.loss = aggregate(self.loss, mode='mean')

    if not os.path.exists('models'):
        os.mkdir('models')

    # L2 regularization with weight decay
    weightsl2 = lasagne.regularization.regularize_network_params(
        self.network, lasagne.regularization.l2)
    weightsl1 = lasagne.regularization.regularize_network_params(
        self.network, lasagne.regularization.l1)
    self.loss += weight_decay * weightsl2  # + 1e-5 * weightsl1

    # Adagrad training (Adam and Nesterov momentum left as alternatives)
    params = lasagne.layers.get_all_params(self.network, trainable=True)
    updates = lasagne.updates.adagrad(self.loss, params,
                                      learning_rate=learning_rate)
    # updates = lasagne.updates.adam(self.loss, params)
    # updates = lasagne.updates.nesterov_momentum(self.loss, params,
    #     learning_rate=learning_rate, momentum=momentum)
    self.train = theano.function([self.input_var, self.target_var],
                                 self.loss, updates=updates)
    self.create_test_function()
    self.create_bayes_test_function()
def get_network(model):
    input_data = tensor.dmatrix('x')
    targets_var = tensor.dmatrix('y')
    network = layers.InputLayer((model['batch_size'], model['input_vars']),
                                input_data)
    nonlin = nonlinearities.rectify
    if model['hidden_nonlinearity'] != 'ReLu':
        nonlin = nonlinearities.tanh
    prev_layer = network
    for l in range(model['nlayers']):
        fc = layers.DenseLayer(prev_layer, model['units'],
                               nonlinearity=nonlin)
        if model['dropout']:
            fc = layers.DropoutLayer(fc, 0.5)
        prev_layer = fc
    output_lin = None
    if model['output_mode'] == OUTPUT_LOG:
        output_lin = nonlinearities.tanh
    output_layer = layers.DenseLayer(prev_layer, 1, nonlinearity=output_lin)
    predictions = layers.get_output(output_layer)

    if model['output_mode'] == OUTPUT_BOUNDED:
        (minth, maxth) = model['maxmin'][model['control']]
        maxt = theano.shared(np.ones((model['batch_size'], 1)) * maxth)
        mint = theano.shared(np.ones((model['batch_size'], 1)) * minth)
        predictions = tensor.min(tensor.concatenate([maxt, predictions],
                                                    axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))
        predictions = tensor.max(tensor.concatenate([mint, predictions],
                                                    axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))

    loss = objectives.squared_error(predictions, targets_var)
    loss = objectives.aggregate(loss, mode='mean')
    params = layers.get_all_params(output_layer)
    test_prediction = layers.get_output(output_layer, deterministic=True)
    test_loss = objectives.squared_error(test_prediction, targets_var)
    test_loss = test_loss.mean()
    updates_sgd = updates.sgd(loss, params, learning_rate=model['lr'])
    ups = updates.apply_momentum(updates_sgd, params, momentum=0.9)
    train_fn = theano.function([input_data, targets_var], loss, updates=ups)
    pred_fn = theano.function([input_data], predictions)
    val_fn = theano.function([input_data, targets_var], test_loss)
    return {'train': train_fn, 'eval': val_fn, 'pred': pred_fn,
            'layers': output_layer}
def build_loss(targets, prediction, optimization):
    """Set up the loss function for the chosen objective."""
    if optimization["objective"] == 'categorical':
        loss = objectives.categorical_crossentropy(prediction, targets)
    elif optimization["objective"] == 'binary':
        prediction = T.clip(prediction, 1e-7, 1 - 1e-7)
        loss = -(targets * T.log(prediction) +
                 (1.0 - targets) * T.log(1.0 - prediction))
        # loss = objectives.binary_crossentropy(prediction[:, loss_index],
        #                                       targets[:, loss_index])
    elif optimization["objective"] == 'squared_error':
        loss = objectives.squared_error(prediction, targets)
    else:
        raise ValueError("unsupported objective: %s"
                         % optimization["objective"])
    loss = objectives.aggregate(loss, mode='mean')
    return loss
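A hypothetical call, purely for illustration, showing the expected shape of the optimization dict (the keys are taken from the branches above):

# Illustrative only: wires build_loss up for a binary task.
import theano.tensor as T

targets = T.matrix('targets')
prediction = T.matrix('prediction')
optimization = {"objective": 'binary'}  # or 'categorical' / 'squared_error'
loss = build_loss(targets, prediction, optimization)  # scalar mean loss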
def objective(layers, loss_function, target, aggregate=aggregate,
              deterministic=False, get_output_kw=None):
    # NOTE: l1 and l2 are not parameters here; they are assumed to be
    # module-level constants in the originating file.
    if get_output_kw is None:
        get_output_kw = {}
    output_layer = layers[-1]
    first_layer = layers[1]
    network_output = lasagne.layers.get_output(
        output_layer, deterministic=deterministic, **get_output_kw)
    if not deterministic:
        losses = loss_function(network_output, target) \
            + l2 * regularization.regularize_network_params(
                output_layer, regularization.l2) \
            + l1 * regularization.regularize_layer_params(
                first_layer, regularization.l1)
    else:
        losses = loss_function(network_output, target)
    return aggregate(losses)
def get_functions():
    input_layer = layers.InputLayer(shape=(BATCH_SIZE, INPUT_LENGTH))
    print("input_layer size: " + str(input_layer.shape[0]) + "," +
          str(input_layer.shape[1]))
    layer = input_layer
    for layer_num in range(len(NUM_UNITS_HIDDEN_LAYER)):
        print("layer_num-" + str(layer_num))
        layer = layers.DenseLayer(layer,
                                  num_units=NUM_UNITS_HIDDEN_LAYER[layer_num],
                                  W=lasagne.init.Normal(0.01),
                                  nonlinearity=nonlinearities.tanh)
    output_layer = layers.DenseLayer(layer, num_units=OUTPUT_SIZE,
                                     nonlinearity=nonlinearities.softmax)
    network_output = get_output(output_layer)
    expected_output = T.ivector()
    loss_train = aggregate(
        categorical_crossentropy(network_output, expected_output),
        mode='mean')
    all_weights = layers.get_all_params(output_layer)
    update_rule = lasagne.updates.nesterov_momentum(
        loss_train, all_weights, learning_rate=LEARNING_RATE)
    print("input_layer_end size: " + str(input_layer.shape[0]) + "," +
          str(input_layer.shape[1]))
    train_function = theano.function(
        inputs=[input_layer.input_var, expected_output],
        outputs=loss_train, updates=update_rule,
        allow_input_downcast=True)
    prediction = T.argmax(network_output, axis=1)
    accuracy = T.mean(T.eq(prediction, expected_output),
                      dtype=theano.config.floatX)  # @UndefinedVariable
    test_function = theano.function(
        inputs=[input_layer.input_var, expected_output],
        outputs=[loss_train, accuracy, prediction],
        allow_input_downcast=True)
    output_function = theano.function([input_layer.input_var],
                                      get_output(output_layer),
                                      allow_input_downcast=True)
    return train_function, test_function, output_function
def grad_supervised(l_ram, labels):
    """
    return:
        loss = 1 / M * sum_i_{1..M} cross_entropy_loss(groundtruth, a_T)
        grads = theano.grad(loss, params)
    inputs:
        labels = (n_batch,) [theano tensor variable]
    """
    loc_mean_t, loc_t, h_t, prob, pred = lasagne.layers.get_output(l_ram)
    params = lasagne.layers.get_all_params(l_ram, trainable=True)

    # loss estimation (cross-entropy loss)
    loss = categorical_crossentropy(prob, labels)
    loss = aggregate(loss, mode='mean')

    # gradient estimation
    grads = theano.grad(loss, params, disconnected_inputs='ignore')
    return loss, grads
def weight_decay_objective(layers, loss_function, target,
                           penalty_conv=1e-8, penalty_conv_type=l2,
                           penalty_output=1e-8, penalty_output_type=l2,
                           aggregate=aggregate, deterministic=False,
                           get_output_kw=None):
    '''Defines L2 weight decay on network weights.'''
    if get_output_kw is None:  # avoid a shared mutable default argument
        get_output_kw = {}
    net_out = get_output(layers[-1], deterministic=deterministic,
                         **get_output_kw)
    loss = loss_function(net_out, target)
    p1 = penalty_conv * regularize_layer_params(layers[1], penalty_conv_type)
    p2 = penalty_output * regularize_layer_params(layers[-1],
                                                  penalty_output_type)
    losses = loss + p1 + p2
    return aggregate(losses)
def objective(output_layer, regularize_layers, target,
              loss_function=squared_error, aggregate=aggregate,
              deterministic=False, l1=0, l2=0, tv=0):
    network_output = layers.get_output(output_layer,
                                       deterministic=deterministic)
    loss = aggregate(loss_function(network_output, target))
    for layer in regularize_layers:
        if l1:
            loss += regularization.regularize_layer_params(
                layer, regularization.l1) * l1
        if l2:
            loss += regularization.regularize_layer_params(
                layer, regularization.l2) * l2
    if tv:
        loss += T.mean(T.abs_(network_output[:, 1:] -
                              network_output[:, :-1])) * tv
    return loss
def test_aggregate_sum():
    from lasagne.objectives import aggregate
    x = theano.tensor.matrix('x')
    assert theano.gof.graph.is_same_graph(aggregate(x, mode='sum'), x.sum())
def train(options):
    # -------- setup options and data ------------------
    np.random.seed(options['seed'])

    # Load options
    host = socket.gethostname()  # get computer hostname
    start_time = datetime.datetime.now().strftime("%y-%m-%d-%H-%M")

    model = importlib.import_module(options['model_file'])

    # ---------- build model and compile ---------------
    input_batch = T.tensor4()  # input image sequences
    target = T.tensor4()       # target image

    print('Build model...')
    model = model.Model(**options['modelOptions'])

    print('Compile ...')
    net, outputs, filters = model.build_model(input_batch)

    # compute loss
    outputs = get_output(outputs + [filters])
    output_frames = outputs[:-1]
    output_filter = outputs[-1]

    train_losses = []
    for i in range(options['modelOptions']['target_seqlen']):
        output_frame = output_frames[i]
        if options['loss'] == 'squared_error':
            frame_loss = squared_error(output_frame, target[:, [i], :, :])
        elif options['loss'] == 'binary_crossentropy':
            # Clipping to avoid NaN's in binary crossentropy:
            # https://github.com/Lasagne/Lasagne/issues/436
            output_frame = T.clip(output_frame, np.finfo(np.float32).eps,
                                  1 - np.finfo(np.float32).eps)
            frame_loss = binary_crossentropy(output_frame,
                                             target[:, [i], :, :])
        else:
            assert False
        train_losses.append(aggregate(frame_loss))
    train_loss = sum(train_losses) / options['modelOptions']['target_seqlen']

    # update
    sh_lr = theano.shared(lasagne.utils.floatX(options['learning_rate']))  # to allow a dynamic learning rate
    layers = get_all_layers(net)
    all_params = get_all_params(layers, trainable=True)
    updates = adam(train_loss, all_params, learning_rate=sh_lr)
    _train = theano.function([input_batch, target], train_loss,
                             updates=updates, allow_input_downcast=True)
    _test = theano.function([input_batch, target],
                            [train_loss, output_filter] + output_frames,
                            allow_input_downcast=True)

    # ------------ data setup ----------------
    print('Prepare data...')
    dataset = importlib.import_module(options['dataset_file'])
    dh = dataset.DataHandler(**options['datasetOptions'])

    # ------------ training setup ----------------
    if options['pretrained_model_path'] is not None:
        checkpoint = pickle.load(open(options['pretrained_model_path'], 'rb'))
        model_values = checkpoint['model_values']  # overwrite the values of model parameters
        lasagne.layers.set_all_param_values(layers, model_values)
        history_train = checkpoint['history_train']
        start_epoch = checkpoint['epoch'] + 1
        options['batch_size'] = checkpoint['options']['batch_size']
        sh_lr.set_value(floatX(checkpoint['options']['learning_rate']))
    else:
        start_epoch = 0
        history_train = []

    # ------------ actual training ----------------
    print('Start training ...')
    input_seqlen = options['modelOptions']['input_seqlen']
    for epoch in range(start_epoch, start_epoch + options['num_epochs']):
        epoch_start_time = time.time()
        history_batch = []
        for batch_index in range(0, options['batches_per_epoch']):
            batch = dh.GetBatch()  # generate data on the fly
            if options['dataset_file'] == 'datasets.stereoCarsColor':
                batch_input = batch[..., :input_seqlen].squeeze(axis=4)   # first frames
                batch_target = batch[..., input_seqlen:].squeeze(axis=4)  # last frame
            else:
                batch_input = batch[..., :input_seqlen].transpose(0, 4, 2, 3, 1).squeeze(axis=4)   # first frames
                batch_target = batch[..., input_seqlen:].transpose(0, 4, 2, 3, 1).squeeze(axis=4)  # last frame

            # train
            loss_train = _train(batch_input, batch_target)
            history_batch.append(loss_train)
            print("Epoch {} of {}, batch {} of {}, took {:.3f}s".format(
                epoch + 1, options['num_epochs'], batch_index + 1,
                options['batches_per_epoch'],
                time.time() - epoch_start_time))
            print("  training loss:\t{:.6f}".format(loss_train.item()))

        # clear the screen
        display.clear_output(wait=True)

        # print statistics
        history_train.append(np.mean(history_batch))
        history_batch = []
        print("Epoch {} of {}, took {:.3f}s".format(
            epoch + 1, options['num_epochs'],
            time.time() - epoch_start_time))
        print("  training loss:\t{:.6f}".format(history_train[epoch].item()))

        # set new learning rate (maybe this is unnecessary with adam updates)
        if (epoch + 1) % options['decay_after'] == 0:
            options['learning_rate'] = sh_lr.get_value() * 0.5
            print("New LR:", options['learning_rate'])
            sh_lr.set_value(floatX(options['learning_rate']))

        # save the model
        if (epoch + 1) % options['save_after'] == 0:
            save_model(layers, epoch, history_train, start_time, host,
                       options)
            print("Model saved")
def modifiedObjective(layers, loss_function, target, aggregate=aggregate,
                      deterministic=False, l1=0, l2=0, logitSens=0,
                      probSens=0, lossSens=0, std=None, get_output_kw=None):
    """
    Modified implementation of the NeuralNet objective.

    :param layers: The underlying layers of the NeuralNetwork
    :param loss_function: The callable loss function to use
    :param target: the expected output
    :param aggregate: the aggregation function to use
    :param deterministic: Whether or not to get a deterministic output
    :param l1: Optional l1 regularization parameter
    :param l2: Optional l2 regularization parameter
    :param logitSens: Optional logit sensitivity regularization parameter
    :param probSens: Optional probability sensitivity regularization parameter
    :param lossSens: Optional loss sensitivity regularization parameter
    :param get_output_kw: optional kwargs to pass to
                          :meth:`NeuralNetwork.get_output`
    :return: The total calculated loss
    """
    if get_output_kw is None:
        get_output_kw = {}
    output_layer = layers[-1]
    logit_layer = layers[-2]
    input_layer = layers[0]
    network_input = input_layer.input_var
    network_output = get_output(output_layer, deterministic=deterministic,
                                **get_output_kw)
    logit_output = get_output(logit_layer, deterministic=deterministic,
                              **get_output_kw)

    L = loss_function(
        network_output,
        lasagne.utils.one_hot(target, output_layer.output_shape[1]))
    loss = aggregate(L)

    if l1:
        loss += regularization.regularize_layer_params(
            layers.values(), regularization.l1) * l1
    if l2:
        loss += regularization.regularize_layer_params(
            layers.values(), regularization.l2) * l2

    # logit sensitivity
    if logitSens:
        logit = T.sum(
            logit_output * lasagne.utils.one_hot(
                target, output_layer.output_shape[1]),
            axis=1)
        G_logit = T.grad(T.sum(logit), network_input)
        if std is not None:
            G_logit = std * G_logit
        # Sparse saliency regularization
        absG_logit = T.abs_(G_logit)
        sumAbsG_logit = T.sum(absG_logit, axis=(1, 2, 3))
        loss += aggregate(sumAbsG_logit) * logitSens

    # probability sensitivity
    if probSens:
        prob = T.sum(
            network_output * lasagne.utils.one_hot(
                target, output_layer.output_shape[1]),
            axis=1)
        G_prob = T.grad(T.sum(prob), network_input)
        if std is not None:
            G_prob = std * G_prob
        # Sparse saliency regularization
        absG_prob = T.abs_(G_prob)
        sumAbsG_prob = T.sum(absG_prob, axis=(1, 2, 3))
        loss += aggregate(sumAbsG_prob) * probSens

    # loss sensitivity
    if lossSens:
        G_loss = theano.grad(T.sum(L), network_input)
        if std is not None:
            G_loss = std * G_loss
        absG_loss = T.abs_(G_loss)
        loss += aggregate(T.sum(absG_loss, axis=(1, 2, 3))) * lossSens

    # Double backpropagation, uncomment if desired
    # sqG = G ** 2
    # sumSqG = T.sum(sqG, axis=(1, 2, 3))
    # loss += aggregate(sumSqG) * tv

    return loss
def test_aggregate_weighted_normalized_sum():
    from lasagne.objectives import aggregate
    x = theano.tensor.matrix('x')
    w = theano.tensor.matrix('w')
    assert theano.gof.graph.is_same_graph(aggregate(x, w, 'normalized_sum'),
                                          (x * w).sum() / w.sum())
def loss(x, t):
    return aggregate(binary_crossentropy(x, t))
def run_network(data=None, num_epochs=10, ratio=0.5):
    try:
        global_start_time = time()
        sequence_length = 50
        batchsize = 512
        path_to_dataset = 'household_power_consumption.txt'

        # Loading the data
        if data is None:
            print('Loading data... ')
            X_train, y_train, X_test, y_test = data_power_consumption(
                path_to_dataset, sequence_length, ratio)
        else:
            X_train, y_train, X_test, y_test = data

        val_ratio = 0.005
        val_rows = int(round(val_ratio * X_train.shape[0]))
        X_val = X_train[:val_rows]
        y_val = y_train[:val_rows]
        y_val = np.reshape(y_val, (y_val.shape[0], 1))
        X_train = X_train[val_rows:]
        y_train = y_train[val_rows:]

        # Creating the Theano variables
        input_var = T.tensor3('inputs')
        target_var = T.matrix('targets')

        # Building the Theano expressions on these variables
        network = build_model(input_var)
        prediction = lasagne.layers.get_output(network)
        loss = squared_error(prediction, target_var)
        loss = aggregate(loss)
        params = lasagne.layers.get_all_params(network, trainable=True)
        updates = rmsprop(loss, params, learning_rate=0.001)
        test_prediction = lasagne.layers.get_output(network,
                                                    deterministic=True)
        test_loss = squared_error(test_prediction, target_var)
        test_loss = aggregate(test_loss)

        # Compiling the graph by declaring the Theano functions
        compile_time = time()
        print('Data:')
        print('X_train ', X_train.shape, ' y_train ', y_train.shape)
        print('X_val ', X_val.shape, ' y_val ', y_val.shape)
        print('X_test ', X_test.shape, ' y_test ', y_test.shape)
        print("Compiling...")
        train_fn = theano.function([input_var, target_var], loss,
                                   updates=updates)
        val_fn = theano.function([input_var, target_var], test_loss)
        get_pred_fn = theano.function([input_var], prediction)
        print("Compiling time : ", time() - compile_time)

        # For loop that goes each time through the whole training
        # and validation data
        #
        # T R A I N I N G
        # - - - - - - - -
        print("Starting training...\n")
        for epoch in range(num_epochs):
            # Going over the training data
            train_err = 0
            train_batches = 0
            start_time = time()
            nb_batches = X_train.shape[0] // batchsize
            time_line = np.zeros(nb_batches)
            for batch in iterate_minibatches(X_train, y_train, batchsize,
                                             shuffle=True):
                current_time = time()
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                train_batches += 1
                str_out = "\rTrain Batch " + str(train_batches)
                str_out += "/" + str(nb_batches)
                str_out += " | Loss : " + str(train_err / train_batches)[:7]
                str_out += " | Remaining time (s) : "
                remaining_seconds = time() - current_time
                remaining_seconds *= (nb_batches - train_batches)
                time_line[train_batches - 1] = round(remaining_seconds)
                if (train_batches - 1) % 5 == 0:
                    durations = time_line[train_batches - 1:
                                          train_batches + 50]
                    durations = np.mean([t for t in durations if t > 0])
                    str_out += str(durations)
                sys.stdout.write(str_out)
                sys.stdout.flush()

            print("\nGoing through validation data")
            # Going over the validation data
            val_err = 0
            val_batches = 0
            for batch in iterate_minibatches(X_val, y_val, batchsize,
                                             shuffle=False):
                inputs, targets = batch
                err = val_fn(inputs, targets)
                val_err += err
                val_batches += 1

            # Then we print the results for this epoch
            # (train_batches, not train_batches - 1, because it started at 1)
            print("training loss:\t\t\t" + str(train_err / train_batches))
            print("validation loss:\t\t" + str(val_err / val_batches))
            print("Epoch {} of {} took {:.3f}s \n\n".format(
                epoch + 1, num_epochs, time() - start_time))

        # Now that the training is over, let's test the network:
        test_err = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test, y_test, batchsize,
                                         shuffle=False):
            inputs, targets = batch
            err = val_fn(inputs, targets)
            test_err += err
            test_batches += 1
        print("\nFinal results in {0} seconds:".format(
            time() - global_start_time))
        print("Test loss:\t\t\t{:.6f}".format(test_err / test_batches))

        prediction_size = 200
        predicted = get_pred_fn(X_test[:prediction_size])
        try:
            plt.plot(predicted)
            # the original plotted the single element y_test[prediction_size]
            plt.plot(y_test[:prediction_size])
            plt.show(block=False)
        except Exception as e:
            print(str(e))
            print("predicted = ", repr(
                np.reshape(predicted[:prediction_size],
                           (prediction_size,))))
            print('\n')
            print("y = ", repr(
                np.reshape(y_test[:prediction_size], (prediction_size,))))
        return network
    except KeyboardInterrupt:
        return network
def aggregated_loss_func(prediction, target, weights=None):
    # loss_func and loss_aggregation_mode are free variables taken from
    # the enclosing scope.
    loss = loss_func(prediction, target)
    return aggregate(loss, mode=loss_aggregation_mode, weights=weights)
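The two free variables suggest this function is returned by a closure. A hedged sketch of what such a factory could look like; the factory name and defaults are assumptions, not from the source:

# Hypothetical factory, for illustration: builds an aggregated loss from a
# per-example Lasagne loss and an aggregation mode.
from lasagne.objectives import aggregate, squared_error

def make_aggregated_loss(loss_func=squared_error,
                         loss_aggregation_mode='mean'):
    def aggregated_loss_func(prediction, target, weights=None):
        loss = loss_func(prediction, target)
        return aggregate(loss, mode=loss_aggregation_mode, weights=weights)
    return aggregated_loss_func

mse = make_aggregated_loss()  # mean squared error over the batch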
def do_regression(num_epochs=2,          # No. of epochs to train
                  init_file=None,        # Saved parameters to initialise training
                  epoch_size=680780,     # Whole dataset size
                  valid_size=34848,
                  train_batch_multiple=10637,  # No. of minibatches per batch
                  valid_batch_multiple=1089,   # No. of minibatches per batch
                  train_minibatch_size=64,
                  valid_minibatch_size=32,
                  eval_multiple=50,      # No. of minibatches to ave. in report
                  save_model=True,
                  input_width=19,
                  rng_seed=100009,
                  cross_val=0,           # Cross-validation subset label
                  dataver=1,             # Label for different runs/architectures/etc
                  rate_init=1.0,
                  rate_decay=0.999983):

    ###################################################
    ################# 0. User inputs ##################
    ###################################################
    for i in range(1, len(sys.argv)):
        if sys.argv[i].startswith('-'):
            option = sys.argv[i][1:]
            if option == 'i':
                init_file = sys.argv[i + 1]
            elif option[0:2] == 'v=':
                dataver = int(option[2:])
            elif option[0:3] == 'cv=':
                cross_val = int(option[3:])
            elif option[0:3] == 'rs=':
                rng_seed = int(option[3:])
            elif option[0:3] == 'ri=':
                rate_init = np.float32(option[3:])
            elif option[0:3] == 'rd=':
                rate_decay = np.float32(option[3:])

    print("Running with dataver %s" % (dataver))
    print("Running with cross_val %s" % (cross_val))

    ###################################################
    ############# 1. Housekeeping values ##############
    ###################################################
    # Batch size is possibly not equal to epoch size due to memory limits
    train_batch_size = train_batch_multiple * train_minibatch_size
    assert epoch_size >= train_batch_size

    # Number of times we expect the training/validation generator to be called
    max_train_gen_calls = (num_epochs * epoch_size) // train_batch_size

    # Number of evaluations (total minibatches // eval_multiple)
    num_eval = max_train_gen_calls * train_batch_multiple // eval_multiple

    ###################################################
    ###### 2. Define model and theano variables #######
    ###################################################
    if rng_seed is not None:
        print("Setting RandomState with seed=%i" % (rng_seed))
        rng = np.random.RandomState(rng_seed)
        set_rng(rng)

    print("Defining variables...")
    index = T.lscalar()  # Minibatch index
    x = T.tensor3('x')   # Inputs
    y = T.fvector('y')   # Target

    print("Defining model...")
    network_0 = build_1Dregression_v1(
        input_var=x,
        input_width=input_width,
        nin_units=12,
        h_num_units=[64, 128, 256, 128, 64],
        h_grad_clip=1.0,
        output_width=1
    )

    if init_file is not None:
        print("Loading initial model parameters...")
        init_model = np.load(init_file)
        init_params = init_model[init_model.files[0]]
        LL.set_all_param_values([network_0], init_params)

    ###################################################
    ################ 3. Import data ###################
    ###################################################
    # Loading data generation model parameters
    print("Defining shared variables...")
    train_set_y = theano.shared(np.zeros(1, dtype=theano.config.floatX),
                                borrow=True)
    train_set_x = theano.shared(np.zeros((1, 1, 1),
                                         dtype=theano.config.floatX),
                                borrow=True)
    valid_set_y = theano.shared(np.zeros(1, dtype=theano.config.floatX),
                                borrow=True)
    valid_set_x = theano.shared(np.zeros((1, 1, 1),
                                         dtype=theano.config.floatX),
                                borrow=True)

    # Validation data (pick a single augmented instance, rand0 here)
    print("Creating validation data...")
    chunk_valid_data = np.load(
        "./valid/data_valid_augmented_cv%s_t%s_rand0.npy"
        % (cross_val, input_width)
    ).astype(theano.config.floatX)
    chunk_valid_answers = np.load(
        "./valid/data_valid_expected_cv%s.npy"
        % (cross_val)
    ).astype(theano.config.floatX)

    print("chunk_valid_answers.shape", chunk_valid_answers.shape)
    print("Assigning validation data...")
    valid_set_y.set_value(chunk_valid_answers[:])
    valid_set_x.set_value(chunk_valid_data.transpose(0, 2, 1))

    # Create output directory
    if not os.path.exists("output_cv%s_v%s" % (cross_val, dataver)):
        os.makedirs("output_cv%s_v%s" % (cross_val, dataver))

    ###################################################
    ########### 4. Create Loss expressions ############
    ###################################################
    print("Defining loss expressions...")
    prediction_0 = LL.get_output(network_0)
    train_loss = aggregate(T.abs_(prediction_0 - y.dimshuffle(0, 'x')))

    valid_prediction_0 = LL.get_output(network_0, deterministic=True)
    valid_loss = aggregate(T.abs_(valid_prediction_0 - y.dimshuffle(0, 'x')))

    ###################################################
    ############ 5. Define update method ##############
    ###################################################
    print("Defining update choices...")
    params = LL.get_all_params(network_0, trainable=True)
    learn_rate = T.scalar('learn_rate', dtype=theano.config.floatX)
    updates = lasagne.updates.adadelta(train_loss, params,
                                       learning_rate=learn_rate)

    ###################################################
    ######### 6. Define train/valid functions #########
    ###################################################
    print("Defining theano functions...")
    train_model = theano.function(
        [index, learn_rate],
        train_loss,
        updates=updates,
        givens={
            x: train_set_x[(index * train_minibatch_size):
                           ((index + 1) * train_minibatch_size)],
            y: train_set_y[(index * train_minibatch_size):
                           ((index + 1) * train_minibatch_size)]
        }
    )

    validate_model = theano.function(
        [index],
        valid_loss,
        givens={
            x: valid_set_x[index * valid_minibatch_size:
                           (index + 1) * valid_minibatch_size],
            y: valid_set_y[index * valid_minibatch_size:
                           (index + 1) * valid_minibatch_size]
        }
    )

    ###################################################
    ################ 7. Begin training ################
    ###################################################
    print("Begin training...")
    sys.stdout.flush()

    cum_iterations = 0
    this_train_loss = 0.0
    this_valid_loss = 0.0
    best_valid_loss = np.inf
    best_iter = 0

    train_eval_scores = np.empty(num_eval)
    valid_eval_scores = np.empty(num_eval)
    eval_index = 0
    aug_index = 0

    for batch in range(max_train_gen_calls):
        start_time = time.time()
        chunk_train_data = np.load(
            "./train/data_train_augmented_cv%s_t%s_rand%s.npy"
            % (cross_val, input_width, aug_index)
        ).astype(theano.config.floatX)
        chunk_train_answers = np.load(
            "./train/data_train_expected_cv%s.npy"
            % (cross_val)
        ).astype(theano.config.floatX)

        train_set_y.set_value(chunk_train_answers[:])
        train_set_x.set_value(chunk_train_data.transpose(0, 2, 1))

        # Iterate over minibatches in each batch
        for mini_index in range(train_batch_multiple):
            this_rate = np.float32(rate_init * (rate_decay ** cum_iterations))
            this_train_loss += train_model(mini_index, this_rate)
            cum_iterations += 1

            # Report loss
            if (cum_iterations % eval_multiple == 0):
                this_train_loss = this_train_loss / eval_multiple
                this_valid_loss = np.mean([validate_model(i) for i in
                                           range(valid_batch_multiple)])
                train_eval_scores[eval_index] = this_train_loss
                valid_eval_scores[eval_index] = this_valid_loss

                # Save report every five evaluations
                if ((eval_index + 1) % 5 == 0):
                    np.savetxt(
                        "output_cv%s_v%s/training_scores.txt"
                        % (cross_val, dataver),
                        train_eval_scores, fmt="%.5f"
                    )
                    np.savetxt(
                        "output_cv%s_v%s/validation_scores.txt"
                        % (cross_val, dataver),
                        valid_eval_scores, fmt="%.5f"
                    )
                    np.savetxt(
                        "output_cv%s_v%s/last_learn_rate.txt"
                        % (cross_val, dataver),
                        [np.array(this_rate)], fmt="%.5f"
                    )

                # Save model if best validation score
                if (this_valid_loss < best_valid_loss):
                    best_valid_loss = this_valid_loss
                    best_iter = cum_iterations - 1
                    if save_model:
                        np.savez("output_cv%s_v%s/model.npz"
                                 % (cross_val, dataver),
                                 LL.get_all_param_values(network_0))

                # Reset evaluation reports
                eval_index += 1
                this_train_loss = 0.0
                this_valid_loss = 0.0

        aug_index += 1

        end_time = time.time()
        print("Computing time for batch %d: %f"
              % (batch, end_time - start_time))

    print("Best validation loss %f after %d epochs"
          % (best_valid_loss,
             (best_iter * train_minibatch_size // epoch_size)))

    del train_set_x, train_set_y, valid_set_x, valid_set_y
    gc.collect()

    return None
def run_mlp(train, val, num_epochs):
    # Partition data
    train_rows, train_cols = train.shape
    train_rows, train_cols = train_rows, (train_cols - 1)
    val_rows, val_cols = val.shape
    val_rows, val_cols = val_rows, (val_cols - 1)
    X_train, y_train = (train[0:train_rows, 0:train_cols],
                        train[0:train_rows, train_cols:])
    X_val, y_val = val[0:val_rows, 0:val_cols], val[0:val_rows, val_cols:]

    # Theano variables
    input_var = T.matrix('inputs')
    target_var = T.matrix('targets')
    network = build_mlp(input_var, train_cols, 1)

    # """loading weight values from the previous model"""
    # with np.load('model_first_run.npz') as f:
    #     param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    # param_values[0] = param_values[0][4:43]
    # lasagne.layers.set_all_param_values(network, param_values)

    prediction = lasagne.layers.get_output(network, input_var,
                                           deterministic=True)
    loss = lasagne.objectives.binary_crossentropy(prediction, target_var)
    loss = aggregate(loss, mode='mean')
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.sgd(loss, params, learning_rate=0.5)

    # Train function
    train_fn = theano.function([input_var, target_var], loss,
                               updates=updates)
    # Validation function
    val_fn = theano.function([input_var, target_var], loss)
    # Test function
    f_test = theano.function([input_var], prediction)

    print("Starting training...")
    val_err_list, train_err_list = list(), list()
    for epoch in range(num_epochs):
        start_time = timeit.default_timer()
        train_err = 0
        train_batches = 0
        for batch in iterate_minibatches(X_train, y_train, 100,
                                         shuffle=True):
            inputs, targets = batch
            batch_error = train_fn(inputs, targets)
            train_err += batch_error
            train_batches += 1
        train_err_list.append(train_err)

        val_err = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 50, shuffle=False):
            inputs, targets = batch
            err = val_fn(inputs, targets)
            val_err += err
            val_batches += 1
        val_err_list.append(val_err)

        # Save results of the epoch
        if (epoch % 1 == 0):
            file_name = 'model_epoch_' + str(epoch) + '.npz'
            np.savez(file_name,
                     *lasagne.layers.get_all_param_values(network))
        print('Epoch: ', epoch, ' train error: ', train_err,
              ' val error: ', val_err)
        train_err_line = ('Epoch:,' + str(epoch) +
                          ", train_error:, " + str(train_err) +
                          ", val_error:, " + str(val_err) + "\n")
        file_result = ('C:\\Users\\Administrator\\Desktop\\Input_FUT'
                       '\\NN_model\\' + str(train_dt) + '\\python runs')
        filepath = os.path.join(file_result, 'train_val_err.csv')
        with open(filepath, "a") as fout:
            fout.write(train_err_line)
    end_time = timeit.default_timer()
    print(end_time - start_time)
    return val_err_list, train_err_list
def loss(prediction, target):
    return aggregate(categorical_crossentropy_logdomain(prediction, target))
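categorical_crossentropy_logdomain is not a Lasagne built-in. A common definition, sketched here as an assumption about what the helper does (predictions are log-probabilities, e.g. from a log-softmax output; targets are one-hot rows):

import theano.tensor as T

def categorical_crossentropy_logdomain(log_predictions, targets):
    # Cross-entropy when the network already outputs log-probabilities:
    # pick out the log-probability of the true class via the one-hot targets.
    return -T.sum(targets * log_predictions, axis=1)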
def get_network(model):
    input_data = tensor.dmatrix('x')
    targets_var = tensor.dmatrix('y')
    network = layers.InputLayer((model['batch_size'], model['input_vars']),
                                input_data)
    nonlin = nonlinearities.rectify
    if model['hidden_nonlinearity'] != 'ReLu':
        nonlin = nonlinearities.tanh
    prev_layer = network
    for l in range(model['nlayers']):
        if model['hidden_nonlinearity'] == 'ReLu':
            W = lasagne.init.GlorotUniform('relu')
        else:
            W = lasagne.init.GlorotUniform(1)
        fc = layers.DenseLayer(prev_layer, model['units'],
                               nonlinearity=nonlin, W=W)
        if model['dropout']:
            fc = layers.DropoutLayer(fc, 0.5)
        prev_layer = fc
    output_lin = None
    if model['output_mode'] == OUTPUT_LOG:
        output_lin = nonlinearities.tanh
    output_layer = layers.DenseLayer(prev_layer, 1, nonlinearity=output_lin)
    predictions = layers.get_output(output_layer)

    if model['output_mode'] != OUTPUT_LOG:
        (minth, maxth) = model['maxmin'][model['control']]
        maxt = theano.shared(np.ones((model['batch_size'], 1)) * maxth)
        mint = theano.shared(np.ones((model['batch_size'], 1)) * minth)
        predictions = tensor.min(tensor.concatenate([maxt, predictions],
                                                    axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))
        predictions = tensor.max(tensor.concatenate([mint, predictions],
                                                    axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))

    if model['output_mode'] == OUTPUT_NO:
        prediction_unboun = layers.get_output(output_layer)
        loss = objectives.squared_error(prediction_unboun, targets_var)
    else:
        loss = objectives.squared_error(predictions, targets_var)
    loss = objectives.aggregate(loss, mode='mean')
    params = layers.get_all_params(output_layer)
    # test_prediction = layers.get_output(output_layer, deterministic=True)  # fix for dropout
    test_loss = objectives.squared_error(predictions, targets_var)
    test_loss = test_loss.mean()
    if model['hidden_nonlinearity'] == 'ReLu':
        model['lr'] *= 0.5
    updates_sgd = updates.sgd(loss, params, learning_rate=model['lr'])
    ups = updates.apply_momentum(updates_sgd, params, momentum=0.9)
    train_fn = theano.function([input_data, targets_var], loss, updates=ups)
    pred_fn = theano.function([input_data], predictions)
    # pred_fn = theano.function([input_data], prediction_unboun)
    val_fn = theano.function([input_data, targets_var], test_loss)
    return {
        'train': train_fn,
        'eval': val_fn,
        'pred': pred_fn,
        'layers': output_layer
    }
def build_model(self, train_set, test_set, validation_set=None):
    super(UFCNN, self).build_model(train_set, test_set, validation_set)

    epsilon = 1e-8
    loss_cc = aggregate(categorical_crossentropy(
        T.clip(get_output(self.model, self.sym_x), epsilon, 1),
        self.sym_t), mode='mean')

    y = T.clip(get_output(self.model, self.sym_x, deterministic=True),
               epsilon, 1)
    loss_eval = aggregate(categorical_crossentropy(y, self.sym_t),
                          mode='mean')
    loss_acc = categorical_accuracy(y, self.sym_t).mean()

    all_params = get_all_params(self.model, trainable=True)
    grads = T.grad(loss_cc, all_params)
    for idx, param in enumerate(all_params):
        param_name = param.name
        if ('h2.W' in param_name) or ('g2.W' in param_name):
            print(param_name)
            grads[idx] *= self.l2_mask
        if ('h3.W' in param_name) or ('g3.W' in param_name):
            print(param_name)
            grads[idx] *= self.l3_mask

    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    updates = adam(grads, all_params, self.sym_lr, sym_beta1, sym_beta2)

    inputs = [self.sym_index, self.sym_batchsize, self.sym_lr,
              sym_beta1, sym_beta2]
    f_train = theano.function(
        inputs, [loss_cc],
        updates=updates,
        givens={
            self.sym_x: self.sh_train_x[self.batch_slice],
            self.sym_t: self.sh_train_t[self.batch_slice],
        },
    )

    f_test = theano.function(
        [self.sym_index, self.sym_batchsize], [loss_eval],
        givens={
            self.sym_x: self.sh_test_x[self.batch_slice],
            self.sym_t: self.sh_test_t[self.batch_slice],
        },
    )

    f_validate = None
    if validation_set is not None:
        f_validate = theano.function(
            [self.sym_index, self.sym_batchsize], [loss_eval, loss_acc],
            givens={
                self.sym_x: self.sh_valid_x[self.batch_slice],
                self.sym_t: self.sh_valid_t[self.batch_slice],
            },
        )

    self.train_args['inputs']['batchsize'] = 128
    self.train_args['inputs']['learningrate'] = 1e-3
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['outputs']['loss_cc'] = '%0.6f'

    self.test_args['inputs']['batchsize'] = 128
    self.test_args['outputs']['loss_eval'] = '%0.6f'

    self.validate_args['inputs']['batchsize'] = 128
    self.validate_args['outputs']['loss_eval'] = '%0.6f'
    self.validate_args['outputs']['loss_acc'] = '%0.6f%%'

    return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
def build_model(self, train_set, test_set, validation_set=None):
    super(FCAE, self).build_model(train_set, test_set, validation_set)

    y_train = get_output(self.model, self.sym_x)
    loss = aggregate(squared_error(y_train, self.sym_x), mode='mean')
    # loss += 1e-4 * lasagne.regularization.regularize_network_params(
    #     self.model, lasagne.regularization.l2)

    y_test = get_output(self.model, self.sym_x, deterministic=True)
    loss_test = aggregate(squared_error(y_test, self.sym_x), mode='mean')

    all_params = get_all_params(self.model, trainable=True)
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    grads = T.grad(loss, all_params)
    ngrads = lasagne.updates.total_norm_constraint(grads, 5)
    cgrads = [T.clip(g, -5, 5) for g in ngrads]
    updates = rmsprop(cgrads, all_params, self.sym_lr, sym_beta1, sym_beta2)

    inputs = [self.sym_index, self.sym_batchsize, self.sym_lr,
              sym_beta1, sym_beta2]
    f_train = theano.function(
        inputs, [loss],
        updates=updates,
        givens={
            self.sym_x: self.sh_train_x[self.batch_slice],
        },
    )

    f_test = theano.function(
        [self.sym_index, self.sym_batchsize], [loss_test],
        givens={
            self.sym_x: self.sh_test_x[self.batch_slice],
        },
        on_unused_input='ignore',
    )

    f_ae = None
    # f_ae = theano.function(
    #     [self.sym_batchsize], [y_test],
    #     givens={
    #         self.sym_x: self.sh_valid_x,
    #     },
    #     on_unused_input='ignore',
    # )

    self.train_args['inputs']['batchsize'] = 128
    self.train_args['inputs']['learningrate'] = 1e-3
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 1e-6
    self.train_args['outputs']['loss'] = '%0.6f'

    self.test_args['inputs']['batchsize'] = 128
    self.test_args['outputs']['loss_test'] = '%0.6f'

    # self.validate_args['inputs']['batchsize'] = 128
    # self.validate_args['outputs']['loss_eval'] = '%0.6f'
    # self.validate_args['outputs']['loss_acc'] = '%0.6f'

    return f_train, f_test, f_ae, self.train_args, self.test_args, self.validate_args
def msq_err(self, train_output, target_values):
    loss = squared_error(train_output, target_values)
    loss = aggregate(loss, mode='mean')
    return loss
def build_model(self, train_set, test_set, validation_set=None, weights=None):
    super(BRNN, self).build_model(train_set, test_set, validation_set)

    def brier_score(given, predicted, weight_vector, mask):
        return T.mean(
            T.power(given - predicted, 2.0).dot(weight_vector) * mask)

    epsilon = 1e-8
    mask = get_output(self.mask, self.sym_x)
    y_train = T.clip(get_output(self.model, self.sym_x), epsilon, 1)
    train_brier = brier_score(y_train, self.sym_t, weights, mask)
    train_cc = aggregate(categorical_crossentropy(y_train, self.sym_t),
                         mode='mean')
    loss_train_acc = categorical_accuracy(y_train, self.sym_t).mean()

    y_test = T.clip(get_output(self.model, self.sym_x, deterministic=True),
                    epsilon, 1)
    test_brier = brier_score(y_test, self.sym_t, weights, mask)
    test_cc = aggregate(categorical_crossentropy(y_test, self.sym_t),
                        mode='mean')
    test_acc = categorical_accuracy(y_test, self.sym_t).mean()

    all_params = get_all_params(self.model, trainable=True)
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    grads = T.grad(train_brier, all_params)
    grads = [T.clip(g, -1, 1) for g in grads]
    updates = adam(grads, all_params, self.sym_lr, sym_beta1, sym_beta2)

    inputs = [self.sym_index, self.sym_batchsize, self.sym_lr,
              sym_beta1, sym_beta2]
    f_train = theano.function(
        inputs, [train_cc, train_brier],
        updates=updates,
        givens={
            self.sym_x: self.sh_train_x[self.batch_slice],
            self.sym_t: self.sh_train_t[self.batch_slice],
        },
    )
    f_test = theano.function(
        [], [test_cc, test_brier],
        givens={
            self.sym_x: self.sh_test_x,
            self.sym_t: self.sh_test_t,
        },
    )
    f_validate = None
    if validation_set is not None:
        f_validate = theano.function(
            [self.sym_index, self.sym_batchsize], [test_cc, test_acc],
            givens={
                self.sym_x: self.sh_valid_x[self.batch_slice],
                self.sym_t: self.sh_valid_t[self.batch_slice],
            },
        )
    predict = theano.function([self.sym_x], [y_test])

    self.train_args['inputs']['batchsize'] = 64
    self.train_args['inputs']['learningrate'] = 1e-3
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999  # 1e-6
    self.train_args['outputs']['train_cc'] = '%0.4f'
    # self.train_args['outputs']['train_acc'] = '%0.4f'
    self.train_args['outputs']['train_brier'] = '%0.4f'

    # self.test_args['inputs']['batchsize'] = 64
    self.test_args['outputs']['test_cc'] = '%0.4f'
    # self.test_args['outputs']['test_acc'] = '%0.4f'
    self.test_args['outputs']['test_brier'] = '%0.4f'

    # self.validate_args['inputs']['batchsize'] = 64
    # self.validate_args['outputs']['loss_eval'] = '%0.6f'
    # self.validate_args['outputs']['test_acc'] = '%0.6f'
    return (f_train, f_test, f_validate, self.train_args,
            self.test_args, self.validate_args, predict)
def __init__(self, hidden_size=100, nclasses=73, num_embeddings=11359,
             embedding_dim=100, window_size=1, memory_size=40,
             n_memory_slots=8, go_code=1, depth=2, load_dir=None):

    articles, titles = T.imatrices('articles', 'titles')
    n_article_slots = int(n_memory_slots / 2)  # TODO derive this from an arg
    n_title_slots = n_memory_slots - n_article_slots
    n_instances = articles.shape[0]
    self.window_size = window_size

    randoms = {
        # attr: shape
        # 'emb': (num_embeddings + 1, embedding_dim),
        'M_a': (memory_size, n_article_slots),
        'M_t': (memory_size, n_title_slots),
        'w_a': (n_article_slots,),
        'w_t': (n_title_slots,),
        'Wg_a': (window_size * embedding_dim, n_article_slots),
        'Wg_t': (window_size * embedding_dim, n_title_slots),
        'Wk': (hidden_size, memory_size),
        'Wb': (hidden_size, 1),
        'Wv': (hidden_size, memory_size),
        'We_a': (hidden_size, n_article_slots),
        'We_t': (hidden_size, n_title_slots),
        'Wx': (window_size * embedding_dim, hidden_size),
        'Wh': (memory_size, hidden_size),
        'W': (hidden_size, nclasses),
        'h0': hidden_size
    }

    zeros = {
        # attr: shape
        'bg_a': n_article_slots,
        'bg_t': n_title_slots,
        'bk': memory_size,
        'bb': 1,
        'bv': memory_size,
        'be_a': n_article_slots,
        'be_t': n_title_slots,
        'bh': hidden_size,
        'b': nclasses,
    }

    for l in range(depth):
        randoms['gru' + str(l)] = (1, embedding_dim)

    def random_shared(name):
        shape = randoms[name]
        return theano.shared(
            0.2 * np.random.normal(size=shape).astype(theano.config.floatX),
            name=name)

    def zeros_shared(name):
        shape = zeros[name]
        return theano.shared(np.zeros(shape, dtype=theano.config.floatX),
                             name=name)

    for key in randoms:
        # create an attribute with associated shape and random values
        setattr(self, key, random_shared(key))

    for key in zeros:
        # create an attribute with associated shape and values equal to 0
        setattr(self, key, zeros_shared(key))

    self.names = randoms.keys() + zeros.keys()
    # self.names.remove('emb')  # no need to save or update embeddings
    scan_vars = 'h0 w_a M_a w_t M_t'.split()

    def repeat_for_each_instance(param):
        """ repeat param along new axis once for each instance """
        return T.repeat(T.shape_padleft(param), repeats=n_instances, axis=0)

    for key in scan_vars:
        setattr(self, key,
                repeat_for_each_instance(self.__getattribute__(key)))
        self.names.remove(key)

    if load_dir is not None:
        with open(os.path.join(load_dir, 'params.pkl')) as handle:
            params = pickle.load(handle)
            self.__dict__.update(params)

    def recurrence(i, h_tm1, w_a, M_a, *args, **kwargs):
        """
        notes
        Headers from paper in all caps
        mem = n_article_slots if is_article else n_title_slots

        :param i: center index of sliding window
        :param h_tm1: h_{t-1} (hidden state)
        :param w_a: attention weights for article memory
        :param M_a: article memory
        :param args: gru_weights, maybe w_t, maybe M_t
            gru_weights: weights with which to initialize GRULayer on each time step
            w_t: attention weights for titles memory
            M_t: titles memory
        :param kwargs: is_training, is_article
            is_training:
            is_article: we use different parts of memory when working with an article
        :return: [y = model outputs, i + 1 = incremented index, h, w, M (see above)]
        """
        is_training = kwargs['is_training']
        is_article = kwargs['is_article']
        gru_weights = args[:depth]
        if len(args) > depth:
            w_t = args[depth]
            M_t = args[depth + 1]

        i_type = T.iscalar if is_article or is_training else T.ivector
        assert i.type == i_type

        if not is_article:
            assert w_t is not None and M_t is not None

        word_idxs = i
        if is_article or is_training:
            # get representation of word window
            document = articles if is_article else titles  # [instances, bucket_width]
            word_idxs = document[:, i:i + 1]  # [instances, 1]
        # x_i = self.emb[word_idxs].flatten(ndim=2)  # [instances, embedding_dim]
        input = InputLayer(shape=(None, 1), input_var=word_idxs)
        embed = EmbeddingLayer(input, num_embeddings, embedding_dim)
        gru = GRULayer(incoming=embed, num_units=embedding_dim,
                       hid_init=self.gru0)
        for weight in gru_weights:
            gru = GRULayer(incoming=gru, num_units=embedding_dim,
                           hid_init=weight)
        x_i = get_output(gru).flatten(ndim=2)
        x_i = Print('x_i')(x_i)  # [instances, embedding_dim]
        gru_weights = []

        if is_article:
            M_read = M_a  # [instances, memory_size, n_article_slots]
            w_read = w_a  # [instances, n_article_slots]
        else:
            M_read = T.concatenate([M_a, M_t], axis=2)  # [instances, memory_size, n_title_slots]
            w_read = T.concatenate([w_a, w_t], axis=1)  # [instances, n_title_slots]

        # eqn 15
        c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

        # EXTERNAL MEMORY READ
        def get_attention(Wg, bg, M, w):
            g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg)  # [instances, mem]

            # eqn 11
            k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

            # eqn 13
            beta = T.dot(h_tm1, self.Wb) + self.bb
            beta = T.nnet.softplus(beta)
            beta = T.addbroadcast(beta, 1)  # [instances, 1]

            # eqn 12
            w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

            # eqn 14
            return (1 - g) * w + g * w_hat  # [instances, mem]

        w_a = get_attention(self.Wg_a, self.bg_a, M_a, w_a)  # [instances, n_article_slots]
        if not is_article:
            w_t = get_attention(self.Wg_t, self.bg_t, M_t, w_t)  # [instances, n_title_slots]

        # MODEL INPUT AND OUTPUT
        # eqn 9
        h = T.dot(c, self.Wh) + T.dot(x_i, self.Wx) + self.bh  # [instances, hidden_size]

        # eqn 10
        y = T.nnet.softmax(T.dot(h, self.W) + self.b)  # [instances, nclasses]

        # EXTERNAL MEMORY UPDATE
        def update_memory(We, be, w_update, M_update):
            # eqn 17
            e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
            f = 1. - w_update * e  # [instances, mem]

            # eqn 16
            v = T.tanh(T.dot(h, self.Wv) + self.bv)  # [instances, memory_size]

            # need to add broadcast layers for memory update
            f = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            u = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            v = v.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

            # eqn 19
            return M_update * f + T.batched_dot(v, u) * (1 - f)  # [instances, memory_size, mem]

        M_a = update_memory(self.We_a, self.be_a, w_a, M_a)
        attention_and_memory = [w_a, M_a]
        if not is_article:
            M_t = update_memory(self.We_t, self.be_t, w_t, M_t)
            attention_and_memory += [w_t, M_t]

        y_max = y.argmax(axis=1).astype(int32)
        next_idxs = i + 1 if is_training or is_article else y_max
        return [y, y_max, next_idxs, h] + attention_and_memory

    read_article = partial(recurrence, is_training=True, is_article=True)
    # for read_article, it actually doesn't matter whether is_training is true

    i0 = T.constant(0, dtype=int32, name='first_value_of_i')
    gru_weights = [eval('self.gru' + str(l)) for l in range(depth)]
    outputs_info = [None, None, i0, self.h0, self.w_a, self.M_a] + gru_weights

    [_, _, _, h, w, M], _ = theano.scan(fn=read_article,
                                        outputs_info=outputs_info,
                                        n_steps=articles.shape[1],
                                        name='read_scan')

    produce_title = partial(recurrence, is_training=True, is_article=False)
    outputs_info[3:6] = [param[-1, :, :] for param in (h, w, M)]
    outputs_info.extend([self.w_t, self.M_t])
    bucket_width = titles.shape[1] - 1  # subtract 1 because <go> is omitted in y_true

    [y, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title,
                                                  outputs_info=outputs_info,
                                                  n_steps=bucket_width,
                                                  name='train_scan')

    # loss and updates
    y_clip = T.clip(y, .01, .99)
    y_flatten = y_clip.dimshuffle(2, 1, 0).flatten(ndim=2).T
    y_true = titles[:, 1:].ravel()  # [:, 1:] in order to omit <go>
    counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
    weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)
    losses = T.nnet.categorical_crossentropy(y_flatten, y_true)
    loss = objectives.aggregate(losses, weights, mode='sum')
    updates = adadelta(loss, self.params())

    self.learn = theano.function(inputs=[articles, titles],
                                 outputs=[y_max.T, loss],
                                 updates=updates,
                                 allow_input_downcast=True,
                                 name='learn')

    produce_title_test = partial(recurrence, is_training=False,
                                 is_article=False)

    self.test = theano.function(inputs=[articles, titles],
                                outputs=[y_max.T],
                                on_unused_input='ignore')

    outputs_info[2] = T.zeros([n_instances], dtype=int32) + go_code
    [_, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title_test,
                                                  outputs_info=outputs_info,
                                                  n_steps=bucket_width,
                                                  name='test_scan')

    self.predict = theano.function(inputs=[articles, titles],
                                   outputs=y_max.T,
                                   name='infer')
def build_model(self, train_set, test_set, validation_set=None):
    super(CNN, self).build_model(train_set, test_set, validation_set)

    epsilon = 1e-8
    y_train = T.clip(get_output(self.model, self.sym_x), epsilon, 1)
    loss_cc = aggregate(categorical_crossentropy(y_train, self.sym_t),
                        mode='mean')
    loss_train_acc = categorical_accuracy(y_train, self.sym_t).mean()

    y = T.clip(get_output(self.model, self.sym_x, deterministic=True),
               epsilon, 1)
    loss_eval = aggregate(categorical_crossentropy(y, self.sym_t),
                          mode='mean')
    loss_acc = categorical_accuracy(y, self.sym_t).mean()

    all_params = get_all_params(self.model, trainable=True)
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    grads = T.grad(loss_cc, all_params)
    grads = [T.clip(g, -5, 5) for g in grads]
    updates = rmsprop(grads, all_params, self.sym_lr, sym_beta1, sym_beta2)

    inputs = [self.sym_index, self.sym_batchsize, self.sym_lr,
              sym_beta1, sym_beta2]
    f_train = theano.function(
        inputs, [loss_cc, loss_train_acc],
        updates=updates,
        givens={
            self.sym_x: self.sh_train_x[self.batch_slice],
            self.sym_t: self.sh_train_t[self.batch_slice],
        },
    )
    f_test = theano.function(
        [self.sym_index, self.sym_batchsize], [loss_eval, loss_acc],
        givens={
            self.sym_x: self.sh_test_x[self.batch_slice],
            self.sym_t: self.sh_test_t[self.batch_slice],
        },
    )
    f_validate = None
    if validation_set is not None:
        f_validate = theano.function(
            [self.sym_index, self.sym_batchsize], [loss_eval, loss_acc],
            givens={
                self.sym_x: self.sh_valid_x[self.batch_slice],
                self.sym_t: self.sh_valid_t[self.batch_slice],
            },
        )

    self.train_args['inputs']['batchsize'] = 128
    self.train_args['inputs']['learningrate'] = 1e-3
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['outputs']['loss_cc'] = '%0.6f'
    self.train_args['outputs']['loss_train_acc'] = '%0.6f'

    self.test_args['inputs']['batchsize'] = 128
    self.test_args['outputs']['loss_eval'] = '%0.6f'
    self.test_args['outputs']['loss_acc'] = '%0.6f'

    self.validate_args['inputs']['batchsize'] = 128
    # self.validate_args['outputs']['loss_eval'] = '%0.6f'
    # self.validate_args['outputs']['loss_acc'] = '%0.6f'
    return (f_train, f_test, f_validate, self.train_args,
            self.test_args, self.validate_args)
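The pattern above works because Lasagne's update functions accept either a loss expression or a precomputed list of gradients as their first argument, which is what lets the element-wise clip run before the optimiser sees the gradients. A hedged sketch of the same idea as a standalone helper (the function name and clip bound are illustrative):

import theano.tensor as T
from lasagne.updates import rmsprop

def clipped_rmsprop(loss, params, learning_rate, clip=5.0):
    # clip each gradient element-wise, then hand the list to rmsprop in
    # place of the loss expression
    grads = [T.clip(g, -clip, clip) for g in T.grad(loss, params)]
    return rmsprop(grads, params, learning_rate=learning_rate)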
def build_update_functions(train_set_x, train_set_y,
                           valid_set_x, valid_set_y,
                           network, y, X, train_MASK, val_MASK,
                           batch_size=32, l2_reg=.0001):
    # build update functions
    # extract tensor representing the network predictions
    prediction = get_output(network)

    ###################New#########################
    # Aggregate the element-wise error into a scalar value using a mask.
    # Note that y should not contain NaN; replace NaNs with 0 or -1. The value
    # does not matter, as it is not used to calculate the aggregated error or
    # the network updates.
    # MASK should be a matrix of size(y), with 0s in place of NaN values and
    # 1s everywhere else.
    # build tensor variable for mask
    trainMASK = T.matrix('trainMASK')
    # collect squared error
    loss_RMSE = squared_error(prediction, y)
    # drop NaN values and average over the remaining values
    loss_RMSE = aggregate(loss_RMSE, weights=trainMASK, mode='normalized_sum')
    # compute the square root
    loss_RMSE = loss_RMSE.sqrt()
    ###############################################

    # add l2 regularization
    # l2_penalty = regularize_layer_params(network, l2)
    # regc = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]
    # layers = get_all_layers(network)
    # reg_weights = {key: value for (key, value) in zip(layers, regc)}
    # l2_penalty = regularize_layer_params_weighted(reg_weights, l2)
    loss = loss_RMSE  # (1 - l2_reg) * loss_RMSE + l2_reg * l2_penalty

    # get network params
    params = get_all_params(network)
    # subset network params to extract the ones that you want to train
    # print('length of params', len(params), '\n')
    subset_params = [params[0], params[1], params[10], params[11],
                     params[12], params[13]]
    # print('RMSPROP \n')
    updates = rmsprop(loss, subset_params, learning_rate=1e-4)

    # create validation/test loss expression
    # the loss represents the loss for all the labels
    test_prediction = get_output(network, deterministic=True)

    ###################New#########################
    # Aggregate the element-wise error into a scalar value using a mask
    # (see the note on NaN handling above).
    # build tensor variable for mask
    valMASK = T.matrix('valMASK')
    # collect squared error
    test_loss = squared_error(test_prediction, y)
    # drop NaN values and average over the remaining values
    test_loss = aggregate(test_loss, weights=valMASK, mode='normalized_sum')
    # compute the square root
    test_loss = test_loss.sqrt()
    ################################################

    # index for mini-batch slicing
    index = T.lscalar()

    # training function
    train_set_x_size = train_set_x.get_value().shape[0]
    val_set_x_size = valid_set_x.get_value().shape[0]
    train_fn = theano.function(
        inputs=[index],
        outputs=[loss, loss_RMSE],
        updates=updates,
        givens={X: train_set_x[index * batch_size:
                               T.minimum((index + 1) * batch_size,
                                         train_set_x_size)],
                y: train_set_y[index * batch_size:
                               T.minimum((index + 1) * batch_size,
                                         train_set_x_size)],
                trainMASK: train_MASK[index * batch_size:
                                      T.minimum((index + 1) * batch_size,
                                                train_set_x_size)]})

    # validation function
    val_fn = theano.function(
        inputs=[index],
        outputs=[test_loss, prediction],
        givens={X: valid_set_x[index * batch_size:
                               T.minimum((index + 1) * batch_size,
                                         val_set_x_size)],
                y: valid_set_y[index * batch_size:
                               T.minimum((index + 1) * batch_size,
                                         val_set_x_size)],
                valMASK: val_MASK[index * batch_size:
                                  T.minimum((index + 1) * batch_size,
                                            val_set_x_size)]})
    return train_fn, val_fn
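The masking trick in this function hinges on mode='normalized_sum', which computes (loss * weights).sum() / weights.sum(): entries whose mask weight is 0 vanish from both numerator and denominator, so the placeholder values substituted for NaNs never influence the average. A small self-contained check (the numbers are illustrative):

import numpy as np
import theano
import theano.tensor as T
from lasagne.objectives import aggregate

loss = T.matrix('loss')
mask = T.matrix('mask')
masked_mean = theano.function(
    [loss, mask], aggregate(loss, weights=mask, mode='normalized_sum'))
# the 999 sits where a NaN target was replaced; its mask weight of 0
# removes it, leaving (1 + 2 + 3) / 3 = 2.0
print(masked_mean(np.array([[1., 2.], [3., 999.]], dtype='float32'),
                  np.array([[1., 1.], [1., 0.]], dtype='float32')))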
"""loading weight values from the previous model""" """load layer parameters. To be used only when learning walk forward""" # with np.load('model.npz') as f: # param_values = [f['arr_%d'%i] for i in range(len(f.files))] # lasagne.layers.set_all_param_values(network,param_values) prediction = lasagne.layers.get_output(network,input_var, deterministic = True) loss = lasagne.objectives.binary_crossentropy(prediction, target_var) """loss function aggregation only to be used when using cost sensitive training""" # class_weights = np.empty((50,2),dtype=float) # class_weights_global = class_weights # loss = aggregate(loss, weights=theano.shared(class_weights_global),mode='normalized_sum') """loss function aggregation to be used without cost sensitive training""" loss = aggregate(loss,mode='mean') params = lasagne.layers.get_all_params(network, trainable=True) updates = lasagne.updates.sgd(loss, params, learning_rate=0.2) test_prediction = lasagne.layers.get_output(network,input_var, deterministic=True) test_loss = lasagne.objectives.binary_crossentropy(test_prediction,target_var) test_loss = test_loss.mean() # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),dtype=theano.config.floatX) # test_acc = test_acc.mean() train_fn = theano.function([input_var, target_var],loss, updates=updates) # val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) val_fn = theano.function([input_var, target_var],test_loss) """Training""" print("Starting training...")
def build_update_functions(train_set_x, train_set_y,
                           valid_set_x, valid_set_y,
                           network, y, X, train_MASK, val_MASK,
                           batch_size=32, l2_reg=.0001,
                           learning_rate=.005, momentum=.9):
    # build update functions
    # extract tensor representing the network predictions
    prediction = get_output(network)

    ##################old###########################
    # # collect squared error
    # loss_RMSE = squared_error(prediction, y)
    # # compute the root mean squared error
    # loss_RMSE = loss_RMSE.mean().sqrt()
    ###################New#########################
    # Aggregate the element-wise error into a scalar value using a mask.
    # Note that y should not contain NaN; replace NaNs with 0 or -1. The value
    # does not matter, as it is not used to calculate the aggregated error or
    # the network updates.
    # MASK should be a matrix of size(y), with 0s in place of NaN values and
    # 1s everywhere else.
    # build tensor variable for mask
    trainMASK = T.matrix('trainMASK')
    # collect squared error
    loss_RMSE = squared_error(prediction, y)
    # drop NaN values and average over the remaining values
    loss_RMSE = aggregate(loss_RMSE, weights=trainMASK, mode='normalized_sum')
    # compute the square root
    loss_RMSE = loss_RMSE.sqrt()
    ###############################################

    # add l2 regularization
    l2_penalty = regularize_network_params(network, l2)
    loss = (1 - l2_reg) * loss_RMSE + l2_reg * l2_penalty

    # get network params
    params = get_all_params(network, trainable=True)

    # # create update criterion
    # print('nesterov')
    # updates = nesterov_momentum(loss, params, learning_rate=.01, momentum=.9)
    # print('AdaGrad')
    # updates = adagrad(loss, params, learning_rate=1e-2)
    # print('RMSPROP \n')
    updates = rmsprop(loss, params, learning_rate=learning_rate)

    # create validation/test loss expression
    # the loss represents the loss for all the labels
    test_prediction = get_output(network, deterministic=True)

    ##################old###########################
    # # collect squared error
    # test_loss = squared_error(test_prediction, y)
    # # compute the root mean squared error
    # test_loss = test_loss.mean().sqrt()
    # # test_loss_withl2 = (1 - l2_reg) * test_loss + l2_reg * l2_penalty
    ###################New#########################
    # Aggregate the element-wise error into a scalar value using a mask
    # (see the note on NaN handling above).
    # build tensor variable for mask
    valMASK = T.matrix('valMASK')
    # collect squared error
    test_loss = squared_error(test_prediction, y)
    # drop NaN values and average over the remaining values
    test_loss = aggregate(test_loss, weights=valMASK, mode='normalized_sum')
    # compute the square root
    test_loss = test_loss.sqrt()
    ################################################

    # index for mini-batch slicing
    index = T.lscalar()

    # training function
    train_set_x_size = train_set_x.get_value().shape[0]
    val_set_x_size = valid_set_x.get_value().shape[0]
    train_fn = theano.function(
        inputs=[index],
        outputs=[loss, loss_RMSE],
        updates=updates,
        givens={X: train_set_x[index * batch_size:
                               T.minimum((index + 1) * batch_size,
                                         train_set_x_size)],
                y: train_set_y[index * batch_size:
                               T.minimum((index + 1) * batch_size,
                                         train_set_x_size)],
                trainMASK: train_MASK[index * batch_size:
                                      T.minimum((index + 1) * batch_size,
                                                train_set_x_size)]})

    # validation function
    val_fn = theano.function(
        inputs=[index],
        outputs=[test_loss, prediction],
        givens={X: valid_set_x[index * batch_size:
                               T.minimum((index + 1) * batch_size,
                                         val_set_x_size)],
                y: valid_set_y[index * batch_size:
                               T.minimum((index + 1) * batch_size,
                                         val_set_x_size)],
                valMASK: val_MASK[index * batch_size:
                                  T.minimum((index + 1) * batch_size,
                                            val_set_x_size)]})
    return train_fn, val_fn
def loss(x, t):
    return LO.aggregate(
        LO.categorical_crossentropy(T.clip(x, 1e-6, 1. - 1e-6), t))
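The clip guards the log inside categorical_crossentropy: a predicted probability of exactly 0 for the true class yields an infinite loss and breaks the gradients. A quick check (the probabilities are illustrative):

import numpy as np
import theano
import theano.tensor as T
from lasagne import objectives as LO

x = T.matrix('x')
t = T.ivector('t')
raw = theano.function([x, t], LO.categorical_crossentropy(x, t))
safe = theano.function(
    [x, t], LO.categorical_crossentropy(T.clip(x, 1e-6, 1. - 1e-6), t))
p = np.array([[0., 1.]], dtype='float32')
print(raw(p, np.array([0], dtype='int32')))   # inf
print(safe(p, np.array([0], dtype='int32')))  # about 13.8, finite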
def build_model(self):
    # Define the model prior to building it
    if not hasattr(self, 'network'):
        self.model()
        self._generate_layer_list()

    # Print the model architecture
    self.log_msg("Network Architecture")
    self.log_msg(self.model_str())

    # Training loss
    train_prediction = get_output(self.network)
    train_loss = aggregate(self.objective(train_prediction, self.output_var),
                           mode='mean')
    self.log_msg("Objective: {}".format(self.objective.__name__))

    # Validation loss
    validation_prediction = get_output(self.network, deterministic=True)
    validation_loss = aggregate(self.objective(validation_prediction,
                                               self.output_var),
                                mode='mean')

    # Update the parameters
    params = get_all_params(self.network, trainable=True)
    popts = {
        'loss_or_grads': train_loss,
        'params': params,
        'learning_rate': self.learning_rate_tensor,
        'momentum': self.momentum_tensor,
    }

    # Inspect to see if momentum is a valid argument for the update
    update_args = inspect.getargspec(self.updates)[0]

    # Remove momentum if not applicable
    if 'momentum' not in update_args:
        del popts['momentum']
    updates = self.updates(**popts)

    # Print the learning rate type
    self.log_msg('Update: %s' % self.updates.__name__)
    self.log_msg("Learning Rate: %s" % self.learning_rate.__name__)
    if 'momentum' in popts.keys():
        self.log_msg("Momentum: %s" % self.momentum.__name__)

    # Define training loss function
    self.train_loss = theano.function(
        inputs=[self.input_var, self.output_var],
        outputs=train_loss,
        updates=updates,
        allow_input_downcast=True,
    )

    # Define the accuracy function for categorisation problems
    cat_accuracy = T.mean(
        T.eq(
            T.argmax(validation_prediction, axis=1),
            T.argmax(self.output_var, axis=1),
        ))

    # Define validation loss function
    if self.objective is squared_error:
        self.valid_loss = theano.function(
            inputs=[self.input_var, self.output_var],
            outputs=validation_loss,
        )
    else:
        self.valid_loss = theano.function(
            inputs=[self.input_var, self.output_var],
            outputs=[validation_loss, cat_accuracy],
        )

    # Define predict
    self.predict = theano.function(inputs=[self.input_var],
                                   outputs=validation_prediction)
    self.train_predict = theano.function(inputs=[self.input_var],
                                         outputs=train_prediction)
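The inspect.getargspec call above is what lets a single build_model work with any Lasagne update rule: momentum is only forwarded when the chosen update actually declares it. A hedged sketch of that dispatch in isolation (the helper name is illustrative):

import inspect

def make_updates(update_fn, loss, params, learning_rate, momentum=0.9):
    opts = {'loss_or_grads': loss, 'params': params,
            'learning_rate': learning_rate, 'momentum': momentum}
    # e.g. lasagne.updates.adam and rmsprop take no momentum argument,
    # while nesterov_momentum does
    if 'momentum' not in inspect.getargspec(update_fn)[0]:
        del opts['momentum']
    return update_fn(**opts)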