def generate_theano_func(args, network, penalty, input_dict, target_var):

    prediction = get_output(network, input_dict)

    # loss = T.mean( target_var * ( T.log(target_var) - prediction ))
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    # loss += 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(network) )
    # penalty = sum ( T.sum(lstm_param**2) for lstm_param in lstm_params )
    # penalty = regularize_layer_params(l_forward_1_lstm, l2)
    # penalty = T.sum(lstm_param**2 for lstm_param in lstm_params)
    # penalty = 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(l_forward_1) )
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Need to set optimizer correctly")

    test_prediction = get_output(network, input_dict, deterministic=True)
    # test_prediction = get_output(network, deterministic=True)
    # test_loss = T.mean( target_var * ( T.log(target_var) - test_prediction))
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    train_fn = theano.function(
        [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_prediction],
            allow_input_downcast=True,
        )
    elif args.task == "ent":
        # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_acc],
            allow_input_downcast=True,
        )

    return train_fn, val_fn
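# A condensed, self-contained sketch of the same train/eval pattern (the toy network,
# shapes and optimizer strings below are assumptions, not taken from the repository
# above): dispatch on an optimizer name, then compile a stochastic training function
# and a deterministic evaluation function.
import theano
import theano.tensor as T
from lasagne.layers import InputLayer, DenseLayer, get_output, get_all_params
from lasagne.nonlinearities import softmax
from lasagne.objectives import categorical_crossentropy
from lasagne.updates import sgd, adam

x_var = T.matrix('x')
y_var = T.ivector('y')
l_in = InputLayer((None, 20), input_var=x_var)
l_out = DenseLayer(l_in, num_units=3, nonlinearity=softmax)

train_prediction = get_output(l_out)                      # stochastic path (dropout active)
train_loss = categorical_crossentropy(train_prediction, y_var).mean()
params = get_all_params(l_out, trainable=True)

optimizer = 'sgd'                                         # hypothetical config value
if optimizer == 'sgd':
    updates = sgd(train_loss, params, learning_rate=0.01)
elif optimizer == 'adam':
    updates = adam(train_loss, params, learning_rate=0.001)
else:
    raise ValueError('unknown optimizer: %s' % optimizer)

test_prediction = get_output(l_out, deterministic=True)   # deterministic path for evaluation
test_loss = categorical_crossentropy(test_prediction, y_var).mean()

train_fn = theano.function([x_var, y_var], train_loss, updates=updates,
                           allow_input_downcast=True)
val_fn = theano.function([x_var, y_var], test_loss, allow_input_downcast=True)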
def test_maxpool_layer(): l_in1 = InputLayer((None, 2)) l_in2 = InputLayer((None, 20)) l_hid = DenseLayer(l_in2, num_units=30, nonlinearity=rectify) l_pool = MaxpoolLayer([l_in1, l_hid]) l_out = DenseLayer(l_pool, num_units=1, nonlinearity=sigmoid) bounds = theano.tensor.lmatrix('bounds') data = theano.tensor.matrix('data') targets = theano.tensor.matrix('targets') predictions = get_output(l_out, {l_in1: bounds, l_in2: data}) loss = categorical_crossentropy(predictions, targets) loss = aggregate(loss, mode='mean') params = get_all_params(l_out) updates_sgd = sgd(loss, params, learning_rate=0.0001) train_function = theano.function([bounds, data, targets], updates=updates_sgd, allow_input_downcast=True) test_bounds = np.array([[0, 3], [3, 5], [5, 7]]) test_X = np.random.randn(10, 20) test_Y = np.array([[0], [1], [0]]) train_function(test_bounds, test_X, test_Y)
def build_updates(grad, params, optimization, learning_rate):
    """ setup optimization algorithm """

    if optimization['optimizer'] == 'sgd':
        update_op = updates.sgd(grad, params, learning_rate=learning_rate)

    elif optimization['optimizer'] == 'nesterov_momentum':
        if 'momentum' in optimization:
            momentum = optimization['momentum']
        else:
            momentum = 0.9
        update_op = updates.nesterov_momentum(grad, params, learning_rate=learning_rate,
                                              momentum=momentum)

    elif optimization['optimizer'] == 'adagrad':
        update_op = updates.adagrad(grad, params, learning_rate=learning_rate)

    elif optimization['optimizer'] == 'rmsprop':
        if 'rho' in optimization:
            rho = optimization['rho']
        else:
            rho = 0.9
        update_op = updates.rmsprop(grad, params, learning_rate=learning_rate, rho=rho)

    elif optimization['optimizer'] == 'adam':
        if 'beta1' in optimization:
            beta1 = optimization['beta1']
        else:
            beta1 = 0.9
        if 'beta2' in optimization:
            beta2 = optimization['beta2']
        else:
            beta2 = 0.999
        update_op = updates.adam(grad, params, learning_rate=learning_rate,
                                 beta1=beta1, beta2=beta2)

    return update_op
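# A hypothetical end-to-end call of build_updates above; the tiny regression model and
# the shape of the `optimization` dict are assumptions inferred from the branches it checks.
import theano
import theano.tensor as T
from lasagne.layers import InputLayer, DenseLayer, get_output, get_all_params

x = T.matrix('x')
y = T.matrix('y')
net = DenseLayer(InputLayer((None, 4), input_var=x), num_units=1, nonlinearity=None)
loss = T.mean((get_output(net) - y) ** 2)
params = get_all_params(net, trainable=True)
grad = T.grad(loss, params)

optimization = {'optimizer': 'rmsprop', 'rho': 0.95}
update_op = build_updates(grad, params, optimization, learning_rate=1e-3)
train_fn = theano.function([x, y], loss, updates=update_op)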
def optimize(grads, params): if state['optim_method'] == 'adam': updates = adam(grads, params, lrt, state['momentum']) elif state['optim_method'] == 'adagrad': updates = adagrad(grads, params, lrt) elif state['optim_method'] == 'sgd': updates = sgd(grads, params, lrt) return updates
def get_network(model): input_data = tensor.dmatrix('x') targets_var = tensor.dmatrix('y') network = layers.InputLayer((model['batch_size'], model['input_vars']), input_data) nonlin = nonlinearities.rectify if model['hidden_nonlinearity'] != 'ReLu': nonlin = nonlinearities.tanh prev_layer = network for l in range(model['nlayers']): fc = layers.DenseLayer(prev_layer, model['units'], nonlinearity=nonlin) if model['dropout']: fc = layers.DropoutLayer(fc, 0.5) prev_layer = fc output_lin = None if model['output_mode'] == OUTPUT_LOG: output_lin = nonlinearities.tanh output_layer = layers.DenseLayer(prev_layer, 1, nonlinearity=output_lin) predictions = layers.get_output(output_layer) if model['output_mode'] == OUTPUT_BOUNDED: (minth, maxth) = model['maxmin'][model['control']] maxt = theano.shared(np.ones((model['batch_size'], 1)) * maxth) mint = theano.shared(np.ones((model['batch_size'], 1)) * minth) predictions = tensor.min(tensor.concatenate([maxt, predictions], axis=1), axis=1) predictions = tensor.reshape(predictions, (model['batch_size'], 1)) predictions = tensor.max(tensor.concatenate([mint, predictions], axis=1), axis=1) predictions = tensor.reshape(predictions, (model['batch_size'], 1)) loss = objectives.squared_error(predictions, targets_var) loss = objectives.aggregate(loss, mode='mean') params = layers.get_all_params(output_layer) test_prediction = layers.get_output(output_layer, deterministic=True) test_loss = objectives.squared_error(test_prediction, targets_var) test_loss = test_loss.mean() updates_sgd = updates.sgd(loss, params, learning_rate=model['lr']) ups = updates.apply_momentum(updates_sgd, params, momentum=0.9) train_fn = theano.function([input_data, targets_var], loss, updates=ups) pred_fn = theano.function([input_data], predictions) val_fn = theano.function([input_data, targets_var], test_loss) return {'train': train_fn, 'eval': val_fn, 'pred': pred_fn, 'layers': output_layer}
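# Hypothetical configuration and smoke test for get_network above. The key set is
# inferred from the dictionary lookups in the function; OUTPUT_LOG is assumed to be
# one of the module-level output-mode constants, and the 'maxmin'/'control' entries
# (only read in the bounded mode) use made-up values.
import numpy as np

model = {
    'batch_size': 32,
    'input_vars': 8,
    'hidden_nonlinearity': 'ReLu',
    'nlayers': 2,
    'units': 64,
    'dropout': True,
    'output_mode': OUTPUT_LOG,
    'maxmin': {'steer': (-1.0, 1.0)},
    'control': 'steer',
    'lr': 1e-2,
}
fns = get_network(model)
x_batch = np.random.randn(model['batch_size'], model['input_vars'])
y_batch = np.random.randn(model['batch_size'], 1)
print(fns['train'](x_batch, y_batch))   # one SGD-with-momentum step, returns the batch loss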
def get_updates(nnet, train_obj, trainable_params, solver=None): implemented_solvers = ("sgd", "momentum", "nesterov", "adagrad", "rmsprop", "adadelta", "adam", "adamax") if solver not in implemented_solvers: nnet.sgd_solver = "adam" else: nnet.sgd_solver = solver if nnet.sgd_solver == "sgd": updates = l_updates.sgd(train_obj, trainable_params, learning_rate=Cfg.learning_rate) elif nnet.sgd_solver == "momentum": updates = l_updates.momentum(train_obj, trainable_params, learning_rate=Cfg.learning_rate, momentum=Cfg.momentum) elif nnet.sgd_solver == "nesterov": updates = l_updates.nesterov_momentum(train_obj, trainable_params, learning_rate=Cfg.learning_rate, momentum=Cfg.momentum) elif nnet.sgd_solver == "adagrad": updates = l_updates.adagrad(train_obj, trainable_params, learning_rate=Cfg.learning_rate) elif nnet.sgd_solver == "rmsprop": updates = l_updates.rmsprop(train_obj, trainable_params, learning_rate=Cfg.learning_rate, rho=Cfg.rho) elif nnet.sgd_solver == "adadelta": updates = l_updates.adadelta(train_obj, trainable_params, learning_rate=Cfg.learning_rate, rho=Cfg.rho) elif nnet.sgd_solver == "adam": updates = l_updates.adam(train_obj, trainable_params, learning_rate=Cfg.learning_rate) elif nnet.sgd_solver == "adamax": updates = l_updates.adamax(train_obj, trainable_params, learning_rate=Cfg.learning_rate) return updates
def gradient_descend_theano(fun, x0, args=None, learning_rate=1e-3, tol=1e-3, max_iter=3000, verbose=True): funct, trainable_params, non_trainable_params = fun updates = sgd(funct, trainable_params, learning_rate=learning_rate) train_fun = theano.function(non_trainable_params, funct, updates=updates) loss_fn_compiled = theano.function(non_trainable_params, funct) old_loss = np.inf for i in range(max_iter): train_fun(*args) curr_loss = loss_fn_compiled(*args) if abs(curr_loss - old_loss) < tol: break old_loss = curr_loss _print_optimizer_iteration_info(verbose, i, old_loss) _print_optimizer_final_info(verbose, i, old_loss, 'Lasagne Gradient Descend') params_optimal = trainable_params[0].get_value() return {'x': params_optimal}
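# A self-contained toy call of gradient_descend_theano above: fit a linear model's
# weights to synthetic data. All names below (w, x_in, y_in, X, y) are local to this
# illustration and not part of the original code.
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.zeros(3), name='w')       # trainable shared parameter
x_in = T.matrix('x_in')                        # non-trainable inputs, fed at call time
y_in = T.vector('y_in')
loss_expr = T.mean((T.dot(x_in, w) - y_in) ** 2)

X = np.random.randn(200, 3).astype(theano.config.floatX)
y = X.dot(np.array([1.0, -2.0, 0.5])).astype(theano.config.floatX)

result = gradient_descend_theano((loss_expr, [w], [x_in, y_in]),
                                 x0=None, args=(X, y),
                                 learning_rate=1e-2, tol=1e-8, verbose=False)
print(result['x'])                             # should end up close to [1, -2, 0.5]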
def get_cost_updates(self, corrupted_input, learning_rate): """ This function computes the cost and the updates for one trainng step of the dA """ tilde_x=corrupted_input y = self.get_hidden_values(tilde_x) z = self.get_reconstructed_input(y) #z=corrupted_input # note : we sum over the size of a datapoint; if we are using # minibatches, L will be a vector, with one entry per # example in minibatch # L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z)) L=categorical_crossentropy(z,self.x) #L = (self.x * T.log(z) + (1 - self.x) * T.log(1 - z)) #cost=L.mean() # temp=(self.x*T.log(z)+(1-self.x)*T.log(1-z)) # L=-T.sum(temp) # note : L is now a vector, where each element is the # cross-entropy cost of the reconstruction of the # corresponding example of the minibatch. We need to # compute the average of all these to get the cost of # the minibatch cost = T.mean(L) # print cost reg=1e-8*lasagne.regularization.l2(self.params[0]) cost=cost+reg # compute the gradients of the cost of the `dA` with respect # to its parameters gparams = T.grad(cost, self.params,add_names='True') updates_sgd=sgd(cost,self.params,learning_rate) updates_dic=apply_momentum(updates_sgd, self.params, momentum=0.9) updates=updates_dic.items() # generate the list of updates # updates = [ # (param, param - learning_rate * gparam) # for param, gparam in zip(self.params, gparams) # ] return (cost, updates)
def net_updates(net, loss, lr): # Get all trainable parameters (weights) of our net params = l.get_all_params(net, trainable=True) # We use the adam update, other options are available if cfg.OPTIMIZER == 'adam': param_updates = updates.adam(loss, params, learning_rate=lr, beta1=0.9) elif cfg.OPTIMIZER == 'nesterov': param_updates = updates.nesterov_momentum(loss, params, learning_rate=lr, momentum=0.9) elif cfg.OPTIMIZER == 'sgd': param_updates = updates.sgd(loss, params, learning_rate=lr) return param_updates
def __init__(self, weights, neurons_topology, learning_rate=0.1, learning_rate_decay=0.985, collaboration_sigma=1.0, collaboration_sigma_decay=0.95, verbosity=2): self._verbosity = verbosity self._history = [] self.neurons_number = weights.shape[0] self.W_shar_mat = theano.shared(weights) self.D_shar_mat = theano.shared(neurons_topology) self.collaboration_sigma = theano.shared(collaboration_sigma) self.collaboration_sigma_decay = collaboration_sigma_decay self.x_row = T.vector("exemplar") self.x_mat = T.matrix("batch") self.learning_rate = theano.shared(learning_rate) self.learning_rate_decay = learning_rate_decay self.distance_from_y_row = ((T.sub(self.W_shar_mat, self.x_row)**2).sum(axis=1)) self.closest_neuron_idx = T.argmin(self.distance_from_y_row) self.distances_from_closest_neuron = self.D_shar_mat[ self.closest_neuron_idx] self.affinities_to_closest_neuron = T.exp( -self.distances_from_closest_neuron / (self.collaboration_sigma)**2) self.smoothed_distances_from_closest_neuron = T.mul( self.distance_from_y_row, G.disconnected_grad(self.affinities_to_closest_neuron)) self.cost_scal = self.smoothed_distances_from_closest_neuron.sum() self.updates = sgd(self.cost_scal, [self.W_shar_mat], learning_rate=self.learning_rate) self.update_neurons = theano.function([self.x_row], self.cost_scal, updates=self.updates)
def __init__(self, weights, neurons_topology, relaxing_factor=-0.5, **kwargs): super(WinnerRelaxingSOM, self).__init__(weights, neurons_topology, **kwargs) self.wr_relaxing_factor = relaxing_factor self.wr_relaxing_member = ( self.smoothed_distances_from_closest_neuron.sum() - self.smoothed_distances_from_closest_neuron[ self.closest_neuron_idx]) self.cost_scal += self.wr_relaxing_factor * self.learning_rate * T.mul( self.W_shar_mat[self.closest_neuron_idx], G.disconnected_grad(self.wr_relaxing_member)).sum() self.updates = sgd(self.cost_scal, [self.W_shar_mat], learning_rate=self.learning_rate) self.update_neurons = theano.function([self.x_row], self.cost_scal, updates=self.updates)
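# Hypothetical driver for the self-organising map defined above, via the
# WinnerRelaxingSOM subclass (the class name is taken from its super() call).
# The random weights, 5x5 grid topology and the decay loop are illustrative assumptions.
import numpy as np
import theano

n_neurons, dim = 25, 3
weights = np.random.rand(n_neurons, dim)
grid = np.array([(i, j) for i in range(5) for j in range(5)], dtype=float)
topology = ((grid[:, None, :] - grid[None, :, :]) ** 2).sum(axis=-1)   # squared lattice distances

som = WinnerRelaxingSOM(weights, topology, relaxing_factor=-0.5, learning_rate=0.1)
data = np.random.rand(1000, dim).astype(theano.config.floatX)
for epoch in range(10):
    for row in data:
        som.update_neurons(row)                                        # one SGD step on the SOM cost
    # apply the decay schedules stored on the object once per epoch
    som.learning_rate.set_value(som.learning_rate.get_value() * som.learning_rate_decay)
    som.collaboration_sigma.set_value(
        som.collaboration_sigma.get_value() * som.collaboration_sigma_decay)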
def train_net(x, y1, y2, num_iter=10000): input_var = tensor.tensor4('input_var') cls_target = tensor.ivector('cls_target') bbox_target = tensor.ivector('bbox_target') network = build_model(input_var, roidb=rois) cls_score_out, bbox_pred_out = get_output( [network['cls_score'], network['bbox_pred']]) # Computing Loss functions update parameters cls_loss = categorical_crossentropy(cls_score_out, cls_target) cls_loss = cls_loss.mean() bbox_pred_loss = huber_loss(bbox_pred_out, bbox_target) bbox_pred_loss = bbox_pred_loss.mean() combined_params = get_all_params( [network['cls_score'], network['bbox_pred']], trainable=True) combined_loss = cls_loss + bbox_pred_loss updates = sgd(combined_loss, combined_params, learning_rate=0.001) train_net = theano.function([input_var, cls_target, bbox_target], combined_loss, updates=updates)
def build_train_func(rank=0, **kwargs): print("rank: {} Building model".format(rank)) resnet = build_resnet() print("Building training function") x = T.ftensor4('x') y = T.imatrix('y') prob = L.get_output(resnet['prob'], x, deterministic=False) loss = T.nnet.categorical_crossentropy(prob, y.flatten()).mean() params = L.get_all_params(resnet.values(), trainable=True) sgd_updates = updates.sgd(loss, params, learning_rate=1e-4) # make a function to compute and store the raw gradient f_train = theano.function( inputs=[x, y], outputs=loss, # (assumes this is an avg) updates=sgd_updates) return f_train, "original"
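# Hypothetical smoke test for build_train_func above. The ImageNet-style batch shape
# (4, 3, 224, 224) and the 1000-class label range are assumptions about the ResNet
# that build_resnet() constructs.
import numpy as np

f_train, tag = build_train_func(rank=0)
x_batch = np.random.rand(4, 3, 224, 224).astype('float32')
y_batch = np.random.randint(0, 1000, size=(4, 1)).astype('int32')
print(tag, f_train(x_batch, y_batch))   # one SGD step; prints the mean cross-entropy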
def main(cf): ######## # DATA # ######## print 'Creating data generators...' train_iterator, valid_iterator, test_iterator = create_data_generators(cf) ############################## # COST, GRADIENT AND UPDATES # ############################## print 'Building model...' cost, accuracy = cf.model.compute_cost(deterministic=False) cost_val, accuracy_val = cf.model.compute_cost(deterministic=True) params = get_all_params(cf.model.net, trainable=True) if cf.algo == 'adam': updates = adam(cost, params, cf.learning_rate) elif cf.algo == 'sgd': updates = sgd(cost, params, cf.learning_rate) elif cf.algo == 'momentum': updates = momentum(cost, params, cf.learning_rate) else: raise ValueError('Specified algo does not exist') ############## # MONITORING # ############## print 'Creating extensions and compiling functions...', train_monitor = TrainMonitor( cf.train_freq_print, cf.model.vars, [cost, accuracy], updates) monitoring_vars = [cost_val, accuracy_val] valid_monitor = ValMonitor( 'Validation', cf.valid_freq_print, cf.model.vars, monitoring_vars, valid_iterator) test_monitor = ValMonitor( 'Test', cf.valid_freq_print, cf.model.vars, monitoring_vars, valid_iterator) train_saver = VariableSaver( train_monitor, cf.dump_every_batches, cf.dump_path, 'train') valid_saver = VariableSaver( valid_monitor, cf.dump_every_batches, cf.dump_path, 'valid') test_saver = VariableSaver(test_monitor, None, cf.dump_path, 'test') # Ending conditions end_conditions = [] if hasattr(cf, 'max_iter'): end_conditions.append(MaxIteration(cf.max_iter)) if hasattr(cf, 'max_time'): end_conditions.append(MaxTime(cf.max_iter)) extensions = [ valid_monitor, test_monitor, train_saver, valid_saver, test_saver ] train_m = Trainer(train_monitor, train_iterator, extensions, end_conditions) ############ # TRAINING # ############ train_m.train()
# In[ ]: input_var = T.tensor4('inputs') output_var = T.matrix('outputs') network = layers[0][0](input_var=input_var, **layers[0][1]) for layer in layers[1:]: network = layer[0](network, **layer[1]) prediction = get_output(network) loss = squared_error(prediction, output_var) loss = loss.mean() params = get_all_params(network, trainable=True) #updates = nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9) updates = sgd(loss, params, learning_rate=0.01) test_prediction = get_output(network, deterministic=True) test_loss = squared_error(test_prediction, output_var) test_loss = test_loss.mean() # As a bonus, also create an expression for the classification accuracy: #test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), output_var), # dtype=theano.config.floatX) train_fn = theano.function([input_var, output_var], loss, updates=updates)# , mode=theano.compile.MonitorMode(post_func=theano.compile.monitormode.detect_nan)) #val_fn = theano.function([input_var, output_var], [test_loss, test_acc]) val_fn = theano.function([input_var, output_var], test_loss)
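# A minimal training-loop sketch for the train_fn / val_fn compiled above. The arrays
# X_train, y_train, X_val, y_val and the batch size are placeholders assumed to match
# the network's 4D input and 2D target (both in theano.config.floatX).
import numpy as np

batch_size = 32
for epoch in range(10):
    train_losses = []
    for start in range(0, len(X_train), batch_size):
        xb = X_train[start:start + batch_size]
        yb = y_train[start:start + batch_size]
        train_losses.append(train_fn(xb, yb))
    print('epoch %d  train %.5f  val %.5f'
          % (epoch, np.mean(train_losses), val_fn(X_val, y_val)))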
def build_network_2dconv( args, input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var, wordEmbeddings, maxlen=36 ): print ("Building model with 2D Convolution") vocab_size = wordEmbeddings.shape[1] wordDim = wordEmbeddings.shape[0] num_filters = 100 stride = 1 # CNN_sentence config filter_size = (3, wordDim) pool_size = (maxlen - 3 + 1, 1) # two conv pool layer # filter_size=(10, 100) # pool_size=(4,4) input_1 = InputLayer((None, maxlen), input_var=input1_var) batchsize, seqlen = input_1.input_var.shape # input_1_mask = InputLayer((None, maxlen),input_var=input1_mask_var) emb_1 = EmbeddingLayer(input_1, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) emb_1.params[emb_1.W].remove("trainable") # (batchsize, maxlen, wordDim) reshape_1 = ReshapeLayer(emb_1, (batchsize, 1, maxlen, wordDim)) conv2d_1 = Conv2DLayer( reshape_1, num_filters=num_filters, filter_size=(filter_size), stride=stride, nonlinearity=rectify, W=GlorotUniform(), ) # (None, 100, 34, 1) maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size) # (None, 100, 1, 1) """ filter_size_2=(4, 10) pool_size_2=(2,2) conv2d_1 = Conv2DLayer(maxpool_1, num_filters=num_filters, filter_size=filter_size_2, stride=stride, nonlinearity=rectify,W=GlorotUniform()) #(None, 100, 34, 1) maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size_2) #(None, 100, 1, 1) (None, 100, 1, 20) """ forward_1 = FlattenLayer(maxpool_1) # (None, 100) #(None, 50400) input_2 = InputLayer((None, maxlen), input_var=input2_var) # input_2_mask = InputLayer((None, maxlen),input_var=input2_mask_var) emb_2 = EmbeddingLayer(input_2, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) emb_2.params[emb_2.W].remove("trainable") reshape_2 = ReshapeLayer(emb_2, (batchsize, 1, maxlen, wordDim)) conv2d_2 = Conv2DLayer( reshape_2, num_filters=num_filters, filter_size=filter_size, stride=stride, nonlinearity=rectify, W=GlorotUniform(), ) # (None, 100, 34, 1) maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size) # (None, 100, 1, 1) """ conv2d_2 = Conv2DLayer(maxpool_2, num_filters=num_filters, filter_size=filter_size_2, stride=stride, nonlinearity=rectify,W=GlorotUniform()) #(None, 100, 34, 1) maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size_2) #(None, 100, 1, 1) """ forward_2 = FlattenLayer(maxpool_2) # (None, 100) # elementwisemerge need fix the sequence length mul = ElemwiseMergeLayer([forward_1, forward_2], merge_function=T.mul) sub = AbsSubLayer([forward_1, forward_2], merge_function=T.sub) concat = ConcatLayer([mul, sub]) concat = ConcatLayer([forward_1, forward_2]) hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid) if args.task == "sts": network = DenseLayer(hid, num_units=5, nonlinearity=softmax) elif args.task == "ent": network = DenseLayer(hid, num_units=3, nonlinearity=softmax) # prediction = get_output(network, {input_1:input1_var, input_2:input2_var}) prediction = get_output(network) loss = T.mean(categorical_crossentropy(prediction, target_var)) lambda_val = 0.5 * 1e-4 layers = {conv2d_1: lambda_val, hid: lambda_val, network: lambda_val} penalty = regularize_layer_params_weighted(layers, l2) loss = loss + penalty params = get_all_params(network, trainable=True) if args.optimizer == "sgd": updates = sgd(loss, params, learning_rate=args.step) elif args.optimizer == "adagrad": updates = adagrad(loss, params, learning_rate=args.step) elif args.optimizer == "adadelta": updates = adadelta(loss, params, learning_rate=args.step) elif args.optimizer == "nesterov": updates = nesterov_momentum(loss, 
params, learning_rate=args.step) elif args.optimizer == "rms": updates = rmsprop(loss, params, learning_rate=args.step) elif args.optimizer == "adam": updates = adam(loss, params, learning_rate=args.step) else: raise "Need set optimizer correctly" # test_prediction = get_output(network, {input_1:input1_var, input_2:input2_var}, deterministic=True) test_prediction = get_output(network, deterministic=True) test_loss = T.mean(categorical_crossentropy(test_prediction, target_var)) """ train_fn = theano.function([input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var], loss, updates=updates, allow_input_downcast=True) """ train_fn = theano.function([input1_var, input2_var, target_var], loss, updates=updates, allow_input_downcast=True) if args.task == "sts": """ val_fn = theano.function([input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var], [test_loss, test_prediction], allow_input_downcast=True) """ val_fn = theano.function( [input1_var, input2_var, target_var], [test_loss, test_prediction], allow_input_downcast=True ) elif args.task == "ent": # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) test_acc = T.mean(categorical_accuracy(test_prediction, target_var)) """ val_fn = theano.function([input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var], [test_loss, test_acc], allow_input_downcast=True) """ val_fn = theano.function([input1_var, input2_var, target_var], [test_loss, test_acc], allow_input_downcast=True) return train_fn, val_fn
def _prepare(self, X, y, X_valid=None, y_valid=None, sample_weight=None, whole_dataset_in_device=True): self._stats = [] self._class_label_encoder = LabelEncoder() if self.is_classification is True: self._class_label_encoder.fit(y) self.classes_ = self._class_label_encoder.classes_ y = self._class_label_encoder.transform(y).astype(y.dtype) self.y_train_transformed = y if y_valid is not None: y_valid_transformed = self._class_label_encoder.transform( y_valid).astype(y_valid.dtype) self._l_x_in = layers.InputLayer(shape=(None, X.shape[1])) batch_index, X_batch, y_batch, batch_slice = get_theano_batch_variables( self.batch_size, y_softmax=self.is_classification) if sample_weight is not None: t_sample_weight = T.vector('sample_weight') sample_weight = sample_weight.astype(theano.config.floatX) else: t_sample_weight = T.scalar('sample_weight') if self.is_classification is True: y_dim = len(set(y.flatten().tolist())) else: y_dim = y.shape[1] self._prediction_layer = self._build_model(y_dim) self._layers = layers.get_all_layers(self._prediction_layer) self._build_prediction_functions(X_batch, self._prediction_layer) if self.input_noise_function is None: output = layers.get_output(self._prediction_layer, X_batch) else: X_batch_noisy = self.input_noise_function(X_batch) output = layers.get_output(self._prediction_layer, X_batch_noisy) if self.is_classification: loss = -T.mean(t_sample_weight * T.log(output) [T.arange(y_batch.shape[0]), y_batch]) else: loss = T.mean( t_sample_weight * T.sum((output - y_batch) ** 2, axis=1)) loss_unreg = loss all_params = layers.get_all_params(self._prediction_layer) if self._output_softener_coefs is not None: all_params.append(self._output_softener_coefs) W_params = layers.get_all_param_values( self._prediction_layer, regularizable=True) # regularization if self.L1_factor is not None: for L1_factor_layer, W in zip(self.L1_factor, W_params): loss = loss + L1_factor_layer * T.sum(abs(W)) if self.L2_factor is not None: for L2_factor_layer, W in zip(self.L2_factor, W_params): loss = loss + L2_factor_layer * T.sum(W**2) if self.optimization_method == 'nesterov_momentum': gradient_updates = updates.nesterov_momentum(loss, all_params, learning_rate=self.learning_rate, momentum=self.momentum) elif self.optimization_method == 'adadelta': # don't need momentum there gradient_updates = updates.adadelta( loss, all_params, learning_rate=self.learning_rate) elif self.optimization_method == 'adam': gradient_updates = updates.Adam( loss, all_params, learning_rate=self.learning_rate) elif self.optimization_method == 'momentum': gradient_updates = updates.momentum( loss, all_params, learning_rate=self.learning_rate, momentum=self.momentum ) elif self.optimization_method == 'adagrad': gradient_updates = updates.adadelta( loss, all_params, learning_rate=self.learning_rate) elif self.optimization_method == 'rmsprop': gradient_updates = updates.adadelta( loss, all_params, learning_rate=self.learning_rate) elif self.optimization_method == 'sgd': gradient_updates = updates.sgd( loss, all_params, learning_rate=self.learning_rate, ) else: raise Exception("wrong optimization method") nb_batches = X.shape[0] // self.batch_size if (X.shape[0] % self.batch_size) != 0: nb_batches += 1 X = X.astype(theano.config.floatX) if self.is_classification == True: y = y.astype(np.int32) else: y = y.astype(theano.config.floatX) if whole_dataset_in_device == True: X_shared = theano.shared(X, borrow=True) y_shared = theano.shared(y, borrow=True) givens = { X_batch: X_shared[batch_slice], y_batch: 
y_shared[batch_slice] } if sample_weight is not None: sample_weight_shared = theano.shared( sample_weight, borrow=True) givens[t_sample_weight] = sample_weight_shared[batch_slice] else: givens[t_sample_weight] = T.as_tensor_variable( np.array(1., dtype=theano.config.floatX)) iter_update_batch = theano.function( [batch_index], loss, updates=gradient_updates, givens=givens, ) else: if sample_weight is None: iter_update_gradients = theano.function( [X_batch, y_batch], loss, updates=gradient_updates, givens={t_sample_weight: T.as_tensor_variable( np.array(1., dtype=theano.config.floatX))}, ) def iter_update_batch(batch_index): sl = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) return iter_update_gradients(X[sl], y[sl]) else: iter_update_gradients = theano.function( [X_batch, y_batch, t_sample_weight], loss, updates=gradient_updates ) def iter_update_batch(batch_index): sl = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) return iter_update_gradients(X[sl], y[sl], sample_weight[sl]) self._iter_update_batch = iter_update_batch self._get_loss = theano.function( [X_batch, y_batch, t_sample_weight], loss_unreg, allow_input_downcast=True) def iter_update(epoch): losses = [] #self.learning_rate.set_value(self.learning_rate.get_value() * np.array(0.99, dtype=theano.config.floatX)) for i in xrange(nb_batches): losses.append(self._iter_update_batch(i)) # max norm if self.max_norm is not None: for max_norm_layer, layer in zip(self.max_norm, self._layers): layer.W = updates.norm_constraint( layer.W, self.max_norm) losses = np.array(losses) d = OrderedDict() d["epoch"] = epoch #d["loss_train_std"] = losses.std() #d["loss_train"] = losses.mean() d["loss_train"] = self._get_loss( self.X_train, self.y_train_transformed, 1.) d["accuracy_train"] = ( self.predict(self.X_train) == self.y_train).mean() if X_valid is not None and y_valid is not None: d["loss_valid"] = self._get_loss( X_valid, y_valid_transformed, 1.) 
if self.is_classification == True: d["accuracy_valid"] = ( self.predict(X_valid) == y_valid).mean() if self.verbose > 0: if (epoch % self.report_each) == 0: print(tabulate([d], headers="keys")) self._stats.append(d) return d def quitter(update_status): cur_epoch = len(self._stats) - 1 if self.patience_nb_epochs > 0: # patience heuristic (for early stopping) cur_patience_stat = update_status[self.patience_stat] if self.cur_best_patience_stat is None: self.cur_best_patience_stat = cur_patience_stat first_time = True else: first_time = False thresh = self.patience_progression_rate_threshold if cur_patience_stat < self.cur_best_patience_stat * thresh or first_time: if self.verbose >= 2: fmt = "--Early stopping-- good we have a new best value : {0}={1}, last best : epoch {2}, value={3}" print(fmt.format(self.patience_stat, cur_patience_stat, self.cur_best_epoch, self.cur_best_patience_stat)) self.cur_best_epoch = cur_epoch self.cur_best_patience_stat = cur_patience_stat if hasattr(self, "set_state") and hasattr(self, "get_state"): self.cur_best_model = self.get_state() else: self.cur_best_model = pickle.dumps( self.__dict__, protocol=pickle.HIGHEST_PROTOCOL) if (cur_epoch - self.cur_best_epoch) >= self.patience_nb_epochs: finish = True if hasattr(self, "set_state") and hasattr(self, "get_state"): self.set_state(self.cur_best_model) else: self.__dict__.update(pickle.loads(self.cur_best_model)) self._stats = self._stats[0:self.cur_best_epoch + 1] if self.verbose >= 2: print("out of patience...take the model at epoch {0} and quit".format( self.cur_best_epoch + 1)) else: finish = False return finish else: return False def monitor(update_status): pass def observer(monitor_output): pass return (iter_update, quitter, monitor, observer)
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):

    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen - kw + 1
    stride = 1

    # important context words as channels
    # CNN_sentence config
    filter_size = wordDim
    pool_size = seqlen - filter_size + 1

    input = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    # emb.params[emb.W].remove('trainable')  # (batchsize, seqlen, wordDim)
    # print get_output_shape(emb)
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))
    # print get_output_shape(reshape)

    conv1d = Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1,
                         nonlinearity=tanh, W=GlorotUniform())
    # nOutputFrame = num_filters
    # nOutputFrameSize = (num_feats*wordDim - filter_size)/stride + 1
    # print get_output_shape(conv1d)

    conv1d = DimshuffleLayer(conv1d, (0, 2, 1))
    # print get_output_shape(conv1d)

    pool_size = num_filters
    maxpool = MaxPool1DLayer(conv1d, pool_size=pool_size)
    # print get_output_shape(maxpool)

    # forward = FlattenLayer(maxpool)
    # print get_output_shape(forward)

    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4
    layers = {emb: lambda_val, conv1d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Need to set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss, updates=updates,
                               allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn, network
def update(all_grads, all_params, learning_rate): """ Compute updates from gradients """ return sgd(all_grads, all_params, learning_rate)
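# Hypothetical call pattern for update() above: compute the gradient list explicitly
# with T.grad and hand both lists to the helper. The toy regression layer is only
# for illustration.
import theano
import theano.tensor as T
from lasagne.layers import InputLayer, DenseLayer, get_output, get_all_params

x = T.matrix('x')
y = T.matrix('y')
net = DenseLayer(InputLayer((None, 5), input_var=x), num_units=1, nonlinearity=None)
loss = T.mean((get_output(net) - y) ** 2)
all_params = get_all_params(net, trainable=True)
all_grads = T.grad(loss, all_params)
train_fn = theano.function([x, y], loss, updates=update(all_grads, all_params, 0.01))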
def get_network(model): input_data = tensor.dmatrix('x') targets_var = tensor.dmatrix('y') network = layers.InputLayer((model['batch_size'], model['input_vars']), input_data) nonlin = nonlinearities.rectify if model['hidden_nonlinearity'] != 'ReLu': nonlin = nonlinearities.tanh prev_layer = network for l in range(model['nlayers']): W = None if model['hidden_nonlinearity'] == 'ReLu': W = lasagne.init.GlorotUniform('relu') else: W = lasagne.init.GlorotUniform(1) fc = layers.DenseLayer(prev_layer, model['units'], nonlinearity=nonlin, W=W) if model['dropout']: fc = layers.DropoutLayer(fc, 0.5) prev_layer = fc output_lin = None if model['output_mode'] == OUTPUT_LOG: output_lin = nonlinearities.tanh output_layer = layers.DenseLayer(prev_layer, 1, nonlinearity=output_lin) predictions = layers.get_output(output_layer) if model['output_mode'] != OUTPUT_LOG: (minth, maxth) = model['maxmin'][model['control']] maxt = theano.shared(np.ones((model['batch_size'], 1)) * maxth) mint = theano.shared(np.ones((model['batch_size'], 1)) * minth) predictions = tensor.min(tensor.concatenate([maxt, predictions], axis=1), axis=1) predictions = tensor.reshape(predictions, (model['batch_size'], 1)) predictions = tensor.max(tensor.concatenate([mint, predictions], axis=1), axis=1) predictions = tensor.reshape(predictions, (model['batch_size'], 1)) if model['output_mode'] == OUTPUT_NO: prediction_unboun = layers.get_output(output_layer) loss = objectives.squared_error(prediction_unboun, targets_var) else: loss = objectives.squared_error(predictions, targets_var) loss = objectives.aggregate(loss, mode='mean') params = layers.get_all_params(output_layer) # test_prediction = layers.get_output(output_layer, deterministic=True) #fix for dropout test_loss = objectives.squared_error(predictions, targets_var) test_loss = test_loss.mean() if model['hidden_nonlinearity'] == 'ReLu': model['lr'] *= 0.5 updates_sgd = updates.sgd(loss, params, learning_rate=model['lr']) ups = updates.apply_momentum(updates_sgd, params, momentum=0.9) train_fn = theano.function([input_data, targets_var], loss, updates=ups) pred_fn = theano.function([input_data], predictions) # pred_fn = theano.function([input_data], prediction_unboun) val_fn = theano.function([input_data, targets_var], test_loss) return { 'train': train_fn, 'eval': val_fn, 'pred': pred_fn, 'layers': output_layer }
def event_span_classifier(args, input_var, input_mask_var, target_var, wordEmbeddings, seqlen):

    print("Building model with LSTM")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]
    GRAD_CLIP = wordDim

    args.lstmDim = 150

    input = InputLayer((None, seqlen), input_var=input_var)
    batchsize, seqlen = input.input_var.shape
    input_mask = InputLayer((None, seqlen), input_var=input_mask_var)

    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    # emb.params[emb_1.W].remove('trainable')

    lstm = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask,
                     grad_clipping=GRAD_CLIP, nonlinearity=tanh)
    lstm_back = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask,
                          grad_clipping=GRAD_CLIP, nonlinearity=tanh, backwards=True)

    slice_forward = SliceLayer(lstm, indices=-1, axis=1)        # out_shape (None, args.lstmDim)
    slice_backward = SliceLayer(lstm_back, indices=0, axis=1)   # out_shape (None, args.lstmDim)

    concat = ConcatLayer([slice_forward, slice_backward])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4
    layers = {emb: lambda_val, lstm: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Need to set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, input_mask_var, target_var], loss, updates=updates,
                               allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, input_mask_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn, network
def optimizer(self, params, learning_rate): return updates.sgd(self.loss, params, learning_rate)
# hidden_layer hidden_input = conv_layer.output.flatten(2) hidden_input_shape = (conv_layer.output_shape[0], conv_layer.output_shape[1]*conv_layer.output_shape[2]*conv_layer.output_shape[3]) hidden_layer = FullConectedLayer(rng,hidden_input,hidden_input_shape[1],100) # regression_layer regession_layer = RegresstionLayer(rng,hidden_layer.output,100,1) mse = regession_layer.mse(Y) cost = mse + 0.001 * (conv_layer.L2 + hidden_layer.L2 + regession_layer.L2) params = conv_layer.params + hidden_layer.params + regession_layer.params updates = sgd(cost,params,0.01) train_model = theano.function([X,Y],[mse,cost],updates=updates) valid_model = theano.function([X,Y],[mse,cost]) showfunction = theano.function([X,Y],[hidden_input, hidden_layer.output, regession_layer.y_pred, mse, cost]) # a,b,c,d,e = showfunction(X_train[:100],Y_train_rouge1[:100]) # # print(a,b,c,d,e) patience = 0 best_valid_mse_global = 100 early_stop = 20 epoch_i = 0
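# A hypothetical continuation of the driver above: an epoch loop with the simple
# patience-based early stopping that the freshly initialised counters suggest.
# get_batches, X_valid and Y_valid_rouge1 are placeholders, not from the source.
while epoch_i < 1000 and patience < early_stop:
    epoch_i += 1
    for xb, yb in get_batches(X_train, Y_train_rouge1, batch_size=100):
        train_model(xb, yb)
    valid_mse, _ = valid_model(X_valid, Y_valid_rouge1)
    if valid_mse < best_valid_mse_global:
        best_valid_mse_global = valid_mse
        patience = 0
    else:
        patience += 1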
x = T.fmatrix() t = T.fvector() ann = network(x) prediction = get_output(ann)[:, 1] predict = function([x], outputs=prediction) loss = binary_crossentropy(prediction, t).mean() # L2 regularization if L2_REGULARIZATION: l2_penalty = ALPHA * regularize_network_params(ann, l2) loss += l2_penalty.mean() updates = sgd(loss_or_grads=loss, params=get_all_params(ann, trainable=True), learning_rate=LR) train = function([x, t], outputs=loss, updates=updates, allow_input_downcast=True, mode='FAST_COMPILE') # Load data train_data, test_data = get_data() train_data, test_data = np.float32(train_data), np.float32(test_data) # Standardize features train_data[:, :-1] = (train_data[:, :-1] - np.mean( train_data[:, :-1], axis=0)) / np.std(train_data[:, :-1], axis=0) test_data[:, :-1] = (test_data[:, :-1] - np.mean(
                      nonlinearity=softmax, W=Constant())

# Now, we can generate the symbolic expression of the network's output given an input variable.
net_input = T.matrix('net_input')
net_output = get_output(l_output, net_input)

# As a loss function, we'll use Theano's categorical_crossentropy function.
# This allows for the network output to be class probabilities,
# but the target output to be class labels.
true_output = T.ivector('true_output')
loss = T.mean(T.nnet.categorical_crossentropy(net_output, true_output))

# Retrieving all parameters of the network is done using get_all_params,
# which recursively collects the parameters of all layers connected to the provided layer.
all_params = get_all_params(l_output)

# Now, we'll generate updates using Lasagne's SGD function
updates = sgd(loss, all_params, learning_rate=0.01)

# Finally, we can compile Theano functions for training and computing the output.
training = function([net_input, true_output], loss, updates=updates)
prediction = function([net_input], net_output)

# Train for 100 epochs
print 'epoch logloss'
for k, n in enumerate(xrange(100)):
    # this is logloss
    res = training(trainT, classT)
    print '{0:3d} {1:.4f}'.format(k, res)

# Compute the predicted label of the training data.
# The argmax converts the class probability output to class label
probabilities = prediction(testT)  # normalized
prediction = np.argmax(probabilities, axis=1)
def build_network_2dconv(args, input_var, target_var, wordEmbeddings, maxlen=60):

    print("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    input = InputLayer((None, maxlen), input_var=input_var)
    batchsize, seqlen = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb.params[emb.W].remove("trainable")  # (batchsize, maxlen, wordDim)

    reshape = ReshapeLayer(emb, (batchsize, 1, maxlen, wordDim))

    conv2d = Conv2DLayer(
        reshape,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, 100, 34, 1)
    maxpool = MaxPool2DLayer(conv2d, pool_size=pool_size)  # (None, 100, 1, 1)

    forward = FlattenLayer(maxpool)  # (None, 100)  # (None, 50400)

    hid = DenseLayer(forward, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4
    layers = {conv2d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Need to set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss, updates=updates,
                               allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn
grad = theano.grad(surr, params) eval_grad1 = TT.matrix('eval_grad0', dtype=grad[0].dtype) eval_grad2 = TT.vector('eval_grad1', dtype=grad[1].dtype) eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype) eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype) eval_grad5 = TT.vector('eval_grad5', dtype=grad[4].dtype) f_train = theano.function( inputs=[observations_var, actions_var, d_rewards_var], outputs=grad) f_update = theano.function( inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5], outputs=None, updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5], params, learning_rate=learning_rate)) alla = [] for i in range(10): if (load_policy): policy.set_param_values(np.loadtxt('policy.txt'), trainable=True) avg_return = np.zeros(n_itr) #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True)) for j in range(n_itr): paths = parallel_sampler.sample_paths_on_trajectories( policy.get_param_values(), N, T, show_bar=False) #baseline.fit(paths) observations = [p["observations"] for p in paths] actions = [p["actions"] for p in paths] d_rewards = [p["rewards"] for p in paths]
from lasagne.layers import InputLayer, DenseLayer
import lasagne
from lasagne.updates import sgd, total_norm_constraint
import theano.tensor as T

x = T.matrix()
y = T.ivector()
l_in = InputLayer((5, 10))
l1 = DenseLayer(l_in, num_units=7, nonlinearity=T.nnet.softmax)
output = lasagne.layers.get_output(l1, x)
cost = T.mean(T.nnet.categorical_crossentropy(output, y))
all_params = lasagne.layers.get_all_params(l1)
all_grads = T.grad(cost, all_params)
scaled_grads = total_norm_constraint(all_grads, 5)
updates = sgd(scaled_grads, all_params, learning_rate=0.1)
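# Hypothetical final step for the gradient-clipping example above: compile the update
# and take one step on random data (shapes follow the (5, 10) InputLayer declared earlier).
import numpy as np
import theano

train_fn = theano.function([x, y], cost, updates=updates, allow_input_downcast=True)
xb = np.random.randn(5, 10)
yb = np.random.randint(0, 7, size=5)
print(train_fn(xb, yb))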