def build(params):
    image_shape = params['image_shape']
    image_layer = layers.InputLayer([params['batch_size'], image_shape[0] * image_shape[1]])
    label_layer = layers.InputLayer([params['batch_size'], params['num_classes']])

    # reweighted alpha
    reweighted_alpha = (params['alpha'] * params['num_samples_train']
                        / params['num_samples_train_label'])

    semi_vae_layer = semi_vae.SemiVAE(
        [image_layer, label_layer],
        params['num_units_hidden_common'],
        params['dim_z'],
        reweighted_alpha
    )

    sym_label_images = T.matrix('label_images')
    sym_label_labels = T.matrix('label_labels')
    sym_unlabel_images = T.matrix('unlabel_images')

    cost_for_label = semi_vae_layer.get_cost_for_label([sym_label_images, sym_label_labels])
    cost_for_unlabel = semi_vae_layer.get_cost_for_unlabel(sym_unlabel_images)
    cost_together = semi_vae_layer.get_cost_together([sym_label_images, sym_label_labels, sym_unlabel_images])
    cost_test, acc_test = semi_vae_layer.get_cost_test([sym_label_images, sym_label_labels])

    network_params = semi_vae_layer.get_params()
    for param in network_params:
        print(param, param.get_value().shape)

    update_for_label = updates.adam(cost_for_label, network_params)
    update_for_unlabel = updates.adam(cost_for_unlabel, network_params)
    update_together = updates.adam(cost_together, network_params, learning_rate=3e-4)

    fn_train = theano.function([sym_label_images, sym_label_labels, sym_unlabel_images],
                               cost_together,
                               updates=update_together,
                               on_unused_input='warn')

    '''
    fn_for_label = theano.function([sym_label_images, sym_label_labels],
                                   cost_for_label,
                                   updates=update_for_label,
                                   #on_unused_input='warn',
                                   )

    fn_for_unlabel = theano.function([sym_unlabel_images],
                                     cost_for_unlabel,
                                     updates=update_for_unlabel,
                                     #on_unused_input='warn'
                                     )
    '''
    fn_for_label = None
    fn_for_unlabel = None

    fn_for_test = theano.function([sym_label_images, sym_label_labels], [cost_test, acc_test])

    return semi_vae_layer, fn_for_label, fn_for_unlabel, fn_train, fn_for_test
def build_theano_fn_simple(self):
    print '%s build theano fn simple' % self.rank
    x = T.fmatrix('x')
    y = T.ivector('y')

    W_1, b_1 = common.init_tparams_fc(784, 1000, 'l1')
    out_1 = T.tanh(T.dot(x, W_1) + b_1)

    W_2, b_2 = common.init_tparams_fc(1000, 2000, 'l2')
    out_2 = T.tanh(T.dot(out_1, W_2) + b_2)

    W_3, b_3 = common.init_tparams_fc(2000, 3000, 'l3')
    out_3 = T.tanh(T.dot(out_2, W_3) + b_3)

    W_4, b_4 = common.init_tparams_fc(3000, 10, 'softmax')
    prob = T.nnet.softmax(T.dot(out_3, W_4) + b_4)

    self.params = [W_1, b_1, W_2, b_2, W_3, b_3, W_4, b_4]

    # cost
    cost = -T.log(prob[T.arange(prob.shape[0]), y] + 1e-6).mean()
    pred = T.argmax(prob, 1)

    grads = T.grad(cost, self.params)
    grads_all_reduced = self.grad_all_reduce(grads)
    updates = adam(grads_all_reduced, self.params)
    #updates = adam(grads, self.params)

    self.train_fn = theano.function([x, y], [cost, prob, pred, y],
                                    updates=updates, accept_inplace=True)

    # the following code is used for debugging only
    if model == 'debug':
        self.debug_var = theano.shared(numpy.float32(1.))
        debug_var_global = AllReduceSum(self.debug_var, inplace=True, worker=self.worker)
        updates = {self.debug_var: debug_var_global}
        self.debug_fn = theano.function([], [], updates=updates, accept_inplace=True)
def generate_theano_func(args, network, penalty, input_dict, target_var):

    prediction = get_output(network, input_dict)

    # loss = T.mean( target_var * ( T.log(target_var) - prediction ))
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    # loss += 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(network) )
    # penalty = sum ( T.sum(lstm_param**2) for lstm_param in lstm_params )
    # penalty = regularize_layer_params(l_forward_1_lstm, l2)
    # penalty = T.sum(lstm_param**2 for lstm_param in lstm_params)
    # penalty = 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(l_forward_1) )
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Optimizer is not set correctly: %s" % args.optimizer)

    test_prediction = get_output(network, input_dict, deterministic=True)
    # test_prediction = get_output(network, deterministic=True)
    # test_loss = T.mean( target_var * ( T.log(target_var) - test_prediction))
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    train_fn = theano.function(
        [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_prediction],
            allow_input_downcast=True,
        )
    elif args.task == "ent":
        # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_acc],
            allow_input_downcast=True,
        )

    return train_fn, val_fn
def create_updates(loss, network, opt, learning_rate, momentum, beta1, beta2):
    params = lasagne.layers.get_all_params(network, trainable=True)
    grads = theano.grad(loss, params)
    # if max_norm:
    #     names = ['crf.U', 'crf.W_h', 'crf.W_c', 'crf.b']
    #     constraints = [grad for param, grad in zip(params, grads) if param.name in names]
    #     assert len(constraints) == 4
    #     scaled_grads = total_norm_constraint(constraints, max_norm=max_norm)
    #     counter = 0
    #     for i in xrange(len(params)):
    #         param = params[i]
    #         if param.name in names:
    #             grads[i] = scaled_grads[counter]
    #             counter += 1
    #     assert counter == 4
    if opt == 'adam':
        updates = adam(grads, params=params, learning_rate=learning_rate, beta1=beta1, beta2=beta2)
    elif opt == 'momentum':
        updates = nesterov_momentum(grads, params=params, learning_rate=learning_rate, momentum=momentum)
    else:
        raise ValueError('unknown optimization algorithm: %s' % opt)

    return updates
def prep_train(alpha=0.0002, nz=100):
    E, D = build_net(nz=nz)

    x = T.tensor4('x')

    # Get outputs z=E(x), x_hat=D(z)
    encoding = get_output(E, x)
    decoding = get_output(D, encoding)

    # Get parameters of E and D
    params_e = get_all_params(E, trainable=True)
    params_d = get_all_params(D, trainable=True)
    params = params_e + params_d

    # Calc cost and updates
    cost = T.mean(squared_error(x, decoding))
    grad = T.grad(cost, params)
    updates = adam(grad, params, learning_rate=alpha)

    train = theano.function(inputs=[x], outputs=cost, updates=updates)
    rec = theano.function(inputs=[x], outputs=decoding)
    test = theano.function(inputs=[x], outputs=cost)

    return train, test, rec, E, D
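# A minimal driver sketch for the functions returned by prep_train above. It is
# illustrative only: the batch shape (32 RGB images of 64x64) is an assumption,
# not something stated by the original code.
import numpy as np

train, test, rec, E, D = prep_train(alpha=0.0002, nz=100)

batch = np.random.rand(32, 3, 64, 64).astype('float32')  # assumed input shape
for epoch in range(10):
    cost = train(batch)       # one Adam step on the reconstruction loss
final_cost = test(batch)      # evaluate the loss without updating parameters
reconstructions = rec(batch)  # x_hat = D(E(x))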
def set_train_data(self, train_data, train_target):
    self.train_data = theano.shared(
        np.asarray(train_data, dtype=theano.config.floatX))
    self.train_target = theano.shared(
        np.asarray(train_target, dtype=theano.config.floatX))

    i = T.iscalar()
    sigma_prior = T.exp(-3)
    learning_rate = 0.001
    batch_size = 100

    if self.learning_task == "classification":
        objective = self.cross_entropy(batch_size=batch_size, sigma_prior=sigma_prior)
    elif self.learning_task == "regression":
        objective = self.mean_square_loss(batch_size=batch_size, sigma_prior=sigma_prior)

    # train function setting
    updates = adam(objective, self.all_params, learning_rate=learning_rate)
    self.train_function = theano.function(
        inputs=[i],
        outputs=objective,
        updates=updates,
        givens={
            self.x: self.train_data[i * batch_size:(i + 1) * batch_size],
            self.y: self.train_target[i * batch_size:(i + 1) * batch_size]
        })
    self.n_train_batches = int(self.train_data.get_value().shape[0] / float(batch_size))
def get_updates(nnet, train_obj, trainable_params):

    implemented_solvers = ("nesterov", "adagrad", "adadelta", "adam")

    if not hasattr(nnet, "solver") or nnet.solver not in implemented_solvers:
        nnet.sgd_solver = "nesterov"
    else:
        nnet.sgd_solver = nnet.solver

    if nnet.sgd_solver == "nesterov":
        updates = l_updates.nesterov_momentum(train_obj, trainable_params,
                                              learning_rate=Cfg.learning_rate,
                                              momentum=0.9)
    elif nnet.sgd_solver == "adagrad":
        updates = l_updates.adagrad(train_obj, trainable_params,
                                    learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "adadelta":
        updates = l_updates.adadelta(train_obj, trainable_params,
                                     learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "adam":
        updates = l_updates.adam(train_obj, trainable_params,
                                 learning_rate=Cfg.learning_rate)

    return updates
def getFunctions(pixel, LR=0.001):
    X = T.tensor4('X')
    Y = T.ivector('y')

    # set up theano functions to generate output by feeding data through network,
    # any test outputs should be deterministic
    output_layer = ZFTurboNet(pixel, X)
    output_train = lasagne.layers.get_output(output_layer)
    output_test = lasagne.layers.get_output(output_layer, deterministic=True)

    # set up the loss that we aim to minimize; when using cat cross entropy our Y should be ints, not one-hot
    loss = lasagne.objectives.categorical_crossentropy(output_train, Y)
    penalty = lasagne.regularization.regularize_layer_params(output_layer, l1) * 5e-4
    loss = loss + penalty
    loss = loss.mean()

    # set up loss functions for validation dataset
    valid_loss = lasagne.objectives.categorical_crossentropy(output_test, Y)
    valid_loss = valid_loss.mean()

    valid_acc = T.mean(T.eq(T.argmax(output_test, axis=1), Y), dtype=theano.config.floatX)

    # get parameters from network and set up the Adam update rule
    params = lasagne.layers.get_all_params(output_layer, trainable=True)
    updates = adam(loss, params, learning_rate=LR)

    # set up training and prediction functions
    train_fn = theano.function(inputs=[X, Y], outputs=loss, updates=updates)
    valid_fn = theano.function(inputs=[X, Y], outputs=[valid_loss, valid_acc])

    # set up prediction function
    predict_proba = theano.function(inputs=[X], outputs=output_test)

    return train_fn, valid_fn, predict_proba, output_layer
def build_updates(grad, params, optimization, learning_rate):
    """ setup optimization algorithm """

    if optimization['optimizer'] == 'sgd':
        update_op = updates.sgd(grad, params, learning_rate=learning_rate)

    elif optimization['optimizer'] == 'nesterov_momentum':
        if 'momentum' in optimization:
            momentum = optimization['momentum']
        else:
            momentum = 0.9
        update_op = updates.nesterov_momentum(grad, params, learning_rate=learning_rate, momentum=momentum)

    elif optimization['optimizer'] == 'adagrad':
        update_op = updates.adagrad(grad, params, learning_rate=learning_rate)

    elif optimization['optimizer'] == 'rmsprop':
        if 'rho' in optimization:
            rho = optimization['rho']
        else:
            rho = 0.9
        update_op = updates.rmsprop(grad, params, learning_rate=learning_rate, rho=rho)

    elif optimization['optimizer'] == 'adam':
        if 'beta1' in optimization:
            beta1 = optimization['beta1']
        else:
            beta1 = 0.9
        if 'beta2' in optimization:
            beta2 = optimization['beta2']
        else:
            beta2 = 0.999
        update_op = updates.adam(grad, params, learning_rate=learning_rate, beta1=beta1, beta2=beta2)

    return update_op
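# A small sketch of how build_updates above might be wired into a compiled
# training function. Everything except build_updates itself (and the lasagne
# `updates` module it relies on) is made up here for illustration: the tiny
# softmax-regression loss exists only to give Adam something to optimize.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
y = T.ivector('y')

# assumed toy model: softmax regression over 20 features, 3 classes
W = theano.shared(np.zeros((20, 3), dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='b')
prob = T.nnet.softmax(T.dot(x, W) + b)
loss = T.nnet.categorical_crossentropy(prob, y).mean()

params = [W, b]
grad = T.grad(loss, params)

optimization = {'optimizer': 'adam', 'beta1': 0.9, 'beta2': 0.999}
update_op = build_updates(grad, params, optimization, learning_rate=1e-3)
train_fn = theano.function([x, y], loss, updates=update_op)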
def optimize(grads, params):
    if state['optim_method'] == 'adam':
        updates = adam(grads, params, lrt, state['momentum'])
    elif state['optim_method'] == 'adagrad':
        updates = adagrad(grads, params, lrt)
    elif state['optim_method'] == 'sgd':
        updates = sgd(grads, params, lrt)
    return updates
def prep_train(alpha=0.002, beta1=0.5, beta2=0.9, nz=200):
    G, D = build_net(nz=nz)

    x = T.tensor4('x')
    z = T.matrix('z')

    # get network output for D and G
    G_z = get_output(G, z)
    D_G_z = get_output(D, G_z)  # fake
    D_x = get_output(D, x)      # real

    # create new variable e to sample X along straight lines
    e = T.TensorType(dtype=floatX, broadcastable=(False, True, True, True))()
    mixed_X = (e * G_z) + (1 - e) * x
    output_D_mixed = get_output(D, mixed_X)

    # compute gradients + penalty
    grad_mixed = T.grad(T.sum(output_D_mixed), mixed_X)
    norm_grad_mixed = T.sqrt(T.sum(T.square(grad_mixed), axis=[1, 2, 3]))
    grad_penalty = T.mean(T.square(norm_grad_mixed - 1))

    # get parameters
    params_d = get_all_params(D, trainable=True)
    params_g = get_all_params(G, trainable=True)

    # compute losses for the discriminator J_D and the generator J_G
    J_D = D_G_z.mean() - D_x.mean() + 10 * grad_penalty
    J_G = -D_G_z.mean()

    # update parameters for both
    update_D = adam(J_D, params_d, learning_rate=alpha, beta1=beta1, beta2=beta2)
    update_G = adam(J_G, params_g, learning_rate=alpha, beta1=beta1, beta2=beta2)

    # define training functions
    train_G = theano.function(inputs=[z], outputs=J_G, updates=update_G)
    train_D = theano.function(inputs=[x, z, e], outputs=J_D, updates=update_D)

    return train_G, train_D, G, D
def __init__(self,
             steps=1,
             num_layers=2,
             num_units=32,
             eps=1e-2,
             recurrent=False,
             nonlinearity=tanh,
             ):
    self.steps = steps
    self.X = T.fmatrix()
    self.Y = T.fmatrix()

    def network(l):
        if recurrent:
            l = ReshapeLayer(l, shape=(-1, steps, 1))
            l = LSTMLayer(l, num_units)
        for k in range(num_layers):
            l = DenseLayer(l, num_units=num_units, nonlinearity=nonlinearity)
        l = DenseLayer(l, num_units=1, nonlinearity=linear)
        return l

    self.network = network

    l = InputLayer(input_var=self.X, shape=(None, steps))
    l = self.network(l)
    self.l_ = l

    self.x_ = get_output(self.l_)
    self.f = theano.function([self.X], self.x_, allow_input_downcast=True)

    l2_penalty = regularize_network_params(l, L2)
    error = squared_error(self.x_, self.Y).mean()
    loss = error + eps * l2_penalty

    params = get_all_params(l)
    updates = adam(loss, params)

    self.error = theano.function([self.X, self.Y], error, allow_input_downcast=True)
    self.train = theano.function([self.X, self.Y], loss, updates=updates, allow_input_downcast=True)
def __init__(self, labels, g=0.1, m=0.01, feature_dimension=128, n_codewords=16,
             n_feature_samples=100, eta=0.01):
    """
    The labels of the objects used for the optimization. The objects must be in the same order
    when the fit function is called
    :param labels: labels of the objects used for the optimization
    :param g: BoW quantization parameter
    :param m: entropy softness parameter
    :param feature_dimension: dimension of the extracted feature vectors
    :param n_codewords: number of codewords in the dictionary
    :param n_feature_samples: number of feature vectors to use in each iteration
    :param eta: learning rate
    """
    SoftBoW.__init__(self, g=g, feature_dimension=feature_dimension, n_codewords=n_codewords)

    self.entropy = SoftEntropy(m=m, labels=labels)
    self.entropy_loss = None
    self.learning_rate = eta
    self.n_feature_samples = n_feature_samples

    # Histograms
    self.S = self._sym_histograms(self.X)

    # Entropy loss
    self.entropy_loss = self.entropy._sym_entropy(self.S)

    # Compile loss function
    self.calculate_loss_theano = theano.function([self.X], self.entropy_loss)

    # Define gradients w.r.t. V (and take care of NaNs)
    entropy_grad = T.grad(self.entropy_loss, self.S)
    entropy_grad = T.switch(T.isnan(entropy_grad), 0, entropy_grad)
    dictionary_grad = T.grad(self.entropy._sym_entropy(self.S), self.V,
                             known_grads={self.S: entropy_grad})
    dictionary_grad = T.switch(T.isnan(dictionary_grad), 0, dictionary_grad)

    # Define and compile the training function
    self.updates = adam([dictionary_grad], [self.V], learning_rate=self.learning_rate)
    self.train_theano = theano.function(inputs=[self.X], outputs=[self.entropy_loss],
                                        updates=self.updates)
def buildFunctions(net, input_var, target_var):
    params = lasagne.layers.get_all_params(net['h0_inv'], trainable=True)
    out = lasagne.layers.get_output(net['h0_inv'], deterministic=True)
    loss = lasagne.objectives.squared_error(out, target_var)
    adam_update = adam(loss.mean(), params)
    train_function = theano.function([input_var, target_var], loss, updates=adam_update)
    return train_function
def train_function(self, semi_supervised= True, unlabel_stable=False): ''' use_unlabel == True, semi-superviesd learning return: train function for 1 epoch use ''' self.semi_supervised = semi_supervised sym_klw = T.scalar('sym_klw',dtype=theano.config.floatX) # symbolic scalar of warming up sym_cw = T.scalar('sym_cw',dtype=theano.config.floatX) # classifier warm up sym_s = T.matrix('sym_s',dtype='int64') sym_mask = T.matrix('sym_mask',dtype=theano.config.floatX) sym_y = T.matrix('sym_label',dtype=theano.config.floatX) sym_s_u = T.matrix('sym_s_u',dtype='int64') sym_mask_u = T.matrix('sym_mask_u', dtype=theano.config.floatX) num_l, num_u = sym_s.shape[0].astype(theano.config.floatX), 0.0 if self.semi_supervised: print 'Train with unlabel data.' num_u = sym_s_u.shape[0].astype(theano.config.floatX) #get labeled/unlabeled cost outs1 = self.cost_label([sym_s, sym_mask, sym_y], dev_stage=False, return_mode = 'mean') loss_recons, loss_kl, valid_words, word_drop_num, loss_classifier, batch_ppl, acc = outs1 loss_recons_u, loss_kl_u,loss_entropy_u, batch_ppl_u = 0.0,0.0,0.0,0.0 valid_words_u = 0 if self.semi_supervised: outs2 = self.cost_unlabel([sym_s_u, sym_mask_u], dev_stage=unlabel_stable, sample_by_prob=self.sample_unlabel) loss_recons_u, loss_kl_u, valid_words_u, loss_entropy_u, batch_ppl_u = outs2 ''' total Loss: L = Loss_labeled(s,mask,y) + beta*(n_l+n_u)/n_l * Loss_classisifer(s,mask,y) + Loss_unlabel(s_u, mask_u) L = recons_term + sym_klw_term + loss_classifier_term - loss_entropy_u ''' alpha = sym_cw * self.cost_beta * ( num_l + num_u ) / num_l total_cost = loss_recons * num_l + loss_recons_u * num_u\ + sym_klw * ( loss_kl * num_l + loss_kl_u * num_u)\ + alpha * loss_classifier * num_l\ - loss_entropy_u * num_u total_cost /= (num_l + num_u) train_params = self.get_params(only_trainable=True) all_grads = theano.grad(total_cost,train_params) all_grads = [T.clip(g, -self.grad_clipping, self.grad_clipping) for g in all_grads] all_grads = total_norm_constraint( all_grads, max_norm=self.max_norm ) #all_grads = [T.clip(g, -self.grad_clipping, self.grad_clipping) for g in all_grads] updates = adam(all_grads,train_params, self.lr, self.beta1, self.beta2) if self.semi_supervised: train_input = [sym_s, sym_mask, sym_y, sym_s_u, sym_mask_u, sym_klw, sym_cw] train_output = [total_cost, loss_recons, loss_recons_u, loss_kl, loss_kl_u, alpha, loss_classifier, loss_entropy_u, batch_ppl, batch_ppl_u, valid_words, valid_words_u, word_drop_num, acc] else: train_input = [sym_s, sym_mask, sym_y, sym_klw, sym_cw] train_output = [total_cost, loss_recons, loss_kl, loss_classifier, batch_ppl, valid_words, word_drop_num, acc] train_f = theano.function(inputs=train_input, outputs=train_output,updates=updates, name='train_function') return train_f
def build_training_function(self):
    self._loss = self.get_loss_function()
    self._params = get_all_params(self.network, trainable=True)
    self._updates_net = adam(self._loss, self._params,
                             learning_rate=self.learning_rate, beta1=0.)
    return theano.function(
        [self.states, self.actions, self.next_states, self.rewards],
        self._loss,
        updates=self._updates_net)
def contrastive_loss_iter(embedder, update_params={}):
    X_pairs = {
        'img1': T.tensor4(),
        'img2': T.tensor4(),
    }
    y = T.ivector()  # basically class labels

    final_emb_layer = embedder[-1]
    all_layers = ll.get_all_layers(embedder)
    imwrite_architecture(all_layers, './layer_rep.png')
    # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred)
    # another assumption (which must hold when the network is being made)
    # the last prediction layer is a) the end of the network and b) what we ultimately care about
    # however the other prediction layers will be incorporated into the training loss
    predicted_embeds_train = {k: ll.get_output(embedder, X)[-1] for k, X in X_pairs.items()}
    predicted_embeds_valid = {k: ll.get_output(final_emb_layer, X, deterministic=True) for k, X in X_pairs.items()}

    margin = 1  # if distance is 0 that's bad
    distance = lambda pred: (pred['img1'] - pred['img2'] + 1e-7).norm(2, axis=1)
    contrastive_loss = lambda pred: T.mean(y*(distance(pred)) + (1 - y)*(margin - distance(pred)).clip(0, np.inf))
    failed_matches = lambda pred: T.switch(T.eq(T.sum(y), 0), 0, T.sum((y*distance(pred)) > margin) / T.sum(y))
    failed_nonmatches = lambda pred: T.switch(T.eq(T.sum(1-y), 0), 0, T.sum((1-y*distance(pred)) < margin) / T.sum(1-y))
    failed_pairs = lambda pred: 0.5*failed_matches(pred) + 0.5*failed_nonmatches(pred)

    decay = 0.0001
    reg = regularize_network_params(final_emb_layer, l2) * decay
    losses_reg = lambda pred: contrastive_loss(pred) + reg
    loss_train = losses_reg(predicted_embeds_train)
    loss_train.name = 'CL'  # for the names
    #all_params = list(chain(*[ll.get_all_params(pred) for pred in embedder]))
    all_params = ll.get_all_params(embedder, trainable=True)  # this should work with multiple 'roots'
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)
    #updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum'])

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function([X_pairs['img1'], X_pairs['img2'], y], [loss_train] + grads, updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True)
    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X_pairs['img1'], X_pairs['img2'], y], [
        contrastive_loss(predicted_embeds_valid),
        losses_reg(predicted_embeds_valid),
        failed_pairs(predicted_embeds_valid)])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train': train_iter, 'valid': valid_iter, 'gradnames': [g.name for g in grads]}
def triplet_loss_iter(embedder, update_params={}): X_triplets = { 'anchor':T.tensor4(), 'positive':T.tensor4(), 'negative':T.tensor4(), } # each will be a batch of images final_emb_layer = embedder[-1] all_layers = ll.get_all_layers(embedder) imwrite_architecture(all_layers, './layer_rep.png') # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred) # another assumption (which must hold when the network is being made) # the last prediction layer is a) the end of the network and b) what we ultimately care about # however the other prediction layers will be incorporated into the training loss predicted_embeds_train = {k:ll.get_output(embedder, X)[-1] for k, X in X_triplets.items()} predicted_embeds_valid = {k:ll.get_output(final_emb_layer, X, deterministic=True) for k, X in X_triplets.items()} # each output should be batch_size x embed_size # should give us a vector of batch_size of distances btw anchor and positive alpha = 0.2 # FaceNet alpha triplet_pos = lambda pred: (pred['anchor'] - pred['positive']).norm(2,axis=1) triplet_neg = lambda pred: (pred['anchor'] - pred['negative']).norm(2,axis=1) triplet_distances = lambda pred: (triplet_pos(pred) - triplet_neg(pred) + alpha).clip(0, np.inf) triplet_failed = lambda pred: T.mean(triplet_distances(pred) > alpha) triplet_loss = lambda pred: T.sum(triplet_distances(pred)) decay = 0.001 reg = regularize_network_params(final_emb_layer, l2) * decay losses_reg = lambda pred: triplet_loss(pred) + reg loss_train = losses_reg(predicted_embeds_train) loss_train.name = 'TL' # for the names #all_params = list(chain(*[ll.get_all_params(pred) for pred in embedder])) all_params = ll.get_all_params(embedder, trainable=True) # this should work with multiple 'roots' grads = T.grad(loss_train, all_params, add_names=True) updates = adam(grads, all_params) #updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum']) print("Compiling network for training") tic = time.time() train_iter = theano.function([X_triplets['anchor'], X_triplets['positive'], X_triplets['negative']], [loss_train] + grads, updates=updates) toc = time.time() - tic print("Took %0.2f seconds" % toc) #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True) print("Compiling network for validation") tic = time.time() valid_iter = theano.function([X_triplets['anchor'], X_triplets['positive'], X_triplets['negative']], [triplet_loss(predicted_embeds_valid), losses_reg(predicted_embeds_valid), triplet_failed(predicted_embeds_valid)]) toc = time.time() - tic print("Took %0.2f seconds" % toc) return {'train':train_iter, 'valid':valid_iter, 'gradnames':[g.name for g in grads]}
def __init__ (self): self.learning_rate = 0.001 self.L1_reg = 0.0000 self.L2_reg = 0.0001 self.n_hidden = 50 self.num_inputs = 20 self.num_outputs = 1 # allocate symbolic variables for the data x = T.ivector('x') y = T.iscalar('y') rng = np.random.RandomState(None) # construct the neural network's Architecture architecture = Architecture( rng=rng, input=[x], n_in=self.num_inputs, n_hidden=self.n_hidden, n_out=self.num_outputs ) cost = ( architecture.error_function(y) + self.L1_reg * architecture.L1 + self.L2_reg * architecture.L2_sqr ) #stochastic gradient descent with adaptive learning with Lasagne--using Adam updates = adam(cost, architecture.params, learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08) # backpropogation that also contains a forward pass self.train_model = theano.function( inputs=[x, y], outputs=[cost, architecture.get_result()], updates=updates, allow_input_downcast=True ) # forward pass self.run_model = theano.function( inputs=[x], outputs=architecture.get_result(), allow_input_downcast=True ) self.grab_weights = theano.function( inputs=[], outputs=architecture.params, allow_input_downcast=True )
def loss_iter(segmenter, update_params={}): X = T.tensor4() y = T.tensor4() pixel_weights = T.tensor3() final_pred_layer = segmenter[-1] all_layers = ll.get_all_layers(segmenter) imwrite_architecture(all_layers, './layer_rep.png') # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred) # another assumption (which must hold when the network is being made) # the last prediction layer is a) the end of the network and b) what we ultimately care about # however the other prediction layers will be incorporated into the training loss predicted_masks_train = ll.get_output(segmenter, X) predicted_mask_valid = ll.get_output(final_pred_layer, X, deterministic=True) thresh = 0.5 accuracy = lambda pred: T.mean(T.eq(T.argmax(pred, axis=1), T.argmax(y, axis=1))) true_pos = lambda pred: T.sum((pred[:,0,:,:] > thresh) * (y[:,0,:,:] > thresh)) false_pos = lambda pred: T.sum((pred[:,0,:,:] > thresh) - (y[:,0,:,:] > thresh)) precision = lambda pred: (true_pos(pred) / (true_pos(pred) + false_pos(pred))) pixel_weights_1d = pixel_weights.flatten(ndim=1) losses = lambda pred: T.mean(crossentropy_flat(pred + 1e-7, y + 1e-7) * pixel_weights_1d) decay = 0.0001 reg = regularize_network_params(final_pred_layer, l2) * decay losses_reg = lambda pred: losses(pred) + reg loss_train = T.sum([losses_reg(mask) for mask in predicted_masks_train]) loss_train.name = 'CE' # for the names #all_params = list(chain(*[ll.get_all_params(pred) for pred in segmenter])) all_params = ll.get_all_params(segmenter, trainable=True) # this should work with multiple 'roots' grads = T.grad(loss_train, all_params, add_names=True) updates = adam(grads, all_params) #updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum']) acc_train = accuracy(predicted_masks_train[-1]) acc_valid = accuracy(predicted_mask_valid) prec_train = precision(predicted_masks_train[-1]) prec_valid = precision(predicted_mask_valid) print("Compiling network for training") tic = time.time() train_iter = theano.function([X, y, pixel_weights], [loss_train] + grads, updates=updates) toc = time.time() - tic print("Took %0.2f seconds" % toc) #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True) print("Compiling network for validation") tic = time.time() valid_iter = theano.function([X, y, pixel_weights], [losses(predicted_mask_valid), losses_reg(predicted_mask_valid), prec_valid]) toc = time.time() - tic print("Took %0.2f seconds" % toc) return {'train':train_iter, 'valid':valid_iter, 'gradnames':[g.name for g in grads]}
def optimize(content_targets, vgg_path, im_size, epochs=2, period=1000, batch_size=4, save_path='saver/fns.ckpt', learning_rate=1e-3, checkpoint_model=None): assert content_targets.shape[1:] == (3, *im_size) print('=== CREATE VGG NET ===') images = T.ftensor4() vgg_net = vgg.Net(im_size) vgg_net.set_params(vgg.load_params(vgg_path)) print('=== CREATE EMUL NET ===') emul_net = emul.Net(im_size) if checkpoint_model is not None: emul_net.set_params(checkpoint_model) content_losses = [] for layer in vgg.CONTENT_LAYERS + vgg.STYLE_LAYERS: content_vgg_features = vgg_net(images, layer) content_emul_features = emul_net(images, layer) size = content_emul_features.size loss = l2_loss(content_vgg_features, content_emul_features) / size content_losses.append(loss) loss = sum(content_losses) updates = adam(loss, emul_net.get_params(False), learning_rate) print('=== FUNCTION COMPILE ===') train_fn = theano.function([images], loss, updates=updates) valid_fn = theano.function([images], loss) print('=== START TRAIN ===') it = 0 time_in_train = 0 for epoch in range(epochs): for i in range(0, len(content_targets), batch_size): batch = content_targets[i:i + batch_size] batch = np.float32(batch) start = time() loss = train_fn(batch) time_in_train += time() - start it += 1 if it % period == 0 or i + batch_size >= len(content_targets): print('Time in train: %1.3lf' % time_in_train) time_in_train = 0 save(save_path, emul_net.get_params()) yield epoch, it, loss
def create_spotlight_fn(final_layer, blur_axes, free_axes, weight_axes, trials_shape): ones_shape = [trials_shape[i_ax] if i_ax in blur_axes + free_axes else 1 for i_ax in xrange(len(trials_shape))] means_stds_shape = [trials_shape[i_ax] if i_ax in free_axes else 1 for i_ax in xrange(len(trials_shape))] means_stds_shape = [len(blur_axes)] + means_stds_shape #toadd: mixture of gaussians full_mask = T.ones(ones_shape, dtype=np.float32) broadcast_pattern = [True if ax not in (free_axes) else False for ax in xrange(len(trials_shape))] broadcast_pattern = [False] + broadcast_pattern means = theano.shared((np.ones(means_stds_shape)* 0.5).astype(np.float32), broadcastable=broadcast_pattern) stds = theano.shared((np.ones(means_stds_shape)* 1).astype(np.float32), broadcastable=broadcast_pattern) for i_blur_axis, axis in enumerate(blur_axes): ax_mask = T.constant(np.linspace(0,1, trials_shape[axis], dtype=np.float32)) dimshuffle_pattern = [0 if ax == axis else 'x' for ax in xrange(len(trials_shape))] ax_mask = ax_mask.dimshuffle(*dimshuffle_pattern) # todo maybe have to fix this here? ax_gaussian = T.exp(-T.square((ax_mask - means[i_blur_axis]) / stds[i_blur_axis]) * 0.5) full_mask = full_mask * ax_gaussian weights_shape = [trials_shape[i_ax] if i_ax in weight_axes else 1 for i_ax in xrange(1,len(trials_shape))] weights_shape = [trials_shape[0]] + weights_shape broadcast_pattern = [True if ax not in (weight_axes) else False for ax in xrange(1, len(trials_shape))] broadcast_pattern = [False] + broadcast_pattern weights = theano.shared((np.ones(weights_shape)).astype(np.float32), broadcastable=broadcast_pattern) full_mask = full_mask * (T.maximum(weights,0) / T.mean(T.maximum(weights,0), axis=0, keepdims=True)) trials_var = T.ftensor4() scaled_trials = trials_var * full_mask targets = T.ivector() outputs = lasagne.layers.get_output(final_layer, inputs=scaled_trials, input_var=scaled_trials) loss = categorical_crossentropy(outputs, targets).sum() loss += T.mean(T.sqr(stds)) * 0.1 loss -= T.mean(T.abs_(weights - T.mean(weights, axis=0, keepdims=True))) * 10 adam_updates = adam(loss,[means, stds, weights], learning_rate=0.01) adam_grad_fn = theano.function([trials_var, targets], [loss,outputs, scaled_trials, full_mask, weights], updates=adam_updates) return adam_grad_fn
def run(self, parameter, parameterName, loss, **kwargs):
    pVar = parameter.getVar()
    gparam = tt.grad(loss, pVar)

    updates = LUP.adam(
        [gparam],
        [pVar],
        learning_rate=self.getHP("lr"),
        beta1=self.getHP("beta1"),
        beta2=self.getHP("beta2"),
        epsilon=self.getHP("epsilon"))

    ret = OptimizerResult(pVar, parameterName, gparam, updates[pVar])

    i = 0
    for param, update in updates.items():
        if param is not pVar:
            name = "%s_adam_%s" % (parameterName, i)
            ret.addCoParameter(param, name, None, update)
            i += 1

    return ret
def _build(self, X, y): """ Builds the network and associated training functions, for the specific shapes of the inputs """ n_x = X.shape[-1] n_y = y.shape[-1] n_c = X.shape[1] # Defining input layers self.l_x = InputLayer(shape=(self.batch_size, n_c, n_x, n_x), input_var=self._x, name='x') self.l_y = InputLayer(shape=(self.batch_size, n_y), input_var=self._y, name='y') net = self._model_definition(self.l_x) # Output classifier out = DenseLayer(net, num_units=n_y, nonlinearity=identity) self._network = NonlinearityLayer(out, nonlinearity=sigmoid) # Compute network loss q, p = get_output([out, self.l_y], inputs={ self.l_x: self._x, self.l_y: self._y }) # Define loss function loss = weighted_sigmoid_binary_crossentropy(q, p, self.pos_weight) # Average over batch loss = loss.mean() # Get trainable parameters and generate updates params = get_all_params([self._network], trainable=True) grads = T.grad(loss, params) updates = adam(grads, params, learning_rate=self._lr) self._trainer = theano.function([self._x, self._y, self._lr], [loss], updates=updates) # Get detection probability from the network qdet = get_output(self._network, inputs={self.l_x: self._x}, deterministic=True) self._output = theano.function([self._x], qdet)
def train_expectation_function(self): ''' unlabeled data train with expection ''' print "Train Function: Calculate the Expectation of unlabeled data." sym_klw = T.scalar('sym_klw', dtype=theano.config.floatX) # symbolic scalar of warming up sym_sents = T.matrix('sym_s', dtype='int64') sym_mask = T.matrix('sym_mask', dtype=theano.config.floatX) # one hot! sym_label = T.matrix('sym_label', dtype=theano.config.floatX) sym_sents_u = T.matrix('sym_s_u', dtype='int64') sym_mask_u = T.matrix('sym_mask_u', dtype=theano.config.floatX) num_l, num_u = sym_sents.shape[0].astype(theano.config.floatX), \ sym_sents_u.shape[0].astype(theano.config.floatX) num_all = num_l + num_u # forward the network and get cost values enc_sents, dec_sents, _ = self._forward_sents(sym_sents, dev_stage=False) enc_sents_u, dec_sents_u, _ = self._forward_sents(sym_sents_u, dev_stage=False) # classifier loss y_pred, loss_class, acc = self._forward_classifier([enc_sents, sym_mask], sym_label, dev_stage=False) y_pred_u, loss_entropy, _ = self._forward_classifier([enc_sents_u, sym_mask_u], None, dev_stage=False) # reconstruction and kl loss loss_rec, loss_kl, ppl = self.cost_label([sym_sents, enc_sents, dec_sents, sym_mask, sym_label], dev_stage=False) loss_rec_u, loss_kl_u, ppl_u = self.cost_unlabel_expectation([sym_sents_u, enc_sents_u, dec_sents_u, sym_mask_u, y_pred_u], dev_stage=False) # use baseline if self.use_baseline: baselines_u = self._get_baselines([sym_sents_u, enc_sents_u, sym_mask_u]) loss_rec_u -= baselines_u total_cost = T.sum(loss_rec) + T.sum(loss_rec_u) - T.sum(loss_entropy) total_cost += sym_klw * (T.sum(loss_kl) + T.sum(loss_kl_u)) total_cost += self.alpha * T.sum(loss_class) * num_all / num_l total_cost /= num_all all_params = self.get_params(tag='all') all_grads = theano.grad(total_cost, all_params) all_grads = total_norm_constraint(all_grads, max_norm=self.max_norm) updates = adam(all_grads, all_params, self.lr) train_input = [sym_sents, sym_mask, sym_label, sym_sents_u, sym_mask_u, sym_klw] train_output = [total_cost, T.mean(loss_rec), T.mean(loss_rec_u), T.mean(loss_kl), T.mean(loss_kl_u), T.mean(loss_class), T.mean(loss_entropy), ppl, ppl_u, acc, self.b] train_f = theano.function(inputs=train_input, outputs=train_output, updates=updates, name='train_expectation') return train_f
def build_model(self, train_x, train_mask_x, train_mask_out, train_target, test_x, test_mask_x, test_mask_out, test_target): self.train_x = train_x self.train_mask_x = train_mask_x self.train_mask_out = train_mask_out self.train_target = train_target self.test_x = test_x self.test_mask_x = test_mask_x self.test_mask_out = test_mask_out self.test_target = test_target self.index = T.iscalar('index') self.num_batch_test = T.iscalar('index') self.b_slice = slice(self.index * self.num_batch, (self.index + 1) * self.num_batch) sym_x = T.dtensor3() sym_mask_x = T.dmatrix() sym_target = T.dtensor3() sym_mask_out = T.dtensor3() # sym_mask_out = T.dtensor3() should not be useful since output is still zero # TODO think about this if it is true out = lasagne.layers.get_output(self.model, inputs={self.l_in: sym_x, self.mask_input: sym_mask_x}) out_out = self.get_output_y(out) loss = T.mean(lasagne.objectives.squared_error(out_out, sym_target)) / self.num_batch out_test = lasagne.layers.get_output(self.model, inputs={self.l_in: sym_x, self.mask_input: sym_mask_x}) out_out_test = self.get_output_y(out_test) loss_test = T.mean(lasagne.objectives.squared_error(out_out_test, sym_target)) / self.num_batch_test all_params = [self.W] + [self.b] +lasagne.layers.get_all_params(self.model) all_grads_target = [T.clip(g, -3, 3) for g in T.grad(loss, all_params)] all_grads_target = lasagne.updates.total_norm_constraint(all_grads_target, 3) updates_target = adam(all_grads_target, all_params) train_model = theano.function([self.index], [loss, out_out], givens={sym_x: self.train_x[self.b_slice], sym_mask_x: self.train_mask_x[self.b_slice], sym_target: self.train_target[self.b_slice], }, updates=updates_target) test_model = theano.function([self.num_batch_test], [loss_test, out_out_test], givens={sym_x: self.test_x, sym_mask_x: self.test_mask_x, sym_target: self.test_target, }) return train_model, test_model
def get_updates(nnet, train_obj, trainable_params, solver=None):

    implemented_solvers = ("sgd", "momentum", "nesterov", "adagrad", "rmsprop",
                           "adadelta", "adam", "adamax")

    if solver not in implemented_solvers:
        nnet.sgd_solver = "adam"
    else:
        nnet.sgd_solver = solver

    if nnet.sgd_solver == "sgd":
        updates = l_updates.sgd(train_obj, trainable_params,
                                learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "momentum":
        updates = l_updates.momentum(train_obj, trainable_params,
                                     learning_rate=Cfg.learning_rate,
                                     momentum=Cfg.momentum)
    elif nnet.sgd_solver == "nesterov":
        updates = l_updates.nesterov_momentum(train_obj, trainable_params,
                                              learning_rate=Cfg.learning_rate,
                                              momentum=Cfg.momentum)
    elif nnet.sgd_solver == "adagrad":
        updates = l_updates.adagrad(train_obj, trainable_params,
                                    learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "rmsprop":
        updates = l_updates.rmsprop(train_obj, trainable_params,
                                    learning_rate=Cfg.learning_rate,
                                    rho=Cfg.rho)
    elif nnet.sgd_solver == "adadelta":
        updates = l_updates.adadelta(train_obj, trainable_params,
                                     learning_rate=Cfg.learning_rate,
                                     rho=Cfg.rho)
    elif nnet.sgd_solver == "adam":
        updates = l_updates.adam(train_obj, trainable_params,
                                 learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "adamax":
        updates = l_updates.adamax(train_obj, trainable_params,
                                   learning_rate=Cfg.learning_rate)

    return updates
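# A brief usage sketch for the solver dispatch above. `nnet.network` and
# `train_obj` are placeholders assumed for illustration; `Cfg` is the
# module-level configuration object the snippet already references.
trainable_params = lasagne.layers.get_all_params(nnet.network, trainable=True)

updates = get_updates(nnet, train_obj, trainable_params, solver="adamax")

# An unrecognized (or omitted) solver name silently falls back to Adam:
updates = get_updates(nnet, train_obj, trainable_params, solver="lbfgs")
assert nnet.sgd_solver == "adam"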
def build_model(self, train_x, train_mask_x, train_mask_out, train_target, test_x, test_mask_x, test_mask_out, test_target): self.train_x = train_x self.train_mask_x = train_mask_x self.train_mask_out = train_mask_out self.train_target = train_target self.test_x = test_x self.test_mask_x = test_mask_x self.test_mask_out = test_mask_out self.test_target = test_target self.index = T.iscalar('index') self.num_batch_test = T.iscalar('index') self.b_slice = slice(self.index * self.num_batch, (self.index + 1) * self.num_batch) sym_x = T.dtensor3() sym_mask_x = T.dmatrix() sym_target = T.dtensor3() # sym_mask_out = T.dtensor3() should not be useful since output is still zero # TODO think about this if it is true output = lasagne.layers.get_output(self.model, inputs={self.l_in: sym_x, self.mask_input: sym_mask_x}) theta = self.get_output_y(output) log_px = self.get_log_x(sym_target, theta) log_px_sum_time = log_px.sum(axis=1, dtype=theano.config.floatX) # sum over tx loss = - T.sum(log_px_sum_time) / self.num_batch # average over batch ## log_px_test = self.get_log_x(sym_target, theta) log_px_sum_time_test = log_px_test.sum(axis=1, dtype=theano.config.floatX) # sum over time loss_test = - T.sum(log_px_sum_time_test) / self.num_batch_test # average over batch # loss = T.mean(lasagne.objectives.squared_error(mu, sym_target)) all_params = [self.W_y_theta] + [self.b_y_theta] + lasagne.layers.get_all_params(self.model) all_grads_target = [T.clip(g, -3, 3) for g in T.grad(loss, all_params)] all_grads_target = lasagne.updates.total_norm_constraint(all_grads_target, 3) updates_target = adam(all_grads_target, all_params) train_model = theano.function([self.index], [loss, theta, log_px], givens={sym_x: self.train_x[self.b_slice], sym_mask_x: self.train_mask_x[self.b_slice], sym_target: self.train_target[self.b_slice]}, updates=updates_target) test_model = theano.function([self.num_batch_test], [loss_test, theta], givens={sym_x: self.test_x, sym_mask_x: self.test_mask_x, sym_target: self.test_target}) return train_model, test_model
def build(layer_heads, params): """""" fns = {} # model methods x = T.tensor4('input') for target in params['targets']: fns[target['name']] = {} out_layer = layer_heads[target['name']] y = T.matrix('target') o = L.get_output(out_layer, inputs=x) o_vl = L.get_output(out_layer, inputs=x, deterministic=True) if 'class_weight' in params and params['class_weight']: loss_fn = partial(weighted_cce, weights=params['class_weight']) else: loss_fn = obj.categorical_crossentropy loss = loss_fn(o, y).mean() loss_vl = loss_fn(o_vl, y).mean() wd_l2 = reg.regularize_network_params(out_layer, reg.l2) wd_l2 *= params['beta'] acc_vl = obj.categorical_accuracy(o_vl, y).mean() updates_ = updates.adam(loss + wd_l2, L.get_all_params(out_layer, trainable=True), learning_rate=params['learning_rate'], epsilon=params['epsilon']) fns[target['name']]['train'] = theano.function( [x, y], updates=updates_, allow_input_downcast=True) fns[target['name']]['predict'] = theano.function( [x], o_vl, allow_input_downcast=True) fns[target['name']]['cost'] = theano.function( [x, y], loss_vl, allow_input_downcast=True) fns[target['name']]['acc'] = theano.function([x, y], acc_vl, allow_input_downcast=True) fns[target['name']]['transform'] = theano.function( [x], L.get_output(L.get_all_layers(layer_heads[target['name']])[-2], inputs=x, deterministic=True), allow_input_downcast=True) return fns, layer_heads
def __init__(self, architecture, dim, params): t1 = time.time() self.t_in = tensor.ftensor3('inputs') # =X float64 self.t_out = tensor.imatrix('targets') # =Y_true int32 self.input_shape = ( None, dim, params['segment_size'], ) self.output_shape = ( None, dim, params['segment_size'], ) self.architecture = architecture self.model = build_lasagne_model(architecture, dim, self.t_in, self.input_shape) self.params = params self.trained = False self.dim = dim test_pred = get_output(self.model, deterministic=False) test_loss = my_loss(self.model, test_pred, self.t_out, False, params) test_loss_with_reg = my_loss(self.model, test_pred, self.t_out, True, params) test_acc = tensor.mean(tensor.eq(tensor.argmax(test_pred, axis=1), self.t_out), dtype=config.floatX) self.eval_fn = function([self.t_in, self.t_out], [test_loss, test_loss_with_reg, test_acc], allow_input_downcast=True) self.evaluate = function([self.t_in], get_output(self.model, self.t_in), allow_input_downcast=True) pred = get_output(self.model) loss_with_reg = my_loss(self.model, pred, self.t_out, True, params) params = get_all_params(self.model, trainable=True) updates = adam(loss_with_reg, params=params, learning_rate=0.0001) self.train_fn = function([self.t_in, self.t_out], loss_with_reg, updates=updates, allow_input_downcast=True) print('Neural network initialized in {:.2f}s'.format(time.time() - t1))
def net_updates(net, loss, lr):

    # Get all trainable parameters (weights) of our net
    params = l.get_all_params(net, trainable=True)

    # We use the adam update, other options are available
    if cfg.OPTIMIZER == 'adam':
        param_updates = updates.adam(loss, params, learning_rate=lr, beta1=0.9)
    elif cfg.OPTIMIZER == 'nesterov':
        param_updates = updates.nesterov_momentum(loss, params, learning_rate=lr, momentum=0.9)
    elif cfg.OPTIMIZER == 'sgd':
        param_updates = updates.sgd(loss, params, learning_rate=lr)

    return param_updates
def build_model(vocab_size=200, embsize=25, hiddensize=50, ydim=2):
    X = T.matrix('X', dtype='int64')
    Mask = T.matrix('mask', dtype=config.floatX)
    Y = T.vector('Y', dtype='int64')

    nstep = X.shape[0]
    mini_batch_size = X.shape[1]

    emblayer = ProjectionLayer(X, vocab_size, embsize, (nstep, mini_batch_size))

    lstmlayer = LSTM(emblayer.output, Mask, embsize, hiddensize, name="lstm-encode")
    lstmlayer.build_lstm()

    proj = lstmlayer.output
    proj = (proj * Mask[:, :, None]).sum(axis=0)
    proj = proj / Mask.sum(axis=0)[:, None]

    softmax_layer = SoftmaxLayer(proj, hiddensize, ydim)

    cost = softmax_layer.negative_log_likelihood(Y)
    err = softmax_layer.errors(Y)

    params = emblayer.params + lstmlayer.params + softmax_layer.params
    updates = adam(cost, params)

    train_function = theano.function(inputs=[X, Mask, Y], outputs=[cost, err], updates=updates)
    valid_function = theano.function(inputs=[X, Mask, Y], outputs=[cost, err])
    predict_function = theano.function(inputs=[X, Mask], outputs=softmax_layer.y_pred)

    # see_func = theano.function(inputs=[X,Mask], outputs=softmax_layer.p_y_given_x)
    # hyhy = see_func(data_X,mask_X)
    # print hyhy.shape

    return X, Mask, Y, cost, err, train_function, valid_function, predict_function
def build_theano_fn_resnet(self):
    t0 = time.time()
    print '%s build theano fn resnet' % self.rank
    x = T.ftensor4('images')
    y = T.ivector('label')

    model = resnet50.build_model(x)
    prob = lasagne.layers.get_output(model['prob'], deterministic=True)
    self.params = lasagne.layers.get_all_params(model['prob'], trainable=True)

    cost = -T.log(prob[T.arange(prob.shape[0]), y] + 1e-6).mean()

    grads = T.grad(cost, self.params)
    grads_all_reduced = self.grad_all_reduce(grads)
    updates = adam(grads_all_reduced, self.params)

    self.train_fn = theano.function([x, y], [cost, y],
                                    updates=updates, accept_inplace=True)
    print '%s finished build theano fn, used %.3f' % (self.rank, time.time() - t0)
def __init__(self, learning_rate, hdim, ldim, input_tensor, n_in, n_out): # Set variables self.learning_rate = learning_rate self.hdim = hdim self.ldim = ldim self.input_tensor = input_tensor self.n_in = n_in self.n_out = n_out # Build the network self.srng = RandomStreams(seed=234) # Encoder part (phi) self.encoder = utils.EncoderMLP(self.srng, input_tensor, n_in, hdim, ldim) # Decoder part (theta) self.decoder = utils.DecoderMLP(self.encoder.output, ldim, hdim, n_out) # Prediction self.predict = self.decoder.bern # Cost function self.kl_div = T.mean( utils.kl_unit_normal(self.encoder.mu, self.encoder.sigma2)) self.xent = T.mean( T.sum(T.nnet.binary_crossentropy(self.decoder.bern, self.input_tensor), axis=1)) self.cost = self.kl_div + self.xent # parameters self.params = self.encoder.params + self.decoder.params self.updates = adam(self.cost, self.params, self.learning_rate) # functions self.predict = theano.function(inputs=[input_tensor], outputs=self.predict) self.cost_fun = theano.function( inputs=[input_tensor], outputs=[self.cost, self.kl_div, self.xent]) self.train = theano.function( inputs=[input_tensor], outputs=[self.cost, self.kl_div, self.xent], updates=self.updates) self.normal_vars = theano.function( inputs=[input_tensor], outputs=[self.encoder.mu, self.encoder.sigma2])
def prepare_functions(model, X, index, y, X_test, X_train, y_train, batch_size, l_rate): n_data_const = T.constant( X_train.shape[0].eval(), name='n_data', dtype=floatX ) mean_log_likelihood = model.mean_log_likelihood(y) # scaled_kl_W = model.kl_W / n_data_const # scaled_kl_b = model.kl_b / n_data_const # scaled_kl = scaled_kl_W + scaled_kl_b effect_scaled_kl_W = model.effect_kl_W / n_data_const effect_scaled_kl_b = model.effect_kl_b / n_data_const effect_scaled_kl = effect_scaled_kl_W + effect_scaled_kl_b # cost = -(mean_log_likelihood - scaled_kl) cost = -(mean_log_likelihood - effect_scaled_kl) params = model.params updates = adam(cost, params, learning_rate=l_rate) print('... compiling functions') # monitor cost and the individual components of it # outputs = [cost, mean_log_likelihood, scaled_kl_W, scaled_kl_b] outputs = ( [cost, mean_log_likelihood, effect_scaled_kl_W, effect_scaled_kl_b] ) train = theano.function( inputs=[index], outputs=outputs, updates=updates, givens={ X: X_train[index * batch_size:(index + 1) * batch_size], y: y_train[index * batch_size:(index + 1) * batch_size] } # mode=NanGuardMode(nan_is_error=True, inf_is_error=True) ) test_predict = theano.function( [index], model.p_y_given_x, givens={X: X_test[index * batch_size:(index + 1) * batch_size]} # mode=NanGuardMode(nan_is_error=True, inf_is_error=True) ) return train, test_predict
def loss_iter(segmenter, update_params={}): X = T.tensor4() y = T.tensor4() pixel_weights = T.tensor3() all_layers = ll.get_all_layers(segmenter) imwrite_architecture(all_layers, './layer_rep.png') predicted_mask_train = ll.get_output(segmenter, X) predicted_mask_valid = ll.get_output(segmenter, X, deterministic=True) accuracy = lambda pred: T.mean(T.eq(T.argmax(pred, axis=1), T.argmax(y, axis=1))) pixel_weights_1d = pixel_weights.flatten(ndim=1) losses = lambda pred: T.mean(crossentropy_flat(pred + 1e-7, y + 1e-7) * pixel_weights_1d) decay = 0.0001 reg = regularize_network_params(segmenter, l2) * decay losses_reg = lambda pred: losses(pred) + reg loss_train = losses_reg(predicted_mask_train) loss_train.name = 'combined_loss' # for the names all_params = ll.get_all_params(segmenter) grads = T.grad(loss_train, all_params, add_names=True) #updates = adam(grads, all_params, **update_params) updates = adam(grads, all_params, **update_params) acc_train = accuracy(predicted_mask_train) acc_valid = accuracy(predicted_mask_valid) print("Compiling network for training") tic = time.time() train_iter = theano.function([X, y, pixel_weights], [loss_train, losses(predicted_mask_train), acc_train] + grads, updates=updates) toc = time.time() - tic print("Took %0.2f seconds" % toc) #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True) print("Compiling network for validation") tic = time.time() valid_iter = theano.function([X, y, pixel_weights], [losses(predicted_mask_valid), acc_valid]) toc = time.time() - tic print("Took %0.2f seconds" % toc) return {'train':train_iter, 'valid':valid_iter, 'gradnames':[g.name for g in grads]}
def prep_train(alpha=0.0002, beta=0.5, nz=200):
    E, D = build_net(nz=nz)

    x = T.tensor5('x')  # x -> symbolic variable, input to the computational graph

    # Get outputs z=E(x), x_hat=D(z)
    encoding = get_output(E, x)
    decoding = get_output(D, encoding)

    # Get parameters of E and D
    params_e = get_all_params(E, trainable=True)
    params_d = get_all_params(D, trainable=True)
    params = params_e + params_d

    # Calculate cost and updates
    cost = T.mean(squared_error(x, decoding))
    grad = T.grad(cost, params)
    updates = adam(grad, params, learning_rate=alpha, beta1=beta)

    train = theano.function(inputs=[x], outputs=cost, updates=updates)
    rec = theano.function(inputs=[x], outputs=decoding)
    test = theano.function(inputs=[x], outputs=cost)
    # theano.function returns an actual python function used to evaluate our real data

    return train, test, rec, E, D
def train_network(self, n_epochs=10000, learning_rate=0.001):
    loss = categorical_crossentropy(self.output, self.Y)
    loss = loss.mean()
    params = get_all_params(self.network_probs, trainable=True)
    updates = adam(loss, params, learning_rate=learning_rate)
    train = theano.function(inputs=[self.X, self.Y], outputs=loss,
                            updates=updates, allow_input_downcast=True)

    trX, trY = self.get_data()
    for epoch in range(n_epochs):
        train_loss = train(trX, trY)
        if epoch % 50 == 0:
            print 'epoch: %d, loss: %f' % (epoch, train_loss)

    np.savez(pkg_path + '/models/model.npz', *get_all_param_values(self.network_probs))
def get_f_train(self):
    network_params = self.get_params()
    for param in network_params:
        print param.get_value().shape, param.name

    x = T.imatrix()
    m = T.matrix()
    y = T.matrix()

    pred = layers.get_output(self.l_y, {
        self.l_x: x,
        self.l_m: m,
    }, deterministic=False)

    cost = objectives.categorical_crossentropy(pred, y).mean()
    acc = T.eq(T.argmax(pred, axis=1), T.argmax(y, axis=1)).mean()

    grads = theano.grad(cost, network_params)
    grads = updates.total_norm_constraint(grads, max_norm=20.0)
    grads = [T.clip(g, -10.0, 10.0) for g in grads]

    params_update = updates.adam(grads, network_params, self.lr)

    f_train = theano.function([x, m, y], [cost, acc], updates=params_update)
    return f_train
output_m_a = lasagne.layers.get_output(l_out_m_a, inputs={l_in_a: x_sym})
output_f_a = lasagne.layers.get_output(l_out_f_a, inputs={l_in_a: x_sym})

loss_all_target_m_a = lasagne.objectives.squared_error(output_m_a, t_sym)
loss_mean_target_m_a = T.mean(loss_all_target_m_a)
loss_all_target_f_a = lasagne.objectives.squared_error(output_f_a, t_sym)
loss_mean_target_f_a = T.mean(loss_all_target_f_a)

all_params_target_m_a = lasagne.layers.get_all_params([l_out_m_a])
all_grads_target_m_a = [T.clip(g, -10, 10) for g in T.grad(loss_mean_target_m_a, all_params_target_m_a)]
all_grads_target_m_a = lasagne.updates.total_norm_constraint(all_grads_target_m_a, 10)
updates_target_m_a = adam(all_grads_target_m_a, all_params_target_m_a)

all_params_target_f_a = lasagne.layers.get_all_params([l_out_f_a])
all_grads_target_f_a = [T.clip(g, -10, 10) for g in T.grad(loss_mean_target_f_a, all_params_target_f_a)]
all_grads_target_f_a = lasagne.updates.total_norm_constraint(all_grads_target_f_a, 10)
updates_target_f_a = adam(all_grads_target_f_a, all_params_target_f_a)

train_model_m_a = theano.function([x_sym, t_sym], [loss_mean_target_m_a, output_m_a], updates=updates_target_m_a)
test_model_m_a = theano.function([x_sym, t_sym], [loss_mean_target_m_a, output_m_a])
train_model_f_a = theano.function([x_sym, t_sym],
# Compute gradient in case of gradient clipping
if run_parameters.clip_gradient[0] is not None:
    grad = T.grad(loss, params)
    if run_parameters.clip_gradient[0] == 0:  # softclip
        grad = [updates.norm_constraint(g, run_parameters.clip_gradient[1], range(g.ndim)) for g in grad]
    elif run_parameters.clip_gradient[0] == 1:
        grad = [T.clip(g, run_parameters.clip_gradient[0], run_parameters.clip_gradient[1]) for g in grad]
    loss = grad

# Update function to train
# sgd_lr = run_parameters.update_lr
sgd_lr = theano.shared(utils.floatX(run_parameters.update_lr))
sgd_lr_decay = utils.floatX(1.0)
sgd_lr_decay_threshold = utils.floatX(1.0)
updates_function = updates.adam(loss, params, run_parameters.update_lr)

# Compile train function
train_fn = theano.function([input_var, target_var, labeled_var], loss, updates=updates_function,
                           allow_input_downcast=True, on_unused_input='ignore')

# Compile test prediction function
classification = T.argmax(test_prediction, axis=1)
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), T.argmax(target_var, axis=1)),
                  dtype=theano.config.floatX)
test_wrong = T.neq(T.argmax(test_prediction, axis=1), T.argmax(target_var, axis=1))

# Compile a second function computing the validation loss and accuracy:
# val_fn = theano.function([input_var, target_var, labeled_var], [loss2*lr[1], test_acc], allow_input_downcast=True)
val_fn = theano.function([input_var, target_var, labeled_var],
                         [test_loss, losses_ratio[0] * test_loss1.mean(), losses_ratio[1] * test_loss2.mean(),
# the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
# rllab.distributions.DiagonalGaussian
dist = policy.distribution

# Note that we negate the objective, since most optimizers assume a
# minimization problem
surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

# Get the list of trainable parameters.
params = policy.get_params(trainable=True)
grads = theano.grad(surr, params)

f_train = theano.function(
    inputs=[observations_var, actions_var, advantages_var],
    outputs=None,
    updates=adam(grads, params, learning_rate=learning_rate),
    allow_input_downcast=True
)

for _ in range(n_itr):
    paths = []

    for _ in range(N):
        observations = []
        actions = []
        rewards = []

        observation = env.reset()

        for _ in range(T):
# Initial values of the variables that are transmitted through the recursion h_ini, k_ini, w_ini = model.create_shared_init_states(batch_size) loss, updates_ini, monitoring = model.apply(seq_pt, seq_pt_mask, seq_tg, seq_str, seq_str_mask, h_ini, k_ini, w_ini) ######################## # GRADIENT AND UPDATES # ######################## params = model.params grads = T.grad(loss, params) grads = clip_norm_gradients(grads) if algo == 'adam': updates_params = adam(grads, params, 0.0003) elif algo == 'sgd': updates_params = [] for p, g in zip(params, grads): updates_params.append((p, p - learning_rate * g)) else: raise ValueError('Specified algo does not exist') updates_all = updates_ini + updates_params ##################### # SAMPLING FUNCTION # ##################### pt_ini, h_ini_pred, k_ini_pred, w_ini_pred, bias = \ model.create_sym_init_states()
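One detail worth keeping in mind in this block: lasagne's adam returns an OrderedDict of updates, while the hand-rolled SGD branch builds a plain list of (param, new_value) pairs, so concatenating either with updates_ini only works if both sides have the same form. A hedged sketch, assuming updates_ini is a list of update pairs as suggested above:

from lasagne.updates import adam

def build_param_updates(grads, params, algo, learning_rate, updates_ini):
    if algo == 'adam':
        # normalise the OrderedDict to a list of (shared_variable, new_value) pairs
        updates_params = list(adam(grads, params, learning_rate=3e-4).items())
    elif algo == 'sgd':
        updates_params = [(p, p - learning_rate * g) for p, g in zip(params, grads)]
    else:
        raise ValueError('Specified algo does not exist')
    # theano.function accepts updates as a list of pairs, so the two lists concatenate cleanly
    return updates_ini + updates_params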
def Train(options,init_params,build_model,DataHandler): load=options['load']; loadHis=options['loadHis']; saveto=options['saveto']; loadfrom=options['loadfrom']; dataset=options['dataset']; last_n=options['last_n']; fsize=options['videosize']; print ">>>init params & build graph"; tparams=init_params(options); cost,preds,inner_state,inps,use_noise=build_model(options,tparams); print "build done" print ">>>compile cost&updates function"; start=time.time(); f=theano.function(inps,[cost,preds],allow_input_downcast=True,on_unused_input='ignore'); print "cost function ready" if options['finetune']: updates=momentum(cost, itemlist(tparams), options['lrate'], momentum=options['momentum']); else: updates=adam(cost, itemlist(tparams), learning_rate=options['lrate'], beta1=0.9, beta2=0.999, epsilon=1e-08); print len(itemlist(tparams)) print "updates ready",len(updates) f_update=theano.function(inps,[cost,preds],updates=updates,allow_input_downcast=True,on_unused_input='ignore'); print "update function ready" print "compile finish, use %.1fmin"%((time.time()-start)/60); print '>>>Optimization' # ready dataset dh_train = DataHandler(options['dataset'],datatype=0,fps=options['fps']); dh_train.SetMode('source'); dh_valid = DataHandler(options['dataset'],datatype=1,fps=options['fps']); dh_valid.SetMode('source'); train_log=np.empty((0,4),dtype='float32'); min_valid_cost=1e8; max_valid_acc=0; if loadHis and os.path.exists(loadfrom): print "load log history from",loadfrom train_log = np.load(loadfrom)['train_log']; min_valid_cost=train_log[:,2].min(); max_valid_acc=train_log[:,3].max(); train_num=dh_train.batch_num; # should be set to dh_train.batch_num for epochidx in xrange(options['max_epochs']): use_noise.set_value(1.0); dh_train.Reset(); print 'Epoch ', epochidx start=time.time(); for vidx in xrange(train_num): x,mask,y=dh_train.GetSingleVideoFromSource(size=fsize,scale=1); x=x.reshape([x.shape[0],x.shape[1],fsize,fsize,3]); x=x.transpose([0,1,4,2,3]); x=x.reshape([x.shape[0],x.shape[1],-1]); cost,preds=f_update(x,mask,y); acc=((y.mean(0)).argmax(1)==preds).mean(); print cost,acc; # print tparams['recog/cnn_conv2_w'].get_value().sum(),tparams['recog/cnn_conv3_w'].get_value().sum(),tparams['recog/cnn_conv4_w'].get_value().sum(),tparams['recog/cnn_conv5_w'].get_value().sum(),(tparams['recog/cnn_conv5_w'].get_value()**2).sum() if ((vidx+1)%100==0): print "%d/%d, use %.1fmin"%(vidx+1,dh_train.batch_num,(time.time()-start)/60.0); start=time.time(); use_noise.set_value(0.0); #compute train error dh_train.Reset(); print ">>train cost"; tcost,tacc=Predict(options,f,dh_train,verbose=True,train_num=200); print "cost: %.3f, acc: %.3f"%(tcost,tacc); #compute valid error dh_valid.Reset(); print ">>valid cost"; vcost,vacc=Predict(options,f,dh_valid,verbose=True); print "cost: %.3f, acc: %.3f"%(vcost,vacc); print ">>save point:",options['saveto']; train_log=np.append(train_log,np.array([tcost,tacc,vcost,vacc])[None,...],axis=0); # train_log.append([tcost,tacc,vcost,vacc]); params = unzip(tparams); np.savez(saveto, train_log=train_log, options=options, **params); if (vcost<min_valid_cost): min_valid_cost=vcost; max_valid_acc=max(max_valid_acc,vacc); print ">>save best:",options['bestsaveto']; np.savez(options['bestsaveto'], train_log=train_log, options=options, **params); elif (vacc>max_valid_acc): max_valid_acc=vacc; min_valid_cost=min(min_valid_cost,vcost); print ">>save best:",options['bestsaveto']; np.savez(options['bestsaveto'], train_log=train_log, options=options, **params);
def get_model(): dtensor4 = T.TensorType("float32", (False,) * 4) input_var = dtensor4("inputs") dtensor2 = T.TensorType("float32", (False,) * 2) target_var = dtensor2("targets") # input layer with unspecified batch size layer_input = InputLayer( shape=(None, 30, 64, 64), input_var=input_var ) # InputLayer(shape=(None, 1, 30, 64, 64), input_var=input_var) layer_0 = DimshuffleLayer(layer_input, (0, "x", 1, 2, 3)) # Z-score? # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer layer_1 = batch_norm( Conv3DDNNLayer( incoming=layer_0, num_filters=64, filter_size=(3, 3, 3), stride=(1, 3, 3), pad="same", nonlinearity=leaky_rectify, W=Orthogonal(), ) ) layer_2 = MaxPool3DDNNLayer(layer_1, pool_size=(1, 2, 2), stride=(1, 2, 2), pad=(0, 1, 1)) layer_3 = DropoutLayer(layer_2, p=0.25) # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer layer_4 = batch_norm( Conv3DDNNLayer( incoming=layer_3, num_filters=128, filter_size=(3, 3, 3), stride=(1, 3, 3), pad="same", nonlinearity=leaky_rectify, W=Orthogonal(), ) ) layer_5 = MaxPool3DDNNLayer(layer_4, pool_size=(1, 2, 2), stride=(1, 2, 2), pad=(0, 1, 1)) layer_6 = DropoutLayer(layer_5, p=0.25) # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer layer_7 = batch_norm( Conv3DDNNLayer( incoming=layer_6, num_filters=256, filter_size=(3, 3, 3), stride=(1, 3, 3), pad="same", nonlinearity=leaky_rectify, W=Orthogonal(), ) ) layer_8 = MaxPool3DDNNLayer(layer_7, pool_size=(1, 2, 2), stride=(1, 2, 2), pad=(0, 1, 1)) layer_9 = DropoutLayer(layer_8, p=0.25) # Recurrent layer layer_10 = DimshuffleLayer(layer_9, (0, 2, 1, 3, 4)) layer_11 = LSTMLayer(layer_10, num_units=612, hid_init=Orthogonal(), only_return_final=False) # Output Layer layer_systole = DenseLayer(layer_11, 600, nonlinearity=leaky_rectify, W=Orthogonal()) layer_diastole = DenseLayer(layer_11, 600, nonlinearity=leaky_rectify, W=Orthogonal()) layer_systole_1 = DropoutLayer(layer_systole, p=0.3) layer_diastole_1 = DropoutLayer(layer_diastole, p=0.3) layer_systole_2 = DenseLayer(layer_systole_1, 1, nonlinearity=None, W=Orthogonal()) layer_diastole_2 = DenseLayer(layer_diastole_1, 1, nonlinearity=None, W=Orthogonal()) layer_output = ConcatLayer([layer_systole_2, layer_diastole_2]) # Loss prediction = get_output(layer_output) loss = squared_error(prediction, target_var) loss = loss.mean() # Updates : Stochastic Gradient Descent (SGD) with Nesterov momentum Or Adam params = get_all_params(layer_output, trainable=True) updates = adam(loss, params) # updates_0 = rmsprop(loss, params) # updates = apply_nesterov_momentum(updates_0, params) # Create a loss expression for validation/testing. The crucial difference # here is that we do a deterministic forward pass through the network, disabling dropout layers. 
test_prediction = get_output(layer_output, deterministic=True) test_loss = squared_error(test_prediction, target_var) test_loss = test_loss.mean() # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function([input_var, target_var], loss, updates=updates, allow_input_downcast=True) # Compile a second function computing the validation loss val_fn = theano.function([input_var, target_var], test_loss, allow_input_downcast=True) # Compile a third function computing the predictions predict_fn = theano.function([input_var], test_prediction, allow_input_downcast=True) return [layer_output, train_fn, val_fn, predict_fn]
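For context, a hedged sketch of how the four objects returned by get_model might be consumed; X_batch and y_batch are placeholder arrays, not part of the original code:

import numpy as np

layer_output, train_fn, val_fn, predict_fn = get_model()

X_batch = np.zeros((4, 30, 64, 64), dtype='float32')  # (batch, frames, height, width)
y_batch = np.zeros((4, 2), dtype='float32')           # (batch, [systole, diastole])

train_loss = train_fn(X_batch, y_batch)   # one adam step with dropout active
val_loss = val_fn(X_batch, y_batch)       # deterministic pass, dropout disabled
volumes = predict_fn(X_batch)             # (batch, 2) volume predictions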
def doit(mode): from rllab.envs.box2d.cartpole_env import CartpoleEnv from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from rllab.baselines.zero_baseline import ZeroBaseline from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy from rllab.envs.normalized_env import normalize import numpy as np import theano import theano.tensor as TT from lasagne.updates import adam # normalize() makes sure that the actions for the environment lies # within the range [-1, 1] (only works for environments with continuous actions) env = normalize(CartpoleEnv()) # Initialize a neural network policy with a single hidden layer of 8 hidden units policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,)) # Initialize a linear baseline estimator using default hand-crafted features if "linbaseline" in mode: print('linear baseline') baseline = LinearFeatureBaseline(env.spec) elif "vanilla" in mode: print("zero baseline") baseline = ZeroBaseline(env.spec) elif mode == "batchavg": print('batch average baseline') # use a zero baseline but subtract the mean of the discounted returns (see below) baseline = ZeroBaseline(env.spec) if "_ztrans" in mode: print('z transform advantages') else: print('no z transform') # We will collect 100 trajectories per iteration N = 50 # Each trajectory will have at most 100 time steps T = 50 # Number of iterations n_itr = 50 # Set the discount factor for the problem discount = 0.99 # Learning rate for the gradient update learning_rate = 0.1 # Construct the computation graph # Create a Theano variable for storing the observations # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However, # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data # type for the variable. For instance, for an environment with discrete observations, we might want to use integer # types if the observations are represented as one-hot vectors. observations_var = env.observation_space.new_tensor_variable( 'observations', # It should have 1 extra dimension since we want to represent a list of observations extra_dims=1 ) actions_var = env.action_space.new_tensor_variable( 'actions', extra_dims=1 ) advantages_var = TT.vector('advantages') # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation. dist_info_vars = policy.dist_info_sym(observations_var) # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class # rllab.distributions.DiagonalGaussian dist = policy.distribution # Note that we negate the objective, since most optimizers assume a # minimization problem surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var) # Get the list of trainable parameters. 
params = policy.get_params(trainable=True) grads = theano.grad(surr, params) f_train = theano.function( inputs=[observations_var, actions_var, advantages_var], outputs=None, updates=adam(grads, params, learning_rate=learning_rate), allow_input_downcast=True ) results = [] for _ in range(n_itr): paths = [] for _ in range(N): observations = [] actions = [] rewards = [] observation = env.reset() for _ in range(T): # policy.get_action() returns a pair of values. The second one returns a dictionary, whose values contains # sufficient statistics for the action distribution. It should at least contain entries that would be # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym(). # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is # not needed. action, _ = policy.get_action(observation) # Recall that the last entry of the tuple stores diagnostic information about the environment. In our # case it is not needed. next_observation, reward, terminal, _ = env.step(action) observations.append(observation) actions.append(action) rewards.append(reward) observation = next_observation if terminal: # Finish rollout if terminal state reached break # We need to compute the empirical return for each time step along the # trajectory path = dict( observations=np.array(observations), actions=np.array(actions), rewards=np.array(rewards), ) path_baseline = baseline.predict(path) advantages = [] returns = [] return_so_far = 0 for t in range(len(rewards) - 1, -1, -1): return_so_far = rewards[t] + discount * return_so_far returns.append(return_so_far) advantage = return_so_far - path_baseline[t] advantages.append(advantage) # The advantages are stored backwards in time, so we need to revert it advantages = np.array(advantages[::-1]) # And we need to do the same thing for the list of returns returns = np.array(returns[::-1]) if "_ztrans" in mode: advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8) path["advantages"] = advantages path["returns"] = returns paths.append(path) baseline.fit(paths) observations = np.concatenate([p["observations"] for p in paths]) actions = np.concatenate([p["actions"] for p in paths]) advantages = np.concatenate([p["advantages"] for p in paths]) if mode == 'batchavg': # in this case `advantages` up to here are just our good old returns, without baseline or z transformation. # now we subtract their mean across all episodes. advantages = advantages - np.mean(advantages) f_train(observations, actions, advantages) avgr = np.mean([sum(p["rewards"]) for p in paths]) print(('Average Return:',avgr)) results.append(avgr) return results
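The inner rollout loop accumulates discounted returns backwards in time and subtracts the baseline to obtain advantages; as a standalone numpy sketch of that per-path computation (function name hypothetical):

import numpy as np

def discounted_returns_and_advantages(rewards, baseline_values, discount):
    returns = np.zeros(len(rewards))
    return_so_far = 0.0
    for t in reversed(range(len(rewards))):
        # R_t = r_t + discount * R_{t+1}
        return_so_far = rewards[t] + discount * return_so_far
        returns[t] = return_so_far
    advantages = returns - np.asarray(baseline_values)
    return returns, advantages

# e.g. rewards [1, 1, 1] with a zero baseline and discount 0.99
# give returns [2.9701, 1.99, 1.0] and identical advantages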
def main(): configure_theano() options = parse_options() config_file = options['config'] config = ConfigParser.ConfigParser() config.read(config_file) print('CLI options: {}'.format(options.items())) print('Reading Config File: {}...'.format(config_file)) print(config.items('stream1')) print(config.items('stream2')) print(config.items('lstm_classifier')) print(config.items('training')) print('preprocessing dataset...') # stream 1 s1_data = load_mat_file(config.get('stream1', 'data')) s1_imagesize = tuple([int(d) for d in config.get('stream1', 'imagesize').split(',')]) s1 = config.get('stream1', 'model') s1_inputdim = config.getint('stream1', 'input_dimensions') s1_shape = config.get('stream1', 'shape') s1_nonlinearities = config.get('stream1', 'nonlinearities') # stream 2 s2_data = load_mat_file(config.get('stream2', 'data')) s2_imagesize = tuple([int(d) for d in config.get('stream2', 'imagesize').split(',')]) s2 = config.get('stream2', 'model') s2_inputdim = config.getint('stream2', 'input_dimensions') s2_shape = config.get('stream2', 'shape') s2_nonlinearities = config.get('stream2', 'nonlinearities') # lstm classifier fusiontype = config.get('lstm_classifier', 'fusiontype') weight_init = options['weight_init'] if 'weight_init' in options else config.get('lstm_classifier', 'weight_init') use_peepholes = options['use_peepholes'] if 'use_peepholes' in options else config.getboolean('lstm_classifier', 'use_peepholes') output_classes = config.getint('lstm_classifier', 'output_classes') output_classnames = config.get('lstm_classifier', 'output_classnames').split(',') lstm_size = config.getint('lstm_classifier', 'lstm_size') matlab_target_offset = config.getboolean('lstm_classifier', 'matlab_target_offset') # capture training parameters validation_window = int(options['validation_window']) \ if 'validation_window' in options else config.getint('training', 'validation_window') num_epoch = int(options['num_epoch']) if 'num_epoch' in options else config.getint('training', 'num_epoch') learning_rate = options['learning_rate'] if 'learning_rate' in options \ else config.getfloat('training', 'learning_rate') epochsize = config.getint('training', 'epochsize') batchsize = config.getint('training', 'batchsize') weight_init_fn = las.init.GlorotUniform() if weight_init == 'glorot': weight_init_fn = las.init.GlorotUniform() if weight_init == 'norm': weight_init_fn = las.init.Normal(0.1) if weight_init == 'uniform': weight_init_fn = las.init.Uniform() if weight_init == 'ortho': weight_init_fn = las.init.Orthogonal() train_subject_ids = read_data_split_file(config.get('training', 'train_subjects_file')) val_subject_ids = read_data_split_file(config.get('training', 'val_subjects_file')) test_subject_ids = read_data_split_file(config.get('training', 'test_subjects_file')) s1_data_matrix = s1_data['dataMatrix'].astype('float32') s2_data_matrix = s2_data['dataMatrix'].astype('float32') targets_vec = s1_data['targetsVec'].reshape((-1,)) subjects_vec = s1_data['subjectsVec'].reshape((-1,)) vidlen_vec = s1_data['videoLengthVec'].reshape((-1,)) if matlab_target_offset: targets_vec -= 1 s1_data_matrix = presplit_dataprocessing(s1_data_matrix, vidlen_vec, config, 'stream1', imagesize=s1_imagesize) s2_data_matrix = presplit_dataprocessing(s2_data_matrix, vidlen_vec, config, 'stream2', imagesize=s2_imagesize) s1_train_X, s1_train_y, s1_train_vidlens, s1_train_subjects, \ s1_val_X, s1_val_y, s1_val_vidlens, s1_val_subjects, \ s1_test_X, s1_test_y, s1_test_vidlens, s1_test_subjects = split_seq_data(s1_data_matrix, 
targets_vec, subjects_vec, vidlen_vec, train_subject_ids, val_subject_ids, test_subject_ids) s2_train_X, s2_train_y, s2_train_vidlens, s2_train_subjects, \ s2_val_X, s2_val_y, s2_val_vidlens, s2_val_subjects, \ s2_test_X, s2_test_y, s2_test_vidlens, s2_test_subjects = split_seq_data(s2_data_matrix, targets_vec, subjects_vec, vidlen_vec, train_subject_ids, val_subject_ids, test_subject_ids) s1_train_X, s1_val_X, s1_test_X = postsplit_datapreprocessing(s1_train_X, s1_val_X, s1_test_X, config, 'stream1') s2_train_X, s2_val_X, s2_test_X = postsplit_datapreprocessing(s2_train_X, s2_val_X, s2_test_X, config, 'stream2') ae1 = load_decoder(s1, s1_shape, s1_nonlinearities) ae2 = load_decoder(s2, s2_shape, s2_nonlinearities) # IMPT: the encoder was trained with fortan ordered images, so to visualize # convert all the images to C order using reshape_images_order() # output = dbn.predict(test_X) # test_X = reshape_images_order(test_X, (26, 44)) # output = reshape_images_order(output, (26, 44)) # visualize_reconstruction(test_X[:36, :], output[:36, :], shape=(26, 44)) inputs1 = T.tensor3('inputs1', dtype='float32') inputs2 = T.tensor3('inputs2', dtype='float32') mask = T.matrix('mask', dtype='uint8') targets = T.imatrix('targets') print('constructing end to end model...') network, l_fuse = adenet_v2_nodelta.create_model(ae1, ae2, (None, None, s1_inputdim), inputs1, (None, None), mask, (None, None, s2_inputdim), inputs2, lstm_size, output_classes, fusiontype, w_init_fn=weight_init_fn, use_peepholes=use_peepholes) print_network(network) # draw_to_file(las.layers.get_all_layers(network), 'network.png') print('compiling model...') predictions = las.layers.get_output(network, deterministic=False) all_params = las.layers.get_all_params(network, trainable=True) cost = temporal_softmax_loss(predictions, targets, mask) updates = adam(cost, all_params, learning_rate=learning_rate) train = theano.function( [inputs1, targets, mask, inputs2], cost, updates=updates, allow_input_downcast=True) compute_train_cost = theano.function([inputs1, targets, mask, inputs2], cost, allow_input_downcast=True) test_predictions = las.layers.get_output(network, deterministic=True) test_cost = temporal_softmax_loss(test_predictions, targets, mask) compute_test_cost = theano.function( [inputs1, targets, mask, inputs2], test_cost, allow_input_downcast=True) val_fn = theano.function([inputs1, mask, inputs2], test_predictions, allow_input_downcast=True) # We'll train the network with 10 epochs of 30 minibatches each print('begin training...') cost_train = [] cost_val = [] class_rate = [] STRIP_SIZE = 3 val_window = circular_list(validation_window) train_strip = np.zeros((STRIP_SIZE,)) best_val = float('inf') best_cr = 0.0 datagen = gen_lstm_batch_random(s1_train_X, s1_train_y, s1_train_vidlens, batchsize=batchsize) integral_lens = compute_integral_len(s1_train_vidlens) val_datagen = gen_lstm_batch_random(s1_val_X, s1_val_y, s1_val_vidlens, batchsize=len(s1_val_vidlens)) test_datagen = gen_lstm_batch_random(s1_test_X, s1_test_y, s1_test_vidlens, batchsize=len(s1_test_vidlens)) # We'll use this "validation set" to periodically check progress X_val, y_val, mask_val, idxs_val = next(val_datagen) integral_lens_val = compute_integral_len(s1_val_vidlens) X_diff_val = gen_seq_batch_from_idx(s2_val_X, idxs_val, s1_val_vidlens, integral_lens_val, np.max(s1_val_vidlens)) # we use the test set to check final classification rate X_test, y_test, mask_test, idxs_test = next(test_datagen) integral_lens_test = compute_integral_len(s1_test_vidlens) 
X_diff_test = gen_seq_batch_from_idx(s2_test_X, idxs_test, s1_test_vidlens, integral_lens_test, np.max(s1_test_vidlens)) # reshape the targets for validation y_val_evaluate = y_val y_val = y_val.reshape((-1, 1)).repeat(mask_val.shape[-1], axis=-1) for epoch in range(num_epoch): time_start = time.time() for i in range(epochsize): X, y, m, batch_idxs = next(datagen) # repeat targets based on max sequence len y = y.reshape((-1, 1)) y = y.repeat(m.shape[-1], axis=-1) X_diff = gen_seq_batch_from_idx(s2_train_X, batch_idxs, s1_train_vidlens, integral_lens, np.max(s1_train_vidlens)) print_str = 'Epoch {} batch {}/{}: {} examples using adam'.format( epoch + 1, i + 1, epochsize, len(X)) print(print_str, end='') sys.stdout.flush() train(X, y, m, X_diff) print('\r', end='') cost = compute_train_cost(X, y, m, X_diff) val_cost = compute_test_cost(X_val, y_val, mask_val, X_diff_val) cost_train.append(cost) cost_val.append(val_cost) train_strip[epoch % STRIP_SIZE] = cost val_window.push(val_cost) gl = 100 * (cost_val[-1] / np.min(cost_val) - 1) pk = 1000 * (np.sum(train_strip) / (STRIP_SIZE * np.min(train_strip)) - 1) pq = gl / pk cr, val_conf = evaluate_model2(X_val, y_val_evaluate, mask_val, X_diff_val, val_fn) class_rate.append(cr) if val_cost < best_val: best_val = val_cost best_cr = cr test_cr, test_conf = evaluate_model2(X_test, y_test, mask_test, X_diff_test, val_fn) print("Epoch {} train cost = {}, val cost = {}, " "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f}, Test CR= {:.3f} ({:.1f}sec)" .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, test_cr, time.time() - time_start)) best_params = las.layers.get_all_param_values(network) else: print("Epoch {} train cost = {}, val cost = {}, " "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f} ({:.1f}sec)" .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, time.time() - time_start)) if epoch >= validation_window and early_stop2(val_window, best_val, validation_window): break print('Final Model') print('CR: {}, val loss: {}, Test CR: {}'.format(best_cr, best_val, test_cr)) # plot confusion matrix table_str = plot_confusion_matrix(test_conf, output_classnames, fmt='pipe') print('confusion matrix: ') print(table_str) if 'save_plot' in options: prefix = options['save_plot'] plot_validation_cost(cost_train, cost_val, savefilename='{}.validloss.png'.format(prefix)) with open('{}.confmat.txt'.format(prefix), mode='a') as f: f.write(table_str) f.write('\n\n') if 'write_results' in options: print('writing results to {}'.format(options['write_results'])) results_file = options['write_results'] with open(results_file, mode='a') as f: f.write('{},{},{}\n'.format(test_cr, best_cr, best_val)) if 'save_best' in options: print('saving best model...') las.layers.set_all_param_values(network, best_params) save_model_params(network, options['save_best']) print('best model saved to {}'.format(options['save_best']))
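The gl and pq quantities logged above follow Prechelt-style early-stopping measures: generalization loss relative to the best validation cost so far, and that loss divided by the recent training progress. A sketch of the same arithmetic, assuming cost_val and train_strip are maintained exactly as in the loop above:

import numpy as np

def early_stop_measures(cost_val, train_strip):
    # GL: percentage by which the current validation cost exceeds the best one seen so far
    gl = 100.0 * (cost_val[-1] / np.min(cost_val) - 1.0)
    # Pk: training progress over the last strip of epochs (small once training has flattened out)
    pk = 1000.0 * (np.sum(train_strip) / (len(train_strip) * np.min(train_strip)) - 1.0)
    # PQ: generalization loss per unit of training progress
    return gl, pk, gl / pk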
l_out_target = lasagne.layers.DenseLayer(l_reshape_target, num_units=n_features, nonlinearity=rectify) l_out_reshape_target = lasagne.layers.ReshapeLayer(l_out_target, (-1, x_sym.shape[1], n_features)) output_target = lasagne.layers.get_output(l_out_reshape_target, inputs={l_in: x_sym, l_mask: mask_x_sym}) # print lasagne.layers.get_output(l_out_reshape_target, inputs={l_in: x_sym, l_mask: mask_x_sym}).eval({x_sym:test_x,mask_x_sym:mask_test_x}).shape loss_all_target = lasagne.objectives.squared_error(output_target * mask_t_sym, t_sym) loss_mean_target = loss_all_target.mean() # print loss_mean_target.eval({x_sym:test_x,mask_x_sym:mask_test_x, t_sym: target_train, mask_t_sym: mask_target_train}) all_params_target = lasagne.layers.get_all_params([l_out_reshape_target]) all_grads_target = [T.clip(g, -3, 3) for g in T.grad(loss_mean_target, all_params_target)] all_grads_target = lasagne.updates.total_norm_constraint(all_grads_target, 3) updates_target = adam(all_grads_target, all_params_target) train_target = theano.function([x_sym, mask_x_sym, t_sym, mask_t_sym], loss_mean_target, updates=updates_target) test_target = theano.function([x_sym, mask_x_sym, t_sym, mask_t_sym], [loss_mean_target, output_target]) # for noise l_decoder_noise = lasagne.layers.GRULayer(l_bottle, num_units=NUM_UNITS_DEC) l_reshape_noise = lasagne.layers.ReshapeLayer(l_decoder_noise, (-1, NUM_UNITS_DEC)) l_out_noise = lasagne.layers.DenseLayer(l_reshape_noise, num_units=n_features, nonlinearity=rectify) l_out_reshape_noise = lasagne.layers.ReshapeLayer(l_out_noise, (-1, x_sym.shape[1], n_features)) output_noise = lasagne.layers.get_output(l_out_reshape_noise, inputs={l_in: x_sym, l_mask: mask_x_sym})
def create_model(n_feat): x_sym = T.tensor3() m_sym = T.tensor3() f_sym = T.tensor3() l_in = lasagne.layers.InputLayer(shape=(None, max_len, n_feat)) l_dec_fwd = lasagne.layers.GRULayer(l_in, num_units=NUM_UNITS_DEC, name='GRUDecoder', backwards=False) l_dec_bwd = lasagne.layers.GRULayer(l_in, num_units=NUM_UNITS_DEC, name='GRUDecoder', backwards=True) l_concat = lasagne.layers.ConcatLayer([l_dec_fwd, l_dec_bwd], axis=2) l_encoder_2_m = lasagne.layers.GRULayer(l_concat, num_units=NUM_UNITS_ENC) l_encoder_2_f = lasagne.layers.GRULayer(l_concat, num_units=NUM_UNITS_ENC) l_decoder_m = lasagne.layers.GRULayer(l_encoder_2_m, num_units=NUM_UNITS_DEC) l_decoder_f = lasagne.layers.GRULayer(l_encoder_2_f, num_units=NUM_UNITS_DEC) l_reshape_m = lasagne.layers.ReshapeLayer(l_decoder_m, (-1, NUM_UNITS_DEC)) l_dense_m = lasagne.layers.DenseLayer(l_reshape_m, num_units=n_feat, nonlinearity=nonlin) l_out_m = lasagne.layers.ReshapeLayer(l_dense_m, (-1, max_len, n_feat)) l_reshape_f = lasagne.layers.ReshapeLayer(l_decoder_f, (-1, NUM_UNITS_DEC)) l_dense_f = lasagne.layers.DenseLayer(l_reshape_f, num_units=n_feat, nonlinearity=nonlin) l_out_f = lasagne.layers.ReshapeLayer(l_dense_f, (-1, max_len, n_feat)) output_m = lasagne.layers.get_output(l_out_m, inputs={l_in: x_sym}) output_f = lasagne.layers.get_output(l_out_f, inputs={l_in: x_sym}) # here the three different types of training are selected via tpe if tpe == 0: loss_all_m = lasagne.objectives.squared_error(output_m * x_sym, m_sym) loss_all_f = lasagne.objectives.squared_error(output_f * x_sym, f_sym) loss_mean_m = T.mean(loss_all_m) loss_mean_f = T.mean(loss_all_f) if tpe == 1: loss_all_m = lasagne.objectives.squared_error(output_m * x_sym, m_sym) + \ lasagne.objectives.squared_error((1. - output_m) * x_sym, f_sym) loss_mean_m = T.mean(loss_all_m) if tpe == 2: loss_all_m = lasagne.objectives.squared_error(output_m * x_sym, m_sym) \ - 0.05 * lasagne.objectives.squared_error(output_m * x_sym, f_sym) loss_all_f = lasagne.objectives.squared_error(output_f * x_sym, f_sym) \ - 0.05 * lasagne.objectives.squared_error(output_f * x_sym, m_sym) loss_mean_m = T.mean(loss_all_m) loss_mean_f = T.mean(loss_all_f) all_params_target_m = lasagne.layers.get_all_params([l_out_m]) all_grads_target_m = [T.clip(g, -10, 10) for g in T.grad(loss_mean_m, all_params_target_m)] all_grads_target_m = lasagne.updates.total_norm_constraint(all_grads_target_m, 10) updates_target_m = adam(all_grads_target_m, all_params_target_m) train_model_m = theano.function([x_sym, m_sym, f_sym], [loss_mean_m, output_m], updates=updates_target_m, on_unused_input='ignore') test_model_m = theano.function([x_sym, m_sym, f_sym], [loss_mean_m, output_m], on_unused_input='ignore') if tpe != 1: all_params_target_f = lasagne.layers.get_all_params([l_out_f]) all_grads_target_f = [T.clip(g, -10, 10) for g in T.grad(loss_mean_f, all_params_target_f)] all_grads_target_f = lasagne.updates.total_norm_constraint(all_grads_target_f, 10) updates_target_f = adam(all_grads_target_f, all_params_target_f) train_model_f = theano.function([x_sym, f_sym, m_sym], [loss_mean_f, output_f], updates=updates_target_f, on_unused_input='ignore') test_model_f = theano.function([x_sym, f_sym, m_sym], [loss_mean_f, output_f], on_unused_input='ignore') return train_model_m, test_model_m, train_model_f, test_model_f return train_model_m, test_model_m
output = lasagne.layers.get_output(l_decoder, inputs={l_in: x_sym}) # output = T.nnet.sigmoid(T.dot(output, W) + b) # output = T.nnet.sigmoid(T.dot(output, W2) + b2) #print lasagne.layers.get_output(l_decoder, inputs={l_in: x_sym}).eval({x_sym:test_x,mask_x_sym:mask_test_x}).shape loss_all_target = lasagne.objectives.squared_error(output, t_sym).sum() loss_mean_target = loss_all_target / n_batch # print loss_mean_target.eval({x_sym:test_x,mask_x_sym:mask_test_x, t_sym: target_train, mask_t_sym: mask_target_train}) all_params_target = lasagne.layers.get_all_params([l_decoder]) all_grads_target = [T.clip(g, -10, 10) for g in T.grad(loss_mean_target, all_params_target)] all_grads_target = lasagne.updates.total_norm_constraint(all_grads_target, 10) updates_target = adam(all_grads_target, all_params_target) train_model = theano.function([x_sym, t_sym], [loss_mean_target, output], updates=updates_target) test_model = theano.function([x_sym, t_sym], [loss_mean_target, output]) num_min_batches = 100 n_batch = 100 epochs = 50 for i in range(epochs): start_time = time.time()
def __init__(self): self.learning_rate = 0.001 self.L1_reg = 0.0000 self.L2_reg = 0.0001 self.batch_size = 20 self.n_hidden = 50 self.num_inputs = 337 self.num_outputs = 1 self.momentum_coeff = 0.9 # allocate symbolic variables for the data x = T.ivector('x') y = T.iscalar('y') rng = np.random.RandomState(None) # construct the neural network's Architecture architecture = Architecture( rng=rng, input=[x], n_in=self.num_inputs, n_hidden=self.n_hidden, n_out=self.num_outputs ) cost = ( architecture.error_function(y) + self.L1_reg * architecture.L1 + self.L2_reg * architecture.L2_sqr ) # old version of stochastic gradient descent #gparams = [T.grad(cost, wrt=param) for param in architecture.params] #updates = [(param, param - self.learning_rate * gparam) for param, gparam in zip(architecture.params, gparams)] #stochastic gradient descent with adaptive learning using lasagne--take your pick #updates_sgd = sgd(cost, architecture.params, learning_rate=self.learning_rate) #updates = apply_momentum(updates_sgd, architecture.params, momentum=self.momentum_coeff) #updates = adadelta(cost, architecture.params, learning_rate=self.learning_rate, rho=0.95, epsilon=1e-06) updates = adam(cost, architecture.params, learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08) # backpropagation step that also contains a forward pass self.train_model = theano.function( inputs=[x, y], outputs=[cost, architecture.get_result()], updates=updates, allow_input_downcast=True ) # forward pass self.run_model = theano.function( inputs=[x], outputs=architecture.get_result(), allow_input_downcast=True ) self.grab_weights = theano.function( inputs=[], outputs=architecture.params, allow_input_downcast=True )
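A minimal, self-contained sketch of the same pattern, i.e. a regularised cost minimised with adam and explicit hyperparameters; the two-class softmax head and the zero-initialised weights are illustrative substitutions, only the 337-dimensional input matches the class above:

import numpy as np
import theano
import theano.tensor as T
from lasagne.updates import adam

x = T.matrix('x')
y = T.ivector('y')
W = theano.shared(np.zeros((337, 2), dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros(2, dtype=theano.config.floatX), name='b')

prob = T.nnet.softmax(T.dot(x, W) + b)
# negative log-likelihood plus an L2 penalty on the weights
cost = -T.mean(T.log(prob[T.arange(y.shape[0]), y] + 1e-6)) + 1e-4 * T.sum(W ** 2)

updates = adam(cost, [W, b], learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08)
train_model = theano.function([x, y], cost, updates=updates, allow_input_downcast=True)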
n_mixt_output) # Initial values of the variables that are transmitted through the recursion h_ini, k_ini, w_ini = model.create_shared_init_states(batch_size) loss, updates_ini, monitoring = model.apply(seq_pt, seq_pt_mask, seq_tg, seq_str, seq_str_mask, h_ini, k_ini, w_ini) ######################## # GRADIENT AND UPDATES # ######################## params = model.params grads = T.grad(loss, params) grads = clip_norm_gradients(grads) if algo == 'adam': updates_params = adam(grads, params, 0.0003) elif algo == 'sgd': updates_params = [] for p, g in zip(params, grads): updates_params.append((p, p - learning_rate * g)) else: raise ValueError('Specified algo does not exist') updates_all = updates_ini + updates_params ##################### # SAMPLING FUNCTION # ##################### pt_ini, h_ini_pred, k_ini_pred, w_ini_pred, bias = \ model.create_sym_init_states() create_gen_tag_values(model, pt_ini, h_ini_pred, k_ini_pred, w_ini_pred, bias,
def event_span_classifier(args, input_var, input_mask_var, target_var, wordEmbeddings, seqlen): print("Building model with LSTM") vocab_size = wordEmbeddings.shape[1] wordDim = wordEmbeddings.shape[0] GRAD_CLIP = wordDim args.lstmDim = 150 input = InputLayer((None, seqlen),input_var=input_var) batchsize, seqlen = input.input_var.shape input_mask = InputLayer((None, seqlen),input_var=input_mask_var) emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) #emb.params[emb.W].remove('trainable') lstm = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask, grad_clipping=GRAD_CLIP, nonlinearity=tanh) lstm_back = LSTMLayer( emb, num_units=args.lstmDim, mask_input=input_mask, grad_clipping=GRAD_CLIP, nonlinearity=tanh, backwards=True) slice_forward = SliceLayer(lstm, indices=-1, axis=1) # out_shape (None, args.lstmDim) slice_backward = SliceLayer(lstm_back, indices=0, axis=1) # out_shape (None, args.lstmDim) concat = ConcatLayer([slice_forward, slice_backward]) hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid) network = DenseLayer(hid, num_units=2, nonlinearity=softmax) prediction = get_output(network) loss = T.mean(binary_crossentropy(prediction,target_var)) lambda_val = 0.5 * 1e-4 layers = {emb:lambda_val, lstm:lambda_val, hid:lambda_val, network:lambda_val} penalty = regularize_layer_params_weighted(layers, l2) loss = loss + penalty params = get_all_params(network, trainable=True) if args.optimizer == "sgd": updates = sgd(loss, params, learning_rate=args.step) elif args.optimizer == "adagrad": updates = adagrad(loss, params, learning_rate=args.step) elif args.optimizer == "adadelta": updates = adadelta(loss, params, learning_rate=args.step) elif args.optimizer == "nesterov": updates = nesterov_momentum(loss, params, learning_rate=args.step) elif args.optimizer == "rms": updates = rmsprop(loss, params, learning_rate=args.step) elif args.optimizer == "adam": updates = adam(loss, params, learning_rate=args.step) else: raise ValueError("Set the optimizer correctly") test_prediction = get_output(network, deterministic=True) test_loss = T.mean(binary_crossentropy(test_prediction,target_var)) train_fn = theano.function([input_var, input_mask_var,target_var], loss, updates=updates, allow_input_downcast=True) test_acc = T.mean(binary_accuracy(test_prediction, target_var)) val_fn = theano.function([input_var, input_mask_var, target_var], [test_loss, test_acc], allow_input_downcast=True) return train_fn, val_fn, network
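The optimizer selection above can also be written as a lookup table, which keeps the supported names in one place and raises a proper exception for anything unknown; a sketch built on the same lasagne update functions:

from lasagne.updates import sgd, adagrad, adadelta, nesterov_momentum, rmsprop, adam

OPTIMIZERS = {
    "sgd": sgd,
    "adagrad": adagrad,
    "adadelta": adadelta,
    "nesterov": nesterov_momentum,
    "rms": rmsprop,
    "adam": adam,
}

def make_updates(loss, params, name, step):
    try:
        optimizer = OPTIMIZERS[name]
    except KeyError:
        raise ValueError("Unknown optimizer: {}".format(name))
    return optimizer(loss, params, learning_rate=step)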
T.mean(lasagne.objectives.squared_error(masked_f, f_sym)) #print loss_all.eval({x_sym: ftrain_x, m_sym: ftrain_m, f_sym: ftrain_f}) # - gamma * lasagne.objectives.squared_error(masked_f, m_sym) # - gamma * lasagne.objectives.squared_error(masked_m, f_sym) all_params_target_m = lasagne.layers.get_all_params([l_out_m]) all_grads_target_m = [T.clip(g, -10, 10) for g in T.grad(loss_all, all_params_target_m)] all_grads_target_m = lasagne.updates.total_norm_constraint(all_grads_target_m, 10) all_params_target_f = lasagne.layers.get_all_params([l_out_f]) all_grads_target_f = [T.clip(g, -10, 10) for g in T.grad(loss_all, all_params_target_f)] all_grads_target_f = lasagne.updates.total_norm_constraint(all_grads_target_f, 10) updates_target_m = adam(all_grads_target_m, all_params_target_m) updates_target_f = adam(all_grads_target_f, all_params_target_f) train_model_m = theano.function([x_sym, m_sym, f_sym], loss_all, updates=updates_target_m) train_model_f = theano.function([x_sym, f_sym, m_sym], loss_all, updates=updates_target_f) test_model_m = theano.function([x_sym, m_sym, f_sym], [loss_all, output_m]) test_model_f = theano.function([x_sym, f_sym, m_sym], [loss_all, output_f])
# Fit model dtensor5 = TensorType('float32', (False,)*5) input_var = dtensor5('inputs') target_var = T.fvector('targets') network = build_cnn(input_var)['output'] # Create loss function prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.squared_error(prediction, target_var) loss = loss.mean() # Create parameter update expressions (later I will make rates adaptive) params = lasagne.layers.get_all_params(network, trainable=True) # updates = nesterov_momentum(loss, params, learning_rate=0.01, # momentum=0.9) updates = adam(loss, params) test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.squared_error(test_prediction, target_var) test_loss = test_loss.mean() test_acc = T.mean(lasagne.objectives.squared_error(test_prediction, target_var), dtype=theano.config.floatX) # Compile training function that updates parameters and returns training loss train_fn = theano.function([input_var, target_var], loss, updates=updates) val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) num_epochs = 8000 # Will probably not do this many b/c of early stopping best_network_weights_epoch = 0 epoch_accuracies = [] # Train network for epoch in range(num_epochs):
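A loop of this kind typically tracks the best validation loss and keeps a copy of the corresponding weights so they can be restored after training; a hedged sketch using lasagne's parameter helpers, where iterate_batches, train_data, and val_data are placeholders rather than names from the original code:

import numpy as np
import lasagne

best_val_loss = np.inf
best_weights = None
for epoch in range(num_epochs):
    for inputs, targets in iterate_batches(train_data):          # placeholder iterator
        train_fn(inputs, targets)
    val_losses = [val_fn(inputs, targets)[0]
                  for inputs, targets in iterate_batches(val_data)]
    val_loss = np.mean(val_losses)
    if val_loss < best_val_loss:                                  # new best epoch
        best_val_loss = val_loss
        best_weights = lasagne.layers.get_all_param_values(network)
# restore the best weights once training (or early stopping) finishes
lasagne.layers.set_all_param_values(network, best_weights)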