def test_total_norm_constraint():
    """total_norm_constraint must rescale a tensor list so the combined
    norm equals the threshold, and report the pre-clipping norm when
    `return_norm=True`; both graph variants must clip identically."""
    import numpy as np
    import theano
    import theano.tensor as T
    from lasagne.updates import total_norm_constraint

    scalar_var = T.scalar()
    matrix_var = T.matrix()
    max_norm = 5.0

    # Build the clipped graphs both ways: norm hidden and norm exposed.
    clipped_plain = total_norm_constraint([scalar_var, matrix_var], max_norm,
                                          return_norm=False)
    clipped_pair, norm_var = total_norm_constraint([scalar_var, matrix_var],
                                                   max_norm, return_norm=True)

    fn_plain = theano.function([scalar_var, matrix_var],
                               [clipped_plain[0], clipped_plain[1]])
    fn_with_norm = theano.function([scalar_var, matrix_var],
                                   [clipped_pair[0], clipped_pair[1],
                                    norm_var])

    values = np.arange(10, dtype='float32')
    scalar_val = values[-1]
    matrix_val = values[:9].reshape((3, 3))

    out_scalar_a, out_matrix_a = fn_plain(scalar_val, matrix_val)
    out_scalar_b, out_matrix_b, norm_val = fn_with_norm(scalar_val, matrix_val)

    # Both compiled graphs must produce the same clipped tensors.
    np.testing.assert_array_almost_equal(out_scalar_a, out_scalar_b)
    np.testing.assert_array_almost_equal(out_matrix_a, out_matrix_b)

    flattened = [float(out_scalar_a)] + list(out_matrix_a.flatten())
    # The reported norm is the pre-clipping norm of all inputs combined...
    np.testing.assert_array_almost_equal(np.linalg.norm(values), norm_val)
    # ...while the clipped outputs together have norm equal to the threshold.
    np.testing.assert_array_almost_equal(np.linalg.norm(flattened), max_norm)
def test_total_norm_constraint():
    """Check total_norm_constraint with and without `return_norm`.

    NOTE(review): this is a byte-for-byte duplicate (modulo spacing in
    `1 + 9`) of the test_total_norm_constraint defined just above;
    consider removing one copy.
    """
    import numpy as np
    import theano
    import theano.tensor as T
    from lasagne.updates import total_norm_constraint
    x1 = T.scalar()
    x2 = T.matrix()
    threshold = 5.0
    # Same tensor list clipped twice: once plain, once also returning the norm.
    tensors1 = total_norm_constraint([x1, x2], threshold, return_norm=False)
    tensors2, norm = total_norm_constraint([x1, x2], threshold, return_norm=True)
    f1 = theano.function([x1, x2], [tensors1[0], tensors1[1]])
    f2 = theano.function([x1, x2], [tensors2[0], tensors2[1], norm])
    x_test = np.arange(1 + 9, dtype='float32')
    x1_test = x_test[-1]
    x2_test = x_test[:9].reshape((3, 3))
    x1_out1, x2_out1 = f1(x1_test, x2_test)
    x1_out2, x2_out2, norm = f2(x1_test, x2_test)
    # Both compiled graphs must clip identically.
    np.testing.assert_array_almost_equal(x1_out1, x1_out2)
    np.testing.assert_array_almost_equal(x2_out1, x2_out2)
    x_out = [float(x1_out1)] + list(x2_out1.flatten())
    # Reported norm is the pre-clipping norm; the clipped outputs together
    # have norm equal to `threshold`.
    np.testing.assert_array_almost_equal(np.linalg.norm(x_test), norm)
    np.testing.assert_array_almost_equal(np.linalg.norm(x_out), threshold)
def apply_grad_norm_clip(gradients, clip=None):
    """Rescale the gradients' global norm to `clip` (when given) and
    return `(gradients, norm)`.

    With `clip=None` the gradients pass through untouched; the norm is
    still computed (against a dummy max of 1) purely so it can be
    returned for monitoring.
    """
    if clip is not None:
        gradients, norm = LU.total_norm_constraint(gradients, clip,
                                                   return_norm=True)
        return gradients, norm
    # No clipping requested: only measure the norm, keep gradients as-is.
    _, norm = LU.total_norm_constraint(gradients, 1, return_norm=True)
    return gradients, norm
def careful_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9,
                    epsilon=1e-6, grad_clipping=1.0e-2):
    """RMSProp with global gradient-norm clipping.

    :param loss_or_grads: scalar loss or a precomputed gradient list.
    :param params: shared variables to update.
    :param learning_rate: step size.
    :param rho: decay rate for the squared-gradient accumulator.
    :param epsilon: small number for numerical stability.
    :param grad_clipping: maximal norm of the gradient; if the actual
        gradient norm exceeds this value it is rescaled.
    :return: OrderedDict of update expressions.
    """
    gradients = get_or_compute_grads(loss_or_grads, params)
    # Cap the global gradient norm before any per-parameter work.
    gradients = total_norm_constraint(gradients, max_norm=grad_clipping,
                                      epsilon=epsilon)
    result = OrderedDict()
    # A theano constant prevents upcasting of float32.
    one = T.constant(1)
    for p, g in zip(params, gradients):
        val = p.get_value(borrow=True)
        accumulator = theano.shared(np.zeros(val.shape, dtype=val.dtype),
                                    broadcastable=p.broadcastable)
        new_accumulator = rho * accumulator + (one - rho) * g ** 2
        result[accumulator] = new_accumulator
        result[p] = p - learning_rate * g / T.sqrt(new_accumulator + epsilon)
    return result
def u(loss_or_grads, params, *args, **kwargs):
    # Wrapper update rule: clip the global gradient norm, then delegate.
    # NOTE(review): `grad_clipping`, `epsilon` and `updates` are free
    # variables here — presumably closed over by an enclosing factory
    # function; confirm at the definition site.
    grads = get_or_compute_grads(loss_or_grads, params)
    # Rescale the whole gradient list so its global norm is at most
    # `grad_clipping` before handing it to the wrapped update rule.
    grads = total_norm_constraint(grads, max_norm=grad_clipping, epsilon=epsilon)
    return updates(grads, params, *args, **kwargs)
def build_trainer(input_data, input_mask, target_data, target_mask,
                  network_params, network_reg_params, output_layer,
                  weight_decay, updater, learning_rate,
                  max_grad_norm=0.0, load_updater_params=None):
    """Compile a Theano training function for a frame-level classifier.

    Returns ``(training_fn, train_lr, updater_params)``; ``train_lr`` is the
    shared learning-rate variable so callers can adjust it between epochs.

    NOTE(review): ``output_dim`` and ``floatX`` are free names, presumably
    module-level constants — confirm at module scope.
    """
    # Raw network scores and per-frame argmax predictions.
    output_score = get_output(output_layer, deterministic=False)
    frame_prd_idx = T.argmax(output_score, axis=-1)
    one_hot_target = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                            nb_class=output_dim,
                                            dtype=floatX)
    # Numerically-stable log-softmax over the flattened frames.
    output_score = T.reshape(x=output_score, newshape=(-1, output_dim), ndim=2)
    output_score = output_score - T.max(output_score, axis=-1, keepdims=True)
    output_score = output_score - T.log(T.sum(T.exp(output_score), axis=-1, keepdims=True))
    # Masked cross-entropy per frame.
    train_ce = -T.sum(T.mul(one_hot_target, output_score), axis=-1)*T.flatten(target_mask, 1)
    # Loss normalised per sequence (train_loss) vs per valid frame (frame_loss).
    train_loss = T.sum(train_ce)/target_mask.shape[0]
    frame_loss = T.sum(train_ce)/T.sum(target_mask)
    frame_accr = T.sum(T.eq(frame_prd_idx, target_data)*target_mask)/T.sum(target_mask)
    train_total_loss = train_loss
    # Optional L2 penalty; `weight_decay` is an exponent: scale = 10**(-weight_decay).
    if weight_decay > 0:
        train_total_loss += apply_penalty(network_reg_params, l2)*10**(-weight_decay)
    network_grads = theano.grad(cost=train_total_loss, wrt=network_params)
    # Either rescale the global gradient norm, or just measure it for logging.
    if max_grad_norm > 0.:
        network_grads, network_grads_norm = total_norm_constraint(tensor_vars=network_grads,
                                                                  max_norm=max_grad_norm,
                                                                  return_norm=True)
    else:
        network_grads_norm = T.sqrt(sum(T.sum(grad ** 2) for grad in network_grads))
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, updater_params = updater(loss_or_grads=network_grads,
                                            params=network_params,
                                            learning_rate=train_lr,
                                            load_params_dict=load_updater_params)
    training_fn = theano.function(inputs=[input_data, input_mask,
                                          target_data, target_mask],
                                  outputs=[frame_loss, frame_accr,
                                           network_grads_norm],
                                  updates=train_updates)
    return training_fn, train_lr, updater_params
def create_dadgm_gradients(self, loss, deterministic=False):
    """Gradients for the DADGM model: the base GSM gradients, rescaled
    to a global norm of at most 5 and then clipped elementwise to
    [-1, 1]."""
    raw_grads = GSM.create_gradients(self, loss, deterministic)
    # Combine and clip gradients: global-norm rescale first, then
    # per-element clipping.
    max_norm = 5
    clip_val = 1
    rescaled = total_norm_constraint(raw_grads, max_norm=max_norm)
    return [T.clip(g, -clip_val, clip_val) for g in rescaled]
def set_network_trainer(input_data, input_mask, target_data, target_mask,
                        network, updater, learning_rate,
                        grad_max_norm=10.,
                        l2_lambda=1e-5,
                        load_updater_params=None):
    """Compile the training function for ``network``.

    Fixes relative to the previous version:
    * ``l2_lambda`` was commented out of the signature while still being
      used in the body (NameError) — restored with its old default.
    * a second ``theano.grad`` call overwrote the regularized gradients
      with unregularized ones, silently disabling weight decay — removed.
    * the ``outputs=`` list had a mismatched bracket — repaired.
    * the compiled objects were never returned — now returns
      ``(training_fn, trainer_params)`` like the sibling CTC trainer.

    NOTE(review): ``eps`` is a free name, presumably a module-level
    smoothing constant — confirm at module scope.
    """
    # get network output data
    predict_data = get_output(network, deterministic=False)
    predict_idx = T.argmax(predict_data, axis=-1)

    # get prediction cost: masked cross-entropy, normalised by mask size
    train_predict_cost = categorical_crossentropy(
        predictions=T.reshape(predict_data,
                              (-1, predict_data.shape[-1])) + eps,
        targets=T.flatten(target_data, 1))
    train_predict_cost = train_predict_cost*T.flatten(target_mask, 1)
    train_predict_cost = train_predict_cost.sum()/target_mask.sum()

    # get regularizer cost (L2 over all network parameters)
    train_regularizer_cost = regularize_network_params(network, penalty=l2)

    # get network parameters
    network_params = get_all_params(network, trainable=True)

    # gradients of the full (regularized) objective, with norm clipping
    network_grads = theano.grad(
        cost=train_predict_cost + train_regularizer_cost*l2_lambda,
        wrt=network_params)
    network_grads, network_grads_norm = total_norm_constraint(
        tensor_vars=network_grads,
        max_norm=grad_max_norm,
        return_norm=True)

    # set updater; train_lr is shared so it can be decayed externally
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, trainer_params = updater(
        loss_or_grads=network_grads,
        params=network_params,
        learning_rate=train_lr,
        load_params_dict=load_updater_params)

    # get training (update) function
    training_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[predict_data, predict_idx, train_predict_cost,
                 train_regularizer_cost, network_grads_norm],
        updates=train_updates,
        allow_input_downcast=True)

    return training_fn, trainer_params
def calculate_gradient(loss, params, weight_norm=None):
    """Calculate gradients, optionally rescaling their global norm.

    :param loss: scalar loss expression.
    :param params: parameters to differentiate with respect to.
    :param weight_norm: maximum total gradient norm; any falsy value
        (None, 0, empty list) disables the constraint.  The previous
        mutable default ``[]`` is replaced by ``None`` — behaviour is
        unchanged for all callers, since only truthiness is tested.
    :return: list of (possibly rescaled) gradient expressions.
    """
    grad = T.grad(loss, params)
    # gradient norm option
    if weight_norm:
        grad = updates.total_norm_constraint(grad, weight_norm)
    return grad
def train_function(self, semi_supervised= True, unlabel_stable=False):
    '''
    Build the per-epoch training function.

    semi_supervised == True: semi-supervised learning — unlabeled data
    contributes reconstruction/KL/entropy terms to the cost.
    return: compiled theano function for one training step.
    '''
    self.semi_supervised = semi_supervised
    # Symbolic warm-up weights for the KL term and the classifier term.
    sym_klw = T.scalar('sym_klw',dtype=theano.config.floatX) # symbolic scalar of warming up
    sym_cw = T.scalar('sym_cw',dtype=theano.config.floatX) # classifier warm up
    # Labeled batch: token ids, mask, one-hot labels.
    sym_s = T.matrix('sym_s',dtype='int64')
    sym_mask = T.matrix('sym_mask',dtype=theano.config.floatX)
    sym_y = T.matrix('sym_label',dtype=theano.config.floatX)
    # Unlabeled batch: token ids and mask only.
    sym_s_u = T.matrix('sym_s_u',dtype='int64')
    sym_mask_u = T.matrix('sym_mask_u', dtype=theano.config.floatX)
    # Batch sizes, as floats so the cost weighting divides cleanly.
    num_l, num_u = sym_s.shape[0].astype(theano.config.floatX), 0.0
    if self.semi_supervised:
        print 'Train with unlabel data.'
        num_u = sym_s_u.shape[0].astype(theano.config.floatX)
    # get labeled/unlabeled cost
    outs1 = self.cost_label([sym_s, sym_mask, sym_y], dev_stage=False,
                            return_mode = 'mean')
    loss_recons, loss_kl, valid_words, word_drop_num, loss_classifier, batch_ppl, acc = outs1
    # Defaults keep the graph valid when no unlabeled data is used.
    loss_recons_u, loss_kl_u,loss_entropy_u, batch_ppl_u = 0.0,0.0,0.0,0.0
    valid_words_u = 0
    if self.semi_supervised:
        outs2 = self.cost_unlabel([sym_s_u, sym_mask_u],
                                  dev_stage=unlabel_stable,
                                  sample_by_prob=self.sample_unlabel)
        loss_recons_u, loss_kl_u, valid_words_u, loss_entropy_u, batch_ppl_u = outs2
    '''
    total Loss:
    L = Loss_labeled(s,mask,y) + beta*(n_l+n_u)/n_l * Loss_classisifer(s,mask,y) + Loss_unlabel(s_u, mask_u)
    L = recons_term + sym_klw_term + loss_classifier_term - loss_entropy_u
    '''
    # Classifier weight, scaled by warm-up and the labeled fraction.
    alpha = sym_cw * self.cost_beta * ( num_l + num_u ) / num_l
    # Batch-size-weighted sum of all terms, normalised by total examples.
    total_cost = loss_recons * num_l + loss_recons_u * num_u\
        + sym_klw * ( loss_kl * num_l + loss_kl_u * num_u)\
        + alpha * loss_classifier * num_l\
        - loss_entropy_u * num_u
    total_cost /= (num_l + num_u)
    train_params = self.get_params(only_trainable=True)
    all_grads = theano.grad(total_cost,train_params)
    # Elementwise clip first, then global-norm rescale.
    all_grads = [T.clip(g, -self.grad_clipping, self.grad_clipping)
                 for g in all_grads]
    all_grads = total_norm_constraint( all_grads, max_norm=self.max_norm )
    #all_grads = [T.clip(g, -self.grad_clipping, self.grad_clipping) for g in all_grads]
    updates = adam(all_grads,train_params, self.lr, self.beta1, self.beta2)
    # Input/output signature depends on whether unlabeled data is fed.
    if self.semi_supervised:
        train_input = [sym_s, sym_mask, sym_y, sym_s_u, sym_mask_u,
                       sym_klw, sym_cw]
        train_output = [total_cost, loss_recons, loss_recons_u, loss_kl,
                        loss_kl_u, alpha, loss_classifier, loss_entropy_u,
                        batch_ppl, batch_ppl_u, valid_words, valid_words_u,
                        word_drop_num, acc]
    else:
        train_input = [sym_s, sym_mask, sym_y, sym_klw, sym_cw]
        train_output = [total_cost, loss_recons, loss_kl, loss_classifier,
                        batch_ppl, valid_words, word_drop_num, acc]
    train_f = theano.function(inputs=train_input, outputs=train_output,
                              updates=updates, name='train_function')
    return train_f
def train_expectation_function(self):
    '''
    Build a training function where the unlabeled-data term is computed
    as an exact expectation over the predicted label distribution.
    '''
    print "Train Function: Calculate the Expectation of unlabeled data."
    sym_klw = T.scalar('sym_klw', dtype=theano.config.floatX) # symbolic scalar of warming up
    # Labeled batch: token ids, mask, one-hot labels.
    sym_sents = T.matrix('sym_s', dtype='int64')
    sym_mask = T.matrix('sym_mask', dtype=theano.config.floatX)
    # one hot!
    sym_label = T.matrix('sym_label', dtype=theano.config.floatX)
    # Unlabeled batch.
    sym_sents_u = T.matrix('sym_s_u', dtype='int64')
    sym_mask_u = T.matrix('sym_mask_u', dtype=theano.config.floatX)
    num_l, num_u = sym_sents.shape[0].astype(theano.config.floatX), \
        sym_sents_u.shape[0].astype(theano.config.floatX)
    num_all = num_l + num_u
    # forward the network and get cost values
    enc_sents, dec_sents, _ = self._forward_sents(sym_sents, dev_stage=False)
    enc_sents_u, dec_sents_u, _ = self._forward_sents(sym_sents_u,
                                                      dev_stage=False)
    # classifier loss (entropy term only for the unlabeled batch)
    y_pred, loss_class, acc = self._forward_classifier([enc_sents, sym_mask],
                                                       sym_label,
                                                       dev_stage=False)
    y_pred_u, loss_entropy, _ = self._forward_classifier(
        [enc_sents_u, sym_mask_u], None, dev_stage=False)
    # reconstruction and kl loss
    loss_rec, loss_kl, ppl = self.cost_label(
        [sym_sents, enc_sents, dec_sents, sym_mask, sym_label],
        dev_stage=False)
    loss_rec_u, loss_kl_u, ppl_u = self.cost_unlabel_expectation(
        [sym_sents_u, enc_sents_u, dec_sents_u, sym_mask_u, y_pred_u],
        dev_stage=False)
    # use baseline (variance reduction on the unlabeled reconstruction term)
    if self.use_baseline:
        baselines_u = self._get_baselines([sym_sents_u, enc_sents_u,
                                           sym_mask_u])
        loss_rec_u -= baselines_u
    # Total cost: reconstruction - entropy + warmed-up KL + weighted
    # classifier loss, normalised by the total number of examples.
    total_cost = T.sum(loss_rec) + T.sum(loss_rec_u) - T.sum(loss_entropy)
    total_cost += sym_klw * (T.sum(loss_kl) + T.sum(loss_kl_u))
    total_cost += self.alpha * T.sum(loss_class) * num_all / num_l
    total_cost /= num_all
    all_params = self.get_params(tag='all')
    all_grads = theano.grad(total_cost, all_params)
    # Cap the global gradient norm.
    all_grads = total_norm_constraint(all_grads, max_norm=self.max_norm)
    updates = adam(all_grads, all_params, self.lr)
    train_input = [sym_sents, sym_mask, sym_label, sym_sents_u,
                   sym_mask_u, sym_klw]
    train_output = [total_cost, T.mean(loss_rec), T.mean(loss_rec_u),
                    T.mean(loss_kl), T.mean(loss_kl_u), T.mean(loss_class),
                    T.mean(loss_entropy), ppl, ppl_u, acc, self.b]
    train_f = theano.function(inputs=train_input, outputs=train_output,
                              updates=updates, name='train_expectation')
    return train_f
def _get_updates(self, loss, params, optimizer, optimizer_params={},
                 clip_grad=None, max_norm_constraint=None, clip_param=None):
    """Build optimizer updates with optional gradient/parameter clipping.

    NOTE(review): when ``clip_param`` is set, ``params`` is rebound to a
    list of ``T.clip`` *expressions* before ``T.grad`` and before being
    handed to ``optimizer`` — gradients wrt non-shared expressions and
    updates keyed on expressions look broken; confirm the intent.
    NOTE(review): ``optimizer_params={}`` is a mutable default (benign
    here since it is never mutated); the Python 2 ``print`` below looks
    like leftover debug output.
    """
    if clip_param:
        print clip_param
        params = [T.clip(p, -clip_param, clip_param) for p in params]
    grads = T.grad(loss, params)
    # Global-norm rescale first, then elementwise clip.
    if max_norm_constraint:
        grads =\
            total_norm_constraint(grads, max_norm=max_norm_constraint)
    if clip_grad:
        grads = [T.clip(g, -clip_grad, clip_grad) for g in grads]
    return optimizer(grads, params, **optimizer_params)
def _get_updates(self, loss, params, optimizer, optimizer_params=None,
                 clip_grad=None, max_norm_constraint=None):
    """Build optimizer updates from ``loss``, with optional gradient
    norm-rescaling and elementwise clipping.

    :param loss: scalar loss expression.
    :param params: shared variables to update.
    :param optimizer: update rule, called as ``optimizer(grads, params, ...)``.
    :param optimizer_params: extra keyword args for ``optimizer``.  ``None``
        (replacing the old mutable ``{}`` default — same behaviour) means
        no extras.
    :param clip_grad: elementwise clip bound, applied after the norm
        constraint.
    :param max_norm_constraint: maximum global gradient norm.
    :return: update dictionary produced by ``optimizer``.
    """
    if optimizer_params is None:
        optimizer_params = {}
    grads = T.grad(loss, params)
    # Global-norm rescale first, then clip individual elements.
    if max_norm_constraint:
        grads =\
            total_norm_constraint(grads, max_norm=max_norm_constraint)
    if clip_grad:
        grads = [T.clip(g, -clip_grad, clip_grad) for g in grads]
    return optimizer(grads, params, **optimizer_params)
def params_update(grads, params, lrt, max_g=None):
    """Build parameter updates using the optimizer named in ``state``,
    optionally rescaling the gradients' global norm to ``max_g`` first."""
    def optimize(gs, ps):
        # Dispatch on the configured optimization method; an unknown
        # method raises (result stays unbound), as before.
        method = state['optim_method']
        if method == 'adam':
            result = adam(gs, ps, lrt, state['momentum'])
        elif method == 'adagrad':
            result = adagrad(gs, ps, lrt)
        elif method == 'sgd':
            result = sgd(gs, ps, lrt)
        return result

    if max_g is None:
        return optimize(grads, params)
    return optimize(total_norm_constraint(grads, max_g), params)
def adam(self, cost, params, learning_rate=0.001, beta1=0.9, beta2=0.999,
         epsilon=1e-8):
    """Adam updates with a global gradient-norm cap of 10 and a
    non-finite guard: if the (post-cap) gradient norm is NaN or Inf,
    every gradient is replaced by ``0.1 * param`` for that step."""
    grads = T.grad(cost=cost, wrt=params)
    grads = total_norm_constraint(grads, 10)
    norm = T.sqrt(sum(T.sqr(g).sum() for g in grads))
    bad_norm = T.or_(T.isnan(norm), T.isinf(norm))

    step_count = theano.shared(utils.floatX(0.))
    new_count = step_count + 1
    # Bias-corrected step size for this iteration.
    alpha = learning_rate * T.sqrt(1 - beta2**new_count) / (1 - beta1**new_count)

    updates = OrderedDict()
    for p, g in zip(params, grads):
        # Fall back to a pull-towards-zero step on non-finite gradients.
        g = T.switch(bad_norm, 0.1 * p, g)
        val = p.get_value(borrow=True)
        m_state = theano.shared(np.zeros(val.shape, dtype=val.dtype),
                                broadcastable=p.broadcastable)
        v_state = theano.shared(np.zeros(val.shape, dtype=val.dtype),
                                broadcastable=p.broadcastable)
        m_next = beta1 * m_state + (1 - beta1) * g
        v_next = beta2 * v_state + (1 - beta2) * g**2
        updates[m_state] = m_next
        updates[v_state] = v_next
        updates[p] = p - alpha * m_next / (T.sqrt(v_next) + epsilon)
    updates[step_count] = new_count
    return updates
def cruel_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9,
                  epsilon=1e-6, grad_clipping=1.0e-2, param_clipping=1.0e-2):
    """A version of careful RMSProp for Wassershtein GAN.

    :param epsilon: small number for computational stability.
    :param grad_clipping: maximal norm of gradient; if the norm of the
        actual gradient exceeds this value it is rescaled.
    :param param_clipping: after each update all params are clipped to
        [-`param_clipping`, `param_clipping`]; None disables this.
    :return: OrderedDict of update expressions.
    """
    gradients = get_or_compute_grads(loss_or_grads, params)
    # Cap the global gradient norm before per-parameter work.
    gradients = total_norm_constraint(gradients, max_norm=grad_clipping,
                                      epsilon=epsilon)
    result = OrderedDict()
    # A theano constant keeps float32 arithmetic from upcasting.
    one = T.constant(1)
    for p, g in zip(params, gradients):
        val = p.get_value(borrow=True)
        acc = theano.shared(np.zeros(val.shape, dtype=val.dtype),
                            broadcastable=p.broadcastable)
        acc_next = rho * acc + (one - rho) * g ** 2
        result[acc] = acc_next
        stepped = p - learning_rate * g / T.sqrt(acc_next + epsilon)
        # Box-constrain the parameters (Wasserstein critic weight clipping).
        if param_clipping is not None:
            result[p] = T.clip(stepped, -param_clipping, param_clipping)
        else:
            result[p] = stepped
    return result
def get_f_train(self): network_params = self.get_params() for param in network_params: print param.get_value().shape, param.name x = T.imatrix() m = T.matrix() y = T.matrix() pred = layers.get_output(self.l_y, { self.l_x: x, self.l_m: m, }, deterministic=False) cost = objectives.categorical_crossentropy(pred, y).mean() acc = T.eq(T.argmax(pred, axis=1), T.argmax(y, axis=1)).mean() grads = theano.grad(cost, network_params) grads = updates.total_norm_constraint(grads, max_norm=20.0) grads = [T.clip(g, -10.0, 10.0) for g in grads] params_update = updates.adam(grads, network_params, self.lr) f_train = theano.function([x, m, y], [cost, acc], updates=params_update) return f_train
# Minimal example: softmax classifier trained with SGD after rescaling
# the gradients' global norm to at most 5.
from lasagne.layers import InputLayer, DenseLayer
import lasagne
from lasagne.updates import sgd, total_norm_constraint
import theano.tensor as T

x = T.matrix()
y = T.ivector()
l_in = InputLayer((5, 10))
l1 = DenseLayer(l_in, num_units=7, nonlinearity=T.nnet.softmax)
output = lasagne.layers.get_output(l1, x)
cost = T.mean(T.nnet.categorical_crossentropy(output, y))
all_params = lasagne.layers.get_all_params(l1)
all_grads = T.grad(cost, all_params)
# BUGFIX: this previously read `total_norm_constraint(all_grads[i], 5)`,
# referencing an undefined index `i`; the constraint must receive the
# whole gradient list so the *global* norm is capped.
scaled_grads = total_norm_constraint(all_grads, 5)
updates = sgd(scaled_grads, all_params, learning_rate=0.1)
def set_network_trainer(input_data, input_mask, target_data, target_mask,
                        network, updater, learning_rate,
                        grad_max_norm=10., l2_lambda=1e-5,
                        load_updater_params=None):
    """Compile the CTC training function for ``network``.

    Returns ``(training_fn, trainer_params)``.
    """
    ###########################
    # get network output data #
    ###########################
    # get network output data
    network_output = get_output(network, deterministic=False)
    ################################
    # get training cost (CTC + L2) #
    ################################
    # get prediction cost (CTC); sequences are dimshuffled to time-major
    train_ctc_cost = ctc_cost(y=target_data.dimshuffle(1, 0),
                              y_mask=target_mask.dimshuffle(1, 0),
                              y_hat=network_output.dimshuffle(1, 0, 2),
                              y_hat_mask=input_mask.dimshuffle(1, 0),
                              skip_softmax=True)
    train_ctc_cost = train_ctc_cost.mean()
    # get prediction cost (char-level), CTC)
    train_cost_per_char = train_ctc_cost/target_mask.sum()
    # get regularizer cost (L2)
    train_regularizer_cost = regularize_network_params(network, penalty=l2)
    ##########################
    # get network parameters #
    ##########################
    network_params = get_all_params(network, trainable=True)
    #########################
    # get network gradients #
    #########################
    # get gradient over cost (CTC + weighted L2 penalty)
    network_grads = theano.grad(cost=train_ctc_cost + train_regularizer_cost*l2_lambda,
                                wrt=network_params)
    # get gradient norm constraint (rescale to grad_max_norm, keep the norm
    # for monitoring)
    network_grads, network_grads_norm = total_norm_constraint(tensor_vars=network_grads,
                                                              max_norm=grad_max_norm,
                                                              return_norm=True)
    #######################
    # get network updater #
    #######################
    # get learning rate variable (shared, so it can be decayed externally)
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    # get updater
    train_updates, trainer_params = updater(loss_or_grads=network_grads,
                                            params=network_params,
                                            learning_rate=train_lr,
                                            load_params_dict=load_updater_params)
    ################################
    # get network updater function #
    ################################
    training_fn = theano.function(inputs=[input_data, input_mask,
                                          target_data, target_mask],
                                  outputs=[train_ctc_cost,
                                           train_cost_per_char,
                                           train_regularizer_cost,
                                           network_grads_norm],
                                  updates=train_updates)
    return training_fn, trainer_params
def train_sample_function(self, ew=1.0):
    '''
    Build a training function where the unlabeled-data term is estimated
    by sampling a single label per example (REINFORCE-style), with the
    score-function gradient assembled by hand.

    :param ew: weight on the unlabeled entropy term.
    '''
    print "Train Function: Estimate the Expectation of unlabeled data by Sample."
    sym_klw = T.scalar(
        'sym_klw',
        dtype=theano.config.floatX)  # symbolic scalar of warming up
    # Labeled batch: token ids, mask, one-hot labels.
    sym_sents = T.matrix('sym_s', dtype='int64')
    sym_mask = T.matrix('sym_mask', dtype=theano.config.floatX)
    # one hot!
    sym_label = T.matrix('sym_label', dtype=theano.config.floatX)
    # Unlabeled batch.
    sym_sents_u = T.matrix('sym_s_u', dtype='int64')
    sym_mask_u = T.matrix('sym_mask_u', dtype=theano.config.floatX)
    num_l, num_u = sym_sents.shape[0].astype(theano.config.floatX), \
        sym_sents_u.shape[0].astype(theano.config.floatX)
    num_all = num_l + num_u
    # forward the network and get cost values
    enc_sents, dec_sents, _ = self._forward_sents(sym_sents,
                                                  dev_stage=False)
    enc_sents_u, dec_sents_u, _ = self._forward_sents(sym_sents_u,
                                                      dev_stage=False)
    # classifier loss (entropy only for the unlabeled batch)
    y_pred, loss_class, acc = self._forward_classifier(
        [enc_sents, sym_mask], sym_label, dev_stage=False)
    y_pred_u, loss_entropy, _ = self._forward_classifier(
        [enc_sents_u, sym_mask_u], None, dev_stage=False)
    # Draw one label per unlabeled example from the predicted distribution.
    sampled_label, y_pred_sampled = self._sample_one_category(y_pred_u)
    # reconstruction and kl loss (unlabeled uses the sampled labels)
    loss_rec, loss_kl, ppl = self.cost_label(
        [sym_sents, enc_sents, dec_sents, sym_mask, sym_label],
        dev_stage=False)
    loss_rec_u, loss_kl_u, ppl_u = self.cost_label(
        [sym_sents_u, enc_sents_u, dec_sents_u, sym_mask_u, sampled_label],
        dev_stage=False)
    # use baseline
    # length normalization for unlabel: this is the REINFORCE "reward"
    const_Lxy = loss_rec_u / T.sum(sym_mask_u, axis=1) + sym_klw * loss_kl_u
    if self.use_baseline:
        baselines_u = self._get_baselines(
            [sym_sents_u, enc_sents_u, sym_mask_u])
        const_Lxy -= baselines_u
    # gradients, see supplementary files for detail
    all_params, params_e, params_w, params_phi, params_theta = self.get_params(tag='all'), \
        self.get_params(tag='e'), self.get_params(tag='c'), self.get_params(tag='i'), self.get_params(tag='g')
    # Terms that can be differentiated directly (labeled + entropy).
    total_cost_directly = -T.sum(loss_entropy) * ew + T.sum(
        loss_rec + sym_klw * loss_kl)
    total_cost_directly += self.alpha * T.sum(loss_class) * num_all / num_l
    total_cost_directly /= num_all
    all_grads = theano.grad(total_cost_directly, all_params)
    # Score-function gradients for the unlabeled terms, per parameter
    # group; const_Lxy is held constant (it is the REINFORCE weight).
    grad_e = theano.grad(T.sum(const_Lxy * T.log(y_pred_sampled)
                               + loss_rec_u + sym_klw * loss_kl_u) / num_all,
                         params_e,
                         consider_constant=[const_Lxy])
    grad_w = theano.grad(T.sum(const_Lxy * T.log(y_pred_sampled)) / num_all,
                         params_w,
                         consider_constant=[const_Lxy])
    grad_ig = theano.grad(T.sum(loss_rec_u + sym_klw * loss_kl_u) / num_all,
                          params_phi + params_theta,
                          consider_constant=[const_Lxy])
    # combine the grads (list concatenation matches all_params ordering)
    grad_unlabel = grad_e + grad_w + grad_ig
    all_grads = [gi + gj for gi, gj in zip(all_grads, grad_unlabel)]
    total_cost = total_cost_directly + T.sum(
        const_Lxy) / num_all  # not used in gradients
    '''
    # old cost function in AVAE
    all_params = self.get_params(tag='all')
    total_cost = T.sum(loss_rec) + T.sum(loss_rec_u) - T.sum(loss_entropy)
    total_cost += sym_klw * (T.sum(loss_kl) + T.sum(loss_kl_u))
    total_cost += self.alpha * T.sum(loss_class) * num_all / num_l
    total_cost /= num_all
    all_grads = theano.grad(total_cost, all_params)
    '''
    # Elementwise clip first, then global-norm rescale.
    all_grads = [
        T.clip(g, -self.grad_clipping, self.grad_clipping) for g in all_grads
    ]
    all_grads = total_norm_constraint(all_grads, max_norm=self.max_norm)
    updates = adam(all_grads, all_params, self.lr)
    # Exponential moving average of the reconstruction loss as baseline.
    update_baseline = {self.b: 0.9 * self.b + 0.1 * T.mean(loss_rec_u)}
    updates.update(update_baseline)
    train_input = [
        sym_sents, sym_mask, sym_label, sym_sents_u, sym_mask_u, sym_klw
    ]
    train_output = [
        total_cost, T.mean(loss_rec), T.mean(loss_rec_u), T.mean(loss_kl),
        T.mean(loss_kl_u), T.mean(loss_class), T.mean(loss_entropy),
        ppl, ppl_u, acc, self.b
    ]
    train_f = theano.function(inputs=train_input, outputs=train_output,
                              updates=updates, name='train_sample')
    return train_f
def main():
    """Train a convolutional VAE on MNIST (low-res input, high-res target).

    NOTE(review): `data_path`, `data.loadMNIST`, the layer classes and the
    log-density helpers are project imports not visible here.
    """
    # TODO Make this work better.
    # See https://swarbrickjones.wordpress.com/2015/04/29/convolutional-autoencoders-in-pythontheanolasagne/.
    # Setup
    C = 1  # number of channels in image
    H = 28  # height of image
    W = 28  # width of image
    # K = 10 # number of classes
    shape = [C * H * W]
    padding_size = 2
    downsampling_factor = 2
    # Dense layers
    hidden_sizes = [200, 200]
    latent_size = 2
    # Convolutional layers
    filters = [{"number": 16, "size": 3, "stride": 1}]
    batch_size = 100
    analytic_kl_term = True
    learning_rate = 0.01
    N_epochs = 10  # 1000
    # Symbolic variables
    symbolic_x_LR = T.matrix()
    symbolic_x_HR = T.matrix()
    symbolic_z = T.matrix()
    symbolic_learning_rate = T.scalar('learning_rate')
    # Fix random seed for reproducibility
    numpy.random.seed(1234)
    # Data
    file_name = "mnist.pkl.gz"
    file_path = data_path(file_name)
    (X_train, y_train), (X_valid, y_valid), (X_test, y_test) = data.loadMNIST(file_path, shape)
    # Fold the validation split into training.
    X_train = numpy.concatenate([X_train, X_valid])
    X_train = X_train.astype(theano.config.floatX)
    X_test = X_test.astype(theano.config.floatX)
    N_train_batches = X_train.shape[0] / batch_size
    N_test_batches = X_test.shape[0] / batch_size
    # Setup shared variables
    X_train_shared = theano.shared(X_train, borrow = True)
    X_test_shared = theano.shared(X_test, borrow = True)
    # Models
    ## Recognition model q(z|x)
    pool_size = 2
    # The HR input is padded and average-pooled down to the LR resolution.
    l_enc_HR_in = InputLayer((None, C * H * W), name = "ENC_HR_INPUT")
    l_enc_HR_downsample = l_enc_HR_in
    l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C, H, W))
    l_enc_HR_downsample = PadLayer(l_enc_HR_downsample,
                                   width = padding_size)
    l_enc_HR_downsample = Pool2DLayer(l_enc_HR_downsample,
                                      pool_size = downsampling_factor,
                                      mode = "average_exc_pad")
    _, _, h, w = l_enc_HR_downsample.output_shape
    l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C * h * w))
    l_enc_LR_in = InputLayer((None, C * h * w), name = "ENC_LR_INPUT")
    l_enc = l_enc_LR_in
    l_enc = ReshapeLayer(l_enc, (-1, C, h, w))
    for i, filter_ in enumerate(filters):
        l_enc = Conv2DLayer(l_enc, filter_["number"], filter_["size"],
                            filter_["stride"], pad = "same",
                            nonlinearity = rectify,
                            name = 'ENC_CONV_{:d}'.format(i))
        # l_enc = Pool2DLayer(l_enc, pool_size)
    l_z_mu = DenseLayer(l_enc, num_units = latent_size, nonlinearity = None,
                        name = 'ENC_Z_MU')
    l_z_log_var = DenseLayer(l_enc, num_units = latent_size,
                             nonlinearity = None, name = 'ENC_Z_LOG_VAR')
    # Sample the latent variables using mu(x) and log(sigma^2(x))
    l_z = SimpleSampleLayer(mean = l_z_mu, log_var = l_z_log_var)  # as Kingma
    # l_z = SampleLayer(mean = l_z_mu, log_var = l_z_log_var)
    ## Generative model p(x|z)
    l_dec_in = InputLayer((None, latent_size), name = "DEC_INPUT")
    l_dec = DenseLayer(l_dec_in, num_units = C * H * W,
                       nonlinearity = rectify, name = "DEC_DENSE")
    l_dec = ReshapeLayer(l_dec, (-1, C, H, W))
    # Mirror the encoder's conv stack in reverse; strided layers become
    # deconvolutions.
    for i, filter_ in enumerate_reversed(filters, start = 0):
        if filter_["stride"] == 1:
            l_dec = Conv2DLayer(l_dec, filter_["number"], filter_["size"],
                                filter_["stride"], pad = "same",
                                nonlinearity = rectify,
                                name = 'DEC_CONV_{:d}'.format(i))
        else:
            l_dec = Deconv2DLayer(l_dec, filter_["number"], filter_["size"],
                                  filter_["stride"], nonlinearity = rectify,
                                  name = 'DEC_CONV_{:d}'.format(i))
    l_dec_x_mu = Conv2DLayer(l_dec, num_filters = C, filter_size = (3, 3),
                             stride = 1, pad = 'same', nonlinearity = None,
                             name = 'DEC_X_MU')
    l_dec_x_mu = ReshapeLayer(l_dec_x_mu, (-1, C * H * W))
    ## Get outputs from models
    # With noise
    x_LR_train = get_output(l_enc_HR_downsample, symbolic_x_HR,
                            deterministic = False)
    z_train, z_mu_train, z_log_var_train = get_output(
        [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_train},
        deterministic = False
    )
    x_mu_train = get_output(l_dec_x_mu, {l_dec_in: z_train},
                            deterministic = False)
    # Without noise
    x_LR_eval = get_output(l_enc_HR_downsample, symbolic_x_HR,
                           deterministic = True)
    z_eval, z_mu_eval, z_log_var_eval = get_output(
        [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_eval},
        deterministic = True
    )
    x_mu_eval = get_output(l_dec_x_mu, {l_dec_in: z_eval},
                           deterministic = True)
    # Sampling
    x_mu_sample = get_output(l_dec_x_mu, {l_dec_in: symbolic_z},
                             deterministic = True)
    # Likelihood
    # Calculate the loglikelihood(x) = E_q[ log p(x|z) + log p(z) - log q(z|x)]
    def log_likelihood(z, z_mu, z_log_var, x_mu, x, analytic_kl_term):
        # Either the closed-form KL (Gaussian prior/posterior) or the
        # single-sample Monte Carlo estimate of the ELBO.
        if analytic_kl_term:
            kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis = 1)
            log_px_given_z = log_bernoulli(x, x_mu, eps = 1e-6).sum(axis = 1)
            LL = T.mean(-kl_term + log_px_given_z)
        else:
            log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis = 1)
            log_pz = log_stdnormal(z).sum(axis = 1)
            log_px_given_z = log_bernoulli(x, x_mu, eps = 1e-6).sum(axis = 1)
            LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
        return LL
    # log-likelihood for training
    ll_train = log_likelihood(
        z_train, z_mu_train, z_log_var_train, x_mu_train, symbolic_x_HR,
        analytic_kl_term)
    # log-likelihood for evaluating
    ll_eval = log_likelihood(
        z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, symbolic_x_HR,
        analytic_kl_term)
    # Parameters to train
    parameters = get_all_params([l_z_mu, l_dec_x_mu], trainable = True)
    print("Parameters that will be trained:")
    for parameter in parameters:
        print("{}: {}".format(parameter, parameter.get_value().shape))
    ### Take gradient of negative log-likelihood
    gradients = T.grad(-ll_train, parameters)
    # Adding gradient clipping to reduce the effects of exploding gradients,
    # and hence speed up convergence
    gradient_clipping = 1
    gradient_norm_max = 5
    # Global-norm rescale first, then elementwise clip.
    gradient_constrained = updates.total_norm_constraint(gradients,
                                                         max_norm = gradient_norm_max)
    gradients_clipped = [T.clip(g,-gradient_clipping, gradient_clipping)
                         for g in gradient_constrained]
    # Setting up functions for training
    symbolic_batch_index = T.iscalar('index')
    batch_slice = slice(symbolic_batch_index * batch_size,
                        (symbolic_batch_index + 1) * batch_size)
    update_expressions = updates.adam(gradients_clipped, parameters,
                                      learning_rate = symbolic_learning_rate)
    train_model = theano.function(
        [symbolic_batch_index, symbolic_learning_rate], ll_train,
        updates = update_expressions,
        givens = {symbolic_x_HR: X_train_shared[batch_slice]}
    )
    test_model = theano.function(
        [symbolic_batch_index], ll_eval,
        givens = {symbolic_x_HR: X_test_shared[batch_slice]}
    )
    def train_epoch(learning_rate):
        # One pass over the training batches; returns the mean ELBO.
        costs = []
        for i in range(N_train_batches):
            cost_batch = train_model(i, learning_rate)
            costs += [cost_batch]
        return numpy.mean(costs)
    def test_epoch():
        # One pass over the test batches; no parameter updates.
        costs = []
        for i in range(N_test_batches):
            cost_batch = test_model(i)
            costs += [cost_batch]
        return numpy.mean(costs)
    # Training
    epochs = []
    cost_train = []
    cost_test = []
    for epoch in range(N_epochs):
        start = time.time()
        # Shuffle train data
        numpy.random.shuffle(X_train)
        X_train_shared.set_value(X_train)
        train_cost = train_epoch(learning_rate)
        test_cost = test_epoch()
        duration = time.time() - start
        epochs.append(epoch)
        cost_train.append(train_cost)
        cost_test.append(test_cost)
        # line = "Epoch: %i\tTime: %0.2f\tLR: %0.5f\tLL Train: %0.3f\tLL test: %0.3f\t" % ( epoch, t, learning_rate, train_cost, test_cost)
        print("Epoch {:d} (duration: {:.2f} s, learning rate: {:.1e}):".format(epoch + 1, duration, learning_rate))
        print("    log-likelihood: {:.3f} (training set), {:.3f} (test set)".format(train_cost, test_cost))
# NOTE(review): fragment of a larger script — `E_F`, `NLL`, `l2_cost_g`,
# `cost_d`, `params_d/g/i`, `state` and `floatX` are defined earlier,
# outside this view.
# 2) generator cost: adversarial energy + NLL + L2 penalty
cost_g = E_F
cost_g += NLL
cost_g += l2_cost_g
# 3) inferencer
cost_i = NLL
############################
# Gradient & Optimization
############################
# parameter updates
##########
# Shared learning rate; the discriminator uses a 10x smaller step below.
lrt = theano.shared(np.asarray(state['lr'], dtype=floatX), name='lr')
# Each model's gradients are rescaled to a global norm of at most 500
# before Adam (beta1 = 0.5) is applied.
grads_d = T.grad(cost_d, params_d)
scaled_grads_d = total_norm_constraint(grads_d, 500.)
updates_d = adam(scaled_grads_d, params_d, lrt/10., 0.5)
# updates_d = adam(grads_d, params_d, lrt/10., 0.5)
grads_g = T.grad(cost_g, params_g)
scaled_grads_g = total_norm_constraint(grads_g, 500.)
updates_g = adam(scaled_grads_g, params_g, lrt, 0.5)
# updates_g = adam(grads_g, params_g, lrt, 0.5)
grads_i = T.grad(cost_i, params_i)
scaled_grads_i = total_norm_constraint(grads_i, 500.)
updates_i = adam(scaled_grads_i, params_i, lrt, 0.5)
# updates_i = adam(grads_i, params_i, lrt, 0.5)
# Raw (pre-rescale) gradient norms, for monitoring.
gnorm_d = T.sqrt(sum(T.sum(g**2) for g in grads_d))
gnorm_g = T.sqrt(sum(T.sum(g**2) for g in grads_g))
def train_n_samples_function(self, n_samples=2):
    """Compile the Theano training function for semi-supervised learning.

    The expectation over the unknown label of unlabeled data is estimated
    with ``n_samples`` sampled categories (capped at ``self.dim_y``).
    Labeled data contribute a supervised classifier loss plus a
    reconstruction/KL loss; unlabeled data contribute an entropy term and
    a REINFORCE-style gradient with a baseline for variance reduction.

    :param n_samples: number of label categories to sample per unlabeled
        example (clamped to the number of classes ``self.dim_y``).
    :return: a compiled ``theano.function`` taking
        ``[sents, mask, label, sents_u, mask_u, klw]`` and returning the
        cost/diagnostic list built below.
    """
    print "Train Function: Estimate the Expectation of unlabeled data by n samples."
    # KL warm-up weight, fed in per call so it can be annealed over training.
    sym_klw = T.scalar('sym_klw', dtype=theano.config.floatX)  # symbolic scalar of warming up
    sym_sents = T.matrix('sym_s', dtype='int64')
    sym_mask = T.matrix('sym_mask', dtype=theano.config.floatX)
    # one hot!
    sym_label = T.matrix('sym_label', dtype=theano.config.floatX)
    sym_sents_u = T.matrix('sym_s_u', dtype='int64')
    sym_mask_u = T.matrix('sym_mask_u', dtype=theano.config.floatX)
    # Batch sizes of the labeled / unlabeled parts (symbolic, as floats so
    # they can be used for averaging below).
    num_l, num_u = sym_sents.shape[0].astype(theano.config.floatX), \
        sym_sents_u.shape[0].astype(theano.config.floatX)
    num_all = num_l + num_u
    # Cannot sample more categories than there are classes.
    self.n_samples = min(n_samples, self.dim_y)
    # forward the network and get cost values
    enc_sents, dec_sents, _ = self._forward_sents(sym_sents, dev_stage=False)
    enc_sents_u, dec_sents_u, _ = self._forward_sents(sym_sents_u, dev_stage=False)
    # classifier loss
    y_pred, loss_class, acc = self._forward_classifier(
        [enc_sents, sym_mask], sym_label, dev_stage=False)
    # Unlabeled branch: no targets, so the classifier returns an entropy term.
    y_pred_u, loss_entropy, _ = self._forward_classifier(
        [enc_sents_u, sym_mask_u], None, dev_stage=False)
    label_onehot, y_pred_sampled, y_pred_sampled_norm = self._sample_n_categories(
        y_pred_u, self.n_samples)
    # reconstruction and kl loss
    loss_rec, loss_kl, ppl = self.cost_label(
        [sym_sents, enc_sents, dec_sents, sym_mask, sym_label],
        use_baseline=self.use_baseline, dev_stage=False)
    loss_rec_u, loss_kl_u, ppl_u = \
        self.cost_unlabel_n_samples(
            [sym_sents_u, enc_sents_u, dec_sents_u, sym_mask_u,
             label_onehot, y_pred_sampled_norm], dev_stage=False)
    # gradients, see supplementary files for detail
    # NOTE(review): the per-group gradient concatenation below assumes
    # get_params(tag='all') returns params in the order e, c, i, g — verify
    # against get_params.
    all_params, params_e, params_w, params_phi, params_theta = self.get_params(tag='all'), \
        self.get_params(tag='e'), self.get_params(tag='c'), self.get_params(tag='i'), self.get_params(tag='g')
    # Differentiable part of the objective (entropy, labeled rec+KL, classifier).
    total_cost_directly = -self.ew * T.sum(loss_entropy) + T.sum(
        loss_rec + sym_klw * loss_kl)
    total_cost_directly += self.alpha * T.sum(loss_class) * num_all / num_l
    total_cost_directly /= num_all
    all_grads = theano.grad(total_cost_directly, all_params)
    # Const var and Baseline
    lxy = (loss_rec_u + sym_klw * loss_kl_u)  # (bs, ns)
    # Baseline = expectation of lxy under the sampled (normalized) posterior.
    baseline_norm = T.sum(lxy * y_pred_sampled_norm, axis=1)  # (bs,)
    const_Lxy = (lxy - baseline_norm[:, None])
    # Score-function (REINFORCE) gradients: lxy / baseline terms are held
    # constant so only log q(y|x) is differentiated where intended.
    grad_e = theano.grad(
        T.sum(y_pred_sampled_norm * (const_Lxy * T.log(y_pred_sampled) + lxy)) / num_all,
        params_e,
        consider_constant=[y_pred_sampled_norm, baseline_norm, const_Lxy])
    grad_w = theano.grad(
        T.sum(y_pred_sampled_norm * const_Lxy * T.log(y_pred_sampled)) / num_all,
        params_w,
        consider_constant=[y_pred_sampled_norm, baseline_norm, const_Lxy])
    grad_ig = theano.grad(
        T.sum(y_pred_sampled_norm * lxy) / num_all,
        params_phi + params_theta,
        consider_constant=[y_pred_sampled_norm, baseline_norm, const_Lxy])
    # combine the grads
    grad_unlabel = grad_e + grad_w + grad_ig
    all_grads = [gi + gj for gi, gj in zip(all_grads, grad_unlabel)]
    total_cost = total_cost_directly + T.sum(
        y_pred_sampled_norm * lxy) / num_all  # not used in gradients
    # Element-wise clip, then rescale the whole gradient list to max_norm.
    all_grads = [
        T.clip(g, -self.grad_clipping, self.grad_clipping) for g in all_grads
    ]
    all_grads = total_norm_constraint(all_grads, max_norm=self.max_norm)
    updates = adam(all_grads, all_params, self.lr)
    train_input = [
        sym_sents, sym_mask, sym_label, sym_sents_u, sym_mask_u, sym_klw
    ]
    # NOTE(review): the monitoring terms below use the raw argument
    # n_samples, not the clamped self.n_samples — confirm this is intended.
    train_output = [
        total_cost,
        T.mean(loss_rec),
        T.mean(n_samples * y_pred_sampled_norm * loss_rec_u),
        T.mean(loss_kl),
        T.mean(n_samples * y_pred_sampled_norm * loss_kl_u),
        T.mean(loss_class),
        T.mean(loss_entropy), ppl,
        T.mean(const_Lxy), acc
    ]
    train_f = theano.function(inputs=train_input,
                              outputs=train_output,
                              updates=updates,
                              name='train_n_samples')
    return train_f
def set_network_trainer(input_data, input_mask, target_data, target_mask,
                        network_outputs, updater, learning_rate,
                        grad_max_norm=10., l2_lambda=1e-5, var_lambda=1e-5,
                        load_updater_params=None):
    """Build and compile the Theano training function for a stacked network.

    The cost is cross-entropy on the final output plus an L2 regularizer
    (weighted by ``l2_lambda``) and a variance-ratio penalty on the inner
    hidden states (weighted by ``var_lambda``). Gradients are rescaled to
    ``grad_max_norm`` with Lasagne's total_norm_constraint.

    :param network_outputs: list of layers; the last entry is the final
        output layer, earlier entries are inner hidden layers penalized by
        the variance term.
    :param updater: callable building the update rules; must accept
        loss_or_grads/params/learning_rate/load_params_dict and return
        (updates, trainer_params).
    :return: (compiled training function, updater state parameters)
    """
    network = network_outputs[-1]
    # get network output data
    output_data = get_output(network_outputs, deterministic=False)
    predict_data = output_data[-1]
    predict_idx = T.argmax(predict_data, axis=-1)
    # get prediction cost
    # predict_data is flattened to (batch*time, classes); eps guards log(0).
    train_predict_cost = categorical_crossentropy(
        predictions=T.reshape(predict_data,
                              (-1, predict_data.shape[-1])) + eps,
        targets=T.flatten(target_data, 1))
    # Mask out padded frames, then average over valid frames only.
    train_predict_cost = train_predict_cost * T.flatten(target_mask, 1)
    train_predict_cost = train_predict_cost.sum() / target_mask.sum()
    # get regularizer cost
    train_regularizer_cost = regularize_network_params(network, penalty=l2)
    # reduce inner loop variance (over time)
    # For each inner hidden layer: penalize (variance over time, averaged
    # over batch) relative to (variance over batch of the temporal mean).
    # Assumes inner_hid has shape (batch, time, features) — TODO confirm.
    train_fisher_cost = 0.
    inner_hid_list = output_data[:-1]
    num_inners = len(inner_hid_list)
    for inner_hid in inner_hid_list:
        # mean over time
        seq_mean = T.mean(input=inner_hid, axis=1)
        seq_mean_var = T.var(seq_mean, axis=0)
        # variance over time
        seq_var = T.var(input=inner_hid, axis=1)
        seq_var_mean = T.mean(seq_var, axis=0)
        # ratio
        train_fisher_cost += T.mean(seq_var_mean / (seq_mean_var + eps))
    train_fisher_cost /= num_inners
    # get network parameters
    network_params = get_all_params(network, trainable=True)
    # get network gradients with clipping
    network_grads = theano.grad(cost=train_predict_cost +
                                train_regularizer_cost * l2_lambda +
                                train_fisher_cost * var_lambda,
                                wrt=network_params)
    network_grads, network_grads_norm = total_norm_constraint(
        tensor_vars=network_grads, max_norm=grad_max_norm, return_norm=True)
    # set updater
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, trainer_params = updater(
        loss_or_grads=network_grads,
        params=network_params,
        learning_rate=train_lr,
        load_params_dict=load_updater_params)
    # get training (update) function
    training_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[
            predict_data, predict_idx, train_predict_cost,
            train_regularizer_cost, train_fisher_cost, network_grads_norm
        ],
        updates=train_updates,
        allow_input_downcast=True)
    return training_fn, trainer_params
def lasagne_model(model_base, model_flavor, **params):
    """Wrap a Lasagne network in train/eval/save/load closures.

    Builds a cross-entropy classifier on top of ``model_base['net_out']``,
    with Adam updates and total-gradient-norm clipping at ``max_norm``,
    and returns a ``Model`` record exposing batch/predict/transform/
    save/load callables.

    :param model_base: dict with layers 'A_net', 'transform', 'net_out'.
    :param model_flavor: flavor tag stored on the returned Model.
    :param params: optional 'verbose' and 'overwrite' flags.
    :return: a ``Model`` namedtuple/record (see module-level definition).

    NOTE(review): tol, update_lr, beta_1, beta_2, int32, layer_output,
    save_all_weights and load_all_weights are expected to be module-level
    names — confirm they are defined in this module.
    """
    import theano
    theano.config.floatX = 'float32'
    from theano import function as tfunction, shared as tshared
    from theano.tensor import tensor4, imatrix, nnet
    from theano.tensor import grad as Tgrad, mean as Tmean, reshape as Treshape
    from lasagne.utils import floatX
    from lasagne.updates import adam as lasagne_adam, total_norm_constraint
    from lasagne.layers import get_output as ll_output, \
        get_all_params as ll_all_params
    max_norm = 5.0
    verbose = params.get('verbose', False)
    overwrite = params.get('overwrite', True)
    sym_x = tensor4()  # [nbatch,imgchan,imgrows,imgcols] dims
    sym_y = imatrix()  # one-hot vector of [nb_class x 1] dims
    l_A_net = model_base['A_net']
    l_transform = model_base['transform']
    l_out = model_base['net_out']
    output_train = ll_output(l_out, sym_x, deterministic=False)
    # BUG FIX: Lasagne layers expose .output_shape, not .shape.
    output_shape = (-1, l_out.output_shape[1])  # nb_classes = l_out.output_shape[1]
    # BUG FIX: the imports above bind Treshape/Tmean/Tgrad (capital T);
    # the original called treshape/tmean/tgrad, which raised NameError.
    output_flat = Treshape(output_train, output_shape)
    output_loss = nnet.categorical_crossentropy
    # tol keeps the log in the cross-entropy away from log(0).
    output_cost = Tmean(output_loss(output_flat + tol, sym_y.flatten()))
    trainable_params = ll_all_params(l_out, trainable=True)
    all_grads = Tgrad(output_cost, trainable_params)
    # Rescale the full gradient vector to max_norm before Adam.
    updates, norm = total_norm_constraint(all_grads, max_norm=max_norm,
                                          return_norm=True)
    shared_lr = tshared(floatX(update_lr))
    updates = lasagne_adam(updates, trainable_params,
                           learning_rate=shared_lr, beta_1=beta_1,
                           beta_2=beta_2, epsilon=tol)
    model_train = tfunction([sym_x, sym_y],
                            [output_cost, output_train, norm],
                            updates=updates)
    output_eval, l_A_eval = ll_output([l_out, l_A_net], sym_x,
                                      deterministic=True)
    model_eval = tfunction(
        [sym_x],
        [output_eval.reshape(output_shape), l_A_eval.reshape(output_shape)])
    model_batch = lambda X, y: model_train(X, int32(y))[0]
    model_pred = lambda X: model_eval(X)[0]
    model_xform = lambda X: layer_output(X, l_transform)
    model_save = lambda outf: save_all_weights(
        l_out, outf, overwrite=overwrite)
    model_load = lambda weightf: load_all_weights(l_out, weightf)
    return Model(package='lasagne', backend='theano', flavor=model_flavor,
                 base=model_base, batch=model_batch, predict=model_pred,
                 transform=model_xform, save=model_save, load=model_load,
                 params=params)
def __init__(self, atari_env, state_dimension, action_dimension,
             monitor_env=False, learning_rate=0.001, critic_update=10,
             train_step=1, gamma=0.95, eps_max=1.0, eps_min=0.1,
             eps_decay=10000, n_epochs=10000, batch_size=32,
             buffer_size=50000):
    """Set up a DQN agent: Gym env, conv Q-network, target Q-network,
    and compiled Theano functions for acting (``self.mu``) and for one
    RMSProp training step (``self.update_network``).

    :param atari_env: Gym environment id passed to gym.make.
    :param state_dimension: (H, W, C) observation shape — TODO confirm
        channel-last, given the dimshuffle to NCHW below.
    :param action_dimension: number of discrete actions.
    :param gamma: discount factor for the TD target.
    :param eps_max/eps_min/eps_decay: epsilon-greedy schedule parameters
        (stored here, used elsewhere).
    """
    self.env = gym.make(atari_env)
    if monitor_env:
        # No-op placeholder: env monitoring is not implemented.
        None
    self.state_dimension = state_dimension
    self.action_dimension = action_dimension
    self.learning_rate = learning_rate
    self.critic_update = critic_update
    self.train_step = train_step
    self.gamma = gamma
    self.eps_max = eps_max
    self.eps_min = eps_min
    self.eps_decay = eps_decay
    self.n_epochs = n_epochs
    self.batch_size = batch_size
    self.buffer_size = buffer_size
    # Replay buffer of transitions (filled during training).
    self.experience_replay = []

    def q_network(state):
        # DQN-style convnet: 3 conv layers + dense(512) + linear Q head.
        input_state = InputLayer(input_var=state,
                                 shape=(None, self.state_dimension[0],
                                        self.state_dimension[1],
                                        self.state_dimension[2]))
        # NHWC -> NCHW for Conv2DLayer.
        input_state = DimshuffleLayer(input_state, pattern=(0, 3, 1, 2))
        conv = Conv2DLayer(input_state, num_filters=32, filter_size=(8, 8),
                           stride=(4, 4), nonlinearity=rectify)
        conv = Conv2DLayer(conv, num_filters=64, filter_size=(4, 4),
                           stride=(2, 2), nonlinearity=rectify)
        conv = Conv2DLayer(conv, num_filters=64, filter_size=(3, 3),
                           stride=(1, 1), nonlinearity=rectify)
        flatten = FlattenLayer(conv)
        dense = DenseLayer(flatten, num_units=512, nonlinearity=rectify)
        q_values = DenseLayer(dense, num_units=self.action_dimension,
                              nonlinearity=linear)
        return q_values

    self.X_state = T.ftensor4()
    self.X_action = T.bvector()
    self.X_reward = T.fvector()
    self.X_next_state = T.ftensor4()
    self.X_done = T.bvector()
    self.X_action_hot = to_one_hot(self.X_action, self.action_dimension)
    # Online network and target network.
    # NOTE(review): q_network() is called twice, so the two networks have
    # independent weights; no weight-sync from online to target is visible
    # here — confirm it happens elsewhere (e.g. via critic_update).
    self.q_ = q_network(self.X_state)
    self.q = get_output(self.q_)
    self.q_target_ = q_network(self.X_next_state)
    self.q_target = get_output(self.q_target_)
    self.q_max = T.max(self.q_target, axis=1)
    self.action = T.argmax(self.q, axis=1)
    # Greedy policy: state -> argmax_a Q(s, a).
    self.mu = theano.function(inputs=[self.X_state],
                              outputs=self.action,
                              allow_input_downcast=True)
    # TD error: target = r + gamma * max_a' Q_target(s', a') * (1 - done);
    # prediction = Q(s, a) for the taken action (via one-hot dot product).
    self.loss = squared_error(
        self.X_reward + self.gamma * self.q_max * (1.0 - self.X_done),
        T.batched_dot(self.q, self.X_action_hot))
    self.loss = self.loss.mean()
    self.params = get_all_params(self.q_)
    self.grads = T.grad(self.loss, self.params)
    # Rescale gradients so their total norm is at most 1.0.
    self.normed_grads = total_norm_constraint(self.grads, 1.0)
    self.updates = rmsprop(self.normed_grads, self.params,
                           learning_rate=self.learning_rate)
    self.update_network = theano.function(inputs=[
        self.X_state, self.X_action, self.X_reward, self.X_next_state,
        self.X_done
    ],
                                          outputs=self.loss,
                                          updates=self.updates,
                                          allow_input_downcast=True)
q_max = T.max(q_target, axis=1) action = T.argmax(q, axis=1) mu = theano.function(inputs = [X_state], outputs = action, allow_input_downcast = True) loss = squared_error(X_reward + gamma * q_max * (1.0 - X_done), T.batched_dot(q, X_action_hot)) loss = loss.mean() params = get_all_params(q_) grads = T.grad(loss, params) normed_grads = total_norm_constraint(grads, 1.0) updates = adam(normed_grads, params, learning_rate = learning_rate) update_network = theano.function(inputs = [X_state, X_action, X_reward, X_next_state, X_done], outputs = loss, updates = updates, allow_input_downcast = True) def get_action(state, step):
def set_network_trainer(input_data, input_mask, target_data, target_mask,
                        num_outputs, network, rand_layer_list, updater,
                        learning_rate, grad_max_norm=10., l2_lambda=1e-5,
                        load_updater_params=None):
    """Build and compile the training function for a sequence classifier
    with skip/random layers.

    Cost = masked cross-entropy (computed via a hand-rolled, numerically
    stable log-softmax) + L2 regularization * ``l2_lambda``. Gradients are
    norm-clipped when ``grad_max_norm > 0``. The compiled function also
    reports the masked average skip proportion of each layer in
    ``rand_layer_list``.

    :return: (compiled training function, updater state parameters)
    """
    # get one hot target
    one_hot_target_data = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                                 nb_class=num_outputs,
                                                 dtype=floatX)
    # get network output data
    predict_data = get_output(network, deterministic=False)
    num_seqs = predict_data.shape[0]
    # get prediction cost
    predict_data = T.reshape(x=predict_data,
                             newshape=(-1, num_outputs),
                             ndim=2)
    # Numerically stable log-softmax: subtract the row max, then subtract
    # the log-sum-exp.
    predict_data = predict_data - T.max(predict_data, axis=-1, keepdims=True)
    predict_data = predict_data - T.log(
        T.sum(T.exp(predict_data), axis=-1, keepdims=True))
    # Negative log-likelihood of the target class per frame.
    train_predict_cost = -T.sum(T.mul(one_hot_target_data, predict_data),
                                axis=-1)
    train_predict_cost = train_predict_cost * T.flatten(target_mask, 1)
    # Per-sequence average drives the gradient; per-frame average is the
    # reported cost.
    train_model_cost = train_predict_cost.sum() / num_seqs
    train_frame_cost = train_predict_cost.sum() / target_mask.sum()
    # get regularizer cost
    train_regularizer_cost = regularize_network_params(network, penalty=l2)
    # get network parameters
    network_params = get_all_params(network, trainable=True)
    # get network gradients
    network_grads = theano.grad(cost=train_model_cost +
                                train_regularizer_cost * l2_lambda,
                                wrt=network_params)
    if grad_max_norm > 0.:
        network_grads, network_grads_norm = total_norm_constraint(
            tensor_vars=network_grads,
            max_norm=grad_max_norm,
            return_norm=True)
    else:
        # No clipping requested: still report the global gradient norm.
        network_grads_norm = T.sqrt(
            sum(T.sum(grad**2) for grad in network_grads))
    # set updater
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, trainer_params = updater(
        loss_or_grads=network_grads,
        params=network_params,
        learning_rate=train_lr,
        load_params_dict=load_updater_params)
    # Masked average of each random layer's skip indicator.
    skip_comp_list = []
    for rand_layer in rand_layer_list:
        skip_comp_list.append(
            T.sum(rand_layer.skip_comp * input_mask) / T.sum(input_mask))
    # get training (update) function
    training_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[train_frame_cost, network_grads_norm] + skip_comp_list,
        updates=train_updates)
    return training_fn, trainer_params
def main():
    """Run a grid of VAE super-resolution experiments on MNIST.

    For every (latent_size, downsampling_factor) pair: build a recognition
    network q(z|x_LR) on downsampled images and a generative network
    p(x_HR|z), train by maximizing the variational lower bound, then save
    learning curves, reconstructions, a 2-D latent manifold (when
    latent_size == 2) and reconstructions of homemade digits to a pickle.

    NOTE(review): this file uses Python-2 `print` statements, so `/` below
    is integer division (N_*_batches, h, w rely on that); porting to
    Python 3 would require `//`.
    """
    # Main setup
    latent_sizes = [2, 5, 10, 20, 30, 50, 100]
    downsampling_factors = [1, 2, 4]
    N_epochs = 50
    binarise_downsampling = False
    bernoulli_sampling = True
    # Setup
    C = 1 # number of channels in image
    H = 28 # height of image
    W = 28 # width of image
    # K = 10 # number of classes
    hidden_sizes = [200, 200]
    batch_size = 100
    analytic_kl_term = True
    learning_rate = 0.001 #0.0003
    shape = [H * W * C]
    # Symbolic variables
    symbolic_x_LR = T.matrix()
    symbolic_x_HR = T.matrix()
    symbolic_z = T.matrix()
    symbolic_learning_rate = T.scalar('learning_rate')
    # Fix random seed for reproducibility
    numpy.random.seed(1234)
    # Data
    file_name = "mnist.pkl.gz"
    file_path = data_path(file_name)
    (X_train, y_train), (X_valid, y_valid), (X_test, y_test) = data.loadMNIST(file_path, shape)
    # Fold the validation set into training (test set is the only held-out set).
    X_train = numpy.concatenate([X_train, X_valid])
    X_train = X_train.astype(theano.config.floatX)
    X_test = X_test.astype(theano.config.floatX)
    N_train_batches = X_train.shape[0] / batch_size
    N_test_batches = X_test.shape[0] / batch_size
    # Binarisation strategy: stochastic Bernoulli sampling or deterministic rounding.
    if bernoulli_sampling:
        preprocess = bernoullisample
    else:
        preprocess = numpy.round
    # Setup shared variables
    X_train_shared = theano.shared(preprocess(X_train), borrow = True)
    X_test_shared = theano.shared(preprocess(X_test), borrow = True)
    X_test_shared_fixed = theano.shared(numpy.round(X_test), borrow = True)
    X_test_shared_normal = theano.shared(X_test, borrow = True)
    all_runs_duration = 0
    for latent_size, downsampling_factor in product(latent_sizes, downsampling_factors):
        run_start = time.time()
        print("Training model with a latent size of {} and images downsampled by {}:\n".format(latent_size, downsampling_factor))
        # Models
        # Low-resolution image side length (integer division, see docstring).
        h = H / downsampling_factor
        w = W / downsampling_factor
        ## Recognition model q(z|x)
        l_enc_HR_in = InputLayer((None, H * W * C), name = "ENC_HR_INPUT")
        l_enc_HR_downsample = l_enc_HR_in
        l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C, H, W))
        if downsampling_factor != 1:
            l_enc_HR_downsample = Pool2DLayer(l_enc_HR_downsample, pool_size = downsampling_factor, mode = "average_exc_pad")
        # TODO Should downsampled data be binarised? (worse performance)
        if binarise_downsampling:
            l_enc_HR_downsample = NonlinearityLayer(l_enc_HR_downsample, nonlinearity = T.round)
        l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, h * w * C))
        l_enc_LR_in = InputLayer((None, h * w * C), name = "ENC_LR_INPUT")
        l_enc = l_enc_LR_in
        for i, hidden_size in enumerate(hidden_sizes, start = 1):
            l_enc = DenseLayer(l_enc, num_units = hidden_size, nonlinearity = softplus, name = 'ENC_DENSE{:d}'.format(i))
        l_z_mu = DenseLayer(l_enc, num_units = latent_size, nonlinearity = identity, name = 'ENC_Z_MU')
        l_z_log_var = DenseLayer(l_enc, num_units = latent_size, nonlinearity = identity, name = 'ENC_Z_LOG_VAR')
        # Sample the latent variables using mu(x) and log(sigma^2(x))
        l_z = SimpleSampleLayer(mean = l_z_mu, log_var = l_z_log_var) # as Kingma
        # l_z = SampleLayer(mean = l_z_mu, log_var = l_z_log_var)
        ## Generative model p(x|z)
        l_dec_in = InputLayer((None, latent_size), name = "DEC_INPUT")
        l_dec = l_dec_in
        for i, hidden_size in enumerate_reversed(hidden_sizes, start = 0):
            l_dec = DenseLayer(l_dec, num_units = hidden_size, nonlinearity = softplus, name = 'DEC_DENSE{:d}'.format(i))
        l_dec_x_mu = DenseLayer(l_dec, num_units = H * W * C, nonlinearity = sigmoid, name = 'DEC_X_MU')
        # NOTE(review): this layer reuses the name 'DEC_X_MU' — probably
        # intended to be 'DEC_X_LOG_VAR'.
        l_dec_x_log_var = DenseLayer(l_dec, num_units = H * W * C, nonlinearity = sigmoid, name = 'DEC_X_MU')
        # TRY relu instead of softplus (maybe with more hidden units)
        # TRY softmax instead of sigmoid
        # PROBLEM with this is that we have several pixels activated.
        ## Get outputs from models
        # With noise
        x_LR_train = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic = False)
        z_train, z_mu_train, z_log_var_train = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_train},
            deterministic = False
        )
        x_mu_train, x_log_var_train = get_output([l_dec_x_mu, l_dec_x_log_var],
            {l_dec_in: z_train}, deterministic = False)
        # Without noise
        x_LR_eval = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic = True)
        z_eval, z_mu_eval, z_log_var_eval = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_eval},
            deterministic = True
        )
        x_mu_eval, x_log_var_eval = get_output([l_dec_x_mu, l_dec_x_log_var],
            {l_dec_in: z_eval}, deterministic = True)
        # Sampling
        x_mu_sample = get_output(l_dec_x_mu, {l_dec_in: symbolic_z},
            deterministic = True)
        # Likelihood
        # Calculate the loglikelihood(x) = E_q[ log p(x|z) + log p(z) - log q(z|x)]
        def log_likelihood(z, z_mu, z_log_var, x_mu, x_log_var, x, analytic_kl_term):
            # Variational lower bound, either with the analytic KL between
            # q(z|x) and the standard normal prior or the sampled estimate.
            if analytic_kl_term:
                kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis = 1)
                log_px_given_z = log_bernoulli(x, x_mu, eps = 1e-6).sum(axis = 1)
                # log_px_given_z = log_normal2(x, x_mu, x_log_var).sum(axis = 1)
                LL = T.mean(-kl_term + log_px_given_z)
            else:
                log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis = 1)
                log_pz = log_stdnormal(z).sum(axis = 1)
                log_px_given_z = log_bernoulli(x, x_mu, eps = 1e-6).sum(axis = 1)
                # log_px_given_z = log_normal2(x, x_mu, x_log_var).sum(axis = 1)
                LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
            return LL
        # log-likelihood for training
        ll_train = log_likelihood(
            z_train, z_mu_train, z_log_var_train, x_mu_train, x_log_var_train, symbolic_x_HR, analytic_kl_term)
        # log-likelihood for evaluating
        ll_eval = log_likelihood(
            z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, x_log_var_eval, symbolic_x_HR, analytic_kl_term)
        # Parameters to train
        parameters = get_all_params([l_z, l_dec_x_mu], trainable = True)
        # parameters = get_all_params([l_z, l_dec_x_mu, l_dec_x_log_var], trainable = True)
        print("Parameters that will be trained:")
        for parameter in parameters:
            print("{}: {}".format(parameter, parameter.get_value().shape))
        ### Take gradient of negative log-likelihood
        gradients = T.grad(-ll_train, parameters)
        # Adding gradient clipping to reduce the effects of exploding gradients,
        # and hence speed up convergence
        gradient_clipping = 1
        gradient_norm_max = 5
        # First rescale to total norm <= 5, then clip element-wise to [-1, 1].
        gradient_constrained = updates.total_norm_constraint(gradients,
            max_norm = gradient_norm_max)
        gradients_clipped = [T.clip(g,-gradient_clipping, gradient_clipping) for g in gradient_constrained]
        # Setting up functions for training
        symbolic_batch_index = T.iscalar('index')
        batch_slice = slice(symbolic_batch_index * batch_size,
            (symbolic_batch_index + 1) * batch_size)
        update_expressions = updates.adam(gradients_clipped, parameters,
            learning_rate = symbolic_learning_rate)
        train_model = theano.function(
            [symbolic_batch_index, symbolic_learning_rate], ll_train,
            updates = update_expressions,
            givens = {symbolic_x_HR: X_train_shared[batch_slice]}
        )
        test_model = theano.function(
            [symbolic_batch_index], ll_eval,
            givens = {symbolic_x_HR: X_test_shared[batch_slice]}
        )
        test_model_fixed = theano.function(
            [symbolic_batch_index], ll_eval,
            givens = {symbolic_x_HR: X_test_shared_fixed[batch_slice]}
        )
        def train_epoch(learning_rate):
            # One pass over the (re-sampled) training set; mean lower bound.
            costs = []
            for i in range(N_train_batches):
                cost_batch = train_model(i, learning_rate)
                costs += [cost_batch]
            return numpy.mean(costs)
        def test_epoch():
            # Mean lower bound on the stochastically binarised test set.
            costs = []
            for i in range(N_test_batches):
                cost_batch = test_model(i)
                costs += [cost_batch]
            return numpy.mean(costs)
        def test_epoch_fixed():
            # Mean lower bound on the deterministically rounded test set.
            costs = []
            for i in range(N_test_batches):
                cost_batch = test_model_fixed(i)
                costs += [cost_batch]
            return numpy.mean(costs)
        # Training
        epochs = []
        cost_train = []
        cost_test = []
        print
        for epoch in range(N_epochs):
            epoch_start = time.time()
            # Shuffle train data
            numpy.random.shuffle(X_train)
            # Re-binarise each epoch (fresh Bernoulli noise when enabled).
            X_train_shared.set_value(preprocess(X_train))
            # TODO: Using dynamically changed learning rate
            train_cost = train_epoch(learning_rate)
            test_cost = test_epoch()
            test_cost_fixed = test_epoch_fixed()
            epoch_duration = time.time() - epoch_start
            epochs.append(epoch + 1)
            cost_train.append(train_cost)
            cost_test.append(test_cost)
            # line = "Epoch: %i\tTime: %0.2f\tLR: %0.5f\tLL Train: %0.3f\tLL test: %0.3f\t" % ( epoch, t, learning_rate, train_cost, test_cost)
            print("Epoch {:d} (duration: {:.2f} s, learning rate: {:.1e}):".format(epoch + 1, epoch_duration, learning_rate))
            print(" log-likelihood: {:.3f} (training set), {:.3f} (test set)".format(train_cost, test_cost))
        print
        # Results
        ## Reconstruction
        N_reconstructions = 50
        X_test_eval = X_test_shared.eval()
        X_test_eval_fixed = X_test_shared_fixed.eval()
        X_test_eval_normal = X_test_shared_normal.eval()
        subset = numpy.random.randint(0, len(X_test_eval), size = N_reconstructions)
        x_original = X_test_eval[numpy.array(subset)]
        x_LR = get_output(l_enc_HR_downsample, x_original).eval()
        z = get_output(l_z, x_LR).eval()
        x_reconstructed = x_mu_sample.eval({symbolic_z: z})
        x_original_fixed = X_test_eval_fixed[numpy.array(subset)]
        x_LR_fixed = get_output(l_enc_HR_downsample, x_original_fixed).eval()
        z_fixed = get_output(l_z, x_LR_fixed).eval()
        x_reconstructed_fixed = x_mu_sample.eval({symbolic_z: z_fixed})
        originals = X_test_eval_normal[numpy.array(subset)]
        reconstructions = {
            "originals": x_original,
            "downsampled": x_LR,
            "reconstructions": x_reconstructed
        }
        reconstructions_fixed = {
            "originals": x_original_fixed,
            "downsampled": x_LR_fixed,
            "reconstructions": x_reconstructed_fixed
        }
        ## Manifold
        # Decode a 20x20 grid of latent points (2-D latent space only).
        if latent_size == 2:
            x = numpy.linspace(0.1, 0.9, 20)
            # TODO: Ideally sample from the real p(z)
            v = gaussian.ppf(x)
            z = numpy.zeros((20**2, 2))
            i = 0
            for a in v:
                for b in v:
                    z[i,0] = a
                    z[i,1] = b
                    i += 1
            z = z.astype('float32')
            samples = x_mu_sample.eval({symbolic_z: z})
        else:
            samples = None
        ## Reconstructions of homemade numbers
        if downsampling_factor == 2:
            file_names = [
                "hm_7_Avenir.png",
                "hm_7_Noteworthy.png",
                "hm_7_Chalkboard.png",
                "hm_7_drawn.png",
                "hm_A_Noteworthy.png",
                "hm_A_drawn.png",
                "hm_7_0.txt",
                "hm_7_1.txt",
                "hm_7_2.txt",
                "hm_A.txt"
            ]
            x_LR_HM = data.loadHomemade(map(data_path, file_names), [h * w])
            z = get_output(l_z, x_LR_HM).eval()
            x_HM_reconstructed = x_mu_sample.eval({symbolic_z: z})
            reconstructions_homemade = {
                "originals": x_LR_HM,
                "reconstructions": x_HM_reconstructed
            }
        else:
            reconstructions_homemade = None
        # Saving
        setup_and_results = {
            "setup": {
                "image size": (C, H, W),
                "downsampling factor": downsampling_factor,
                "learning rate": learning_rate,
                "analytic K-L term": analytic_kl_term,
                "batch size": batch_size,
                "hidden layer sizes": hidden_sizes,
                "latent size": latent_size,
                "number of epochs": N_epochs
            },
            "results": {
                "learning curve": {
                    "epochs": epochs,
                    "training cost function": cost_train,
                    "test cost function": cost_test
                },
                "originals": originals,
                "reconstructions": reconstructions,
                "reconstructions (fixed)": reconstructions_fixed,
                "manifold": {
                    "samples": samples
                },
                "reconstructed homemade numbers": reconstructions_homemade
            }
        }
        file_name = "results{}_ds{}{}_l{}_e{}.pkl".format("_bs" if bernoulli_sampling else "", downsampling_factor, "b" if binarise_downsampling else "", latent_size, N_epochs)
        with open(data_path(file_name), "w") as f:
            pickle.dump(setup_and_results, f)
        run_duration = time.time() - run_start
        all_runs_duration += run_duration
        print("Run took {:.2f} minutes.".format(run_duration / 60))
        print("\n")
    print("All runs took {:.2f} minutes in total.".format(all_runs_duration / 60))
def predict (LD, output_dir, basename): import os import numpy as np import random np.random.seed(0) random.seed(0) import data_converter from sklearn import preprocessing, decomposition from sklearn.utils import shuffle import time from sklearn.externals import joblib from lasagne import layers from lasagne.updates import nesterov_momentum from lasagne.updates import norm_constraint, total_norm_constraint import lasagne import theano import theano.tensor as T from lasagne.regularization import regularize_layer_params, regularize_layer_params_weighted, l2, l1 LD.data['X_train'], LD.data['Y_train'] = shuffle(LD.data['X_train'], LD.data['Y_train'] , random_state=1) X_train = LD.data['X_train'] X_valid = LD.data['X_valid'] X_test = LD.data['X_test'] X_train = X_train[:, 0:2000] X_valid = X_valid[:, 0:2000] X_test = X_test[:, 0:2000] X_train = X_train.toarray() X_valid = X_valid.toarray() X_test = X_test.toarray() fs = decomposition.PCA(n_components=100) fs.fit(X_train) X_train2 = fs.transform(X_train) X_valid2 = fs.transform(X_valid) X_test2 = fs.transform(X_test) X_train = X_train[:, 0:200] X_valid = X_valid[:, 0:200] X_test = X_test[:, 0:200] X_train = np.float32(X_train) X_valid = np.float32(X_valid) X_test = np.float32(X_test) X_train = np.hstack([X_train, X_train2]) X_valid = np.hstack([X_valid, X_valid2]) X_test = np.hstack([X_test, X_test2]) normx = preprocessing.StandardScaler() normx.fit(X_train) X_train = normx.transform(X_train) X_valid = normx.transform(X_valid) X_test = normx.transform(X_test) X_train = np.float32(X_train) X_valid = np.float32(X_valid) X_test = np.float32(X_test) print "p5" y_train = np.copy(LD.data['Y_train']) y_train = np.float32(y_train) y_train = y_train.reshape((-1, 1)) def batches(X, y, csize, rs): X, y = shuffle(X, y, random_state=rs) for cstart in range(0, X.shape[0] - csize+1, csize): Xc = X[cstart:cstart+csize] yc = y[cstart:cstart+csize] yield Xc, yc input_var = T.matrix('inputs') target_var = T.matrix('targets') l_in = 
lasagne.layers.InputLayer(shape=(None, X_train.shape[1]), input_var=input_var, nonlinearity=None, W=lasagne.init.Sparse()) l_hid1 = lasagne.layers.DenseLayer( l_in, num_units= 100, nonlinearity=lasagne.nonlinearities.sigmoid, W=lasagne.init.Sparse()) l_hid2 = lasagne.layers.DenseLayer( l_hid1, num_units= 40, nonlinearity=lasagne.nonlinearities.tanh, W=lasagne.init.GlorotUniform() ) Lnum_out_units = 1 l_out = lasagne.layers.DenseLayer( l_hid2, num_units=Lnum_out_units, nonlinearity=None) network = l_out prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.squared_error(prediction, target_var) loss = loss.mean() params = lasagne.layers.get_all_params(network, trainable=True) all_grads = T.grad(loss, params) scaled_grads = total_norm_constraint(all_grads, 100) updates = lasagne.updates.sgd(scaled_grads, params, learning_rate=0.001) train_fn = theano.function([input_var, target_var], loss, updates=updates) for epoch in range(1200): train_err = 0 train_batches = 0 for batch in batches(X_train, y_train, 100, epoch): Xt, yt = batch train_err += train_fn(Xt, yt) train_batches += 1 xml1 = T.matrix('xml1') Xlt1 = lasagne.layers.get_output(l_out, xml1, deterministic=True) f2 = theano.function([xml1], Xlt1) preds_valid = f2(X_valid).ravel() preds_test = f2(X_test).ravel() import data_io cycle = 0 filename_valid = basename + '_valid_' + str(cycle).zfill(3) + '.predict' data_io.write(os.path.join(output_dir,filename_valid), preds_valid) filename_test = basename + '_test_' + str(cycle).zfill(3) + '.predict' data_io.write(os.path.join(output_dir,filename_test), preds_test)
def build_model(self, train_set_unlabeled, train_set_labeled, test_set, validation_set=None):
    """ Build the auxiliary deep generative model from the initialized hyperparameters.
    Define the lower bound term and compile it into a training function.
    :param train_set_unlabeled: Unlabeled train set containing variables x, t.
    :param train_set_labeled: Labeled train set containing variables x, t.
    :param test_set: Test set containing variables x, t.
    :param validation_set: Validation set containing variables x, t.
    :return: train, test, validation function and dicts of arguments.
    """
    super(CSDGM, self).build_model(train_set_unlabeled, test_set, validation_set)

    # Labeled data lives in its own shared variables; labeled minibatches
    # are drawn from them by random index below.
    sh_train_x_l = theano.shared(np.asarray(train_set_labeled[0], dtype=theano.config.floatX), borrow=True)
    sh_train_t_l = theano.shared(np.asarray(train_set_labeled[1], dtype=theano.config.floatX), borrow=True)
    n = self.sh_train_x.shape[0].astype(
        theano.config.floatX)  # no. of data points
    n_l = sh_train_x_l.shape[0].astype(
        theano.config.floatX)  # no. of labeled data points

    # Define the layers for the density estimation used in the lower bound.
    l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu, self.l_qa_logvar)
    l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu, self.l_qz_logvar)
    l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8)
    l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
    l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu, self.l_pa_logvar)
    # Flatten x to (batch, n_l * n_c) and reorder p(x) samples so the
    # density layers see matching shapes.
    l_x_in = ReshapeLayer(self.l_x_in, (-1, self.n_l * self.n_c))
    l_px = DimshuffleLayer(self.l_px, (0, 3, 1, 2, 4))
    l_px = ReshapeLayer(l_px, (-1, self.sym_samples, 1, self.n_c))
    # Reconstruction density depends on the assumed observation model.
    if self.x_dist == 'bernoulli':
        l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in)
    elif self.x_dist == 'multinomial':
        l_log_px = MultinomialLogDensityLayer(l_px, l_x_in)
        l_log_px = ReshapeLayer(l_log_px, (-1, self.n_l, 1, 1, 1))
        l_log_px = MeanLayer(l_log_px, axis=1)
    elif self.x_dist == 'gaussian':
        l_px_mu = ReshapeLayer(
            DimshuffleLayer(self.l_px_mu, (0, 2, 3, 1, 4)),
            (-1, self.sym_samples, 1, self.n_l * self.n_c))
        l_px_logvar = ReshapeLayer(
            DimshuffleLayer(self.l_px_logvar, (0, 2, 3, 1, 4)),
            (-1, self.sym_samples, 1, self.n_l * self.n_c))
        l_log_px = GaussianLogDensityLayer(l_x_in, l_px_mu, l_px_logvar)

    def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px):
        # ELBO with the KL-ish terms annealed by the warm-up variable.
        # NOTE(review): the 1.1 offset means the KL terms keep a 0.1
        # weight even at sym_warmup == 1 — confirm this is intended.
        lb = log_px + log_py + (log_pz + log_pa - log_qa - log_qz) * (1.1 - self.sym_warmup)
        return lb

    # Lower bound for labeled data
    out_layers = [
        l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px, l_log_qy
    ]
    inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
    # Batch-norm statistics are neither updated nor used here; the
    # labeled pass must not perturb the running averages.
    out = get_output(out_layers, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False)
    log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = out
    # Prior p(y) expecting that all classes are evenly distributed
    py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))
    log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape(
        (-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l, log_py_l, log_px_zy_l)
    lb_l = lb_l.mean(axis=(1, 2))  # Mean over the sampling dimensions
    log_qy_ax_l *= (
        self.sym_beta * (n / n_l)
    )  # Scale the supervised cross entropy with the beta constant
    lb_l += log_qy_ax_l.mean(axis=(
        1, 2
    ))  # Collect the lower bound term and mean over sampling dimensions

    # Lower bound for unlabeled data
    bs_u = self.sym_x_u.shape[0]

    # For the integrating out approach, we repeat the input matrix x, and construct a target (bs * n_y) x n_y
    # Example of input and target matrix for a 3 class problem and batch_size=2. 2D tensors of the form
    #               x_repeat                     t_repeat
    #  [[x[0,0], x[0,1], ..., x[0,n_x]]         [[1, 0, 0]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]          [1, 0, 0]
    #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 1, 0]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]          [0, 1, 0]
    #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 0, 1]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]]         [0, 0, 1]]
    t_eye = T.eye(self.n_y, k=0)
    t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u, axis=1).reshape(
        (-1, self.n_y))
    x_u = self.sym_x_u.reshape(
        (1, bs_u, self.n_l, self.n_c)).repeat(self.n_y, axis=0).reshape(
            (-1, self.n_l, self.n_c))

    # Since the expectation of var a is outside the integration we calculate E_q(a|x) first
    # (this pass DOES update the batch-norm running averages).
    a_x_u = get_output(self.l_qa, self.sym_x_u, batch_norm_update_averages=True, batch_norm_use_averages=False)
    a_x_u_rep = a_x_u.reshape(
        (1, bs_u * self.sym_samples, self.n_a)).repeat(self.n_y, axis=0).reshape(
            (-1, self.n_a))
    out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px]
    inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep}
    out = get_output(out_layers, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False)
    log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out
    # Prior p(y) expecting that all classes are evenly distributed
    py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))
    log_py_u = -categorical_crossentropy(py_u, t_u).reshape(
        (-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u, log_py_u, log_px_zy_u)
    # Undo the class-major repeat: back to (batch, ..., n_y), then mean
    # over the sampling dimensions.
    lb_u = lb_u.reshape(
        (self.n_y, 1, 1, bs_u)).transpose(3, 1, 2, 0).mean(axis=(1, 2))
    inputs = {
        self.l_x_in: self.sym_x_u,
        self.l_a_in: a_x_u.reshape((-1, self.n_a))
    }
    y_u = get_output(self.l_qy, inputs, batch_norm_update_averages=True, batch_norm_use_averages=False).mean(axis=(1, 2))
    y_u += 1e-8  # Ensure that we get no NANs when calculating the entropy
    y_u /= T.sum(y_u, axis=1, keepdims=True)
    # E_q(y|a,x)[lb] + H(q(y|a,x)): weight each class bound by q(y) and
    # add the entropy term.
    lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1)

    # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients
    weight_priors = 0.0
    for p in self.trainable_model_params:
        if 'W' not in str(p):
            continue  # only weight matrices get the prior, not biases etc.
        weight_priors += log_normal(p, 0, 1).sum()

    # Collect the lower bound and scale it with the weight priors.
    # Negated so the optimizer minimizes.
    elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n
    lb_labeled = -lb_l.mean()
    lb_unlabeled = -lb_u.mean()
    # Per-term diagnostics reported by the training function.
    log_px = log_px_zy_l.mean() + log_px_zy_u.mean()
    log_pz = log_pz_l.mean() + log_pz_u.mean()
    log_qz = log_qz_axy_l.mean() + log_qz_axy_u.mean()
    log_pa = log_pa_l.mean() + log_pa_u.mean()
    log_qa = log_qa_x_l.mean() + log_qa_x_u.mean()

    grads_collect = T.grad(elbo, self.trainable_model_params)
    params_collect = self.trainable_model_params
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    # Total-norm rescaling first, then element-wise clipping.
    clip_grad, max_norm = 1, 5
    mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
    mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1, sym_beta2)

    # Training function
    # Labeled minibatch: sampled without replacement each call.
    indices = self._srng.choice(size=[self.sym_bs_l], a=sh_train_x_l.shape[0], replace=False)
    x_batch_l = sh_train_x_l[indices]
    t_batch_l = sh_train_t_l[indices]
    x_batch_u = self.sh_train_x[self.batch_slice]
    if self.x_dist == 'bernoulli':  # Sample bernoulli input.
        x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1, p=x_batch_u, dtype=theano.config.floatX)
        x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1, p=x_batch_l, dtype=theano.config.floatX)
    givens = {
        self.sym_x_l: x_batch_l,
        self.sym_x_u: x_batch_u,
        self.sym_t_l: t_batch_l
    }
    inputs = [
        self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
        self.sym_lr, sym_beta1, sym_beta2, self.sym_samples, self.sym_warmup
    ]
    outputs = [
        elbo, lb_labeled, lb_unlabeled, log_px, log_pz, log_qz, log_pa, log_qa
    ]
    f_train = theano.function(inputs=inputs, outputs=outputs, givens=givens, updates=updates)

    # Default training args. Note that these can be changed during or prior to training.
    self.train_args['inputs']['batchsize_unlabeled'] = 100
    self.train_args['inputs']['batchsize_labeled'] = 100
    self.train_args['inputs']['beta'] = 0.1
    self.train_args['inputs']['learningrate'] = 3e-4
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['inputs']['samples'] = 1
    self.train_args['inputs']['warmup'] = 0.1
    self.train_args['outputs']['lb'] = '%0.3f'
    self.train_args['outputs']['lb-l'] = '%0.3f'
    self.train_args['outputs']['lb-u'] = '%0.3f'
    self.train_args['outputs']['px'] = '%0.3f'
    self.train_args['outputs']['pz'] = '%0.3f'
    self.train_args['outputs']['qz'] = '%0.3f'
    self.train_args['outputs']['pa'] = '%0.3f'
    self.train_args['outputs']['qa'] = '%0.3f'

    # Validation and test function
    y = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
    class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100
    givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t}
    f_test = theano.function(inputs=[self.sym_samples], outputs=[class_err], givens=givens)

    # Test args. Note that these can be changed during or prior to training.
    self.test_args['inputs']['samples'] = 1
    self.test_args['outputs']['test'] = '%0.2f%%'

    f_validate = None
    if validation_set is not None:
        givens = {
            self.sym_x_l: self.sh_valid_x,
            self.sym_t_l: self.sh_valid_t
        }
        f_validate = theano.function(inputs=[self.sym_samples], outputs=[class_err], givens=givens)
        # Default validation args. Note that these can be changed during or prior to training.
        self.validate_args['inputs']['samples'] = 1
        self.validate_args['outputs']['validation'] = '%0.2f%%'

    return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
def set_network_trainer(input_data, input_mask, target_data, target_mask, num_outputs, network, inner_loop_layers, updater, learning_rate, grad_max_norm=10., l2_lambda=1e-5, load_updater_params=None):
    """Compile the training function for a sequence classifier.

    Builds a masked cross-entropy objective (via a hand-rolled numerically
    stable log-softmax), adds L2 regularization and a temporal-smoothness
    penalty on the inner-loop features, clips gradients by total norm, and
    compiles a theano update function.

    :param input_data, input_mask, target_data, target_mask: symbolic inputs
        (targets presumably (batch, seq) int labels — TODO confirm at caller)
    :param num_outputs: number of classes
    :param network: output lasagne layer
    :param inner_loop_layers: layers whose concatenated outputs feed the
        auxiliary feature costs
    :param updater: callable(loss_or_grads, params, learning_rate,
        load_params_dict) -> (updates, trainer_params)
    :param grad_max_norm: total-norm clip threshold; <= 0 disables clipping
    :param l2_lambda: L2 regularization weight
    :param load_updater_params: optional saved updater state
    :return: (training_fn, trainer_params)
    """
    # get one hot target
    one_hot_target_data = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                                 nb_class=num_outputs,
                                                 dtype=floatX)

    # get network output data
    network_outputs = get_output(inner_loop_layers + [
        network,
    ], deterministic=False)
    inner_feats = T.concatenate(network_outputs[:-1], axis=-1)
    predict_data = network_outputs[-1]
    num_seqs = predict_data.shape[0]

    # get prediction cost: stable log-softmax (shift by max, subtract log-sum-exp)
    predict_data = T.reshape(x=predict_data, newshape=(-1, num_outputs), ndim=2)
    predict_data = predict_data - T.max(predict_data, axis=-1, keepdims=True)
    predict_data = predict_data - T.log(
        T.sum(T.exp(predict_data), axis=-1, keepdims=True))
    # Masked negative log-likelihood per frame.
    train_predict_cost = -T.sum(T.mul(one_hot_target_data, predict_data), axis=-1)
    train_predict_cost = train_predict_cost * T.flatten(target_mask, 1)
    train_model_cost = train_predict_cost.sum() / num_seqs       # per-sequence
    train_frame_cost = train_predict_cost.sum() / target_mask.sum()  # per-frame

    # get inner loop cost (num_batches x seq x features)
    train_sf_cost0 = T.var(inner_feats, axis=1).mean()  # intra var low
    train_sf_cost1 = -T.var(T.mean(inner_feats, axis=1), axis=0).mean()  # inter var high
    # Squared difference of consecutive time steps (temporal smoothness).
    train_sf_cost2 = T.sum(T.sqr(inner_feats[:, 1:, :] - inner_feats[:, :-1, :]), axis=-1).mean()

    # get l2 cost
    train_l2_cost = regularize_network_params(network, penalty=l2) * l2_lambda

    # get network parameters
    network_params = get_all_params(network, trainable=True)

    # get network gradients
    # NOTE(review): only cost2 (weight 1.0) enters the loss; cost0/cost1
    # are monitored but not optimized.
    network_grads = theano.grad(cost=train_model_cost + train_l2_cost + train_sf_cost2 * 1.0,
                                wrt=network_params)
    if grad_max_norm > 0.:
        network_grads, network_grads_norm = total_norm_constraint(
            tensor_vars=network_grads, max_norm=grad_max_norm, return_norm=True)
    else:
        # No clipping: still report the raw gradient norm.
        network_grads_norm = T.sqrt(
            sum(T.sum(grad**2) for grad in network_grads))

    # set updater
    train_updates, trainer_params = updater(
        loss_or_grads=network_grads,
        params=network_params,
        learning_rate=learning_rate,
        load_params_dict=load_updater_params)

    # get training (update) function
    training_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[
            train_frame_cost, network_grads_norm, train_sf_cost0,
            train_sf_cost1, train_sf_cost2
        ],
        updates=train_updates)
    return training_fn, trainer_params
def __init__(self, input_vars, target_vars, l_out, loss, optimizer, learning_rate=0.001, id=None):
    """Compile training and prediction functions for a lasagne model.

    :param input_vars: sequence of symbolic input variables
    :param target_vars: sequence of symbolic target variables
    :param l_out: output lasagne layer of the network
    :param loss: loss specification passed to ``self.get_train_loss``
    :param optimizer: lasagne-style updater, called as
        ``optimizer(grads, params, learning_rate=...)``
    :param learning_rate: learning rate handed to the optimizer
    :param id: optional identifier used to tag compiled functions and logs
    :raises ValueError: if ``input_vars`` or ``target_vars`` is not a sequence
    """
    if not isinstance(input_vars, Sequence):
        raise ValueError('input_vars should be a sequence, instead got %s' % (input_vars,))
    if not isinstance(target_vars, Sequence):
        # BUG FIX: previously interpolated input_vars into this message,
        # so the error reported the wrong value.
        raise ValueError('target_vars should be a sequence, instead got %s' % (target_vars,))
    self.get_options()
    self.input_vars = input_vars
    self.l_out = l_out
    self.loss = loss
    self.optimizer = optimizer
    self.id = id
    # Tags used to namespace compiled-function names and log lines.
    id_tag = (self.id + '/') if self.id else ''
    id_tag_log = (self.id + ': ') if self.id else ''
    if self.options.verbosity >= 6:
        output_model_structure(l_out)

    params = self.params()
    (monitored, train_loss_grads, synth_vars) = self.get_train_loss(target_vars, params)
    self.monitored_tags = monitored.keys()

    # Optional total-norm gradient clipping before the optimizer step.
    if self.options.true_grad_clipping:
        scaled_grads = total_norm_constraint(train_loss_grads, self.options.true_grad_clipping)
    else:
        scaled_grads = train_loss_grads
    updates = optimizer(scaled_grads, params, learning_rate=learning_rate)
    if not self.options.no_nan_suppression:
        # TODO: print_mode='all' somehow is always printing, even when
        # there are no NaNs. But tests are passing, even on GPU!
        updates = apply_nan_suppression(updates, print_mode='none')

    if self.options.detect_nans:
        mode = MonitorMode(post_func=detect_nan)
    else:
        mode = None

    if self.options.verbosity >= 2:
        print(id_tag_log + 'Compiling training function')
    params = input_vars + target_vars + synth_vars
    if self.options.verbosity >= 6:
        print('params = %s' % (params,))
    # on_unused_input='warn': some monitored values may not use every input.
    self.train_fn = theano.function(params, monitored.values(), updates=updates,
                                    mode=mode, name=id_tag + 'train',
                                    on_unused_input='warn')
    if self.options.run_dir and not self.options.no_graphviz:
        self.visualize_graphs({'loss': monitored['loss']},
                              out_dir=self.options.run_dir)

    # Deterministic pass (e.g. dropout disabled) for prediction.
    test_prediction = get_output(l_out, deterministic=True)
    if self.options.verbosity >= 2:
        print(id_tag_log + 'Compiling prediction function')
    if self.options.verbosity >= 6:
        print('params = %s' % (input_vars,))
    self.predict_fn = theano.function(input_vars, test_prediction,
                                      mode=mode, name=id_tag + 'predict',
                                      on_unused_input='ignore')
    if self.options.run_dir and not self.options.no_graphviz:
        self.visualize_graphs({'test_prediction': test_prediction},
                              out_dir=self.options.run_dir)
def __init__(self, network, loss, trn_data, trn_inputs, step=lu.adam, lr=0.001, lr_decay=1.0, max_norm=0.1, monitor=None, val_frac=0., assemble_extra_inputs=None, seed=None):
    """Construct and configure the trainer

    The trainer takes as inputs a neural network, a loss function and
    training data. During init the theano functions for training are
    compiled.

    Parameters
    ----------
    network : NeuralNet instance
        The neural network to train
    loss : theano variable
        Loss function to be computed for network training
    trn_data : tuple of arrays
        Training data in the form (params, stats)
    trn_inputs : list of theano variables
        Theano variables that should contain the training data
    step : function
        Function to call for updates, will pass gradients and parameters
    lr : float
        initial learning rate
    lr_decay : float
        learning rate decay factor, learning rate for each epoch is set to
        lr * (lr_decay**epoch)
    max_norm : float
        Total norm constraint for gradients
    monitor : dict
        Dict containing theano variables (and names as keys) that should be
        recorded during training along with the loss function
    val_frac : float
        Fraction of dataset to use as validation set
    assemble_extra_inputs : function (optional)
        function to compute extra inputs needed to evaluate loss
    seed : int or None
        If provided, random number generator for batches will be seeded
    """
    self.network = network
    self.loss = loss
    self.trn_data = trn_data
    self.trn_inputs = trn_inputs

    self.seed = seed
    if seed is not None:
        self.rng = np.random.RandomState(seed=seed)
    else:
        self.rng = np.random.RandomState()

    # gradients, optionally rescaled to a maximum total norm
    grads = tt.grad(self.loss, self.network.aps)
    if max_norm is not None:
        grads = lu.total_norm_constraint(grads, max_norm=max_norm)

    # updates (learning rate lives in a shared variable so it can decay)
    self.lr = lr
    self.lr_decay = lr_decay
    self.lr_op = theano.shared(np.array(self.lr, dtype=dtype))
    self.updates = step(grads, self.network.aps, learning_rate=self.lr_op)

    # check trn_data: all arrays must have the same leading dimension
    n_trn_data_list = set([x.shape[0] for x in trn_data])
    assert len(n_trn_data_list) == 1, 'trn_data elements got different len'
    self.n_trn_data = trn_data[0].shape[0]

    # outputs
    self.trn_outputs_names = ['loss']
    self.trn_outputs_nodes = [self.loss]
    if monitor is not None and len(monitor) > 0:
        monitor_names, monitor_nodes = zip(*monitor.items())
        self.trn_outputs_names += monitor_names
        self.trn_outputs_nodes += monitor_nodes

    # function for single update
    self.make_update = theano.function(inputs=self.trn_inputs,
                                       outputs=self.trn_outputs_nodes,
                                       updates=self.updates)

    self.assemble_extra_inputs = assemble_extra_inputs

    if not (val_frac == 0.):
        self.do_validation = True
        # Tail of each array becomes validation data, head stays training.
        # FIX: dropped the redundant .copy() calls — the list
        # comprehensions already build fresh lists, so the extra shallow
        # copy was a no-op.
        n_trn = int((1 - val_frac) * self.n_trn_data)
        self.val_data = [data[n_trn:] for data in trn_data]
        self.trn_data = [data[:n_trn] for data in trn_data]

        # assemble extra inputs *once* for validation data
        if self.assemble_extra_inputs is not None:
            self.val_data = self.assemble_extra_inputs(tuple(self.val_data))

        # prepare validation data as shared variables
        self.val_inputs = [
            theano.shared(data.astype(dtype), borrow=True)
            for data in self.val_data
        ]

        # compile theano function for validation
        self.validate = theano.function(inputs=[],
                                        outputs=self.loss,
                                        givens=list(
                                            zip(self.trn_inputs,
                                                self.val_inputs)))
        self.best_val_loss = np.inf
    else:
        self.do_validation = False

    # initialize variables (note: rebinds self.loss from the symbolic
    # node to a running scalar, after all compilations are done)
    self.loss = float('inf')
def set_network_trainer(input_data, input_mask, target_data, target_mask, network_outputs, updater, learning_rate, grad_max_norm=10., l2_lambda=1e-5, inner_lambda=1e-2, load_updater_params=None):
    """Compile a training function with cross-entropy + regularizers.

    Objective = masked categorical cross-entropy per sequence
              + L2 weight penalty (l2_lambda)
              + mean temporal variance of inner hidden layers (inner_lambda).
    Gradients are always clipped by total norm (grad_max_norm).

    :param network_outputs: list of lasagne layers; the LAST element is the
        prediction layer, the rest are inner hidden layers
    :param updater: callable(loss_or_grads, params, learning_rate,
        load_params_dict) -> (updates, trainer_params)
    :return: (training_fn, trainer_params)
    """
    network = network_outputs[-1]
    # get network output data
    output_data = get_output(network_outputs, deterministic=False)
    predict_data = output_data[-1]
    inner_hid_list = output_data[:-1]
    num_seqs = predict_data.shape[0]

    # get prediction cost: flatten to (frames, classes), clip away 0/1 so
    # the log inside the cross-entropy cannot blow up.
    predict_data = T.reshape(x=predict_data,
                             newshape=(-1, predict_data.shape[-1]),
                             ndim=2)
    predict_data = T.clip(predict_data, eps, 1.0 - eps)
    train_predict_cost = categorical_crossentropy(predictions=predict_data,
                                                  targets=T.flatten(target_data, 1))
    train_predict_cost = train_predict_cost * T.flatten(target_mask, 1)
    train_model_cost = train_predict_cost.sum() / num_seqs        # per-sequence
    train_frame_cost = train_predict_cost.sum() / target_mask.sum()  # per-frame

    # get regularizer cost
    train_regularizer_cost = regularize_network_params(network, penalty=l2) * l2_lambda

    # reduce inner loop variance (over time)
    # NOTE(review): divides by len(inner_hid_list) — zero-division if the
    # caller passes only the prediction layer; confirm callers always
    # supply at least one inner layer.
    train_inner_cost = 0.
    num_inners = len(inner_hid_list)
    for inner_hid in inner_hid_list:
        # variance over time
        seq_var = T.var(input=inner_hid, axis=1)
        # mean over sample variance
        train_inner_cost += T.mean(seq_var)
    train_inner_cost /= num_inners

    # get network parameters
    network_params = get_all_params(network, trainable=True)

    # get network gradients, clipped by total norm (norm is also reported)
    network_grads = theano.grad(cost=train_model_cost + train_regularizer_cost + train_inner_cost * inner_lambda,
                                wrt=network_params)
    network_grads, network_grads_norm = total_norm_constraint(
        tensor_vars=network_grads, max_norm=grad_max_norm, return_norm=True)

    # set updater (learning rate in a shared variable so it can be changed later)
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, trainer_params = updater(
        loss_or_grads=network_grads,
        params=network_params,
        learning_rate=train_lr,
        load_params_dict=load_updater_params)

    # get training (update) function
    training_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[train_frame_cost, train_inner_cost, network_grads_norm],
        updates=train_updates)
    return training_fn, trainer_params
def __init__(self, network, loss, trn_data, trn_inputs, step=lu.adam, lr=0.001, lr_decay=1.0, max_norm=0.1, monitor=None, seed=None):
    """Set up the trainer and compile the theano update function.

    Given a network, a symbolic loss and training data, this builds the
    (optionally norm-constrained) gradient updates and compiles a single
    minibatch-update function.

    Parameters
    ----------
    network : NeuralNet instance
        The neural network to train
    loss : theano variable
        Loss function to be computed for network training
    trn_data : tuple of arrays
        Training data in the form (params, stats)
    trn_inputs : list of theano variables
        Theano variables that should contain the training data
    step : function
        Function to call for updates, will pass gradients and parameters
    lr : float
        initial learning rate
    lr_decay : float
        learning rate decay factor, learning rate for each epoch is set to
        lr * (lr_decay**epoch)
    max_norm : float
        Total norm constraint for gradients
    monitor : dict
        Dict containing theano variables (and names as keys) that should be
        recorded during training along with the loss function
    seed : int or None
        If provided, random number generator for batches will be seeded
    """
    self.network = network
    self.loss = loss
    self.trn_data = trn_data
    self.trn_inputs = trn_inputs
    self.seed = seed

    # Seeded RNG when reproducibility is requested, fresh one otherwise.
    self.rng = np.random.RandomState(seed=seed) if seed is not None else np.random.RandomState()

    # Symbolic gradients of the loss w.r.t. the network's adaptable
    # parameters, optionally rescaled to a maximum total norm.
    gradient_list = tt.grad(self.loss, self.network.aps)
    if max_norm is not None:
        gradient_list = lu.total_norm_constraint(gradient_list, max_norm=max_norm)

    # Learning rate is kept in a shared variable so it can be decayed
    # between epochs without recompiling.
    self.lr = lr
    self.lr_decay = lr_decay
    self.lr_op = theano.shared(np.array(self.lr, dtype=dtype))
    self.updates = step(gradient_list, self.network.aps, learning_rate=self.lr_op)

    # Sanity check: every array in trn_data must share one leading length.
    distinct_lengths = set(x.shape[0] for x in trn_data)
    assert len(distinct_lengths) == 1, 'trn_data elements got different len'
    self.n_trn_data = trn_data[0].shape[0]

    # Outputs reported by the update function: the loss first, then any
    # extra monitored quantities.
    self.trn_outputs_names = ['loss']
    self.trn_outputs_nodes = [self.loss]
    if monitor:
        extra_names, extra_nodes = zip(*monitor.items())
        self.trn_outputs_names += extra_names
        self.trn_outputs_nodes += extra_nodes

    # Compile the single-minibatch update step.
    self.make_update = theano.function(inputs=self.trn_inputs,
                                       outputs=self.trn_outputs_nodes,
                                       updates=self.updates)

    # Rebind self.loss from the symbolic node to a running scalar now
    # that compilation no longer needs the graph node.
    self.loss = float('inf')