Example #1
0
def test_total_norm_constraint():
    """Check that total_norm_constraint rescales tensors to the target norm.

    Builds two constrained graphs (with and without the returned norm),
    compiles them, and verifies that both yield the same rescaled values,
    that the reported norm matches the raw input norm, and that the
    rescaled output sits exactly at the threshold norm.
    """
    import numpy as np
    import theano
    import theano.tensor as T
    from lasagne.updates import total_norm_constraint

    scalar_var = T.scalar()
    matrix_var = T.matrix()
    max_norm = 5.0

    clipped = total_norm_constraint([scalar_var, matrix_var], max_norm,
                                    return_norm=False)
    clipped_with_norm, norm_var = total_norm_constraint(
        [scalar_var, matrix_var], max_norm, return_norm=True)

    fn_plain = theano.function([scalar_var, matrix_var],
                               [clipped[0], clipped[1]])
    fn_norm = theano.function(
        [scalar_var, matrix_var],
        [clipped_with_norm[0], clipped_with_norm[1], norm_var])

    values = np.arange(1 + 9, dtype='float32')
    scalar_val = values[-1]
    matrix_val = values[:9].reshape((3, 3))

    out_s1, out_m1 = fn_plain(scalar_val, matrix_val)
    out_s2, out_m2, norm_val = fn_norm(scalar_val, matrix_val)

    # Both graph variants must rescale identically.
    np.testing.assert_array_almost_equal(out_s1, out_s2)
    np.testing.assert_array_almost_equal(out_m1, out_m2)

    flattened = [float(out_s1)] + list(out_m1.flatten())

    # Reported norm equals the unconstrained input norm; the rescaled
    # output has exactly the target norm.
    np.testing.assert_array_almost_equal(np.linalg.norm(values), norm_val)
    np.testing.assert_array_almost_equal(np.linalg.norm(flattened), max_norm)
Example #2
0
def test_total_norm_constraint():
    """Verify total_norm_constraint against a known input.

    The variant that returns the norm and the variant that does not must
    produce identical rescaled tensors; the returned norm must equal the
    raw input norm, and the rescaled values must have norm `threshold`.
    """
    import numpy as np
    import theano
    import theano.tensor as T
    from lasagne.updates import total_norm_constraint

    a, b = T.scalar(), T.matrix()
    threshold = 5.0

    out_plain = total_norm_constraint([a, b], threshold, return_norm=False)
    out_norm, norm = total_norm_constraint([a, b], threshold,
                                           return_norm=True)

    f_plain = theano.function([a, b], [out_plain[0], out_plain[1]])
    f_norm = theano.function([a, b], [out_norm[0], out_norm[1], norm])

    data = np.arange(1 + 9, dtype='float32')
    a_val = data[-1]
    b_val = data[:9].reshape((3, 3))

    res_plain = f_plain(a_val, b_val)
    res_norm = f_norm(a_val, b_val)

    # Both compiled functions must agree on the rescaled tensors.
    for lhs, rhs in zip(res_plain, res_norm[:2]):
        np.testing.assert_array_almost_equal(lhs, rhs)

    norm_out = res_norm[2]
    combined = [float(res_plain[0])] + list(res_plain[1].flatten())

    # The norm output equals the raw input norm; the constrained output
    # lands exactly on the threshold.
    np.testing.assert_array_almost_equal(np.linalg.norm(data), norm_out)
    np.testing.assert_array_almost_equal(np.linalg.norm(combined), threshold)
Example #3
0
def apply_grad_norm_clip(gradients, clip=None):
    """Clip `gradients` to a total norm of `clip` and report that norm.

    When `clip` is None the gradients are returned untouched: the call
    with a dummy max-norm of 1 is made only to obtain the symbolic norm
    expression, and its rescaled outputs are discarded.
    """
    if clip is not None:
        gradients, norm = LU.total_norm_constraint(
            gradients, clip, return_norm=True)
    else:
        # Dummy threshold: only the norm expression is kept.
        _, norm = LU.total_norm_constraint(gradients, 1, return_norm=True)
    return gradients, norm
Example #4
0
def careful_rmsprop(loss_or_grads,
                    params,
                    learning_rate=1.0,
                    rho=0.9,
                    epsilon=1e-6,
                    grad_clipping=1.0e-2):
    """RMSProp with total-norm gradient clipping.

    :param loss_or_grads: scalar loss or a list of precomputed gradients.
    :param params: shared variables to update.
    :param learning_rate: step size.
    :param rho: decay rate of the squared-gradient accumulator.
    :param epsilon: small number for computational stability.
    :param grad_clipping: maximal norm of gradient, if norm of the actual
        gradient exceeds this value it is rescaled.
    :return: updates
    """
    raw_grads = get_or_compute_grads(loss_or_grads, params)
    clipped_grads = total_norm_constraint(raw_grads,
                                          max_norm=grad_clipping,
                                          epsilon=epsilon)

    # Theano constant avoids upcasting float32 arithmetic to float64.
    one = T.constant(1)

    updates = OrderedDict()
    for p, g in zip(params, clipped_grads):
        current = p.get_value(borrow=True)
        # Running average of squared gradients, one accumulator per param.
        acc = theano.shared(np.zeros(current.shape, dtype=current.dtype),
                            broadcastable=p.broadcastable)
        new_acc = rho * acc + (one - rho) * g ** 2
        updates[acc] = new_acc
        updates[p] = p - learning_rate * g / T.sqrt(new_acc + epsilon)

    return updates
Example #5
0
    def u(loss_or_grads, params, *args, **kwargs):
        """Wrap `updates`, rescaling gradients to `grad_clipping` first."""
        clipped = total_norm_constraint(
            get_or_compute_grads(loss_or_grads, params),
            max_norm=grad_clipping,
            epsilon=epsilon)
        return updates(clipped, params, *args, **kwargs)
Example #6
0
def build_trainer(input_data,
                  input_mask,
                  target_data,
                  target_mask,
                  network_params,
                  network_reg_params,
                  output_layer,
                  weight_decay,
                  updater,
                  learning_rate,
                  max_grad_norm=0.0,
                  load_updater_params=None):
    """Compile the training function for `output_layer`.

    Builds a masked frame-level cross-entropy loss, optionally adds an L2
    penalty scaled by 10**(-weight_decay), clips the total gradient norm
    when `max_grad_norm` > 0, and wires the gradients into `updater`.

    Returns (training_fn, train_lr, updater_params), where `train_lr` is
    the shared variable holding the learning rate.

    NOTE(review): `output_dim` and `floatX` are read from module scope;
    they are not defined in this chunk — confirm their origin.
    """
    output_score = get_output(output_layer, deterministic=False)
    # Predicted class index per frame.
    frame_prd_idx = T.argmax(output_score, axis=-1)

    one_hot_target = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                            nb_class=output_dim,
                                            dtype=floatX)

    output_score = T.reshape(x=output_score,
                             newshape=(-1, output_dim),
                             ndim=2)
    # Numerically stable log-softmax: subtract the max, then log-sum-exp.
    output_score = output_score - T.max(output_score, axis=-1, keepdims=True)
    output_score = output_score - T.log(T.sum(T.exp(output_score), axis=-1, keepdims=True))

    # Per-frame cross-entropy, zeroed where the target mask is off.
    train_ce = -T.sum(T.mul(one_hot_target, output_score), axis=-1)*T.flatten(target_mask, 1)

    # Loss normalized per sequence vs. per valid frame.
    train_loss = T.sum(train_ce)/target_mask.shape[0]
    frame_loss = T.sum(train_ce)/T.sum(target_mask)

    # Frame accuracy over valid (masked-in) frames only.
    frame_accr = T.sum(T.eq(frame_prd_idx, target_data)*target_mask)/T.sum(target_mask)

    train_total_loss = train_loss
    if weight_decay > 0:
        # L2 penalty weighted by 10**(-weight_decay).
        train_total_loss += apply_penalty(network_reg_params, l2)*10**(-weight_decay)

    network_grads = theano.grad(cost=train_total_loss, wrt=network_params)

    if max_grad_norm > 0.:
        # Rescale gradients so their joint norm is at most max_grad_norm.
        network_grads, network_grads_norm = total_norm_constraint(tensor_vars=network_grads,
                                                                  max_norm=max_grad_norm,
                                                                  return_norm=True)
    else:
        # No clipping; still report the raw gradient norm for monitoring.
        network_grads_norm = T.sqrt(sum(T.sum(grad ** 2) for grad in network_grads))

    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, updater_params = updater(loss_or_grads=network_grads,
                                            params=network_params,
                                            learning_rate=train_lr,
                                            load_params_dict=load_updater_params)

    training_fn = theano.function(inputs=[input_data,
                                          input_mask,
                                          target_data,
                                          target_mask],
                                  outputs=[frame_loss,
                                           frame_accr,
                                           network_grads_norm],
                                  updates=train_updates)
    return training_fn, train_lr, updater_params
Example #7
0
    def create_dadgm_gradients(self, loss, deterministic=False):
        """Return GSM gradients rescaled to a total norm of 5 and then
        clipped element-wise to [-1, 1]."""
        raw = GSM.create_gradients(self, loss, deterministic)

        # First constrain the joint norm, then clip each element.
        elem_limit, norm_limit = 1, 5
        rescaled = total_norm_constraint(raw, max_norm=norm_limit)
        return [T.clip(g, -elem_limit, elem_limit) for g in rescaled]
Example #8
0
def set_network_trainer(input_data,
                        input_mask,
                        target_data,
                        target_mask,
                        network,
                        updater,
                        learning_rate,
                        grad_max_norm=10.,
                        l2_lambda=1e-5,
                        load_updater_params=None):
    """Compile the training function for `network`.

    Builds a masked-mean cross-entropy cost plus an L2 regularizer,
    differentiates the regularized cost, clips the total gradient norm to
    `grad_max_norm`, and wires the gradients into `updater`.

    Fixes over the previous revision:
    - `l2_lambda` was commented out of the signature but still used
      (NameError); the default 1e-5 is restored, matching the sibling
      CTC `set_network_trainer` in this file.
    - Gradients were computed twice, the second call (without the
      regularizer) silently discarding the first; only the regularized
      gradient is kept.
    - The `outputs=` list had a stray `]` (syntax error); the list now
      closes once, after `network_grads_norm`.
    - The function returned nothing; it now returns
      (training_fn, trainer_params) like its sibling.
    """
    # get network output data
    predict_data = get_output(network, deterministic=False)
    predict_idx = T.argmax(predict_data, axis=-1)

    # get prediction cost: masked mean cross-entropy (eps avoids log(0))
    train_predict_cost = categorical_crossentropy(predictions=T.reshape(predict_data, (-1, predict_data.shape[-1])) + eps,
                                                  targets=T.flatten(target_data, 1))
    train_predict_cost = train_predict_cost*T.flatten(target_mask, 1)
    train_predict_cost = train_predict_cost.sum()/target_mask.sum()

    # get regularizer cost (L2)
    train_regularizer_cost = regularize_network_params(network, penalty=l2)

    # get network parameters
    network_params = get_all_params(network, trainable=True)

    # gradient of the regularized cost, with total-norm clipping
    network_grads = theano.grad(cost=train_predict_cost + train_regularizer_cost*l2_lambda,
                                wrt=network_params)
    network_grads, network_grads_norm = total_norm_constraint(tensor_vars=network_grads,
                                                              max_norm=grad_max_norm,
                                                              return_norm=True)

    # set updater
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, trainer_params = updater(loss_or_grads=network_grads,
                                            params=network_params,
                                            learning_rate=train_lr,
                                            load_params_dict=load_updater_params)

    # get training (update) function
    training_fn = theano.function(inputs=[input_data,
                                          input_mask,
                                          target_data,
                                          target_mask],
                                  outputs=[predict_data,
                                           predict_idx,
                                           train_predict_cost,
                                           train_regularizer_cost,
                                           network_grads_norm],
                                  updates=train_updates,
                                  allow_input_downcast=True)
    return training_fn, trainer_params
Example #9
0
def calculate_gradient(loss, params, weight_norm=None):
    """Calculate gradients with the option to clip their total norm.

    :param loss: scalar loss expression.
    :param params: variables to differentiate with respect to.
    :param weight_norm: maximal total gradient norm; any falsy value
        (None, 0, empty list) disables clipping.  The default is None
        instead of the original mutable default `[]` (a Python
        anti-pattern); behavior is unchanged since both are falsy.
    :return: list of (possibly rescaled) gradient expressions.
    """
    grad = T.grad(loss, params)

    # gradient norm option
    if weight_norm:
        grad = updates.total_norm_constraint(grad, weight_norm)

    return grad
Example #10
0
 def train_function(self, semi_supervised= True, unlabel_stable=False):
     '''
     Build the (semi-)supervised training function.

     semi_supervised == True: also train on unlabeled data
     (semi-supervised learning).
     unlabel_stable: forwarded as dev_stage to the unlabeled cost.
     return: train function for 1 epoch use
     '''
     self.semi_supervised = semi_supervised
     # Symbolic warm-up weights for the KL term and the classifier term.
     sym_klw = T.scalar('sym_klw',dtype=theano.config.floatX) # symbolic scalar of warming up
     sym_cw = T.scalar('sym_cw',dtype=theano.config.floatX) # classifier warm up
     sym_s = T.matrix('sym_s',dtype='int64')
     sym_mask = T.matrix('sym_mask',dtype=theano.config.floatX)
     sym_y = T.matrix('sym_label',dtype=theano.config.floatX)
     sym_s_u = T.matrix('sym_s_u',dtype='int64')
     sym_mask_u = T.matrix('sym_mask_u', dtype=theano.config.floatX)
     # Batch sizes of the labeled (num_l) and unlabeled (num_u) parts.
     num_l, num_u = sym_s.shape[0].astype(theano.config.floatX), 0.0
     if self.semi_supervised:
         print 'Train with unlabel data.'
         num_u = sym_s_u.shape[0].astype(theano.config.floatX)
     #get labeled/unlabeled cost
     outs1 = self.cost_label([sym_s, sym_mask, sym_y], dev_stage=False, return_mode = 'mean')
     loss_recons, loss_kl, valid_words, word_drop_num, loss_classifier, batch_ppl, acc = outs1
     # Defaults used when no unlabeled data is provided.
     loss_recons_u, loss_kl_u,loss_entropy_u, batch_ppl_u = 0.0,0.0,0.0,0.0
     valid_words_u = 0
     if self.semi_supervised:
         outs2 = self.cost_unlabel([sym_s_u, sym_mask_u], dev_stage=unlabel_stable, sample_by_prob=self.sample_unlabel)
         loss_recons_u, loss_kl_u, valid_words_u, loss_entropy_u, batch_ppl_u = outs2
     '''
     total Loss:
     L = Loss_labeled(s,mask,y) + beta*(n_l+n_u)/n_l * Loss_classisifer(s,mask,y)
         + Loss_unlabel(s_u, mask_u)
     L = recons_term + sym_klw_term + loss_classifier_term - loss_entropy_u
     '''
     alpha = sym_cw * self.cost_beta * ( num_l + num_u ) / num_l
     total_cost = loss_recons * num_l + loss_recons_u * num_u\
                  + sym_klw * ( loss_kl * num_l + loss_kl_u * num_u)\
                  + alpha * loss_classifier * num_l\
                  - loss_entropy_u * num_u
     total_cost /= (num_l + num_u)
     train_params = self.get_params(only_trainable=True)
     all_grads = theano.grad(total_cost,train_params)
     # Element-wise clipping first, then total-norm rescaling.
     all_grads = [T.clip(g, -self.grad_clipping, self.grad_clipping) for g in all_grads]
     all_grads = total_norm_constraint( all_grads, max_norm=self.max_norm )
     #all_grads = [T.clip(g, -self.grad_clipping, self.grad_clipping) for g in all_grads]
     updates = adam(all_grads,train_params, self.lr, self.beta1, self.beta2)
     if self.semi_supervised:
         train_input = [sym_s, sym_mask, sym_y, sym_s_u, sym_mask_u, sym_klw, sym_cw]
         train_output = [total_cost,
                         loss_recons, loss_recons_u, loss_kl, loss_kl_u, alpha, loss_classifier, loss_entropy_u,
                         batch_ppl, batch_ppl_u, valid_words, valid_words_u, word_drop_num, acc]
     else:
         train_input = [sym_s, sym_mask, sym_y, sym_klw, sym_cw]
         train_output = [total_cost, loss_recons, loss_kl, loss_classifier,
                         batch_ppl, valid_words, word_drop_num, acc]
     train_f = theano.function(inputs=train_input, outputs=train_output,updates=updates, name='train_function')
     return train_f
Example #11
0
    def train_expectation_function(self):
        '''
        Build the training function where the unlabeled reconstruction/KL
        terms are computed as an expectation over the classifier output
        (see cost_unlabel_expectation).
        '''
        print "Train Function: Calculate the Expectation of unlabeled data."
        sym_klw = T.scalar('sym_klw', dtype=theano.config.floatX)  # symbolic scalar of warming up
        sym_sents = T.matrix('sym_s', dtype='int64')
        sym_mask = T.matrix('sym_mask', dtype=theano.config.floatX)  # one hot!
        sym_label = T.matrix('sym_label', dtype=theano.config.floatX)
        sym_sents_u = T.matrix('sym_s_u', dtype='int64')
        sym_mask_u = T.matrix('sym_mask_u', dtype=theano.config.floatX)
        # Batch sizes of the labeled and unlabeled parts.
        num_l, num_u = sym_sents.shape[0].astype(theano.config.floatX), \
                       sym_sents_u.shape[0].astype(theano.config.floatX)
        num_all = num_l + num_u

        # forward the network and get cost values
        enc_sents, dec_sents, _ = self._forward_sents(sym_sents, dev_stage=False)
        enc_sents_u, dec_sents_u, _ = self._forward_sents(sym_sents_u, dev_stage=False)

        # classifier loss (entropy term for the unlabeled batch)
        y_pred, loss_class, acc = self._forward_classifier([enc_sents, sym_mask], sym_label, dev_stage=False)
        y_pred_u, loss_entropy, _ = self._forward_classifier([enc_sents_u, sym_mask_u], None, dev_stage=False)

        # reconstruction and kl loss
        loss_rec, loss_kl, ppl = self.cost_label([sym_sents, enc_sents, dec_sents, sym_mask, sym_label], dev_stage=False)
        loss_rec_u, loss_kl_u, ppl_u = self.cost_unlabel_expectation([sym_sents_u, enc_sents_u, dec_sents_u,
                                                        sym_mask_u, y_pred_u], dev_stage=False)

        # use baseline (subtracted from the unlabeled reconstruction loss)
        if self.use_baseline:
            baselines_u = self._get_baselines([sym_sents_u, enc_sents_u, sym_mask_u])
            loss_rec_u -= baselines_u

        # Total cost: reconstruction + warmed-up KL + weighted classifier
        # loss - entropy bonus, averaged over all examples.
        total_cost = T.sum(loss_rec) + T.sum(loss_rec_u) - T.sum(loss_entropy)
        total_cost += sym_klw * (T.sum(loss_kl) + T.sum(loss_kl_u))
        total_cost += self.alpha * T.sum(loss_class) * num_all / num_l
        total_cost /= num_all

        all_params = self.get_params(tag='all')
        all_grads = theano.grad(total_cost, all_params)
        # Rescale gradients to a maximal total norm before Adam.
        all_grads = total_norm_constraint(all_grads, max_norm=self.max_norm)
        updates = adam(all_grads, all_params, self.lr)

        train_input = [sym_sents, sym_mask, sym_label, sym_sents_u, sym_mask_u, sym_klw]
        train_output = [total_cost,
                        T.mean(loss_rec), T.mean(loss_rec_u), T.mean(loss_kl), T.mean(loss_kl_u),
                        T.mean(loss_class), T.mean(loss_entropy), ppl, ppl_u, acc, self.b]
        train_f = theano.function(inputs=train_input, outputs=train_output, updates=updates, name='train_expectation')
        return train_f
Example #12
0
    def _get_updates(self, loss, params, optimizer, optimizer_params={},
                     clip_grad=None, max_norm_constraint=None,
                     clip_param=None):
        """Build optimizer updates for `loss` w.r.t. `params`.

        optimizer: callable `(grads, params, **optimizer_params)`.
        optimizer_params: extra kwargs forwarded to `optimizer`.
            NOTE(review): mutable default `{}` — harmless here because it
            is only unpacked, never mutated, but a None default is safer.
        clip_grad: element-wise gradient clip bound, applied last.
        max_norm_constraint: total gradient norm bound, applied first.
        clip_param: element-wise clip bound applied to `params` before
            differentiation.
            NOTE(review): T.clip builds new tensors that are not inputs
            of the `loss` graph, so T.grad w.r.t. them looks suspect —
            confirm this branch is actually exercised.
        """
        if clip_param:
            print clip_param
            params = [T.clip(p, -clip_param, clip_param) for p in params]

        grads = T.grad(loss, params)
        # Rescale the joint norm first, then clip each element.
        if max_norm_constraint:
            grads =\
                total_norm_constraint(grads,
                                      max_norm=max_norm_constraint)
        if clip_grad:
            grads = [T.clip(g, -clip_grad, clip_grad) for g in grads]

        return optimizer(grads, params, **optimizer_params)
Example #13
0
    def _get_updates(self,
                     loss,
                     params,
                     optimizer,
                     optimizer_params=None,
                     clip_grad=None,
                     max_norm_constraint=None):
        """Build optimizer updates for `loss` w.r.t. `params`.

        :param optimizer: callable `(grads, params, **kwargs)` that
            returns an updates mapping.
        :param optimizer_params: extra kwargs for `optimizer`.  A None
            default replaces the original mutable default `{}` (a Python
            anti-pattern); behavior is unchanged.
        :param clip_grad: element-wise gradient clip bound, applied last.
        :param max_norm_constraint: total gradient norm bound, applied
            first.
        :return: whatever `optimizer` returns.
        """
        if optimizer_params is None:
            optimizer_params = {}

        grads = T.grad(loss, params)
        # Rescale the joint norm first, then clip each element.
        if max_norm_constraint:
            grads = total_norm_constraint(grads,
                                          max_norm=max_norm_constraint)
        if clip_grad:
            grads = [T.clip(g, -clip_grad, clip_grad) for g in grads]

        return optimizer(grads, params, **optimizer_params)
Example #14
0
def params_update(grads, params, lrt, max_g=None):
    """Build parameter updates, optionally clipping the total gradient norm.

    :param grads: gradient expressions.
    :param params: parameters to update.
    :param lrt: learning rate.
    :param max_g: maximal total gradient norm; None disables clipping.
    :return: updates mapping from the optimizer selected by the
        module-level `state['optim_method']`.
    :raises ValueError: for an unrecognized `state['optim_method']`
        (previously this surfaced as an UnboundLocalError on `updates`).
    """
    def optimize(grads, params):
        # Dispatch on the globally configured optimization method.
        method = state['optim_method']
        if method == 'adam':
            return adam(grads, params, lrt, state['momentum'])
        elif method == 'adagrad':
            return adagrad(grads, params, lrt)
        elif method == 'sgd':
            return sgd(grads, params, lrt)
        raise ValueError("unknown optim_method: %r" % (method,))

    if max_g is not None:
        scaled_grads = total_norm_constraint(grads, max_g)
        return optimize(scaled_grads, params)
    return optimize(grads, params)
    def adam(self,
             cost,
             params,
             learning_rate=0.001,
             beta1=0.9,
             beta2=0.999,
             epsilon=1e-8):
        """Adam updates for `cost` w.r.t. `params` with gradient clipping.

        Gradients are first rescaled to a total norm of 10.  If the
        resulting gradient norm is NaN or Inf, every gradient is replaced
        by `0.1 * param` for that step.

        Returns an OrderedDict of updates (Adam, Kingma & Ba).
        """
        all_grads = T.grad(cost=cost, wrt=params)
        # Rescale gradients so their joint norm does not exceed 10.
        all_grads = total_norm_constraint(all_grads, 10)

        # Detect non-finite gradients via the (post-clipping) norm.
        grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), all_grads)))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))

        # Step counter used for the bias-correction terms.
        t_prev = theano.shared(utils.floatX(0.))
        updates = OrderedDict()

        t = t_prev + 1
        # Bias-correction factors folded into a single step size.
        a_t = learning_rate * T.sqrt(1 - beta2**t) / (1 - beta1**t)

        for param, g_t in zip(params, all_grads):
            # Fallback gradient when the norm is not finite.
            g_t = T.switch(not_finite, 0.1 * param, g_t)
            value = param.get_value(borrow=True)
            # First (m) and second (v) moment accumulators, one per param.
            m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)
            v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)

            m_t = beta1 * m_prev + (1 - beta1) * g_t
            v_t = beta2 * v_prev + (1 - beta2) * g_t**2
            step = a_t * m_t / (T.sqrt(v_t) + epsilon)

            updates[m_prev] = m_t
            updates[v_prev] = v_t
            updates[param] = param - step

        updates[t_prev] = t
        return updates
Example #16
0
def cruel_rmsprop(loss_or_grads,
                  params,
                  learning_rate=1.0,
                  rho=0.9,
                  epsilon=1e-6,
                  grad_clipping=1.0e-2,
                  param_clipping=1.0e-2):
    """A version of careful RMSProp for Wassershtein GAN.

    Gradients are rescaled to a maximal total norm of `grad_clipping`;
    after each step every parameter is clipped element-wise to
    [-`param_clipping`, `param_clipping`] unless `param_clipping` is None.

    :param epsilon: small number for computational stability.
    :param grad_clipping: maximal norm of gradient, if norm of the actual
        gradient exceeds this value it is rescaled.
    :param param_clipping: after each update all params are clipped to
        [-`param_clipping`, `param_clipping`].
    :return: updates
    """
    raw_grads = get_or_compute_grads(loss_or_grads, params)
    clipped_grads = total_norm_constraint(raw_grads,
                                          max_norm=grad_clipping,
                                          epsilon=epsilon)

    # Theano constant prevents float32 arithmetic from upcasting.
    one = T.constant(1)

    updates = OrderedDict()
    for p, g in zip(params, clipped_grads):
        current = p.get_value(borrow=True)
        # Running average of squared gradients, one accumulator per param.
        acc = theano.shared(np.zeros(current.shape, dtype=current.dtype),
                            broadcastable=p.broadcastable)
        new_acc = rho * acc + (one - rho) * g ** 2
        updates[acc] = new_acc

        stepped = p - learning_rate * g / T.sqrt(new_acc + epsilon)

        # Weight clipping after the step, unless disabled.
        if param_clipping is None:
            updates[p] = stepped
        else:
            updates[p] = T.clip(stepped, -param_clipping, param_clipping)

    return updates
Example #17
0
    def get_f_train(self):
        """Compile and return the training function.

        Returns a theano function mapping (x, m, y) -> (cost, accuracy)
        that also applies one Adam update with gradient clipping.
        """
        network_params = self.get_params()
        # Log parameter shapes/names for debugging.
        for param in network_params:
            print param.get_value().shape, param.name

        x = T.imatrix()
        m = T.matrix()
        y = T.matrix()
        pred = layers.get_output(self.l_y, {
            self.l_x: x,
            self.l_m: m,
        },
                                 deterministic=False)

        cost = objectives.categorical_crossentropy(pred, y).mean()
        acc = T.eq(T.argmax(pred, axis=1), T.argmax(y, axis=1)).mean()
        grads = theano.grad(cost, network_params)
        # Total-norm rescaling first, then element-wise clipping.
        grads = updates.total_norm_constraint(grads, max_norm=20.0)
        grads = [T.clip(g, -10.0, 10.0) for g in grads]
        params_update = updates.adam(grads, network_params, self.lr)
        f_train = theano.function([x, m, y], [cost, acc],
                                  updates=params_update)
        return f_train
Example #18
0
from lasagne.layers import InputLayer, DenseLayer
import lasagne
from lasagne.updates import sgd, total_norm_constraint
import theano.tensor as T

# Minimal example: softmax regression trained with SGD where the
# gradients are first rescaled to a maximal total norm of 5.
x = T.matrix()
y = T.ivector()
l_in = InputLayer((5, 10))
l1 = DenseLayer(l_in, num_units=7, nonlinearity=T.nnet.softmax)
output = lasagne.layers.get_output(l1, x)
cost = T.mean(T.nnet.categorical_crossentropy(output, y))
all_params = lasagne.layers.get_all_params(l1)
all_grads = T.grad(cost, all_params)
# BUG FIX: the original passed `all_grads[i]` with an undefined `i`;
# total_norm_constraint expects the full list of gradient tensors.
scaled_grads = total_norm_constraint(all_grads, 5)
updates = sgd(scaled_grads, all_params, learning_rate=0.1)
Example #19
0
def set_network_trainer(input_data,
                        input_mask,
                        target_data,
                        target_mask,
                        network,
                        updater,
                        learning_rate,
                        grad_max_norm=10.,
                        l2_lambda=1e-5,
                        load_updater_params=None):
    """Compile the CTC training function for `network`.

    Builds a CTC + L2-regularized training cost, clips the total gradient
    norm to `grad_max_norm`, and wires the gradients into `updater`.

    Returns (training_fn, trainer_params).  The learning rate lives in a
    shared variable created here (`train_lr`) but is not returned.
    """
    ###########################
    # get network output data #
    ###########################
    # get network output data
    network_output = get_output(network, deterministic=False)

    ################################
    # get training cost (CTC + L2) #
    ################################
    # get prediction cost (CTC); sequences are time-major for ctc_cost
    train_ctc_cost = ctc_cost(y=target_data.dimshuffle(1, 0),
                              y_mask=target_mask.dimshuffle(1, 0),
                              y_hat=network_output.dimshuffle(1, 0, 2),
                              y_hat_mask=input_mask.dimshuffle(1, 0),
                              skip_softmax=True)
    train_ctc_cost = train_ctc_cost.mean()
    # get prediction cost (char-level), CTC)
    train_cost_per_char = train_ctc_cost/target_mask.sum()
    # get regularizer cost (L2)
    train_regularizer_cost = regularize_network_params(network, penalty=l2)

    ##########################
    # get network parameters #
    ##########################
    network_params = get_all_params(network, trainable=True)

    #########################
    # get network gradients #
    #########################
    # get gradient over cost (CTC plus weighted L2 penalty)
    network_grads = theano.grad(cost=train_ctc_cost + train_regularizer_cost*l2_lambda,
                                wrt=network_params)
    # get gradient norm constraint
    network_grads, network_grads_norm = total_norm_constraint(tensor_vars=network_grads,
                                                              max_norm=grad_max_norm,
                                                              return_norm=True)

    #######################
    # get network updater #
    #######################
    # get learning rate variable
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    # get updater
    train_updates, trainer_params = updater(loss_or_grads=network_grads,
                                            params=network_params,
                                            learning_rate=train_lr,
                                            load_params_dict=load_updater_params)

    ################################
    # get network updater function #
    ################################
    training_fn = theano.function(inputs=[input_data,
                                          input_mask,
                                          target_data,
                                          target_mask],
                                  outputs=[train_ctc_cost,
                                           train_cost_per_char,
                                           train_regularizer_cost,
                                           network_grads_norm],
                                  updates=train_updates)
    return training_fn, trainer_params
    def train_sample_function(self, ew=1.0):
        '''
        Build the training function where the unlabeled expectation is
        estimated with a single sampled label (score-function estimator;
        see the consider_constant usage below).

        ew: weight on the entropy term of the unlabeled classifier output.
        '''
        print "Train Function: Estimate the Expectation of unlabeled data by Sample."
        sym_klw = T.scalar(
            'sym_klw',
            dtype=theano.config.floatX)  # symbolic scalar of warming up
        sym_sents = T.matrix('sym_s', dtype='int64')
        sym_mask = T.matrix('sym_mask', dtype=theano.config.floatX)  # one hot!
        sym_label = T.matrix('sym_label', dtype=theano.config.floatX)
        sym_sents_u = T.matrix('sym_s_u', dtype='int64')
        sym_mask_u = T.matrix('sym_mask_u', dtype=theano.config.floatX)
        # Batch sizes of the labeled and unlabeled parts.
        num_l, num_u = sym_sents.shape[0].astype(theano.config.floatX), \
                       sym_sents_u.shape[0].astype(theano.config.floatX)
        num_all = num_l + num_u

        # forward the network and get cost values
        enc_sents, dec_sents, _ = self._forward_sents(sym_sents,
                                                      dev_stage=False)
        enc_sents_u, dec_sents_u, _ = self._forward_sents(sym_sents_u,
                                                          dev_stage=False)

        # classifier loss (entropy term for the unlabeled batch)
        y_pred, loss_class, acc = self._forward_classifier(
            [enc_sents, sym_mask], sym_label, dev_stage=False)
        y_pred_u, loss_entropy, _ = self._forward_classifier(
            [enc_sents_u, sym_mask_u], None, dev_stage=False)
        sampled_label, y_pred_sampled = self._sample_one_category(y_pred_u)

        # reconstruction and kl loss
        loss_rec, loss_kl, ppl = self.cost_label(
            [sym_sents, enc_sents, dec_sents, sym_mask, sym_label],
            dev_stage=False)
        loss_rec_u, loss_kl_u, ppl_u = self.cost_label(
            [sym_sents_u, enc_sents_u, dec_sents_u, sym_mask_u, sampled_label],
            dev_stage=False)

        # use baseline
        # length normalization for unlabel
        const_Lxy = loss_rec_u / T.sum(sym_mask_u,
                                       axis=1) + sym_klw * loss_kl_u
        if self.use_baseline:
            baselines_u = self._get_baselines(
                [sym_sents_u, enc_sents_u, sym_mask_u])
            const_Lxy -= baselines_u

        # gradients, see supplementary files for detail
        all_params, params_e, params_w, params_phi, params_theta = self.get_params(tag='all'), \
        self.get_params(tag='e'), self.get_params(tag='c'), self.get_params(tag='i'), self.get_params(tag='g')
        total_cost_directly = -T.sum(loss_entropy) * ew + T.sum(
            loss_rec + sym_klw * loss_kl)
        total_cost_directly += self.alpha * T.sum(loss_class) * num_all / num_l
        total_cost_directly /= num_all
        all_grads = theano.grad(total_cost_directly, all_params)
        # const_Lxy is held constant so it acts as the "reward" weight of
        # the log-probability term.
        grad_e = theano.grad(T.sum(const_Lxy * T.log(y_pred_sampled) +
                                   loss_rec_u + sym_klw * loss_kl_u) / num_all,
                             params_e,
                             consider_constant=[const_Lxy])
        grad_w = theano.grad(T.sum(const_Lxy * T.log(y_pred_sampled)) /
                             num_all,
                             params_w,
                             consider_constant=[const_Lxy])
        grad_ig = theano.grad(T.sum(loss_rec_u + sym_klw * loss_kl_u) /
                              num_all,
                              params_phi + params_theta,
                              consider_constant=[const_Lxy])
        # combine the grads
        grad_unlabel = grad_e + grad_w + grad_ig
        all_grads = [gi + gj for gi, gj in zip(all_grads, grad_unlabel)]
        total_cost = total_cost_directly + T.sum(
            const_Lxy) / num_all  # not used in gradients
        '''
        # old cost function in AVAE
        all_params = self.get_params(tag='all')
        total_cost = T.sum(loss_rec) + T.sum(loss_rec_u) - T.sum(loss_entropy)
        total_cost += sym_klw * (T.sum(loss_kl) + T.sum(loss_kl_u))
        total_cost += self.alpha * T.sum(loss_class) * num_all / num_l
        total_cost /= num_all
        all_grads = theano.grad(total_cost, all_params)
        '''

        # Element-wise clipping first, then total-norm rescaling.
        all_grads = [
            T.clip(g, -self.grad_clipping, self.grad_clipping)
            for g in all_grads
        ]
        all_grads = total_norm_constraint(all_grads, max_norm=self.max_norm)
        updates = adam(all_grads, all_params, self.lr)
        # Exponential moving average of the unlabeled reconstruction loss,
        # used as the baseline self.b.
        update_baseline = {self.b: 0.9 * self.b + 0.1 * T.mean(loss_rec_u)}
        updates.update(update_baseline)
        train_input = [
            sym_sents, sym_mask, sym_label, sym_sents_u, sym_mask_u, sym_klw
        ]

        train_output = [
            total_cost,
            T.mean(loss_rec),
            T.mean(loss_rec_u),
            T.mean(loss_kl),
            T.mean(loss_kl_u),
            T.mean(loss_class),
            T.mean(loss_entropy), ppl, ppl_u, acc, self.b
        ]
        train_f = theano.function(inputs=train_input,
                                  outputs=train_output,
                                  updates=updates,
                                  name='train_sample')
        return train_f
def main():
    """Train a convolutional VAE on MNIST and report per-epoch log-likelihoods.

    The recognition model q(z|x) operates on an average-pooled (downsampled)
    version of the input image; the generative model p(x|z) reconstructs the
    full-resolution image.  The variational lower bound is maximised with
    adam, using gradient clipping plus a total-norm constraint.

    Relies on module-level helpers assumed to be imported elsewhere in this
    file (``data_path``, ``data.loadMNIST``, ``enumerate_reversed``,
    ``SimpleSampleLayer``, the lasagne layer classes, ``updates``) -- TODO
    confirm against the file header.
    """
    # TODO Make this work better.
    # See https://swarbrickjones.wordpress.com/2015/04/29/convolutional-autoencoders-in-pythontheanolasagne/.

    # Setup

    C = 1 # number of channels in image
    H = 28 # height of image
    W = 28 # width of image
    # K = 10 # number of classes

    shape = [C * H * W]

    padding_size = 2

    downsampling_factor = 2

    # Dense layers
    hidden_sizes = [200, 200]
    latent_size = 2

    # Convolutional layers
    filters = [{"number": 16, "size": 3, "stride": 1}]

    batch_size = 100

    analytic_kl_term = True
    learning_rate = 0.01

    N_epochs = 10 # 1000

    # Symbolic variables
    symbolic_x_LR = T.matrix()
    symbolic_x_HR = T.matrix()
    symbolic_z = T.matrix()
    symbolic_learning_rate = T.scalar('learning_rate')

    # Fix random seed for reproducibility
    numpy.random.seed(1234)

    # Data

    file_name = "mnist.pkl.gz"
    file_path = data_path(file_name)

    (X_train, y_train), (X_valid, y_valid), (X_test, y_test) = data.loadMNIST(file_path, shape)

    X_train = numpy.concatenate([X_train, X_valid])

    X_train = X_train.astype(theano.config.floatX)
    X_test = X_test.astype(theano.config.floatX)

    # BUG FIX: use floor division -- plain "/" yields a float under Python 3,
    # which makes range() in train_epoch/test_epoch below raise TypeError.
    # Under Python 2 integer inputs "//" gives the identical value.
    N_train_batches = X_train.shape[0] // batch_size
    N_test_batches = X_test.shape[0] // batch_size

    # Setup shared variables
    X_train_shared = theano.shared(X_train, borrow = True)
    X_test_shared = theano.shared(X_test, borrow = True)
    
    # Models
    
    ## Recognition model q(z|x)
    
    pool_size = 2
    
    l_enc_HR_in = InputLayer((None, C * H * W), name = "ENC_HR_INPUT")
    
    # Downsample the high-resolution input by average pooling (after padding),
    # producing the low-resolution view the encoder actually sees.
    l_enc_HR_downsample = l_enc_HR_in
    l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C, H, W))
    l_enc_HR_downsample = PadLayer(l_enc_HR_downsample, width = padding_size)
    l_enc_HR_downsample = Pool2DLayer(l_enc_HR_downsample, pool_size = downsampling_factor, mode = "average_exc_pad")
    _, _, h, w = l_enc_HR_downsample.output_shape
    l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C * h * w))
    
    l_enc_LR_in = InputLayer((None, C * h * w), name = "ENC_LR_INPUT")
    
    l_enc = l_enc_LR_in
    l_enc = ReshapeLayer(l_enc, (-1, C, h, w))
    for i, filter_ in enumerate(filters):
        l_enc = Conv2DLayer(l_enc, filter_["number"], filter_["size"], filter_["stride"], pad = "same", nonlinearity = rectify, name = 'ENC_CONV_{:d}'.format(i))
    # l_enc = Pool2DLayer(l_enc, pool_size)
    
    # Gaussian posterior parameters mu(x) and log(sigma^2(x)).
    l_z_mu = DenseLayer(l_enc, num_units = latent_size, nonlinearity = None, name = 'ENC_Z_MU')
    l_z_log_var = DenseLayer(l_enc, num_units = latent_size, nonlinearity = None, name = 'ENC_Z_LOG_VAR')
    
    # Sample the latent variables using mu(x) and log(sigma^2(x))
    l_z = SimpleSampleLayer(mean = l_z_mu, log_var = l_z_log_var) # as Kingma
    # l_z = SampleLayer(mean = l_z_mu, log_var = l_z_log_var)

    ## Generative model p(x|z)
    
    l_dec_in = InputLayer((None, latent_size), name = "DEC_INPUT")
    
    l_dec = DenseLayer(l_dec_in, num_units = C * H * W, nonlinearity = rectify, name = "DEC_DENSE")
    l_dec = ReshapeLayer(l_dec, (-1, C, H, W))
    # Mirror the encoder's convolutions in reverse order; stride > 1 layers
    # are inverted with transposed convolutions.
    for i, filter_ in enumerate_reversed(filters, start = 0):
        if filter_["stride"] == 1:
            l_dec = Conv2DLayer(l_dec, filter_["number"], filter_["size"], filter_["stride"], pad = "same", nonlinearity = rectify, name = 'DEC_CONV_{:d}'.format(i))
        else:
            l_dec = Deconv2DLayer(l_dec, filter_["number"], filter_["size"], filter_["stride"], nonlinearity = rectify, name = 'DEC_CONV_{:d}'.format(i))
    
    l_dec_x_mu = Conv2DLayer(l_dec, num_filters = C, filter_size = (3, 3), stride = 1, pad  = 'same', nonlinearity = None, name = 'DEC_X_MU')
    l_dec_x_mu = ReshapeLayer(l_dec_x_mu, (-1, C * H * W))
    
    ## Get outputs from models
    
    # With noise
    x_LR_train = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic = False)
    z_train, z_mu_train, z_log_var_train = get_output(
        [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_train}, deterministic = False
    )
    x_mu_train = get_output(l_dec_x_mu, {l_dec_in: z_train}, deterministic = False)

    # Without noise
    x_LR_eval = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic = True)
    z_eval, z_mu_eval, z_log_var_eval = get_output(
        [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_eval}, deterministic = True
    )
    x_mu_eval = get_output(l_dec_x_mu, {l_dec_in: z_eval}, deterministic = True)
    
    # Sampling (decode externally supplied latents; kept for inspection,
    # not wired into a compiled function below)
    x_mu_sample = get_output(l_dec_x_mu, {l_dec_in: symbolic_z}, deterministic = True)
    
    # Likelihood
    
    # Calculate the loglikelihood(x) = E_q[ log p(x|z) + log p(z) - log q(z|x)]
    def log_likelihood(z, z_mu, z_log_var, x_mu, x, analytic_kl_term):
        """Variational lower bound, with either the closed-form KL term or
        its single-sample Monte Carlo estimate."""
        if analytic_kl_term:
            kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis = 1)
            log_px_given_z = log_bernoulli(x, x_mu,  eps = 1e-6).sum(axis = 1)
            LL = T.mean(-kl_term + log_px_given_z)
        else:
            log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis = 1)
            log_pz = log_stdnormal(z).sum(axis = 1)
            log_px_given_z = log_bernoulli(x, x_mu,  eps = 1e-6).sum(axis = 1)
            LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
        return LL

    # log-likelihood for training
    ll_train = log_likelihood(
        z_train, z_mu_train, z_log_var_train, x_mu_train, symbolic_x_HR, analytic_kl_term)

    # log-likelihood for evaluating
    ll_eval = log_likelihood(
        z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, symbolic_x_HR, analytic_kl_term)
    
    # Parameters to train
    parameters = get_all_params([l_z_mu, l_dec_x_mu], trainable = True)
    print("Parameters that will be trained:")
    for parameter in parameters:
        print("{}: {}".format(parameter, parameter.get_value().shape))

    ### Take gradient of negative log-likelihood
    gradients = T.grad(-ll_train, parameters)

    # Adding gradient clipping to reduce the effects of exploding gradients,
    # and hence speed up convergence
    gradient_clipping = 1
    gradient_norm_max = 5
    gradient_constrained = updates.total_norm_constraint(gradients,
        max_norm = gradient_norm_max)
    gradients_clipped = [T.clip(g,-gradient_clipping, gradient_clipping) for g in gradient_constrained]
    
    # Setting up functions for training
    
    symbolic_batch_index = T.iscalar('index')
    batch_slice = slice(symbolic_batch_index * batch_size, (symbolic_batch_index + 1) * batch_size)

    update_expressions = updates.adam(gradients_clipped, parameters,
        learning_rate = symbolic_learning_rate)

    train_model = theano.function(
        [symbolic_batch_index, symbolic_learning_rate], ll_train,
        updates = update_expressions, givens = {symbolic_x_HR: X_train_shared[batch_slice]}
    )

    test_model = theano.function(
        [symbolic_batch_index], ll_eval,
        givens = {symbolic_x_HR: X_test_shared[batch_slice]}
    )
    
    def train_epoch(learning_rate):
        """One pass over the training set; returns the mean lower bound."""
        costs = []
        for i in range(N_train_batches):
            cost_batch = train_model(i, learning_rate)
            costs += [cost_batch]
        return numpy.mean(costs)
    
    def test_epoch():
        """One pass over the test set; returns the mean lower bound."""
        costs = []
        for i in range(N_test_batches):
            cost_batch = test_model(i)
            costs += [cost_batch]
        return numpy.mean(costs)
    
    # Training
    
    epochs = []
    cost_train = []
    cost_test = []

    for epoch in range(N_epochs):
        
        start = time.time()
        
        # Shuffle train data
        numpy.random.shuffle(X_train)
        X_train_shared.set_value(X_train)
        
        train_cost = train_epoch(learning_rate)
        test_cost = test_epoch()
        
        duration = time.time() - start
        
        epochs.append(epoch)
        cost_train.append(train_cost)
        cost_test.append(test_cost)
        
        # line = "Epoch: %i\tTime: %0.2f\tLR: %0.5f\tLL Train: %0.3f\tLL test: %0.3f\t" % ( epoch, t, learning_rate, train_cost, test_cost)
        print("Epoch {:d} (duration: {:.2f} s, learning rate: {:.1e}):".format(epoch + 1, duration, learning_rate))
        print("    log-likelihood: {:.3f} (training set), {:.3f} (test set)".format(train_cost, test_cost))
# 2) generator cost: E_F plus reconstruction NLL and L2 weight decay.
# E_F, NLL, l2_cost_g, cost_d and the params_* lists are assumed to be
# built earlier in this script -- TODO confirm against the surrounding code.
cost_g = E_F
cost_g += NLL
cost_g += l2_cost_g

# 3) inferencer
cost_i = NLL

############################
# Gradient & Optimization
############################

# parameter updates ##########
# Shared learning rate so it can be annealed at runtime without recompiling.
lrt = theano.shared(np.asarray(state['lr'], dtype=floatX), name='lr')

# Discriminator: constrain the total gradient norm to 500, then adam with a
# 10x smaller learning rate; the positional 0.5 is adam's beta1.
grads_d = T.grad(cost_d, params_d)
scaled_grads_d = total_norm_constraint(grads_d, 500.)
updates_d = adam(scaled_grads_d, params_d, lrt/10., 0.5)
# updates_d = adam(grads_d, params_d, lrt/10., 0.5)

# Generator: same norm constraint, full learning rate.
grads_g = T.grad(cost_g, params_g)
scaled_grads_g = total_norm_constraint(grads_g, 500.)
updates_g = adam(scaled_grads_g, params_g, lrt, 0.5)
# updates_g = adam(grads_g, params_g, lrt, 0.5)

# Inferencer: same recipe on the NLL-only cost.
grads_i = T.grad(cost_i, params_i)
scaled_grads_i = total_norm_constraint(grads_i, 500.)
updates_i = adam(scaled_grads_i, params_i, lrt, 0.5)
# updates_i = adam(grads_i, params_i, lrt, 0.5)

# Raw (pre-constraint) gradient norms, for monitoring.
gnorm_d = T.sqrt(sum(T.sum(g**2) for g in grads_d))
gnorm_g = T.sqrt(sum(T.sum(g**2) for g in grads_g))
Exemple #23
0
    def train_n_samples_function(self, n_samples=2):
        """Compile the training function that estimates the expectation over
        unlabeled-data labels with ``n_samples`` sampled categories.

        Builds separate gradient terms for the encoder (e), classifier (c),
        inferencer (i) and generator (g) parameter groups -- each with its
        own ``consider_constant`` set -- then combines them, clips, applies
        a total-norm constraint and compiles an adam update.

        Returns a theano function mapping
        (sents, mask, label, sents_u, mask_u, klw) to the total cost plus
        per-term diagnostics.
        """
        print "Train Function: Estimate the Expectation of unlabeled data by n samples."
        sym_klw = T.scalar(
            'sym_klw',
            dtype=theano.config.floatX)  # symbolic scalar of warming up
        sym_sents = T.matrix('sym_s', dtype='int64')
        sym_mask = T.matrix('sym_mask', dtype=theano.config.floatX)  # one hot!
        sym_label = T.matrix('sym_label', dtype=theano.config.floatX)
        sym_sents_u = T.matrix('sym_s_u', dtype='int64')
        sym_mask_u = T.matrix('sym_mask_u', dtype=theano.config.floatX)
        # Batch sizes (labeled / unlabeled), as floats for averaging.
        num_l, num_u = sym_sents.shape[0].astype(theano.config.floatX), \
                       sym_sents_u.shape[0].astype(theano.config.floatX)
        num_all = num_l + num_u
        # NOTE(review): self.n_samples is clamped to dim_y here, but the raw
        # n_samples argument is used in the diagnostics below -- confirm
        # whether they can legitimately differ.
        self.n_samples = min(n_samples, self.dim_y)

        # forward the network and get cost values
        enc_sents, dec_sents, _ = self._forward_sents(sym_sents,
                                                      dev_stage=False)
        enc_sents_u, dec_sents_u, _ = self._forward_sents(sym_sents_u,
                                                          dev_stage=False)
        # classifier loss
        y_pred, loss_class, acc = self._forward_classifier(
            [enc_sents, sym_mask], sym_label, dev_stage=False)
        y_pred_u, loss_entropy, _ = self._forward_classifier(
            [enc_sents_u, sym_mask_u], None, dev_stage=False)
        # Sample n label categories per unlabeled example.
        label_onehot, y_pred_sampled, y_pred_sampled_norm = self._sample_n_categories(
            y_pred_u, self.n_samples)
        # reconstruction and kl loss
        loss_rec, loss_kl, ppl = self.cost_label(
            [sym_sents, enc_sents, dec_sents, sym_mask, sym_label],
            use_baseline=self.use_baseline,
            dev_stage=False)
        loss_rec_u, loss_kl_u, ppl_u = \
            self.cost_unlabel_n_samples([sym_sents_u, enc_sents_u, dec_sents_u, sym_mask_u,
                                        label_onehot, y_pred_sampled_norm], dev_stage=False)

        # gradients, see supplementary files for detail
        all_params, params_e, params_w, params_phi, params_theta = self.get_params(tag='all'), \
        self.get_params(tag='e'), self.get_params(tag='c'), self.get_params(tag='i'), self.get_params(tag='g')
        # Labeled-data part of the objective (entropy term is subtracted).
        total_cost_directly = -self.ew * T.sum(loss_entropy) + T.sum(
            loss_rec + sym_klw * loss_kl)
        total_cost_directly += self.alpha * T.sum(loss_class) * num_all / num_l
        total_cost_directly /= num_all
        all_grads = theano.grad(total_cost_directly, all_params)

        # Const var and Baseline
        lxy = (loss_rec_u + sym_klw * loss_kl_u)  # (bs, ns)
        baseline_norm = T.sum(lxy * y_pred_sampled_norm, axis=1)  # (bs,)
        const_Lxy = (lxy - baseline_norm[:, None])
        # Per-group unlabeled gradients; each treats the sampled weights and
        # baseline as constants so the score-function estimator is correct.
        grad_e = theano.grad(
            T.sum(y_pred_sampled_norm *
                  (const_Lxy * T.log(y_pred_sampled) + lxy)) / num_all,
            params_e,
            consider_constant=[y_pred_sampled_norm, baseline_norm, const_Lxy])
        grad_w = theano.grad(
            T.sum(y_pred_sampled_norm * const_Lxy * T.log(y_pred_sampled)) /
            num_all,
            params_w,
            consider_constant=[y_pred_sampled_norm, baseline_norm, const_Lxy])
        grad_ig = theano.grad(
            T.sum(y_pred_sampled_norm * lxy) / num_all,
            params_phi + params_theta,
            consider_constant=[y_pred_sampled_norm, baseline_norm, const_Lxy])

        # combine the grads
        grad_unlabel = grad_e + grad_w + grad_ig
        all_grads = [gi + gj for gi, gj in zip(all_grads, grad_unlabel)]
        total_cost = total_cost_directly + T.sum(
            y_pred_sampled_norm * lxy) / num_all  # not used in gradients

        # Clip element-wise, then constrain the joint gradient norm.
        all_grads = [
            T.clip(g, -self.grad_clipping, self.grad_clipping)
            for g in all_grads
        ]
        all_grads = total_norm_constraint(all_grads, max_norm=self.max_norm)
        updates = adam(all_grads, all_params, self.lr)
        train_input = [
            sym_sents, sym_mask, sym_label, sym_sents_u, sym_mask_u, sym_klw
        ]
        train_output = [
            total_cost,
            T.mean(loss_rec),
            T.mean(n_samples * y_pred_sampled_norm * loss_rec_u),
            T.mean(loss_kl),
            T.mean(n_samples * y_pred_sampled_norm * loss_kl_u),
            T.mean(loss_class),
            T.mean(loss_entropy), ppl,
            T.mean(const_Lxy), acc
        ]
        train_f = theano.function(inputs=train_input,
                                  outputs=train_output,
                                  updates=updates,
                                  name='train_n_samples')
        return train_f
Exemple #24
0
def set_network_trainer(input_data,
                        input_mask,
                        target_data,
                        target_mask,
                        network_outputs,
                        updater,
                        learning_rate,
                        grad_max_norm=10.,
                        l2_lambda=1e-5,
                        var_lambda=1e-5,
                        load_updater_params=None):
    """Compile the training function for a network with inner-state outputs.

    The optimised cost is the masked frame-level cross-entropy plus an L2
    penalty (weighted by ``l2_lambda``) and a variance-ratio penalty over
    the inner hidden sequences (weighted by ``var_lambda``).  Gradients are
    rescaled so their total norm does not exceed ``grad_max_norm``.

    Returns
    -------
    (theano.function, dict)
        The compiled update function and the updater's parameter state.
    """
    final_layer = network_outputs[-1]

    # Forward pass with stochasticity enabled; the last entry is the
    # network prediction, the preceding ones are inner hidden sequences.
    forward_outputs = get_output(network_outputs, deterministic=False)
    prediction = forward_outputs[-1]
    prediction_idx = T.argmax(prediction, axis=-1)

    # Masked frame-level cross-entropy, averaged over unmasked frames.
    flat_prediction = T.reshape(prediction,
                                (-1, prediction.shape[-1])) + eps
    ce_cost = categorical_crossentropy(predictions=flat_prediction,
                                       targets=T.flatten(target_data, 1))
    ce_cost = ce_cost * T.flatten(target_mask, 1)
    ce_cost = ce_cost.sum() / target_mask.sum()

    # L2 weight decay over all network parameters.
    l2_cost = regularize_network_params(final_layer, penalty=l2)

    # Variance-ratio ("fisher") penalty: for each inner sequence, the ratio
    # of the variance over time to the batch variance of the time-mean.
    inner_sequences = forward_outputs[:-1]
    fisher_cost = 0.
    for hidden_seq in inner_sequences:
        time_mean = T.mean(input=hidden_seq, axis=1)
        batch_var_of_mean = T.var(time_mean, axis=0)

        time_var = T.var(input=hidden_seq, axis=1)
        batch_mean_of_var = T.mean(time_var, axis=0)

        fisher_cost += T.mean(batch_mean_of_var / (batch_var_of_mean + eps))
    fisher_cost /= len(inner_sequences)

    trainable = get_all_params(final_layer, trainable=True)

    # Norm-constrained gradients of the combined cost.
    combined_cost = (ce_cost +
                     l2_cost * l2_lambda +
                     fisher_cost * var_lambda)
    grads = theano.grad(cost=combined_cost, wrt=trainable)
    grads, grads_norm = total_norm_constraint(tensor_vars=grads,
                                              max_norm=grad_max_norm,
                                              return_norm=True)

    # Shared learning rate so it can be changed without recompiling.
    lr_shared = theano.shared(lasagne.utils.floatX(learning_rate))
    update_rules, updater_state = updater(loss_or_grads=grads,
                                          params=trainable,
                                          learning_rate=lr_shared,
                                          load_params_dict=load_updater_params)

    trainer_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[
            prediction, prediction_idx, ce_cost,
            l2_cost, fisher_cost, grads_norm
        ],
        updates=update_rules,
        allow_input_downcast=True)
    return trainer_fn, updater_state
Exemple #25
0
def lasagne_model(model_base, model_flavor, **params):
    """Build train/eval/predict closures around a lasagne network.

    Parameters
    ----------
    model_base : dict
        Must provide the layers 'A_net', 'transform' and 'net_out'.
    model_flavor : str
        Recorded on the returned ``Model``.
    **params
        Optional 'overwrite' (bool, default True) controls weight saving;
        the whole dict is forwarded on the returned ``Model``.

    Returns
    -------
    Model
        Record exposing batch/predict/transform/save/load callables.

    Notes
    -----
    Relies on module-level names ``update_lr``, ``beta_1``, ``beta_2``,
    ``tol``, ``int32``, ``layer_output``, ``save_all_weights``,
    ``load_all_weights`` and ``Model`` -- assumed defined elsewhere in this
    file (TODO confirm).
    """
    import theano
    theano.config.floatX = 'float32'

    from theano import function as tfunction, shared as tshared
    from theano.tensor import tensor4, imatrix, nnet
    from theano.tensor import grad as Tgrad, mean as Tmean, reshape as Treshape

    from lasagne.utils import floatX
    from lasagne.updates import adam as lasagne_adam, total_norm_constraint
    from lasagne.layers import get_output as ll_output, \
        get_all_params as ll_all_params

    max_norm = 5.0

    overwrite = params.get('overwrite', True)

    sym_x = tensor4()  # [nbatch,imgchan,imgrows,imgcols] dims
    sym_y = imatrix()  # one-hot vector of [nb_class x 1] dims

    l_A_net = model_base['A_net']
    l_transform = model_base['transform']
    l_out = model_base['net_out']
    output_train = ll_output(l_out, sym_x, deterministic=False)
    # NOTE(review): lasagne layers normally expose ``output_shape``; check
    # whether ``l_out.shape`` should be ``l_out.output_shape`` here.
    output_shape = (-1, l_out.shape[1])  # nb_classes = l_out.shape[1]
    # BUG FIX: the imports above bind Treshape/Tmean/Tgrad (capital T); the
    # original called undefined lowercase treshape/tmean/tgrad -> NameError.
    output_flat = Treshape(output_train, output_shape)
    output_loss = nnet.categorical_crossentropy
    output_cost = Tmean(output_loss(output_flat + tol, sym_y.flatten()))

    trainable_params = ll_all_params(l_out, trainable=True)

    all_grads = Tgrad(output_cost, trainable_params)
    # Rescale gradients so their joint norm does not exceed max_norm.
    constrained_grads, norm = total_norm_constraint(all_grads,
                                                    max_norm=max_norm,
                                                    return_norm=True)

    shared_lr = tshared(floatX(update_lr))
    updates = lasagne_adam(constrained_grads,
                           trainable_params,
                           learning_rate=shared_lr,
                           beta_1=beta_1,
                           beta_2=beta_2,
                           epsilon=tol)

    model_train = tfunction([sym_x, sym_y], [output_cost, output_train, norm],
                            updates=updates)

    output_eval, l_A_eval = ll_output([l_out, l_A_net],
                                      sym_x,
                                      deterministic=True)
    model_eval = tfunction(
        [sym_x],
        [output_eval.reshape(output_shape),
         l_A_eval.reshape(output_shape)])
    model_batch = lambda X, y: model_train(X, int32(y))[0]
    model_pred = lambda X: model_eval(X)[0]
    model_xform = lambda X: layer_output(X, l_transform)
    model_save = lambda outf: save_all_weights(
        l_out, outf, overwrite=overwrite)
    model_load = lambda weightf: load_all_weights(l_out, weightf)

    return Model(package='lasagne',
                 backend='theano',
                 flavor=model_flavor,
                 base=model_base,
                 batch=model_batch,
                 predict=model_pred,
                 transform=model_xform,
                 save=model_save,
                 load=model_load,
                 params=params)
    def __init__(self,
                 atari_env,
                 state_dimension,
                 action_dimension,
                 monitor_env=False,
                 learning_rate=0.001,
                 critic_update=10,
                 train_step=1,
                 gamma=0.95,
                 eps_max=1.0,
                 eps_min=0.1,
                 eps_decay=10000,
                 n_epochs=10000,
                 batch_size=32,
                 buffer_size=50000):
        """Create the gym environment and compile the Q-learning graph.

        Builds two convolutional Q-networks (online and target), the greedy
        action function ``self.mu`` and the squared-TD-error update function
        ``self.update_network`` (rmsprop with norm-constrained gradients).
        """
        self.env = gym.make(atari_env)
        # NOTE(review): this branch is a no-op -- monitoring support was
        # presumably stripped out; confirm whether it should wrap the env.
        if monitor_env:
            None

        self.state_dimension = state_dimension
        self.action_dimension = action_dimension
        self.learning_rate = learning_rate
        self.critic_update = critic_update
        self.train_step = train_step
        self.gamma = gamma
        self.eps_max = eps_max
        self.eps_min = eps_min
        self.eps_decay = eps_decay
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        self.experience_replay = []

        def q_network(state):
            # Convolutional Q-value network: state -> one value per action.
            input_state = InputLayer(input_var=state,
                                     shape=(None, self.state_dimension[0],
                                            self.state_dimension[1],
                                            self.state_dimension[2]))

            # Move channels to axis 1 (NHWC -> NCHW) for the conv layers.
            input_state = DimshuffleLayer(input_state, pattern=(0, 3, 1, 2))

            conv = Conv2DLayer(input_state,
                               num_filters=32,
                               filter_size=(8, 8),
                               stride=(4, 4),
                               nonlinearity=rectify)

            conv = Conv2DLayer(conv,
                               num_filters=64,
                               filter_size=(4, 4),
                               stride=(2, 2),
                               nonlinearity=rectify)

            conv = Conv2DLayer(conv,
                               num_filters=64,
                               filter_size=(3, 3),
                               stride=(1, 1),
                               nonlinearity=rectify)

            flatten = FlattenLayer(conv)

            dense = DenseLayer(flatten, num_units=512, nonlinearity=rectify)

            q_values = DenseLayer(dense,
                                  num_units=self.action_dimension,
                                  nonlinearity=linear)

            return q_values

        # Symbolic inputs for one training batch of transitions.
        self.X_state = T.ftensor4()
        self.X_action = T.bvector()
        self.X_reward = T.fvector()
        self.X_next_state = T.ftensor4()
        self.X_done = T.bvector()

        self.X_action_hot = to_one_hot(self.X_action, self.action_dimension)

        # NOTE(review): q_ and q_target_ are two independently initialised
        # networks; no weight-copy step is visible here -- confirm the
        # target-network sync happens elsewhere (e.g. via critic_update).
        self.q_ = q_network(self.X_state)
        self.q = get_output(self.q_)
        self.q_target_ = q_network(self.X_next_state)
        self.q_target = get_output(self.q_target_)
        self.q_max = T.max(self.q_target, axis=1)
        self.action = T.argmax(self.q, axis=1)

        # Greedy policy: state -> best action under the online network.
        self.mu = theano.function(inputs=[self.X_state],
                                  outputs=self.action,
                                  allow_input_downcast=True)

        # Squared TD error; (1 - done) zeroes the bootstrap term at episode end.
        self.loss = squared_error(
            self.X_reward + self.gamma * self.q_max * (1.0 - self.X_done),
            T.batched_dot(self.q, self.X_action_hot))
        self.loss = self.loss.mean()

        self.params = get_all_params(self.q_)

        self.grads = T.grad(self.loss, self.params)

        # Rescale gradients to a maximum total norm of 1.0.
        self.normed_grads = total_norm_constraint(self.grads, 1.0)

        self.updates = rmsprop(self.normed_grads,
                               self.params,
                               learning_rate=self.learning_rate)

        # One training step over a batch of transitions; returns the loss.
        self.update_network = theano.function(inputs=[
            self.X_state, self.X_action, self.X_reward, self.X_next_state,
            self.X_done
        ],
                                              outputs=self.loss,
                                              updates=self.updates,
                                              allow_input_downcast=True)
Exemple #27
0
# Q-learning graph pieces.  q, q_target, q_, X_state, X_action, X_reward,
# X_next_state, X_done, X_action_hot, gamma and learning_rate are assumed to
# be defined earlier in this script -- TODO confirm.
q_max     = T.max(q_target, axis=1)
action    = T.argmax(q, axis=1)

# Greedy policy: state -> arg-max action under the online network.
mu = theano.function(inputs               = [X_state],
                     outputs              = action,
                     allow_input_downcast = True)

# Squared TD error; (1.0 - X_done) removes the bootstrap term at episode end.
loss = squared_error(X_reward + gamma * q_max * (1.0 - X_done), T.batched_dot(q, X_action_hot))
loss = loss.mean()

params = get_all_params(q_)

grads        = T.grad(loss,
                      params)

# Rescale gradients to a maximum total norm of 1.0.
normed_grads = total_norm_constraint(grads, 1.0)

updates = adam(normed_grads,
               params,
               learning_rate = learning_rate)

# One training step over a batch of transitions; returns the scalar loss.
update_network = theano.function(inputs               = [X_state,
                                                         X_action,
                                                         X_reward,
                                                         X_next_state,
                                                         X_done],
                                 outputs              = loss,
                                 updates              = updates,
                                 allow_input_downcast = True)

def get_action(state, step):
Exemple #28
0
def set_network_trainer(input_data,
                        input_mask,
                        target_data,
                        target_mask,
                        num_outputs,
                        network,
                        rand_layer_list,
                        updater,
                        learning_rate,
                        grad_max_norm=10.,
                        l2_lambda=1e-5,
                        load_updater_params=None):
    """Compile the training (update) function for ``network``.

    The optimised cost is a per-sequence masked cross-entropy (computed via
    a numerically stable log-softmax) plus an L2 penalty weighted by
    ``l2_lambda``.  When ``grad_max_norm`` is positive the gradients are
    rescaled to that total norm; otherwise only the raw norm is reported.

    Returns
    -------
    (theano.function, dict)
        The compiled update function -- outputting the frame-level cost,
        the gradient norm and one skip ratio per layer in
        ``rand_layer_list`` -- and the updater's parameter state.
    """
    # One-hot encode the flattened targets.
    flat_targets = T.flatten(target_data, 1)
    targets_one_hot = T.extra_ops.to_one_hot(y=flat_targets,
                                             nb_class=num_outputs,
                                             dtype=floatX)

    # Forward pass with stochasticity enabled.
    logits = get_output(network, deterministic=False)
    num_seqs = logits.shape[0]

    # Numerically stable log-softmax over the class axis.
    logits = T.reshape(x=logits, newshape=(-1, num_outputs), ndim=2)
    logits = logits - T.max(logits, axis=-1, keepdims=True)
    log_probs = logits - T.log(
        T.sum(T.exp(logits), axis=-1, keepdims=True))

    # Masked cross-entropy per frame.
    frame_ce = -T.sum(T.mul(targets_one_hot, log_probs), axis=-1)
    frame_ce = frame_ce * T.flatten(target_mask, 1)
    # Two normalisations: per sequence (training cost) and per frame (report).
    seq_cost = frame_ce.sum() / num_seqs
    frame_cost = frame_ce.sum() / target_mask.sum()

    # L2 weight decay over all network parameters.
    weight_cost = regularize_network_params(network, penalty=l2)

    trainable = get_all_params(network, trainable=True)

    grads = theano.grad(cost=seq_cost + weight_cost * l2_lambda,
                        wrt=trainable)

    if grad_max_norm > 0.:
        # Rescale so the joint gradient norm does not exceed the ceiling.
        grads, grads_norm = total_norm_constraint(tensor_vars=grads,
                                                  max_norm=grad_max_norm,
                                                  return_norm=True)
    else:
        # No constraint; just report the raw norm.
        grads_norm = T.sqrt(sum(T.sum(g ** 2) for g in grads))

    # Shared learning rate so it can be changed without recompiling.
    lr_shared = theano.shared(lasagne.utils.floatX(learning_rate))
    update_rules, updater_state = updater(loss_or_grads=grads,
                                          params=trainable,
                                          learning_rate=lr_shared,
                                          load_params_dict=load_updater_params)

    # Masked average of each stochastic layer's skip indicator.
    skip_comp_list = [
        T.sum(layer.skip_comp * input_mask) / T.sum(input_mask)
        for layer in rand_layer_list
    ]

    trainer_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[frame_cost, grads_norm] + skip_comp_list,
        updates=update_rules)
    return trainer_fn, updater_state
def main():
    """
    Train and evaluate VAE-style super-resolution models on MNIST.

    Loops over every combination of latent size and downsampling factor,
    trains an encoder (on downsampled images) / decoder (to full-resolution
    images) pair for ``N_epochs`` epochs, and pickles the setup, learning
    curves, reconstructions and (for 2-D latents) manifold samples to one
    results file per run.

    NOTE: this module targets Python 2 (bare ``print`` statements and
    reliance on integer division for batch counts and image sizes).
    """

    # Main setup

    latent_sizes = [2, 5, 10, 20, 30, 50, 100]
    downsampling_factors = [1, 2, 4]
    N_epochs = 50
    binarise_downsampling = False
    bernoulli_sampling = True

    # Setup

    C = 1 # number of channels in image
    H = 28 # height of image
    W = 28 # width of image
    # K = 10 # number of classes

    hidden_sizes = [200, 200]

    batch_size = 100

    analytic_kl_term = True
    learning_rate = 0.001 #0.0003

    shape = [H * W * C]

    # Symbolic variables
    symbolic_x_LR = T.matrix()
    symbolic_x_HR = T.matrix()
    symbolic_z = T.matrix()
    symbolic_learning_rate = T.scalar('learning_rate')

    # Fix random seed for reproducibility
    numpy.random.seed(1234)

    # Data

    file_name = "mnist.pkl.gz"
    file_path = data_path(file_name)

    (X_train, y_train), (X_valid, y_valid), (X_test, y_test) = data.loadMNIST(file_path, shape)

    X_train = numpy.concatenate([X_train, X_valid])

    X_train = X_train.astype(theano.config.floatX)
    X_test = X_test.astype(theano.config.floatX)

    # Integer division (Python 2): any incomplete final batch is dropped.
    N_train_batches = X_train.shape[0] / batch_size
    N_test_batches = X_test.shape[0] / batch_size

    if bernoulli_sampling:
        preprocess = bernoullisample
    else:
        preprocess = numpy.round

    # Setup shared variables
    X_train_shared = theano.shared(preprocess(X_train), borrow = True)
    X_test_shared = theano.shared(preprocess(X_test), borrow = True)
    X_test_shared_fixed = theano.shared(numpy.round(X_test), borrow = True)
    X_test_shared_normal = theano.shared(X_test, borrow = True)

    all_runs_duration = 0

    for latent_size, downsampling_factor in product(latent_sizes, downsampling_factors):

        run_start = time.time()

        print("Training model with a latent size of {} and images downsampled by {}:\n".format(latent_size, downsampling_factor))

        # Models

        # Low-resolution image size (integer division under Python 2).
        h = H / downsampling_factor
        w = W / downsampling_factor

        ## Recognition model q(z|x)

        l_enc_HR_in = InputLayer((None, H * W * C), name = "ENC_HR_INPUT")

        l_enc_HR_downsample = l_enc_HR_in

        l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C, H, W))
        if downsampling_factor != 1:
            l_enc_HR_downsample = Pool2DLayer(l_enc_HR_downsample, pool_size = downsampling_factor, mode = "average_exc_pad")
            # TODO Should downsampled data be binarised? (worse performance)
            if binarise_downsampling:
                l_enc_HR_downsample = NonlinearityLayer(l_enc_HR_downsample, nonlinearity = T.round)
        l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, h * w * C))

        l_enc_LR_in = InputLayer((None, h * w * C), name = "ENC_LR_INPUT")

        l_enc = l_enc_LR_in

        for i, hidden_size in enumerate(hidden_sizes, start = 1):
            l_enc = DenseLayer(l_enc, num_units = hidden_size, nonlinearity = softplus, name = 'ENC_DENSE{:d}'.format(i))

        l_z_mu = DenseLayer(l_enc, num_units = latent_size, nonlinearity = identity, name = 'ENC_Z_MU')
        l_z_log_var = DenseLayer(l_enc, num_units = latent_size, nonlinearity = identity, name = 'ENC_Z_LOG_VAR')

        # Sample the latent variables using mu(x) and log(sigma^2(x))
        l_z = SimpleSampleLayer(mean = l_z_mu, log_var = l_z_log_var) # as Kingma
        # l_z = SampleLayer(mean = l_z_mu, log_var = l_z_log_var)

        ## Generative model p(x|z)

        l_dec_in = InputLayer((None, latent_size), name = "DEC_INPUT")

        l_dec = l_dec_in

        for i, hidden_size in enumerate_reversed(hidden_sizes, start = 0):
            l_dec = DenseLayer(l_dec, num_units = hidden_size, nonlinearity = softplus, name = 'DEC_DENSE{:d}'.format(i))

        l_dec_x_mu = DenseLayer(l_dec, num_units = H * W * C, nonlinearity = sigmoid, name = 'DEC_X_MU')
        # BUG FIX: this layer was previously also named 'DEC_X_MU' (copy-paste
        # duplicate), which made the two output layers indistinguishable by name.
        l_dec_x_log_var = DenseLayer(l_dec, num_units = H * W * C, nonlinearity = sigmoid, name = 'DEC_X_LOG_VAR')

        # TRY relu instead of softplus (maybe with more hidden units)
        # TRY softmax instead of sigmoid
        # PROBLEM with this is that we have several pixels activated.

        ## Get outputs from models

        # With noise
        x_LR_train = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic = False)
        z_train, z_mu_train, z_log_var_train = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_train}, deterministic = False
        )
        x_mu_train, x_log_var_train = get_output([l_dec_x_mu, l_dec_x_log_var], {l_dec_in: z_train}, deterministic = False)

        # Without noise
        x_LR_eval = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic = True)
        z_eval, z_mu_eval, z_log_var_eval = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_eval}, deterministic = True
        )
        x_mu_eval, x_log_var_eval = get_output([l_dec_x_mu, l_dec_x_log_var], {l_dec_in: z_eval}, deterministic = True)

        # Sampling
        x_mu_sample = get_output(l_dec_x_mu, {l_dec_in: symbolic_z},
            deterministic = True)

        # Likelihood

        # Calculate the loglikelihood(x) = E_q[ log p(x|z) + log p(z) - log q(z|x)]
        def log_likelihood(z, z_mu, z_log_var, x_mu, x_log_var, x, analytic_kl_term):
            if analytic_kl_term:
                kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis = 1)
                log_px_given_z = log_bernoulli(x, x_mu,  eps = 1e-6).sum(axis = 1)
                # log_px_given_z = log_normal2(x, x_mu, x_log_var).sum(axis = 1)
                LL = T.mean(-kl_term + log_px_given_z)
            else:
                log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis = 1)
                log_pz = log_stdnormal(z).sum(axis = 1)
                log_px_given_z = log_bernoulli(x, x_mu,  eps = 1e-6).sum(axis = 1)
                # log_px_given_z = log_normal2(x, x_mu, x_log_var).sum(axis = 1)
                LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
            return LL

        # log-likelihood for training
        ll_train = log_likelihood(
            z_train, z_mu_train, z_log_var_train, x_mu_train, x_log_var_train, symbolic_x_HR, analytic_kl_term)

        # log-likelihood for evaluating
        ll_eval = log_likelihood(
            z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, x_log_var_eval, symbolic_x_HR, analytic_kl_term)

        # Parameters to train
        parameters = get_all_params([l_z, l_dec_x_mu], trainable = True)
        # parameters = get_all_params([l_z, l_dec_x_mu, l_dec_x_log_var], trainable = True)
        print("Parameters that will be trained:")
        for parameter in parameters:
            print("{}: {}".format(parameter, parameter.get_value().shape))

        ### Take gradient of negative log-likelihood
        gradients = T.grad(-ll_train, parameters)

        # Adding gradient clipping to reduce the effects of exploding gradients,
        # and hence speed up convergence
        gradient_clipping = 1
        gradient_norm_max = 5
        gradient_constrained = updates.total_norm_constraint(gradients,
            max_norm = gradient_norm_max)
        gradients_clipped = [T.clip(g,-gradient_clipping, gradient_clipping) for g in gradient_constrained]

        # Setting up functions for training

        symbolic_batch_index = T.iscalar('index')
        batch_slice = slice(symbolic_batch_index * batch_size, (symbolic_batch_index + 1) * batch_size)

        update_expressions = updates.adam(gradients_clipped, parameters,
            learning_rate = symbolic_learning_rate)

        train_model = theano.function(
            [symbolic_batch_index, symbolic_learning_rate], ll_train,
            updates = update_expressions, givens = {symbolic_x_HR: X_train_shared[batch_slice]}
        )

        test_model = theano.function(
            [symbolic_batch_index], ll_eval,
            givens = {symbolic_x_HR: X_test_shared[batch_slice]}
        )

        test_model_fixed = theano.function(
            [symbolic_batch_index], ll_eval,
            givens = {symbolic_x_HR: X_test_shared_fixed[batch_slice]}
        )

        def train_epoch(learning_rate):
            costs = []
            for i in range(N_train_batches):
                cost_batch = train_model(i, learning_rate)
                costs += [cost_batch]
            return numpy.mean(costs)

        def test_epoch():
            costs = []
            for i in range(N_test_batches):
                cost_batch = test_model(i)
                costs += [cost_batch]
            return numpy.mean(costs)

        def test_epoch_fixed():
            costs = []
            for i in range(N_test_batches):
                cost_batch = test_model_fixed(i)
                costs += [cost_batch]
            return numpy.mean(costs)

        # Training

        epochs = []
        cost_train = []
        cost_test = []

        print

        for epoch in range(N_epochs):

            epoch_start = time.time()

            # Shuffle train data
            numpy.random.shuffle(X_train)
            X_train_shared.set_value(preprocess(X_train))

            # TODO: Using dynamically changed learning rate
            train_cost = train_epoch(learning_rate)
            test_cost = test_epoch()
            test_cost_fixed = test_epoch_fixed()

            epoch_duration = time.time() - epoch_start

            epochs.append(epoch + 1)
            cost_train.append(train_cost)
            cost_test.append(test_cost)

            # line = "Epoch: %i\tTime: %0.2f\tLR: %0.5f\tLL Train: %0.3f\tLL test: %0.3f\t" % ( epoch, t, learning_rate, train_cost, test_cost)
            print("Epoch {:d} (duration: {:.2f} s, learning rate: {:.1e}):".format(epoch + 1, epoch_duration, learning_rate))
            print("    log-likelihood: {:.3f} (training set), {:.3f} (test set)".format(train_cost, test_cost))

        print

        # Results

        ## Reconstruction

        N_reconstructions = 50

        X_test_eval = X_test_shared.eval()
        X_test_eval_fixed = X_test_shared_fixed.eval()
        X_test_eval_normal = X_test_shared_normal.eval()

        subset = numpy.random.randint(0, len(X_test_eval), size = N_reconstructions)

        x_original = X_test_eval[numpy.array(subset)]
        x_LR = get_output(l_enc_HR_downsample, x_original).eval()
        z = get_output(l_z, x_LR).eval()
        x_reconstructed = x_mu_sample.eval({symbolic_z: z})

        x_original_fixed = X_test_eval_fixed[numpy.array(subset)]
        x_LR_fixed = get_output(l_enc_HR_downsample, x_original_fixed).eval()
        z_fixed = get_output(l_z, x_LR_fixed).eval()
        x_reconstructed_fixed = x_mu_sample.eval({symbolic_z: z_fixed})

        originals = X_test_eval_normal[numpy.array(subset)]

        reconstructions = {
            "originals": x_original,
            "downsampled":  x_LR,
            "reconstructions": x_reconstructed
        }

        reconstructions_fixed = {
            "originals": x_original_fixed,
            "downsampled":  x_LR_fixed,
            "reconstructions": x_reconstructed_fixed
        }

        ## Manifold

        if latent_size == 2:

            x = numpy.linspace(0.1, 0.9, 20)
            # TODO: Ideally sample from the real p(z)
            v = gaussian.ppf(x)
            z = numpy.zeros((20**2, 2))

            i = 0
            for a in v:
                for b in v:
                    z[i,0] = a
                    z[i,1] = b
                    i += 1
            z = z.astype('float32')

            samples = x_mu_sample.eval({symbolic_z: z})

        else:
            samples = None

        ## Reconstructions of homemade numbers

        if downsampling_factor == 2:

            file_names = [
                "hm_7_Avenir.png",
                "hm_7_Noteworthy.png",
                "hm_7_Chalkboard.png",
                "hm_7_drawn.png",
                "hm_A_Noteworthy.png",
                "hm_A_drawn.png",
                "hm_7_0.txt",
                "hm_7_1.txt",
                "hm_7_2.txt",
                "hm_A.txt"
            ]

            x_LR_HM = data.loadHomemade(map(data_path, file_names), [h * w])

            z = get_output(l_z, x_LR_HM).eval()
            x_HM_reconstructed = x_mu_sample.eval({symbolic_z: z})

            reconstructions_homemade = {
                "originals": x_LR_HM,
                "reconstructions": x_HM_reconstructed
            }

        else:
            reconstructions_homemade = None

        # Saving

        setup_and_results = {
            "setup": {
                "image size": (C, H, W),
                "downsampling factor": downsampling_factor,
                "learning rate": learning_rate,
                "analytic K-L term": analytic_kl_term,
                "batch size": batch_size,
                "hidden layer sizes": hidden_sizes,
                "latent size": latent_size,
                "number of epochs": N_epochs
            },
            "results": {
                "learning curve": {
                    "epochs": epochs,
                    "training cost function": cost_train,
                    "test cost function": cost_test
                },
                "originals": originals,
                "reconstructions": reconstructions,
                "reconstructions (fixed)": reconstructions_fixed,
                "manifold": {
                    "samples": samples
                },
                "reconstructed homemade numbers": reconstructions_homemade
            }
        }

        file_name = "results{}_ds{}{}_l{}_e{}.pkl".format("_bs" if bernoulli_sampling else "", downsampling_factor, "b" if binarise_downsampling else "", latent_size, N_epochs)

        # NOTE(review): text-mode "w" works for pickle under Python 2 only;
        # use "wb" if this is ever ported to Python 3.
        with open(data_path(file_name), "w") as f:
            pickle.dump(setup_and_results, f)

        run_duration = time.time() - run_start

        all_runs_duration += run_duration

        print("Run took {:.2f} minutes.".format(run_duration / 60))

        print("\n")

    print("All runs took {:.2f} minutes in total.".format(all_runs_duration / 60))
Exemple #30
0
def predict (LD, output_dir, basename):
	"""
	Train a small feed-forward regression network and write predictions.

	:param LD: dataset object whose ``data`` dict holds sparse matrices
	    'X_train', 'X_valid', 'X_test' and labels 'Y_train'
	    (assumed scipy sparse — they are ``.toarray()``-ed below; TODO confirm)
	:param output_dir: directory the ``.predict`` files are written to
	:param basename: prefix for the output file names
	"""
	import os
	import numpy as np
	import random
	# Seed numpy and stdlib RNGs so feature selection, weight init and
	# batch shuffling are reproducible across runs.
	np.random.seed(0)
	random.seed(0)
	import data_converter
	from sklearn import preprocessing, decomposition
	from sklearn.utils import shuffle
	import time
	from sklearn.externals import joblib
	
	from lasagne import layers
	from lasagne.updates import nesterov_momentum
	from lasagne.updates import norm_constraint, total_norm_constraint
	import lasagne
	import theano
	import theano.tensor as T
	from lasagne.regularization import regularize_layer_params, regularize_layer_params_weighted, l2, l1


	# Shuffle the training split once (fixed seed) before any slicing.
	LD.data['X_train'], LD.data['Y_train'] = shuffle(LD.data['X_train'], LD.data['Y_train'] , random_state=1)
	X_train = LD.data['X_train']
	X_valid = LD.data['X_valid']
	X_test = LD.data['X_test']
	

	# Keep only the first 2000 columns, then densify the sparse matrices.
	X_train = X_train[:, 0:2000]
	X_valid = X_valid[:, 0:2000]
	X_test = X_test[:, 0:2000]
	X_train = X_train.toarray()
	X_valid = X_valid.toarray()
	X_test = X_test.toarray()
	
	
	
	# PCA fitted on train only; the same projection is applied to all splits.
	fs = decomposition.PCA(n_components=100)
	fs.fit(X_train)
	
	X_train2 = fs.transform(X_train)
	X_valid2 = fs.transform(X_valid)
	X_test2 = fs.transform(X_test)
	
	# Final features = first 200 raw columns + 100 PCA components.
	X_train = X_train[:, 0:200]
	X_valid = X_valid[:, 0:200]
	X_test = X_test[:, 0:200]
	
	X_train = np.float32(X_train)
	X_valid = np.float32(X_valid)
	X_test = np.float32(X_test)
	
	X_train = np.hstack([X_train, X_train2])
	X_valid = np.hstack([X_valid, X_valid2])
	X_test = np.hstack([X_test, X_test2])
	
	
	# Standardise (zero mean, unit variance) using train statistics only.
	normx = preprocessing.StandardScaler()
	
	normx.fit(X_train)
	X_train = normx.transform(X_train)
	X_valid = normx.transform(X_valid)
	X_test = normx.transform(X_test)
	
	X_train = np.float32(X_train)
	X_valid = np.float32(X_valid)
	X_test = np.float32(X_test)
	
	print "p5"
	
	# Targets as a float32 column vector for squared-error regression.
	y_train = np.copy(LD.data['Y_train'])
	y_train = np.float32(y_train)
	y_train = y_train.reshape((-1, 1))
	
	# Yield shuffled mini-batches of size csize; a trailing partial batch
	# is dropped by the range stop of X.shape[0] - csize + 1.
	def batches(X, y, csize, rs):
		X, y = shuffle(X, y, random_state=rs)
		for cstart in range(0, X.shape[0] - csize+1, csize):
			Xc = X[cstart:cstart+csize] 
			yc = y[cstart:cstart+csize]
			yield  Xc, yc
	
	input_var = T.matrix('inputs')
	target_var = T.matrix('targets')
	
	# NOTE(review): InputLayer takes no nonlinearity/W — these kwargs look
	# copied from a DenseLayer; verify they are accepted/ignored by lasagne.
	l_in = lasagne.layers.InputLayer(shape=(None, X_train.shape[1]),
	     input_var=input_var,
	     nonlinearity=None,
	     W=lasagne.init.Sparse())
	     
	l_hid1 = lasagne.layers.DenseLayer(
	    l_in, num_units= 100,
	    nonlinearity=lasagne.nonlinearities.sigmoid,
	    W=lasagne.init.Sparse())
	    
	l_hid2 = lasagne.layers.DenseLayer(
	    l_hid1, num_units= 40,
	    nonlinearity=lasagne.nonlinearities.tanh,
	    W=lasagne.init.GlorotUniform()
	    )

	Lnum_out_units = 1
	
	# Linear output layer: single regression target.
	l_out = lasagne.layers.DenseLayer(
		l_hid2, num_units=Lnum_out_units,
		nonlinearity=None)

	network = l_out
	
	prediction = lasagne.layers.get_output(network)

	# Mean squared error between network output and targets.
	loss = lasagne.objectives.squared_error(prediction, target_var)
	loss = loss.mean()
	
	# SGD on gradients rescaled so their total norm is at most 100.
	params = lasagne.layers.get_all_params(network, trainable=True)
	all_grads = T.grad(loss, params)
	scaled_grads = total_norm_constraint(all_grads, 100)
	updates = lasagne.updates.sgd(scaled_grads, params, learning_rate=0.001)
	
	train_fn = theano.function([input_var, target_var], loss, updates=updates)

	# 1200 epochs of mini-batch SGD; the epoch index reseeds the batch
	# shuffle so each epoch sees a different batch order.
	for epoch in range(1200):
		train_err = 0
		train_batches = 0
		for batch in batches(X_train, y_train, 100, epoch):
		    Xt, yt = batch
		    train_err += train_fn(Xt, yt)
		    train_batches += 1
		
	# Deterministic forward pass for inference on valid/test splits.
	xml1 = T.matrix('xml1')
	Xlt1 = lasagne.layers.get_output(l_out, xml1, deterministic=True)
	f2 = theano.function([xml1], Xlt1)
	preds_valid = f2(X_valid).ravel()
	preds_test = f2(X_test).ravel()
		

	import data_io
	
	# Write predictions in the challenge's expected file naming scheme.
	cycle = 0 
	filename_valid = basename + '_valid_' + str(cycle).zfill(3) + '.predict'
	data_io.write(os.path.join(output_dir,filename_valid), preds_valid)
	filename_test = basename + '_test_' + str(cycle).zfill(3) + '.predict'
	data_io.write(os.path.join(output_dir,filename_test), preds_test)
Exemple #31
0
    def build_model(self,
                    train_set_unlabeled,
                    train_set_labeled,
                    test_set,
                    validation_set=None):
        """
        Build the auxiliary deep generative model from the initialized hyperparameters.
        Define the lower bound term and compile it into a training function.
        :param train_set_unlabeled: Unlabeled train set containing variables x, t.
        :param train_set_labeled: Labeled train set containing variables x, t.
        :param test_set: Test set containing variables x, t.
        :param validation_set: Validation set containing variables x, t.
        :return: train, test, validation function and dicts of arguments.
        """
        super(CSDGM, self).build_model(train_set_unlabeled, test_set,
                                       validation_set)

        # Labeled data lives in its own shared variables; the unlabeled data
        # is handled by the superclass (self.sh_train_x).
        sh_train_x_l = theano.shared(np.asarray(train_set_labeled[0],
                                                dtype=theano.config.floatX),
                                     borrow=True)
        sh_train_t_l = theano.shared(np.asarray(train_set_labeled[1],
                                                dtype=theano.config.floatX),
                                     borrow=True)
        n = self.sh_train_x.shape[0].astype(
            theano.config.floatX)  # no. of data points
        n_l = sh_train_x_l.shape[0].astype(
            theano.config.floatX)  # no. of labeled data points

        # Define the layers for the density estimation used in the lower bound.
        l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu,
                                           self.l_qa_logvar)
        l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu,
                                           self.l_qz_logvar)
        l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8)

        l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
        l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu,
                                           self.l_pa_logvar)

        # Reconstruction log-density p(x|z,y); the layer choice depends on the
        # configured output distribution.
        l_x_in = ReshapeLayer(self.l_x_in, (-1, self.n_l * self.n_c))
        l_px = DimshuffleLayer(self.l_px, (0, 3, 1, 2, 4))
        l_px = ReshapeLayer(l_px, (-1, self.sym_samples, 1, self.n_c))
        if self.x_dist == 'bernoulli':
            l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in)
        elif self.x_dist == 'multinomial':
            l_log_px = MultinomialLogDensityLayer(l_px, l_x_in)
            l_log_px = ReshapeLayer(l_log_px, (-1, self.n_l, 1, 1, 1))
            l_log_px = MeanLayer(l_log_px, axis=1)
        elif self.x_dist == 'gaussian':
            l_px_mu = ReshapeLayer(
                DimshuffleLayer(self.l_px_mu, (0, 2, 3, 1, 4)),
                (-1, self.sym_samples, 1, self.n_l * self.n_c))
            l_px_logvar = ReshapeLayer(
                DimshuffleLayer(self.l_px_logvar, (0, 2, 3, 1, 4)),
                (-1, self.sym_samples, 1, self.n_l * self.n_c))
            l_log_px = GaussianLogDensityLayer(l_x_in, l_px_mu, l_px_logvar)

        # ELBO with the KL-like terms scaled by (1.1 - warmup); presumably a
        # warm-up/annealing schedule on the KL weight — confirm against the
        # training schedule that sets sym_warmup.
        def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px):
            lb = log_px + log_py + (log_pz + log_pa - log_qa -
                                    log_qz) * (1.1 - self.sym_warmup)
            return lb

        # Lower bound for labeled data
        out_layers = [
            l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px, l_log_qy
        ]
        inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
        # Batch-norm averages are neither used nor updated here.
        out = get_output(out_layers,
                         inputs,
                         batch_norm_update_averages=False,
                         batch_norm_use_averages=False)
        log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = out

        # Prior p(y) expecting that all classes are evenly distributed
        py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))
        log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape(
            (-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l,
                           log_py_l, log_px_zy_l)
        lb_l = lb_l.mean(axis=(1, 2))  # Mean over the sampling dimensions
        log_qy_ax_l *= (
            self.sym_beta * (n / n_l)
        )  # Scale the supervised cross entropy with the alpha constant
        lb_l += log_qy_ax_l.mean(axis=(
            1, 2
        ))  # Collect the lower bound term and mean over sampling dimensions

        # Lower bound for unlabeled data
        bs_u = self.sym_x_u.shape[0]

        # For the integrating out approach, we repeat the input matrix x, and construct a target (bs * n_y) x n_y
        # Example of input and target matrix for a 3 class problem and batch_size=2. 2D tensors of the form
        #               x_repeat                     t_repeat
        #  [[x[0,0], x[0,1], ..., x[0,n_x]]         [[1, 0, 0]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]          [1, 0, 0]
        #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 1, 0]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]          [0, 1, 0]
        #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 0, 1]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]]         [0, 0, 1]]
        t_eye = T.eye(self.n_y, k=0)
        t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u,
                                                            axis=1).reshape(
                                                                (-1, self.n_y))
        x_u = self.sym_x_u.reshape(
            (1, bs_u, self.n_l, self.n_c)).repeat(self.n_y, axis=0).reshape(
                (-1, self.n_l, self.n_c))

        # Since the expectation of var a is outside the integration we calculate E_q(a|x) first
        a_x_u = get_output(self.l_qa,
                           self.sym_x_u,
                           batch_norm_update_averages=True,
                           batch_norm_use_averages=False)
        a_x_u_rep = a_x_u.reshape(
            (1, bs_u * self.sym_samples, self.n_a)).repeat(self.n_y,
                                                           axis=0).reshape(
                                                               (-1, self.n_a))
        out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px]
        inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep}
        out = get_output(out_layers,
                         inputs,
                         batch_norm_update_averages=False,
                         batch_norm_use_averages=False)
        log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out

        # Prior p(y) expecting that all classes are evenly distributed
        py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))
        log_py_u = -categorical_crossentropy(py_u, t_u).reshape(
            (-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u,
                           log_py_u, log_px_zy_u)
        lb_u = lb_u.reshape(
            (self.n_y, 1, 1, bs_u)).transpose(3, 1, 2, 0).mean(axis=(1, 2))
        inputs = {
            self.l_x_in: self.sym_x_u,
            self.l_a_in: a_x_u.reshape((-1, self.n_a))
        }
        y_u = get_output(self.l_qy,
                         inputs,
                         batch_norm_update_averages=True,
                         batch_norm_use_averages=False).mean(axis=(1, 2))
        y_u += 1e-8  # Ensure that we get no NANs when calculating the entropy
        y_u /= T.sum(y_u, axis=1, keepdims=True)
        # Weighted sum over classes plus the entropy of q(y|a,x).
        lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1)

        # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients
        weight_priors = 0.0
        for p in self.trainable_model_params:
            if 'W' not in str(p):
                continue
            weight_priors += log_normal(p, 0, 1).sum()

        # Collect the lower bound and scale it with the weight priors.
        elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n
        lb_labeled = -lb_l.mean()
        lb_unlabeled = -lb_u.mean()
        log_px = log_px_zy_l.mean() + log_px_zy_u.mean()
        log_pz = log_pz_l.mean() + log_pz_u.mean()
        log_qz = log_qz_axy_l.mean() + log_qz_axy_u.mean()
        log_pa = log_pa_l.mean() + log_pa_u.mean()
        log_qa = log_qa_x_l.mean() + log_qa_x_u.mean()

        # Norm-constrain then element-wise clip the gradients before Adam.
        grads_collect = T.grad(elbo, self.trainable_model_params)
        params_collect = self.trainable_model_params
        sym_beta1 = T.scalar('beta1')
        sym_beta2 = T.scalar('beta2')
        clip_grad, max_norm = 1, 5
        mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
        mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
        updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1,
                       sym_beta2)

        # Training function
        # Labeled examples are drawn without replacement each call; unlabeled
        # examples come from the superclass batch slice.
        indices = self._srng.choice(size=[self.sym_bs_l],
                                    a=sh_train_x_l.shape[0],
                                    replace=False)
        x_batch_l = sh_train_x_l[indices]
        t_batch_l = sh_train_t_l[indices]
        x_batch_u = self.sh_train_x[self.batch_slice]
        if self.x_dist == 'bernoulli':  # Sample bernoulli input.
            x_batch_u = self._srng.binomial(size=x_batch_u.shape,
                                            n=1,
                                            p=x_batch_u,
                                            dtype=theano.config.floatX)
            x_batch_l = self._srng.binomial(size=x_batch_l.shape,
                                            n=1,
                                            p=x_batch_l,
                                            dtype=theano.config.floatX)

        givens = {
            self.sym_x_l: x_batch_l,
            self.sym_x_u: x_batch_u,
            self.sym_t_l: t_batch_l
        }
        inputs = [
            self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
            self.sym_lr, sym_beta1, sym_beta2, self.sym_samples,
            self.sym_warmup
        ]
        outputs = [
            elbo, lb_labeled, lb_unlabeled, log_px, log_pz, log_qz, log_pa,
            log_qa
        ]
        f_train = theano.function(inputs=inputs,
                                  outputs=outputs,
                                  givens=givens,
                                  updates=updates)

        # Default training args. Note that these can be changed during or prior to training.
        self.train_args['inputs']['batchsize_unlabeled'] = 100
        self.train_args['inputs']['batchsize_labeled'] = 100
        self.train_args['inputs']['beta'] = 0.1
        self.train_args['inputs']['learningrate'] = 3e-4
        self.train_args['inputs']['beta1'] = 0.9
        self.train_args['inputs']['beta2'] = 0.999
        self.train_args['inputs']['samples'] = 1
        self.train_args['inputs']['warmup'] = 0.1
        self.train_args['outputs']['lb'] = '%0.3f'
        self.train_args['outputs']['lb-l'] = '%0.3f'
        self.train_args['outputs']['lb-u'] = '%0.3f'
        self.train_args['outputs']['px'] = '%0.3f'
        self.train_args['outputs']['pz'] = '%0.3f'
        self.train_args['outputs']['qz'] = '%0.3f'
        self.train_args['outputs']['pa'] = '%0.3f'
        self.train_args['outputs']['qa'] = '%0.3f'

        # Validation and test function
        # Classification error (%) from the deterministic classifier output.
        y = get_output(self.l_qy, self.sym_x_l,
                       deterministic=True).mean(axis=(1, 2))
        class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100
        givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t}
        f_test = theano.function(inputs=[self.sym_samples],
                                 outputs=[class_err],
                                 givens=givens)

        # Test args.  Note that these can be changed during or prior to training.
        self.test_args['inputs']['samples'] = 1
        self.test_args['outputs']['test'] = '%0.2f%%'

        f_validate = None
        if validation_set is not None:
            givens = {
                self.sym_x_l: self.sh_valid_x,
                self.sym_t_l: self.sh_valid_t
            }
            f_validate = theano.function(inputs=[self.sym_samples],
                                         outputs=[class_err],
                                         givens=givens)
            # Default validation args. Note that these can be changed during or prior to training.
            self.validate_args['inputs']['samples'] = 1
            self.validate_args['outputs']['validation'] = '%0.2f%%'

        return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
Exemple #32
0
def set_network_trainer(input_data,
                        input_mask,
                        target_data,
                        target_mask,
                        num_outputs,
                        network,
                        inner_loop_layers,
                        updater,
                        learning_rate,
                        grad_max_norm=10.,
                        l2_lambda=1e-5,
                        load_updater_params=None):
    """Compile and return the theano training function for the model.

    Builds a masked cross-entropy cost from a hand-rolled log-softmax,
    adds an L2 penalty plus an inner-loop smoothness penalty, optionally
    clips the total gradient norm, and compiles a single update function.

    Returns a (training_fn, trainer_params) pair, where trainer_params is
    whatever parameter state the updater exposes.
    """
    # One-hot encode the flattened frame targets.
    one_hot_target_data = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                                 nb_class=num_outputs,
                                                 dtype=floatX)

    # Forward pass: inner-loop features followed by the final prediction.
    layer_outs = get_output(inner_loop_layers + [network],
                            deterministic=False)
    inner_feats = T.concatenate(layer_outs[:-1], axis=-1)
    predict_data = layer_outs[-1]
    num_seqs = predict_data.shape[0]

    # Numerically stable log-softmax over the flattened predictions.
    predict_data = T.reshape(x=predict_data,
                             newshape=(-1, num_outputs),
                             ndim=2)
    predict_data = predict_data - T.max(predict_data, axis=-1, keepdims=True)
    predict_data = predict_data - T.log(
        T.sum(T.exp(predict_data), axis=-1, keepdims=True))

    # Masked negative log-likelihood, reported per sequence and per frame.
    frame_nll = -T.sum(T.mul(one_hot_target_data, predict_data), axis=-1)
    frame_nll = frame_nll * T.flatten(target_mask, 1)
    train_model_cost = frame_nll.sum() / num_seqs
    train_frame_cost = frame_nll.sum() / target_mask.sum()

    # Inner-loop feature regularizers (batch x seq x features).
    train_sf_cost0 = T.var(inner_feats, axis=1).mean()  # intra var low
    train_sf_cost1 = -T.var(T.mean(inner_feats, axis=1),
                            axis=0).mean()  # inter var high
    train_sf_cost2 = T.sum(T.sqr(inner_feats[:, 1:, :] -
                                 inner_feats[:, :-1, :]),
                           axis=-1).mean()

    # L2 weight decay over every network parameter.
    train_l2_cost = regularize_network_params(network, penalty=l2) * l2_lambda

    # Gradients of the combined cost w.r.t. all trainable parameters.
    network_params = get_all_params(network, trainable=True)
    network_grads = theano.grad(cost=train_model_cost + train_l2_cost +
                                train_sf_cost2 * 1.0,
                                wrt=network_params)

    # Either clip the global gradient norm, or merely compute it so the
    # training function can still report it.
    if grad_max_norm > 0.:
        network_grads, network_grads_norm = total_norm_constraint(
            tensor_vars=network_grads,
            max_norm=grad_max_norm,
            return_norm=True)
    else:
        network_grads_norm = T.sqrt(
            sum(T.sum(grad ** 2) for grad in network_grads))

    # Build the parameter updates from the (possibly clipped) gradients.
    train_updates, trainer_params = updater(
        loss_or_grads=network_grads,
        params=network_params,
        learning_rate=learning_rate,
        load_params_dict=load_updater_params)

    # Compile the single-step update function.
    training_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[train_frame_cost, network_grads_norm, train_sf_cost0,
                 train_sf_cost1, train_sf_cost2],
        updates=train_updates)
    return training_fn, trainer_params
Exemple #33
0
    def __init__(self, input_vars, target_vars, l_out, loss,
                 optimizer, learning_rate=0.001, id=None):
        """Build and compile the training and prediction functions.

        Parameters
        ----------
        input_vars : sequence of theano variables
            Symbolic inputs shared by the train and predict functions.
        target_vars : sequence of theano variables
            Symbolic supervision targets (train function only).
        l_out : lasagne layer
            Output layer of the network to train.
        loss : loss specification
            Passed through to ``get_train_loss``.
        optimizer : callable
            Lasagne-style update rule (grads, params, learning_rate).
        learning_rate : float
            Step size handed to the optimizer.
        id : str or None
            Optional tag used to namespace compiled-function names and
            log output.

        Raises
        ------
        ValueError
            If ``input_vars`` or ``target_vars`` is not a sequence.
        """
        if not isinstance(input_vars, Sequence):
            raise ValueError('input_vars should be a sequence, instead got %s' % (input_vars,))
        if not isinstance(target_vars, Sequence):
            # Fixed: this message previously interpolated input_vars,
            # which made the error misleading.
            raise ValueError('target_vars should be a sequence, instead got %s' % (target_vars,))

        self.get_options()

        self.input_vars = input_vars
        self.l_out = l_out
        self.loss = loss
        self.optimizer = optimizer
        self.id = id
        id_tag = (self.id + '/') if self.id else ''
        id_tag_log = (self.id + ': ') if self.id else ''

        if self.options.verbosity >= 6:
            output_model_structure(l_out)

        params = self.params()
        (monitored,
         train_loss_grads,
         synth_vars) = self.get_train_loss(target_vars, params)
        # list() so this works on Python 3 (dict views) and stays a stable
        # snapshot aligned with the outputs compiled below.
        self.monitored_tags = list(monitored.keys())

        if self.options.true_grad_clipping:
            scaled_grads = total_norm_constraint(train_loss_grads, self.options.true_grad_clipping)
        else:
            scaled_grads = train_loss_grads

        updates = optimizer(scaled_grads, params, learning_rate=learning_rate)
        if not self.options.no_nan_suppression:
            # TODO: print_mode='all' somehow is always printing, even when
            # there are no NaNs. But tests are passing, even on GPU!
            updates = apply_nan_suppression(updates, print_mode='none')

        if self.options.detect_nans:
            mode = MonitorMode(post_func=detect_nan)
        else:
            mode = None

        if self.options.verbosity >= 2:
            print(id_tag_log + 'Compiling training function')
        # NOTE: `params` is rebound here from network parameters to the
        # compiled function's input list.
        params = input_vars + target_vars + synth_vars
        if self.options.verbosity >= 6:
            print('params = %s' % (params,))
        # list() for Python 3: theano.function expects a list of outputs,
        # not a dict view.
        self.train_fn = theano.function(params, list(monitored.values()),
                                        updates=updates, mode=mode,
                                        name=id_tag + 'train', on_unused_input='warn')
        if self.options.run_dir and not self.options.no_graphviz:
            self.visualize_graphs({'loss': monitored['loss']},
                                  out_dir=self.options.run_dir)

        # Deterministic forward pass (dropout etc. disabled) for prediction.
        test_prediction = get_output(l_out, deterministic=True)
        if self.options.verbosity >= 2:
            print(id_tag_log + 'Compiling prediction function')
        if self.options.verbosity >= 6:
            print('params = %s' % (input_vars,))
        self.predict_fn = theano.function(input_vars, test_prediction, mode=mode,
                                          name=id_tag + 'predict', on_unused_input='ignore')

        if self.options.run_dir and not self.options.no_graphviz:
            self.visualize_graphs({'test_prediction': test_prediction},
                                  out_dir=self.options.run_dir)
Exemple #34
0
    def __init__(self,
                 network,
                 loss,
                 trn_data,
                 trn_inputs,
                 step=lu.adam,
                 lr=0.001,
                 lr_decay=1.0,
                 max_norm=0.1,
                 monitor=None,
                 val_frac=0.,
                 assemble_extra_inputs=None,
                 seed=None):
        """Construct and configure the trainer

        The trainer takes as inputs a neural network, a loss function and
        training data. During init the theano functions for training are
        compiled.

        Parameters
        ----------
        network : NeuralNet instance
            The neural network to train
        loss : theano variable
            Loss function to be computed for network training
        trn_data : tuple of arrays
            Training data in the form (params, stats)
        trn_inputs : list of theano variables
            Theano variables that should contain the training data
        step : function
            Function to call for updates, will pass gradients and parameters
        lr : float
            initial learning rate
        lr_decay : float
            learning rate decay factor, learning rate for each epoch is
            set to lr * (lr_decay**epoch)
        max_norm : float
            Total norm constraint for gradients (None disables clipping)
        monitor : dict
            Dict containing theano variables (and names as keys) that should be
            recorded during training along with the loss function
        val_frac: float
            Fraction of dataset to use as validation set
        assemble_extra_inputs: function
            (optional) function to compute extra inputs needed to evaluate loss
        seed : int or None
            If provided, random number generator for batches will be seeded
        """
        self.network = network
        self.loss = loss
        self.trn_data = trn_data
        self.trn_inputs = trn_inputs

        # Seeded RNG gives reproducible minibatch shuffling.
        self.seed = seed
        if seed is not None:
            self.rng = np.random.RandomState(seed=seed)
        else:
            self.rng = np.random.RandomState()

        # gradients, clipped to a maximum total norm when requested
        grads = tt.grad(self.loss, self.network.aps)
        if max_norm is not None:
            grads = lu.total_norm_constraint(grads, max_norm=max_norm)

        # updates: the learning rate is kept in a shared variable so it can
        # be decayed between epochs without recompiling
        self.lr = lr
        self.lr_decay = lr_decay
        self.lr_op = theano.shared(np.array(self.lr, dtype=dtype))
        self.updates = step(grads, self.network.aps, learning_rate=self.lr_op)

        # check trn_data: every array must have the same number of samples
        n_trn_data_list = set([x.shape[0] for x in trn_data])
        assert len(n_trn_data_list) == 1, 'trn_data elements got different len'
        self.n_trn_data = trn_data[0].shape[0]

        # outputs returned by every update call: loss first, then monitors
        self.trn_outputs_names = ['loss']
        self.trn_outputs_nodes = [self.loss]
        if monitor is not None and len(monitor) > 0:
            monitor_names, monitor_nodes = zip(*monitor.items())
            self.trn_outputs_names += monitor_names
            self.trn_outputs_nodes += monitor_nodes

        # function for single update
        self.make_update = theano.function(inputs=self.trn_inputs,
                                           outputs=self.trn_outputs_nodes,
                                           updates=self.updates)

        self.assemble_extra_inputs = assemble_extra_inputs

        if not (val_frac == 0.):

            self.do_validation = True

            # hold out the trailing val_frac of the samples for validation
            n_trn = int((1 - val_frac) * self.n_trn_data)
            self.val_data = [data[n_trn:] for data in trn_data
                             ].copy()  # NOTE(review): list.copy() is shallow;
            # the element arrays are still shared with trn_data
            self.trn_data = [data[:n_trn] for data in trn_data].copy()

            # assemble extra inputs *once* for validation data
            if self.assemble_extra_inputs is not None:
                self.val_data = self.assemble_extra_inputs(tuple(
                    self.val_data))

            # prepare validation data as shared variables so the compiled
            # validation function needs no runtime inputs
            self.val_inputs = [
                theano.shared(data.astype(dtype), borrow=True)
                for data in self.val_data
            ]

            # compile theano function for validation; givens swap the
            # validation shared variables in for the training inputs
            self.validate = theano.function(inputs=[],
                                            outputs=self.loss,
                                            givens=list(
                                                zip(self.trn_inputs,
                                                    self.val_inputs)))

            self.best_val_loss = np.inf

        else:

            self.do_validation = False

        # initialize variables
        # NOTE(review): this rebinds self.loss from the symbolic loss node to
        # a plain float; the functions compiled above already captured the node
        self.loss = float('inf')
Exemple #35
0
def set_network_trainer(input_data,
                        input_mask,
                        target_data,
                        target_mask,
                        network_outputs,
                        updater,
                        learning_rate,
                        grad_max_norm=10.,
                        l2_lambda=1e-5,
                        inner_lambda=1e-2,
                        load_updater_params=None):
    """Compile and return the theano training function.

    Combines a masked cross-entropy cost with an L2 penalty and an
    inner-loop temporal-variance penalty, clips the total gradient norm,
    and compiles a single-step update function.

    Returns a (training_fn, trainer_params) pair.
    """
    # The last element of network_outputs is the prediction layer itself.
    network = network_outputs[-1]

    # Stochastic forward pass over every listed layer.
    layer_outs = get_output(network_outputs, deterministic=False)
    inner_hid_list = layer_outs[:-1]
    predict_data = layer_outs[-1]

    num_seqs = predict_data.shape[0]

    # Masked cross-entropy over the flattened frames; clipping keeps the
    # log in categorical_crossentropy finite.
    predict_data = T.reshape(x=predict_data,
                             newshape=(-1, predict_data.shape[-1]),
                             ndim=2)
    predict_data = T.clip(predict_data, eps, 1.0 - eps)
    frame_nll = categorical_crossentropy(predictions=predict_data,
                                         targets=T.flatten(target_data, 1))
    frame_nll = frame_nll * T.flatten(target_mask, 1)
    train_model_cost = frame_nll.sum() / num_seqs
    train_frame_cost = frame_nll.sum() / target_mask.sum()

    # L2 weight decay over all network parameters.
    train_regularizer_cost = regularize_network_params(network,
                                                       penalty=l2) * l2_lambda

    # Penalize the temporal variance of each inner-loop hidden state,
    # averaged over the inner layers.
    train_inner_cost = sum(T.mean(T.var(input=inner_hid, axis=1))
                           for inner_hid in inner_hid_list)
    train_inner_cost /= len(inner_hid_list)

    # Gradients of the combined cost w.r.t. all trainable parameters.
    network_params = get_all_params(network, trainable=True)
    network_grads = theano.grad(cost=train_model_cost +
                                train_regularizer_cost +
                                train_inner_cost * inner_lambda,
                                wrt=network_params)
    network_grads, network_grads_norm = total_norm_constraint(
        tensor_vars=network_grads, max_norm=grad_max_norm, return_norm=True)

    # Shared learning rate so it can be annealed without recompiling.
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, trainer_params = updater(
        loss_or_grads=network_grads,
        params=network_params,
        learning_rate=train_lr,
        load_params_dict=load_updater_params)

    # Compile the single-step update function.
    training_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[train_frame_cost, train_inner_cost, network_grads_norm],
        updates=train_updates)
    return training_fn, trainer_params
Exemple #36
0
    def __init__(self,
                 network,
                 loss,
                 trn_data,
                 trn_inputs,
                 step=lu.adam,
                 lr=0.001,
                 lr_decay=1.0,
                 max_norm=0.1,
                 monitor=None,
                 seed=None):
        """Set up the trainer and compile the theano update function.

        Parameters
        ----------
        network : NeuralNet instance
            The neural network to train
        loss : theano variable
            Loss expression minimized during training
        trn_data : tuple of arrays
            Training data in the form (params, stats)
        trn_inputs : list of theano variables
            Symbolic variables that receive the training data
        step : function
            Update rule; called with gradients and parameters
        lr : float
            initial learning rate
        lr_decay : float
            per-epoch decay factor; the epoch learning rate is
            lr * (lr_decay**epoch)
        max_norm : float
            Total norm constraint for gradients (None disables clipping)
        monitor : dict
            Named theano variables to record alongside the loss
        seed : int or None
            Optional seed for the minibatch random number generator
        """
        self.network = network
        self.loss = loss
        self.trn_data = trn_data
        self.trn_inputs = trn_inputs

        # Seeded RNG gives reproducible minibatch ordering.
        self.seed = seed
        self.rng = (np.random.RandomState(seed=seed)
                    if seed is not None else np.random.RandomState())

        # Gradients, optionally clipped to a maximum total norm.
        grads = tt.grad(self.loss, self.network.aps)
        if max_norm is not None:
            grads = lu.total_norm_constraint(grads, max_norm=max_norm)

        # The learning rate lives in a shared variable so it can be
        # decayed between epochs without recompiling.
        self.lr = lr
        self.lr_decay = lr_decay
        self.lr_op = theano.shared(np.array(self.lr, dtype=dtype))
        self.updates = step(grads, self.network.aps, learning_rate=self.lr_op)

        # Every element of trn_data must describe the same number of samples.
        sample_counts = set([x.shape[0] for x in trn_data])
        assert len(sample_counts) == 1, 'trn_data elements got different len'
        self.n_trn_data = trn_data[0].shape[0]

        # Outputs of every update call: the loss first, then any monitors.
        self.trn_outputs_names = ['loss']
        self.trn_outputs_nodes = [self.loss]
        if monitor is not None and len(monitor) > 0:
            names, nodes = zip(*monitor.items())
            self.trn_outputs_names += names
            self.trn_outputs_nodes += nodes

        # Compiled function performing a single parameter update.
        self.make_update = theano.function(inputs=self.trn_inputs,
                                           outputs=self.trn_outputs_nodes,
                                           updates=self.updates)

        # Placeholder overwritten by the training loop; note this rebinds
        # self.loss from the symbolic node to a plain float.
        self.loss = float('inf')