Example #1
def _get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=9, word_vec_name='W_emb'):
    print("Generating adadelta updates (implementation from dnn)")
    # compute the gradients of the cost w.r.t. each parameter
    gparams = T.grad(cost, params)

    # per-parameter accumulators for squared gradients and squared updates
    accugrads, accudeltas = [], []
    for param in params:
        accugrads.append(build_shared_zeros(param.shape.eval(), 'accugrad'))
        accudeltas.append(build_shared_zeros(param.shape.eval(), 'accudelta'))

    # compute the list of parameter updates
    updates = OrderedDict()

    for accugrad, accudelta, param, gparam in zip(accugrads, accudeltas, params, gparams):
        # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
        agrad = rho * accugrad + (1 - rho) * gparam * gparam
        dx = -T.sqrt((accudelta + eps) / (agrad + eps)) * gparam
        updates[accudelta] = rho * accudelta + (1 - rho) * dx * dx
        if max_norm > 0 and param.ndim == 2 and param.name != word_vec_name:
            # rescale columns of the stepped weight matrix whose L2 norm exceeds sqrt(max_norm)
            W = param + dx
            col_norms = W.norm(2, axis=0)
            desired_norms = T.clip(col_norms, 0, T.sqrt(max_norm))
            updates[param] = W * (desired_norms / (1e-7 + col_norms))
        else:
            updates[param] = param + dx
        updates[accugrad] = agrad
    return updates
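
None of the examples on this page show their imports or the build_shared_zeros helper they all call. A minimal sketch is given below, assuming the helper simply wraps theano.shared around a zero-filled array; the actual definition in the source repositories may differ.

from collections import OrderedDict
import numpy as np
import theano
import theano.tensor as T

def build_shared_zeros(shape, name):
    """Assumed helper: a Theano shared variable of zeros with the given shape and name."""
    return theano.shared(value=np.zeros(shape, dtype=theano.config.floatX),
                         name=name, borrow=True)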
Example #2
def get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=9, word_vec_name='W_emb'):
    """
    Adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)
    """
    print("Generating adadelta updates")
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})   # running averages of squared gradients
    exp_sqr_ups = OrderedDict({})     # running averages of squared updates
    gparams = []
    for param in params:
        exp_sqr_grads[param] = build_shared_zeros(param.shape.eval(), name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = build_shared_zeros(param.shape.eval(), name="exp_ups_%s" % param.name)
        gparams.append(gp)
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        # accumulate the squared gradient
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        # Adadelta step: RMS of past updates divided by RMS of past gradients
        step = -(T.sqrt(exp_su + eps) / T.sqrt(up_exp_sg + eps)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        # if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
        if max_norm and param.name != word_vec_name:
            # rescale columns whose L2 norm exceeds sqrt(max_norm)
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(max_norm))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates
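
A minimal sketch of how the returned updates dictionary is typically consumed: it is passed to theano.function so that each call performs one Adadelta step. The toy linear model below (x, y, W, cost) is hypothetical and only there to show the wiring; it is not part of the original examples.

import numpy as np
import theano
import theano.tensor as T

# hypothetical toy model, just to exercise the updates
x = T.matrix('x')
y = T.vector('y')
W = theano.shared(np.zeros((3, 1), dtype=theano.config.floatX), name='W')
cost = T.mean(T.sqr(T.dot(x, W).flatten() - y))

updates = get_adadelta_updates(cost, [W], rho=0.95, eps=1e-6, max_norm=9)
train_step = theano.function(inputs=[x, y], outputs=cost, updates=updates)
# each call applies one Adadelta update to every parameter, e.g.:
# batch_cost = train_step(batch_x, batch_y)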
Example #3
def get_adagrad_updates(mean_cost, params, learning_rate=0.1, max_norm=9, _eps=1e-6):
    """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
    """
    print("Generating adagrad updates")
    # compute the gradients with respect to the model parameters
    gparams = T.grad(mean_cost, params)

    # per-parameter accumulators for the sum of squared gradients
    accugrads = []
    for param in params:
        accugrads.append(build_shared_zeros(param.shape.eval(), 'accugrad'))

    # compute the list of parameter updates
    updates = OrderedDict()
    for accugrad, param, gparam in zip(accugrads, params, gparams):
        # Adagrad: accumulate the sum of squared gradients and scale the step by its root
        agrad = accugrad + gparam * gparam
        dx = -(learning_rate / T.sqrt(agrad + _eps)) * gparam
        update = param + dx
        if max_norm:
            # rescale columns whose L2 norm exceeds max_norm
            W = param + dx
            col_norms = W.norm(2, axis=0)
            desired_norms = T.clip(col_norms, 0, max_norm)
            update = W * (desired_norms / (1e-6 + col_norms))

        updates[param] = update
        updates[accugrad] = agrad
    return updates
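
All of the examples above cap the column norms of the updated weight matrices. As a quick sanity check of that rescaling, here is a plain NumPy sketch following the Adadelta examples, which clip the norms to sqrt(max_norm) (the Adagrad example clips to max_norm directly); the matrix W below is made up for illustration.

import numpy as np

W = np.array([[3.0, 0.1],
              [4.0, 0.2]])            # column L2 norms: 5.0 and ~0.224
max_norm = 9
col_norms = np.sqrt((W ** 2).sum(axis=0))
desired_norms = np.clip(col_norms, 0, np.sqrt(max_norm))   # cap at 3.0
W_scaled = W * (desired_norms / (1e-7 + col_norms))
print(np.sqrt((W_scaled ** 2).sum(axis=0)))  # ~[3.0, 0.224]: only the oversized column shrank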