Example #1
def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """ Adadelta updates
    The learning rate is scaled by the ratio of accumulated step sizes to accumulated gradients.

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))``
    * ``param = param - learning_rate * update``
    * ``delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form
    (param, param_new), (accumulated_grads, accumulated_grads_new), (step_accum, step_accum_new)

    References
    ----------
    .. [1] Zeiler, M. D. (2012):
           ADADELTA: An Adaptive Learning Rate Method.
           arXiv Preprint arXiv:1212.5701.
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        delta_accu = cgt.shared(
            np.zeros(param.op.get_shape(), dtype=param.dtype))

        accu_new = rho * accu + (1 - rho) * grad**2
        updates.append((accu, accu_new))

        update = (grad * cgt.sqrt(delta_accu + epsilon) /
                  cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - learning_rate * update))

        delta_accu_new = rho * delta_accu + (1 - rho) * update**2
        updates.append((delta_accu, delta_accu_new))

    return updates
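
Below is a minimal usage sketch (not part of the original file) showing how the update pairs returned by adadelta are typically compiled with cgt.function. It assumes the adadelta function above is in scope; the toy objective, the shape of W, and the number of iterations are assumptions chosen purely for illustration.

import cgt
import numpy as np

# Toy objective (assumed): drive every entry of the shared matrix W toward 1.
W = cgt.shared(np.zeros((3, 3), dtype=cgt.floatX))
cost = cgt.sum(cgt.square(W - 1.0))

# adadelta appends one (variable, new_value) pair each for the gradient
# accumulator, the parameter itself, and the step-size accumulator.
updates = adadelta(cost, [W], learning_rate=1.0, rho=0.95, epsilon=1e-6)
train_step = cgt.function([], cost, updates=updates)

for _ in range(200):
    cost_val = train_step()  # evaluates the cost, then applies every update pair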
Example #2
File: nn.py Project: EdsterG/cgt
def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """ Adadelta updates
    The learning rate is scaled by the ratio of accumulated step sizes to accumulated gradients.

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))``
    * ``param = param - learning_rate * update``
    * ``delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form
    (param, param_new), (accumulated_grads, accumulated_grads_new), (step_accum, step_accum_new)

    References
    ----------
    .. [1] Zeiler, M. D. (2012):
           ADADELTA: An Adaptive Learning Rate Method.
           arXiv Preprint arXiv:1212.5701.
    """
    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        delta_accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))

        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))

        update = (grad * cgt.sqrt(delta_accu + epsilon) / cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - learning_rate * update))

        delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2
        updates.append((delta_accu, delta_accu_new))

    return updates
Example #3
 def __init__(self, x, n_in, n_hid, n_out, nlayers=1, y=None, eps=None):
     super(GaussianMLP, self).__init__(x, n_in, n_hid, nlayers=nlayers, prefix="GaussianMLP_hidden")
     self.mu_layer = HiddenLayer(
         input=self.hidden_layers[-1].output,
         n_in=self.hidden_layers[-1].n_out,
         n_out=n_out,
         activation=None,
         prefix="GaussianMLP_mu"
     )
     # log(sigma^2)
     self.logvar_layer = HiddenLayer(
         input=self.hidden_layers[-1].output,
         n_in=self.hidden_layers[-1].n_out,
         n_out=n_out,
         activation=None,
         prefix="GaussianMLP_logvar"
     )
     self.mu = self.mu_layer.output
     self.var = cgt.exp(self.logvar_layer.output)
     self.sigma = cgt.sqrt(self.var)
     self.params = self.params + self.mu_layer.params +\
         self.logvar_layer.params
     # for use as encoder
     if eps is not None:
         assert(y is None)
         self.out = self.mu + self.sigma * eps
     # for use as decoder
     if y is not None:
         assert(eps is None)
         self.out = cgt.sigmoid(self.mu)
         self.cost = -cgt.sum(log_diag_mvn(self.out, self.var)(y))
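
For reference, the line self.out = self.mu + self.sigma * eps above is the standard reparameterization trick: with eps drawn from a standard normal, out is distributed as N(mu, sigma^2). A standalone numpy sketch of that identity (illustrative only, not from the source):

import numpy as np

mu, sigma = 2.0, 0.5
eps = np.random.randn(100000)   # eps ~ N(0, 1)
z = mu + sigma * eps            # z ~ N(mu, sigma^2)
print(z.mean(), z.std())        # approximately 2.0 and 0.5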
Example #4
def adagrad_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
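    # Note: despite the name, these are Adadelta-style updates -- both the
    # squared-gradient accumulator and the squared-step accumulator decay with rho.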
    grads = cgt.grad(cost, params)
    updates = []
    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        delta_accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))

        accu_new = rho * accu + (1 - rho) * grad**2
        updates.append((accu, accu_new))

        update = (grad * cgt.sqrt(delta_accu + epsilon) /
                  cgt.sqrt(accu_new + epsilon))
        updates.append((param, param - stepsize * update))

        delta_accu_new = rho * delta_accu + (1 - rho) * update**2
        updates.append((delta_accu, delta_accu_new))
    return updates
Example #5
def rmsprop_updates(cost, params, stepsize=0.001, rho=0.9, epsilon=1e-6):
    grads = cgt.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        # moving average of squared gradients, initialized to zeros of the right shape
        acc = cgt.shared(p.op.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * cgt.square(g)
        # scale the gradient by the root of the accumulated average before stepping
        gradient_scaling = cgt.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - stepsize * g))
    return updates
Example #6
    def __init__(self, xdim, args, dec="bernoulli"):
        self.xdim = xdim
        self.hdim = args.hdim
        self.zdim = args.zdim
        self.lmbda = args.lmbda  # weight decay coefficient * 2
        self.x = cgt.matrix("x", dtype=cgt.floatX)
        self.eps = cgt.matrix("eps", dtype=cgt.floatX)

        self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim, nlayers=args.nlayers, eps=self.eps)
        if dec == "bernoulli":
            # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y)
            self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
        elif dec == "gaussian":
            self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
        else:
            raise RuntimeError("unrecognized decoder %" % dec)

        self.cost = (-cgt.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var)) + self.dec_mlp.cost) / args.batch_size
        self.params = self.enc_mlp.params + self.dec_mlp.params
        # L2 regularization
        self.gparams = [cgt.grad(self.cost, [p])[0] + self.lmbda * p for p in self.params]
        self.gaccums = [cgt.shared(np.zeros(p.op.get_value().shape, dtype=cgt.floatX)) for p in self.params]

        # XXX replace w/ adagrad update from nn
        ADAGRAD_EPS = 1e-10  # for stability
        self.updates = [
            (param, param - args.lr * gparam / cgt.sqrt(gaccum + cgt.square(gparam) + ADAGRAD_EPS))
            for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums)
        ]
        self.updates += [
            (gaccum, gaccum + cgt.square(gparam))
            for gaccum, gparam in zip(self.gaccums, self.gparams)
        ]

        self.train = cgt.function(
            [self.x, self.eps],
            self.cost,
            updates=self.updates
        )
        self.test = cgt.function(
            [self.x, self.eps],
            self.cost,
            updates=None
        )
        # can be used for semi-supervised learning for example
        self.encode = cgt.function(
            [self.x, self.eps],
            self.enc_mlp.out
        )
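
A hedged usage sketch (not from the source) of how this constructor might be driven; the enclosing class name VAE and the hyperparameter values are assumptions, but the attribute names on args match the ones read above.

from argparse import Namespace
import numpy as np
import cgt

args = Namespace(hdim=256, zdim=20, lmbda=1e-4, nlayers=1, lr=0.01, batch_size=64)
model = VAE(xdim=784, args=args, dec="bernoulli")   # VAE is the assumed enclosing class

# x in [0, 1] for the Bernoulli decoder; eps is the noise fed to the Gaussian encoder
x_batch = np.random.rand(args.batch_size, 784).astype(cgt.floatX)
eps_batch = np.random.randn(args.batch_size, args.zdim).astype(cgt.floatX)

cost = model.train(x_batch, eps_batch)    # one adagrad step on the minibatch
codes = model.encode(x_batch, eps_batch)  # latent samples, shape (batch_size, zdim)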
Example #7
File: nn.py Project: x724/cgt
def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates
    The learning rate is divided by a moving root-mean-square (RMS) of recent gradients. See [1].

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``param = param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form [(param, param_new), (accumulated_RMS_grads, accumulated_RMS_grads_new)]

    References
    ----------
    .. [1] Yann N. Dauphin, Harm de Vries, Junyoung Chung, Yoshua Bengio (2015):
           RMSProp and equilibrated adaptive learning rates for non-convex optimization
           arXiv:1502.04390 http://arxiv.org/abs/1502.04390
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = rho * accu + (1 - rho) * grad**2
        updates.append((accu, accu_new))
        updates.append(
            (param,
             param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))))

    return updates
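
The returned pairs are meant to be handed to cgt.function, just like the other rules in this file. A minimal hedged sketch, assuming the rmsprop function above is in scope; W, the cost, and the step size are illustrative:

import cgt
import numpy as np

W = cgt.shared(np.zeros(10, dtype=cgt.floatX))
cost = cgt.sum(cgt.square(W - 1.0))   # toy objective, assumed for illustration
train_step = cgt.function([], cost, updates=rmsprop(cost, [W], learning_rate=0.1))
for _ in range(100):
    train_step()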
Example #8
File: nn.py Project: EdsterG/cgt
def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates
    The learning rate is divided by a moving root-mean-square (RMS) of recent gradients. See [1].

    Math:
    * ``accu_new = rho * accu + (1 - rho) * grad ** 2``
    * ``param = param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    rho : float
        Controls decay of gradient moving average.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form (param, param_new), (accumulated_RMS_grads, accumulated_RMS_grads_new)

    References
    ----------
    .. [1] Yann N. Dauphin, Harm de Vries, Junyoung Chung, Yoshua Bengio (2015):
           RMSProp and equilibrated adaptive learning rates for non-convex optimization
           arXiv:1502.04390 http://arxiv.org/abs/1502.04390
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad / cgt.sqrt(accu_new + epsilon))))

    return updates
Example #9
File: nn.py Project: x724/cgt
def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6):
    """Adagrad updates
    The learning rate is scaled by dividing it by the square root of the sum of accumulated squared gradients.

    Math:
    * ``accu_new = accu + grad ** 2``
    * ``param = param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form [(param, param_new), (accumulated_grads, accumulated_grads_new)]

    References
    ----------
    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
           Adaptive subgradient methods for online learning and stochastic
           optimization. JMLR, 12:2121-2159.
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        value = param.op.get_value()
        accu = cgt.shared(np.zeros(value.shape, dtype=value.dtype))
        accu_new = accu + grad**2
        updates.append((accu, accu_new))
        updates.append(
            (param,
             param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)))

    return updates
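
A short hedged sketch of the structure of the returned list for a single parameter, assuming the adagrad function above is in scope; W, the cost, and the step size are illustrative:

import cgt
import numpy as np

W = cgt.shared(np.zeros(5, dtype=cgt.floatX))
cost = cgt.sum(cgt.square(W - 1.0))   # toy objective, assumed for illustration
updates = adagrad(cost, [W], learning_rate=0.5)

# Two pairs per parameter, in the order they were appended above:
# (accu, accu_new) followed by (param, param_new).
assert len(updates) == 2
train_step = cgt.function([], cost, updates=updates)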
Example #10
File: nn.py Project: EdsterG/cgt
def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6):
    """Adagrad updates
    The learning rate is scaled by dividing it by the square root of the sum of accumulated squared gradients.

    Math:
    * ``accu_new = accu + grad ** 2``
    * ``param = param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)``

    Parameters
    ----------
    cost : a scalar loss.
    params : a list of cgt shared variables. We generate update
            expressions w.r.t. these variables.
    learning_rate : float
        Tunes the size of the update step.
    epsilon : float
        Avoid division by 0 while scaling. Small constant.

    Returns
    -------
    list of tuples of the form [(param, param_new), (accumulated_grads, accumulated_grads_new)]

    References
    ----------
    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
           Adaptive subgradient methods for online learning and stochastic
           optimization. JMLR, 12:2121-2159.
    """

    updates = []
    grads = cgt.grad(cost, params)

    for param, grad in zip(params, grads):
        assert isinstance(param.op, core.GetData)
        accu = cgt.shared(np.zeros(param.op.get_shape(), dtype=param.dtype))
        accu_new = accu + grad ** 2
        updates.append((accu, accu_new))
        updates.append((param, param - (learning_rate * grad) / cgt.sqrt(accu_new + epsilon)))

    return updates
Example #11
def sqrt(x):
    return cgt.sqrt(x)