Example 1
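All five snippets rely on the same module-level imports, which are not shown on this page. A plausible header is sketched below; the exact import path of `netutils` and the choice of `RandomStreams` implementation are assumptions.

import numpy as np
import theano as th
import theano.tensor as T
import theano.ifelse
from theano.tensor.shared_randomstreams import RandomStreams  # assumption: could also be the MRG variant

import netutils  # assumption: project helper providing getbaggage(param, key, default)
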
def momsgd(params, cost=None, gradients=None, learningrate=0.01, momentum=0.9, nesterov=True):
    """
    Computes the updates for Stochastic Gradient Descent with momentum (plain or Nesterov).

    :type params: list
    :param params: Network parameters.

    :type cost: theano.tensor.var.TensorVariable
    :param cost: Cost variable (scalar). Optional if the gradient is provided.

    :type gradients: list
    :param gradients: Gradient of a cost w.r.t. parameters. Optional if the cost is provided.

    :type learningrate: theano.tensor.var.TensorVariable or float
    :param learningrate: Learning rate. Can be a float (static) or a dynamic theano variable.

    :type momentum: theano.tensor.var.TensorVariable or float
    :param momentum: Momentum coefficient.

    :type nesterov: bool
    :param nesterov: Whether to use Nesterov momentum.

    :return: List of updates
    """

    # Validate input
    assert not (cost is None and gradients is None), "Update function momsgd requires either a cost scalar or a " \
                                                     "list of gradients."

    # Compute gradients if requested
    if gradients is None and cost is not None:
        pdC = T.grad(cost, wrt=params)
        # Kill gradients if cost is nan
        dC = [th.ifelse.ifelse(T.isnan(cost), T.zeros_like(dparam), dparam) for dparam in pdC]
    else:
        dC = gradients

    # Init update list
    updates = []

    for param, dparam in zip(params, dC):
        # Check if layer is trainable. Skip if not.
        if not netutils.getbaggage(param, 'trainable', True):
            continue

        # Check if learningrate is to be overridden
        if netutils.getbaggage(param, 'learningrate', False):
            # Override
            lr = param.baggage['learningrate']
        else:
            # Nothing to override
            lr = learningrate

        # Fetch parameter shape
        paramshape = param.get_value().shape
        # ... and init initial momentum
        mom = th.shared(np.zeros(paramshape, dtype=th.config.floatX))
        # Compute velocity
        vel = momentum * mom - lr * dparam

        # Compute new parameters
        if nesterov:
            newparam = param + momentum * vel - lr * dparam
        else:
            newparam = param + vel

        # Update the update list
        updates.append((param, newparam))
        updates.append((mom, vel))

    # Return
    return updates
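
A minimal usage sketch follows. The toy model and all names in it are illustrative, and it is assumed that netutils.getbaggage falls back to its default for parameters that carry no baggage dict.

# Illustrative toy model: parameters must be Theano shared variables.
x = T.matrix('x')
y = T.ivector('y')
W = th.shared(np.zeros((784, 10), dtype=th.config.floatX))
b = th.shared(np.zeros(10, dtype=th.config.floatX))
cost = T.nnet.categorical_crossentropy(T.nnet.softmax(T.dot(x, W) + b), y).mean()

# Build the update list and compile one training step.
updates = momsgd([W, b], cost=cost, learningrate=0.01, momentum=0.9, nesterov=True)
train = th.function([x, y], cost, updates=updates)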
Example 2
def rmsprop(params, cost=None, gradients=None, learningrate=0.0005, rho=0.9, epsilon=1e-6):
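    """
    Computes the updates for RMSProp.

    :type params: list
    :param params: Network parameters.

    :type cost: theano.tensor.var.TensorVariable
    :param cost: Cost variable (scalar). Optional if the gradient is provided.

    :type gradients: list
    :param gradients: Gradient of a cost w.r.t. parameters. Optional if the cost is provided.

    :type learningrate: theano.tensor.var.TensorVariable or float
    :param learningrate: Learning rate. Can be a float (static) or a dynamic theano variable.

    :type rho: float
    :param rho: Decay rate of the moving average of squared gradients.

    :type epsilon: float
    :param epsilon: Fuzz factor for numerical stability.

    :return: List of updates
    """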

    # Validate input
    assert not (cost is None and gradients is None), "Update function rmsprop requires either a cost scalar or a " \
                                                     "list of gradients."

    # Compute gradients if requested
    if gradients is None and cost is not None:
        pdC = T.grad(cost, wrt=params)
        # Kill gradients if cost is nan
        dC = [th.ifelse.ifelse(T.isnan(cost), T.zeros_like(dparam), dparam) for dparam in pdC]
    else:
        dC = gradients

    # Init update list
    updates = []

    for param, dparam in zip(params, dC):
        # Check if layer is trainable. Skip if not.
        if not netutils.getbaggage(param, 'trainable', True):
            continue

        # Fetch parameter shape
        paramshape = param.get_value().shape
        # Init accumulator for the moving average of squared gradients
        acc = th.shared(np.zeros(paramshape, dtype=th.config.floatX))
        # Update accumulator and rescale the gradient
        newacc = rho * acc + (1 - rho) * dparam ** 2
        gradscale = T.sqrt(newacc + epsilon)
        dparam = dparam / gradscale
        # Update the update list
        updates.append((acc, newacc))
        updates.append((param, param - learningrate * dparam))

    return updates
Example 3
def Lp(params, p=2):
    """
    Given a list of parameters, compute the p-th power of its Lp norm.

    :type params: list
    :param params: Parameters to take the Lp norm of.

    :type p: int
    :param p: p of the Lp norm. Defaults to 2.

    :return: (Lp norm)^p
    """

    # Compute Lp^p
    lpn = sum(T.sum(param ** p) for param in params
              if netutils.getbaggage(param, 'regularizable', True))

    # Return
    return lpn
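
A sketch of how this would typically be combined with one of the update functions below, reusing the toy model from the sketch after Example 1 (the decay coefficient is illustrative, and it is again assumed that getbaggage falls back to its default for parameters without baggage):

# Illustrative only: add an L2 penalty to the task cost before taking gradients.
l2decay = 1e-4
regcost = cost + l2decay * Lp([W, b], p=2)
updates = sgd([W, b], cost=regcost, learningrate=1e-4)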
Example 4
def sgd(params, cost=None, gradients=None, learningrate=1e-4):
    """
    Computes the updates for Stochastic Gradient Descent (without momentum).

    :type params: list
    :param params: Network parameters.

    :type cost: theano.tensor.var.TensorVariable
    :param cost: Cost variable (scalar). Optional if the gradient is provided.

    :type gradients: list
    :param gradients: Gradient of a cost w.r.t. parameters. Optional if the cost is provided.

    :type learningrate: theano.tensor.var.TensorVariable or float
    :param learningrate: Learning rate of SGD. Can be a float (static) or a dynamic theano variable.

    :return: List of updates
    """

    # Validate input
    assert not (cost is None and gradients is None), "Update function sgd requires either a cost scalar or a list of " \
                                                     "gradients."

    # Compute gradients if requested
    if gradients is None and cost is not None:
        pdC = T.grad(cost, wrt=params)
        # Kill gradients if cost is nan
        dC = [th.ifelse.ifelse(T.isnan(cost), T.zeros_like(dparam), dparam) for dparam in pdC]
    else:
        dC = gradients

    # Compute updates
    upd = [(param, param - learningrate * dparam) for param, dparam in zip(params, dC)
           if netutils.getbaggage(param, 'trainable', True)]

    # Return
    return upd
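
The docstring allows `learningrate` to be a dynamic Theano variable. A sketch of that pattern, again reusing the toy model from the sketch after Example 1, is a shared scalar that can be decayed between epochs without recompiling:

# Illustrative only: a shared learning rate that can be changed at runtime.
lrate = th.shared(np.asarray(1e-4, dtype=th.config.floatX))
updates = sgd([W, b], cost=cost, learningrate=lrate)
train = th.function([x, y], cost, updates=updates)

# Later, e.g. after each epoch:
lrate.set_value(lrate.get_value() * np.asarray(0.5, dtype=th.config.floatX))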
Example 5
def adam(params, cost=None, gradients=None, learningrate=0.0002, beta1=0.9, beta2=0.999, epsilon=1e-8, eta=0.,
         gamma=0.55, iterstart=0):
    """
    Computes the updates for ADAM.

    :type params: list
    :param params: Network parameters.

    :type cost: theano.tensor.var.TensorVariable
    :param cost: Cost variable (scalar). Optional if the gradient is provided.

    :type gradients: list
    :param gradients: Gradient of a cost w.r.t. parameters. Optional if the cost is provided.

    :type learningrate: theano.tensor.var.TensorVariable or float
    :param learningrate: Learning rate. Can be a float (static) or a dynamic theano variable.

    :type beta1: float
    :param beta1: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980

    :type beta2: float
    :param beta2: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980

    :type epsilon: float
    :param epsilon: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980

    :type eta: float
    :param eta: Eta for noisy gradient. See Neelakantan et al. 2015: http://arxiv.org/pdf/1511.06807v1.pdf

    :type gamma: float
    :param gamma: Gamma for noisy gradient. See Neelakantan et al. 2015: http://arxiv.org/pdf/1511.06807v1.pdf

    :type iterstart: int or float
    :param iterstart: Adam's effective step size changes with the iteration count. This parameter sets the initial
                      value of that count, so that the step size is scaled correctly when resuming training (otherwise
                      the model might jump out of the minimum it is currently sitting in).

    :return: List of updates
    """

    # Validate input
    assert not (cost is None and gradients is None), "Update function adam requires either a cost scalar or a list of " \
                                                     "gradients."

    # Compute gradients if requested
    if gradients is None and cost is not None:
        pdC = T.grad(cost, wrt=params)
        # Kill gradients if cost is nan
        dC = [th.ifelse.ifelse(T.isnan(cost), T.zeros_like(dparam), dparam) for dparam in pdC]
    else:
        dC = gradients

    updates = []

    # Gradient noising
    if eta != 0:
        # RNG
        srng = RandomStreams()
        # Iteration counter
        itercount = th.shared(np.asarray(iterstart, dtype=th.config.floatX))
        # Add noise
        dC = [dparam + srng.normal(size=dparam.shape, std=T.sqrt(eta/(1 + itercount)**gamma), dtype='floatX')
              for dparam in dC]
        # Update itercount
        updates.append((itercount, itercount + 1))

    # Implementation as in reference paper, nothing spectacular here...
    tm1 = th.shared(np.asarray(iterstart, dtype=th.config.floatX))
    t = tm1 + 1
    at = T.sqrt(1-beta2**t)/(1-beta1**t)

    for param, dparam in zip(params, dC):
        # Check if layer is trainable. Skip if not.
        if not netutils.getbaggage(param, 'trainable', True):
            continue

        # Check if learningrate is to be overridden
        if netutils.getbaggage(param, 'learningrate', False):
            # Override
            lr = param.baggage['learningrate']
        else:
            # Nothing to override
            lr = learningrate

        paramshape = param.get_value().shape

        mtm1 = th.shared(np.zeros(paramshape, dtype=th.config.floatX))
        vtm1 = th.shared(np.zeros(paramshape, dtype=th.config.floatX))

        mt = beta1 * mtm1 + (1 - beta1) * dparam
        vt = beta2 * vtm1 + (1 - beta2) * dparam**2
        u = lr * at * mt/(T.sqrt(vt) + epsilon)

        updates.append((mtm1, mt))
        updates.append((vtm1, vt))
        updates.append((param, param - u))

    updates.append((tm1, t))

    return updates
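
The `trainable` and `learningrate` checks above read per-parameter flags from a `baggage` dict attached to the shared variable (see the direct access to `param.baggage['learningrate']`). A sketch of attaching such flags, assuming `netutils.getbaggage` simply looks the key up in `param.baggage` with a fallback default:

# Illustrative only: per-parameter flags via the baggage dict.
W = th.shared(np.zeros((784, 10), dtype=th.config.floatX))
b = th.shared(np.zeros(10, dtype=th.config.floatX))
W.baggage = {'learningrate': 1e-5}   # overrides the global learning rate for W
b.baggage = {'trainable': False}     # b is skipped when building the update list

cost = T.sum(W ** 2) + T.sum(b ** 2)  # trivial scalar cost, just for illustration
updates = adam([W, b], cost=cost, learningrate=2e-4)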