Example #1
def test_get_or_compute_grads_raises():

    from lasagne.updates import get_or_compute_grads

    A = T.scalar()
    B = T.scalar()
    loss = A + B
    grads = get_or_compute_grads(loss, [A, B])

    assert get_or_compute_grads(grads, [A, B]) is grads

    with pytest.raises(ValueError):
        get_or_compute_grads(grads, [A])
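
For reference, the pass-through behaviour this test exercises can be summarised with a minimal sketch (an illustration only, not the actual Lasagne source; newer Lasagne versions additionally require `params` to contain shared variables only, as exercised in Example #21 below):

# Simplified, illustrative sketch of get_or_compute_grads' dispatch logic.
import theano

def get_or_compute_grads_sketch(loss_or_grads, params):
    if isinstance(loss_or_grads, list):
        # A list is assumed to already hold one gradient expression per parameter.
        if len(loss_or_grads) != len(params):
            raise ValueError("Got %d gradient expressions for %d parameters" %
                             (len(loss_or_grads), len(params)))
        return loss_or_grads
    # Otherwise it is a scalar loss expression: differentiate it.
    return theano.grad(loss_or_grads, params)
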
Example #3
def opdac_rmsprop(q_vals, acts_t, u_acts, u_params, learning_rate, WhetherDirect):
    if WhetherDirect:
        # TODO: revisit this -- why can't the original take the gradient directly?
        # u_params do not appear in this computation graph at all.
        return sgd(q_vals, u_params, learning_rate)
    else:
        # TODO: theano can only differentiate a scalar expression; what to do when
        # there is no scalar?
        q2a_grads = get_or_compute_grads(q_vals, acts_t)
        # TODO: this logic is not right; it only works because there is a single
        # action for now. Later, consider theano's jacobian instead.
        q2a_grads = T.sum(q2a_grads)
        a2w_grads = get_or_compute_grads(u_acts, u_params)
        # TODO: Done -- multiply the two gradient lists element-wise like this.
        grads = [a2w_grad * q2a_grads for a2w_grad in a2w_grads]
        updates = OrderedDict()
        for param, grad in zip(u_params, grads):
            # Done: the objective should be maximized, so the update uses '+'.
            updates[param] = param + learning_rate * grad

        return updates
Example #4
def sgd(loss_or_grads, params, learning_rate):
    """Stochastic Gradient Descent (SGD) updates
    Generates update expressions of the form:
    * ``param := param - learning_rate * gradient``
    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params : list of shared variables
        The variables to generate update expressions for
    learning_rate : float or symbolic scalar
        The learning rate controlling the size of update steps
    Returns
    -------
    OrderedDict
        A dictionary mapping each parameter to its update expression
    """
    # The goal here is simple: obtain the correct grads.
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        updates[param] = param - learning_rate * grad

    return updates
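
A minimal usage sketch of the sgd() updates above on a made-up toy problem (the names and the loss are illustrative only):

import theano

w = theano.shared(5.0, name='w')    # parameter to optimize
toy_loss = (w - 3.0) ** 2           # minimized at w == 3

train = theano.function([], toy_loss,
                        updates=sgd(toy_loss, [w], learning_rate=0.1))
for _ in range(100):
    train()
print(w.get_value())                # approaches 3.0
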
Example #5
def careful_rmsprop(loss_or_grads,
                    params,
                    learning_rate=1.0,
                    rho=0.9,
                    epsilon=1e-6,
                    grad_clipping=1.0e-2):
    """
  RMSProp with gradient clipping.
  :param grad_clipping: maximal norm of the gradient; if the norm of the actual gradient exceeds this value, it is rescaled.
  :return: updates
  """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    grads = total_norm_constraint(grads,
                                  max_norm=grad_clipping,
                                  epsilon=epsilon)

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad**2
        updates[accu] = accu_new
        updates[param] = param - (learning_rate * grad /
                                  T.sqrt(accu_new + epsilon))

    return updates
Example #6
def adam(loss_or_grads, params, learning_rate=0.001, beta1=0.9,
         beta2=0.999, epsilon=1e-8):
    """
    Exact copy from Lasagne updates, except also return expressions for the
    update step of each param.
    """
    all_grads = LU.get_or_compute_grads(loss_or_grads, params)
    t_prev = theano.shared(lasagne.utils.floatX(0.))
    updates = OrderedDict()
    steps = list()
    one = T.constant(1)
    t = t_prev + 1
    a_t = learning_rate * T.sqrt(one - beta2 ** t) / (one - beta1 ** t)

    for param, g_t in zip(params, all_grads):
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)

        m_t = beta1 * m_prev + (one - beta1) * g_t
        v_t = beta2 * v_prev + (one - beta2) * g_t ** 2
        step = a_t * m_t / (T.sqrt(v_t) + epsilon)

        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[param] = param - step

        steps.append(step)

    updates[t_prev] = t
    return updates, steps
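
A short sketch of why the extra `steps` list is returned: it lets you compile a training function that also reports the magnitude of each parameter update. The helper below and its arguments are made up for illustration:

import theano
import theano.tensor as T

def compile_train_with_step_norms(loss, params, inputs):
    updates, steps = adam(loss, params)
    step_norms = [T.sqrt(T.sum(s ** 2)) for s in steps]
    return theano.function(inputs, [loss] + step_norms, updates=updates)
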
def adam_kingma(loss_or_grads, params, learning_rate=0.001, b1=0.9, b2=0.999, e=1e-8, gamma=1-1e-8):
    """
    ADAM update rules
    Default values are taken from [Kingma2014]

    References:
    [Kingma2014] Kingma, Diederik, and Jimmy Ba.
    "Adam: A Method for Stochastic Optimization."
    arXiv preprint arXiv:1412.6980 (2014).
    http://arxiv.org/pdf/1412.6980v4.pdf

    """
    updates = []
    all_grads = get_or_compute_grads(loss_or_grads, params)
    alpha = learning_rate
    t = theano.shared(np.float32(1))
    b1_t = b1*gamma**(t-1)  # Decay the first moment running average coefficient

    for theta_previous, g in zip(params, all_grads):
        m_previous = theano.shared(np.zeros(theta_previous.get_value().shape, dtype=theano.config.floatX))
        v_previous = theano.shared(np.zeros(theta_previous.get_value().shape, dtype=theano.config.floatX))

        m = b1_t*m_previous + (1 - b1_t)*g  # Update biased first moment estimate
        v = b2*v_previous + (1 - b2)*g**2   # Update biased second raw moment estimate
        m_hat = m / (1-b1**t)   # Compute bias-corrected first moment estimate
        v_hat = v / (1-b2**t)   # Compute bias-corrected second raw moment estimate
        theta = theta_previous - (alpha * m_hat) / (T.sqrt(v_hat) + e)  # Update parameters

        updates.append((m_previous, m))
        updates.append((v_previous, v))
        updates.append((theta_previous, theta))
    updates.append((t, t + 1.))
    return updates
def deepmind_rmsprop(loss_or_grads,
                     params,
                     learning_rate=0.00025,
                     rho=0.95,
                     epsilon=0.01):
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)

        acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)
        acc_grad_new = rho * acc_grad + (1 - rho) * grad

        acc_rms = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=param.broadcastable)
        acc_rms_new = rho * acc_rms + (1 - rho) * grad**2

        updates[acc_grad] = acc_grad_new
        updates[acc_rms] = acc_rms_new

        updates[param] = (
            param - learning_rate *
            (grad / T.sqrt(acc_rms_new - acc_grad_new**2 + epsilon)))

    return updates
Example #9
    def u(loss_or_grads, params, *args, **kwargs):
        grads = get_or_compute_grads(loss_or_grads, params)
        grads = total_norm_constraint(grads,
                                      max_norm=grad_clipping,
                                      epsilon=epsilon)

        return updates(grads, params, *args, **kwargs)
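
Only the inner closure is shown above; a plausible enclosing factory (a sketch: the original outer function is not included, so its name and defaults are assumptions) would be:

from lasagne.updates import get_or_compute_grads, total_norm_constraint

def with_grad_clipping(updates, grad_clipping=1.0, epsilon=1e-6):
    # Wrap any Lasagne-style update function so gradients are norm-clipped first.
    def u(loss_or_grads, params, *args, **kwargs):
        grads = get_or_compute_grads(loss_or_grads, params)
        grads = total_norm_constraint(grads,
                                      max_norm=grad_clipping,
                                      epsilon=epsilon)
        return updates(grads, params, *args, **kwargs)
    return u

# e.g. clipped_adam = with_grad_clipping(lasagne.updates.adam, grad_clipping=5.0)
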
Example #10
def deepmind_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """RMSProp updates [1]_.
    Scale learning rates by dividing with the moving average of the root mean
    squared (RMS) gradients.
    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params : list of shared variables
        The variables to generate update expressions for
    learning_rate : float or symbolic scalar
        The learning rate controlling the size of update steps
    rho : float or symbolic scalar
        Gradient moving average decay factor
    epsilon : float or symbolic scalar
        Small value added for numerical stability
    Returns
    -------
    OrderedDict
        A dictionary mapping each parameter to its update expression
    Notes
    -----
    `rho` should be between 0 and 1. A value of `rho` close to 1 will decay the
    moving average slowly and a value close to 0 will decay the moving average
    fast.
    Using the step size :math:`\\eta` and a decay factor :math:`\\rho` the
    learning rate :math:`\\eta_t` is calculated as:
    .. math::
       s_t &= \\rho s_{t-1} + (1-\\rho)\\,g\\\\
       r_t &= \\rho r_{t-1} + (1-\\rho)\\,g^2\\\\
       \\eta_t &= \\frac{\\eta}{\\sqrt{r_t - s_t^2 + \\epsilon}}
    References
    ----------
    .. [1] Tieleman, T. and Hinton, G. (2012):
           Neural Networks for Machine Learning, Lecture 6.5 - rmsprop.
           Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20)
    """

    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)

        acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)
        acc_grad_new = rho * acc_grad + (1 - rho) * grad

        acc_rms = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=param.broadcastable)
        acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2

        updates[acc_grad] = acc_grad_new
        updates[acc_rms] = acc_rms_new

        updates[param] = (param - learning_rate *
                          (grad /
                           T.sqrt(acc_rms_new - acc_grad_new **2 + epsilon)))

    return updates
Example #11
def sign_rule(loss_or_grads, params, learning_rate):
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        updates[param] = param - learning_rate * T.sgn(grad)

    return updates
Example #12
def adam_svrg(loss_or_grads,
              params,
              learning_rate=0.001,
              beta1=0.9,
              beta2=0.999,
              epsilon=1e-8):
    all_grads = get_or_compute_grads(loss_or_grads, params)
    t_prev = []
    updates = []
    updates_of = []
    grads_adam = []
    for m_r in range(2):
        t_prev.append(theano.shared(utils.floatX(0.)))
        updates.append(OrderedDict())
        #        grads_adam.append([TT.matrix('eval_grad0'),TT.vector('eval_grad1'),TT.col('eval_grad3'),TT.vector('eval_grad4')])
        #        norm_adam.append([TT.matrix('eval_grad0'),TT.vector('eval_grad1'),TT.col('eval_grad3'),TT.vector('eval_grad4')])
        updates_of.append(OrderedDict())
        # Using theano constant to prevent upcasting of float32
        one = TT.constant(1)
        t = t_prev[-1] + 1
        if (m_r == 0):
            a_t = learning_rate * TT.sqrt(one - beta2**t) / (one - beta1**t)
        else:
            beta2 = 0.999
            a_t = learning_rate / 2 * TT.sqrt(one - beta2**t) / (one -
                                                                 beta1**t)
        i = 0
        l = []
        h = []
        for param, g_t in zip(params, all_grads):
            value = param.get_value(borrow=True)
            m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)
            v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)

            m_t = beta1 * m_prev + (one - beta1) * g_t
            v_t = beta2 * v_prev + (one - beta2) * g_t**2
            step = a_t * m_t / (TT.sqrt(v_t) + epsilon)
            #            eff_step = TT.sum(TT.square(step,None))
            h.append(TT.sum(TT.square(step)))
            l.append(TT.sum(TT.square(m_t)))
            updates[-1][m_prev] = m_t
            updates[-1][v_prev] = v_t
            updates_of[-1][param] = param - step
            i += 1

        updates[-1][t_prev[-1]] = t
        # Ratio of total squared step size to total squared first moment
        # (sum() avoids hard-coding the number of parameter tensors).
        grads_adam.append(TT.sqrt(sum(h) / sum(l)))
    return updates_of, grads_adam
Example #13
def sgdWithLrsClip(loss_or_grads,
                   params,
                   learning_rate=.01,
                   mu_lr=.01,
                   si_lr=.001,
                   focused_w_lr=.01,
                   momentum=.9,
                   verbose=False):
    '''
    Same as sgdWithLrs but applies clips after updates.
    '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    #momentum_params_list =[]
    f32 = np.float32
    if verbose:
        print("Params List", params)
    for param, grad in zip(params, grads):
        if verbose:
            print("param name", param.name, "shape:", param.eval().shape)
        #print("param name", param.name, "shape:", param.get_value().shape)

        #grad = clip_tensor(grad, -0.001, 0.001)
        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            updates[param] = clip_tensor(updates[param], f32(0.01), f32(0.99))

        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            updates[param] = clip_tensor(updates[param], f32(0.01), f32(0.5))

        elif param.name.find('focus') >= 0 and param.name.find('W') >= 0:
            updates[param] = param - (focused_w_lr * grad)
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            #updates[param] =clip_tensor(updates[param], -0.5, 0.5)
        else:
            updates[param] = param - learning_rate * grad
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            #if param.name.find('W')>=0:
            #print (param, grad, learning_rate)
    return updates
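
The clip_tensor helper is not defined in this snippet; under the straightforward assumption that it simply wraps T.clip, a sketch would be:

import theano.tensor as T

def clip_tensor(t, lower, upper):
    # Element-wise clipping of a symbolic tensor to [lower, upper].
    return T.clip(t, lower, upper)
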
Example #14
def santa_euler(loss_or_grads, params,
                learning_rate=1,
                lambda_=1e-5,
                sigma=0.99,
                A=1,
                burnin=0,
                rng=RandomStreams()):
    n = learning_rate
    tprev = theano.shared(utils.floatX(0.))
    t = tprev + 1
    all_grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    for param, g_t in zip(params, all_grads):
        s = rng.normal(size=param.shape)
        f = g_t
        b = A * t**lambda_

        value = param.get_value(borrow=True)
        vprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)
        aprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)

        gprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)

        if hasattr(n, "get_value"):
            n_ = n.get_value(borrow=True)
        else:
            n_ = n

        ufirst = (np.random.normal(size=value.shape) * np.sqrt(n_))
        ufirst = ufirst.astype(value.dtype)
        uprev = theano.shared(ufirst,
                              broadcastable=param.broadcastable)

        v = sigma * vprev + (1 - sigma) * f * f

        g = 1 / T.sqrt(lambda_ + T.sqrt(v))

        a = ifelse(t < burnin, aprev + (uprev * uprev - n / b), aprev)
        u = ifelse(t < burnin, (n / b) * (1 - g / gprev) / uprev + T.sqrt(2 * n / b * gprev) * s,
                   theano.shared(np.zeros(value.shape, dtype=value.dtype)))
        u = u + (1 - a) * uprev - n * g * f
        updates[param] = param + g * u
        updates[uprev] = u
        updates[aprev] = a
        updates[vprev] = v
        updates[gprev] = g
    updates[tprev] = t
    return updates
Example #15
    def u(loss_or_grads, params, *args, **kwargs):
        grads = get_or_compute_grads(loss_or_grads, params)

        if type(srng_or_seed) is int:
            srng = MRG_RandomStreams(srng_or_seed)
        elif srng_or_seed is None:
            srng = MRG_RandomStreams()
        else:
            srng = srng_or_seed

        noisy_grads = [
            g + srng.normal(size=g.shape, ndim=g.ndim, std=std) for g in grads
        ]

        return updates(noisy_grads, params, *args, **kwargs)
def adam_kingma(loss_or_grads,
                params,
                learning_rate=0.001,
                b1=0.9,
                b2=0.999,
                e=1e-8,
                gamma=1 - 1e-8):
    """
    ADAM update rules
    Default values are taken from [Kingma2014]

    References:
    [Kingma2014] Kingma, Diederik, and Jimmy Ba.
    "Adam: A Method for Stochastic Optimization."
    arXiv preprint arXiv:1412.6980 (2014).
    http://arxiv.org/pdf/1412.6980v4.pdf

    """
    updates = []
    all_grads = get_or_compute_grads(loss_or_grads, params)
    alpha = learning_rate
    t = theano.shared(np.float32(1))
    b1_t = b1 * gamma**(t - 1)  # Decay the first moment running average coefficient

    for theta_previous, g in zip(params, all_grads):
        m_previous = theano.shared(
            np.zeros(theta_previous.get_value().shape,
                     dtype=theano.config.floatX))
        v_previous = theano.shared(
            np.zeros(theta_previous.get_value().shape,
                     dtype=theano.config.floatX))

        m = b1_t * m_previous + (1 - b1_t) * g  # Update biased first moment estimate
        v = b2 * v_previous + (1 - b2) * g**2   # Update biased second raw moment estimate
        m_hat = m / (1 - b1**t)                 # Compute bias-corrected first moment estimate
        v_hat = v / (1 - b2**t)                 # Compute bias-corrected second raw moment estimate
        theta = theta_previous - (alpha * m_hat) / (T.sqrt(v_hat) + e)  # Update parameters

        updates.append((m_previous, m))
        updates.append((v_previous, v))
        updates.append((theta_previous, theta))
    updates.append((t, t + 1.))
    return updates
Example #17
def amsgrad(loss_or_grads,
            params,
            learning_rate=0.001,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-8,
            bias_correction=True):
    all_grads = get_or_compute_grads(loss_or_grads, params)
    t_prev = theano.shared(utils.floatX(0.))
    updates = OrderedDict()

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    t = t_prev + 1

    if bias_correction:
        a_t = learning_rate * T.sqrt(one - beta2**t) / (one - beta1**t)
    else:
        a_t = learning_rate

    for param, g_t in zip(params, all_grads):
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_hat_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)

        m_t = beta1 * m_prev + (one - beta1) * g_t
        v_t = beta2 * v_prev + (one - beta2) * g_t**2
        v_hat_t = T.maximum(v_hat_prev, v_t)
        step = a_t * m_t / (T.sqrt(v_hat_t) + epsilon)

        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[v_hat_prev] = v_hat_t
        updates[param] = param - step

    updates[t_prev] = t
    return updates
Example #18
def hard_rmsprop(loss_or_grads, params, learning_rate=1.0e-2, epsilon=1e-6):
    """
  Not an actual RMSProp: it just normalizes the gradient so that its norm equals the `learning_rate` parameter.
  Don't use unless you have to.

  :param loss_or_grads: loss to minimize
  :param params: params to optimize
  :param learning_rate: norm of the gradient
  :param epsilon: small number for computational stability.
  :return: updates
  """
    grads = get_or_compute_grads(loss_or_grads, params)
    gnorm = T.sqrt(sum(T.sum(g**2) for g in grads) + epsilon)
    grads = [g / gnorm for g in grads]

    updates = OrderedDict()

    for param, grad in zip(params, grads):
        updates[param] = param - learning_rate * grad

    return updates
Example #19
def graves_rmsprop(loss_or_grads, params, learning_rate=1e-4, chi=0.95, alpha=0.9, epsilon=1e-4):
    r"""
    Alex Graves' RMSProp [1]_.

    .. math ::
        n_i &= \chi n_{i-1} + (1 - \chi) \, \mathrm{grad}^2\\
        g_i &= \chi g_{i-1} + (1 - \chi) \, \mathrm{grad}\\
        \Delta_i &= \alpha \Delta_{i-1} - \mathrm{learning\_rate} \cdot \mathrm{grad} /
                  \sqrt{n_i - g_i^2 + \epsilon}\\
        w_i &= w_{i-1} + \Delta_i

    References
    ----------
    .. [1] Graves, Alex.
           "Generating Sequences With Recurrent Neural Networks", p.23
           arXiv:1308.0850

    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        n = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                          broadcastable=param.broadcastable)
        g = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                          broadcastable=param.broadcastable)
        delta = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)
        n_ip1 = chi * n + (1. - chi) * grad ** 2
        g_ip1 = chi * g + (1. - chi) * grad
        # Centered second moment, as in Graves (2013) and the formula above: n_i - g_i^2
        delta_ip1 = alpha * delta - learning_rate * grad / T.sqrt(n_ip1 -
                                                                  g_ip1 ** 2 + epsilon)
        updates[n] = n_ip1
        updates[g] = g_ip1
        updates[delta] = delta_ip1
        updates[param] = param + delta_ip1

    return updates
Example #20
def rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """
    Exact copy from Lasagne updates, except also return expressions for
    the update step of each param.
    """
    grads = LU.get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    steps = list()
    one = T.constant(1)
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
            broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new
        # updates[param] = param - (learning_rate * grad /
        #                           T.sqrt(accu_new + epsilon))
        step = learning_rate * grad / T.sqrt(accu_new + epsilon)
        updates[param] = param - step
        steps.append(step)

    return updates, steps
Example #21
def test_get_or_compute_grads():

    from lasagne.updates import get_or_compute_grads

    A = theano.shared(1)
    B = theano.shared(1)
    loss = A + B
    grads = get_or_compute_grads(loss, [A, B])

    assert get_or_compute_grads(grads, [A, B]) is grads

    with pytest.raises(ValueError):
        get_or_compute_grads(grads, [A])

    C = T.scalar()
    with pytest.raises(ValueError):
        get_or_compute_grads(A + C, [A, C])
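
The last assertion shows that gradients cannot be requested with respect to non-shared variables. One workaround, sketched here under the assumption that lasagne.utils.collect_shared_vars is available in the installed version (it returns the shared variables an expression depends on), is to collect the shared parameters first:

from lasagne.utils import collect_shared_vars

expr = A + C                             # depends on shared A and non-shared C
shared_only = collect_shared_vars(expr)  # -> [A]
grads_ok = get_or_compute_grads(expr, shared_only)
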
Example #23
def cruel_rmsprop(loss_or_grads,
                  params,
                  learning_rate=1.0,
                  rho=0.9,
                  epsilon=1e-6,
                  grad_clipping=1.0e-2,
                  param_clipping=1.0e-2):
    """
  A version of careful RMSProp for Wasserstein GAN.
  :param epsilon: small number for computational stability.
  :param grad_clipping: maximal norm of the gradient; if the norm of the actual gradient exceeds this value, it is rescaled.
  :param param_clipping: after each update, all params are clipped to [-`param_clipping`, `param_clipping`].
  :return: updates
  """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    grads = total_norm_constraint(grads,
                                  max_norm=grad_clipping,
                                  epsilon=epsilon)

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad**2
        updates[accu] = accu_new

        updated = param - (learning_rate * grad / T.sqrt(accu_new + epsilon))

        if param_clipping is not None:
            updates[param] = T.clip(updated, -param_clipping, param_clipping)
        else:
            updates[param] = updated

    return updates
Example #24
def sgdWithLrs(loss_or_grads,
               params,
               learning_rate=.01,
               mu_lr=.01,
               si_lr=.001,
               focused_w_lr=.01,
               momentum=.9):
    '''
    SGD with separate learning rates for the focus params mu, si, and w.
    '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    momentum_params_list = []
    print(params)
    for param, grad in zip(params, grads):
        # import pdb; pdb.set_trace()
        #grad = clip_tensor(grad, -0.01, 0.01)
        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            momentum_params_list.append(param)

        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
            #momentum_params_list.append(param)

        elif param.name.find('focus') >= 0:
            updates[param] = param - (focused_w_lr * grad)
            momentum_params_list.append(param)

        else:
            updates[param] = param - learning_rate * grad
            momentum_params_list.append(param)
            #print (param, grad, learning_rate)
    return apply_momentum(updates,
                          params=momentum_params_list,
                          momentum=momentum)
Example #25
def deepmind_rmsprop(loss_or_grads, params, learning_rate=0.00025,
                     rho=0.95, epsilon=0.01):
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)

        acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)
        acc_grad_new = rho * acc_grad + (1 - rho) * grad

        acc_rms = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=param.broadcastable)
        acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2

        updates[acc_grad] = acc_grad_new
        updates[acc_rms] = acc_rms_new

        updates[param] = (param - learning_rate *
                          (grad /
                           T.sqrt(acc_rms_new - acc_grad_new ** 2 + epsilon)))

    return updates
def sgdWithLearningRateDecay(loss_or_grads, params, learningRate,
                             learningRateDecay):

    from lasagne.updates import get_or_compute_grads

    import theano.tensor as T
    import theano

    from collections import OrderedDict
    from lasagne import utils

    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    t_prev = theano.shared(utils.floatX(0.))
    one = T.constant(1)

    t = t_prev + 1

    clr = learningRate / (1 + t * learningRateDecay)

    # http://leon.bottou.org/publications/pdf/tricks-2012.pdf
    # for example suggests (section 5.2)
    # "use learning rates of the form
    #  gamma_t = gamma_0 / (1 + gamma_0 * lambda * t)
    # determine the best gamma_0 using a small training
    # data sample"
    # (lambda / 2 is the coefficient of the weights norm
    #  of L2 regularization)

    for param, grad in zip(params, grads):
        updates[param] = param - clr * grad

    updates[t_prev] = t

    return updates
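
As a quick numeric illustration of the schedule above (values chosen arbitrarily):

def decayed_lr(learningRate, learningRateDecay, t):
    # Effective step size used by sgdWithLearningRateDecay at iteration t.
    return learningRate / (1.0 + t * learningRateDecay)

# decayed_lr(0.1, 0.01, 1)   ~= 0.0990
# decayed_lr(0.1, 0.01, 100) == 0.05
# decayed_lr(0.1, 0.01, 900) == 0.01
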
Example #27
    def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batchSize, network_type, 
                 update_rule, batch_accumulator, randomState, frame_scale=255.0):
        """ Initialize environment

        Arguments:
            environment - the environment (class Env) 
            num_elements_in_batch - list of k integers for the number of each element kept as belief state
            num_actions - int
            discount - float
            learning_rate - float
            rho, rms_epsilon, momentum - float, float, float
            ...
            network_type - string 
            ...           
        """

        self._environment = environment
        
        self._batchSize = batchSize
        self._inputDimensions = self._environment.inputDimensions()
        self._nActions = self._environment.nActions()
        self._df = 0
        self.rho = rho
        self._lr = 0
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self._randomState = randomState
        
        lasagne.random.set_rng(self._randomState)

        self.update_counter = 0
        
        states=[]   # list of symbolic variables for each of the k elements in the belief state
                    # --> [ T.tensor4 if the element is a matrix, T.tensor3 if a vector, T.matrix if a scalar ]
        next_states=[] # same as states, but at t+1
        self.states_shared=[] # list of shared variables for each of the k elements in the belief state
        self.next_states_shared=[] # same as self.states_shared, but at t+1

        for i, dim in enumerate(self._inputDimensions):
            if len(dim) == 3:
                states.append(T.tensor4("%s_%s" % ("state", i)))
                next_states.append(T.tensor4("%s_%s" % ("next_state", i)))

            elif len(dim) == 2:
                states.append(T.tensor3("%s_%s" % ("state", i)))
                next_states.append(T.tensor3("%s_%s" % ("next_state", i)))
                
            elif len(dim) == 1:            
                states.append( T.matrix("%s_%s" % ("state", i)) )
                next_states.append( T.matrix("%s_%s" % ("next_state", i)) )
                
            self.states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False))
            self.next_states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False))
        
        print("Number of observations per state: {}".format(len(self.states_shared)))
        print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._inputDimensions))
                
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
        thelr = T.scalar(name='thelr', dtype=theano.config.floatX)
        
        self.l_out, self.l_outs_conv, shape_after_conv = self._build(network_type, states)
        
        print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv))

        self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(network_type, next_states)
        self._resetQHat()

        self.rewards_shared = theano.shared(
            np.zeros((batchSize, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batchSize, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batchSize, 1), dtype='int32'),
            broadcastable=(False, True))


        q_vals = lasagne.layers.get_output(self.l_out)        
        
        next_q_vals = lasagne.layers.get_output(self.next_l_out)
        
        max_next_q_vals=T.max(next_q_vals, axis=1, keepdims=True)
        
        T_ones_like=T.ones_like(T.ones_like(terminals) - terminals)
        
        target = rewards + T_ones_like * thediscount * max_next_q_vals

        q_val=q_vals[T.arange(batchSize), actions.reshape((-1,))].reshape((-1, 1))

        diff = target - q_val

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)

        for conv_param in self.l_outs_conv:
            for p in lasagne.layers.helper.get_all_params(conv_param):
                params.append(p)
        
            
        givens = {
            rewards: self.rewards_shared,
            actions: self.actions_shared, ## actions not needed!
            terminals: self.terminals_shared
        }
        
        for i, x in enumerate(self.states_shared):
            givens[ states[i] ] = x 
        for i, x in enumerate(self.next_states_shared):
            givens[ next_states[i] ] = x
                
        if update_rule == 'deepmind_rmsprop':
            grads = get_or_compute_grads(loss, params)
            updates = deepmind_rmsprop(loss, params, grads, thelr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, thelr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, thelr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([thediscount, thelr], [loss, q_vals], updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
        givens2={}
        for i, x in enumerate(self.states_shared):
            givens2[ states[i] ] = x 

        self._q_vals = theano.function([], q_vals,
                                      givens=givens2,
                                      on_unused_input='warn')
Example #28
    def u(loss_or_grads, params, *args, **kwargs):
        grads = get_or_compute_grads(loss_or_grads, params)
        gnorm = T.sqrt(sum(T.sum(g**2) for g in grads) + epsilon)
        grads = [g / gnorm for g in grads]

        return updates(grads, params, *args, **kwargs)
Example #29
def sgdWithWeightSupress(loss_or_grads,
                         params,
                         learning_rate=.01,
                         mu_lr=.01,
                         si_lr=.001,
                         focused_w_lr=.01,
                         momentum=.9,
                         verbose=False):
    ''' This update function masks focus weights after they are updated.
    The idea is that weights outside of the focus function must be suppressed
    to prevent weight memory when the focus changes its position.

    To do this, I get the mu and si values of the focus layer, calculate a Gaussian
    window, scale it so the center is 1 but the outside is close to 0, and then
    multiply it with the weights.
    '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    #momentum_params_list =[]
    if verbose:
        print(params)
    for param, grad in zip(params, grads):

        #grad = clip_tensor(grad, -0.001, 0.001)
        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            updates[param] = clip_tensor(updates[param], 0.01, 0.99)

        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            updates[param] = clip_tensor(updates[param], 0.01, 0.5)

        elif param.name.find('focus') >= 0 and param.name.find('W') >= 0:
            param_layer_name = param.name.split(".")[0]
            mu_name = param_layer_name + '.mu'

            si_name = param_layer_name + ".si"
            mu_si_w = get_params_values_wkey(params,
                                             [mu_name, si_name, param.name])
            from focusing import U_numeric
            us = U_numeric(np.linspace(0, 1, mu_si_w[param.name].shape[0]),
                           mu_si_w[mu_name],
                           mu_si_w[si_name],
                           1,
                           normed=False)

            updates[param] = (param - (focused_w_lr * grad))

            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            # Mask the weights so they cannot stay outside the envelope
            us[us > 0.1] = 1.0
            updates[param] = updates[param] * us.T
        else:
            updates[param] = param - learning_rate * grad
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            #print (param, grad, learning_rate)
    return updates
Example #30
    def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta,
                 freeze_interval, batchSize, network_type, update_rule,
                 batch_accumulator, randomState):
        """ Initialize environment
        """
        QNetwork.__init__(self, environment, batchSize)

        self.rho = rho
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self._randomState = randomState

        lasagne.random.set_rng(self._randomState)

        self.update_counter = 0

        states = []  # list of symbolic variables for each of the k elements in the belief state
        # --> [ T.tensor4 if the element is a matrix, T.tensor3 if a vector, T.matrix if a scalar ]
        next_states = []  # same as states, but at t+1
        self.states_shared = []  # list of shared variables for each of the k elements in the belief state
        self.next_states_shared = []  # same as self.states_shared, but at t+1

        for i, dim in enumerate(self._inputDimensions):
            if len(dim) == 3:
                states.append(T.tensor4("%s_%s" % ("state", i)))
                next_states.append(T.tensor4("%s_%s" % ("next_state", i)))

            elif len(dim) == 2:
                states.append(T.tensor3("%s_%s" % ("state", i)))
                next_states.append(T.tensor3("%s_%s" % ("next_state", i)))

            elif len(dim) == 1:
                states.append(T.matrix("%s_%s" % ("state", i)))
                next_states.append(T.matrix("%s_%s" % ("next_state", i)))

            self.states_shared.append(
                theano.shared(np.zeros((batchSize, ) + dim,
                                       dtype=theano.config.floatX),
                              borrow=False))
            self.next_states_shared.append(
                theano.shared(np.zeros((batchSize, ) + dim,
                                       dtype=theano.config.floatX),
                              borrow=False))

        print("Number of observations per state: {}".format(
            len(self.states_shared)))
        print("For each observation, historySize + ponctualObs_i.shape: {}".
              format(self._inputDimensions))

        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
        thelr = T.scalar(name='thelr', dtype=theano.config.floatX)

        self.l_out, self.l_outs_conv, shape_after_conv = self._build(
            network_type, states)

        print(
            "Number of neurons after spatial and temporal convolution layers: {}"
            .format(shape_after_conv))

        self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(
            network_type, next_states)
        self._resetQHat()

        self.rewards_shared = theano.shared(np.zeros(
            (batchSize, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))

        self.actions_shared = theano.shared(np.zeros((batchSize, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))

        self.terminals_shared = theano.shared(np.zeros((batchSize, 1),
                                                       dtype='int32'),
                                              broadcastable=(False, True))

        q_vals = lasagne.layers.get_output(self.l_out)

        next_q_vals = lasagne.layers.get_output(self.next_l_out)

        max_next_q_vals = T.max(next_q_vals, axis=1, keepdims=True)

        T_ones_like = T.ones_like(T.ones_like(terminals) - terminals)

        target = rewards + T_ones_like * thediscount * max_next_q_vals

        q_val = q_vals[T.arange(batchSize),
                       actions.reshape((-1, ))].reshape((-1, 1))

        diff = target - q_val

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part**2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff**2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)

        for conv_param in self.l_outs_conv:
            for p in lasagne.layers.helper.get_all_params(conv_param):
                params.append(p)

        givens = {
            rewards: self.rewards_shared,
            actions: self.actions_shared,  ## actions not needed!
            terminals: self.terminals_shared
        }

        for i, x in enumerate(self.states_shared):
            givens[states[i]] = x
        for i, x in enumerate(self.next_states_shared):
            givens[next_states[i]] = x

        if update_rule == 'deepmind_rmsprop':
            grads = get_or_compute_grads(loss, params)
            updates = deepmind_rmsprop(loss, params, grads, thelr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, thelr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, thelr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([thediscount, thelr], [loss, q_vals],
                                      updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
        givens2 = {}
        for i, x in enumerate(self.states_shared):
            givens2[states[i]] = x

        self._q_vals = theano.function([],
                                       q_vals,
                                       givens=givens2,
                                       on_unused_input='warn')
Example #31
def santa_sss(loss_or_grads, params,
              learning_rate=1,
              lambda_=1e-5,
              sigma=0.99,
              A=1,
              burnin=0,
              rng=RandomStreams()):
    n = learning_rate
    tprev = theano.shared(utils.floatX(0.))
    t = tprev + 1
    all_grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    for param, g_t in zip(params, all_grads):
        s = rng.normal(size=param.shape)
        f = g_t
        b = A * t**lambda_

        value = param.get_value(borrow=True)
        vprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)
        aprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)

        gprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)
        if hasattr(n, "get_value"):
            n_ = n.get_value(borrow=True)
        else:
            n_ = n
        ufirst = (np.random.normal(size=value.shape) * np.sqrt(n_))
        ufirst = ufirst.astype(theano.config.floatX)
        uprev = theano.shared(ufirst,
                              broadcastable=param.broadcastable)

        v = sigma * vprev + (1 - sigma) * f * f

        g = 1 / T.sqrt(lambda_ + T.sqrt(v))

        a = aprev + (uprev * uprev - n/b) / 2.
        u = T.exp(-a/2) * uprev
        u = u - g * f * n + T.sqrt(2 * gprev * n/b) * s + n/b*(1 - g / gprev) / uprev
        u = T.exp(-a/2) * u
        a = a + (u * u - n/b) / 2.

        a_explr = a
        u_explr = u

        a = aprev
        u = T.exp(-a/2.) * uprev
        u = u - g * f * n
        u = T.exp(-a/2.) * u
        u_refine = u
        a_refine = a

        a = ifelse(t < burnin, a_explr, a_refine)
        u = ifelse(t < burnin, u_explr, u_refine)
        updates[param] = param + g * uprev / 2. + g * u / 2.
        updates[uprev] = u
        updates[aprev] = a
        updates[vprev] = v
        updates[gprev] = g
    updates[tprev] = t
    return updates
Example #32
def adamax(loss_or_grads, params, learning_rate=0.002, beta1=0.9,
           beta2=0.95, epsilon=1e-8, scale_factor=None):
  """
  This version additionally returns a reset dictionary that rescales the momentum params by `scale_factor`.
  Intended to be used for inner optimization in max-min problems.

  Adamax updates
  Adamax updates implemented as in [1]_. This is a variant of the Adam
  algorithm based on the infinity norm.
  Parameters
  ----------
  loss_or_grads : symbolic expression or list of expressions
      A scalar loss expression, or a list of gradient expressions
  params : list of shared variables
      The variables to generate update expressions for
  learning_rate : float or symbolic scalar
      Learning rate
  beta1 : float or symbolic scalar
      Exponential decay rate for the first moment estimates.
  beta2 : float or symbolic scalar
      Exponential decay rate for the weighted infinity norm estimates.
  epsilon : float or symbolic scalar
      Constant for numerical stability.
  scale_factor: float or None
    Constant for rescaling the momentum parameters on reset: the first moment is
    divided by `scale_factor`; the second moment is left unchanged.
    If None, the first moment is reset to zero.
  Returns
  -------
  OrderedDict
      A dictionary mapping each parameter to its update expression

  OrderedDict
      A dictionary mapping each parameter to its reset expression.

  References
  ----------
  .. [1] Kingma, Diederik, and Jimmy Ba (2014):
         Adam: A Method for Stochastic Optimization.
         arXiv preprint arXiv:1412.6980.
  """
  all_grads = get_or_compute_grads(loss_or_grads, params)
  t_prev = theano.shared(utils.floatX(0.))
  updates = OrderedDict()
  resets = OrderedDict()

  if scale_factor is not None:
    scale_factor = utils.floatX(scale_factor)

  # Using theano constant to prevent upcasting of float32
  one = T.constant(1)

  t = t_prev + 1
  a_t = learning_rate / (one - beta1 ** t)

  for param, g_t in zip(params, all_grads):
    value = param.get_value(borrow=True)
    m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                           broadcastable=param.broadcastable)
    u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                           broadcastable=param.broadcastable)

    m_t = beta1 * m_prev + (one - beta1) * g_t
    u_t = T.maximum(beta2 * u_prev, abs(g_t))
    step = a_t * m_t / (u_t + epsilon)

    updates[m_prev] = m_t
    updates[u_prev] = u_t
    updates[param] = param - step

    resets[m_prev] = (
      m_prev / scale_factor
      if scale_factor is not None else
      T.zeros(value.shape, value.dtype)
    )
    ### no need to reset second momentum.

  updates[t_prev] = t

  return updates, resets
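
A sketch of how the (updates, resets) pair might be driven in the max-min setting the docstring mentions (the helper below, its arguments, and the choice of scale_factor are assumptions for illustration):

import theano

def compile_inner_optimizer(inner_loss, inner_params, inputs):
    inner_updates, inner_resets = adamax(inner_loss, inner_params,
                                         scale_factor=2.0)
    inner_step = theano.function(inputs, inner_loss, updates=inner_updates)
    reset_moments = theano.function([], updates=inner_resets)
    return inner_step, reset_moments
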
Example #33
def sgdWithLrLayers(loss_or_grads,
                    params,
                    learning_rate=.01,
                    mu_lr=.01,
                    si_lr=.001,
                    focused_w_lr=.01,
                    momentum=.9):
    '''
    Updates each layer's parameters with a different learning rate.
    Under development.
    '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    #momentum_params_list =[]
    #print(params)
    for param, grad in zip(params, grads):
        # import pdb; pdb.set_trace()
        grad = clip_tensor(grad, -0.01, 0.01)
        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum / 2)
            updates[param] = clip_tensor(updates[param], 0.05, 0.95)
            #momentum_params_list.append(param)
            #print (param,mu_lr)
            #print (param, grad, mu_lr)
        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
            #momentum_params_list.append(param)
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            updates[param] = clip_tensor(updates[param], 0.01, 0.5)

            #print (param,si_lr)
            #print (param, grad, si_lr)
            #print (param, grad, scaler_lr)
        elif param.name.find('focus') >= 0 and (param.name.find('W') >= 0 or
                                                param.name.find('bias') >= 0):
            level = int(str.split(param.name, '-')[1].split('.')[0])
            #print(param.name, level)
            updates[param] = param - (learning_rate * (1. /
                                                       (level + 1))) * grad
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            if (param.name.find('W') >= 0):
                updates[param] = clip_tensor(updates[param], -0.4, 0.4)
            #momentum_params_list.append(param)
            #print (param,focused_w_lr)
        elif param.name.find('W') >= 0 or param.name.find('b') >= 0:
            if param.name.find('-') >= 0:
                level = int(str.split(param.name, '-')[1].split('.')[0])
                updates[param] = param - (learning_rate * (1. / level)) * grad
                updates = apply_momentum(updates,
                                         params=[param],
                                         momentum=momentum)
            else:
                updates[param] = param - (learning_rate) * grad
            #momentum_params_list.append(param)
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            if (param.name.find('W') >= 0):
                updates[param] = clip_tensor(updates[param], -0.4, 0.4)

            if (param.name.find('b') >= 0):
                updates[param] = clip_tensor(updates[param], -1.0, 1.0)
        else:
            updates[param] = param - (learning_rate) * grad
            #momentum_params_list.append(param)
            updates = apply_momentum(updates,
                                     params=[param],
                                     momentum=momentum)
            if (param.name.find('beta') >= 0):
                updates[param] = clip_tensor(updates[param], -1., 1.)
            #print (param, grad, learning_rate)

    return updates