def test_get_or_compute_grads_raises():
    from lasagne.updates import get_or_compute_grads

    A = T.scalar()
    B = T.scalar()
    loss = A + B
    grads = get_or_compute_grads(loss, [A, B])
    assert get_or_compute_grads(grads, [A, B]) is grads
    with pytest.raises(ValueError):
        get_or_compute_grads(grads, [A])
def opdac_rmsprop(q_vals, acts_t, u_acts, u_params, learning_rate,
                  WhetherDirect):
    if WhetherDirect:
        # TODO: check against the paper why the direct derivative cannot be
        # taken here -- u_params do not appear in this computation graph
        # at all.
        return sgd(q_vals, u_params, learning_rate)
    else:
        # TODO: theano seems to only differentiate scalar expressions;
        # unclear how to handle a non-scalar q_vals.
        q2a_grads = get_or_compute_grads(q_vals, acts_t)
        # TODO: this sum is not correct in general; it only works because
        # there is a single action for now. Consider using theano's
        # jacobian later.
        q2a_grads = T.sum(q2a_grads)
        a2w_grads = get_or_compute_grads(u_acts, u_params)
        # Done: chain rule -- scale each dA/dW by the scalar dQ/dA.
        grads = [a2w_grad * q2a_grads for a2w_grad in a2w_grads]
        updates = OrderedDict()
        for param, grad in zip(u_params, grads):
            # Done: the objective is to *increase* Q, so the update
            # direction was corrected to "+" (gradient ascent).
            updates[param] = param + learning_rate * grad
        return updates
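
# The TODOs above concern differentiating a non-scalar q_vals: T.grad only
# handles scalar costs, and summing dQ/da is only valid for a single action.
# A minimal sketch (with made-up stand-in expressions, not the actual
# actor/critic from this code) of the chain rule dQ/dw = (dQ/da)^T (da/dw)
# using theano.gradient.jacobian for a vector-valued action:
def _dpg_chain_rule_sketch():
    import theano
    import theano.tensor as T

    w = T.vector('w')                        # stand-in actor parameters
    a = T.tanh(w) * 2.0                      # stand-in action vector a(w)
    q = T.sum(a ** 2)                        # stand-in scalar critic Q(a)

    dq_da = T.grad(q, a)                     # gradient w.r.t. the action
    da_dw = theano.gradient.jacobian(a, w)   # full Jacobian, not a sum
    return T.dot(dq_da, da_dw)               # equals T.grad(q, w)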
def sgd(loss_or_grads, params, learning_rate):
    """Stochastic Gradient Descent (SGD) updates

    Generates update expressions of the form:

    * ``param := param - learning_rate * gradient``

    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params : list of shared variables
        The variables to generate update expressions for
    learning_rate : float or symbolic scalar
        The learning rate controlling the size of update steps

    Returns
    -------
    OrderedDict
        A dictionary mapping each parameter to its update expression
    """
    # The goal here is simply to obtain the correct grads.
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        updates[param] = param - learning_rate * grad

    return updates
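
# A minimal usage sketch for the update rules in this file, with a made-up
# one-parameter linear model (w, x, y are illustrative, not from this
# codebase): the OrderedDict returned by sgd() is compiled into a training
# function via theano.function.
def _sgd_usage_sketch():
    import numpy as np
    import theano
    import theano.tensor as T

    w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')
    x = T.vector('x')
    y = T.scalar('y')
    loss = T.sqr(T.dot(w, x) - y)      # squared error of a linear model

    updates = sgd(loss, [w], learning_rate=0.1)
    return theano.function([x, y], loss, updates=updates)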
def careful_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9,
                    epsilon=1e-6, grad_clipping=1.0e-2):
    """
    RMSProp with gradient clipping.

    :param grad_clipping: maximal norm of the gradient; if the norm of the
        actual gradient exceeds this value, the gradient is rescaled.
    :return: updates
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    grads = total_norm_constraint(grads, max_norm=grad_clipping,
                                  epsilon=epsilon)

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (learning_rate * grad /
                                  T.sqrt(accu_new + epsilon))

    return updates
def adam(loss_or_grads, params, learning_rate=0.001, beta1=0.9,
         beta2=0.999, epsilon=1e-8):
    """
    Exact copy of the Lasagne adam update, except it also returns the
    expression for the update step of each param.
    """
    all_grads = LU.get_or_compute_grads(loss_or_grads, params)
    t_prev = theano.shared(lasagne.utils.floatX(0.))
    updates = OrderedDict()
    steps = list()

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    t = t_prev + 1
    a_t = learning_rate * T.sqrt(one - beta2 ** t) / (one - beta1 ** t)

    for param, g_t in zip(params, all_grads):
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)

        m_t = beta1 * m_prev + (one - beta1) * g_t
        v_t = beta2 * v_prev + (one - beta2) * g_t ** 2
        step = a_t * m_t / (T.sqrt(v_t) + epsilon)

        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[param] = param - step
        steps.append(step)

    updates[t_prev] = t
    return updates, steps
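
# The extra `steps` list returned above (unlike stock Lasagne adam) can be
# compiled alongside the loss, e.g. to log the effective update magnitude
# per parameter. A sketch with a made-up quadratic objective:
def _adam_steps_sketch():
    import numpy as np
    import theano
    import theano.tensor as T

    w = theano.shared(np.ones(4, dtype=theano.config.floatX), name='w')
    loss = T.sum(T.sqr(w))

    updates, steps = adam(loss, [w], learning_rate=1e-3)
    step_norms = [T.sqrt(T.sum(T.sqr(s))) for s in steps]
    return theano.function([], [loss] + step_norms, updates=updates)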
def adam_kingma(loss_or_grads, params, learning_rate=0.001, b1=0.9,
                b2=0.999, e=1e-8, gamma=1 - 1e-8):
    """
    ADAM update rules. Default values are taken from [Kingma2014].

    References:
    [Kingma2014] Kingma, Diederik, and Jimmy Ba.
    "Adam: A Method for Stochastic Optimization."
    arXiv preprint arXiv:1412.6980 (2014).
    http://arxiv.org/pdf/1412.6980v4.pdf
    """
    updates = []
    all_grads = get_or_compute_grads(loss_or_grads, params)
    alpha = learning_rate
    t = theano.shared(np.float32(1))
    # Decay the first moment running average coefficient
    b1_t = b1 * gamma ** (t - 1)

    for theta_previous, g in zip(params, all_grads):
        m_previous = theano.shared(
            np.zeros(theta_previous.get_value().shape,
                     dtype=theano.config.floatX))
        v_previous = theano.shared(
            np.zeros(theta_previous.get_value().shape,
                     dtype=theano.config.floatX))

        # Update biased first and second raw moment estimates
        m = b1_t * m_previous + (1 - b1_t) * g
        v = b2 * v_previous + (1 - b2) * g ** 2
        # Compute bias-corrected moment estimates
        m_hat = m / (1 - b1 ** t)
        v_hat = v / (1 - b2 ** t)
        # Update parameters
        theta = theta_previous - (alpha * m_hat) / (T.sqrt(v_hat) + e)

        updates.append((m_previous, m))
        updates.append((v_previous, v))
        updates.append((theta_previous, theta))

    updates.append((t, t + 1.))
    return updates
def deepmind_rmsprop(loss_or_grads, params, learning_rate=0.00025,
                     rho=0.95, epsilon=0.01):
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)

        acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)
        acc_grad_new = rho * acc_grad + (1 - rho) * grad

        acc_rms = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=param.broadcastable)
        acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2

        updates[acc_grad] = acc_grad_new
        updates[acc_rms] = acc_rms_new

        updates[param] = (param - learning_rate *
                          (grad / T.sqrt(acc_rms_new - acc_grad_new ** 2 +
                                         epsilon)))

    return updates
def u(loss_or_grads, params, *args, **kwargs):
    # `grad_clipping`, `epsilon` and `updates` are free variables captured
    # from the enclosing scope.
    grads = get_or_compute_grads(loss_or_grads, params)
    grads = total_norm_constraint(grads, max_norm=grad_clipping,
                                  epsilon=epsilon)
    return updates(grads, params, *args, **kwargs)
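
# The closure above references `grad_clipping`, `epsilon` and `updates`
# from an enclosing scope that is not shown here. A plausible factory (an
# assumption, not the actual enclosing code) that wraps any Lasagne-style
# update rule with total-norm gradient clipping; the same pattern
# presumably produces the noisy- and normalized-gradient closures below:
def with_total_norm_clipping(updates, grad_clipping=1.0e-2, epsilon=1e-6):
    def u(loss_or_grads, params, *args, **kwargs):
        grads = get_or_compute_grads(loss_or_grads, params)
        grads = total_norm_constraint(grads, max_norm=grad_clipping,
                                      epsilon=epsilon)
        return updates(grads, params, *args, **kwargs)
    return u

# e.g. clipped_adam = with_total_norm_clipping(lasagne.updates.adam)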
def deepmind_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9,
                     epsilon=1e-6):
    """DeepMind-style (centered) RMSProp updates [1]_.

    Scale learning rates by dividing with the moving average of the root
    mean squared (RMS) gradients, centered by the moving average of the
    gradients themselves.

    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params : list of shared variables
        The variables to generate update expressions for
    learning_rate : float or symbolic scalar
        The learning rate controlling the size of update steps
    rho : float or symbolic scalar
        Gradient moving average decay factor
    epsilon : float or symbolic scalar
        Small value added for numerical stability

    Returns
    -------
    OrderedDict
        A dictionary mapping each parameter to its update expression

    Notes
    -----
    `rho` should be between 0 and 1. A value of `rho` close to 1 will decay
    the moving average slowly and a value close to 0 will decay the moving
    average fast.

    Using the step size :math:`\\eta` and a decay factor :math:`\\rho` the
    learning rate :math:`\\eta_t` is calculated as:

    .. math::
       r_t &= \\rho r_{t-1} + (1 - \\rho) g^2\\\\
       m_t &= \\rho m_{t-1} + (1 - \\rho) g\\\\
       \\eta_t &= \\frac{\\eta}{\\sqrt{r_t - m_t^2 + \\epsilon}}

    References
    ----------
    .. [1] Tieleman, T. and Hinton, G. (2012):
           Neural Networks for Machine Learning, Lecture 6.5 - rmsprop.
           Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU
           (formula @5:20)
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)

        acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)
        acc_grad_new = rho * acc_grad + (1 - rho) * grad

        acc_rms = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=param.broadcastable)
        acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2

        updates[acc_grad] = acc_grad_new
        updates[acc_rms] = acc_rms_new

        updates[param] = (param - learning_rate *
                          (grad / T.sqrt(acc_rms_new - acc_grad_new ** 2 +
                                         epsilon)))

    return updates
def sign_rule(loss_or_grads, params, learning_rate):
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        updates[param] = param - learning_rate * T.sgn(grad)

    return updates
def adam_svrg(loss_or_grads, params, learning_rate=0.001, beta1=0.9,
              beta2=0.999, epsilon=1e-8):
    all_grads = get_or_compute_grads(loss_or_grads, params)
    t_prev = []
    updates = []
    updates_of = []
    grads_adam = []

    for m_r in range(2):
        t_prev.append(theano.shared(utils.floatX(0.)))
        updates.append(OrderedDict())
        updates_of.append(OrderedDict())

        # Using theano constant to prevent upcasting of float32
        one = TT.constant(1)
        t = t_prev[-1] + 1
        if m_r == 0:
            a_t = (learning_rate * TT.sqrt(one - beta2 ** t) /
                   (one - beta1 ** t))
        else:
            beta2 = 0.999
            a_t = (learning_rate / 2 * TT.sqrt(one - beta2 ** t) /
                   (one - beta1 ** t))

        l = []
        h = []
        for param, g_t in zip(params, all_grads):
            value = param.get_value(borrow=True)
            m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)
            v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)

            m_t = beta1 * m_prev + (one - beta1) * g_t
            v_t = beta2 * v_prev + (one - beta2) * g_t ** 2
            step = a_t * m_t / (TT.sqrt(v_t) + epsilon)

            h.append(TT.sum(TT.square(step)))
            l.append(TT.sum(TT.square(m_t)))

            updates[-1][m_prev] = m_t
            updates[-1][v_prev] = v_t
            updates_of[-1][param] = param - step

        updates[-1][t_prev[-1]] = t
        # Ratio of total step norm to total first-moment norm, summed over
        # all params (instead of hard-coding h[0] + ... + h[8], which
        # assumed exactly nine parameters).
        grads_adam.append(TT.sqrt(sum(h) / sum(l)))

    return updates_of, grads_adam
def sgdWithLrsClip(loss_or_grads, params, learning_rate=.01, mu_lr=.01,
                   si_lr=.001, focused_w_lr=.01, momentum=.9, verbose=False):
    '''
    Same as sgdWithLrs, but applies clips after the updates.
    '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum

    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    f32 = np.float32

    if verbose:
        print("Params List", params)

    for param, grad in zip(params, grads):
        if verbose:
            print("param name", param.name, "shape:", param.eval().shape)

        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum)
            updates[param] = clip_tensor(updates[param], f32(0.01), f32(0.99))
        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum)
            updates[param] = clip_tensor(updates[param], f32(0.01), f32(0.5))
        elif param.name.find('focus') >= 0 and param.name.find('W') >= 0:
            updates[param] = param - (focused_w_lr * grad)
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum)
        else:
            updates[param] = param - learning_rate * grad
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum)

    return updates
def santa_euler(loss_or_grads, params, learning_rate=1, lambda_=1e-5,
                sigma=0.99, A=1, burnin=0, rng=RandomStreams()):
    n = learning_rate
    tprev = theano.shared(utils.floatX(0.))
    t = tprev + 1

    all_grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, g_t in zip(params, all_grads):
        s = rng.normal(size=param.shape)
        f = g_t
        b = A * t ** lambda_

        value = param.get_value(borrow=True)
        vprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)
        aprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)
        gprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)

        if hasattr(n, "get_value"):
            n_ = n.get_value(borrow=True)
        else:
            n_ = n
        ufirst = np.random.normal(size=value.shape) * np.sqrt(n_)
        ufirst = ufirst.astype(value.dtype)
        uprev = theano.shared(ufirst, broadcastable=param.broadcastable)

        v = sigma * vprev + (1 - sigma) * f * f
        g = 1 / T.sqrt(lambda_ + T.sqrt(v))

        a = ifelse(t < burnin, aprev + (uprev * uprev - n / b), aprev)
        u = ifelse(t < burnin,
                   (n / b) * (1 - g / gprev) / uprev +
                   T.sqrt(2 * n / b * gprev) * s,
                   theano.shared(np.zeros(value.shape, dtype=value.dtype)))
        u = u + (1 - a) * uprev - n * g * f

        updates[param] = param + g * u
        updates[uprev] = u
        updates[aprev] = a
        updates[vprev] = v
        updates[gprev] = g

    updates[tprev] = t
    return updates
def u(loss_or_grads, params, *args, **kwargs):
    # `srng_or_seed`, `std` and `updates` are free variables captured from
    # the enclosing scope.
    grads = get_or_compute_grads(loss_or_grads, params)

    if type(srng_or_seed) is int:
        srng = MRG_RandomStreams(srng_or_seed)
    elif srng_or_seed is None:
        srng = MRG_RandomStreams()
    else:
        srng = srng_or_seed

    noisy_grads = [
        g + srng.normal(size=g.shape, ndim=g.ndim, std=std)
        for g in grads
    ]

    return updates(noisy_grads, params, *args, **kwargs)
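
# Hypothetical usage of the noisy-gradient closure above, assuming its
# enclosing factory follows the same pattern as the clipping factory
# sketched earlier (the name `with_gradient_noise` is illustrative, not
# from this codebase):
#
#   noisy_adam = with_gradient_noise(lasagne.updates.adam, std=0.01,
#                                    srng_or_seed=42)
#   updates = noisy_adam(loss, params, learning_rate=1e-3)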
def amsgrad(loss_or_grads, params, learning_rate=0.001, beta1=0.9,
            beta2=0.999, epsilon=1e-8, bias_correction=True):
    all_grads = get_or_compute_grads(loss_or_grads, params)
    t_prev = theano.shared(utils.floatX(0.))
    updates = OrderedDict()

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    t = t_prev + 1
    if bias_correction:
        a_t = learning_rate * T.sqrt(one - beta2 ** t) / (one - beta1 ** t)
    else:
        a_t = learning_rate

    for param, g_t in zip(params, all_grads):
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_hat_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)

        m_t = beta1 * m_prev + (one - beta1) * g_t
        v_t = beta2 * v_prev + (one - beta2) * g_t ** 2
        v_hat_t = T.maximum(v_hat_prev, v_t)
        step = a_t * m_t / (T.sqrt(v_hat_t) + epsilon)

        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[v_hat_prev] = v_hat_t
        updates[param] = param - step

    updates[t_prev] = t
    return updates
def hard_rmsprop(loss_or_grads, params, learning_rate=1.0e-2, epsilon=1e-6):
    """
    Not an actual RMSProp: it just normalizes the gradient so that its norm
    equals the `learning_rate` parameter. Don't use unless you have to.

    :param loss_or_grads: loss to minimize
    :param params: params to optimize
    :param learning_rate: norm of the gradient
    :param epsilon: small number for computational stability.
    :return:
    """
    grads = get_or_compute_grads(loss_or_grads, params)

    gnorm = T.sqrt(sum(T.sum(g ** 2) for g in grads) + epsilon)
    grads = [g / gnorm for g in grads]

    updates = OrderedDict()
    for param, grad in zip(params, grads):
        updates[param] = param - learning_rate * grad

    return updates
def graves_rmsprop(loss_or_grads, params, learning_rate=1e-4, chi=0.95,
                   alpha=0.9, epsilon=1e-4):
    r"""
    Alex Graves' RMSProp [1]_.

    .. math ::
        n_i &= \chi n_{i-1} + (1 - \chi) \cdot grad^2 \\
        g_i &= \chi g_{i-1} + (1 - \chi) \cdot grad \\
        \Delta_i &= \alpha \Delta_{i-1} - learning\_rate \cdot grad
                    / \sqrt{n_i - g_i^2 + \epsilon} \\
        w_i &= w_{i-1} + \Delta_i

    References
    ----------
    .. [1] Graves, Alex.
           "Generating Sequences With Recurrent Neural Networks",
           p. 23, arXiv:1308.0850
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        n = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                          broadcastable=param.broadcastable)
        g = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                          broadcastable=param.broadcastable)
        delta = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)

        n_ip1 = chi * n + (1. - chi) * grad ** 2
        g_ip1 = chi * g + (1. - chi) * grad
        # The denominator is the centered second moment
        # sqrt(n_i - g_i^2 + eps), matching the formula above.
        delta_ip1 = (alpha * delta - learning_rate * grad /
                     T.sqrt(n_ip1 - g_ip1 ** 2 + epsilon))

        updates[n] = n_ip1
        updates[g] = g_ip1
        updates[delta] = delta_ip1
        updates[param] = param + delta_ip1

    return updates
def rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    """
    Exact copy of the Lasagne rmsprop update, except it also returns the
    expression for the update step of each param.
    """
    grads = LU.get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    steps = list()

    one = T.constant(1)
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new

        step = learning_rate * grad / T.sqrt(accu_new + epsilon)
        updates[param] = param - step
        steps.append(step)

    return updates, steps
def test_get_or_compute_grads():
    from lasagne.updates import get_or_compute_grads

    A = theano.shared(1)
    B = theano.shared(1)
    loss = A + B
    grads = get_or_compute_grads(loss, [A, B])
    assert get_or_compute_grads(grads, [A, B]) is grads
    with pytest.raises(ValueError):
        get_or_compute_grads(grads, [A])

    C = T.scalar()
    with pytest.raises(ValueError):
        get_or_compute_grads(A + C, [A, C])
def cruel_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9,
                  epsilon=1e-6, grad_clipping=1.0e-2, param_clipping=1.0e-2):
    """
    A version of careful RMSProp for Wasserstein GAN.

    :param epsilon: small number for computational stability.
    :param grad_clipping: maximal norm of the gradient; if the norm of the
        actual gradient exceeds this value, the gradient is rescaled.
    :param param_clipping: after each update all params are clipped to
        [-`param_clipping`, `param_clipping`].
    :return:
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    grads = total_norm_constraint(grads, max_norm=grad_clipping,
                                  epsilon=epsilon)

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new

        updated = param - (learning_rate * grad / T.sqrt(accu_new + epsilon))

        if param_clipping is not None:
            updates[param] = T.clip(updated, -param_clipping, param_clipping)
        else:
            updates[param] = updated

    return updates
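
# A hypothetical WGAN critic step built on cruel_rmsprop; `critic_loss`,
# `critic_params` and the input variables are stand-ins for a real model.
# learning_rate=5e-5 and param_clipping=0.01 match the original WGAN
# recipe (RMSProp with weight clipping to [-0.01, 0.01]):
#
#   critic_updates = cruel_rmsprop(critic_loss, critic_params,
#                                  learning_rate=5e-5,
#                                  grad_clipping=1.0e-2,
#                                  param_clipping=1.0e-2)
#   train_critic = theano.function([real_batch, noise], critic_loss,
#                                  updates=critic_updates)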
def sgdWithLrs(loss_or_grads, params, learning_rate=.01, mu_lr=.01,
               si_lr=.001, focused_w_lr=.01, momentum=.9):
    '''
    SGD with different learning rates for the focus params mu, si and w.
    '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum

    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    momentum_params_list = []
    print(params)

    for param, grad in zip(params, grads):
        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            momentum_params_list.append(param)
        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
        elif param.name.find('focus') >= 0:
            updates[param] = param - (focused_w_lr * grad)
            momentum_params_list.append(param)
        else:
            updates[param] = param - learning_rate * grad
            momentum_params_list.append(param)

    return apply_momentum(updates, params=momentum_params_list,
                          momentum=momentum)
def sgdWithLearningRateDecay(loss_or_grads, params, learningRate,
                             learningRateDecay):
    from lasagne.updates import get_or_compute_grads
    import theano.tensor as T
    import theano
    from collections import OrderedDict
    from lasagne import utils

    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    t_prev = theano.shared(utils.floatX(0.))
    one = T.constant(1)
    t = t_prev + 1
    clr = learningRate / (1 + t * learningRateDecay)
    # http://leon.bottou.org/publications/pdf/tricks-2012.pdf
    # suggests (section 5.2) learning rates of the form
    #   gamma_t = gamma_0 / (1 + gamma_0 * lambda * t)
    # and determining the best gamma_0 using a small training data sample
    # (lambda / 2 is the coefficient of the weights norm of L2
    # regularization).

    for param, grad in zip(params, grads):
        updates[param] = param - clr * grad

    updates[t_prev] = t
    return updates
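
# Illustration of the schedule above: with learningRate = 0.1 and
# learningRateDecay = 0.01, the effective rate clr = 0.1 / (1 + 0.01 * t)
# decays roughly as 0.099 (t=1), 0.05 (t=100), 0.0091 (t=1000).
def _decay_schedule_sketch(lr0=0.1, decay=0.01, ts=(1, 100, 1000)):
    return [lr0 / (1 + decay * t) for t in ts]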
def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta,
             freeze_interval, batchSize, network_type, update_rule,
             batch_accumulator, randomState, frame_scale=255.0):
    """ Initialize environment

    Arguments:
        environment - the environment (class Env)
        num_elements_in_batch - list of k integers for the number of each
            element kept as belief state
        num_actions - int
        discount - float
        learning_rate - float
        rho, rms_epsilon, momentum - float, float, float
        ...
        network_type - string
        ...
    """
    self._environment = environment

    self._batchSize = batchSize
    self._inputDimensions = self._environment.inputDimensions()
    self._nActions = self._environment.nActions()
    self._df = 0
    self.rho = rho
    self._lr = 0
    self.rms_epsilon = rms_epsilon
    self.momentum = momentum
    self.clip_delta = clip_delta
    self.freeze_interval = freeze_interval
    self._randomState = randomState

    lasagne.random.set_rng(self._randomState)

    self.update_counter = 0

    # Lists of symbolic variables for each of the k elements in the belief
    # state (T.tensor4 if the observation of an element is a matrix,
    # T.tensor3 if a vector, T.matrix if a scalar), plus the same at t+1,
    # and the corresponding shared variables.
    states = []
    next_states = []
    self.states_shared = []
    self.next_states_shared = []

    for i, dim in enumerate(self._inputDimensions):
        if len(dim) == 3:
            states.append(T.tensor4("%s_%s" % ("state", i)))
            next_states.append(T.tensor4("%s_%s" % ("next_state", i)))
        elif len(dim) == 2:
            states.append(T.tensor3("%s_%s" % ("state", i)))
            next_states.append(T.tensor3("%s_%s" % ("next_state", i)))
        elif len(dim) == 1:
            states.append(T.matrix("%s_%s" % ("state", i)))
            next_states.append(T.matrix("%s_%s" % ("next_state", i)))

        self.states_shared.append(
            theano.shared(np.zeros((batchSize,) + dim,
                                   dtype=theano.config.floatX),
                          borrow=False))
        self.next_states_shared.append(
            theano.shared(np.zeros((batchSize,) + dim,
                                   dtype=theano.config.floatX),
                          borrow=False))

    print("Number of observations per state: {}".format(
        len(self.states_shared)))
    print("For each observation, historySize + ponctualObs_i.shape: {}"
          .format(self._inputDimensions))

    rewards = T.col('rewards')
    actions = T.icol('actions')
    terminals = T.icol('terminals')
    thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
    thelr = T.scalar(name='thelr', dtype=theano.config.floatX)

    self.l_out, self.l_outs_conv, shape_after_conv = self._build(
        network_type, states)
    print("Number of neurons after spatial and temporal convolution "
          "layers: {}".format(shape_after_conv))

    self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(
        network_type, next_states)
    self._resetQHat()

    self.rewards_shared = theano.shared(
        np.zeros((batchSize, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    self.actions_shared = theano.shared(
        np.zeros((batchSize, 1), dtype='int32'),
        broadcastable=(False, True))
    self.terminals_shared = theano.shared(
        np.zeros((batchSize, 1), dtype='int32'),
        broadcastable=(False, True))

    q_vals = lasagne.layers.get_output(self.l_out)
    next_q_vals = lasagne.layers.get_output(self.next_l_out)

    max_next_q_vals = T.max(next_q_vals, axis=1, keepdims=True)
    T_ones_like = T.ones_like(T.ones_like(terminals) - terminals)
    target = rewards + T_ones_like * thediscount * max_next_q_vals

    q_val = q_vals[T.arange(batchSize),
                   actions.reshape((-1,))].reshape((-1, 1))

    diff = target - q_val

    if self.clip_delta > 0:
        # If we simply take the squared clipped diff as our loss,
        # then the gradient will be zero whenever the diff exceeds
        # the clip bounds. To avoid this, we extend the loss
        # linearly past the clip point to keep the gradient constant
        # in that regime.
        #
        # This is equivalent to declaring d loss/d q_vals to be
        # equal to the clipped diff, then backpropagating from
        # there, which is what the DeepMind implementation does.
        quadratic_part = T.minimum(abs(diff), self.clip_delta)
        linear_part = abs(diff) - quadratic_part
        loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
    else:
        loss = 0.5 * diff ** 2

    if batch_accumulator == 'sum':
        loss = T.sum(loss)
    elif batch_accumulator == 'mean':
        loss = T.mean(loss)
    else:
        raise ValueError("Bad accumulator: {}".format(batch_accumulator))

    params = lasagne.layers.helper.get_all_params(self.l_out)
    for conv_param in self.l_outs_conv:
        for p in lasagne.layers.helper.get_all_params(conv_param):
            params.append(p)

    givens = {
        rewards: self.rewards_shared,
        actions: self.actions_shared,  ## actions not needed!
        terminals: self.terminals_shared
    }
    for i, x in enumerate(self.states_shared):
        givens[states[i]] = x
    for i, x in enumerate(self.next_states_shared):
        givens[next_states[i]] = x

    if update_rule == 'deepmind_rmsprop':
        # get_or_compute_grads accepts a list of gradient expressions, so
        # the precomputed grads are passed in place of the loss.
        grads = get_or_compute_grads(loss, params)
        updates = deepmind_rmsprop(grads, params, thelr, self.rho,
                                   self.rms_epsilon)
    elif update_rule == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss, params, thelr, self.rho,
                                          self.rms_epsilon)
    elif update_rule == 'sgd':
        updates = lasagne.updates.sgd(loss, params, thelr)
    else:
        raise ValueError("Unrecognized update: {}".format(update_rule))

    if self.momentum > 0:
        updates = lasagne.updates.apply_momentum(updates, None,
                                                 self.momentum)

    self._train = theano.function([thediscount, thelr], [loss, q_vals],
                                  updates=updates, givens=givens,
                                  on_unused_input='warn')

    givens2 = {}
    for i, x in enumerate(self.states_shared):
        givens2[states[i]] = x
    self._q_vals = theano.function([], q_vals, givens=givens2,
                                   on_unused_input='warn')
def u(loss_or_grads, params, *args, **kwargs):
    # `epsilon` and `updates` are free variables captured from the
    # enclosing scope; the gradient is rescaled to unit norm before being
    # handed to the wrapped update rule.
    grads = get_or_compute_grads(loss_or_grads, params)
    gnorm = T.sqrt(sum(T.sum(g ** 2) for g in grads) + epsilon)
    grads = [g / gnorm for g in grads]
    return updates(grads, params, *args, **kwargs)
def sgdWithWeightSupress(loss_or_grads, params, learning_rate=.01, mu_lr=.01,
                         si_lr=.001, focused_w_lr=.01, momentum=.9,
                         verbose=False):
    '''
    This update function masks focus weights after they are updated.

    The idea is that weights outside of the focus function must be
    suppressed to prevent weight memory when the focus changes its
    position. To do this, I get the mu and si values of the focus layer,
    calculate a Gauss window, scale it so the center is 1 but the outside
    is close to 0, and then multiply it with the weights.
    '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum

    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    if verbose:
        print(params)

    for param, grad in zip(params, grads):
        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum)
            updates[param] = clip_tensor(updates[param], 0.01, 0.99)
        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum)
            updates[param] = clip_tensor(updates[param], 0.01, 0.5)
        elif param.name.find('focus') >= 0 and param.name.find('W') >= 0:
            param_layer_name = param.name.split(".")[0]
            mu_name = param_layer_name + '.mu'
            si_name = param_layer_name + ".si"
            mu_si_w = get_params_values_wkey(params,
                                             [mu_name, si_name, param.name])

            from focusing import U_numeric
            us = U_numeric(np.linspace(0, 1, mu_si_w[param.name].shape[0]),
                           mu_si_w[mu_name], mu_si_w[si_name], 1,
                           normed=False)

            updates[param] = param - (focused_w_lr * grad)
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum)
            # Mask the weights so they cannot stay outside the envelope.
            us[us > 0.1] = 1.0
            updates[param] = updates[param] * us.T
        else:
            updates[param] = param - learning_rate * grad
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum)

    return updates
def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta,
             freeze_interval, batchSize, network_type, update_rule,
             batch_accumulator, randomState):
    """ Initialize environment """
    QNetwork.__init__(self, environment, batchSize)

    self.rho = rho
    self.rms_epsilon = rms_epsilon
    self.momentum = momentum
    self.clip_delta = clip_delta
    self.freeze_interval = freeze_interval
    self._randomState = randomState

    lasagne.random.set_rng(self._randomState)

    self.update_counter = 0

    # Lists of symbolic variables for each of the k elements in the belief
    # state (T.tensor4 if the observation of an element is a matrix,
    # T.tensor3 if a vector, T.matrix if a scalar), plus the same at t+1,
    # and the corresponding shared variables.
    states = []
    next_states = []
    self.states_shared = []
    self.next_states_shared = []

    for i, dim in enumerate(self._inputDimensions):
        if len(dim) == 3:
            states.append(T.tensor4("%s_%s" % ("state", i)))
            next_states.append(T.tensor4("%s_%s" % ("next_state", i)))
        elif len(dim) == 2:
            states.append(T.tensor3("%s_%s" % ("state", i)))
            next_states.append(T.tensor3("%s_%s" % ("next_state", i)))
        elif len(dim) == 1:
            states.append(T.matrix("%s_%s" % ("state", i)))
            next_states.append(T.matrix("%s_%s" % ("next_state", i)))

        self.states_shared.append(
            theano.shared(np.zeros((batchSize,) + dim,
                                   dtype=theano.config.floatX),
                          borrow=False))
        self.next_states_shared.append(
            theano.shared(np.zeros((batchSize,) + dim,
                                   dtype=theano.config.floatX),
                          borrow=False))

    print("Number of observations per state: {}".format(
        len(self.states_shared)))
    print("For each observation, historySize + ponctualObs_i.shape: {}"
          .format(self._inputDimensions))

    rewards = T.col('rewards')
    actions = T.icol('actions')
    terminals = T.icol('terminals')
    thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
    thelr = T.scalar(name='thelr', dtype=theano.config.floatX)

    self.l_out, self.l_outs_conv, shape_after_conv = self._build(
        network_type, states)
    print("Number of neurons after spatial and temporal convolution "
          "layers: {}".format(shape_after_conv))

    self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(
        network_type, next_states)
    self._resetQHat()

    self.rewards_shared = theano.shared(
        np.zeros((batchSize, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    self.actions_shared = theano.shared(
        np.zeros((batchSize, 1), dtype='int32'),
        broadcastable=(False, True))
    self.terminals_shared = theano.shared(
        np.zeros((batchSize, 1), dtype='int32'),
        broadcastable=(False, True))

    q_vals = lasagne.layers.get_output(self.l_out)
    next_q_vals = lasagne.layers.get_output(self.next_l_out)

    max_next_q_vals = T.max(next_q_vals, axis=1, keepdims=True)
    T_ones_like = T.ones_like(T.ones_like(terminals) - terminals)
    target = rewards + T_ones_like * thediscount * max_next_q_vals

    q_val = q_vals[T.arange(batchSize),
                   actions.reshape((-1,))].reshape((-1, 1))

    diff = target - q_val

    if self.clip_delta > 0:
        # If we simply take the squared clipped diff as our loss,
        # then the gradient will be zero whenever the diff exceeds
        # the clip bounds. To avoid this, we extend the loss
        # linearly past the clip point to keep the gradient constant
        # in that regime.
        #
        # This is equivalent to declaring d loss/d q_vals to be
        # equal to the clipped diff, then backpropagating from
        # there, which is what the DeepMind implementation does.
        quadratic_part = T.minimum(abs(diff), self.clip_delta)
        linear_part = abs(diff) - quadratic_part
        loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
    else:
        loss = 0.5 * diff ** 2

    if batch_accumulator == 'sum':
        loss = T.sum(loss)
    elif batch_accumulator == 'mean':
        loss = T.mean(loss)
    else:
        raise ValueError("Bad accumulator: {}".format(batch_accumulator))

    params = lasagne.layers.helper.get_all_params(self.l_out)
    for conv_param in self.l_outs_conv:
        for p in lasagne.layers.helper.get_all_params(conv_param):
            params.append(p)

    givens = {
        rewards: self.rewards_shared,
        actions: self.actions_shared,  ## actions not needed!
        terminals: self.terminals_shared
    }
    for i, x in enumerate(self.states_shared):
        givens[states[i]] = x
    for i, x in enumerate(self.next_states_shared):
        givens[next_states[i]] = x

    if update_rule == 'deepmind_rmsprop':
        # get_or_compute_grads accepts a list of gradient expressions, so
        # the precomputed grads are passed in place of the loss.
        grads = get_or_compute_grads(loss, params)
        updates = deepmind_rmsprop(grads, params, thelr, self.rho,
                                   self.rms_epsilon)
    elif update_rule == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss, params, thelr, self.rho,
                                          self.rms_epsilon)
    elif update_rule == 'sgd':
        updates = lasagne.updates.sgd(loss, params, thelr)
    else:
        raise ValueError("Unrecognized update: {}".format(update_rule))

    if self.momentum > 0:
        updates = lasagne.updates.apply_momentum(updates, None,
                                                 self.momentum)

    self._train = theano.function([thediscount, thelr], [loss, q_vals],
                                  updates=updates, givens=givens,
                                  on_unused_input='warn')

    givens2 = {}
    for i, x in enumerate(self.states_shared):
        givens2[states[i]] = x
    self._q_vals = theano.function([], q_vals, givens=givens2,
                                   on_unused_input='warn')
def santa_sss(loss_or_grads, params, learning_rate=1, lambda_=1e-5,
              sigma=0.99, A=1, burnin=0, rng=RandomStreams()):
    n = learning_rate
    tprev = theano.shared(utils.floatX(0.))
    t = tprev + 1

    all_grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, g_t in zip(params, all_grads):
        s = rng.normal(size=param.shape)
        f = g_t
        b = A * t ** lambda_

        value = param.get_value(borrow=True)
        vprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)
        aprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)
        gprev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                              broadcastable=param.broadcastable)

        if hasattr(n, "get_value"):
            n_ = n.get_value(borrow=True)
        else:
            n_ = n
        ufirst = np.random.normal(size=value.shape) * np.sqrt(n_)
        ufirst = ufirst.astype(theano.config.floatX)
        uprev = theano.shared(ufirst, broadcastable=param.broadcastable)

        v = sigma * vprev + (1 - sigma) * f * f
        g = 1 / T.sqrt(lambda_ + T.sqrt(v))

        # Exploration branch (t < burnin).
        a = aprev + (uprev * uprev - n / b) / 2.
        u = T.exp(-a / 2) * uprev
        u = (u - g * f * n + T.sqrt(2 * gprev * n / b) * s +
             n / b * (1 - g / gprev) / uprev)
        u = T.exp(-a / 2) * u
        a = a + (u * u - n / b) / 2.
        a_explr = a
        u_explr = u

        # Refinement branch (t >= burnin).
        a = aprev
        u = T.exp(-a / 2.) * uprev
        u = u - g * f * n
        u = T.exp(-a / 2.) * u
        u_refine = u
        a_refine = a

        a = ifelse(t < burnin, a_explr, a_refine)
        u = ifelse(t < burnin, u_explr, u_refine)

        updates[param] = param + g * uprev / 2. + g * u / 2.
        updates[uprev] = u
        updates[aprev] = a
        updates[vprev] = v
        updates[gprev] = g

    updates[tprev] = t
    return updates
def adamax(loss_or_grads, params, learning_rate=0.002, beta1=0.9,
           beta2=0.95, epsilon=1e-8, scale_factor=None):
    """
    This version also returns an update to scale the momentum params by
    the factor `scale_factor`. Intended to be used for inner optimization
    in max-min problems.

    Adamax updates

    Adamax updates implemented as in [1]_. This is a variant of the Adam
    algorithm based on the infinity norm.

    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params : list of shared variables
        The variables to generate update expressions for
    learning_rate : float or symbolic scalar
        Learning rate
    beta1 : float or symbolic scalar
        Exponential decay rate for the first moment estimates.
    beta2 : float or symbolic scalar
        Exponential decay rate for the weighted infinity norm estimates.
    epsilon : float or symbolic scalar
        Constant for numerical stability.
    scale_factor : float or None
        Constant for scaling momentum parameters: the first momentum is
        decreased by `scale_factor`, the second momentum is increased by
        the same factor. If None, moments are reset to zero.

    Returns
    -------
    OrderedDict
        A dictionary mapping each parameter to its update expression
    OrderedDict
        A dictionary mapping each parameter to its reset expression.

    References
    ----------
    .. [1] Kingma, Diederik, and Jimmy Ba (2014):
           Adam: A Method for Stochastic Optimization.
           arXiv preprint arXiv:1412.6980.
    """
    all_grads = get_or_compute_grads(loss_or_grads, params)
    t_prev = theano.shared(utils.floatX(0.))
    updates = OrderedDict()
    resets = OrderedDict()

    if scale_factor is not None:
        # Guard against utils.floatX(None), which would fail.
        scale_factor = utils.floatX(scale_factor)

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    t = t_prev + 1
    a_t = learning_rate / (one - beta1 ** t)

    for param, g_t in zip(params, all_grads):
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)

        m_t = beta1 * m_prev + (one - beta1) * g_t
        u_t = T.maximum(beta2 * u_prev, abs(g_t))
        step = a_t * m_t / (u_t + epsilon)

        updates[m_prev] = m_t
        updates[u_prev] = u_t
        updates[param] = param - step

        resets[m_prev] = (
            m_prev / scale_factor if scale_factor is not None
            else T.zeros(value.shape, value.dtype)
        )
        # No need to reset the second momentum.

    updates[t_prev] = t
    return updates, resets
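
# A hypothetical inner-loop driver for the max-min use case described in
# the docstring above; `inner_loss`, `inner_params` and `inner_inputs` are
# stand-ins for a real inner objective. One compiled function applies
# `updates`, the other applies `resets` before each new outer iteration:
#
#   updates, resets = adamax(inner_loss, inner_params, scale_factor=2.0)
#   inner_step = theano.function(inner_inputs, inner_loss, updates=updates)
#   rescale_moments = theano.function([], [], updates=resets)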
def sgdWithLrLayers(loss_or_grads, params, learning_rate=.01, mu_lr=.01,
                    si_lr=.001, focused_w_lr=.01, momentum=.9):
    '''
    Updates each layer's parameters with a different learning rate.
    Under development.
    '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum

    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        grad = clip_tensor(grad, -0.01, 0.01)
        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum / 2)
            updates[param] = clip_tensor(updates[param], 0.05, 0.95)
        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum)
            updates[param] = clip_tensor(updates[param], 0.01, 0.5)
        elif param.name.find('focus') >= 0 and (param.name.find('W') >= 0 or
                                                param.name.find('bias') >= 0):
            # Scale the learning rate by the layer level encoded in the name.
            level = int(str.split(param.name, '-')[1].split('.')[0])
            updates[param] = param - (learning_rate * (1. / (level + 1))) * grad
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum)
            if param.name.find('W') >= 0:
                updates[param] = clip_tensor(updates[param], -0.4, 0.4)
        elif param.name.find('W') >= 0 or param.name.find('b') >= 0:
            if param.name.find('-') >= 0:
                level = int(str.split(param.name, '-')[1].split('.')[0])
                updates[param] = param - (learning_rate * (1. / level)) * grad
                updates = apply_momentum(updates, params=[param],
                                         momentum=momentum)
            else:
                updates[param] = param - learning_rate * grad
                updates = apply_momentum(updates, params=[param],
                                         momentum=momentum)
            if param.name.find('W') >= 0:
                updates[param] = clip_tensor(updates[param], -0.4, 0.4)
            if param.name.find('b') >= 0:
                updates[param] = clip_tensor(updates[param], -1.0, 1.0)
        else:
            updates[param] = param - learning_rate * grad
            updates = apply_momentum(updates, params=[param],
                                     momentum=momentum)
            if param.name.find('beta') >= 0:
                updates[param] = clip_tensor(updates[param], -1., 1.)

    return updates