def momsgd(params, cost=None, gradients=None, learningrate=0.01, momentum=0.9, nesterov=True):
    """
    Computes the updates for Stochastic Gradient Descent with momentum (Nesterov accelerated gradient if
    `nesterov` is set).

    :type params: list
    :param params: Network parameters.

    :type cost: theano.tensor.var.TensorVariable
    :param cost: Cost variable (scalar). Optional if the gradient is provided.

    :type gradients: list
    :param gradients: Gradient of a cost w.r.t. parameters. Optional if the cost is provided.

    :type learningrate: theano.tensor.var.TensorVariable or float
    :param learningrate: Learning rate of SGD. Can be a float (static) or a dynamic theano variable.

    :type momentum: theano.tensor.var.TensorVariable or float
    :param momentum: Momentum coefficient.

    :type nesterov: bool
    :param nesterov: Whether to use Nesterov accelerated gradient.

    :return: List of updates
    """

    # Validate input
    assert not (cost is None and gradients is None), "Update function momsgd requires either a cost scalar or a " \
                                                     "list of gradients."

    # Compute gradients if requested
    if gradients is None and cost is not None:
        pdC = T.grad(cost, wrt=params)
        # Kill gradients if cost is nan
        dC = [th.ifelse.ifelse(T.isnan(cost), T.zeros_like(dparam), dparam) for dparam in pdC]
    else:
        dC = gradients

    # Init update list
    updates = []

    for param, dparam in zip(params, dC):
        # Check if layer is trainable. Skip if not.
        if not netutils.getbaggage(param, 'trainable', True):
            continue

        # Check if learningrate is to be overridden
        if netutils.getbaggage(param, 'learningrate', False):
            # Override
            lr = param.baggage['learningrate']
        else:
            # Nothing to override
            lr = learningrate

        # Fetch parameter shape
        paramshape = param.get_value().shape
        # ... and init initial momentum
        mom = th.shared(np.zeros(paramshape, dtype=th.config.floatX))
        # Compute velocity (the per-parameter learning rate applies here as well)
        vel = momentum * mom - lr * dparam
        # Compute new parameters
        if nesterov:
            newparam = param + momentum * vel - lr * dparam
        else:
            newparam = param + vel
        # Update the update list
        updates.append((param, newparam))
        updates.append((mom, vel))

    # Return
    return updates

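# Usage sketch (illustrative, not part of this module): assuming a scalar theano cost `cost`,
# symbolic inputs `x` and `y`, and a list of shared parameters `params`, the returned update
# list plugs straight into theano.function:
#
#   updates = momsgd(params, cost=cost, learningrate=0.01, momentum=0.9, nesterov=True)
#   train = th.function(inputs=[x, y], outputs=cost, updates=updates)
#   trainingcost = train(xbatch, ybatch)
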
def rmsprop(params, cost=None, gradients=None, learningrate=0.0005, rho=0.9, epsilon=1e-6):
    """
    Computes the updates for RMSProp.

    :type params: list
    :param params: Network parameters.

    :type cost: theano.tensor.var.TensorVariable
    :param cost: Cost variable (scalar). Optional if the gradient is provided.

    :type gradients: list
    :param gradients: Gradient of a cost w.r.t. parameters. Optional if the cost is provided.

    :type learningrate: theano.tensor.var.TensorVariable or float
    :param learningrate: Learning rate. Can be a float (static) or a dynamic theano variable.

    :type rho: float
    :param rho: Decay rate of the moving average of squared gradients.

    :type epsilon: float
    :param epsilon: Fuzz factor for numerical stability.

    :return: List of updates
    """

    # Validate input
    assert not (cost is None and gradients is None), "Update function rmsprop requires either a cost scalar or a " \
                                                     "list of gradients."

    # Compute gradients if requested
    if gradients is None and cost is not None:
        pdC = T.grad(cost, wrt=params)
        # Kill gradients if cost is nan
        dC = [th.ifelse.ifelse(T.isnan(cost), T.zeros_like(dparam), dparam) for dparam in pdC]
    else:
        dC = gradients

    # Init update list
    updates = []

    for param, dparam in zip(params, dC):
        # Check if layer is trainable. Skip if not.
        if not netutils.getbaggage(param, 'trainable', True):
            continue

        # Fetch parameter shape and init the accumulator of squared gradients
        paramshape = param.get_value().shape
        acc = th.shared(np.zeros(paramshape, dtype=th.config.floatX))
        # Update the moving average of squared gradients
        newacc = rho * acc + (1 - rho) * dparam ** 2
        # Scale the gradient by the RMS of past gradients
        gradscale = T.sqrt(newacc + epsilon)
        dparam = dparam / gradscale

        updates.append((acc, newacc))
        updates.append((param, param - learningrate * dparam))

    return updates

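# Usage sketch (illustrative): rmsprop skips any parameter whose 'trainable' baggage entry is
# False. Assuming `baggage` is a plain dict attached to the shared variable (as the
# `param.baggage['learningrate']` lookups elsewhere in this module suggest), a parameter can be
# frozen before building the updates:
#
#   params[0].baggage['trainable'] = False      # freeze the first parameter
#   updates = rmsprop(params, cost=cost, learningrate=0.0005)
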
def Lp(params, p=2):
    """
    Given a list of parameters, compute the p-th power of its Lp norm.

    :type params: list
    :param params: Parameters to take the Lp norm of.

    :type p: int
    :param p: p of the Lp norm. Defaults to 2.

    :return: (Lp norm)^p
    """

    # Compute Lp^p over all regularizable parameters
    lpn = sum(map(T.sum, map(lambda k: k ** p,
                             [param for param in params if netutils.getbaggage(param, 'regularizable', True)])))
    # Return
    return lpn

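# Usage sketch (illustrative): Lp is meant to be added to the training cost as a weight-decay
# penalty before the updates are built, e.g. with a hypothetical coefficient `l2coeff`:
#
#   regcost = cost + l2coeff * Lp(params, p=2)
#   updates = sgd(params, cost=regcost, learningrate=1e-4)
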
def sgd(params, cost=None, gradients=None, learningrate=1e-4):
    """
    Computes the updates for Stochastic Gradient Descent (without momentum).

    :type params: list
    :param params: Network parameters.

    :type cost: theano.tensor.var.TensorVariable
    :param cost: Cost variable (scalar). Optional if the gradient is provided.

    :type gradients: list
    :param gradients: Gradient of a cost w.r.t. parameters. Optional if the cost is provided.

    :type learningrate: theano.tensor.var.TensorVariable or float
    :param learningrate: Learning rate of SGD. Can be a float (static) or a dynamic theano variable.

    :return: List of updates
    """

    # Validate input
    assert not (cost is None and gradients is None), "Update function sgd requires either a cost scalar or a list " \
                                                     "of gradients."

    # Compute gradients if requested
    if gradients is None and cost is not None:
        pdC = T.grad(cost, wrt=params)
        # Kill gradients if cost is nan
        dC = [th.ifelse.ifelse(T.isnan(cost), T.zeros_like(dparam), dparam) for dparam in pdC]
    else:
        dC = gradients

    # Compute updates
    upd = [(param, param - learningrate * dparam) for param, dparam in zip(params, dC)
           if netutils.getbaggage(param, 'trainable', True)]

    # Return
    return upd

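# Usage sketch (illustrative): as the docstring notes, `learningrate` may be a dynamic theano
# variable, so a shared scalar can be annealed between training steps without recompiling:
#
#   lr = th.shared(np.float32(1e-4))
#   train = th.function(inputs=[x, y], outputs=cost,
#                       updates=sgd(params, cost=cost, learningrate=lr))
#   lr.set_value(np.float32(lr.get_value() * 0.5))   # halve the learning rate later on
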
def adam(params, cost=None, gradients=None, learningrate=0.0002, beta1=0.9, beta2=0.999, epsilon=1e-8, eta=0.,
         gamma=0.55, iterstart=0):
    """
    Computes the updates for ADAM.

    :type params: list
    :param params: Network parameters.

    :type cost: theano.tensor.var.TensorVariable
    :param cost: Cost variable (scalar). Optional if the gradient is provided.

    :type gradients: list
    :param gradients: Gradient of a cost w.r.t. parameters. Optional if the cost is provided.

    :type learningrate: theano.tensor.var.TensorVariable or float
    :param learningrate: Learning rate of SGD. Can be a float (static) or a dynamic theano variable.

    :type beta1: float
    :param beta1: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980

    :type beta2: float
    :param beta2: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980

    :type epsilon: float
    :param epsilon: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980

    :type eta: float
    :param eta: Eta for noisy gradient. See Neelakantan et al. 2015: http://arxiv.org/pdf/1511.06807v1.pdf

    :type gamma: float
    :param gamma: Gamma for noisy gradient. See Neelakantan et al. 2015: http://arxiv.org/pdf/1511.06807v1.pdf

    :type iterstart: int or float
    :param iterstart: Adam anneals the learning rate with iterations. This parameter specifies the initial value of
                      the iteration count, such that the learning rate is scaled appropriately (otherwise the model
                      might jump out of the minimum it's currently in).

    :return: List of updates
    """

    # Validate input
    assert not (cost is None and gradients is None), "Update function adam requires either a cost scalar or a list " \
                                                     "of gradients."

    # Compute gradients if requested
    if gradients is None and cost is not None:
        pdC = T.grad(cost, wrt=params)
        # Kill gradients if cost is nan
        dC = [th.ifelse.ifelse(T.isnan(cost), T.zeros_like(dparam), dparam) for dparam in pdC]
    else:
        dC = gradients

    # Init update list
    updates = []

    # Gradient noising
    if not (eta == 0):
        # RNG
        srng = RandomStreams()
        # Iteration counter
        itercount = th.shared(np.asarray(iterstart, dtype=th.config.floatX))
        # Add annealed Gaussian noise to the gradients
        dC = [dparam + srng.normal(size=dparam.shape, std=T.sqrt(eta/(1 + itercount)**gamma), dtype='floatX')
              for dparam in dC]
        # Update itercount
        updates.append((itercount, itercount + 1))

    # Implementation as in reference paper, nothing spectacular here...
    tm1 = th.shared(np.asarray(iterstart, dtype=th.config.floatX))
    t = tm1 + 1
    # Bias correction factor
    at = T.sqrt(1 - beta2**t)/(1 - beta1**t)

    for param, dparam in zip(params, dC):
        # Check if layer is trainable. Skip if not.
        if not netutils.getbaggage(param, 'trainable', True):
            continue

        # Check if learningrate is to be overridden
        if netutils.getbaggage(param, 'learningrate', False):
            # Override
            lr = param.baggage['learningrate']
        else:
            # Nothing to override
            lr = learningrate

        # First and second moment accumulators
        paramshape = param.get_value().shape
        mtm1 = th.shared(np.zeros(paramshape, dtype=th.config.floatX))
        vtm1 = th.shared(np.zeros(paramshape, dtype=th.config.floatX))

        # Moment estimates and bias-corrected parameter update
        mt = beta1 * mtm1 + (1 - beta1) * dparam
        vt = beta2 * vtm1 + (1 - beta2) * dparam**2
        u = lr * at * mt/(T.sqrt(vt) + epsilon)

        updates.append((mtm1, mt))
        updates.append((vtm1, vt))
        updates.append((param, param - u))

    updates.append((tm1, t))

    return updates

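# Usage sketch (illustrative): a non-zero `eta` activates the annealed gradient noise of
# Neelakantan et al. 2015, and `iterstart` lets a resumed run continue with the appropriately
# decayed noise variance and bias correction (the values below are example settings):
#
#   updates = adam(params, cost=cost, learningrate=0.0002, eta=0.3, gamma=0.55, iterstart=10000)
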