class SGDOptimizer(BasicOptimizer):
    """Stochastic Gradient Descent Optimization Method

    Keeps one ``<path>_gradient`` buffer for every network variable and moves
    each variable along its negated gradient, after the collected updates have
    been passed through ``self._normalize()``.
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates a Stochastic Gradient Descent optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options

        :type network: Network
        :param network: the neural network object
        """

        self._params = Parameters()
        for path, param in network.get_variables().items():
            zeros = numpy.zeros_like(param.get_value())
            self._params.add(path + '_gradient', zeros)

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Pairs every stored gradient with its new gradient expression.

        :rtype: list of tuples
        :returns: ``(stored gradient, new gradient expression)`` pairs, one
                  for each network variable
        """

        return [(self._params[path + '_gradient'], gradient_new)
                for path, gradient_new in zip(self.network.get_variables(),
                                              self._gradient_exprs)]

    def _model_update_exprs(self, alpha):
        """Builds the expressions that update the network variables.

        :param alpha: factor that every update is multiplied by before it is
                      added to the variable

        :rtype: list of tuples
        :returns: ``(variable, new value expression)`` pairs, one for each
                  network variable
        """

        steps = {path: -self._params[path + '_gradient']
                 for path in self.network.get_variables()}
        # _normalize() receives the whole dictionary and is expected to
        # modify it in place; its result is read back below.
        self._normalize(steps)

        return [(param, param + alpha * steps[path])
                for path, param in self.network.get_variables().items()]
class BasicLayer(object, metaclass=ABCMeta):
    """Superclass for Neural Network Layers
    """

    def __init__(self, layer_options, network, profile=False):
        """Saves some attributes that are common to all layers.

        :type layer_options: dict
        :param layer_options: dictionary of layer options; ``name``,
                              ``input_layers`` and ``devices`` are required,
                              ``size``, ``depth`` and ``reverse_time`` are
                              optional

        :type network: Network
        :param network: the network object creating this layer

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.name = layer_options['name']
        self._input_layers = layer_options['input_layers']
        self._params = Parameters()
        self._devices = layer_options['devices']

        # Without an explicit size, the output is as large as all the inputs
        # together.
        if 'size' in layer_options:
            self.output_size = int(layer_options['size'])
        else:
            self.output_size = sum(x.output_size for x in self._input_layers)

        # Convolutional layers may produce two-dimensional output. In that
        # case, the state matrix is four-dimensional and the size of the last
        # dimension is self.output_depth.
        if 'depth' in layer_options:
            self.output_depth = int(layer_options['depth'])
        else:
            self.output_depth = 1

        if 'reverse_time' in layer_options:
            self._reverse_time = bool(layer_options['reverse_time'])
        else:
            self._reverse_time = False

        logging.debug(
            "- %s name=%s inputs=[%s] size=%d depth=%d%s devices=[%s]",
            self.__class__.__name__,
            self.name,
            ', '.join([x.name for x in self._input_layers]),
            self.output_size,
            self.output_depth,
            ' reverse,' if self._reverse_time else '',
            ', '.join([str(x) for x in self._devices]))

        self._network = network
        self._profile = profile

    @abstractmethod
    def create_structure(self):
        """Creates the symbolic graph of this layer.

        Sets self.output to a symbolic matrix that describes the output of this
        layer.
        """

        assert False

    def get_state(self, state):
        """Pulls parameter values from Theano shared variables.

        If there already is a parameter in the state, it will be replaced, so it
        has to have the same number of elements.

        :type state: h5py.File
        :param state: HDF5 file for storing the neural network parameters
        """

        self._params.get_state(state)

    def set_state(self, state):
        """Sets the values of Theano shared variables.

        :type state: h5py.File
        :param state: HDF5 file that contains the neural network parameters
        """

        self._params.set_state(state)

    def num_params(self):
        """Returns the number of parameters in this layer.

        This method is used just for reporting the number of parameters in the
        model. Normally there is just one set of parameters.

        :rtype: int
        :returns: the number of parameters used by the layer
        """

        return self._params.total_size

    def get_variables(self):
        """Returns a dictionary of the shared variables.

        This function is used by the optimizers to create optimization
        parameters that are specific to network parameters, and compute
        gradients with regard to the parameters. Normally there is just one set
        of parameters.

        :rtype: dict
        :returns: mapping from parameter path to Theano shared variables
        """

        return self._params.get_variables()

    def _param_path(self, param_name, device=None):
        """Returns the HDF5 path used to address a parameter.

        :type param_name: str
        :param param_name: name of a parameter within this layer

        :type device: str
        :param device: ``None`` for parameters that reside on the default device
                       only; otherwise returns the path used to address the part
                       of the parameter that resides on the given device

        :rtype: str
        :returns: full path of the parameter in a HDF5 file.
        """

        result = 'layers/' + self.name + '/' + param_name
        if device is not None:
            result += '/' + device
        return result

    def _get_param(self, param_name, device=None):
        """Returns a Theano tensor variable by parameter name.

        :type param_name: str
        :param param_name: name of a parameter within the layer

        :type device: str
        :param device: ``None`` for parameters that reside on the default device
                       only; otherwise returns the part of the parameter that
                       resides on the given device

        :rtype: TensorVariable
        :returns: the corresponding tensor variable
        """

        return self._params[self._param_path(param_name, device)]

    def _init_weight(self, param_name, shape, scale=None, count=1,
                     split_to_devices=False):
        """Generates a weight matrix from "standard normal" distribution.

        If ``shape`` contains two dimensions that match, generates an orthogonal
        matrix. In that case scale is ignored. Orthogonal weights are useful for
        two reasons:

        1. Multiplying by an orthogonal weight preserves the norm of the
           input vector, which should help avoid exploding and vanishing
           gradients.
        2. The row and column vectors are orthonormal to one another, which
           should help avoid two vectors learning to produce the same features.

        If ``count`` is specified, creates a concatenation of several similar
        submatrices (same shape but different content).

        If ``split_to_devices`` is set to ``True``, splits the weight to equal
        parts on the last dimension, and creates one parameter for each device.
        If also ``count`` is specified, each device will have an equal part of
        every submatrix.

        :type param_name: str
        :param param_name: name for the parameter within the layer

        :type shape: list or tuple of ints
        :param shape: sizes of the weight dimensions; normally the first one is
                      the dimensionality of the input data and the second one is
                      the dimensionality of the output data

        :type scale: float
        :param scale: if other than ``None``, the matrix will be scaled by this
                      factor, unless an orthogonal matrix is created

        :type count: int
        :param count: concatenate this many weight matrices with the same shape

        :type split_to_devices: bool
        :param split_to_devices: if set to ``True``, creates on every device a
                                 parameter that contains one part of the weight
        """

        path = self._param_path(param_name)
        # Generate the matrix exactly once. Every branch below uses this same
        # value, so no random draw is thrown away.
        weight = random_matrix(shape, scale, count)

        # A single ``None`` device means that this layer has not been assigned
        # to a specific device, so there is nothing to split between.
        unassigned = (len(self._devices) == 1) and (self._devices[0] is None)
        if split_to_devices and not unassigned:
            self._split_to_devices(path, weight, shape[-1])
        else:
            self._params.add(path, weight)

    def _init_bias(self, param_name, shape, value=None, split_to_devices=False):
        """Initializes a bias vector with given value.

        If ``value`` is not given, initializes the vector with zero value. If
        ``value`` is a list, creates a concatenation of as many vectors as there
        are elements in the list.

        If ``split_to_devices`` is set to ``True``, splits the array to equal
        parts on the last dimension, and creates one parameter for each device.
        If ``value`` is a list, each device will have an equal part of every
        submatrix.

        :type param_name: str
        :param param_name: name for the parameter within the layer

        :type shape: int or tuple of ints
        :param shape: size of the vector, or a tuple of the sizes of each
                      dimension (in case ``value`` is a list, each part will
                      have this size)

        :type value: float, numpy.ndarray or list
        :param value: the value or array to initialize the elements to, or a
                      list of values or arrays to create a concatenation of
                      vectors

        :type split_to_devices: bool
        :param split_to_devices: if set to ``True``, creates on every device a
                                 parameter that contains one part of the array
        """

        path = self._param_path(param_name)
        # Build the array once and reuse it in every branch.
        bias = matrix_from_value(shape, value)

        # A single ``None`` device means that this layer has not been assigned
        # to a specific device, so there is nothing to split between.
        unassigned = (len(self._devices) == 1) and (self._devices[0] is None)
        if split_to_devices and not unassigned:
            # ``shape`` may be a plain integer, which cannot be indexed. In
            # that case the integer itself is the size of one part.
            try:
                part_size = shape[-1]
            except (TypeError, IndexError):
                part_size = shape
            self._split_to_devices(path, bias, part_size)
        else:
            self._params.add(path, bias)

    def _split_to_devices(self, path, value, part_size):
        """Splits a matrix to equal parts on the last dimension, and creates a
        parameter on each device.

        If the matrix consists of submatrices, each device will have an equal
        part of every submatrix, whose size is specified by ``part_size``.

        :type path: str
        :param path: base path for the parameters that will be prefixed by the
                     device string

        :type value: numpy.ndarray
        :param value: a matrix that will be split to give the initial value of
                      the parameters

        :type part_size: int
        :param part_size: size of the last dimension of ``value``, or if
                          ``value`` consists of multiple submatrices, size of
                          one submatrix

        :raises ValueError: if the last dimension of ``value`` is not a
                            multiple of ``part_size``
        """

        part_count = value.shape[-1] // part_size
        if part_count * part_size != value.shape[-1]:
            raise ValueError("Last dimension is not a multiple of part size.")

        split_sizes = self._size_per_device(part_size)
        split_start = 0
        for device, split_size in zip(self._devices, split_sizes):
            assert device is not None
            split_end = split_start + split_size
            # Collect the same [split_start, split_end) slice out of every
            # submatrix, so that the device gets its share of each of them.
            ranges = []
            for part_index in range(part_count):
                part_start = part_index * part_size
                ranges.extend(range(part_start + split_start,
                                    part_start + split_end))
            split_start = split_end
            self._params.add(path + '/' + device, value[..., ranges], device)

    def _size_per_device(self, total_size):
        """Returns ``total_size`` divided for each device.

        When the size is not evenly divisible, the first devices receive one
        element more than the rest.

        :type total_size: int
        :param total_size: total size of a parameter

        :rtype: list of ints
        :returns: ``total_size`` divided into as many parts as there are devices
                  assigned to this layer

        :raises RuntimeError: if no devices are assigned to this layer
        :raises ValueError: if ``total_size`` is smaller than the number of
                            devices
        """

        num_devices = len(self._devices)
        if num_devices < 1:
            raise RuntimeError("No devices assigned to this layer.")
        if total_size < num_devices:
            raise ValueError(
                "Cannot split matrix of size {} to {} devices.".format(
                    total_size, num_devices))

        result = []
        quotient, remainder = divmod(total_size, num_devices)
        start_index = 0
        for i in range(1, num_devices + 1):
            # The first ``remainder`` devices absorb the leftover elements.
            end_index = i * quotient + min(i, remainder)
            result.append(end_index - start_index)
            start_index = end_index

        assert len(result) == num_devices
        assert sum(result) == total_size
        assert end_index == total_size

        return result

    def _tensor_preact(self, input_matrix, param_name):
        """Helper function that creates a pre-activation of ``input_matrix`` by
        multiplying it by a weight matrix and adding a bias.

        ``input_matrix`` and the result normally have the shape of a mini-batch:
        the first dimension is the time step and the second dimension is the
        sequence. The last dimension is always the data vector. The size of the
        input data vector should equal to the first dimension of the weight
        vector, and the second dimension of the weight vector defines the size
        of the output data vector.

        :type input_matrix: TensorVariable
        :param input_matrix: the preactivations will be computed by multiplying
                             the data vectors (the last dimension of this
                             matrix) by the weight matrix, and adding bias

        :type param_name: str
        :param param_name: name of a parameter group that contains a weight
                           matrix and a bias vector

        :rtype: TensorVariable
        :returns: a matrix that has the same number of dimensions as
                  ``input_matrix``, but the data vectors (the last dimension of
                  this matrix) are the preactivations
        """

        # The group stores its weight under ".../W" and its bias under ".../b".
        weight = self._params[self._param_path(param_name) + '/W']
        bias = self._params[self._param_path(param_name) + '/b']
        return tensor.dot(input_matrix, weight) + bias
class RMSPropSGDOptimizer(BasicOptimizer):
    """RMSProp Variation of Stochastic Gradient Descent Optimization Method

    At the time of writing, RMSProp is an unpublished method. Usually people
    cite slide 29 of Lecture 6 of Geoff Hinton's Coursera class:
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    A running average of the squared gradient is maintained for each parameter,
    and the gradient is divided by the root of that average (RMS). This makes
    RMSProp take steps near 1 whenever the gradient is of constant magnitude,
    and larger steps whenever the local scale of the gradient starts to
    increase.
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an RMSProp SGD optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options;
                                     ``gradient_decay_rate`` is required

        :type network: Network
        :param network: the neural network object
        """

        self._params = Parameters()
        for path, param in network.get_variables().items():
            value = param.get_value()
            self._params.add(path + '_gradient', numpy.zeros_like(value))
            # The mean squared gradient starts from ones, otherwise the first
            # update would be divided by a value close to zero.
            self._params.add(path + '_mean_sqr_gradient',
                             numpy.ones_like(value))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            message = "Gradient decay rate is not given in training options."
            raise ValueError(message)
        self._gamma = optimization_options['gradient_decay_rate']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the expressions that refresh the gradient statistics.

        :rtype: list of tuples
        :returns: for each network variable, a pair that stores the new
                  gradient, followed by a pair that stores the new running
                  average of the squared gradient
        """

        pairs = []
        for path, gradient_new in zip(self.network.get_variables(),
                                      self._gradient_exprs):
            stored = self._params[path + '_gradient']
            mean_sqr = self._params[path + '_mean_sqr_gradient']
            decayed = self._gamma * mean_sqr
            fresh = (1.0 - self._gamma) * tensor.sqr(gradient_new)
            pairs += [(stored, gradient_new), (mean_sqr, decayed + fresh)]
        return pairs

    def _model_update_exprs(self, alpha):
        """Builds the expressions that update the network variables.

        :param alpha: factor that every update is multiplied by before it is
                      added to the variable

        :rtype: list of tuples
        :returns: ``(variable, new value expression)`` pairs, one for each
                  network variable
        """

        steps = dict()
        for path in self.network.get_variables():
            mean_sqr = self._params[path + '_mean_sqr_gradient']
            rms = tensor.sqrt(mean_sqr + self._epsilon)
            steps[path] = -self._params[path + '_gradient'] / rms
        # _normalize() receives the whole dictionary and is expected to
        # modify it in place; its result is read back below.
        self._normalize(steps)

        return [(param, param + alpha * steps[path])
                for path, param in self.network.get_variables().items()]
class AdamOptimizer(BasicOptimizer):
    """Adam Optimization Method

    D. P. Kingma, J. Ba (2015)
    Adam: A Method for Stochastic Optimization
    The International Conference on Learning Representations (ICLR), San Diego
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an Adam optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options;
                                     ``gradient_decay_rate``,
                                     ``sqr_gradient_decay_rate`` and
                                     ``momentum`` are required

        :type network: Network
        :param network: the neural network object

        :raises ValueError: if one of the required options is missing
        """

        self._params = Parameters()
        # The timestep is stored with the other optimizer parameters, using
        # the configured Theano float type.
        float_type = numpy.dtype(theano.config.floatX).type
        self._params.add('optimizer/timestep', float_type(0.0))

        for path, param in network.get_variables().items():
            self._params.add(path + '_gradient',
                             numpy.zeros_like(param.get_value()))
            self._params.add(path + '_mean_gradient',
                             numpy.zeros_like(param.get_value()))
            self._params.add(path + '_mean_sqr_gradient',
                             numpy.zeros_like(param.get_value()))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            raise ValueError("Gradient decay rate is not given in training "
                             "options.")
        self._gamma_m = optimization_options['gradient_decay_rate']

        # geometric rate for averaging squared gradients
        if 'sqr_gradient_decay_rate' not in optimization_options:
            raise ValueError("Squared gradient decay rate is not given in "
                             "optimization options.")
        self._gamma_ms = optimization_options['sqr_gradient_decay_rate']

        # momentum (required and stored, but not read by the methods of this
        # class)
        if 'momentum' not in optimization_options:
            raise ValueError("Momentum is not given in optimization options.")
        self._momentum = optimization_options['momentum']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the expressions that refresh the gradient statistics.

        The running averages are computed from the *new* gradient expression.
        Assuming the returned pairs are applied as Theano shared variable
        updates, every expression is evaluated against the values from before
        the update. Building the averages from the stored ``<path>_gradient``
        would therefore make them lag one step behind, and leave them at zero
        on the first update while the timestep used for bias correction still
        advances.

        :rtype: list of tuples
        :returns: for each network variable, pairs that store the new gradient,
                  the new mean gradient, and the new mean squared gradient, in
                  this order
        """

        result = []
        for path, gradient_new in zip(self.network.get_variables(),
                                      self._gradient_exprs):
            gradient = self._params[path + '_gradient']
            m_gradient = self._params[path + '_mean_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            m_gradient_new = \
                self._gamma_m * m_gradient + \
                (1.0 - self._gamma_m) * gradient_new
            ms_gradient_new = \
                self._gamma_ms * ms_gradient + \
                (1.0 - self._gamma_ms) * tensor.sqr(gradient_new)
            result.append((gradient, gradient_new))
            result.append((m_gradient, m_gradient_new))
            result.append((ms_gradient, ms_gradient_new))
        return result

    def _model_update_exprs(self, alpha):
        """Builds the expressions that update the network variables.

        :param alpha: factor that every update is multiplied by, after the
                      bias correction has been applied to it

        :rtype: list of tuples
        :returns: ``(variable, new value expression)`` pairs, one for each
                  network variable, followed by the pair that increments the
                  timestep
        """

        timestep = self._params['optimizer/timestep']
        timestep_new = timestep + 1.0
        # Fold both bias corrections into the step size. A new name is bound
        # instead of using augmented assignment, so that the object passed in
        # by the caller is never modified in place.
        corrected_alpha = alpha * \
            tensor.sqrt(1.0 - (self._gamma_ms ** timestep_new))
        corrected_alpha = corrected_alpha / \
            (1.0 - (self._gamma_m ** timestep_new))

        updates = dict()
        for path in self.network.get_variables():
            m_gradient = self._params[path + '_mean_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            rms_gradient = tensor.sqrt(ms_gradient) + self._epsilon
            updates[path] = -m_gradient / rms_gradient
        # _normalize() receives the whole dictionary and is expected to
        # modify it in place; its result is read back below.
        self._normalize(updates)

        result = []
        for path, param in self.network.get_variables().items():
            update = updates[path]
            result.append((param, param + corrected_alpha * update))
        result.append((timestep, timestep_new))
        return result
class NesterovOptimizer(BasicOptimizer):
    """Nesterov Momentum Optimization Method

    Normally Nesterov momentum is implemented by first taking a step towards
    the previous update direction, calculating gradient at that position,
    using the gradient to obtain the new update direction, and finally
    updating the parameters. We use an alternative formulation that requires
    the gradient to be computed only at the current parameter values,
    described here:
    https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617

    v_{t} = mu * v_{t-1} - lr * gradient(params_{t-1})
    params_{t} = params_{t-1} + mu * v_{t} - lr * gradient(params_{t-1})
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates a Nesterov momentum optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options;
                                     ``momentum`` is required

        :type network: Network
        :param network: the neural network object
        """

        self._params = Parameters()
        for path, param in network.get_variables().items():
            value = param.get_value()
            self._params.add(path + '_gradient', numpy.zeros_like(value))
            self._params.add(path + '_velocity', numpy.zeros_like(value))

        # momentum
        if 'momentum' not in optimization_options:
            message = "Momentum is not given in optimization options."
            raise ValueError(message)
        self._momentum = optimization_options['momentum']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Pairs every stored gradient with its new gradient expression.

        :rtype: list of tuples
        :returns: ``(stored gradient, new gradient expression)`` pairs, one
                  for each network variable
        """

        return [(self._params[path + '_gradient'], gradient_new)
                for path, gradient_new in zip(self.network.get_variables(),
                                              self._gradient_exprs)]

    def _model_update_exprs(self, alpha):
        """Builds the expressions that update velocities and variables.

        :param alpha: factor that every update is multiplied by

        :rtype: list of tuples
        :returns: for each network variable, a pair that stores the new
                  velocity, followed by a pair that stores the new value of
                  the variable
        """

        steps = {path: -self._params[path + '_gradient']
                 for path in self.network.get_variables()}
        # _normalize() receives the whole dictionary and is expected to
        # modify it in place; its result is read back below.
        self._normalize(steps)

        pairs = []
        for path, param in self.network.get_variables().items():
            step = steps[path]
            velocity = self._params[path + '_velocity']
            velocity_new = self._momentum * velocity + alpha * step
            pairs += [
                (velocity, velocity_new),
                (param, param + self._momentum * velocity_new + alpha * step),
            ]
        return pairs
class AdadeltaOptimizer(BasicOptimizer):
    """ADADELTA Optimization Method

    ADADELTA optimization method has been derived from AdaGrad. AdaGrad
    accumulates the sum of squared gradients over all time, which is used to
    scale the learning rate smaller and smaller. ADADELTA uses an exponentially
    decaying average of the squared gradients.

    This implementation scales the parameter updates by the learning rate
    hyperparameter. The original paper does not include such scaling,
    corresponding to learning rate 1.

    M. D. Zeiler (2012)
    ADADELTA: An adaptive learning rate method
    http://arxiv.org/abs/1212.5701
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an Adadelta optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options;
                                     ``gradient_decay_rate`` is required

        :type network: Network
        :param network: the neural network object
        """

        self._params = Parameters()
        suffixes = ('_gradient', '_mean_sqr_gradient', '_mean_sqr_velocity')
        for path, param in network.get_variables().items():
            # Every statistic starts from zeros shaped like the variable.
            for suffix in suffixes:
                self._params.add(path + suffix,
                                 numpy.zeros_like(param.get_value()))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            raise ValueError(
                "Gradient decay rate is not given in optimization options.")
        self._gamma = optimization_options['gradient_decay_rate']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the expressions that refresh the gradient statistics.

        :rtype: list of tuples
        :returns: for each network variable, a pair that stores the new
                  gradient, followed by a pair that stores the new running
                  average of the squared gradient
        """

        pairs = []
        for path, gradient_new in zip(self.network.get_variables(),
                                      self._gradient_exprs):
            stored = self._params[path + '_gradient']
            mean_sqr = self._params[path + '_mean_sqr_gradient']
            decayed = self._gamma * mean_sqr
            fresh = (1.0 - self._gamma) * tensor.sqr(gradient_new)
            pairs += [(stored, gradient_new), (mean_sqr, decayed + fresh)]
        return pairs

    def _model_update_exprs(self, alpha):
        """Builds the expressions that update the network variables.

        :param alpha: factor that every update is multiplied by before it is
                      added to the variable

        :rtype: list of tuples
        :returns: for each network variable, a pair that stores the new
                  running average of the squared update, followed by a pair
                  that stores the new value of the variable
        """

        steps = dict()
        for path in self.network.get_variables():
            gradient = self._params[path + '_gradient']
            # rms_velocity quantity lags behind rms_gradient by 1 time step,
            # due to the recurrence relationship for velocity.
            rms_gradient = tensor.sqrt(
                self._params[path + '_mean_sqr_gradient'] + self._epsilon)
            rms_velocity = tensor.sqrt(
                self._params[path + '_mean_sqr_velocity'] + self._epsilon)
            steps[path] = -gradient * rms_velocity / rms_gradient
        # _normalize() receives the whole dictionary and is expected to
        # modify it in place; its result is read back below.
        self._normalize(steps)

        pairs = []
        for path, param in self.network.get_variables().items():
            step = steps[path]
            mean_sqr_velocity = self._params[path + '_mean_sqr_velocity']
            mean_sqr_velocity_new = \
                self._gamma * mean_sqr_velocity + \
                (1.0 - self._gamma) * tensor.sqr(step)
            pairs += [(mean_sqr_velocity, mean_sqr_velocity_new),
                      (param, param + alpha * step)]
        return pairs
class RMSPropSGDOptimizer(BasicOptimizer):
    """RMSProp Variation of Stochastic Gradient Descent Optimization Method

    At the time of writing, RMSProp is an unpublished method. Usually people
    cite slide 29 of Lecture 6 of Geoff Hinton's Coursera class:
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    The method keeps a running average of the squared gradient for each
    parameter and divides the gradient by the root of that average (RMS). This
    makes RMSProp take steps near 1 whenever the gradient is of constant
    magnitude, and larger steps whenever the local scale of the gradient
    starts to increase.
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an RMSProp SGD optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options;
                                     ``gradient_decay_rate`` is required

        :type network: Network
        :param network: the neural network object
        """

        self._params = Parameters()
        # Gradients start from zeros. Mean squared gradients start from ones,
        # otherwise the first update would be divided by close to zero.
        initializers = (('_gradient', numpy.zeros_like),
                        ('_mean_sqr_gradient', numpy.ones_like))
        for path, param in network.get_variables().items():
            for suffix, make_initial in initializers:
                self._params.add(path + suffix,
                                 make_initial(param.get_value()))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            raise ValueError(
                "Gradient decay rate is not given in training options.")
        self._gamma = optimization_options['gradient_decay_rate']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the expressions that refresh the gradient statistics.

        :rtype: list of tuples
        :returns: for each network variable, a pair that stores the new
                  gradient, followed by a pair that stores the new running
                  average of the squared gradient
        """

        expressions = []
        for path, gradient_new in zip(self.network.get_variables(),
                                      self._gradient_exprs):
            gradient = self._params[path + '_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            expressions.extend([
                (gradient, gradient_new),
                (ms_gradient,
                 self._gamma * ms_gradient +
                 (1.0 - self._gamma) * tensor.sqr(gradient_new)),
            ])
        return expressions

    def _model_update_exprs(self, alpha):
        """Builds the expressions that update the network variables.

        :param alpha: factor that every update is multiplied by before it is
                      added to the variable

        :rtype: list of tuples
        :returns: ``(variable, new value expression)`` pairs, one for each
                  network variable
        """

        directions = dict()
        for path in self.network.get_variables():
            rms_gradient = tensor.sqrt(
                self._params[path + '_mean_sqr_gradient'] + self._epsilon)
            directions[path] = -self._params[path + '_gradient'] / rms_gradient
        # _normalize() receives the whole dictionary and is expected to
        # modify it in place; its result is read back below.
        self._normalize(directions)

        expressions = []
        for path, param in self.network.get_variables().items():
            expressions.append((param, param + alpha * directions[path]))
        return expressions
class AdaGradOptimizer(BasicOptimizer):
    """AdaGrad Optimization Method

    AdaGrad is a simple extension of Stochastic Gradient Descent that adapts
    the step size for each component, based on how frequently each component
    occurs in the gradients. At each update, the learning rate is divided by
    the root of the sum of squared gradients. (Actually, in this simpler form
    of the algorithm, the squared gradient is used to approximate the outer
    product of the gradient vector by itself.)

    J. Duchi, E. Hazan, Y. Singer (2011)
    Adaptive Subgradient Methods for Online Learning and Stochastic
    Optimization
    Journal of Machine Learning Research 12: 2121-2159

    Note: When using a learning rate decreasing schedule, perhaps a running
    average of the historical gradients would be better than a sum.
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an AdaGrad optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options

        :type network: Network
        :param network: the neural network object
        """

        self._params = Parameters()
        for path, param in network.get_variables().items():
            value = param.get_value()
            self._params.add(path + '_gradient', numpy.zeros_like(value))
            self._params.add(path + '_sum_sqr_gradient',
                             numpy.zeros_like(value))

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the expressions that refresh the gradient statistics.

        :rtype: list of tuples
        :returns: for each network variable, a pair that stores the new
                  gradient, followed by a pair that adds the squared new
                  gradient to the accumulated sum
        """

        pairs = []
        for path, gradient_new in zip(self.network.get_variables(),
                                      self._gradient_exprs):
            stored = self._params[path + '_gradient']
            sum_sqr = self._params[path + '_sum_sqr_gradient']
            pairs += [(stored, gradient_new),
                      (sum_sqr, sum_sqr + tensor.sqr(gradient_new))]
        return pairs

    def _model_update_exprs(self, alpha):
        """Builds the expressions that update the network variables.

        :param alpha: factor that every update is multiplied by before it is
                      added to the variable

        :rtype: list of tuples
        :returns: ``(variable, new value expression)`` pairs, one for each
                  network variable
        """

        steps = dict()
        for path in self.network.get_variables():
            sum_sqr = self._params[path + '_sum_sqr_gradient']
            root_sum_sqr = tensor.sqrt(sum_sqr + self._epsilon)
            steps[path] = -self._params[path + '_gradient'] / root_sum_sqr
        # _normalize() receives the whole dictionary and is expected to
        # modify it in place; its result is read back below.
        self._normalize(steps)

        return [(param, param + alpha * steps[path])
                for path, param in self.network.get_variables().items()]
class AdamOptimizer(BasicOptimizer):
    """Adam Optimization Method

    D. P. Kingma, J. Ba (2015)
    Adam: A Method for Stochastic Optimization
    The International Conference on Learning Representations (ICLR), San Diego
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an Adam optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options;
                                     ``gradient_decay_rate``,
                                     ``sqr_gradient_decay_rate`` and
                                     ``momentum`` are required

        :type network: Network
        :param network: the neural network object

        :raises ValueError: if one of the required options is missing
        """

        self._params = Parameters()
        # The timestep is kept among the optimizer parameters, in the
        # configured Theano float type.
        float_type = numpy.dtype(theano.config.floatX).type
        self._params.add('optimizer/timestep', float_type(0.0))

        for path, param in network.get_variables().items():
            self._params.add(path + '_gradient',
                             numpy.zeros_like(param.get_value()))
            self._params.add(path + '_mean_gradient',
                             numpy.zeros_like(param.get_value()))
            self._params.add(path + '_mean_sqr_gradient',
                             numpy.zeros_like(param.get_value()))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            raise ValueError("Gradient decay rate is not given in training "
                             "options.")
        self._gamma_m = optimization_options['gradient_decay_rate']

        # geometric rate for averaging squared gradients
        if 'sqr_gradient_decay_rate' not in optimization_options:
            raise ValueError("Squared gradient decay rate is not given in "
                             "optimization options.")
        self._gamma_ms = optimization_options['sqr_gradient_decay_rate']

        # momentum (required and stored, but not read by the methods of this
        # class)
        if 'momentum' not in optimization_options:
            raise ValueError("Momentum is not given in optimization options.")
        self._momentum = optimization_options['momentum']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the expressions that refresh the gradient statistics.

        Both running averages are derived from the *new* gradient expression.
        Assuming the returned pairs are applied as Theano shared variable
        updates, each expression sees the values from before the update. Using
        the stored ``<path>_gradient`` would thus feed the previous step's
        gradient into the averages: they would stay at zero during the first
        update, even though the timestep used for bias correction is
        incremented.

        :rtype: list of tuples
        :returns: for each network variable, pairs that store the new gradient,
                  the new mean gradient, and the new mean squared gradient, in
                  this order
        """

        result = []
        for path, gradient_new in zip(self.network.get_variables(),
                                      self._gradient_exprs):
            gradient = self._params[path + '_gradient']
            m_gradient = self._params[path + '_mean_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            m_gradient_new = \
                self._gamma_m * m_gradient + \
                (1.0 - self._gamma_m) * gradient_new
            ms_gradient_new = \
                self._gamma_ms * ms_gradient + \
                (1.0 - self._gamma_ms) * tensor.sqr(gradient_new)
            result.append((gradient, gradient_new))
            result.append((m_gradient, m_gradient_new))
            result.append((ms_gradient, ms_gradient_new))
        return result

    def _model_update_exprs(self, alpha):
        """Builds the expressions that update the network variables.

        :param alpha: factor that every update is multiplied by, after the
                      bias correction has been applied to it

        :rtype: list of tuples
        :returns: ``(variable, new value expression)`` pairs, one for each
                  network variable, followed by the pair that increments the
                  timestep
        """

        timestep = self._params['optimizer/timestep']
        timestep_new = timestep + 1.0
        # Both bias corrections are folded into the step size. The result is
        # bound to a new name rather than written with ``*=`` / ``/=``, so the
        # caller's object cannot be changed in place.
        step_size = alpha * tensor.sqrt(1.0 - (self._gamma_ms ** timestep_new))
        step_size = step_size / (1.0 - (self._gamma_m ** timestep_new))

        updates = dict()
        for path in self.network.get_variables():
            m_gradient = self._params[path + '_mean_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            rms_gradient = tensor.sqrt(ms_gradient) + self._epsilon
            updates[path] = -m_gradient / rms_gradient
        # _normalize() receives the whole dictionary and is expected to
        # modify it in place; its result is read back below.
        self._normalize(updates)

        result = []
        for path, param in self.network.get_variables().items():
            update = updates[path]
            result.append((param, param + step_size * update))
        result.append((timestep, timestep_new))
        return result
class AdadeltaOptimizer(BasicOptimizer):
    """ADADELTA Optimization Method

    ADADELTA optimization method has been derived from AdaGrad. AdaGrad
    accumulates the sum of squared gradients over all time, which is used to
    scale the learning rate smaller and smaller. ADADELTA uses an exponentially
    decaying average of the squared gradients.

    This implementation scales the parameter updates by the learning rate
    hyperparameter. The original paper does not include such scaling,
    corresponding to learning rate 1.

    M. D. Zeiler (2012)
    ADADELTA: An adaptive learning rate method
    http://arxiv.org/abs/1212.5701
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an Adadelta optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options;
                                     ``gradient_decay_rate`` is required

        :type network: Network
        :param network: the neural network object
        """

        self._params = Parameters()
        for path, param in network.get_variables().items():
            value = param.get_value()
            # All three statistics start from zeros shaped like the variable.
            self._params.add(path + '_gradient', numpy.zeros_like(value))
            self._params.add(path + '_mean_sqr_gradient',
                             numpy.zeros_like(value))
            self._params.add(path + '_mean_sqr_velocity',
                             numpy.zeros_like(value))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            message = ("Gradient decay rate is not given in optimization "
                       "options.")
            raise ValueError(message)
        self._gamma = optimization_options['gradient_decay_rate']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the expressions that refresh the gradient statistics.

        :rtype: list of tuples
        :returns: for each network variable, a pair that stores the new
                  gradient, followed by a pair that stores the new running
                  average of the squared gradient
        """

        expressions = []
        for path, gradient_new in zip(self.network.get_variables(),
                                      self._gradient_exprs):
            gradient = self._params[path + '_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            expressions.extend([
                (gradient, gradient_new),
                (ms_gradient,
                 self._gamma * ms_gradient +
                 (1.0 - self._gamma) * tensor.sqr(gradient_new)),
            ])
        return expressions

    def _model_update_exprs(self, alpha):
        """Builds the expressions that update the network variables.

        :param alpha: factor that every update is multiplied by before it is
                      added to the variable

        :rtype: list of tuples
        :returns: for each network variable, a pair that stores the new
                  running average of the squared update, followed by a pair
                  that stores the new value of the variable
        """

        directions = dict()
        for path in self.network.get_variables():
            gradient = self._params[path + '_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            ms_velocity = self._params[path + '_mean_sqr_velocity']
            # rms_velocity quantity lags behind rms_gradient by 1 time step,
            # due to the recurrence relationship for velocity.
            rms_gradient = tensor.sqrt(ms_gradient + self._epsilon)
            rms_velocity = tensor.sqrt(ms_velocity + self._epsilon)
            directions[path] = -gradient * rms_velocity / rms_gradient
        # _normalize() receives the whole dictionary and is expected to
        # modify it in place; its result is read back below.
        self._normalize(directions)

        expressions = []
        for path, param in self.network.get_variables().items():
            direction = directions[path]
            ms_velocity = self._params[path + '_mean_sqr_velocity']
            averaged = self._gamma * ms_velocity + \
                (1.0 - self._gamma) * tensor.sqr(direction)
            expressions.append((ms_velocity, averaged))
            expressions.append((param, param + alpha * direction))
        return expressions
class RMSPropNesterovOptimizer(BasicOptimizer):
    """RMSProp Variation of Nesterov Momentum Optimization Method

    At the time of writing, RMSProp is an unpublished method. Usually people
    cite slide 29 of Lecture 6 of Geoff Hinton's Coursera class:
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    The idea is simply to maintain a running average of the squared gradient for
    each parameter, and divide the gradient by the root of the mean squared
    gradient (RMS). This makes RMSProp take steps near 1 whenever the gradient
    is of constant magnitude, and larger steps whenever the local scale of the
    gradient starts to increase.

    RMSProp has been implemented over many optimization methods. This
    implementation is based on the Nesterov Momentum method. We use an
    alternative formulation that requires the gradient to be computed only at
    the current parameter values, described here:
    https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617
    except that we divide the gradient by the RMS gradient:

    rmsprop_{t-1} = -lr * gradient(params_{t-1}) / rms_gradient(params_{t-1})
    v_{t} = mu * v_{t-1} + rmsprop_{t-1}
    params_{t} = params_{t-1} + mu * v_{t} + rmsprop_{t-1}
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an RMSProp momentum optimizer.

        For every network variable three state variables are created: the
        gradient, the mean squared gradient, and the velocity.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options; has
                                     to contain ``gradient_decay_rate`` and
                                     ``momentum``

        :type network: Network
        :param network: the neural network object

        :raises ValueError: if ``gradient_decay_rate`` or ``momentum`` is not
                            in ``optimization_options``
        """

        self._params = Parameters()
        for path, param in network.get_variables().items():
            self._params.add(path + '_gradient',
                             numpy.zeros_like(param.get_value()))
            # Initialize mean squared gradient to ones, otherwise the first
            # update will be divided by close to zero.
            self._params.add(path + '_mean_sqr_gradient',
                             numpy.ones_like(param.get_value()))
            self._params.add(path + '_velocity',
                             numpy.zeros_like(param.get_value()))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            # The value is read from the optimization options, so that is the
            # place the message has to point the user to.
            raise ValueError("Gradient decay rate is not given in optimization "
                             "options.")
        self._gamma = optimization_options['gradient_decay_rate']

        # momentum
        if 'momentum' not in optimization_options:
            raise ValueError("Momentum is not given in optimization options.")
        self._momentum = optimization_options['momentum']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the expressions that store the new gradient and update the
        running average of the squared gradient.

        :rtype: list of tuples
        :returns: two (variable, new value expression) pairs for each network
                  variable: first the gradient, then the mean squared gradient
        """

        result = []
        for path, gradient_new in zip(self.network.get_variables(),
                                      self._gradient_exprs):
            gradient = self._params[path + '_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            # Exponentially decaying average, weighted by the decay rate.
            ms_gradient_new = \
                self._gamma * ms_gradient + \
                (1.0 - self._gamma) * tensor.sqr(gradient_new)
            result.append((gradient, gradient_new))
            result.append((ms_gradient, ms_gradient_new))
        return result

    def _model_update_exprs(self, alpha):
        """Builds the expressions that update the velocity and the model
        parameters.

        :param alpha: the factor that each normalized update is multiplied by
                      (``lr`` in the class description)

        :rtype: list of tuples
        :returns: two (variable, new value expression) pairs for each network
                  variable: first the velocity, then the parameter
        """

        updates = dict()
        for path in self.network.get_variables():
            gradient = self._params[path + '_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            # self._epsilon is not set in this class -- presumably the
            # superclass provides it; it keeps the divisor away from zero.
            rms_gradient = tensor.sqrt(ms_gradient + self._epsilon)
            updates[path] = -gradient / rms_gradient
        self._normalize(updates)

        result = []
        for path, param in self.network.get_variables().items():
            update = updates[path]
            velocity = self._params[path + '_velocity']
            # v_{t} = mu * v_{t-1} + rmsprop_{t-1}
            velocity_new = self._momentum * velocity + alpha * update
            # params_{t} = params_{t-1} + mu * v_{t} + rmsprop_{t-1}
            param_new = param + self._momentum * velocity_new + alpha * update
            result.append((velocity, velocity_new))
            result.append((param, param_new))
        return result
class BasicLayer(object, metaclass=ABCMeta):
    """Superclass for Neural Network Layers
    """

    def __init__(self, layer_options, network, profile=False):
        """Saves some attributes that are common to all layers.

        :type layer_options: dict
        :param layer_options: dictionary of layer options; ``name``,
                              ``input_layers``, and ``devices`` are required,
                              ``size`` defaults to the sum of the input layer
                              sizes

        :type network: Network
        :param network: the network object creating this layer

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.name = layer_options['name']
        self.input_layers = layer_options['input_layers']
        self.params = Parameters()
        self._devices = layer_options['devices']

        if 'size' in layer_options:
            self.output_size = int(layer_options['size'])
        else:
            self.output_size = \
                sum(x.output_size for x in self.input_layers)

        logging.debug("- %s name=%s inputs=[%s] size=%d, devices=[%s]",
                      self.__class__.__name__,
                      self.name,
                      ', '.join([x.name for x in self.input_layers]),
                      self.output_size,
                      ', '.join([str(x) for x in self._devices]))

        self._network = network
        self._profile = profile

    @abstractmethod
    def create_structure(self):
        """Creates the symbolic graph of this layer.

        Sets self.output to a symbolic matrix that describes the output of this
        layer.
        """

        assert False

    def _param_path(self, param_name, device=None):
        """Returns the HDF5 path used to address a parameter.

        :type param_name: str
        :param param_name: name of a parameter within this layer

        :type device: str
        :param device: ``None`` for parameters that reside on the default device
                       only; otherwise returns the path used to address the part
                       of the parameter that resides on the given device

        :rtype: str
        :returns: full path of the parameter in a HDF5 file.
        """

        result = 'layers/' + self.name + '/' + param_name
        if device is not None:
            result += '/' + device
        return result

    def _get_param(self, param_name, device=None):
        """Returns a Theano tensor variable by parameter name.

        :type param_name: str
        :param param_name: name of a parameter within the layer

        :type device: str
        :param device: ``None`` for parameters that reside on the default device
                       only; otherwise returns the part of the parameter that
                       resides on the given device

        :rtype: TensorVariable
        :returns: the corresponding tensor variable
        """

        return self.params[self._param_path(param_name, device)]

    def _init_weight(self, param_name, shape, scale=None, count=1,
                     split_to_devices=False):
        """Generates a weight matrix from "standard normal" distribution.

        If ``shape`` contains two dimensions that match, generates an orthogonal
        matrix. In that case scale is ignored. Orthogonal weights are useful for
        two reasons:

        1. Multiplying by an orthogonal weight preserves the norm of the
           input vector, which should help avoid exploding and vanishing
           gradients.
        2. The row and column vectors are orthonormal to one another, which
           should help avoid two vectors learning to produce the same features.

        If ``count`` is specified, creates a concatenation of several similar
        submatrices (same shape but different content).

        If ``split_to_devices`` is set to ``True``, splits the weight to equal
        parts on the last dimension, and creates one parameter for each device.
        If also ``count`` is specified, each device will have an equal part of
        every submatrix.

        :type param_name: str
        :param param_name: name for the parameter within the layer

        :type shape: list or tuple of ints
        :param shape: sizes of the weight dimensions; normally the first one is
                      the dimensionality of the input data and the second one is
                      the dimensionality of the output data

        :type scale: float
        :param scale: if other than ``None``, the matrix will be scaled by this
                      factor, unless an orthogonal matrix is created

        :type count: int
        :param count: concatenate this many weight matrices with the same shape

        :type split_to_devices: bool
        :param split_to_devices: if set to ``True``, creates on every device a
                                 parameter that contains one part of the weight
        """

        path = self._param_path(param_name)
        # Generate the matrix exactly once; every branch below uses this value,
        # so no random matrix is computed just to be thrown away.
        weight = random_matrix(shape, scale, count)
        if split_to_devices and not self._uses_default_device():
            self._split_to_devices(path, weight, self._last_dim_size(shape))
        else:
            self.params.add(path, weight)

    def _init_bias(self, param_name, shape, value=None, split_to_devices=False):
        """Initializes a bias vector with given value.

        If ``value`` is not given, initializes the vector with zero value. If
        ``value`` is a list, creates a concatenation of as many vectors as there
        are elements in the list.

        If ``split_to_devices`` is set to ``True``, splits the array to equal
        parts on the last dimension, and creates one parameter for each device.
        If ``value`` is a list, each device will have an equal part of every
        submatrix.

        :type param_name: str
        :param param_name: name for the parameter within the layer

        :type shape: int or tuple of ints
        :param shape: size of the vector, or a tuple of the sizes of each
                      dimension (in case ``value`` is a list, each part will
                      have this size)

        :type value: float, numpy.ndarray or list
        :param value: the value or array to initialize the elements to, or a
                      list of values or arrays to create a concatenation of
                      vectors

        :type split_to_devices: bool
        :param split_to_devices: if set to ``True``, creates on every device a
                                 parameter that contains one part of the array
        """

        path = self._param_path(param_name)
        bias = matrix_from_value(shape, value)
        if split_to_devices and not self._uses_default_device():
            # ``shape`` may be a plain int, which cannot be indexed.
            self._split_to_devices(path, bias, self._last_dim_size(shape))
        else:
            self.params.add(path, bias)

    def _uses_default_device(self):
        """Tells whether this layer has not been assigned to a specific device.

        :rtype: bool
        :returns: ``True`` if the only device of the layer is ``None``
        """

        return (len(self._devices) == 1) and (self._devices[0] is None)

    @staticmethod
    def _last_dim_size(shape):
        """Returns the size of the last dimension described by ``shape``.

        :type shape: int, or list or tuple of ints
        :param shape: size of a vector, or the sizes of each dimension

        :rtype: int
        :returns: ``shape`` itself if it is not a list or a tuple, otherwise its
                  last element
        """

        if isinstance(shape, (list, tuple)):
            return shape[-1]
        return shape

    def _split_to_devices(self, path, value, part_size):
        """Splits a matrix to equal parts on the last dimension, and creates a
        parameter on each device.

        If the matrix consists of submatrices, each device will have an equal
        part of every submatrix, whose size is specified by ``part_size``.

        :type path: str
        :param path: base path for the parameters that will be prefixed by the
                     device string

        :type value: numpy.ndarray
        :param value: a matrix that will be split to give the initial value of
                      the parameters

        :type part_size: int
        :param part_size: size of the last dimension of ``value``, or if
                          ``value`` consists of multiple submatrices, size of
                          one submatrix

        :raises ValueError: if the last dimension of ``value`` is not a multiple
                            of ``part_size``
        """

        part_count = value.shape[-1] // part_size
        if part_count * part_size != value.shape[-1]:
            raise ValueError("Last dimension is not a multiple of part size.")

        split_sizes = self._size_per_device(part_size)
        split_start = 0
        for device, split_size in zip(self._devices, split_sizes):
            assert device is not None
            split_end = split_start + split_size
            # Collect the columns [split_start, split_end) of every submatrix,
            # so that the device gets the same slice of each of them.
            ranges = []
            for part_index in range(part_count):
                part_start = part_index * part_size
                ranges.extend(range(part_start + split_start,
                                    part_start + split_end))
            split_start = split_end
            self.params.add(path + '/' + device, value[..., ranges], device)

    def _size_per_device(self, total_size):
        """Returns ``total_size`` divided for each device.

        The sizes differ by at most one; if ``total_size`` is not divisible by
        the number of devices, the first devices get the larger parts.

        :type total_size: int
        :param total_size: total size of a parameter

        :rtype: list of ints
        :returns: ``total_size`` divided into as many parts as there are devices
                  assigned to this layer

        :raises RuntimeError: if no devices are assigned to this layer
        :raises ValueError: if ``total_size`` is smaller than the number of
                            devices
        """

        num_devices = len(self._devices)
        if num_devices < 1:
            raise RuntimeError("No devices assigned to this layer.")
        if total_size < num_devices:
            raise ValueError("Cannot split matrix of size {} to {} devices."
                             .format(total_size, num_devices))

        quotient, remainder = divmod(total_size, num_devices)
        # The first ``remainder`` devices take one extra element each.
        return [quotient + 1 if index < remainder else quotient
                for index in range(num_devices)]

    def _tensor_preact(self, input_matrix, param_name):
        """Helper function that creates a pre-activation of ``input_matrix`` by
        multiplying it by a weight matrix and adding a bias.

        ``input_matrix`` and the result normally have the shape of a mini-batch:
        the first dimension is the time step and the second dimension is the
        sequence. The last dimension is always the data vector. The size of the
        input data vector should equal to the first dimension of the weight
        vector, and the second dimension of the weight vector defines the size
        of the output data vector.

        :type input_matrix: TensorVariable
        :param input_matrix: the preactivations will be computed by multiplying
                             the data vectors (the last dimension of this
                             matrix) by the weight matrix, and adding bias

        :type param_name: str
        :param param_name: name of a parameter group that contains a weight
                           matrix and a bias vector

        :rtype: TensorVariable
        :returns: a matrix that has the same number of dimensions as
                  ``input_matrix``, but the data vectors (the last dimension of
                  this matrix) are the preactivations
        """

        base_path = self._param_path(param_name)
        weight = self.params[base_path + '/W']
        bias = self.params[base_path + '/b']
        return tensor.dot(input_matrix, weight) + bias
class NesterovOptimizer(BasicOptimizer):
    """Nesterov Momentum Optimization Method

    The usual way to implement Nesterov momentum is to take a step in the
    direction of the previous update, compute the gradient at that position,
    derive the new update direction from the gradient, and only then update the
    parameters.

    This class uses an alternative formulation, in which the gradient needs to
    be computed only at the current parameter values. It is described here:
    https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617

    v_{t} = mu * v_{t-1} - lr * gradient(params_{t-1})
    params_{t} = params_{t-1} + mu * v_{t} - lr * gradient(params_{t-1})
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates a Nesterov momentum optimizer.

        For every network variable two zero-initialized state variables are
        created: the gradient and the velocity.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options; has
                                     to contain ``momentum``

        :type network: Network
        :param network: the neural network object

        :raises ValueError: if ``momentum`` is not in ``optimization_options``
        """

        self._params = Parameters()
        for path, param in network.get_variables().items():
            for suffix in ('_gradient', '_velocity'):
                self._params.add(path + suffix,
                                 numpy.zeros_like(param.get_value()))

        # momentum
        if 'momentum' not in optimization_options:
            raise ValueError("Momentum is not given in optimization options.")
        self._momentum = optimization_options['momentum']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the expressions that store the new gradient of every network
        variable.

        :rtype: list of tuples
        :returns: one (gradient variable, new gradient expression) pair for
                  each network variable
        """

        paths = self.network.get_variables()
        return [(self._params[path + '_gradient'], new_gradient)
                for path, new_gradient in zip(paths, self._gradient_exprs)]

    def _model_update_exprs(self, alpha):
        """Builds the expressions that update the velocity and the model
        parameters.

        :param alpha: the factor that each normalized update is multiplied by
                      (``lr`` in the class description)

        :rtype: list of tuples
        :returns: two (variable, new value expression) pairs for each network
                  variable: first the velocity, then the parameter
        """

        steps = dict()
        for path in self.network.get_variables():
            steps[path] = -self._params[path + '_gradient']
        self._normalize(steps)

        exprs = []
        for path, param in self.network.get_variables().items():
            step = steps[path]
            old_velocity = self._params[path + '_velocity']
            new_velocity = self._momentum * old_velocity + alpha * step
            new_param = param + self._momentum * new_velocity + alpha * step
            exprs.extend([(old_velocity, new_velocity),
                          (param, new_param)])
        return exprs