Example #1
0
class SGDOptimizer(BasicOptimizer):
    """Plain stochastic gradient descent.

    Keeps one ``<path>_gradient`` parameter for every network variable and
    steps each variable along the negated stored gradient, scaled by the
    learning rate.
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Allocates gradient storage and initializes the base optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options

        :type network: Network
        :param network: the neural network object
        """

        # The optimizer parameters are created before the base class
        # constructor is invoked.
        self._params = Parameters()
        for name, variable in network.get_variables().items():
            zeros = numpy.zeros_like(variable.get_value())
            self._params.add(name + '_gradient', zeros)

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Pairs every stored gradient with its new gradient expression.

        ``self._gradient_exprs`` is not defined in this class; it is assumed
        to be provided by the base class in the same order as the network
        variables.

        :rtype: list of tuples
        :returns: ``(stored gradient, new gradient expression)`` pairs
        """

        names = self.network.get_variables()
        return [(self._params[name + '_gradient'], expr)
                for name, expr in zip(names, self._gradient_exprs)]

    def _model_update_exprs(self, alpha):
        """Builds the expressions that move the model parameters.

        :type alpha: scalar (presumably symbolic; not verifiable from here)
        :param alpha: learning rate that scales every step

        :rtype: list of tuples
        :returns: ``(variable, updated variable expression)`` pairs
        """

        # Step direction is the negated stored gradient.
        steps = {name: -self._params[name + '_gradient']
                 for name in self.network.get_variables()}
        # The return value of _normalize() is ignored, so it has to work on
        # the dictionary in place.
        self._normalize(steps)

        return [(variable, variable + alpha * steps[name])
                for name, variable in self.network.get_variables().items()]
Example #2
0
class SGDOptimizer(BasicOptimizer):
    """Stochastic gradient descent without momentum or adaptive scaling.

    For each network variable a ``<path>_gradient`` parameter stores the most
    recent gradient; the model update moves the variable against it.
    """
    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates the gradient parameters, then the base optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options

        :type network: Network
        :param network: the neural network object
        """

        gradient_store = Parameters()
        self._params = gradient_store
        for var_path, var in network.get_variables().items():
            # Same shape and dtype as the variable, filled with zeros.
            gradient_store.add(var_path + '_gradient',
                               numpy.zeros_like(var.get_value()))

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Returns the updates that store the new gradients.

        :rtype: list of tuples
        :returns: one ``(stored gradient, new gradient expression)`` pair per
                  network variable
        """

        pairs = []
        var_paths = self.network.get_variables()
        for var_path, new_value in zip(var_paths, self._gradient_exprs):
            stored = self._params[var_path + '_gradient']
            pairs.append((stored, new_value))
        return pairs

    def _model_update_exprs(self, alpha):
        """Returns the updates that apply the stored gradients to the model.

        :type alpha: scalar (presumably symbolic; not verifiable from here)
        :param alpha: learning rate

        :rtype: list of tuples
        :returns: one ``(variable, new value expression)`` pair per network
                  variable
        """

        directions = dict()
        for var_path in self.network.get_variables():
            directions[var_path] = -self._params[var_path + '_gradient']
        # _normalize() is called for its effect on the dictionary; its return
        # value is not used.
        self._normalize(directions)

        pairs = []
        for var_path, var in self.network.get_variables().items():
            pairs.append((var, var + alpha * directions[var_path]))
        return pairs
Example #3
0
class BasicLayer(object, metaclass=ABCMeta):
    """Superclass for Neural Network Layers

    Stores the attributes shared by all layers, owns the layer parameters
    (``self._params``), and offers helpers for creating weights and biases,
    optionally split across the devices assigned to the layer.
    """
    def __init__(self, layer_options, network, profile=False):
        """Saves some attributes that are common to all layers.

        ``layer_options`` must contain the keys ``'name'``, ``'input_layers'``
        and ``'devices'``. The keys ``'size'``, ``'depth'`` and
        ``'reverse_time'`` are optional.

        :type layer_options: dict
        :param layer_options: dictionary of layer options

        :type network: Network
        :param network: the network object creating this layer

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.name = layer_options['name']
        self._input_layers = layer_options['input_layers']
        self._params = Parameters()
        self._devices = layer_options['devices']

        # Without an explicit size, the output is as large as all the inputs
        # together.
        if 'size' in layer_options:
            self.output_size = int(layer_options['size'])
        else:
            self.output_size = sum(x.output_size for x in self._input_layers)

        # Convolutional layers may produce two-dimensional output. In that case,
        # the state matrix is four-dimensional and the size of the last
        # dimension is self.output_depth.
        if 'depth' in layer_options:
            self.output_depth = int(layer_options['depth'])
        else:
            self.output_depth = 1

        if 'reverse_time' in layer_options:
            self._reverse_time = bool(layer_options['reverse_time'])
        else:
            self._reverse_time = False

        logging.debug(
            "- %s name=%s inputs=[%s] size=%d depth=%d%s devices=[%s]",
            self.__class__.__name__, self.name,
            ', '.join(x.name for x in self._input_layers), self.output_size,
            self.output_depth, ' reverse,' if self._reverse_time else '',
            ', '.join(str(x) for x in self._devices))

        self._network = network
        self._profile = profile

    @abstractmethod
    def create_structure(self):
        """Creates the symbolic graph of this layer.

        Sets self.output to a symbolic matrix that describes the output of this
        layer.
        """

        assert False

    def get_state(self, state):
        """Pulls parameter values from Theano shared variables.

        If there already is a parameter in the state, it will be replaced, so it
        has to have the same number of elements.

        :type state: h5py.File
        :param state: HDF5 file for storing the neural network parameters
        """

        self._params.get_state(state)

    def set_state(self, state):
        """Sets the values of Theano shared variables.

        :type state: h5py.File
        :param state: HDF5 file that contains the neural network parameters
        """

        self._params.set_state(state)

    def num_params(self):
        """Returns the number of parameters in this layer.

        This method is used just for reporting the number of parameters in the
        model. Normally there is just one set of parameters.

        :rtype: int
        :returns: the number of parameters used by the layer
        """

        return self._params.total_size

    def get_variables(self):
        """Returns a dictionary of the shared variables.

        This function is used by the optimizers to create optimization
        parameters that are specific to network parameters, and compute
        gradients with regard to the parameters. Normally there is just one set
        of parameters.

        :rtype: dict
        :returns: mapping from parameter path to Theano shared variables
        """

        return self._params.get_variables()

    def _param_path(self, param_name, device=None):
        """Returns the HDF5 path used to address a parameter.

        The path has the form ``layers/<layer name>/<param_name>``, followed by
        ``/<device>`` when a device is given.

        :type param_name: str
        :param param_name: name of a parameter within this layer

        :type device: str
        :param device: ``None`` for parameters that reside on the default device
                       only; otherwise returns the path used to address the part
                       of the parameter that resides on the given device

        :rtype: str
        :returns: full path of the parameter in a HDF5 file.
        """

        result = 'layers/' + self.name + '/' + param_name
        if device is not None:
            result += '/' + device
        return result

    def _get_param(self, param_name, device=None):
        """Returns a Theano tensor variable by parameter name.

        :type param_name: str
        :param param_name: name of a parameter within the layer

        :type device: str
        :param device: ``None`` for parameters that reside on the default device
                       only; otherwise returns the part of the parameter that
                       resides on the given device

        :rtype: TensorVariable
        :returns: the corresponding tensor variable
        """

        return self._params[self._param_path(param_name, device)]

    def _init_weight(self,
                     param_name,
                     shape,
                     scale=None,
                     count=1,
                     split_to_devices=False):
        """Generates a weight matrix from “standard normal” distribution.

        If ``shape`` contains two dimensions that match, generates an orthogonal
        matrix. In that case scale is ignored. Orthogonal weights are useful for
        two reasons:

        1. Multiplying by an orthogonal weight preserves the norm of the
           input vector, which should help avoid exploding and vanishing
           gradients.
        2. The row and column vectors are orthonormal to one another, which
           should help avoid two vectors learning to produce the same features.

        If ``count`` is specified, creates a concatenation of several similar
        submatrices (same shape but different content).

        If ``split_to_devices`` is set to ``True``, splits the weight to equal
        parts on the last dimension, and creates one parameter for each device.
        If also ``count`` is specified, each device will have an equal part of
        every submatrix.

        :type param_name: str
        :param param_name: name for the parameter within the layer

        :type shape: list or tuple of ints
        :param shape: sizes of the weight dimensions; normally the first one is
                      the dimensionality of the input data and the second one is
                      the dimensionality of the output data

        :type scale: float
        :param scale: if other than ``None``, the matrix will be scaled by this
                      factor, unless an orthogonal matrix is created

        :type count: int
        :param count: concatenate this many weight matrices with the same shape

        :type split_to_devices: bool
        :param split_to_devices: if set to ``True``, creates on every device a
                                 parameter that contains one part of the weight
        """

        path = self._param_path(param_name)
        # The matrix is generated exactly once and used by every branch below,
        # so the random generator is not consumed for a value that would be
        # thrown away.
        weight = random_matrix(shape, scale, count)
        if (not split_to_devices) or \
           ((len(self._devices) == 1) and (self._devices[0] is None)):
            # Either no split was requested, or this layer has not been
            # assigned to a specific device.
            self._params.add(path, weight)
        else:
            self._split_to_devices(path, weight, shape[-1])

    def _init_bias(self,
                   param_name,
                   shape,
                   value=None,
                   split_to_devices=False):
        """Initializes a bias vector with given value.

        If ``value`` is not given, initializes the vector with zero value. If
        ``value`` is a list, creates a concatenation of as many vectors as there
        are elements in the list.

        If ``split_to_devices`` is set to ``True``, splits the array to equal
        parts on the last dimension, and creates one parameter for each device.
        If ``value`` is a list, each device will have an equal part of every
        submatrix.

        :type param_name: str
        :param param_name: name for the parameter within the layer

        :type shape: int or tuple of ints
        :param shape: size of the vector, or a tuple of the sizes of each
                      dimension (in case ``value`` is a list, each part will
                      have this size)

        :type value: float, numpy.ndarray or list
        :param value: the value or array to initialize the elements to, or a
                      list of values or arrays to create a concatenation of
                      vectors

        :type split_to_devices: bool
        :param split_to_devices: if set to ``True``, creates on every device a
                                 parameter that contains one part of the array
        """

        path = self._param_path(param_name)
        # Built once and shared by all the branches below.
        bias = matrix_from_value(shape, value)
        if (not split_to_devices) or \
           ((len(self._devices) == 1) and (self._devices[0] is None)):
            # Either no split was requested, or this layer has not been
            # assigned to a specific device.
            self._params.add(path, bias)
        else:
            # ``shape`` may be a plain int, which cannot be indexed. In that
            # case the int itself is the size of the last dimension.
            part_size = shape if isinstance(shape, int) else shape[-1]
            self._split_to_devices(path, bias, part_size)

    def _split_to_devices(self, path, value, part_size):
        """Splits a matrix to equal parts on the last dimension, and creates a
        parameter on each device.

        If the matrix consists of submatrices, each device will have an equal
        part of every submatrix, whose size is specified by ``part_size``.

        :type path: str
        :param path: base path for the parameters; the device string is
                     appended to it after a slash

        :type value: numpy.ndarray
        :param value: a matrix that will be split to give the initial value of
                      the parameters

        :type part_size: int
        :param part_size: size of the last dimension of ``value``, or if
                          ``value`` consists of multiple submatrices, size of
                          one submatrix

        :raises ValueError: if the last dimension of ``value`` is not a
                            multiple of ``part_size``
        """

        part_count = value.shape[-1] // part_size
        if part_count * part_size != value.shape[-1]:
            raise ValueError("Last dimension is not a multiple of part size.")

        split_sizes = self._size_per_device(part_size)
        split_start = 0
        for device, split_size in zip(self._devices, split_sizes):
            assert device is not None
            split_end = split_start + split_size
            # Collect the same column range from every submatrix, so that the
            # device receives its share of each of them.
            ranges = []
            for part_index in range(part_count):
                part_start = part_index * part_size
                ranges.extend(
                    range(part_start + split_start, part_start + split_end))
            split_start = split_end
            self._params.add(path + '/' + device, value[..., ranges], device)

    def _size_per_device(self, total_size):
        """Returns ``total_size`` divided for each device.

        The sizes differ by at most one; when the division is not even, the
        first devices receive the extra elements.

        :type total_size: int
        :param total_size: total size of a parameter

        :rtype: list of ints
        :returns: ``total_size`` divided into as many parts as there are devices
                  assigned to this layer

        :raises RuntimeError: if no devices are assigned to this layer
        :raises ValueError: if ``total_size`` is smaller than the number of
                            devices
        """

        num_devices = len(self._devices)
        if num_devices < 1:
            raise RuntimeError("No devices assigned to this layer.")
        if total_size < num_devices:
            raise ValueError(
                "Cannot split matrix of size {} to {} devices.".format(
                    total_size, num_devices))

        quotient, remainder = divmod(total_size, num_devices)
        # The first ``remainder`` devices get one element more than the rest.
        result = [quotient + 1 if index < remainder else quotient
                  for index in range(num_devices)]

        assert len(result) == num_devices
        assert sum(result) == total_size

        return result

    def _tensor_preact(self, input_matrix, param_name):
        """Helper function that creates a pre-activation of ``input_matrix`` by
        multiplying it by a weight matrix and adding a bias.

        ``input_matrix`` and the result normally have the shape of a mini-batch:
        the first dimension is the time step and the second dimension is the
        sequence. The last dimension is always the data vector. The size of the
        input data vector should equal to the first dimension of the weight
        vector, and the second dimension of the weight vector defines the size
        of the output data vector.

        The weight and the bias are read from the parameters
        ``<param_name>/W`` and ``<param_name>/b`` of this layer.

        :type input_matrix: TensorVariable
        :param input_matrix: the preactivations will be computed by multiplying
                             the data vectors (the last dimension of this
                             matrix) by the weight matrix, and adding bias

        :type param_name: str
        :param param_name: name of a parameter group that contains a weight
                           matrix and a bias vector

        :rtype: TensorVariable
        :returns: a matrix that has the same number of dimensions as
                  ``input_matrix``, but the data vectors (the last dimension of
                  this matrix) are the preactivations
        """

        weight = self._params[self._param_path(param_name) + '/W']
        bias = self._params[self._param_path(param_name) + '/b']
        return tensor.dot(input_matrix, weight) + bias
class RMSPropSGDOptimizer(BasicOptimizer):
    """RMSProp Variation of Stochastic Gradient Descent Optimization Method

    RMSProp has not been formally published. The customary citation is slide
    29 of Lecture 6 of Geoff Hinton's Coursera class:
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    A running average of the squared gradient is kept for every parameter, and
    the gradient is divided by the square root of that average (RMS) before it
    is applied.
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an RMSProp SGD optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options;
                                     has to contain ``'gradient_decay_rate'``

        :type network: Network
        :param network: the neural network object

        :raises ValueError: if ``'gradient_decay_rate'`` is missing from the
                            options
        """

        self._params = Parameters()
        for name, variable in network.get_variables().items():
            value = variable.get_value()
            self._params.add(name + '_gradient', numpy.zeros_like(value))
            # The mean squared gradient starts from ones, because otherwise
            # the first update would be divided by a value close to zero.
            self._params.add(name + '_mean_sqr_gradient',
                             numpy.ones_like(value))

        # geometric rate for averaging gradients
        try:
            self._gamma = optimization_options['gradient_decay_rate']
        except KeyError:
            raise ValueError("Gradient decay rate is not given in training "
                             "options.")

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Returns the updates of the gradient statistics.

        :rtype: list of tuples
        :returns: for every network variable, a pair that stores the new
                  gradient, followed by a pair that updates the running mean
                  of the squared gradient
        """

        keep = self._gamma
        exprs = []
        names = self.network.get_variables()
        for name, new_grad in zip(names, self._gradient_exprs):
            stored_grad = self._params[name + '_gradient']
            mean_sqr = self._params[name + '_mean_sqr_gradient']
            new_mean_sqr = keep * mean_sqr + (1.0 - keep) * tensor.sqr(new_grad)
            exprs.extend([(stored_grad, new_grad), (mean_sqr, new_mean_sqr)])
        return exprs

    def _model_update_exprs(self, alpha):
        """Returns the updates that move the model parameters.

        :type alpha: scalar (presumably symbolic; not verifiable from here)
        :param alpha: learning rate

        :rtype: list of tuples
        :returns: ``(variable, new value expression)`` pairs
        """

        steps = dict()
        for name in self.network.get_variables():
            stored_grad = self._params[name + '_gradient']
            mean_sqr = self._params[name + '_mean_sqr_gradient']
            # Epsilon is added inside the square root.
            steps[name] = -stored_grad / tensor.sqrt(mean_sqr + self._epsilon)
        self._normalize(steps)

        return [(variable, variable + alpha * steps[name])
                for name, variable in self.network.get_variables().items()]
Example #5
0
class AdamOptimizer(BasicOptimizer):
    """Adam Optimization Method

    D. P. Kingma, J. Ba (2015)
    Adam: A Method for Stochastic Optimization
    The International Conference on Learning Representations (ICLR), San Diego

    For every network variable the optimizer stores the latest gradient and
    running averages of the gradient and of the squared gradient. A shared
    time step counter is used for correcting the bias of the averages.
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an Adam optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options; has
                                     to contain ``'gradient_decay_rate'``,
                                     ``'sqr_gradient_decay_rate'`` and
                                     ``'momentum'``

        :type network: Network
        :param network: the neural network object

        :raises ValueError: if any of the required options is missing
        """

        self._params = Parameters()

        # The time step is stored in the configured Theano float type.
        float_type = numpy.dtype(theano.config.floatX).type
        self._params.add('optimizer/timestep', float_type(0.0))

        for path, param in network.get_variables().items():
            self._params.add(path + '_gradient',
                             numpy.zeros_like(param.get_value()))
            self._params.add(path + '_mean_gradient',
                             numpy.zeros_like(param.get_value()))
            self._params.add(path + '_mean_sqr_gradient',
                             numpy.zeros_like(param.get_value()))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            raise ValueError("Gradient decay rate is not given in training "
                             "options.")
        self._gamma_m = optimization_options['gradient_decay_rate']

        # geometric rate for averaging squared gradients
        if 'sqr_gradient_decay_rate' not in optimization_options:
            raise ValueError("Squared gradient decay rate is not given in "
                             "optimization options.")
        self._gamma_ms = optimization_options['sqr_gradient_decay_rate']

        # momentum
        # NOTE(review): the update expressions of this class never read
        # self._momentum; the option is still required to keep the existing
        # validation behavior. Confirm whether anything else depends on it.
        if 'momentum' not in optimization_options:
            raise ValueError("Momentum is not given in optimization options.")
        self._momentum = optimization_options['momentum']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Returns the updates of the gradient statistics.

        :rtype: list of tuples
        :returns: for every network variable, pairs that store the new
                  gradient and update the running means of the gradient and
                  of the squared gradient
        """

        result = []
        for path, gradient_new in zip(self.network.get_variables(),
                                      self._gradient_exprs):
            gradient = self._params[path + '_gradient']
            m_gradient = self._params[path + '_mean_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            # The averages have to be computed from gradient_new. When the
            # update expressions are evaluated, the shared variable
            # ``gradient`` still holds the value stored by the previous
            # update, so using it would make both averages lag one step
            # behind.
            m_gradient_new = \
                self._gamma_m * m_gradient + \
                (1.0 - self._gamma_m) * gradient_new
            ms_gradient_new = \
                self._gamma_ms * ms_gradient + \
                (1.0 - self._gamma_ms) * tensor.sqr(gradient_new)
            result.append((gradient, gradient_new))
            result.append((m_gradient, m_gradient_new))
            result.append((ms_gradient, ms_gradient_new))
        return result

    def _model_update_exprs(self, alpha):
        """Returns the updates that move the model parameters and advance the
        time step.

        :type alpha: scalar (presumably symbolic; not verifiable from here)
        :param alpha: learning rate

        :rtype: list of tuples
        :returns: ``(variable, new value expression)`` pairs, followed by the
                  update of the time step
        """

        timestep = self._params['optimizer/timestep']
        timestep_new = timestep + 1.0
        # Bias correction is folded into the learning rate. The argument is
        # rebound instead of modified with an augmented assignment, so that
        # an object passed by the caller can never be changed in place.
        alpha = alpha * tensor.sqrt(1.0 - (self._gamma_ms ** timestep_new))
        alpha = alpha / (1.0 - (self._gamma_m ** timestep_new))

        updates = dict()
        for path in self.network.get_variables():
            m_gradient = self._params[path + '_mean_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            # Epsilon is added outside the square root.
            rms_gradient = tensor.sqrt(ms_gradient) + self._epsilon
            updates[path] = -m_gradient / rms_gradient
        self._normalize(updates)

        result = []
        for path, param in self.network.get_variables().items():
            update = updates[path]
            result.append((param, param + alpha * update))
        result.append((timestep, timestep_new))
        return result
Example #6
0
class NesterovOptimizer(BasicOptimizer):
    """Nesterov Momentum Optimization Method

    The usual way to implement Nesterov momentum is to step towards the
    previous update direction, compute the gradient there, derive the new
    update direction from it, and then update the parameters. This class uses
    an alternative formulation, in which the gradient is needed only at the
    current parameter values. It is described here:
    https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617

    v_{t} = mu * v_{t-1} - lr * gradient(params_{t-1})
    params_{t} = params_{t-1} + mu * v_{t} - lr * gradient(params_{t-1})
    """
    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates a Nesterov momentum optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options;
                                     has to contain ``'momentum'``

        :type network: Network
        :param network: the neural network object

        :raises ValueError: if ``'momentum'`` is missing from the options
        """

        self._params = Parameters()
        for name, variable in network.get_variables().items():
            # Both the gradient and the velocity start from zeros.
            for suffix in ('_gradient', '_velocity'):
                self._params.add(name + suffix,
                                 numpy.zeros_like(variable.get_value()))

        # momentum
        try:
            self._momentum = optimization_options['momentum']
        except KeyError:
            raise ValueError("Momentum is not given in optimization options.")

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Returns the updates that store the new gradients.

        :rtype: list of tuples
        :returns: ``(stored gradient, new gradient expression)`` pairs
        """

        names = self.network.get_variables()
        return [(self._params[name + '_gradient'], new_grad)
                for name, new_grad in zip(names, self._gradient_exprs)]

    def _model_update_exprs(self, alpha):
        """Returns the updates of the velocities and the model parameters.

        :type alpha: scalar (presumably symbolic; not verifiable from here)
        :param alpha: learning rate

        :rtype: list of tuples
        :returns: for every network variable, the velocity update followed by
                  the parameter update
        """

        steps = {name: -self._params[name + '_gradient']
                 for name in self.network.get_variables()}
        self._normalize(steps)

        mu = self._momentum
        exprs = []
        for name, variable in self.network.get_variables().items():
            step = steps[name]
            old_velocity = self._params[name + '_velocity']
            new_velocity = mu * old_velocity + alpha * step
            exprs.append((old_velocity, new_velocity))
            # The parameter moves by the momentum-scaled new velocity plus
            # the plain gradient step.
            exprs.append((variable,
                          variable + mu * new_velocity + alpha * step))
        return exprs
Example #7
0
class AdadeltaOptimizer(BasicOptimizer):
    """ADADELTA Optimization Method

    ADADELTA is derived from AdaGrad. Where AdaGrad accumulates the sum of
    squared gradients over all time, making the effective learning rate ever
    smaller, ADADELTA keeps an exponentially decaying average of the squared
    gradients.

    The parameter updates of this implementation are scaled by the learning
    rate hyperparameter. The original paper has no such scaling, which
    corresponds to learning rate 1.

    M. D. Zeiler (2012)
    ADADELTA: An adaptive learning rate method
    http://arxiv.org/abs/1212.5701
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an Adadelta optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options;
                                     has to contain ``'gradient_decay_rate'``

        :type network: Network
        :param network: the neural network object

        :raises ValueError: if ``'gradient_decay_rate'`` is missing from the
                            options
        """

        self._params = Parameters()
        suffixes = ("_gradient", "_mean_sqr_gradient", "_mean_sqr_velocity")
        for name, variable in network.get_variables().items():
            # All three statistics start from zeros.
            for suffix in suffixes:
                self._params.add(name + suffix,
                                 numpy.zeros_like(variable.get_value()))

        # geometric rate for averaging gradients
        try:
            self._gamma = optimization_options["gradient_decay_rate"]
        except KeyError:
            raise ValueError(
                "Gradient decay rate is not given in optimization options.")

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Returns the updates of the gradient statistics.

        :rtype: list of tuples
        :returns: for every network variable, a pair that stores the new
                  gradient, followed by a pair that updates the running mean
                  of the squared gradient
        """

        keep = self._gamma
        exprs = []
        names = self.network.get_variables()
        for name, new_grad in zip(names, self._gradient_exprs):
            stored_grad = self._params[name + "_gradient"]
            mean_sqr_grad = self._params[name + "_mean_sqr_gradient"]
            new_mean_sqr_grad = \
                keep * mean_sqr_grad + (1.0 - keep) * tensor.sqr(new_grad)
            exprs.extend([(stored_grad, new_grad),
                          (mean_sqr_grad, new_mean_sqr_grad)])
        return exprs

    def _model_update_exprs(self, alpha):
        """Returns the updates of the velocity statistics and the model
        parameters.

        :type alpha: scalar (presumably symbolic; not verifiable from here)
        :param alpha: learning rate

        :rtype: list of tuples
        :returns: for every network variable, the update of the running mean
                  of the squared velocity followed by the parameter update
        """

        steps = dict()
        for name in self.network.get_variables():
            stored_grad = self._params[name + "_gradient"]
            mean_sqr_grad = self._params[name + "_mean_sqr_gradient"]
            mean_sqr_step = self._params[name + "_mean_sqr_velocity"]
            # The RMS of the velocity lags one time step behind the RMS of
            # the gradient, because of the recurrence relationship for
            # velocity.
            rms_grad = tensor.sqrt(mean_sqr_grad + self._epsilon)
            rms_step = tensor.sqrt(mean_sqr_step + self._epsilon)
            steps[name] = -stored_grad * rms_step / rms_grad
        self._normalize(steps)

        keep = self._gamma
        exprs = []
        for name, variable in self.network.get_variables().items():
            step = steps[name]
            mean_sqr_step = self._params[name + "_mean_sqr_velocity"]
            new_mean_sqr_step = \
                keep * mean_sqr_step + (1.0 - keep) * tensor.sqr(step)
            exprs.append((mean_sqr_step, new_mean_sqr_step))
            exprs.append((variable, variable + alpha * step))
        return exprs
Example #8
0
class RMSPropSGDOptimizer(BasicOptimizer):
    """RMSProp Variation of Stochastic Gradient Descent Optimization Method

    No paper describes RMSProp; it is usually cited as slide 29 of Lecture 6
    of Geoff Hinton's Coursera class:
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    The optimizer maintains, for each parameter, a running average of the
    squared gradient. The gradient is divided by the root of this mean squared
    gradient (RMS) when the parameters are updated.
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an RMSProp SGD optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options;
                                     has to contain ``'gradient_decay_rate'``

        :type network: Network
        :param network: the neural network object

        :raises ValueError: if ``'gradient_decay_rate'`` is missing from the
                            options
        """

        optimizer_params = Parameters()
        self._params = optimizer_params
        for var_path, var in network.get_variables().items():
            optimizer_params.add(var_path + '_gradient',
                                 numpy.zeros_like(var.get_value()))
            # Ones instead of zeros: with zeros, the first update would be
            # divided by a value close to zero.
            optimizer_params.add(var_path + '_mean_sqr_gradient',
                                 numpy.ones_like(var.get_value()))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            raise ValueError("Gradient decay rate is not given in training "
                             "options.")
        self._gamma = optimization_options['gradient_decay_rate']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Returns the updates of the stored gradients and their mean squares.

        :rtype: list of tuples
        :returns: two pairs per network variable: the gradient update and the
                  update of the running mean of the squared gradient
        """

        pairs = []
        var_paths = self.network.get_variables()
        for var_path, fresh in zip(var_paths, self._gradient_exprs):
            stored = self._params[var_path + '_gradient']
            running = self._params[var_path + '_mean_sqr_gradient']
            decayed = self._gamma * running
            contribution = (1.0 - self._gamma) * tensor.sqr(fresh)
            pairs.append((stored, fresh))
            pairs.append((running, decayed + contribution))
        return pairs

    def _model_update_exprs(self, alpha):
        """Returns the updates that apply the scaled gradients to the model.

        :type alpha: scalar (presumably symbolic; not verifiable from here)
        :param alpha: learning rate

        :rtype: list of tuples
        :returns: one ``(variable, new value expression)`` pair per network
                  variable
        """

        directions = dict()
        for var_path in self.network.get_variables():
            stored = self._params[var_path + '_gradient']
            running = self._params[var_path + '_mean_sqr_gradient']
            # Epsilon goes inside the square root.
            root_mean_sqr = tensor.sqrt(running + self._epsilon)
            directions[var_path] = -stored / root_mean_sqr
        self._normalize(directions)

        pairs = []
        for var_path, var in self.network.get_variables().items():
            pairs.append((var, var + alpha * directions[var_path]))
        return pairs
Example #9
0
class AdaGradOptimizer(BasicOptimizer):
    """AdaGrad Optimization Method

    AdaGrad extends Stochastic Gradient Descent by adapting the step size of
    each component separately, depending on how frequently the component occurs
    in the gradients. At each update, the learning rate is divided by the root
    of the sum of squared gradients. (This simpler form of the algorithm uses
    the squared gradient to approximate the outer product of the gradient
    vector by itself.)

    J. Duchi, E. Hazan, Y. Singer (2011)
    Adaptive Subgradient Methods for Online Learning and Stochastic Optimization
    Journal of Machine Learning Research 12: 2121-2159

    Note: When using a learning rate decreasing schedule, perhaps a running
    average of the historical gradients would be better than a sum.
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an AdaGrad optimizer.

        Allocates, for every network variable, a zero-initialized gradient and
        a zero-initialized sum of squared gradients, before calling the
        superclass constructor.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options

        :type network: Network
        :param network: the neural network object
        """

        self._params = Parameters()
        for path, param in network.get_variables().items():
            for suffix in ('_gradient', '_sum_sqr_gradient'):
                self._params.add(path + suffix,
                                 numpy.zeros_like(param.get_value()))

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the update pairs for the stored gradient statistics.

        :rtype: list of tuples
        :returns: ``(parameter, new value expression)`` pairs; for each network
                  variable first the gradient, then the sum of squared
                  gradients
        """

        exprs = []
        variable_paths = self.network.get_variables()
        for path, new_grad in zip(variable_paths, self._gradient_exprs):
            grad_state = self._params[path + '_gradient']
            sum_sqr_state = self._params[path + '_sum_sqr_gradient']
            # The sum accumulates over all updates; it is never decayed.
            new_sum_sqr = sum_sqr_state + tensor.sqr(new_grad)
            exprs.extend([(grad_state, new_grad),
                          (sum_sqr_state, new_sum_sqr)])
        return exprs

    def _model_update_exprs(self, alpha):
        """Builds the update pairs for the network variables.

        The step of each variable is the negative stored gradient divided by
        the root of the stored sum of squared gradients (plus epsilon). The
        steps are passed to ``self._normalize()`` before they are scaled by
        ``alpha`` and added to the variables.

        :param alpha: factor that each step is multiplied by before it is added
                      to the variable

        :rtype: list of tuples
        :returns: ``(variable, new value expression)`` pairs, one per network
                  variable
        """

        steps = dict()
        for path, _ in self.network.get_variables().items():
            grad = self._params[path + '_gradient']
            sum_sqr = self._params[path + '_sum_sqr_gradient']
            root_sum_sqr = tensor.sqrt(sum_sqr + self._epsilon)
            steps[path] = -grad / root_sum_sqr
        self._normalize(steps)

        return [(variable, variable + alpha * steps[path])
                for path, variable in self.network.get_variables().items()]
Example #10
0
class AdamOptimizer(BasicOptimizer):
    """Adam Optimization Method

    Keeps exponentially decaying averages of the gradient (first moment) and
    the squared gradient (second moment) of every network variable, and
    corrects the learning rate for the bias caused by initializing the
    averages to zero.

    D. P. Kingma, J. Ba (2015)
    Adam: A Method for Stochastic Optimization
    The International Conference on Learning Representations (ICLR), San Diego
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an Adam optimizer.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options; has
                                     to contain ``gradient_decay_rate``,
                                     ``sqr_gradient_decay_rate``, and
                                     ``momentum``

        :type network: Network
        :param network: the neural network object

        :raises ValueError: if one of the required options is missing
        """

        self._params = Parameters()

        # The time step is stored in the floating point type used by Theano,
        # because it is used as an exponent in the bias correction.
        float_type = numpy.dtype(theano.config.floatX).type
        self._params.add('optimizer/timestep', float_type(0.0))

        for path, param in network.get_variables().items():
            self._params.add(path + '_gradient',
                             numpy.zeros_like(param.get_value()))
            self._params.add(path + '_mean_gradient',
                             numpy.zeros_like(param.get_value()))
            self._params.add(path + '_mean_sqr_gradient',
                             numpy.zeros_like(param.get_value()))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            raise ValueError("Gradient decay rate is not given in training "
                             "options.")
        self._gamma_m = optimization_options['gradient_decay_rate']

        # geometric rate for averaging squared gradients
        if 'sqr_gradient_decay_rate' not in optimization_options:
            raise ValueError("Squared gradient decay rate is not given in "
                             "optimization options.")
        self._gamma_ms = optimization_options['sqr_gradient_decay_rate']

        # momentum
        # NOTE(review): no method of this class reads self._momentum. The
        # option is still required so that configurations that were rejected
        # before are rejected now - confirm whether the superclass needs it.
        if 'momentum' not in optimization_options:
            raise ValueError("Momentum is not given in optimization options.")
        self._momentum = optimization_options['momentum']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the update pairs for the stored gradient statistics.

        :rtype: list of tuples
        :returns: ``(parameter, new value expression)`` pairs; for each network
                  variable the gradient, the mean gradient, and the mean
                  squared gradient, in this order
        """

        result = []
        for path, gradient_new in zip(self.network.get_variables(),
                                      self._gradient_exprs):
            gradient = self._params[path + '_gradient']
            m_gradient = self._params[path + '_mean_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            # The averages have to be computed from gradient_new. The stored
            # gradient parameter is only replaced by the first pair below, so
            # (assuming all the pairs are applied together as the updates of
            # one Theano function) reading it here would give the gradient of
            # the previous update, and the moments would lag one step behind
            # the time step used for bias correction.
            m_gradient_new = \
                self._gamma_m * m_gradient + \
                (1.0 - self._gamma_m) * gradient_new
            ms_gradient_new = \
                self._gamma_ms * ms_gradient + \
                (1.0 - self._gamma_ms) * tensor.sqr(gradient_new)
            result.append((gradient, gradient_new))
            result.append((m_gradient, m_gradient_new))
            result.append((ms_gradient, ms_gradient_new))
        return result

    def _model_update_exprs(self, alpha):
        """Builds the update pairs for the network variables and the time step.

        The step of each variable is the negative mean gradient divided by the
        root of the mean squared gradient (plus epsilon). The steps are passed
        to ``self._normalize()`` before they are scaled by the bias-corrected
        ``alpha`` and added to the variables.

        :param alpha: learning rate before bias correction

        :rtype: list of tuples
        :returns: ``(variable, new value expression)`` pairs, one per network
                  variable, followed by the pair that increments the time step
        """

        timestep = self._params['optimizer/timestep']
        timestep_new = timestep + 1.0
        # Bias correction of both moment estimates folded into the learning
        # rate. Plain arithmetic instead of augmented assignment, so that the
        # object passed by the caller can never be modified in place.
        alpha = alpha * tensor.sqrt(1.0 - (self._gamma_ms ** timestep_new))
        alpha = alpha / (1.0 - (self._gamma_m ** timestep_new))

        updates = dict()
        for path in self.network.get_variables():
            m_gradient = self._params[path + '_mean_gradient']
            ms_gradient = self._params[path + '_mean_sqr_gradient']
            # Epsilon is added outside the square root, as in the paper.
            rms_gradient = tensor.sqrt(ms_gradient) + self._epsilon
            updates[path] = -m_gradient / rms_gradient
        self._normalize(updates)

        result = []
        for path, param in self.network.get_variables().items():
            update = updates[path]
            result.append((param, param + alpha * update))
        result.append((timestep, timestep_new))
        return result
Example #11
0
class AdadeltaOptimizer(BasicOptimizer):
    """ADADELTA Optimization Method

    ADADELTA has been derived from AdaGrad. Where AdaGrad accumulates the sum
    of squared gradients over all time, scaling the learning rate smaller and
    smaller, ADADELTA uses an exponentially decaying average of the squared
    gradients.

    This implementation scales the parameter updates by the learning rate
    hyperparameter. The original paper does not include such scaling,
    corresponding to learning rate 1.

    M. D. Zeiler (2012)
    ADADELTA: An adaptive learning rate method
    http://arxiv.org/abs/1212.5701
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an Adadelta optimizer.

        Allocates, for every network variable, a zero-initialized gradient,
        mean squared gradient, and mean squared velocity, before reading the
        decay rate and calling the superclass constructor.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options; has
                                     to contain ``gradient_decay_rate``

        :type network: Network
        :param network: the neural network object
        """

        self._params = Parameters()
        state_suffixes = ('_gradient',
                          '_mean_sqr_gradient',
                          '_mean_sqr_velocity')
        for path, param in network.get_variables().items():
            for suffix in state_suffixes:
                self._params.add(path + suffix,
                                 numpy.zeros_like(param.get_value()))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            raise ValueError("Gradient decay rate is not given in optimization "
                             "options.")
        self._gamma = optimization_options['gradient_decay_rate']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the update pairs for the stored gradient statistics.

        :rtype: list of tuples
        :returns: ``(parameter, new value expression)`` pairs; for each network
                  variable first the gradient, then the mean squared gradient
        """

        exprs = []
        variable_paths = self.network.get_variables()
        for path, new_grad in zip(variable_paths, self._gradient_exprs):
            grad_state = self._params[path + '_gradient']
            mean_sqr_state = self._params[path + '_mean_sqr_gradient']
            new_mean_sqr = (self._gamma * mean_sqr_state +
                            (1.0 - self._gamma) * tensor.sqr(new_grad))
            exprs.extend([(grad_state, new_grad),
                          (mean_sqr_state, new_mean_sqr)])
        return exprs

    def _model_update_exprs(self, alpha):
        """Builds the update pairs for the velocity statistics and the network
        variables.

        The step of each variable is the negative stored gradient multiplied by
        the RMS velocity and divided by the RMS gradient. The steps are passed
        to ``self._normalize()``; after that they are used both to update the
        mean squared velocity and, scaled by ``alpha``, to update the variable.

        :param alpha: factor that each step is multiplied by before it is added
                      to the variable

        :rtype: list of tuples
        :returns: ``(parameter, new value expression)`` pairs; for each network
                  variable first the mean squared velocity, then the variable
        """

        steps = dict()
        for path, _ in self.network.get_variables().items():
            grad = self._params[path + '_gradient']
            mean_sqr_grad = self._params[path + '_mean_sqr_gradient']
            mean_sqr_vel = self._params[path + '_mean_sqr_velocity']
            # The RMS velocity is one time step behind the RMS gradient,
            # because the velocity is defined by a recurrence.
            rms_grad = tensor.sqrt(mean_sqr_grad + self._epsilon)
            rms_vel = tensor.sqrt(mean_sqr_vel + self._epsilon)
            steps[path] = -grad * rms_vel / rms_grad
        self._normalize(steps)

        exprs = []
        for path, variable in self.network.get_variables().items():
            step = steps[path]
            mean_sqr_vel = self._params[path + '_mean_sqr_velocity']
            new_mean_sqr_vel = (self._gamma * mean_sqr_vel +
                                (1.0 - self._gamma) * tensor.sqr(step))
            new_variable = variable + alpha * step
            exprs.extend([(mean_sqr_vel, new_mean_sqr_vel),
                          (variable, new_variable)])
        return exprs
Example #12
0
class RMSPropNesterovOptimizer(BasicOptimizer):
    """RMSProp Variation of Nesterov Momentum Optimization Method

    At the time of writing, RMSProp is an unpublished method. Usually people
    cite slide 29 of Lecture 6 of Geoff Hinton's Coursera class:
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    The idea is to maintain a running average of the squared gradient for each
    parameter, and to divide the gradient by the root of that mean squared
    gradient (RMS). This makes RMSProp take steps near 1 whenever the gradient
    is of constant magnitude, and larger steps whenever the local scale of the
    gradient starts to increase.

    RMSProp has been implemented over many optimization methods. This
    implementation is based on the Nesterov Momentum method. We use an
    alternative formulation that requires the gradient to be computed only at
    the current parameter values, described here:
    https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617
    except that we divide the gradient by the RMS gradient:

    rmsprop_{t-1} = -lr * gradient(params_{t-1}) / rms_gradient(params_{t-1})
    v_{t} = mu * v_{t-1} + rmsprop_{t-1}
    params_{t} = params_{t-1} + mu * v_{t} + rmsprop_{t-1}
    """
    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an RMSProp momentum optimizer.

        Allocates the optimizer state of every network variable, then reads
        the decay rate and the momentum, and finally calls the superclass
        constructor.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options; has
                                     to contain ``gradient_decay_rate`` and
                                     ``momentum``

        :type network: Network
        :param network: the neural network object
        """

        # The mean squared gradient starts from ones, otherwise the first
        # update would be divided by a value close to zero.
        initial_state = (('_gradient', numpy.zeros_like),
                         ('_mean_sqr_gradient', numpy.ones_like),
                         ('_velocity', numpy.zeros_like))

        self._params = Parameters()
        for path, param in network.get_variables().items():
            for suffix, init_like in initial_state:
                self._params.add(path + suffix, init_like(param.get_value()))

        # geometric rate for averaging gradients
        if 'gradient_decay_rate' not in optimization_options:
            raise ValueError("Gradient decay rate is not given in training "
                             "options.")
        self._gamma = optimization_options['gradient_decay_rate']

        # momentum
        if 'momentum' not in optimization_options:
            raise ValueError("Momentum is not given in optimization options.")
        self._momentum = optimization_options['momentum']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the update pairs for the stored gradient statistics.

        :rtype: list of tuples
        :returns: ``(parameter, new value expression)`` pairs; for each network
                  variable first the gradient, then the mean squared gradient
        """

        exprs = []
        variable_paths = self.network.get_variables()
        for path, new_grad in zip(variable_paths, self._gradient_exprs):
            grad_state = self._params[path + '_gradient']
            mean_sqr_state = self._params[path + '_mean_sqr_gradient']
            new_mean_sqr = (self._gamma * mean_sqr_state +
                            (1.0 - self._gamma) * tensor.sqr(new_grad))
            exprs.extend([(grad_state, new_grad),
                          (mean_sqr_state, new_mean_sqr)])
        return exprs

    def _model_update_exprs(self, alpha):
        """Builds the update pairs for the velocities and the network
        variables.

        The step of each variable is the negative stored gradient divided by
        the RMS gradient. The steps are passed to ``self._normalize()`` before
        they are scaled by ``alpha`` and combined with the velocity.

        :param alpha: factor that each step is multiplied by

        :rtype: list of tuples
        :returns: ``(parameter, new value expression)`` pairs; for each network
                  variable first the velocity, then the variable
        """

        steps = dict()
        for path, _ in self.network.get_variables().items():
            grad = self._params[path + '_gradient']
            mean_sqr = self._params[path + '_mean_sqr_gradient']
            root_mean_sqr = tensor.sqrt(mean_sqr + self._epsilon)
            steps[path] = -grad / root_mean_sqr
        self._normalize(steps)

        exprs = []
        for path, variable in self.network.get_variables().items():
            step = steps[path]
            velocity_state = self._params[path + '_velocity']
            new_velocity = self._momentum * velocity_state + alpha * step
            # The variable moves by the momentum term of the new velocity plus
            # the step itself.
            new_variable = \
                variable + self._momentum * new_velocity + alpha * step
            exprs.extend([(velocity_state, new_velocity),
                          (variable, new_variable)])
        return exprs
Example #13
0
class AdaGradOptimizer(BasicOptimizer):
    """AdaGrad Optimization Method

    AdaGrad is Stochastic Gradient Descent with a step size that is adapted for
    each component, based on how frequently the component occurs in the
    gradients. At each update, the learning rate is divided by the root of the
    sum of squared gradients. (In this simpler form of the algorithm, the
    squared gradient approximates the outer product of the gradient vector by
    itself.)

    J. Duchi, E. Hazan, Y. Singer (2011)
    Adaptive Subgradient Methods for Online Learning and Stochastic Optimization
    Journal of Machine Learning Research 12: 2121-2159

    Note: When using a learning rate decreasing schedule, perhaps a running
    average of the historical gradients would be better than a sum.
    """
    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates an AdaGrad optimizer.

        For every network variable, a gradient and a sum of squared gradients
        are allocated and initialized to zeros. After that the superclass
        constructor is called.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options

        :type network: Network
        :param network: the neural network object
        """

        state_suffixes = ('_gradient', '_sum_sqr_gradient')

        self._params = Parameters()
        for path, param in network.get_variables().items():
            for suffix in state_suffixes:
                initial_value = numpy.zeros_like(param.get_value())
                self._params.add(path + suffix, initial_value)

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the update pairs for the stored gradient statistics.

        :rtype: list of tuples
        :returns: ``(parameter, new value expression)`` pairs; for each network
                  variable first the gradient, then the sum of squared
                  gradients
        """

        pairs = []
        for path, fresh_gradient in zip(self.network.get_variables(),
                                        self._gradient_exprs):
            stored_gradient = self._params[path + '_gradient']
            stored_sum = self._params[path + '_sum_sqr_gradient']
            pairs += [
                (stored_gradient, fresh_gradient),
                # Squared gradients are summed without any decay.
                (stored_sum, stored_sum + tensor.sqr(fresh_gradient)),
            ]
        return pairs

    def _model_update_exprs(self, alpha):
        """Builds the update pairs for the network variables.

        Each step is the negative stored gradient divided by the root of the
        stored sum of squared gradients (plus epsilon). The steps are passed to
        ``self._normalize()`` before they are scaled by ``alpha`` and added to
        the variables.

        :param alpha: factor that each step is multiplied by before it is added
                      to the variable

        :rtype: list of tuples
        :returns: ``(variable, new value expression)`` pairs, one per network
                  variable
        """

        directions = dict()
        for path, _ in self.network.get_variables().items():
            stored_gradient = self._params[path + '_gradient']
            stored_sum = self._params[path + '_sum_sqr_gradient']
            denominator = tensor.sqrt(stored_sum + self._epsilon)
            directions[path] = -stored_gradient / denominator
        self._normalize(directions)

        pairs = []
        for path, variable in self.network.get_variables().items():
            pairs.append((variable, variable + alpha * directions[path]))
        return pairs
Example #14
0
class BasicLayer(object, metaclass=ABCMeta):
    """Superclass for Neural Network Layers

    Stores the attributes that are common to all layers and provides helpers
    for creating the layer parameters, optionally split across several
    devices.
    """

    def __init__(self, layer_options, network, profile=False):
        """Saves some attributes that are common to all layers.

        The output size is read from the ``size`` option if it is given,
        otherwise it is the sum of the output sizes of the input layers.

        :type layer_options: dict
        :param layer_options: dictionary of layer options; ``name``,
                              ``input_layers``, and ``devices`` are required

        :type network: Network
        :param network: the network object creating this layer

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.name = layer_options['name']
        self.input_layers = layer_options['input_layers']
        self.params = Parameters()
        self._devices = layer_options['devices']

        if 'size' in layer_options:
            self.output_size = int(layer_options['size'])
        else:
            self.output_size = sum(x.output_size for x in self.input_layers)

        logging.debug("- %s name=%s inputs=[%s] size=%d, devices=[%s]",
            self.__class__.__name__,
            self.name,
            ', '.join([x.name for x in self.input_layers]),
            self.output_size,
            ', '.join([str(x) for x in self._devices]))

        self._network = network
        self._profile = profile

    @abstractmethod
    def create_structure(self):
        """Creates the symbolic graph of this layer.

        Sets self.output to a symbolic matrix that describes the output of this
        layer.

        :raises NotImplementedError: always; subclasses have to override this
                                     method
        """

        # An assertion would be removed when Python is run with -O.
        raise NotImplementedError(
            "create_structure() has to be implemented by the subclass.")

    def _param_path(self, param_name, device=None):
        """Returns the HDF5 path used to address a parameter.

        :type param_name: str
        :param param_name: name of a parameter within this layer

        :type device: str
        :param device: ``None`` for parameters that reside on the default device
                       only; otherwise returns the path used to address the part
                       of the parameter that resides on the given device

        :rtype: str
        :returns: full path of the parameter in a HDF5 file.
        """

        result = 'layers/' + self.name + '/' + param_name
        if device is not None:
            result += '/' + device
        return result

    def _get_param(self, param_name, device=None):
        """Returns a Theano tensor variable by parameter name.

        :type param_name: str
        :param param_name: name of a parameter within the layer

        :type device: str
        :param device: ``None`` for parameters that reside on the default device
                       only; otherwise returns the part of the parameter that
                       resides on the given device

        :rtype: TensorVariable
        :returns: the corresponding tensor variable
        """

        return self.params[self._param_path(param_name, device)]

    def _init_weight(self, param_name, shape, scale=None, count=1,
                     split_to_devices=False):
        """Generates a weight matrix from “standard normal” distribution.

        If ``shape`` contains two dimensions that match, generates an orthogonal
        matrix. In that case scale is ignored. Orthogonal weights are useful for
        two reasons:

        1. Multiplying by an orthogonal weight preserves the norm of the
           input vector, which should help avoid exploding and vanishing
           gradients.
        2. The row and column vectors are orthonormal to one another, which
           should help avoid two vectors learning to produce the same features.

        If ``count`` is specified, creates a concatenation of several similar
        submatrices (same shape but different content).

        If ``split_to_devices`` is set to ``True``, splits the weight to equal
        parts on the last dimension, and creates one parameter for each device.
        If also ``count`` is specified, each device will have an equal part of
        every submatrix.

        :type param_name: str
        :param param_name: name for the parameter within the layer

        :type shape: list or tuple of ints
        :param shape: sizes of the weight dimensions; normally the first one is
                      the dimensionality of the input data and the second one is
                      the dimensionality of the output data

        :type scale: float
        :param scale: if other than ``None``, the matrix will be scaled by this
                      factor, unless an orthogonal matrix is created

        :type count: int
        :param count: concatenate this many weight matrices with the same shape

        :type split_to_devices: bool
        :param split_to_devices: if set to ``True``, creates on every device a
                                 parameter that contains one part of the weight
        """

        path = self._param_path(param_name)
        # The matrix is generated exactly once and used by both branches.
        weight = random_matrix(shape, scale, count)
        if split_to_devices and not (len(self._devices) == 1 and
                                     self._devices[0] is None):
            self._split_to_devices(path, weight, shape[-1])
        else:
            # Either splitting was not requested, or this layer has not been
            # assigned to a specific device.
            self.params.add(path, weight)

    def _init_bias(self, param_name, shape, value=None, split_to_devices=False):
        """Initializes a bias vector with given value.

        If ``value`` is not given, initializes the vector with zero value. If
        ``value`` is a list, creates a concatenation of as many vectors as there
        are elements in the list.

        If ``split_to_devices`` is set to ``True``, splits the array to equal
        parts on the last dimension, and creates one parameter for each device.
        If ``value`` is a list, each device will have an equal part of every
        submatrix.

        :type param_name: str
        :param param_name: name for the parameter within the layer

        :type shape: int or tuple of ints
        :param shape: size of the vector, or a tuple of the sizes of each
                      dimension (in case ``value`` is a list, each part will
                      have this size)

        :type value: float, numpy.ndarray or list
        :param value: the value or array to initialize the elements to, or a
                      list of values or arrays to create a concatenation of
                      vectors

        :type split_to_devices: bool
        :param split_to_devices: if set to ``True``, creates on every device a
                                 parameter that contains one part of the array
        """

        path = self._param_path(param_name)
        # The array is created exactly once and used by both branches.
        bias = matrix_from_value(shape, value)
        if split_to_devices and not (len(self._devices) == 1 and
                                     self._devices[0] is None):
            # ``shape`` may be a plain integer, which cannot be indexed.
            if isinstance(shape, (list, tuple)):
                part_size = shape[-1]
            else:
                part_size = shape
            self._split_to_devices(path, bias, part_size)
        else:
            # Either splitting was not requested, or this layer has not been
            # assigned to a specific device.
            self.params.add(path, bias)

    def _split_to_devices(self, path, value, part_size):
        """Splits a matrix to equal parts on the last dimension, and creates a
        parameter on each device.

        If the matrix consists of submatrices, each device will have an equal
        part of every submatrix, whose size is specified by ``part_size``.

        :type path: str
        :param path: base path for the parameters that will be suffixed by the
                     device string

        :type value: numpy.ndarray
        :param value: a matrix that will be split to give the initial value of
                      the parameters

        :type part_size: int
        :param part_size: size of the last dimension of ``value``, or if
                          ``value`` consists of multiple submatrices, size of
                          one submatrix

        :raises ValueError: if the last dimension of ``value`` is not a
                            multiple of ``part_size``
        """

        part_count = value.shape[-1] // part_size
        if part_count * part_size != value.shape[-1]:
            raise ValueError("Last dimension is not a multiple of part size.")

        split_sizes = self._size_per_device(part_size)
        split_start = 0
        for device, split_size in zip(self._devices, split_sizes):
            assert device is not None
            split_end = split_start + split_size
            # Collect the indices [split_start, split_end) relative to the
            # beginning of every submatrix.
            ranges = []
            for part_index in range(part_count):
                part_start = part_index * part_size
                ranges.extend(range(part_start + split_start,
                                    part_start + split_end))
            split_start = split_end
            self.params.add(path + '/' + device, value[..., ranges], device)

    def _size_per_device(self, total_size):
        """Returns ``total_size`` divided for each device.

        The sizes differ by at most one; when ``total_size`` is not divisible
        by the number of devices, the first devices get one element more.

        :type total_size: int
        :param total_size: total size of a parameter

        :rtype: list of ints
        :returns: ``total_size`` divided into as many parts as there are devices
                  assigned to this layer

        :raises RuntimeError: if no devices are assigned to this layer
        :raises ValueError: if ``total_size`` is smaller than the number of
                            devices
        """

        num_devices = len(self._devices)
        if num_devices < 1:
            raise RuntimeError("No devices assigned to this layer.")
        if total_size < num_devices:
            raise ValueError("Cannot split matrix of size {} to {} devices."
                             .format(total_size, num_devices))

        quotient, remainder = divmod(total_size, num_devices)
        # The first ``remainder`` devices take one extra element each.
        return [quotient + 1 if index < remainder else quotient
                for index in range(num_devices)]

    def _tensor_preact(self, input_matrix, param_name):
        """Helper function that creates a pre-activation of ``input_matrix`` by
        multiplying it by a weight matrix and adding a bias.

        ``input_matrix`` and the result normally have the shape of a mini-batch:
        the first dimension is the time step and the second dimension is the
        sequence. The last dimension is always the data vector. The size of the
        input data vector should equal to the first dimension of the weight
        vector, and the second dimension of the weight vector defines the size
        of the output data vector.

        :type input_matrix: TensorVariable
        :param input_matrix: the preactivations will be computed by multiplying
                             the data vectors (the last dimension of this
                             matrix) by the weight matrix, and adding bias

        :type param_name: str
        :param param_name: name of a parameter group that contains a weight
                           matrix (``W``) and a bias vector (``b``)

        :rtype: TensorVariable
        :returns: a matrix that has the same number of dimensions as
                  ``input_matrix``, but the data vectors (the last dimension of
                  this matrix) are the preactivations
        """

        group_path = self._param_path(param_name)
        weight = self.params[group_path + '/W']
        bias = self.params[group_path + '/b']
        return tensor.dot(input_matrix, weight) + bias
Example #15
0
class NesterovOptimizer(BasicOptimizer):
    """Nesterov Momentum Optimization Method

    Nesterov momentum is normally implemented by first taking a step towards
    the previous update direction, calculating the gradient at that position,
    using the gradient to obtain the new update direction, and finally
    updating the parameters. We use an alternative formulation that requires
    the gradient to be computed only at the current parameter values,
    described here:
    https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617

    v_{t} = mu * v_{t-1} - lr * gradient(params_{t-1})
    params_{t} = params_{t-1} + mu * v_{t} - lr * gradient(params_{t-1})
    """

    def __init__(self, optimization_options, network, *args, **kwargs):
        """Creates a Nesterov momentum optimizer.

        Allocates, for every network variable, a zero-initialized gradient and
        a zero-initialized velocity, then reads the momentum and calls the
        superclass constructor.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options; has
                                     to contain ``momentum``

        :type network: Network
        :param network: the neural network object
        """

        self._params = Parameters()
        for path, param in network.get_variables().items():
            for suffix in ('_gradient', '_velocity'):
                self._params.add(path + suffix,
                                 numpy.zeros_like(param.get_value()))

        # momentum
        if 'momentum' not in optimization_options:
            raise ValueError("Momentum is not given in optimization options.")
        self._momentum = optimization_options['momentum']

        super().__init__(optimization_options, network, *args, **kwargs)

    def _gradient_update_exprs(self):
        """Builds the update pairs for the stored gradients.

        :rtype: list of tuples
        :returns: ``(parameter, new value expression)`` pairs, one per network
                  variable
        """

        return [(self._params[path + '_gradient'], new_grad)
                for path, new_grad in zip(self.network.get_variables(),
                                          self._gradient_exprs)]

    def _model_update_exprs(self, alpha):
        """Builds the update pairs for the velocities and the network
        variables.

        The step of each variable is the negative stored gradient. The steps
        are passed to ``self._normalize()`` before they are scaled by ``alpha``
        and combined with the velocity.

        :param alpha: factor that each step is multiplied by

        :rtype: list of tuples
        :returns: ``(parameter, new value expression)`` pairs; for each network
                  variable first the velocity, then the variable
        """

        steps = dict()
        for path, _ in self.network.get_variables().items():
            grad = self._params[path + '_gradient']
            steps[path] = -grad
        self._normalize(steps)

        exprs = []
        for path, variable in self.network.get_variables().items():
            step = steps[path]
            velocity_state = self._params[path + '_velocity']
            new_velocity = self._momentum * velocity_state + alpha * step
            # The variable moves by the momentum term of the new velocity plus
            # the step itself.
            new_variable = \
                variable + self._momentum * new_velocity + alpha * step
            exprs.extend([(velocity_state, new_velocity),
                          (variable, new_variable)])
        return exprs