Example #1
import logging
from abc import ABCMeta, abstractmethod

from theano import tensor

# Project-local helpers; the exact import paths are assumptions based on the
# names used below and may differ in the surrounding package.
from theanolm.backend import Parameters
from theanolm.network.weightfunctions import random_matrix, matrix_from_value


class BasicLayer(object, metaclass=ABCMeta):
    """Superclass for Neural Network Layers
    """
    def __init__(self, layer_options, network, profile=False):
        """Saves some attributes that are common to all layers.

        :type layer_options: dict
        :param layer_options: dictionary of layer options

        :type network: Network
        :param network: the network object creating this layer

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.name = layer_options['name']
        self._input_layers = layer_options['input_layers']
        self._params = Parameters()
        self._devices = layer_options['devices']

        if 'size' in layer_options:
            self.output_size = int(layer_options['size'])
        else:
            self.output_size = \
                sum([x.output_size for x in self._input_layers])

        # Convolutional layers may produce two-dimensional output. In that case,
        # the state matrix is four-dimensional and the size of the last
        # dimension is self.output_depth.
        if 'depth' in layer_options:
            self.output_depth = int(layer_options['depth'])
        else:
            self.output_depth = 1

        if 'reverse_time' in layer_options:
            self._reverse_time = bool(layer_options['reverse_time'])
        else:
            self._reverse_time = False

        logging.debug(
            "- %s name=%s inputs=[%s] size=%d depth=%d%s devices=[%s]",
            self.__class__.__name__, self.name,
            ', '.join([x.name for x in self._input_layers]), self.output_size,
            self.output_depth, ' reverse,' if self._reverse_time else '',
            ', '.join([str(x) for x in self._devices]))

        self._network = network
        self._profile = profile
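
        # For illustration, a minimal layer_options dict might look like this
        # (layer names and sizes hypothetical):
        #   {'name': 'hidden1', 'input_layers': [projection_layer],
        #    'devices': [None], 'size': 300, 'depth': 1}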

    @abstractmethod
    def create_structure(self):
        """Creates the symbolic graph of this layer.

        Sets self.output to a symbolic matrix that describes the output of this
        layer.
        """

        raise NotImplementedError

    def get_state(self, state):
        """Pulls parameter values from Theano shared variables.

        If a parameter already exists in the state, it will be replaced, so
        it has to have the same number of elements.

        :type state: h5py.File
        :param state: HDF5 file for storing the neural network parameters
        """

        self._params.get_state(state)

    def set_state(self, state):
        """Sets the values of Theano shared variables.

        :type state: h5py.File
        :param state: HDF5 file that contains the neural network parameters
        """

        self._params.set_state(state)

    def num_params(self):
        """Returns the number of parameters in this layer.

        This method is used just for reporting the number of parameters in the
        model. Normally there is just one set of parameters.

        :rtype: int
        :returns: the number of parameters used by the layer
        """

        return self._params.total_size

    def get_variables(self):
        """Returns a dictionary of the shared variables.

        This function is used by the optimizers to create optimization
        parameters that are specific to network parameters, and to compute
        gradients with respect to the parameters. Normally there is just one
        set of parameters.

        :rtype: dict
        :returns: mapping from parameter path to Theano shared variables
        """

        return self._params.get_variables()

    def _param_path(self, param_name, device=None):
        """Returns the HDF5 path used to address a parameter.

        :type param_name: str
        :param param_name: name of a parameter within this layer

        :type device: str
        :param device: ``None`` for parameters that reside on the default device
                       only; otherwise returns the path used to address the part
                       of the parameter that resides on the given device

        :rtype: str
        :returns: full path of the parameter in an HDF5 file.
        """

        result = 'layers/' + self.name + '/' + param_name
        if device is not None:
            result += '/' + device
        return result
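
    # For example, _param_path('W') in a layer named "hidden1" yields
    # 'layers/hidden1/W', and _param_path('W', 'gpu0') yields
    # 'layers/hidden1/W/gpu0' (layer and device names illustrative).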

    def _get_param(self, param_name, device=None):
        """Returns a Theano tensor variable by parameter name.

        :type param_name: str
        :param param_name: name of a parameter within the layer

        :type device: str
        :param device: ``None`` for parameters that reside on the default device
                       only; otherwise returns the part of the parameter that
                       resides on the given device

        :rtype: TensorVariable
        :returns: the corresponding tensor variable
        """

        return self._params[self._param_path(param_name, device)]

    def _init_weight(self,
                     param_name,
                     shape,
                     scale=None,
                     count=1,
                     split_to_devices=False):
        """Generates a weight matrix from “standard normal” distribution.

        If ``shape`` contains two dimensions that match, generates an orthogonal
        matrix. In that case scale is ignored. Orthogonal weights are useful for
        two reasons:

        1. Multiplying by an orthogonal weight preserves the norm of the
           input vector, which should help avoid exploding and vanishing
           gradients.
        2. The row and column vectors are orthonormal to one another, which
           should help avoid two vectors learning to produce the same features.

        If ``count`` is specified, creates a concatenation of several similar
        submatrices (same shape but different content).

        If ``split_to_devices`` is set to ``True``, splits the weight to equal
        parts on the last dimension, and creates one parameter for each device.
        If also ``count`` is specified, each device will have an equal part of
        every submatrix.

        :type param_name: str
        :param param_name: name for the parameter within the layer

        :type shape: list or tuple of ints
        :param shape: sizes of the weight dimensions; normally the first one is
                      the dimensionality of the input data and the second one is
                      the dimensionality of the output data

        :type scale: float
        :param scale: if other than ``None``, the matrix will be scaled by this
                      factor, unless an orthogonal matrix is created

        :type count: int
        :param count: concatenate this many weight matrices with the same shape

        :type split_to_devices: bool
        :param split_to_devices: if set to ``True``, creates on every device a
                                 parameter that contains one part of the weight
        """

        path = self._param_path(param_name)
        weight = random_matrix(shape, scale, count)
        if not split_to_devices:
            self._params.add(path, weight)
        elif (len(self._devices) == 1) and (self._devices[0] is None):
            # This layer has not been assigned to a specific device.
            self._params.add(path, weight)
        else:
            self._split_to_devices(path, weight, shape[-1])
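
    # When two dimensions of ``shape`` match, the docstring above says an
    # orthogonal matrix is generated. One common way to obtain one (a sketch,
    # not necessarily what random_matrix() does) is via SVD of a random
    # Gaussian matrix:
    #   u, _, _ = numpy.linalg.svd(numpy.random.randn(n, n))
    # u is orthogonal, so numpy.dot(u, u.T) is (numerically) the identity.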

    def _init_bias(self,
                   param_name,
                   shape,
                   value=None,
                   split_to_devices=False):
        """Initializes a bias vector with given value.

        If ``value`` is not given, initializes the vector with zero value. If
        ``value`` is a list, creates a concatenation of as many vectors as there
        are elements in the list.

        If ``split_to_devices`` is set to ``True``, splits the array to equal
        parts on the last dimension, and creates one parameter for each device.
        If ``value`` is a list, each device will have an equal part of every
        submatrix.

        :type param_name: str
        :param param_name: name for the parameter within the layer

        :type shape: int or tuple of ints
        :param shape: size of the vector, or a tuple of the sizes of each
                      dimension (in case ``value`` is a list, each part will
                      have this size)

        :type value: float, numpy.ndarray or list
        :param value: the value or array to initialize the elements to, or a
                      list of values or arrays to create a concatenation of
                      vectors

        :type split_to_devices: bool
        :param split_to_devices: if set to ``True``, creates on every device a
                                 parameter that contains one part of the array
        """

        path = self._param_path(param_name)
        bias = matrix_from_value(shape, value)
        if not split_to_devices:
            self._params.add(path, bias)
        elif (len(self._devices) == 1) and (self._devices[0] is None):
            # This layer has not been assigned to a specific device.
            self._params.add(path, bias)
        else:
            self._split_to_devices(path, bias, shape[-1])
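
    # Per the docstring above: _init_bias('b', 500, value=[0.0, 1.0]) would
    # create a vector of length 1000, the first 500 elements initialized to
    # 0.0 and the next 500 to 1.0 (sizes illustrative).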

    def _split_to_devices(self, path, value, part_size):
        """Splits a matrix to equal parts on the last dimension, and creates a
        parameter on each device.

        If the matrix consists of submatrices, each device will have an equal
        part of every submatrix; ``part_size`` gives the size of one
        submatrix.

        :type path: str
        :param path: base path for the parameters that will be prefixed by the
                     device string

        :type value: numpy.ndarray
        :param value: a matrix that will be split to give the initial value of
                      the parameters

        :type part_size: int
        :param part_size: size of the last dimension of ``value``, or if
                          ``value`` consists of multiple submatrices, size of
                          one submatrix
        """

        part_count = value.shape[-1] // part_size
        if part_count * part_size != value.shape[-1]:
            raise ValueError("Last dimension is not a multiple of part size.")

        split_sizes = self._size_per_device(part_size)
        split_start = 0
        for device, split_size in zip(self._devices, split_sizes):
            assert device is not None
            split_end = split_start + split_size
            ranges = []
            for part_index in range(part_count):
                part_start = part_index * part_size
                ranges.extend(
                    range(part_start + split_start, part_start + split_end))
            split_start = split_end
            self._params.add(path + '/' + device, value[..., ranges], device)
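
    # Worked example (hypothetical numbers): a value whose last dimension is
    # 8 and that contains two submatrices (part_size=4), split across two
    # devices with split_sizes [2, 2], selects columns [0, 1, 4, 5] for the
    # first device and [2, 3, 6, 7] for the second -- each device receives
    # the same slice of every submatrix.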

    def _size_per_device(self, total_size):
        """Returns ``total_size`` divided for each device.

        :type total_size: int
        :param total_size: total size of a parameter

        :rtype: list of ints
        :returns: ``total_size`` divided into as many parts as there are devices
                  assigned to this layer
        """

        num_devices = len(self._devices)
        if num_devices < 1:
            raise RuntimeError("No devices assigned to this layer.")
        if total_size < num_devices:
            raise ValueError(
                "Cannot split matrix of size {} to {} devices.".format(
                    total_size, num_devices))

        result = []
        quotient, remainder = divmod(total_size, num_devices)
        start_index = 0
        for i in range(1, num_devices + 1):
            end_index = i * quotient + min(i, remainder)
            result.append(end_index - start_index)
            start_index = end_index

        assert len(result) == num_devices
        assert sum(result) == total_size
        assert end_index == total_size

        return result
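
    # For example, total_size=10 over 3 devices: divmod(10, 3) == (3, 1), so
    # the parts are [4, 3, 3] -- the remainder is spread over the first
    # devices.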

    def _tensor_preact(self, input_matrix, param_name):
        """Helper function that creates a pre-activation of ``input_matrix`` by
        multiplying it by a weight matrix and adding a bias.

        ``input_matrix`` and the result normally have the shape of a mini-batch:
        the first dimension is the time step and the second dimension is the
        sequence. The last dimension is always the data vector. The size of the
        input data vector should equal the first dimension of the weight
        matrix, and the second dimension of the weight matrix defines the
        size of the output data vector.

        :type input_matrix: TensorVariable
        :param input_matrix: the preactivations will be computed by multiplying
                             the data vectors (the last dimension of this
                             matrix) by the weight matrix, and adding bias

        :type param_name: str
        :param param_name: name of a parameter group that contains a weight
                           matrix and a bias vector

        :rtype: TensorVariable
        :returns: a matrix that has the same number of dimensions as
                  ``input_matrix``, but the data vectors (the last dimension of
                  this matrix) are the preactivations
        """

        weight = self._params[self._param_path(param_name) + '/W']
        bias = self._params[self._param_path(param_name) + '/b']
        return tensor.dot(input_matrix, weight) + bias
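

# A minimal sketch of what _tensor_preact computes, with NumPy arrays standing
# in for Theano tensors (all shapes and sizes below are illustrative):
if __name__ == '__main__':
    import numpy

    # A mini-batch: 20 time steps, 16 sequences, input vectors of size 100.
    input_matrix = numpy.random.randn(20, 16, 100)
    weight = numpy.random.randn(100, 300)  # input size x output size
    bias = numpy.zeros(300)

    # numpy.dot contracts the last dimension of input_matrix with the first
    # dimension of weight, so the (time, sequence) dimensions are preserved
    # and the data vectors become the preactivations.
    preact = numpy.dot(input_matrix, weight) + bias
    assert preact.shape == (20, 16, 300)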