Example #1
    def add_recurrent_state(self, size):
        """Adds a recurrent state variable and returns its index.

        Used by recurrent layers to add a state variable that has to be passed
        from one time step to the next, when generating text or computing
        lattice probabilities.

        :type size: int
        :param size: size of the state vector

        :rtype: int
        :returns: index of the new recurrent state variable
        """

        index = len(self.recurrent_state_size)
        assert index == len(self.recurrent_state_input)

        # The variables are in the structure of a mini-batch (3-dimensional
        # array) to keep the layer functions general.
        variable = tensor.tensor3('network/recurrent_state_' + str(index),
                                  dtype=theano.config.floatX)
        variable.tag.test_value = test_value(size=(1, 16, size), max_value=1.0)

        self.recurrent_state_size.append(size)
        self.recurrent_state_input.append(variable)

        return index
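
The examples rely on a ``test_value`` helper that is not shown. With Theano's ``compute_test_value`` mode enabled, every symbolic variable needs a concrete dummy value so the graph can be evaluated eagerly for debugging. Below is a minimal sketch of what such a helper might look like; it is a hypothetical reconstruction, not the project's actual implementation, and note that some examples call it with a ``max_value`` keyword and others with ``high``, depending on the code version.

    import numpy
    import theano

    def test_value(size, high):
        """Guessed reimplementation of the helper used in these examples:
        creates a dummy array for Theano's compute_test_value debugging mode.
        A boolean bound gives a random int8 mask, an integer bound gives
        random IDs below the bound, and a float bound gives uniform values."""
        if isinstance(high, bool):
            return numpy.random.randint(0, 2, size=size).astype('int8')
        if isinstance(high, (int, numpy.integer)):
            return numpy.random.randint(0, high, size=size).astype('int64')
        floats = numpy.random.uniform(0.0, high, size=size)
        return floats.astype(theano.config.floatX)
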
Example #2
    def add_recurrent_state(self, size):
        """Adds a recurrent state variable and returns its index.

        Used by recurrent layers to add a state variable that has to be passed
        from one time step to the next, when generating text or computing
        lattice probabilities.

        :type size: int
        :param size: size of the state vector

        :rtype: int
        :returns: index of the new recurrent state variable
        """

        index = len(self.recurrent_state_size)
        assert index == len(self.recurrent_state_input)

        # The variables are in the structure of a mini-batch (3-dimensional
        # array) to keep the layer functions general.
        variable = tensor.tensor3('network/recurrent_state_' + str(index),
                                  dtype=theano.config.floatX)
        variable.tag.test_value = test_value(size=(1, 16, size), high=1.0)

        self.recurrent_state_size.append(size)
        self.recurrent_state_input.append(variable)

        return index
Example #3
    def create_structure(self):
        """Creates the symbolic matrix that describes the network input.

        The tensor variable will be set to a matrix of word IDs, with
        [ number of time steps * number of sequences ] elements. When generating
        text, the matrix will contain only one element.
        """

        self.output = tensor.matrix('network/input', dtype='int64')
        self.output.tag.test_value = test_value(
            size=(100, 16),
            max_value=self.output_size)
Example #4
    def create_batch_structure(self):
        """Creates the network structure for mini-batch processing.

        Creates the symbolic matrix self.output, which describes the output
        probability of the next input word at each time step and sequence. The
        shape will be the same as that of self.input, except that it will
        contain one less time step.
        """

        # mask is used to mask out the rest of the input matrix, when a sequence
        # is shorter than the maximum sequence length. The mask is kept as int8
        # data type, which is how Theano stores booleans.
        self.mask = tensor.matrix('network/mask', dtype='int8')
        self.mask.tag.test_value = test_value(
            size=(100, 16),
            max_value=True)

        # Dropout layer needs to know whether we are training or evaluating.
        self.is_training = tensor.scalar('network/is_training', dtype='int8')
        self.is_training.tag.test_value = 1

        for layer in self.layers.values():
            layer.create_structure()

        self.input = self.network_input.output
        self.output = self.output_layer.output

        # The input at the next time step is what the output (predicted word)
        # should be.
        word_ids = self.input[1:].flatten()
        output_probs = self.output[:-1].flatten()

        # An index to a flattened input matrix times the vocabulary size can be
        # used to index the same location in the output matrix. The word ID is
        # added to index the probability of that word.
        target_indices = \
            tensor.arange(word_ids.shape[0]) * self.dictionary.num_classes() \
            + word_ids
        target_probs = output_probs[target_indices]

        # Reshape to a matrix. Now we have one less time step.
        num_time_steps = self.input.shape[0] - 1
        num_sequences = self.input.shape[1]
        self.prediction_probs = target_probs.reshape(
            [num_time_steps, num_sequences])
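
The index arithmetic above maps each position in the flattened (time step, sequence) grid to the probability of the correct word in the flattened output matrix. A small NumPy illustration of the same trick, with made-up sizes:

    import numpy

    num_outputs = 6   # hypothetical: (time steps - 1) * number of sequences
    num_classes = 5   # hypothetical vocabulary size
    output_probs = numpy.arange(num_outputs * num_classes, dtype=float)
    word_ids = numpy.array([1, 4, 0, 2, 3, 1])  # correct class at each position

    # Position i, class word_ids[i] lives at index i * num_classes + word_ids[i]
    # in the flattened probability matrix.
    target_indices = numpy.arange(num_outputs) * num_classes + word_ids
    target_probs = output_probs[target_indices]
    assert (target_probs ==
            output_probs.reshape(num_outputs, num_classes)
                        [numpy.arange(num_outputs), word_ids]).all()
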
Example #5
    def add_recurrent_state(self, size):
        """Adds a recurrent state variable and returns its index.

        Used by recurrent layers to add a state variable that has to be passed
        from one time step to the next, when generating text using one-step
        processing.
        """

        index = len(self.recurrent_state_size)
        assert index == len(self.recurrent_state_input)

        # The variables are in the structure of a mini-batch (3-dimensional
        # array) to keep the layer functions general.
        variable = tensor.tensor3('network/recurrent_state_' + str(index),
                                  dtype=theano.config.floatX)
        variable.tag.test_value = test_value(size=(1, 1, size), max_value=1.0)

        self.recurrent_state_size.append(size)
        self.recurrent_state_input.append(variable)

        return index
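
The state variables registered here are fed back into the network between consecutive one-step calls. A hedged sketch of how a caller (such as a text sampler) might allocate the initial, all-zero states from the registered sizes; ``network`` is assumed to be an already constructed Network instance:

    import numpy
    import theano

    # network: an assumed, already constructed Network whose recurrent layers
    # have registered their state sizes via add_recurrent_state().
    num_sequences = 16  # hypothetical number of sequences processed in parallel
    initial_states = [
        numpy.zeros((1, num_sequences, size), dtype=theano.config.floatX)
        for size in network.recurrent_state_size]
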
Example #6
    def __init__(self, vocabulary, architecture,
                 predict_next_distribution=False, profile=False):
        """Initializes the neural network parameters for all layers, and
        creates Theano shared variables from them.

        :type vocabulary: Vocabulary
        :param vocabulary: mapping between word IDs and word classes

        :type architecture: Architecture
        :param architecture: an object that describes the network architecture

        :type predict_next_distribution: bool
        :param predict_next_distribution: if set to True, creates a network that
            produces the probability distribution for the next word (instead of
            target probabilities for a mini-batch)

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.vocabulary = vocabulary
        self.architecture = architecture
        self.predict_next_distribution = predict_next_distribution

        M1 = 2147483647
        M2 = 2147462579
        random_seed = [
            numpy.random.randint(0, M1),
            numpy.random.randint(0, M1),
            numpy.random.randint(1, M1),
            numpy.random.randint(0, M2),
            numpy.random.randint(0, M2),
            numpy.random.randint(1, M2)]
        self.random = RandomStreams(random_seed)

        # Word and class inputs will be available to NetworkInput layers.
        self.word_input = tensor.matrix('network/word_input', dtype='int64')
        self.word_input.tag.test_value = test_value(
            size=(100, 16),
            max_value=vocabulary.num_words())
        self.class_input = tensor.matrix('network/class_input', dtype='int64')
        self.class_input.tag.test_value = test_value(
            size=(100, 16),
            max_value=vocabulary.num_classes())

        # Recurrent layers will create these lists, used to initialize state
        # variables of appropriate sizes, for doing forward passes one step at a
        # time.
        self.recurrent_state_input = []
        self.recurrent_state_size = []

        # Create the layers.
        logging.debug("Creating layers.")
        self.layers = OrderedDict()
        for input_options in architecture.inputs:
            input = NetworkInput(input_options, self)
            self.layers[input.name] = input
        for layer_description in architecture.layers:
            layer_options = self._layer_options_from_description(
                layer_description)
            if layer_options['name'] == architecture.output_layer:
                layer_options['size'] = vocabulary.num_classes()
            layer = create_layer(layer_options, self, profile=profile)
            self.layers[layer.name] = layer
        self.output_layer = self.layers[architecture.output_layer]

        # This list will be filled by the recurrent layers to contain the
        # recurrent state outputs, for doing forward passes one step at a time.
        self.recurrent_state_output = [None] * len(self.recurrent_state_size)

        # Create initial parameter values.
        logging.debug("Initializing parameters.")
        self.param_init_values = OrderedDict()
        num_params = 0
        for layer in self.layers.values():
            for name, value in layer.param_init_values.items():
                logging.debug("- %s size=%d", name, value.size)
                num_params += value.size
            self.param_init_values.update(layer.param_init_values)
        logging.debug("Total number of parameters: %d", num_params)

        # Create Theano shared variables.
        self.params = {name: theano.shared(value, name)
                       for name, value in self.param_init_values.items()}
        for layer in self.layers.values():
            layer.set_params(self.params)

        # mask is used to mask out the rest of the input matrix, when a sequence
        # is shorter than the maximum sequence length. The mask is kept as int8
        # data type, which is how Theano stores booleans. When the network is
        # used to predict the probability distribution of the next word, the
        # matrix contains only one word ID.
        if self.predict_next_distribution:
            self.mask = tensor.alloc(numpy.int8(1), 1, 1)
        else:
            self.mask = tensor.matrix('network/mask', dtype='int8')
            self.mask.tag.test_value = test_value(
                size=(100, 16),
                max_value=True)

        # Dropout layer needs to know whether we are training or evaluating.
        self.is_training = tensor.scalar('network/is_training', dtype='int8')
        self.is_training.tag.test_value = 1

        for layer in self.layers.values():
            layer.create_structure()
Example #7
    def __init__(self,
                 network,
                 use_shortlist=True,
                 exclude_unk=False,
                 profile=False):
        """Creates two Theano function, ``self._target_logprobs_function()``,
        which computes the log probabilities predicted by the neural network for
        the words in a mini-batch, and ``self._total_logprob_function()``, which
        returns the total log probability.

        Both functions take as arguments four matrices:

        1. Word IDs in the shape of a mini-batch. The functions will only use
           the input words (not the last time step).
        2. Class IDs in the shape of a mini-batch. The functions will slice this
           into input and output.
        3. Class membership probabilities in the shape of a mini-batch, but only
           for the output words (not the first time step).
        4. Mask in the shape of a mini-batch, but only for the output words (not
           for the first time step).

        ``self._target_logprobs_function()`` will return a matrix of predicted
        log probabilities for the output words (excluding the first time step)
        and the mask. ``<unk>`` tokens are also masked out if ``exclude_unk`` is
        set to ``True``. ``self._total_logprob_function()`` will return the
        total log probability of the predicted (unmasked) words and the number
        of those words.

        :type network: Network
        :param network: the neural network object

        :type use_shortlist: bool
        :param use_shortlist: if ``True``, the ``<unk>`` probability is
                              distributed among the out-of-shortlist words

        :type exclude_unk: bool
        :param exclude_unk: if set to ``True``, ``<unk>`` tokens are excluded
                            from probability computation

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self._vocabulary = network.vocabulary
        self._unk_id = self._vocabulary.word_to_id['<unk>']

        # The functions take as input a mini-batch of word IDs and class IDs,
        # and slice input and target IDs for the network.
        batch_word_ids = tensor.matrix('textscorer/batch_word_ids',
                                       dtype='int64')
        batch_word_ids.tag.test_value = test_value(
            size=(21, 4), high=self._vocabulary.num_words())
        batch_class_ids = tensor.matrix('textscorer/batch_class_ids',
                                        dtype='int64')
        batch_class_ids.tag.test_value = test_value(
            size=(21, 4), high=self._vocabulary.num_classes())
        membership_probs = tensor.matrix('textscorer/membership_probs',
                                         dtype=theano.config.floatX)
        membership_probs.tag.test_value = test_value(size=(20, 4), high=1.0)

        # Convert out-of-shortlist words to <unk> in input.
        shortlist_size = self._vocabulary.num_shortlist_words()
        input_word_ids = batch_word_ids[:-1]
        oos_indices = tensor.ge(input_word_ids, shortlist_size).nonzero()
        input_word_ids = tensor.set_subtensor(input_word_ids[oos_indices],
                                              self._unk_id)
        # Out-of-shortlist words are already in the <unk> class, because they
        # don't have classes of their own.
        input_class_ids = batch_class_ids[:-1]
        target_class_ids = batch_class_ids[1:]
        # Target word IDs are not used by the network, but we also need the
        # actual out-of-shortlist word IDs here.
        target_word_ids = batch_word_ids[1:]

        logprobs = tensor.log(network.target_probs())
        # Add logprobs from the class membership of the predicted word.
        logprobs += tensor.log(membership_probs)

        mask = network.mask
        if use_shortlist and network.oos_logprobs is not None:
            # The probability of out-of-shortlist words (which is the <unk>
            # probability) is multiplied by the fraction of the actual word
            # within the set of OOS words.
            logprobs += network.oos_logprobs[target_word_ids]
            # Always exclude OOV words when using a shortlist - No probability
            # mass is left for them.
            mask *= tensor.neq(target_word_ids, self._unk_id)
        elif exclude_unk:
            # If requested, ignore OOS and OOV probabilities.
            mask *= tensor.neq(target_word_ids, self._unk_id)
            mask *= tensor.lt(target_word_ids, shortlist_size)
        logprobs *= tensor.cast(mask, theano.config.floatX)

        # Ignore unused input variables, because is_training is only used by
        # dropout layer.
        self._target_logprobs_function = theano.function(
            [batch_word_ids, batch_class_ids, membership_probs, network.mask],
            [logprobs, mask],
            givens=[(network.input_word_ids, input_word_ids),
                    (network.input_class_ids, input_class_ids),
                    (network.target_class_ids, target_class_ids),
                    (network.is_training, numpy.int8(0))],
            name='target_logprobs',
            on_unused_input='ignore',
            profile=profile)
        self._total_logprob_function = theano.function(
            [batch_word_ids, batch_class_ids, membership_probs, network.mask],
            [logprobs.sum(), mask.sum()],
            givens=[(network.input_word_ids, input_word_ids),
                    (network.input_class_ids, input_class_ids),
                    (network.target_class_ids, target_class_ids),
                    (network.is_training, numpy.int8(0))],
            name='total_logprob',
            on_unused_input='ignore',
            profile=profile)

        # These are updated by score_line().
        self.num_words = 0
        self.num_unks = 0
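
When a shortlist is used, an out-of-shortlist target word receives the network's ``<unk>`` class probability multiplied by the word's share of the out-of-shortlist unigram mass, which is what adding ``network.oos_logprobs[target_word_ids]`` does in log space. A tiny NumPy check with hypothetical numbers:

    import numpy

    unk_class_prob = 0.02  # hypothetical probability the network gives the <unk> class
    oos_fraction = 0.1     # hypothetical share of this word within all OOS words

    logprob = numpy.log(unk_class_prob) + numpy.log(oos_fraction)
    # Adding log probabilities multiplies the probabilities, so the <unk> mass
    # is split among out-of-shortlist words in proportion to their frequency.
    assert numpy.isclose(logprob, numpy.log(unk_class_prob * oos_fraction))
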
Example #8
    def __init__(self, network, ignore_unk=False, unk_penalty=None,
                 profile=False):
        """Creates two Theano function, ``self._target_logprobs_function()``,
        which computes the log probabilities predicted by the neural network for
        the words in a mini-batch, and ``self._total_logprob_function()``, which
        returns the total log probability.

        Both functions take as arguments four matrices:

        1. Word IDs in the shape of a mini-batch. The functions will only use
           the input words (not the last time step).
        2. Class IDs in the shape of a mini-batch. The functions will slice this
           into input and output.
        3. Class membership probabilities in the shape of a mini-batch, but only
           for the output words (not the first time step).
        4. Mask in the shape of a mini-batch, but only for the output words (not
           for the first time step).

        ``self._target_logprobs_function()`` will return a matrix of predicted
        log probabilities for the output words (excluding the first time step)
        and the mask after possibly applying special UNK handling.
        ``self._total_logprob_function()`` will return the total log probability
        of the predicted (unmasked) words and the number of those words.

        :type network: Network
        :param network: the neural network object

        :type ignore_unk: bool
        :param ignore_unk: if set to True, <unk> tokens are excluded from
                           perplexity computation

        :type unk_penalty: float
        :param unk_penalty: if set to other than None, used as the <unk> token
                            score

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self._ignore_unk = ignore_unk
        self._unk_penalty = unk_penalty
        self._vocabulary = network.vocabulary
        self._unk_id = network.vocabulary.word_to_id['<unk>']

        # The functions take as input a mini-batch of word IDs and class IDs,
        # and slice input and target IDs for the network.
        batch_word_ids = tensor.matrix('textscorer/batch_word_ids',
                                       dtype='int64')
        batch_word_ids.tag.test_value = test_value(
            size=(101, 16), high=self._vocabulary.num_words())
        batch_class_ids = tensor.matrix('textscorer/batch_class_ids',
                                        dtype='int64')
        batch_class_ids.tag.test_value = test_value(
            size=(101, 16), high=self._vocabulary.num_classes())
        membership_probs = tensor.matrix('textscorer/membership_probs',
                                         dtype=theano.config.floatX)
        membership_probs.tag.test_value = test_value(
            size=(100, 16), high=1.0)

        logprobs = tensor.log(network.target_probs())
        # Add logprobs from the class membership of the predicted word at each
        # time step of each sequence.
        logprobs += tensor.log(membership_probs)
        # If requested, predict <unk> with constant score.
        target_word_ids = batch_word_ids[1:]
        if self._unk_penalty is not None:
            unk_mask = tensor.eq(target_word_ids, self._unk_id)
            unk_indices = unk_mask.nonzero()
            logprobs = tensor.set_subtensor(logprobs[unk_indices],
                                            self._unk_penalty)
        # Ignore logprobs predicting a word that is past the sequence end, and
        # possibly also those that are predicting <unk> token.
        mask = network.mask
        if self._ignore_unk:
            mask *= tensor.neq(target_word_ids, self._unk_id)
        logprobs *= tensor.cast(mask, theano.config.floatX)

        # Ignore unused input variables, because is_training is only used by
        # dropout layer.
        self._target_logprobs_function = theano.function(
            [batch_word_ids, batch_class_ids, membership_probs, network.mask],
            [logprobs, mask],
            givens=[(network.input_word_ids, batch_word_ids[:-1]),
                    (network.input_class_ids, batch_class_ids[:-1]),
                    (network.target_class_ids, batch_class_ids[1:]),
                    (network.is_training, numpy.int8(0))],
            name='target_logprobs',
            on_unused_input='ignore',
            profile=profile)
        self._total_logprob_function = theano.function(
            [batch_word_ids, batch_class_ids, membership_probs, network.mask],
            [logprobs.sum(), mask.sum()],
            givens=[(network.input_word_ids, batch_word_ids[:-1]),
                    (network.input_class_ids, batch_class_ids[:-1]),
                    (network.target_class_ids, batch_class_ids[1:]),
                    (network.is_training, numpy.int8(0))],
            name='total_logprob',
            on_unused_input='ignore',
            profile=profile)
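
The ``set_subtensor`` call above overwrites the log probability at every position whose target word is ``<unk>`` with the constant penalty, without modifying the original tensor. The NumPy equivalent, with hypothetical data:

    import numpy

    unk_id = 3
    unk_penalty = -5.0
    target_word_ids = numpy.array([[2, 3], [3, 1], [0, 2]])  # (time steps, sequences)
    logprobs = numpy.full(target_word_ids.shape, -1.0)

    unk_indices = (target_word_ids == unk_id).nonzero()
    penalized = logprobs.copy()  # set_subtensor returns a new tensor, so copy first
    penalized[unk_indices] = unk_penalty
    # penalized is now [[-1, -5], [-5, -1], [-1, -1]]
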
Example #9
    def __init__(self, optimization_options, network, profile=False):
        """Creates Theano functions for training a neural network language
        model.

        The subclass constructor is expected to create the optimizer parameters
        in ``self._params``. This constructor will then create two update
        functions, ``self.gradient_update_function``, which updates the gradient
        parameters and returns the cost, and ``self.model_update_function``,
        which updates model state given the gradients and the learning rate.

        The gradient update function takes as arguments three matrices:

        1. Word IDs in the shape of a mini-batch. The functions will slice this
           into input and output.
        2. Class IDs in the shape of a mini-batch. The functions will slice this
           into input and output.
        3. Mask in the shape of a mini-batch, but only for the output words (not
           for the first time step).

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options

        :type network: Network
        :param network: the neural network object

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.network = network

        float_type = numpy.dtype(theano.config.floatX).type
        self.float_type = float_type

        try:
            # numerical stability / smoothing term to prevent divide-by-zero
            self._epsilon = float_type(optimization_options['epsilon'])
            # learning rate / step size
            self.learning_rate = float_type(
                optimization_options['learning_rate'])
            # weights for training files
            self._weights = optimization_options['weights']
            # maximum norm for parameter updates
            self._max_gradient_norm = float_type(
                optimization_options['max_gradient_norm'])
            # cost function
            cost_function = optimization_options['cost_function']
            # number of noise samples for sampling based output
            num_noise_samples = optimization_options['num_noise_samples']
            # noise sample sharing for sampling based output
            noise_sharing = optimization_options['noise_sharing']
            # exclude <unk> tokens from cost computation?
            self._exclude_unk = optimization_options['exclude_unk']
        except KeyError as e:
            raise ValueError(
                "Option {} is missing from optimization options.".format(e))

        self._unk_id = self.network.vocabulary.word_to_id['<unk>']

        # The functions take as input a mini-batch of word IDs and class IDs,
        # and slice input and target IDs for the network.
        batch_word_ids = tensor.matrix('optimizer/batch_word_ids',
                                       dtype='int64')
        batch_word_ids.tag.test_value = test_value(
            size=(101, 16), high=self.network.vocabulary.num_shortlist_words())
        batch_class_ids = tensor.matrix('optimizer/batch_class_ids',
                                        dtype='int64')
        batch_class_ids.tag.test_value = test_value(
            size=(101, 16), high=self.network.vocabulary.num_classes())

        if cost_function == 'cross-entropy':
            # Derive the symbolic expression for log probability of each word.
            logprobs = tensor.log(self.network.target_probs())
        elif cost_function == 'nce':
            logprobs = self._get_nce_cost(sharing=noise_sharing)
        elif cost_function == 'blackout':
            logprobs = self._get_blackout_cost(sharing=noise_sharing)
        else:
            raise ValueError(
                "Invalid cost function requested: `{}'".format(cost_function))

        # Do not predict masked and possibly <unk> tokens. The mask has to be
        # cast to floatX, otherwise the result will be float64 and pulled out
        # from the GPU earlier than necessary.
        mask = self.network.mask
        if self._exclude_unk:
            mask *= tensor.neq(self.network.target_word_ids, self._unk_id)
        logprobs *= tensor.cast(mask, theano.config.floatX)
        # Cost is the negative log probability normalized by the number of
        # training examples in the mini-batch, so that the gradients will also
        # be normalized by the number of training examples.
        cost = -logprobs.sum() / tensor.cast(mask.sum(), theano.config.floatX)

        # Derive the symbolic expression for updating the gradient with regard
        # to each parameter.
        self._gradient_exprs = \
            tensor.grad(cost, wrt=list(self.network.get_variables().values()))

        # Ignore unused input, because is_training is only used by dropout
        # layer.
        self.gradient_update_function = theano.function(
            [batch_word_ids, batch_class_ids, self.network.mask], [],
            givens=[(network.input_word_ids, batch_word_ids[:-1]),
                    (network.input_class_ids, batch_class_ids[:-1]),
                    (network.target_word_ids, batch_word_ids[1:]),
                    (network.target_class_ids, batch_class_ids[1:]),
                    (self.network.is_training, numpy.int8(1)),
                    (self.network.num_noise_samples,
                     numpy.int64(num_noise_samples))],
            updates=self._gradient_update_exprs(),
            name='gradient_update_function',
            on_unused_input='ignore',
            profile=profile)

        alpha = tensor.scalar('optimizer/update_weight',
                              dtype=theano.config.floatX)
        alpha.tag.test_value = 0.1
        self.model_update_function = theano.function(
            [alpha], [],
            updates=self._model_update_exprs(alpha),
            name='model_update_function',
            profile=profile)
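
The cost used for training is the negative log probability summed over the unmasked positions and divided by their count, i.e. the average over real (non-padding, and optionally non-``<unk>``) training words. A NumPy sketch with hypothetical values:

    import numpy

    logprobs = numpy.array([[-2.0, -1.0], [-3.0, -0.5], [-4.0, -2.5]])  # (time steps, sequences)
    mask = numpy.array([[1, 1], [1, 1], [0, 1]], dtype='int8')  # last word of sequence 0 is padding

    masked_logprobs = logprobs * mask.astype(float)
    cost = -masked_logprobs.sum() / mask.sum()
    # cost == (2.0 + 1.0 + 3.0 + 0.5 + 2.5) / 5 == 1.8
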
Example #10
    def __init__(self, vocabulary, architecture, batch_processing=True, profile=False):
        """Initializes the neural network parameters for all layers, and
        creates Theano shared variables from them.

        :type vocabulary: Vocabulary
        :param vocabulary: mapping between word IDs and word classes

        :type architecture: Architecture
        :param architecture: an object that describes the network architecture

        :type batch_processing: bool
        :param batch_processing: True creates a network for processing
                                 mini-batches, False creates a network for
                                 processing one time step at a time

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.vocabulary = vocabulary
        self.architecture = architecture
        self.batch_processing = batch_processing

        M1 = 2147483647
        M2 = 2147462579
        random_seed = [
            numpy.random.randint(0, M1),
            numpy.random.randint(0, M1),
            numpy.random.randint(1, M1),
            numpy.random.randint(0, M2),
            numpy.random.randint(0, M2),
            numpy.random.randint(1, M2)]
        self.random = RandomStreams(random_seed)

        # Word and class inputs will be available to NetworkInput layers.
        self.word_input = tensor.matrix('network/word_input', dtype='int64')
        self.word_input.tag.test_value = test_value(
            size=(100, 16),
            max_value=vocabulary.num_words())
        self.class_input = tensor.matrix('network/class_input', dtype='int64')
        self.class_input.tag.test_value = test_value(
            size=(100, 16),
            max_value=vocabulary.num_classes())

        # Recurrent layers will create these lists, used by TextSampler to
        # initialize state variables of appropriate sizes.
        self.recurrent_state_input = []
        self.recurrent_state_size = []

        # Create the layers.
        logging.debug("Creating layers.")
        self.layers = OrderedDict()
        for input_options in architecture.inputs:
            input = NetworkInput(input_options, self)
            self.layers[input.name] = input
        for layer_description in architecture.layers:
            layer_options = dict()
            for variable, value in layer_description.items():
                if variable == 'inputs':
                    layer_options['input_layers'] = \
                        [self.layers[x] for x in value]
                else:
                    layer_options[variable] = value
            if layer_options['name'] == architecture.output_layer:
                layer_options['size'] = vocabulary.num_classes()
            layer = create_layer(layer_options, self, profile=profile)
            self.layers[layer.name] = layer
        self.output_layer = self.layers[architecture.output_layer]

        # This list will be filled by the recurrent layers to contain the
        # recurrent state outputs, required by TextSampler.
        self.recurrent_state_output = [None] * len(self.recurrent_state_size)

        # Create initial parameter values.
        logging.debug("Initializing parameters.")
        self.param_init_values = OrderedDict()
        num_params = 0
        for layer in self.layers.values():
            for name, value in layer.param_init_values.items():
                logging.debug("- %s size=%d", name, value.size)
                num_params += value.size
            self.param_init_values.update(layer.param_init_values)
        logging.debug("Total number of parameters: %d", num_params)

        # Create Theano shared variables.
        self.params = {name: theano.shared(value, name)
                       for name, value in self.param_init_values.items()}
        for layer in self.layers.values():
            layer.set_params(self.params)

        if batch_processing:
            self.create_batch_structure()
        else:
            self.create_onestep_structure()
Example #11
    def __init__(self,
                 network,
                 ignore_unk=False,
                 unk_penalty=None,
                 profile=False):
        """Creates two Theano function, ``self._target_logprobs_function()``,
        which computes the log probabilities predicted by the neural network for
        the words in a mini-batch, and ``self._total_logprob_function()``, which
        returns the total log probability.

        Both functions take as arguments four matrices:

        1. Word IDs in the shape of a mini-batch. The functions will only use
           the input words (not the last time step).
        2. Class IDs in the shape of a mini-batch. The functions will slice this
           into input and output.
        3. Class membership probabilities in the shape of a mini-batch, but only
           for the output words (not the first time step).
        4. Mask in the shape of a mini-batch, but only for the output words (not
           for the first time step).

        ``self._target_logprobs_function()`` will return a matrix of predicted
        log probabilities for the output words (excluding the first time step)
        and the mask after possibly applying special UNK handling.
        ``self._total_logprob_function()`` will return the total log probability
        of the predicted (unmasked) words and the number of those words.

        :type network: Network
        :param network: the neural network object

        :type ignore_unk: bool
        :param ignore_unk: if set to True, <unk> tokens are excluded from
                           perplexity computation

        :type unk_penalty: float
        :param unk_penalty: if set to other than None, used as the <unk> token
                            score

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self._ignore_unk = ignore_unk
        self._unk_penalty = unk_penalty
        self._vocabulary = network.vocabulary
        self._unk_id = network.vocabulary.word_to_id['<unk>']

        # The functions take as input a mini-batch of word IDs and class IDs,
        # and slice input and target IDs for the network.
        batch_word_ids = tensor.matrix('textscorer/batch_word_ids',
                                       dtype='int64')
        batch_word_ids.tag.test_value = test_value(
            size=(101, 16), max_value=self._vocabulary.num_words())
        batch_class_ids = tensor.matrix('textscorer/batch_class_ids',
                                        dtype='int64')
        batch_class_ids.tag.test_value = test_value(
            size=(101, 16), max_value=self._vocabulary.num_classes())
        membership_probs = tensor.matrix('textscorer/membership_probs',
                                         dtype=theano.config.floatX)
        membership_probs.tag.test_value = test_value(size=(100, 16),
                                                     max_value=1.0)

        logprobs = tensor.log(network.target_probs())
        # Add logprobs from the class membership of the predicted word at each
        # time step of each sequence.
        logprobs += tensor.log(membership_probs)
        # If requested, predict <unk> with constant score.
        target_word_ids = batch_word_ids[1:]
        if self._unk_penalty is not None:
            unk_mask = tensor.eq(target_word_ids, self._unk_id)
            unk_indices = unk_mask.nonzero()
            logprobs = tensor.set_subtensor(logprobs[unk_indices],
                                            self._unk_penalty)
        # Ignore logprobs predicting a word that is past the sequence end, and
        # possibly also those that are predicting <unk> token.
        mask = network.mask
        if self._ignore_unk:
            mask *= tensor.neq(target_word_ids, self._unk_id)
        logprobs *= tensor.cast(mask, theano.config.floatX)

        # Ignore unused input variables, because is_training is only used by
        # dropout layer.
        self._target_logprobs_function = theano.function(
            [batch_word_ids, batch_class_ids, membership_probs, network.mask],
            [logprobs, mask],
            givens=[(network.input_word_ids, batch_word_ids[:-1]),
                    (network.input_class_ids, batch_class_ids[:-1]),
                    (network.target_class_ids, batch_class_ids[1:]),
                    (network.is_training, numpy.int8(0))],
            name='target_logprobs',
            on_unused_input='ignore',
            profile=profile)
        self._total_logprob_function = theano.function(
            [batch_word_ids, batch_class_ids, membership_probs, network.mask],
            [logprobs.sum(), mask.sum()],
            givens=[(network.input_word_ids, batch_word_ids[:-1]),
                    (network.input_class_ids, batch_class_ids[:-1]),
                    (network.target_class_ids, batch_class_ids[1:]),
                    (network.is_training, numpy.int8(0))],
            name='total_logprob',
            on_unused_input='ignore',
            profile=profile)
Example #12
    def __init__(self, vocabulary, architecture, mode=None, profile=False):
        """Initializes the neural network parameters for all layers, and
        creates Theano shared variables from them.

        When using noise-contrastive estimation, the output layer needs to know
        the prior distribution of the classes, and how many noise classes to
        sample. The number of noise classes per training word is controlled by
        the num_noise_samples tensor variable. The prior distribution is a
        shared variable, so that we don't have to pass the vector to every call
        of a Theano function. The constructor initializes it to the uniform
        distribution, and it can be set to the proper probabilities using the
        set_class_prior_probs() function.

        :type vocabulary: Vocabulary
        :param vocabulary: mapping between word IDs and word classes

        :type architecture: Architecture
        :param architecture: an object that describes the network architecture

        :type mode: Network.Mode
        :param mode: selects mini-batch or single time step processing

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.vocabulary = vocabulary
        self.architecture = architecture
        self.mode = self.Mode() if mode is None else mode

        M1 = 2147483647
        M2 = 2147462579
        random_seed = [
            numpy.random.randint(0, M1),
            numpy.random.randint(0, M1),
            numpy.random.randint(1, M1),
            numpy.random.randint(0, M2),
            numpy.random.randint(0, M2),
            numpy.random.randint(1, M2)
        ]
        self.random = RandomStreams(random_seed)

        # Word and class inputs will be available to NetworkInput layers.
        self.input_word_ids = tensor.matrix('network/input_word_ids',
                                            dtype='int64')
        self.input_class_ids = tensor.matrix('network/input_class_ids',
                                             dtype='int64')
        if self.mode.minibatch:
            self.input_word_ids.tag.test_value = test_value(
                size=(100, 16), max_value=vocabulary.num_words())
            self.input_class_ids.tag.test_value = test_value(
                size=(100, 16), max_value=vocabulary.num_classes())
        else:
            self.input_word_ids.tag.test_value = test_value(
                size=(1, 16), max_value=vocabulary.num_words())
            self.input_class_ids.tag.test_value = test_value(
                size=(1, 16), max_value=vocabulary.num_classes())

        # Recurrent layers will create these lists, used to initialize state
        # variables of appropriate sizes, for doing forward passes one step at a
        # time.
        self.recurrent_state_input = []
        self.recurrent_state_size = []

        # Create the layers.
        logging.debug("Creating layers.")
        self.layers = OrderedDict()
        for input_options in architecture.inputs:
            input = NetworkInput(input_options, self)
            self.layers[input.name] = input
        for layer_description in architecture.layers:
            layer_options = self._layer_options_from_description(
                layer_description)
            if layer_options['name'] == architecture.output_layer:
                layer_options['size'] = vocabulary.num_classes()
            layer = create_layer(layer_options, self, profile=profile)
            self.layers[layer.name] = layer
        self.output_layer = self.layers[architecture.output_layer]

        # This list will be filled by the recurrent layers to contain the
        # recurrent state outputs, for doing forward passes one step at a time.
        self.recurrent_state_output = [None] * len(self.recurrent_state_size)

        # This input variable can be used to specify the classes whose
        # probabilities will be computed, instead of the whole distribution.
        self.target_class_ids = tensor.matrix('network/target_class_ids',
                                              dtype='int64')
        if self.mode.minibatch:
            self.target_class_ids.tag.test_value = test_value(
                size=(100, 16), max_value=vocabulary.num_classes())
        else:
            self.target_class_ids.tag.test_value = test_value(
                size=(1, 16), max_value=vocabulary.num_classes())

        # This input variable is used only for detecting <unk> target words.
        self.target_word_ids = tensor.matrix('network/target_word_ids',
                                             dtype='int64')
        if self.mode.minibatch:
            self.target_word_ids.tag.test_value = test_value(
                size=(100, 16), max_value=vocabulary.num_words())
        else:
            self.target_word_ids.tag.test_value = test_value(
                size=(1, 16), max_value=vocabulary.num_words())

        # Create initial parameter values.
        logging.debug("Initializing parameters.")
        self.param_init_values = OrderedDict()
        num_params = 0
        for layer in self.layers.values():
            for name, value in layer.param_init_values.items():
                logging.debug("- %s size=%d", name, value.size)
                num_params += value.size
            self.param_init_values.update(layer.param_init_values)
        logging.debug("Total number of parameters: %d", num_params)

        # Create Theano shared variables.
        self.params = {
            name: theano.shared(value, name)
            for name, value in self.param_init_values.items()
        }
        for layer in self.layers.values():
            layer.set_params(self.params)

        # mask is used to mask out the rest of the input matrix, when a sequence
        # is shorter than the maximum sequence length. The mask is kept as int8
        # data type, which is how Theano stores booleans.
        if self.mode.minibatch:
            self.mask = tensor.matrix('network/mask', dtype='int8')
            self.mask.tag.test_value = test_value(size=(100, 16),
                                                  max_value=True)
        else:
            self.mask = tensor.ones(self.input_word_ids.shape, dtype='int8')

        # Dropout layer needs to know whether we are training or evaluating.
        self.is_training = tensor.scalar('network/is_training', dtype='int8')
        self.is_training.tag.test_value = 1

        # When using noise-contrastive estimation, the output layer needs to
        # know the prior distribution of the classes, and how many noise classes
        # to sample.
        self.num_noise_samples = tensor.scalar('network/num_noise_samples',
                                               dtype='int64')
        self.num_noise_samples.tag.test_value = 100
        uniform_class_probs = numpy.ones(vocabulary.num_classes(),
                                         dtype=theano.config.floatX)
        uniform_class_probs /= vocabulary.num_classes()
        self.class_prior_probs = theano.shared(uniform_class_probs,
                                               'network/class_prior_probs')

        for layer in self.layers.values():
            layer.create_structure()
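
The class prior is a shared variable so that already compiled Theano functions see updates to it. The ``set_class_prior_probs()`` method mentioned in the docstring is not shown; below is a hedged sketch of what it might do (assumed implementation, not the project's code):

    import numpy
    import theano

    def set_class_prior_probs(self, probs):
        """Assumed sketch: replaces the uniform prior with empirical class
        probabilities. Shared variables are updated in place with set_value(),
        so compiled functions pick up the new distribution automatically."""
        probs = numpy.asarray(probs, dtype=theano.config.floatX)
        probs /= probs.sum()  # normalize, in case raw counts are passed in
        self.class_prior_probs.set_value(probs)
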
Example #13
    def __init__(self, optimization_options, network, device=None,
                 profile=False):
        """Creates Theano functions for training a neural network language
        model.

        The subclass constructor is expected to create the optimizer parameters
        in ``self._params``. This constructor will then create two update
        functions, ``self.gradient_update_function``, which updates the gradient
        parameters and returns the cost, and ``self.model_update_function``,
        which updates model state given the gradients and the learning rate.

        The gradient update function takes as arguments three matrices:

        1. Word IDs in the shape of a mini-batch. The functions will slice this
           into input and output.
        2. Class IDs in the shape of a mini-batch. The functions will slice this
           into input and output.
        3. Mask in the shape of a mini-batch, but only for the output words (not
           for the first time step).

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options

        :type network: Network
        :param network: the neural network object

        :type device: str
        :param device: device where to store the shared variables

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.network = network

        float_type = numpy.dtype(theano.config.floatX).type
        self.float_type = float_type

        try:
            # numerical stability / smoothing term to prevent divide-by-zero
            self._epsilon = float_type(optimization_options['epsilon'])
            # learning rate / step size
            self.learning_rate = float_type(optimization_options['learning_rate'])
            # weights for training files
            self._weights = optimization_options['weights']
            # maximum norm for parameter updates
            self._max_gradient_norm = float_type(
                optimization_options['max_gradient_norm'])
            # cost function
            cost_function = optimization_options['cost_function']
            # number of noise samples for sampling based output
            num_noise_samples = optimization_options['num_noise_samples']
            # noise sample sharing for sampling based output
            noise_sharing = optimization_options['noise_sharing']
            # ignore <unk> tokens?
            self._ignore_unk = optimization_options['ignore_unk']
            # penalty for <unk> tokens
            unk_penalty = optimization_options['unk_penalty']
        except KeyError as e:
            raise ValueError("Option {} is missing from optimization options."
                             .format(e))

        unk_id = self.network.vocabulary.word_to_id['<unk>']

        # The functions take as input a mini-batch of word IDs and class IDs,
        # and slice input and target IDs for the network.
        batch_word_ids = tensor.matrix('optimizer/batch_word_ids',
                                       dtype='int64')
        batch_word_ids.tag.test_value = test_value(
            size=(101, 16), high=self.network.vocabulary.num_words())
        batch_class_ids = tensor.matrix('optimizer/batch_class_ids',
                                        dtype='int64')
        batch_class_ids.tag.test_value = test_value(
            size=(101, 16), high=self.network.vocabulary.num_classes())

        if cost_function == 'cross-entropy':
            # Derive the symbolic expression for log probability of each word.
            logprobs = tensor.log(self.network.target_probs())
        elif cost_function == 'nce':
            logprobs = self._get_nce_cost(sharing=noise_sharing)
        elif cost_function == 'blackout':
            logprobs = self._get_blackout_cost(sharing=noise_sharing)
        else:
            raise ValueError("Invalid cost function requested: `{}'".format(
                             cost_function))

        # If requested, predict <unk> with constant score.
        if unk_penalty is not None:
            unk_mask = tensor.eq(self.network.target_word_ids, unk_id)
            unk_indices = unk_mask.nonzero()
            logprobs = tensor.set_subtensor(logprobs[unk_indices], unk_penalty)
        # Do not predict masked and possibly <unk> tokens. The mask has to be
        # cast to floatX, otherwise the result will be float64 and pulled out
        # from the GPU earlier than necessary.
        mask = self.network.mask
        if self._ignore_unk:
            mask *= tensor.neq(self.network.target_word_ids, unk_id)
        logprobs *= tensor.cast(mask, theano.config.floatX)
        # Cost is the negative log probability normalized by the number of
        # training examples in the mini-batch, so that the gradients will also
        # be normalized by the number of training examples.
        cost = -logprobs.sum() / tensor.cast(mask.sum(), theano.config.floatX)

        # Derive the symbolic expression for updating the gradient with regard
        # to each parameter.
        self._gradient_exprs = \
            tensor.grad(cost, wrt=list(self.network.get_variables().values()))

        # Ignore unused input, because is_training is only used by dropout
        # layer.
        self.gradient_update_function = theano.function(
            [batch_word_ids, batch_class_ids, self.network.mask],
            cost,
            givens=[(network.input_word_ids, batch_word_ids[:-1]),
                    (network.input_class_ids, batch_class_ids[:-1]),
                    (network.target_word_ids, batch_word_ids[1:]),
                    (network.target_class_ids, batch_class_ids[1:]),
                    (self.network.is_training, numpy.int8(1)),
                    (self.network.num_noise_samples,
                     numpy.int64(num_noise_samples))],
            updates=self._gradient_update_exprs(),
            name='gradient_update_function',
            on_unused_input='ignore',
            profile=profile)

        alpha = tensor.scalar('optimizer/update_weight',
                              dtype=theano.config.floatX)
        alpha.tag.test_value = 0.1
        self.model_update_function = theano.function(
            [alpha],
            [],
            updates=self._model_update_exprs(alpha),
            name='model_update_function',
            profile=profile)
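
The optimizer is split into two compiled functions: ``gradient_update_function`` computes gradients from a mini-batch and stores them in optimizer state, and ``model_update_function`` later applies them to the parameters scaled by the update weight. A minimal standalone Theano illustration of that split (plain SGD on a toy parameter, not the project's code):

    import numpy
    import theano
    import theano.tensor as tensor

    floatX = theano.config.floatX
    param = theano.shared(numpy.zeros(3, dtype=floatX), 'param')
    gradient = theano.shared(numpy.zeros(3, dtype=floatX), 'param/gradient')

    x = tensor.vector('x', dtype=floatX)
    cost = ((param - x) ** 2).sum()

    # First function: compute the gradient for this input and remember it.
    gradient_update_function = theano.function(
        [x], cost, updates=[(gradient, tensor.grad(cost, param))])
    # Second function: apply the remembered gradient with a given step size.
    alpha = tensor.scalar('alpha', dtype=floatX)
    model_update_function = theano.function(
        [alpha], [], updates=[(param, param - alpha * gradient)])

    # One training step.
    gradient_update_function(numpy.ones(3, dtype=floatX))
    model_update_function(numpy.asarray(0.1, dtype=floatX))
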
Example #14
    def __init__(self, architecture, vocabulary, class_prior_probs=None,
                 noise_dampening=1.0, mode=None, default_device=None,
                 profile=False):
        """Initializes the neural network parameters for all layers, and
        creates Theano shared variables from them.

        When using a sampling based output layer, it needs to know the prior
        distribution of the classes, and how many noise classes to sample. The
        number of noise classes per training word is controlled by the
        ``num_noise_samples`` tensor variable. The prior distribution is a
        shared variable, so that we don't have to pass the vector to every call
        of a Theano function. The constructor initializes it using
        ``class_prior_probs`` and ``noise_dampening``.

        :type architecture: Architecture
        :param architecture: an object that describes the network architecture

        :type vocabulary: Vocabulary
        :param vocabulary: mapping between word IDs and word classes

        :type class_prior_probs: numpy.ndarray
        :param class_prior_probs: empirical (unigram) distribution of the output
                                  classes (only required for training)

        :type noise_dampening: float
        :param noise_dampening: exponent to which the unigram distribution is
                                raised before sampling noise samples

        :type mode: Network.Mode
        :param mode: selects mini-batch or single time step processing

        :type default_device: str
        :param default_device: default device where to store the shared variables

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.vocabulary = vocabulary
        self.architecture = architecture
        self.mode = self.Mode() if mode is None else mode

        M1 = 2147483647
        M2 = 2147462579
        random_seed = [
            numpy.random.randint(0, M1),
            numpy.random.randint(0, M1),
            numpy.random.randint(1, M1),
            numpy.random.randint(0, M2),
            numpy.random.randint(0, M2),
            numpy.random.randint(1, M2)]
        self.random = RandomStreams(random_seed)

        # Word and class inputs will be available to NetworkInput layers.
        self.input_word_ids = tensor.matrix('network/input_word_ids', dtype='int64')
        self.input_class_ids = tensor.matrix('network/input_class_ids', dtype='int64')
        if self.mode.minibatch:
            self.input_word_ids.tag.test_value = test_value(
                size=(100, 16), high=vocabulary.num_words())
            self.input_class_ids.tag.test_value = test_value(
                size=(100, 16), high=vocabulary.num_classes())
        else:
            self.input_word_ids.tag.test_value = test_value(
                size=(1, 16), high=vocabulary.num_words())
            self.input_class_ids.tag.test_value = test_value(
                size=(1, 16), high=vocabulary.num_classes())

        # During training, the output layer bias vector is initialized to the
        # unigram probabilities.
        self.class_prior_probs = class_prior_probs

        # Recurrent layers will create these lists, used to initialize state
        # variables of appropriate sizes, for doing forward passes one step at a
        # time.
        self.recurrent_state_input = []
        self.recurrent_state_size = []

        # Create the layers.
        logging.debug("Creating layers.")
        self.layers = OrderedDict()
        for input_options in architecture.inputs:
            input = NetworkInput(input_options, self)
            self.layers[input.name] = input
        for layer_description in architecture.layers:
            layer_options = self._layer_options_from_description(
                layer_description)
            if layer_options['name'] == architecture.output_layer:
                layer_options['size'] = vocabulary.num_classes()
            if not layer_options['devices']:
                layer_options['devices'] = [default_device]
            layer = create_layer(layer_options, self, profile=profile)
            self.layers[layer.name] = layer
        self.output_layer = self.layers[architecture.output_layer]
        num_params = sum(layer.params.total_size
                         for layer in self.layers.values())
        logging.debug("Total number of parameters: %d", num_params)

        # This list will be filled by the recurrent layers to contain the
        # recurrent state outputs, for doing forward passes one step at a time.
        self.recurrent_state_output = [None] * len(self.recurrent_state_size)

        # This input variable can be used to specify the classes whose
        # probabilities will be computed, instead of the whole distribution.
        self.target_class_ids = tensor.matrix('network/target_class_ids',
                                              dtype='int64')
        if self.mode.minibatch:
            self.target_class_ids.tag.test_value = test_value(
                size=(100, 16), high=vocabulary.num_classes())
        else:
            self.target_class_ids.tag.test_value = test_value(
                size=(1, 16), high=vocabulary.num_classes())

        # This input variable is used only for detecting <unk> target words.
        self.target_word_ids = tensor.matrix('network/target_word_ids',
                                             dtype='int64')
        if self.mode.minibatch:
            self.target_word_ids.tag.test_value = test_value(
                size=(100, 16), high=vocabulary.num_words())
        else:
            self.target_word_ids.tag.test_value = test_value(
                size=(1, 16), high=vocabulary.num_words())

        # mask is used to mask out the rest of the input matrix, when a sequence
        # is shorter than the maximum sequence length. The mask is kept as int8
        # data type, which is how Theano stores booleans.
        if self.mode.minibatch:
            self.mask = tensor.matrix('network/mask', dtype='int8')
            self.mask.tag.test_value = test_value(size=(100, 16), high=True)
        else:
            self.mask = tensor.ones(self.input_word_ids.shape, dtype='int8')

        # Dropout layer needs to know whether we are training or evaluating.
        self.is_training = tensor.scalar('network/is_training', dtype='int8')
        self.is_training.tag.test_value = 1

        # num_noise_samples tells sampling based methods how many noise classes
        # to sample.
        self.num_noise_samples = tensor.scalar('network/num_noise_samples',
                                               dtype='int64')
        self.num_noise_samples.tag.test_value = 25

        # Sampling based methods use this noise distribution, if it's set.
        # Otherwise noise is sampled from uniform distribution.
        if (class_prior_probs is None) or (noise_dampening == 0.0):
            # Use uniform() for sampling based training.
            self.noise_probs = None
        else:
            noise_probs = numpy.power(class_prior_probs, noise_dampening)
            noise_probs /= noise_probs.sum()
            if default_device is None:
                self.noise_probs = \
                    theano.shared(noise_probs.astype(theano.config.floatX),
                                  'network/noise_probs')
            else:
                self.noise_probs = \
                    theano.shared(noise_probs.astype(theano.config.floatX),
                                  'network/noise_probs',
                                  target=default_device)

        for layer in self.layers.values():
            layer.create_structure()
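
The constructors above repeatedly attach tag.test_value arrays produced by a test_value() helper. That helper is defined elsewhere in the project, so the following is only a minimal sketch of what it might look like, assuming the size/high keyword interface used in these examples; the name, signature and exact dtype handling are assumptions, not the project's actual implementation.

import numpy
import theano

def test_value(size, high):
    """Build a random array for Theano's test-value debugging.

    Assumed stand-in for the project's own helper: an integer bound
    yields random int64 IDs in [0, high), a boolean bound yields an
    int8 mask, and a float bound yields floatX values in [0, high).
    """
    # bool must be checked before int, since bool is a subclass of int.
    if isinstance(high, bool):
        return numpy.random.randint(0, 2, size=size).astype('int8')
    elif isinstance(high, (int, numpy.integer)):
        return numpy.random.randint(0, high, size=size).astype('int64')
    else:
        return numpy.random.uniform(0.0, high, size=size).astype(
            theano.config.floatX)
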
Example #15
0
    def __init__(self,
                 architecture,
                 vocabulary,
                 class_prior_probs=None,
                 noise_dampening=1.0,
                 mode=None,
                 exclude_unk=False,
                 default_device=None,
                 profile=False):
        """Initializes the neural network parameters for all layers, and
        creates Theano shared variables from them.

        When using a sampling based output layer, it needs to know the prior
        distribution of the classes, and how many noise classes to sample. The
        number of noise classes per training word is controlled by the
        ``num_noise_samples`` tensor variable. The prior distribution is a
        shared variable, so that we don't have to pass the vector to every call
        of a Theano function. The constructor initializes it using
        ``class_prior_probs`` and ``noise_dampening``.

        :type architecture: Architecture
        :param architecture: an object that describes the network architecture

        :type vocabulary: Vocabulary
        :param vocabulary: mapping between word IDs and word classes

        :type class_prior_probs: numpy.ndarray
        :param class_prior_probs: empirical (unigram) distribution of the output
                                  classes (only required for training)

        :type noise_dampening: float
        :param noise_dampening: exponent to which the unigram distribution is
                                raised before sampling noise samples

        :type mode: Network.Mode
        :param mode: selects mini-batch or single time step processing

        :type exclude_unk: bool
        :param exclude_unk: if set to ``True``, sets ``<unk>`` probability to
                            zero.

        :type default_device: str
        :param default_device: default device on which to store the shared
                               variables

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.vocabulary = vocabulary
        self.architecture = architecture
        self.mode = self.Mode() if mode is None else mode

        M1 = 2147483647
        M2 = 2147462579
        random_seed = [
            numpy.random.randint(0, M1),
            numpy.random.randint(0, M1),
            numpy.random.randint(1, M1),
            numpy.random.randint(0, M2),
            numpy.random.randint(0, M2),
            numpy.random.randint(1, M2)
        ]
        self.random = RandomStreams(random_seed)

        # Word and class inputs will be available to NetworkInput layers.
        self.input_word_ids = tensor.matrix('network/input_word_ids',
                                            dtype='int64')
        self.input_class_ids = tensor.matrix('network/input_class_ids',
                                             dtype='int64')
        if self.mode.minibatch:
            self.input_word_ids.tag.test_value = test_value(
                size=(20, 4), high=vocabulary.num_shortlist_words())
            self.input_class_ids.tag.test_value = test_value(
                size=(20, 4), high=vocabulary.num_classes())
        else:
            self.input_word_ids.tag.test_value = test_value(
                size=(1, 4), high=vocabulary.num_shortlist_words())
            self.input_class_ids.tag.test_value = test_value(
                size=(1, 4), high=vocabulary.num_classes())

        # Should the output layer set the <unk> probability to zero? In that
        # case OOVs are not counted when computing perplexity.
        self.exclude_unk = exclude_unk

        # During training, the output layer bias vector is initialized to the
        # unigram probabilities.
        self.class_prior_probs = class_prior_probs

        # A shortlist model adds these logprobs to OOS logprobs predicted by the
        # network.
        if vocabulary.has_unigram_probs():
            oos_logprobs = numpy.log(vocabulary.get_oos_probs())
            oos_logprobs = oos_logprobs.astype(theano.config.floatX)
            self.oos_logprobs = theano.shared(oos_logprobs,
                                              'network/oos_logprobs')
        else:
            self.oos_logprobs = None

        # Recurrent layers will create these lists, used to initialize state
        # variables of appropriate sizes, for doing forward passes one step at a
        # time.
        self.recurrent_state_input = []
        self.recurrent_state_size = []

        # Create the layers.
        logging.debug("Creating layers.")
        self.layers = OrderedDict()
        for input_options in architecture.inputs:
            network_input = NetworkInput(input_options, self)
            self.layers[network_input.name] = network_input
        for layer_description in architecture.layers:
            layer_options = self._layer_options_from_description(
                layer_description)
            if layer_options['name'] == architecture.output_layer:
                layer_options['size'] = vocabulary.num_classes()
            # 'devices' not in layer_options is for backward compatibility.
            # Remove at some point.
            if ('devices'
                    not in layer_options) or (not layer_options['devices']):
                layer_options['devices'] = [default_device]
            layer = create_layer(layer_options, self, profile=profile)
            self.layers[layer.name] = layer
        self.output_layer = self.layers[architecture.output_layer]
        num_params = sum(layer.num_params() for layer in self.layers.values())
        logging.debug("Total number of parameters: %d", num_params)

        # This list will be filled by the recurrent layers to contain the
        # recurrent state outputs, for doing forward passes one step at a time.
        self.recurrent_state_output = [None] * len(self.recurrent_state_size)

        # This input variable can be used to specify the classes whose
        # probabilities will be computed, instead of the whole distribution.
        self.target_class_ids = tensor.matrix('network/target_class_ids',
                                              dtype='int64')
        if self.mode.minibatch:
            self.target_class_ids.tag.test_value = test_value(
                size=(20, 4), high=vocabulary.num_classes())
        else:
            self.target_class_ids.tag.test_value = test_value(
                size=(1, 4), high=vocabulary.num_classes())

        # This input variable is used only for detecting <unk> target words.
        self.target_word_ids = tensor.matrix('network/target_word_ids',
                                             dtype='int64')
        if self.mode.minibatch:
            self.target_word_ids.tag.test_value = test_value(
                size=(20, 4), high=vocabulary.num_shortlist_words())
        else:
            self.target_word_ids.tag.test_value = test_value(
                size=(1, 4), high=vocabulary.num_shortlist_words())

        # mask is used to mask out the rest of the input matrix, when a sequence
        # is shorter than the maximum sequence length. The mask is kept as int8
        # data type, which is how Tensor stores booleans.
        if self.mode.minibatch:
            self.mask = tensor.matrix('network/mask', dtype='int8')
            self.mask.tag.test_value = test_value(size=(20, 4), high=True)
        else:
            self.mask = tensor.ones(self.input_word_ids.shape, dtype='int8')

        # Dropout layer needs to know whether we are training or evaluating.
        self.is_training = tensor.scalar('network/is_training', dtype='int8')
        self.is_training.tag.test_value = 1

        # num_noise_samples tells sampling based methods how many noise classes
        # to sample.
        self.num_noise_samples = tensor.scalar('network/num_noise_samples',
                                               dtype='int64')
        self.num_noise_samples.tag.test_value = 3

        # Sampling based methods use this noise distribution, if it's set.
        # Otherwise noise is sampled from uniform distribution.
        if (class_prior_probs is None) or (noise_dampening == 0.0):
            # Use uniform() for sampling based training.
            self.noise_probs = None
        else:
            noise_probs = numpy.power(class_prior_probs, noise_dampening)
            noise_probs /= noise_probs.sum()
            if default_device is None:
                self.noise_probs = \
                    theano.shared(noise_probs.astype(theano.config.floatX),
                                  'network/noise_probs')
            else:
                self.noise_probs = \
                    theano.shared(noise_probs.astype(theano.config.floatX),
                                  'network/noise_probs',
                                  target=default_device)

        for layer in self.layers.values():
            layer.create_structure()
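
The noise_probs computation near the end of Example #15 flattens the unigram class distribution before it is used for noise sampling. The effect is easy to verify with plain NumPy; the prior probabilities and dampening exponent below are made up purely for illustration.

import numpy

# Hypothetical unigram (class prior) probabilities and dampening exponent.
class_prior_probs = numpy.array([0.5, 0.3, 0.15, 0.05])
noise_dampening = 0.75

# Raising the probabilities to a power below one flattens the
# distribution; renormalizing makes it sum to one again.
noise_probs = numpy.power(class_prior_probs, noise_dampening)
noise_probs /= noise_probs.sum()

print(noise_probs)        # approximately [0.44, 0.30, 0.18, 0.08]
print(noise_probs.sum())  # 1.0
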
Example #16
0
    def __init__(self,
                 vocabulary,
                 architecture,
                 mode=Mode.minibatch,
                 profile=False):
        """Initializes the neural network parameters for all layers, and
        creates Theano shared variables from them.

        :type vocabulary: Vocabulary
        :param vocabulary: mapping between word IDs and word classes

        :type architecture: Architecture
        :param architecture: an object that describes the network architecture

        :type mode: Network.Mode
        :param mode: selects mini-batch or single time step processing; in
            single time step mode the network produces the probability
            distribution of the next word, instead of target probabilities
            for a mini-batch

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.vocabulary = vocabulary
        self.architecture = architecture
        self.mode = mode

        M1 = 2147483647
        M2 = 2147462579
        random_seed = [
            numpy.random.randint(0, M1),
            numpy.random.randint(0, M1),
            numpy.random.randint(1, M1),
            numpy.random.randint(0, M2),
            numpy.random.randint(0, M2),
            numpy.random.randint(1, M2)
        ]
        self.random = RandomStreams(random_seed)

        # Word and class inputs will be available to NetworkInput layers.
        self.word_input = tensor.matrix('network/word_input', dtype='int64')
        self.class_input = tensor.matrix('network/class_input', dtype='int64')
        if self.mode.is_minibatch():
            self.word_input.tag.test_value = test_value(
                size=(100, 16), max_value=vocabulary.num_words())
            self.class_input.tag.test_value = test_value(
                size=(100, 16), max_value=vocabulary.num_classes())
        else:
            self.word_input.tag.test_value = test_value(
                size=(1, 16), max_value=vocabulary.num_words())
            self.class_input.tag.test_value = test_value(
                size=(1, 16), max_value=vocabulary.num_classes())

        # Recurrent layers will create these lists, used to initialize state
        # variables of appropriate sizes, for doing forward passes one step at a
        # time.
        self.recurrent_state_input = []
        self.recurrent_state_size = []

        # Create the layers.
        logging.debug("Creating layers.")
        self.layers = OrderedDict()
        for input_options in architecture.inputs:
            network_input = NetworkInput(input_options, self)
            self.layers[network_input.name] = network_input
        for layer_description in architecture.layers:
            layer_options = self._layer_options_from_description(
                layer_description)
            if layer_options['name'] == architecture.output_layer:
                layer_options['size'] = vocabulary.num_classes()
            layer = create_layer(layer_options, self, profile=profile)
            self.layers[layer.name] = layer
        self.output_layer = self.layers[architecture.output_layer]

        # This list will be filled by the recurrent layers to contain the
        # recurrent state outputs, for doing forward passes one step at a time.
        self.recurrent_state_output = [None] * len(self.recurrent_state_size)

        # When the mode is target_words, this input variable specifies the words
        # whose probabilities will be computed.
        self.target_class_ids = tensor.matrix('network/target_class_ids',
                                              dtype='int64')
        self.target_class_ids.tag.test_value = test_value(
            size=(1, 16), max_value=vocabulary.num_classes())

        # Create initial parameter values.
        logging.debug("Initializing parameters.")
        self.param_init_values = OrderedDict()
        num_params = 0
        for layer in self.layers.values():
            for name, value in layer.param_init_values.items():
                logging.debug("- %s size=%d", name, value.size)
                num_params += value.size
            self.param_init_values.update(layer.param_init_values)
        logging.debug("Total number of parameters: %d", num_params)

        # Create Theano shared variables.
        self.params = {
            name: theano.shared(value, name)
            for name, value in self.param_init_values.items()
        }
        for layer in self.layers.values():
            layer.set_params(self.params)

        # mask is used to mask out the rest of the input matrix, when a sequence
        # is shorter than the maximum sequence length. The mask is kept as int8
        # data type, which is how Tensor stores booleans.
        if self.mode.is_minibatch():
            self.mask = tensor.matrix('network/mask', dtype='int8')
            self.mask.tag.test_value = test_value(size=(100, 16),
                                                  max_value=True)
        else:
            self.mask = tensor.ones(self.word_input.shape, dtype='int8')

        # Dropout layer needs to know whether we are training or evaluating.
        self.is_training = tensor.scalar('network/is_training', dtype='int8')
        self.is_training.tag.test_value = 1

        for layer in self.layers.values():
            layer.create_structure()
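
All of these constructors attach tag.test_value arrays to their symbolic inputs. Those values only have an effect when Theano's test-value mechanism is switched on; a minimal standalone illustration, with made-up shapes and an upper bound of 100, could look like this.

import numpy
import theano
import theano.tensor as tensor

# With compute_test_value enabled, Theano propagates the test values
# through the graph as it is built, so shape mismatches are reported
# at graph-construction time instead of at the first function call.
theano.config.compute_test_value = 'warn'

word_input = tensor.matrix('network/word_input', dtype='int64')
word_input.tag.test_value = numpy.random.randint(0, 100, size=(100, 16))

# Every op built from word_input now carries its own test value.
shifted = word_input[1:]
print(shifted.tag.test_value.shape)  # (99, 16)
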