Example #1
    def add_recurrent_state(self, size):
        """Adds a recurrent state variable and returns its index.

        Used by recurrent layers to add a state variable that has to be passed
        from one time step to the next, when generating text or computing
        lattice probabilities.

        :type size: int
        :param size: size of the state vector

        :rtype: int
        :returns: index of the new recurrent state variable
        """

        index = len(self.recurrent_state_size)
        assert index == len(self.recurrent_state_input)

        # The variables are in the structure of a mini-batch (3-dimensional
        # array) to keep the layer functions general.
        variable = tensor.tensor3('network/recurrent_state_' + str(index),
                                  dtype=theano.config.floatX)
        variable.tag.test_value = test_value(size=(1, 4, size), high=1.0)

        self.recurrent_state_size.append(size)
        self.recurrent_state_input.append(variable)

        return index
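A minimal sketch of how a recurrent layer might use this method, assuming a hypothetical LSTM-style layer that registers its cell and hidden states when its structure is created (the layer and attribute names are illustrative, not from the original code):

    # Inside a hypothetical recurrent layer; self._network is the Network
    # instance that provides add_recurrent_state().
    def create_structure(self):
        # Register one state variable per recurrent signal and remember the
        # returned indices.
        self._cell_state_index = self._network.add_recurrent_state(self.output_size)
        self._hidden_state_index = self._network.add_recurrent_state(self.output_size)

        # During single time step processing, the states from the previous
        # step arrive through the network's recurrent_state_input list.
        cell_input = self._network.recurrent_state_input[self._cell_state_index]
        hidden_input = self._network.recurrent_state_input[self._hidden_state_index]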
Example #2
    def __init__(self,
                 network,
                 use_shortlist=True,
                 exclude_unk=False,
                 profile=False):
        """Creates two Theano function, ``self._target_logprobs_function()``,
        which computes the log probabilities predicted by the neural network for
        the words in a mini-batch, and ``self._total_logprob_function()``, which
        returns the total log probability.

        Both functions take as arguments four matrices:

        1. Word IDs in the shape of a mini-batch. The functions will only use
           the input words (not the last time step).
        2. Class IDs in the shape of a mini-batch. The functions will slice this
           into input and output.
        3. Class membership probabilities in the shape of a mini-batch, but only
           for the output words (not the first time step).
        4. Mask in the shape of a mini-batch, but only for the output words (not
           for the first time step).

        ``self._target_logprobs_function()`` will return a matrix of predicted
        log probabilities for the output words (excluding the first time step)
        and the mask. ``<unk>`` tokens are also masked out if ``exclude_unk`` is
        set to ``True``. ``self._total_logprob_function()`` will return the
        total log probability of the predicted (unmasked) words and the number
        of those words.

        :type network: Network
        :param network: the neural network object

        :type use_shortlist: bool
        :param use_shortlist: if ``True``, the ``<unk>`` probability is
                              distributed among the out-of-shortlist words

        :type exclude_unk: bool
        :param exclude_unk: if set to ``True``, ``<unk>`` tokens are excluded
                            from probability computation

        :type profile: bool
        :param profile: if set to ``True``, creates a Theano profile object
        """

        self._vocabulary = network.vocabulary
        self._unk_id = self._vocabulary.word_to_id['<unk>']

        # The functions take as input a mini-batch of word IDs and class IDs,
        # and slice input and target IDs for the network.
        batch_word_ids = tensor.matrix('textscorer/batch_word_ids',
                                       dtype='int64')
        batch_word_ids.tag.test_value = test_value(
            size=(21, 4), high=self._vocabulary.num_words())
        batch_class_ids = tensor.matrix('textscorer/batch_class_ids',
                                        dtype='int64')
        batch_class_ids.tag.test_value = test_value(
            size=(21, 4), high=self._vocabulary.num_classes())

        all_class_ids = tensor.vector('textscorer/all_class_ids',
                                      dtype='int64')
        all_class_ids.tag.test_value = test_value(
            size=(21, ), high=self._vocabulary.num_classes())

        membership_probs = tensor.matrix('textscorer/membership_probs',
                                         dtype=theano.config.floatX)
        membership_probs.tag.test_value = test_value(size=(20, 4), high=1.0)

        membership_probs_output_vec = tensor.tensor3(
            'textscorer/membership_probs_output_vec',
            dtype=theano.config.floatX)
        membership_probs_output_vec.tag.test_value = test_value(
            size=(20, 4, 5), high=1.0)

        k = tensor.scalar('textscorer/k', dtype='int64')
        k.tag.test_value = 4

        # Convert out-of-shortlist words to <unk> in input.
        shortlist_size = self._vocabulary.num_shortlist_words()
        input_word_ids = batch_word_ids[:-1]
        oos_indices = tensor.ge(input_word_ids, shortlist_size).nonzero()
        input_word_ids = tensor.set_subtensor(input_word_ids[oos_indices],
                                              self._unk_id)
        # Out-of-shortlist words are already in <unk> class, because they don't
        # have own classes.
        input_class_ids = batch_class_ids[:-1]
        target_class_ids = batch_class_ids[1:]
        # Target word IDs are not used by the network. We need them to compute
        # probabilities for out-of-shortlist words.
        target_word_ids = batch_word_ids[1:]

        logprobs = tensor.log(network.target_probs())
        logprobs_output_vec = tensor.log(network.output_probs())
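        # Note that Python's != on two symbolic variables compares object
        # identity rather than values, so this condition is always true and
        # the class slicing below is always applied.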
        if (logprobs_output_vec.shape[2] !=
                membership_probs_output_vec.shape[2]):
            logprobs_output_vec = logprobs_output_vec[:, :, all_class_ids]

        # Add logprobs from the class membership of the predicted word.
        logprobs += tensor.log(membership_probs)
        logprobs_output_vec += tensor.log(membership_probs_output_vec)

        mask = network.mask
        if use_shortlist and network.oos_logprobs is not None:
            # The probability of out-of-shortlist words (which is the <unk>
            # probability) is multiplied by the fraction of the actual word
            # within the set of OOS words.
            logprobs += network.oos_logprobs[target_word_ids]
            logprobs_output_vec += tensor.tile(
                network.oos_logprobs, (logprobs_output_vec.shape[0],
                                       logprobs_output_vec.shape[1], 1))

            # Always exclude OOV words when using a shortlist; no probability
            # mass is left for them.
            mask *= tensor.neq(target_word_ids, self._unk_id)
        elif exclude_unk:
            # If requested, ignore OOS and OOV probabilities.
            mask *= tensor.neq(target_word_ids, self._unk_id)
            mask *= tensor.lt(target_word_ids, shortlist_size)

        # Ignore unused input variables, because is_training is only used by
        # the dropout layer.
        masked_logprobs = logprobs * tensor.cast(mask, theano.config.floatX)
        self._target_logprobs_function = theano.function(
            [batch_word_ids, batch_class_ids, membership_probs, network.mask],
            [masked_logprobs, mask],
            givens=[(network.input_word_ids, input_word_ids),
                    (network.input_class_ids, input_class_ids),
                    (network.target_class_ids, target_class_ids),
                    (network.is_training, numpy.int8(0))],
            name='target_logprobs',
            on_unused_input='ignore',
            profile=profile)

        # Ignore unused input variables, because is_training is only used by
        # the dropout layer.
        mask_output_vec = mask.reshape([mask.shape[0], mask.shape[1], 1])
        masked_logprobs_output_vec = logprobs_output_vec * tensor.cast(
            mask_output_vec, theano.config.floatX)
        self._output_vec_logprobs_function = theano.function(
            [
                batch_word_ids, batch_class_ids, all_class_ids,
                membership_probs_output_vec, network.mask
            ], [masked_logprobs_output_vec, mask],
            givens=[(network.input_word_ids, input_word_ids),
                    (network.input_class_ids, input_class_ids),
                    (network.target_class_ids, target_class_ids),
                    (network.is_training, numpy.int8(0))],
            name='output_logprobs',
            on_unused_input='ignore',
            profile=profile)

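        # argsort() sorts in ascending order, so the last k indices along the
        # class axis correspond to the k highest masked log probabilities.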
        top_k = tensor.argsort(masked_logprobs_output_vec, axis=2)[:, :, -k:]
        self._output_top_k_indices_function = theano.function(
            [
                batch_word_ids, batch_class_ids, all_class_ids,
                membership_probs_output_vec, network.mask, k
            ], [masked_logprobs_output_vec, top_k, mask],
            givens=[(network.input_word_ids, input_word_ids),
                    (network.input_class_ids, input_class_ids),
                    (network.target_class_ids, target_class_ids),
                    (network.is_training, numpy.int8(0))],
            name='topk_indices',
            on_unused_input='ignore',
            profile=profile)

        # If some word is not in the training data, its class membership
        # probability will be zero. We want to ignore those words. Multiplying
        # by the mask is not possible, because those logprobs will be -inf.
        mask *= tensor.neq(membership_probs, 0.0)
        masked_logprobs = tensor.switch(mask, logprobs, 0.0)
        self._total_logprob_function = theano.function(
            [batch_word_ids, batch_class_ids, membership_probs, network.mask],
            [masked_logprobs.sum(), mask.sum()],
            givens=[(network.input_word_ids, input_word_ids),
                    (network.input_class_ids, input_class_ids),
                    (network.target_class_ids, target_class_ids),
                    (network.is_training, numpy.int8(0))],
            name='total_logprob',
            on_unused_input='ignore',
            profile=profile)

        # These are updated by score_line().
        self.num_words = 0
        self.num_unks = 0
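A hedged usage sketch of the two documented scoring functions, assuming ``scorer`` is an instance of this class and mini-batches of 21 time steps and 4 sequences (giving 20 output positions); the array contents are dummies and a float32 ``floatX`` is assumed:

    import numpy

    batch_word_ids = numpy.zeros((21, 4), dtype='int64')
    batch_class_ids = numpy.zeros((21, 4), dtype='int64')
    membership_probs = numpy.ones((20, 4), dtype='float32')
    mask = numpy.ones((20, 4), dtype='int8')

    # Log probabilities of the output words and the mask that tells which of
    # them are valid.
    logprobs, out_mask = scorer._target_logprobs_function(
        batch_word_ids, batch_class_ids, membership_probs, mask)

    # Total log probability of the unmasked words and their count, e.g. for
    # perplexity computation.
    total_logprob, num_words = scorer._total_logprob_function(
        batch_word_ids, batch_class_ids, membership_probs, mask)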
Example #3
    def __init__(self,
                 architecture,
                 vocabulary,
                 class_prior_probs=None,
                 mode=None,
                 exclude_unk=False,
                 default_device=None,
                 profile=False):
        """Initializes the neural network parameters for all layers, and
        creates Theano shared variables from them.

        :type architecture: Architecture
        :param architecture: an object that describes the network architecture

        :type vocabulary: Vocabulary
        :param vocabulary: mapping between word IDs and word classes

        :type class_prior_probs: numpy.ndarray
        :param class_prior_probs: empirical (unigram) distribution of the output
                                  classes (only required for training)

        :type mode: Network.Mode
        :param mode: selects mini-batch or single time step processing

        :type exclude_unk: bool
        :param exclude_unk: if set to ``True``, set ``<unk>`` probability to
                            zero before normalizing the network outputs
                            (required to get exact normalization during
                            inference)

        :type default_device: str
        :param default_device: default device for storing the shared variables

        :type profile: bool
        :param profile: if set to ``True``, creates a Theano profile object
        """

        self.vocabulary = vocabulary
        self.architecture = architecture
        self.mode = self.Mode() if mode is None else mode

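        # M1 and M2 appear to be the two moduli of the MRG31k3p generator
        # behind Theano's RandomStreams (2^31 - 1 and 2147462579); the
        # generator takes a seed of six integers.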
        M1 = 2147483647
        M2 = 2147462579
        random_seed = [
            numpy.random.randint(0, M1),
            numpy.random.randint(0, M1),
            numpy.random.randint(1, M1),
            numpy.random.randint(0, M2),
            numpy.random.randint(0, M2),
            numpy.random.randint(1, M2)
        ]
        self.random = RandomStreams(random_seed)

        # Word and class inputs will be available to NetworkInput layers.
        self.input_word_ids = tensor.matrix('network/input_word_ids',
                                            dtype='int64')
        self.input_class_ids = tensor.matrix('network/input_class_ids',
                                             dtype='int64')
        if self.mode.minibatch:
            self.input_word_ids.tag.test_value = test_value(
                size=(20, 4), high=vocabulary.num_shortlist_words())
            self.input_class_ids.tag.test_value = test_value(
                size=(20, 4), high=vocabulary.num_classes())
        else:
            self.input_word_ids.tag.test_value = test_value(
                size=(1, 4), high=vocabulary.num_shortlist_words())
            self.input_class_ids.tag.test_value = test_value(
                size=(1, 4), high=vocabulary.num_classes())

        # Should the output layer set the <unk> probability to zero? In that
        # case OOVs are not counted when computing perplexity.
        self.exclude_unk = exclude_unk

        # Default device for shared variables.
        self._default_device = default_device

        # During training, the output layer bias vector is initialized to the
        # unigram probabilities.
        self.class_prior_probs = class_prior_probs

        # Indicates whether the network is in training mode (used e.g. to
        # decide whether layers should be frozen).
        self.training = True
        logging.info("Setting the neural network to training mode.")

        # A shortlist model adds these logprobs to OOS logprobs predicted by the
        # network.
        if vocabulary.has_unigram_probs():
            oos_logprobs = numpy.log(vocabulary.get_oos_probs())
            oos_logprobs = oos_logprobs.astype(theano.config.floatX)
            if self._default_device is None:
                self.oos_logprobs = theano.shared(oos_logprobs,
                                                  'network/oos_logprobs')
            else:
                self.oos_logprobs = theano.shared(oos_logprobs,
                                                  'network/oos_logprobs',
                                                  target=self._default_device)
        else:
            self.oos_logprobs = None

        # Recurrent layers will create these lists, used to initialize state
        # variables of appropriate sizes, for doing forward passes one step at a
        # time.
        self.recurrent_state_input = []
        self.recurrent_state_size = []

        # Create the layers.
        logging.debug("Creating layers.")
        self.layers = OrderedDict()
        for input_options in architecture.inputs:
            network_input = NetworkInput(input_options, self)
            self.layers[network_input.name] = network_input
        for layer_description in architecture.layers:
            layer_options = self._layer_options_from_description(
                layer_description)
            if layer_options['name'] == architecture.output_layer:
                layer_options['size'] = vocabulary.num_classes()
            # 'devices' not in layer_options is for backward compatibility.
            # Remove at some point.
            if ('devices'
                    not in layer_options) or (not layer_options['devices']):
                layer_options['devices'] = [default_device]
            layer = create_layer(layer_options, self, profile=profile)
            self.layers[layer.name] = layer
        self.output_layer = self.layers[architecture.output_layer]
        num_params = sum(layer.num_params() for layer in self.layers.values())
        logging.debug("Total number of model parameters: %d", num_params)

        # This list will be filled by the recurrent layers to contain the
        # recurrent state outputs, for doing forward passes one step at a time.
        self.recurrent_state_output = [None] * len(self.recurrent_state_size)

        # This input variable can be used to specify the classes whose
        # probabilities will be computed, instead of the whole distribution.
        self.target_class_ids = tensor.matrix('network/target_class_ids',
                                              dtype='int64')
        if self.mode.minibatch:
            self.target_class_ids.tag.test_value = test_value(
                size=(20, 4), high=vocabulary.num_classes())
        else:
            self.target_class_ids.tag.test_value = test_value(
                size=(1, 4), high=vocabulary.num_classes())

        # This input variable is used only for detecting <unk> target words.
        self.target_word_ids = tensor.matrix('network/target_word_ids',
                                             dtype='int64')
        if self.mode.minibatch:
            self.target_word_ids.tag.test_value = test_value(
                size=(20, 4), high=vocabulary.num_shortlist_words())
        else:
            self.target_word_ids.tag.test_value = test_value(
                size=(1, 4), high=vocabulary.num_shortlist_words())

        # mask is used to mask out the rest of the input matrix, when a sequence
        # is shorter than the maximum sequence length. The mask is kept as int8
        # data type, which is how Theano stores booleans.
        if self.mode.minibatch:
            self.mask = tensor.matrix('network/mask', dtype='int8')
            self.mask.tag.test_value = test_value(size=(20, 4), high=True)
        else:
            self.mask = tensor.ones(self.input_word_ids.shape, dtype='int8')

        # Dropout layer needs to know whether we are training or evaluating.
        self.is_training = tensor.scalar('network/is_training', dtype='int8')
        self.is_training.tag.test_value = 1

        # num_noise_samples tells sampling based methods how many noise classes
        # to sample.
        self.num_noise_samples = tensor.scalar('network/num_noise_samples',
                                               dtype='int64')
        self.num_noise_samples.tag.test_value = 3
        self.noise_distribution = None

        for layer in self.layers.values():
            layer.create_structure()
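To make the role of ``oos_logprobs`` concrete, here is a small NumPy sketch of the shortlist idea. It assumes ``get_oos_probs()`` returns, for every word, its unigram fraction within the total out-of-shortlist probability mass (shortlist words getting fraction 1); this interpretation is inferred from how the text scorer adds these log probabilities, not taken verbatim from the original code:

    import numpy

    # Unigram counts for a toy 5-word vocabulary; words 3 and 4 are outside
    # the shortlist.
    counts = numpy.array([40.0, 30.0, 20.0, 6.0, 4.0])
    shortlist_size = 3

    # Each out-of-shortlist word's share of the total OOS probability mass.
    oos_probs = numpy.ones_like(counts)
    oos_probs[shortlist_size:] = (counts[shortlist_size:] /
                                  counts[shortlist_size:].sum())
    oos_logprobs = numpy.log(oos_probs)

    # Adding oos_logprobs[target_word_id] to a predicted log probability is a
    # no-op for shortlist words (log 1 = 0), while an OOS word receives the
    # network's <unk> log probability plus the log of its share.
    unk_logprob = numpy.log(0.1)  # hypothetical network prediction for <unk>
    oos_word_logprob = unk_logprob + oos_logprobs[4]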
Example #4
    def __init__(self,
                 optimization_options,
                 network,
                 cost_function,
                 profile=False):
        """Creates Theano functions for training a neural network language
        model.

        The subclass constructor is expected to create the optimizer parameters
        in ``self._params``. This constructor will then create the function
        ``self.update_function``, which first updates the optimizer parameters
        and then the model state, given the gradients, the optimizer
        parameters, and the learning rate.

        The update function takes as arguments four matrices and the alpha
        hyperparameter:

        1. Word IDs in the shape of a mini-batch. The functions will slice this
           into input and output.
        2. Class IDs in the shape of a mini-batch. The functions will slice this
           into input and output.
        3. Mask in the shape of a mini-batch, but only for the output words (not
           for the first time step).
        4. Weights in the shape of a mini-batch, but only for the output words
           (not for the first time step).
        5. Alpha, the learning rate, which scales the size of the update.

        :type optimization_options: dict
        :param optimization_options: a dictionary of optimization options

        :type network: Network
        :param network: the neural network object

        :type cost_function: Cost
        :param cost_function: an object from one of the cost function classes
                              that defines the training objective

        :type profile: bool
        :param profile: if set to ``True``, creates a Theano profile object
        """

        self.network = network

        float_type = numpy.dtype(theano.config.floatX).type
        self.float_type = float_type

        try:
            # numerical stability / smoothing term to prevent divide-by-zero
            self._epsilon = float_type(optimization_options['epsilon'])
            # learning rate / step size
            self.learning_rate = float_type(
                optimization_options['learning_rate'])
            # weights for training files
            self._weights = optimization_options['weights']
            # maximum norm for parameter updates
            self._max_gradient_norm = float_type(
                optimization_options['max_gradient_norm'])
            # number of noise samples for sampling based output
            num_noise_samples = optimization_options['num_noise_samples']
            # noise sample sharing for sampling based output
            noise_sharing = optimization_options['noise_sharing']
        except KeyError as e:
            raise ValueError(
                "Option {} is missing from optimization options.".format(e))

        self._unk_id = self.network.vocabulary.word_to_id['<unk>']

        # The function takes as inputs a mini-batch of word IDs and class IDs,
        # and slices the input and target IDs for the network.
        batch_word_ids = tensor.matrix('optimizer/batch_word_ids',
                                       dtype='int64')
        batch_word_ids.tag.test_value = test_value(
            size=(101, 16), high=self.network.vocabulary.num_shortlist_words())
        batch_class_ids = tensor.matrix('optimizer/batch_class_ids',
                                        dtype='int64')
        batch_class_ids.tag.test_value = test_value(
            size=(101, 16), high=self.network.vocabulary.num_classes())

        # Derive the symbolic expression for updating the gradient with regard
        # to each parameter.
        cost, num_words = cost_function.get_tensor()
        self._gradients = \
            tensor.grad(cost, wrt=list(self.network.get_variables().values()))

        # The function takes as input the learning rate.
        alpha = tensor.scalar('optimizer/alpha', dtype=theano.config.floatX)
        alpha.tag.test_value = 0.1

        # The function takes as input a matrix of weights, one for each
        # target word. These are used to scale the parameter updates.
        weights = tensor.matrix('optimizer/weights',
                                dtype=theano.config.floatX)
        weights.tag.test_value = test_value(size=(100, 16), high=1.0)
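        # The effective learning rate is alpha scaled by the mean weight of
        # the target words in the mini-batch; when the batch contains no
        # words, plain alpha is used to avoid dividing by zero.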
        word_positions = tensor.eq(self.network.mask, 1).nonzero()
        weight = weights[word_positions].sum()
        num_words_float = tensor.cast(num_words, theano.config.floatX)
        modified_alpha = tensor.switch(tensor.gt(num_words, 0),
                                       alpha * weight / num_words_float, alpha)

        # Ignore unused input, because is_training is only used by dropout
        # layer.
        self.update_function = theano.function(
            [
                batch_word_ids, batch_class_ids, self.network.mask, weights,
                alpha
            ], [cost, num_words],
            givens=[(network.input_word_ids, batch_word_ids[:-1]),
                    (network.input_class_ids, batch_class_ids[:-1]),
                    (network.target_word_ids, batch_word_ids[1:]),
                    (network.target_class_ids, batch_class_ids[1:]),
                    (self.network.is_training, numpy.int8(1)),
                    (self.network.num_noise_samples,
                     numpy.int64(num_noise_samples))],
            updates=self._get_param_updates(alpha),
            name='update_function',
            on_unused_input='ignore',
            profile=profile)
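The subclass contract described in the docstring could be satisfied with a minimal SGD sketch along these lines, assuming the base class above is called ``BasicOptimizer`` (its actual name is not shown in this excerpt):

    class SGDOptimizer(BasicOptimizer):
        """Plain stochastic gradient descent; a sketch, not the original code."""

        def __init__(self, optimization_options, network, cost_function,
                     profile=False):
            # Plain SGD keeps no per-parameter optimizer state, but e.g. Adam
            # would allocate moment accumulators here.
            self._params = {}
            super().__init__(optimization_options, network, cost_function,
                             profile=profile)

        def _get_param_updates(self, alpha):
            # One (variable, new value) pair per model parameter: a step in
            # the direction opposite to the gradient, scaled by alpha.
            params = self.network.get_variables().values()
            return [(param, param - alpha * gradient)
                    for param, gradient in zip(params, self._gradients)]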