def add_recurrent_state(self, size):
    """Adds a recurrent state variable and returns its index.

    Used by recurrent layers to add a state variable that has to be passed
    from one time step to the next, when generating text or computing
    lattice probabilities.

    :type size: int
    :param size: size of the state vector

    :rtype: int
    :returns: index of the new recurrent state variable
    """

    index = len(self.recurrent_state_size)
    assert index == len(self.recurrent_state_input)

    # The variables are in the structure of a mini-batch (3-dimensional
    # array) to keep the layer functions general.
    variable = tensor.tensor3('network/recurrent_state_' + str(index),
                              dtype=theano.config.floatX)
    variable.tag.test_value = test_value(size=(1, 4, size), high=1.0)

    self.recurrent_state_size.append(size)
    self.recurrent_state_input.append(variable)

    return index
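# A minimal usage sketch (not part of the original source): how a recurrent
# layer might reserve a state slot with add_recurrent_state() and later read
# the corresponding symbolic input variable. The RecurrentLayerSketch class
# and its attributes are hypothetical; only add_recurrent_state() and the
# recurrent_state_input list come from the method above.
class RecurrentLayerSketch:
    def __init__(self, network, state_size):
        # Reserve a slot for the hidden state that is carried from one time
        # step to the next during single-step forward passes.
        self._network = network
        self._state_index = network.add_recurrent_state(state_size)

    def state_input(self):
        # Symbolic 3-dimensional variable (time steps x sequences x size)
        # created for this slot by the network.
        return self._network.recurrent_state_input[self._state_index]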
def __init__(self, network, use_shortlist=True, exclude_unk=False,
             profile=False):
    """Creates two Theano functions, ``self._target_logprobs_function()``,
    which computes the log probabilities predicted by the neural network
    for the words in a mini-batch, and ``self._total_logprob_function()``,
    which returns the total log probability.

    Both functions take as arguments four matrices:

    1. Word IDs in the shape of a mini-batch. The functions will only use
       the input words (not the last time step).
    2. Class IDs in the shape of a mini-batch. The functions will slice
       this into input and output.
    3. Class membership probabilities in the shape of a mini-batch, but
       only for the output words (not the first time step).
    4. Mask in the shape of a mini-batch, but only for the output words
       (not for the first time step).

    ``self._target_logprobs_function()`` will return a matrix of predicted
    log probabilities for the output words (excluding the first time step)
    and the mask. ``<unk>`` tokens are also masked out if ``exclude_unk``
    is set to ``True``. ``self._total_logprob_function()`` will return the
    total log probability of the predicted (unmasked) words and the number
    of those words.

    :type network: Network
    :param network: the neural network object

    :type use_shortlist: bool
    :param use_shortlist: if ``True``, the ``<unk>`` probability is
                          distributed among the out-of-shortlist words

    :type exclude_unk: bool
    :param exclude_unk: if set to ``True``, ``<unk>`` tokens are excluded
                        from probability computation

    :type profile: bool
    :param profile: if set to ``True``, creates a Theano profile object
    """

    self._vocabulary = network.vocabulary
    self._unk_id = self._vocabulary.word_to_id['<unk>']

    # The functions take as input a mini-batch of word IDs and class IDs,
    # and slice input and target IDs for the network.
    batch_word_ids = tensor.matrix('textscorer/batch_word_ids',
                                   dtype='int64')
    batch_word_ids.tag.test_value = test_value(
        size=(21, 4), high=self._vocabulary.num_words())
    batch_class_ids = tensor.matrix('textscorer/batch_class_ids',
                                    dtype='int64')
    batch_class_ids.tag.test_value = test_value(
        size=(21, 4), high=self._vocabulary.num_classes())
    all_class_ids = tensor.vector('textscorer/all_class_ids',
                                  dtype='int64')
    all_class_ids.tag.test_value = test_value(
        size=(21,), high=self._vocabulary.num_classes())
    membership_probs = tensor.matrix('textscorer/membership_probs',
                                     dtype=theano.config.floatX)
    membership_probs.tag.test_value = test_value(size=(20, 4), high=1.0)
    membership_probs_output_vec = tensor.tensor3(
        'textscorer/membership_probs_output_vec',
        dtype=theano.config.floatX)
    membership_probs_output_vec.tag.test_value = test_value(
        size=(20, 4, 5), high=1.0)
    k = tensor.scalar('textscorer/k', dtype='int64')
    k.tag.test_value = 4

    # Convert out-of-shortlist words to <unk> in the input.
    shortlist_size = self._vocabulary.num_shortlist_words()
    input_word_ids = batch_word_ids[:-1]
    oos_indices = tensor.ge(input_word_ids, shortlist_size).nonzero()
    input_word_ids = tensor.set_subtensor(input_word_ids[oos_indices],
                                          self._unk_id)
    # Out-of-shortlist words are already in the <unk> class, because they
    # don't have classes of their own.
    input_class_ids = batch_class_ids[:-1]
    target_class_ids = batch_class_ids[1:]
    # Target word IDs are not used by the network. We need them to compute
    # probabilities for out-of-shortlist words.
    target_word_ids = batch_word_ids[1:]

    logprobs = tensor.log(network.target_probs())
    logprobs_output_vec = tensor.log(network.output_probs())
    if (logprobs_output_vec.shape[2] !=
            membership_probs_output_vec.shape[2]):
        logprobs_output_vec = logprobs_output_vec[:, :, all_class_ids]
    # Add logprobs from the class membership of the predicted word.
    logprobs += tensor.log(membership_probs)
    logprobs_output_vec += tensor.log(membership_probs_output_vec)

    mask = network.mask
    if use_shortlist and network.oos_logprobs is not None:
        # The probability of out-of-shortlist words (which is the <unk>
        # probability) is multiplied by the fraction of the actual word
        # within the set of OOS words.
        logprobs += network.oos_logprobs[target_word_ids]
        logprobs_output_vec += tensor.tile(
            network.oos_logprobs,
            (logprobs_output_vec.shape[0],
             logprobs_output_vec.shape[1],
             1))
        # Always exclude OOV words when using a shortlist - no probability
        # mass is left for them.
        mask *= tensor.neq(target_word_ids, self._unk_id)
    elif exclude_unk:
        # If requested, ignore OOS and OOV probabilities.
        mask *= tensor.neq(target_word_ids, self._unk_id)
        mask *= tensor.lt(target_word_ids, shortlist_size)

    # Ignore unused input variables, because is_training is only used by
    # the dropout layer.
    masked_logprobs = logprobs * tensor.cast(mask, theano.config.floatX)
    self._target_logprobs_function = theano.function(
        [batch_word_ids, batch_class_ids, membership_probs, network.mask],
        [masked_logprobs, mask],
        givens=[(network.input_word_ids, input_word_ids),
                (network.input_class_ids, input_class_ids),
                (network.target_class_ids, target_class_ids),
                (network.is_training, numpy.int8(0))],
        name='target_logprobs',
        on_unused_input='ignore',
        profile=profile)

    # Ignore unused input variables, because is_training is only used by
    # the dropout layer.
    mask_output_vec = mask.reshape([mask.shape[0], mask.shape[1], 1])
    masked_logprobs_output_vec = logprobs_output_vec * tensor.cast(
        mask_output_vec, theano.config.floatX)
    self._output_vec_logprobs_function = theano.function(
        [batch_word_ids, batch_class_ids, all_class_ids,
         membership_probs_output_vec, network.mask],
        [masked_logprobs_output_vec, mask],
        givens=[(network.input_word_ids, input_word_ids),
                (network.input_class_ids, input_class_ids),
                (network.target_class_ids, target_class_ids),
                (network.is_training, numpy.int8(0))],
        name='output_logprobs',
        on_unused_input='ignore',
        profile=profile)

    top_k = tensor.argsort(masked_logprobs_output_vec, axis=2)[:, :, -k:]
    self._output_top_k_indices_function = theano.function(
        [batch_word_ids, batch_class_ids, all_class_ids,
         membership_probs_output_vec, network.mask, k],
        [masked_logprobs_output_vec, top_k, mask],
        givens=[(network.input_word_ids, input_word_ids),
                (network.input_class_ids, input_class_ids),
                (network.target_class_ids, target_class_ids),
                (network.is_training, numpy.int8(0))],
        name='topk_indices',
        on_unused_input='ignore',
        profile=profile)

    # If some word is not in the training data, its class membership
    # probability will be zero. We want to ignore those words. Multiplying
    # by the mask is not possible, because those logprobs will be -inf.
    mask *= tensor.neq(membership_probs, 0.0)
    masked_logprobs = tensor.switch(mask, logprobs, 0.0)
    self._total_logprob_function = theano.function(
        [batch_word_ids, batch_class_ids, membership_probs, network.mask],
        [masked_logprobs.sum(), mask.sum()],
        givens=[(network.input_word_ids, input_word_ids),
                (network.input_class_ids, input_class_ids),
                (network.target_class_ids, target_class_ids),
                (network.is_training, numpy.int8(0))],
        name='total_logprob',
        on_unused_input='ignore',
        profile=profile)

    # These are updated by score_line().
    self.num_words = 0
    self.num_unks = 0
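# Hedged usage sketch (not part of the original source): how the Theano
# functions compiled by the constructor above might be called on one
# mini-batch. The `scorer` argument is assumed to be a constructed TextScorer
# instance, and the four numpy arrays are assumed to be shaped like the test
# values above (word and class IDs with one extra time step compared to the
# membership probabilities and the mask).
def score_batch_sketch(scorer, batch_word_ids, batch_class_ids,
                       membership_probs, mask):
    # Per-word log probabilities for the output words, together with the
    # mask that marks which of them were actually scored.
    logprobs, out_mask = scorer._target_logprobs_function(
        batch_word_ids, batch_class_ids, membership_probs, mask)
    # Sum of the unmasked log probabilities and the number of scored words.
    total_logprob, num_scored = scorer._total_logprob_function(
        batch_word_ids, batch_class_ids, membership_probs, mask)
    return logprobs, out_mask, total_logprob, num_scored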
def __init__(self, architecture, vocabulary, class_prior_probs=None,
             mode=None, exclude_unk=False, default_device=None,
             profile=False):
    """Initializes the neural network parameters for all layers, and
    creates Theano shared variables from them.

    :type architecture: Architecture
    :param architecture: an object that describes the network architecture

    :type vocabulary: Vocabulary
    :param vocabulary: mapping between word IDs and word classes

    :type class_prior_probs: numpy.ndarray
    :param class_prior_probs: empirical (unigram) distribution of the
                              output classes (only required for training)

    :type mode: Network.Mode
    :param mode: selects mini-batch or single time step processing

    :type exclude_unk: bool
    :param exclude_unk: if set to ``True``, set ``<unk>`` probability to
                        zero before normalizing the network outputs
                        (required to get exact normalization during
                        inference)

    :type default_device: str
    :param default_device: default device where to store the shared
                           variables

    :type profile: bool
    :param profile: if set to ``True``, creates a Theano profile object
    """

    self.vocabulary = vocabulary
    self.architecture = architecture
    self.mode = self.Mode() if mode is None else mode

    M1 = 2147483647
    M2 = 2147462579
    random_seed = [
        numpy.random.randint(0, M1),
        numpy.random.randint(0, M1),
        numpy.random.randint(1, M1),
        numpy.random.randint(0, M2),
        numpy.random.randint(0, M2),
        numpy.random.randint(1, M2)]
    self.random = RandomStreams(random_seed)

    # Word and class inputs will be available to NetworkInput layers.
    self.input_word_ids = tensor.matrix('network/input_word_ids',
                                        dtype='int64')
    self.input_class_ids = tensor.matrix('network/input_class_ids',
                                         dtype='int64')
    if self.mode.minibatch:
        self.input_word_ids.tag.test_value = test_value(
            size=(20, 4), high=vocabulary.num_shortlist_words())
        self.input_class_ids.tag.test_value = test_value(
            size=(20, 4), high=vocabulary.num_classes())
    else:
        self.input_word_ids.tag.test_value = test_value(
            size=(1, 4), high=vocabulary.num_shortlist_words())
        self.input_class_ids.tag.test_value = test_value(
            size=(1, 4), high=vocabulary.num_classes())

    # Should the output layer set the <unk> probability to zero? In that
    # case OOVs are not counted when computing perplexity.
    self.exclude_unk = exclude_unk

    # Default device for shared variables.
    self._default_device = default_device

    # During training, the output layer bias vector is initialized to the
    # unigram probabilities.
    self.class_prior_probs = class_prior_probs

    # Flag that tells whether the network is being trained, e.g. so that
    # layers can be frozen during training.
    self.training = True
    logging.info("Setting the neural network to training mode.")

    # A shortlist model adds these logprobs to the OOS logprobs predicted
    # by the network.
    if vocabulary.has_unigram_probs():
        oos_logprobs = numpy.log(vocabulary.get_oos_probs())
        oos_logprobs = oos_logprobs.astype(theano.config.floatX)
        if self._default_device is None:
            self.oos_logprobs = theano.shared(oos_logprobs,
                                              'network/oos_logprobs')
        else:
            self.oos_logprobs = theano.shared(oos_logprobs,
                                              'network/oos_logprobs',
                                              target=self._default_device)
    else:
        self.oos_logprobs = None

    # Recurrent layers will create these lists, used to initialize state
    # variables of appropriate sizes, for doing forward passes one step at
    # a time.
    self.recurrent_state_input = []
    self.recurrent_state_size = []

    # Create the layers.
logging.debug("Creating layers.") self.layers = OrderedDict() for input_options in architecture.inputs: network_input = NetworkInput(input_options, self) self.layers[network_input.name] = network_input for layer_description in architecture.layers: layer_options = self._layer_options_from_description( layer_description) if layer_options['name'] == architecture.output_layer: layer_options['size'] = vocabulary.num_classes() # 'devices' not in layer_options is for backward compatibility. # Remove at some point. if ('devices' not in layer_options) or (not layer_options['devices']): layer_options['devices'] = [default_device] layer = create_layer(layer_options, self, profile=profile) self.layers[layer.name] = layer self.output_layer = self.layers[architecture.output_layer] num_params = sum(layer.num_params() for layer in self.layers.values()) logging.debug("Total number of model parameters: %d", num_params) # This list will be filled by the recurrent layers to contain the # recurrent state outputs, for doing forward passes one step at a time. self.recurrent_state_output = [None] * len(self.recurrent_state_size) # This input variable can be used to specify the classes whose # probabilities will be computed, instead of the whole distribution. self.target_class_ids = tensor.matrix('network/target_class_ids', dtype='int64') if self.mode.minibatch: self.target_class_ids.tag.test_value = test_value( size=(20, 4), high=vocabulary.num_classes()) else: self.target_class_ids.tag.test_value = test_value( size=(1, 4), high=vocabulary.num_classes()) # This input variable is used only for detecting <unk> target words. self.target_word_ids = tensor.matrix('network/target_word_ids', dtype='int64') if self.mode.minibatch: self.target_word_ids.tag.test_value = test_value( size=(20, 4), high=vocabulary.num_shortlist_words()) else: self.target_word_ids.tag.test_value = test_value( size=(1, 4), high=vocabulary.num_shortlist_words()) # mask is used to mask out the rest of the input matrix, when a sequence # is shorter than the maximum sequence length. The mask is kept as int8 # data type, which is how Tensor stores booleans. if self.mode.minibatch: self.mask = tensor.matrix('network/mask', dtype='int8') self.mask.tag.test_value = test_value(size=(20, 4), high=True) else: self.mask = tensor.ones(self.input_word_ids.shape, dtype='int8') # Dropout layer needs to know whether we are training or evaluating. self.is_training = tensor.scalar('network/is_training', dtype='int8') self.is_training.tag.test_value = 1 # num_noise_samples tells sampling based methods how many noise classes # to sample. self.num_noise_samples = tensor.scalar('network/num_noise_samples', dtype='int64') self.num_noise_samples.tag.test_value = 3 self.noise_distribution = None for layer in self.layers.values(): layer.create_structure()
def __init__(self, optimization_options, network, cost_function,
             profile=False):
    """Creates Theano functions for training a neural network language
    model.

    The subclass constructor is expected to create the optimizer
    parameters in ``self._params``. This constructor will then create a
    function ``self.update_function``, which updates the optimizer
    parameters, and then the model state given the gradients, the
    optimizer parameters, and the learning rate.

    The update function takes as arguments four matrices and the alpha
    hyperparameter:

    1. Word IDs in the shape of a mini-batch. The function will slice this
       into input and output.
    2. Class IDs in the shape of a mini-batch. The function will slice
       this into input and output.
    3. Mask in the shape of a mini-batch, but only for the output words
       (not for the first time step).
    4. Weights in the shape of a mini-batch, but only for the output words
       (not for the first time step).
    5. Alpha, the learning rate used to scale the size of the update.

    :type optimization_options: dict
    :param optimization_options: a dictionary of optimization options

    :type network: Network
    :param network: the neural network object

    :type cost_function: Cost
    :param cost_function: an object from one of the cost function classes
                          that define the training objective

    :type profile: bool
    :param profile: if set to ``True``, creates a Theano profile object
    """

    self.network = network

    float_type = numpy.dtype(theano.config.floatX).type
    self.float_type = float_type

    try:
        # numerical stability / smoothing term to prevent divide-by-zero
        self._epsilon = float_type(optimization_options['epsilon'])
        # learning rate / step size
        self.learning_rate = float_type(
            optimization_options['learning_rate'])
        # weights for training files
        self._weights = optimization_options['weights']
        # maximum norm for parameter updates
        self._max_gradient_norm = float_type(
            optimization_options['max_gradient_norm'])
        # number of noise samples for sampling based output
        num_noise_samples = optimization_options['num_noise_samples']
        # noise sample sharing for sampling based output
        noise_sharing = optimization_options['noise_sharing']
    except KeyError as e:
        raise ValueError(
            "Option {} is missing from optimization options.".format(e))

    self._unk_id = self.network.vocabulary.word_to_id['<unk>']

    # The function takes as inputs a mini-batch of word IDs and class IDs,
    # and slices the input and target IDs for the network.
    batch_word_ids = tensor.matrix('optimizer/batch_word_ids',
                                   dtype='int64')
    batch_word_ids.tag.test_value = test_value(
        size=(101, 16),
        high=self.network.vocabulary.num_shortlist_words())
    batch_class_ids = tensor.matrix('optimizer/batch_class_ids',
                                    dtype='int64')
    batch_class_ids.tag.test_value = test_value(
        size=(101, 16), high=self.network.vocabulary.num_classes())

    # Derive the symbolic expression for updating the gradient with regard
    # to each parameter.
    cost, num_words = cost_function.get_tensor()
    self._gradients = \
        tensor.grad(cost, wrt=list(self.network.get_variables().values()))

    # The function takes as input the learning rate.
    alpha = tensor.scalar('optimizer/alpha', dtype=theano.config.floatX)
    alpha.tag.test_value = 0.1

    # The function takes as input a matrix of weights, one for each target
    # word. These are used to scale the parameter updates.
    weights = tensor.matrix('optimizer/weights',
                            dtype=theano.config.floatX)
    weights.tag.test_value = test_value(size=(100, 16), high=1.0)
    word_positions = tensor.eq(self.network.mask, 1).nonzero()
    weight = weights[word_positions].sum()
    num_words_float = tensor.cast(num_words, theano.config.floatX)
    modified_alpha = tensor.switch(tensor.gt(num_words, 0),
                                   alpha * weight / num_words_float,
                                   alpha)

    # Ignore unused input, because is_training is only used by dropout
    # layer.
    self.update_function = theano.function(
        [batch_word_ids, batch_class_ids, self.network.mask, weights,
         alpha],
        [cost, num_words],
        givens=[(network.input_word_ids, batch_word_ids[:-1]),
                (network.input_class_ids, batch_class_ids[:-1]),
                (network.target_word_ids, batch_word_ids[1:]),
                (network.target_class_ids, batch_class_ids[1:]),
                (self.network.is_training, numpy.int8(1)),
                (self.network.num_noise_samples,
                 numpy.int64(num_noise_samples))],
        updates=self._get_param_updates(alpha),
        name='update_function',
        on_unused_input='ignore',
        profile=profile)
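# Hedged training-step sketch (not part of the original source): how a
# trainer might call the compiled update function above. The `optimizer`
# object and the four numpy batch arrays are assumptions for illustration;
# the argument order (word IDs, class IDs, mask, weights, alpha) matches the
# input list of `self.update_function` above.
def training_step_sketch(optimizer, batch_word_ids, batch_class_ids, mask,
                         weights):
    # Performs one parameter update and returns the mini-batch cost and the
    # number of target words that contributed to it.
    cost, num_words = optimizer.update_function(batch_word_ids,
                                                batch_class_ids,
                                                mask,
                                                weights,
                                                optimizer.learning_rate)
    return cost, num_words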