# Imports used by the methods below. The import path of NumberError is an
# assumption; adjust it to wherever the package defines its exceptions.
import logging
from time import time

import h5py
import numpy
import theano

from theanolm.exceptions import NumberError


def compute_perplexity(self, batch_iter):
    """Computes the perplexity of text read using the given iterator.

    ``batch_iter`` is an iterator to the input data. On each call it creates
    two 2-dimensional matrices, both indexed by time step and sequence. The
    first matrix contains the word IDs, the second one masks out elements
    past the sequence ends.

    :type batch_iter: BatchIterator
    :param batch_iter: an iterator that creates mini-batches from the input
                       data

    :rtype: float
    :returns: perplexity, i.e. exponent of negative log probability
              normalized by the number of words
    """

    logprob = 0
    num_words = 0
    for word_ids, _, mask in batch_iter:
        class_ids, membership_probs = \
            self._vocabulary.get_class_memberships(word_ids)
        membership_probs = membership_probs.astype(theano.config.floatX)

        # total_logprob_function() uses the word and class IDs of the entire
        # mini-batch, but membership probs and mask are only for the output.
        batch_logprob, batch_num_words = \
            self._total_logprob_function(word_ids,
                                         class_ids,
                                         membership_probs[1:],
                                         mask[1:])
        if numpy.isnan(batch_logprob):
            raise NumberError("Log probability of a mini-batch is NaN.")
        if numpy.isinf(batch_logprob):
            raise NumberError(
                "Log probability of a mini-batch is +/- infinity.")

        logprob += batch_logprob
        num_words += batch_num_words

    if num_words == 0:
        raise ValueError("Zero words for computing perplexity. Does the "
                         "evaluation data contain only OOV words?")
    cross_entropy = -logprob / num_words
    return numpy.exp(cross_entropy)
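# Standalone sketch of the arithmetic above (made-up numbers, not part of the
# original code): perplexity is the exponent of the negative log probability
# normalized by the number of words.
def _perplexity_example():
    word_logprobs = numpy.array([-2.3, -1.1, -4.0, -0.7])
    cross_entropy = -word_logprobs.sum() / len(word_logprobs)
    return numpy.exp(cross_entropy)  # ~7.6 for these values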
def score_batch(self, word_ids, class_ids, membership_probs, mask):
    """Computes the log probabilities predicted by the neural network for
    the words in a mini-batch.

    Indices in the resulting list of lists will be a transpose of those of
    the input matrices, so that the first index is the sequence, not the
    time step.

    :type word_ids: numpy.ndarray of an integer type
    :param word_ids: a 2-dimensional matrix, indexed by time step and
                     sequence, that contains the word IDs

    :type class_ids: numpy.ndarray of an integer type
    :param class_ids: a 2-dimensional matrix, indexed by time step and
                      sequence, that contains the class IDs

    :type membership_probs: numpy.ndarray of a floating point type
    :param membership_probs: a 2-dimensional matrix, indexed by time step
                             and sequence, that contains the class
                             membership probabilities of the words

    :type mask: numpy.ndarray of a floating point type
    :param mask: a 2-dimensional matrix, indexed by time step and sequence,
                 that masks out elements past the sequence ends

    :rtype: list of lists
    :returns: logprob of each word in each sequence
    """

    result = []

    # A matrix of neural network logprobs of each word in each sequence.
    logprobs = self.score_function(word_ids, class_ids, mask)
    # Add logprobs from the class membership of the predicted word at each
    # time step of each sequence.
    logprobs += numpy.log(membership_probs[1:])
    # If requested, predict <unk> with constant score.
    if self.unk_penalty is not None:
        logprobs[word_ids[1:] == self.unk_id] = self.unk_penalty
    # Ignore logprobs predicting a word that is past the sequence end, and
    # possibly also those that are predicting the <unk> token.
    if self.ignore_unk:
        mask = numpy.copy(mask)
        mask[word_ids == self.unk_id] = 0
    for seq_index in range(logprobs.shape[1]):
        seq_logprobs = logprobs[:, seq_index]
        seq_mask = mask[1:, seq_index]
        seq_logprobs = seq_logprobs[seq_mask == 1]
        if numpy.isnan(seq_logprobs.sum()):
            raise NumberError("Sequence logprob has NaN value.")
        result.append(seq_logprobs)
    return result
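# Standalone sketch (made-up data, not part of the original code) of the
# masking loop above: a time-by-sequence logprob matrix is split into
# variable-length arrays, one per sequence, using a 0/1 mask.
def _masking_example():
    logprobs = numpy.array([[-1.0, -2.0],
                            [-0.5, -3.0],
                            [-0.2, -9.9]])
    # The second sequence ends one time step earlier.
    mask = numpy.array([[1, 1],
                        [1, 1],
                        [1, 0]])
    result = []
    for seq_index in range(logprobs.shape[1]):
        seq_logprobs = logprobs[:, seq_index]
        result.append(seq_logprobs[mask[:, seq_index] == 1])
    return result  # [array([-1. , -0.5, -0.2]), array([-2., -3.])]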
def update_minibatch(self, word_ids, class_ids, file_ids, mask):
    """Optimizes the neural network parameters using the given inputs and
    learning rate.

    :type word_ids: numpy.ndarray of an integer type
    :param word_ids: a 2-dimensional matrix, indexed by time step and
                     sequence, that contains the word IDs

    :type class_ids: numpy.ndarray of an integer type
    :param class_ids: a 2-dimensional matrix, indexed by time step and
                      sequence, that contains the class IDs

    :type file_ids: numpy.ndarray of an integer type
    :param file_ids: a 2-dimensional matrix, indexed by time step and
                     sequence, that identifies the file in case of multiple
                     training files

    :type mask: numpy.ndarray of a floating point type
    :param mask: a 2-dimensional matrix, indexed by time step and sequence,
                 that masks out elements past the sequence ends
    """

    update_start_time = time()

    # We should predict probabilities of the words at the following time
    # step.
    input_word_ids = word_ids[:-1]
    input_class_ids = class_ids[:-1]
    target_word_ids = word_ids[1:]
    target_class_ids = class_ids[1:]
    mask = mask[1:]

    self.update_cost = self.gradient_update_function(
        input_word_ids, input_class_ids, target_word_ids, target_class_ids,
        mask)
    if numpy.isnan(self.update_cost) or numpy.isinf(self.update_cost):
        raise NumberError("Mini-batch cost computation resulted in a "
                          "numerical error.")

    alpha = self.learning_rate
    if self.ignore_unk:
        # This is NumPy code, so the comparison has to be a NumPy operation.
        # (The original called the symbolic tensor.neq() on an undefined
        # unk_id, which would raise a NameError at run time.)
        mask = mask * numpy.not_equal(target_word_ids, self.unk_id)
    num_words = numpy.count_nonzero(mask)
    float_type = numpy.dtype(theano.config.floatX).type
    if num_words > 0:
        file_ids = file_ids[:-1]
        weights = self._weights[file_ids]
        alpha *= weights[mask == 1].sum() / float_type(num_words)
    self.model_update_function(alpha)

    self.update_duration = time() - update_start_time
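# Standalone sketch (made-up IDs, not part of the original code) of the time
# shift at the start of update_minibatch(): the network reads the words up to
# time step t and is trained to predict the word at t + 1.
def _time_shift_example():
    word_ids = numpy.array([[1], [7], [7], [2]])  # one sequence, one column
    input_word_ids = word_ids[:-1]   # read by the network: [[1], [7], [7]]
    target_word_ids = word_ids[1:]   # to be predicted:     [[7], [7], [2]]
    return input_word_ids, target_word_ids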
def score_sequence(self, word_ids, class_ids, membership_probs):
    """Computes the log probability of a word sequence.

    :type word_ids: ndarray
    :param word_ids: a vector of word IDs

    :type class_ids: list of ints
    :param class_ids: corresponding class IDs

    :type membership_probs: list of floats
    :param membership_probs: list of class membership probabilities

    :rtype: float
    :returns: log probability of the word sequence
    """

    # Create 2-dimensional matrices representing the transposes of the
    # vectors.
    word_ids = numpy.transpose(word_ids[numpy.newaxis])
    class_ids = numpy.array([[x] for x in class_ids], numpy.int64)
    membership_probs = numpy.array(
        [[x] for x in membership_probs]).astype(theano.config.floatX)
    # Mask used by the network is all ones.
    mask = numpy.ones(word_ids.shape, numpy.int8)

    # total_logprob_function() uses the word and class IDs of the entire
    # mini-batch, but membership probs and mask are only for the output.
    logprob, _ = self._total_logprob_function(word_ids,
                                              class_ids,
                                              membership_probs[1:],
                                              mask[1:])
    if numpy.isnan(logprob):
        raise NumberError("Log probability of a sequence is NaN.")
    if numpy.isinf(logprob):
        raise NumberError("Log probability of a sequence is +/- infinity.")

    return logprob
def score_sequence(self, word_ids, class_ids, membership_probs):
    """Computes the log probability of a word sequence.

    :type word_ids: ndarray
    :param word_ids: a vector of word IDs

    :type class_ids: list of ints
    :param class_ids: corresponding class IDs

    :type membership_probs: list of floats
    :param membership_probs: list of class membership probabilities

    :rtype: float
    :returns: log probability of the word sequence
    """

    # Create 2-dimensional matrices representing the transposes of the
    # vectors.
    word_ids = numpy.transpose(word_ids[numpy.newaxis])
    class_ids = numpy.array([[x] for x in class_ids], numpy.int64)
    membership_probs = numpy.array(
        [[x] for x in membership_probs]).astype(theano.config.floatX)
    # Mask used by the network is all ones.
    mask = numpy.ones(word_ids.shape, numpy.int8)

    logprobs = self.score_function(word_ids, class_ids, mask)
    # Add logprobs from the class membership of the predicted word at each
    # time step of each sequence.
    logprobs += numpy.log(membership_probs[1:])
    # If requested, predict <unk> with constant score.
    if self.unk_penalty is not None:
        logprobs[word_ids[1:] == self.unk_id] = self.unk_penalty
    # If requested, zero out logprobs predicting the <unk> token.
    if self.ignore_unk:
        logprobs[word_ids[1:] == self.unk_id] = 0

    logprob = logprobs.sum()
    if numpy.isnan(logprob):
        raise NumberError("Sequence logprob has NaN value.")
    return logprob
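# Standalone sketch (made-up IDs, not part of the original code) of the
# reshaping used by both score_sequence() variants above: a 1-dimensional
# vector becomes a single-column matrix, i.e. a mini-batch of one sequence.
def _column_vector_example():
    word_ids = numpy.array([1, 7, 7, 2])
    # numpy.newaxis inserts a length-one axis; the transpose makes the
    # result shape (4, 1): four time steps, one sequence.
    return numpy.transpose(word_ids[numpy.newaxis])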
def train(self):
    """Trains the model until the stopping criterion says to stop."""

    while self.stopper.start_new_epoch():
        for word_ids, file_ids, mask in self.training_iter:
            self.update_number += 1
            self.total_updates += 1

            # Map the word IDs of the mini-batch to class IDs.
            class_ids = self.vocabulary.word_id_to_class_id[word_ids]
            self.optimizer.update_minibatch(word_ids, class_ids, file_ids,
                                            mask)

            if (self.log_update_interval >= 1) and \
               (self.total_updates % self.log_update_interval == 0):
                self._log_update()

            if self._is_scheduled(self.options['validation_frequency']):
                perplexity = self.scorer.compute_perplexity(
                    self.validation_iter)
                if numpy.isnan(perplexity) or numpy.isinf(perplexity):
                    raise NumberError(
                        "Validation set perplexity computation resulted "
                        "in a numerical error.")
            else:
                perplexity = None
            self._validate(perplexity)

            if not self.stopper.start_new_minibatch():
                break

        message = "Finished training epoch {}.".format(self.epoch_number)
        best_cost = self.candidate_cost()
        if best_cost is not None:
            message += " Best validation perplexity {:.2f}.".format(
                best_cost)
        print(message)

        self.epoch_number += 1
        self.update_number = 0

    print("Training finished.")
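# Standalone sketch (made-up mapping, not part of the original code) of the
# class lookup in the training loop: indexing a vocabulary-sized array with a
# matrix of word IDs yields a same-shaped matrix of class IDs (NumPy fancy
# indexing).
def _class_lookup_example():
    word_id_to_class_id = numpy.array([0, 1, 1, 2, 0])  # 5-word vocabulary
    word_ids = numpy.array([[1, 3],
                            [4, 2]])
    return word_id_to_class_id[word_ids]  # [[1, 2], [0, 1]]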
def _validate(self):
    """If at or just before the actual validation point, computes
    perplexity and adds it to the list of samples.

    At the actual validation point we have ``self._samples_per_validation``
    values and combine them using ``self._statistic_function``. If the
    model performance has improved, the state at the center of the
    validation samples will be saved using ``self._set_candidate_state()``.
    """

    if self._validation_iter is None:
        return  # Validation has not been configured.

    if not self._is_scheduled(self._options['validation_frequency'],
                              self._samples_per_validation - 1):
        return  # We don't have to validate now.

    perplexity = self._scorer.compute_perplexity(self._validation_iter)
    if numpy.isnan(perplexity) or numpy.isinf(perplexity):
        raise NumberError("Validation set perplexity computation resulted "
                          "in a numerical error.")
    self._local_perplexities.append(perplexity)
    if len(self._local_perplexities) == 1:
        logging.debug("[%d] First validation sample, perplexity %.2f.",
                      self.update_number, perplexity)

    # The rest of the function will be executed only at and after the
    # center of the sampling points.
    if not self._is_scheduled(self._options['validation_frequency'],
                              self._samples_per_validation // 2):
        return

    # The first sampling point within samples_per_validation / 2 of the
    # actual validation point is the center of the sampling points. This
    # state will be saved in case the model performance has improved.
    if self._validation_state is None:
        logging.debug("[%d] Center of validation, perplexity %.2f.",
                      self.update_number, perplexity)
        # Mode 'a' (read/write/create) matches the old h5py default and is
        # required by newer h5py versions.
        self._validation_state = h5py.File('validation-state', 'a',
                                           driver='core',
                                           backing_store=False)
        self.get_state(self._validation_state)

    # The rest of the function will be executed only at the final sampling
    # point.
    if not self._is_scheduled(self._options['validation_frequency']):
        return

    logging.debug("[%d] Last validation sample, perplexity %.2f.",
                  self.update_number, perplexity)

    if len(self._local_perplexities) < self._samples_per_validation:
        # After restoring a previous validation state, which is at the
        # center of the sampling points, the trainer will collect again
        # half of the samples. Don't take that as a validation.
        logging.debug("[%d] Only %d samples collected. Ignoring this "
                      "validation.",
                      self.update_number, len(self._local_perplexities))
        self._local_perplexities = []
        self._validation_state.close()
        self._validation_state = None
        return

    statistic = self._statistic_function(self._local_perplexities)
    self._cost_history = numpy.append(self._cost_history, statistic)
    if self._has_improved():
        # Take the state saved at the center of the validation samples and
        # replace its cost history with the current cost history, which
        # also includes this latest statistic.
        h5_cost_history = self._validation_state['trainer/cost_history']
        h5_cost_history.resize(self._cost_history.shape)
        h5_cost_history[:] = self._cost_history
        self._set_candidate_state(self._validation_state)

    self._log_validation()

    if (self._options['patience'] >= 0) and \
       (self.validations_since_candidate() > self._options['patience']):
        # Too many validations without finding a new candidate state. If
        # any validations have been done, the best state has been found
        # and saved. If training has been started from a previous state,
        # _candidate_state has been set to the initial state.
        assert self._candidate_state is not None
        self._decrease_learning_rate()

    self._local_perplexities = []
    self._validation_state.close()
    self._validation_state = None
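# Standalone sketch (hypothetical numbers; the choice of median is an
# assumption, not confirmed by the code above) of how the collected samples
# could be combined by _statistic_function and compared to the cost history.
def _validation_statistic_example():
    local_perplexities = [112.4, 110.9, 111.7]
    statistic = numpy.median(local_perplexities)  # 111.7
    cost_history = numpy.append(numpy.array([120.3, 115.2]), statistic)
    # A simplistic improvement test: the newest statistic is the best so far.
    return cost_history[-1] < cost_history[:-1].min()  # True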