Example #1
    def compute_perplexity(self, batch_iter):
        """Computes the perplexity of text read using the given iterator.

        ``batch_iter`` is an iterator to the input data. On each call it
        creates two 2-dimensional matrices, both indexed by time step and
        sequence. The first matrix contains the word IDs, and the second one
        masks out elements past the sequence ends.

        :type batch_iter: BatchIterator
        :param batch_iter: an iterator that creates mini-batches from the input
                           data

        :rtype: float
        :returns: perplexity, i.e. the exponential of the negative log
                  probability normalized by the number of words
        """

        logprob = 0
        num_words = 0

        for word_ids, _, mask in batch_iter:
            class_ids, membership_probs = \
                self._vocabulary.get_class_memberships(word_ids)
            membership_probs = membership_probs.astype(theano.config.floatX)

            # total_logprob_function() uses the word and class IDs of the entire
            # mini-batch, but membership probs and mask are only for the output.
            batch_logprob, batch_num_words = \
                self._total_logprob_function(word_ids,
                                             class_ids,
                                             membership_probs[1:],
                                             mask[1:])
            if numpy.isnan(batch_logprob):
                raise NumberError("Log probability of a mini-batch is NaN.")
            if numpy.isinf(batch_logprob):
                raise NumberError(
                    "Log probability of a mini-batch is +/- infinity.")

            logprob += batch_logprob
            num_words += batch_num_words

        if num_words == 0:
            raise ValueError("Zero words for computing perplexity. Does the "
                             "evaluation data contain only OOV words?")
        cross_entropy = -logprob / num_words
        return numpy.exp(cross_entropy)
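
As a standalone sanity check of the formula at the end of this method (toy numbers, independent of the class above), the returned perplexity is the exponential of the average negative log probability per word:

import numpy

# Made-up per-word natural-log probabilities of a 4-word evaluation text.
word_logprobs = numpy.array([-2.1, -0.7, -3.3, -1.5])

logprob = word_logprobs.sum()           # total log probability, as accumulated above
num_words = len(word_logprobs)          # number of unmasked words
cross_entropy = -logprob / num_words    # average negative log probability per word
perplexity = numpy.exp(cross_entropy)   # what compute_perplexity() returns
print(perplexity)                       # roughly 6.7 for these numbers
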
Example #2
    def score_batch(self, word_ids, class_ids, membership_probs, mask):
        """Computes the log probabilities predicted by the neural network for
        the words in a mini-batch.

        Indices in the resulting list of lists are a transpose of those of the
        input matrices, so that the first index is the sequence, not the time
        step.

        :type word_ids: numpy.ndarray of an integer type
        :param word_ids: a 2-dimensional matrix, indexed by time step and
                         sequence, that contains the word IDs

        :type class_ids: numpy.ndarray of an integer type
        :param class_ids: a 2-dimensional matrix, indexed by time step and
                          sequence, that contains the class IDs

        :type membership_probs: numpy.ndarray of a floating point type
        :param membership_probs: a 2-dimensional matrix, indexed by time step
                                 and sequence, that contains the class
                                 membership probabilities of the words

        :type mask: numpy.ndarray of a floating point type
        :param mask: a 2-dimensional matrix, indexed by time step and sequence,
                     that masks out elements past the sequence ends

        :rtype: list of lists
        :returns: logprob of each word in each sequence
        """

        result = []

        # A matrix of neural network logprobs of each word in each sequence.
        logprobs = self.score_function(word_ids, class_ids, mask)
        # Add logprobs from the class membership of the predicted word at each
        # time step of each sequence.
        logprobs += numpy.log(membership_probs[1:])
        # If requested, predict <unk> with a constant score.
        if self.unk_penalty is not None:
            logprobs[word_ids[1:] == self.unk_id] = self.unk_penalty
        # Ignore logprobs that predict a word past the sequence end, and
        # possibly also those that predict the <unk> token.
        if self.ignore_unk:
            mask = numpy.copy(mask)
            mask[word_ids == self.unk_id] = 0
        for seq_index in range(logprobs.shape[1]):
            seq_logprobs = logprobs[:, seq_index]
            seq_mask = mask[1:, seq_index]
            seq_logprobs = seq_logprobs[seq_mask == 1]
            if numpy.isnan(sum(seq_logprobs)):
                raise NumberError("Sequence logprob has NaN value.")
            result.append(seq_logprobs)

        return result
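
To make the transpose and masking described in the docstring concrete, here is a self-contained numpy sketch with toy data (the network call is left out; only the slicing logic of the loop above is reproduced):

import numpy

# Toy logprob matrix indexed by [time step, sequence]: 3 output steps, 2 sequences.
logprobs = numpy.array([[-1.0, -2.0],
                        [-1.5, -2.5],
                        [-0.5, -3.0]])
# Mask for the output steps; the second sequence ends one step earlier.
mask = numpy.array([[1, 1],
                    [1, 1],
                    [1, 0]], numpy.int8)

result = []
for seq_index in range(logprobs.shape[1]):
    seq_logprobs = logprobs[:, seq_index]               # one column = one sequence
    seq_logprobs = seq_logprobs[mask[:, seq_index] == 1]
    result.append(seq_logprobs)

print(result[0])  # [-1.  -1.5 -0.5] -- all three steps of sequence 0
print(result[1])  # [-2.  -2.5]      -- masked step dropped from sequence 1
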
Example #3
    def update_minibatch(self, word_ids, class_ids, file_ids, mask):
        """Optimizes the neural network parameters using the given inputs and
        learning rate.

        :type word_ids: ndarray of ints
        :param word_ids: a 2-dimensional matrix, indexed by time step and
                         sequence, that contains the word IDs

        :type class_ids: ndarray of ints
        :param class_ids: a 2-dimensional matrix, indexed by time step and
                          sequence, that contains the class IDs

        :type file_ids: ndarray of ints
        :param file_ids: a 2-dimensional matrix, indexed by time step and
                         sequence, that identifies the file in case of multiple
                         training files

        :type mask: numpy.ndarray of a floating point type
        :param mask: a 2-dimensional matrix, indexed by time step and sequence,
                     that masks out elements past the sequence ends.
        """

        update_start_time = time()

        # We should predict probabilities of the words at the following time
        # step.
        input_word_ids = word_ids[:-1]
        input_class_ids = class_ids[:-1]
        target_word_ids = word_ids[1:]
        target_class_ids = class_ids[1:]
        mask = mask[1:]
        self.update_cost = self.gradient_update_function(
            input_word_ids, input_class_ids, target_word_ids, target_class_ids,
            mask)
        if numpy.isnan(self.update_cost) or numpy.isinf(self.update_cost):
            raise NumberError("Mini-batch cost computation resulted in a "
                              "numerical error.")

        alpha = self.learning_rate
        if self.ignore_unk:
            # Zero out positions where the target word is <unk>. (This assumes
            # the optimizer stores the <unk> word ID as ``self.unk_id``.)
            mask *= numpy.not_equal(target_word_ids, self.unk_id)
        num_words = numpy.count_nonzero(mask)
        float_type = numpy.dtype(theano.config.floatX).type
        if num_words > 0:
            file_ids = file_ids[:-1]
            weights = self._weights[file_ids]
            alpha *= weights[mask == 1].sum() / float_type(num_words)
        self.model_update_function(alpha)

        self.update_duration = time() - update_start_time
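
The slicing at the start of the method implements the usual one-step shift between network inputs and prediction targets. A minimal toy illustration (data made up, independent of the optimizer above):

import numpy

# Toy word ID matrix indexed by [time step, sequence]: 4 time steps, 2 sequences.
word_ids = numpy.array([[1, 1],    # sentence start
                        [7, 4],
                        [9, 2],
                        [5, 5]])   # sentence end

input_word_ids = word_ids[:-1]   # steps 0..2 are fed to the network
target_word_ids = word_ids[1:]   # steps 1..3 are what it should predict

# At every position, the target is the word one time step ahead of the input.
assert (input_word_ids[1:] == target_word_ids[:-1]).all()
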
Example #4
    def score_sequence(self, word_ids, class_ids, membership_probs):
        """Computes the log probability of a word sequence.

        :type word_ids: ndarray
        :param word_ids: a vector of word IDs

        :type class_ids: list of ints
        :param class_ids: corresponding class IDs

        :type membership_probs: list of floats
        :param membership_probs: list of class membership probabilities

        :rtype: float
        :returns: log probability of the word sequence
        """

        # Create 2-dimensional matrices representing the transposes of the
        # vectors.
        word_ids = numpy.transpose(word_ids[numpy.newaxis])
        class_ids = numpy.array([[x] for x in class_ids], numpy.int64)
        membership_probs = numpy.array([[x] for x in membership_probs
                                        ]).astype(theano.config.floatX)
        # Mask used by the network is all ones.
        mask = numpy.ones(word_ids.shape, numpy.int8)

        # total_logprob_function() uses the word and class IDs of the entire
        # mini-batch, but membership probs and mask are only for the output.
        logprob, _ = self._total_logprob_function(word_ids, class_ids,
                                                  membership_probs[1:],
                                                  mask[1:])
        if numpy.isnan(logprob):
            raise NumberError("Log probability of a sequence is NaN.")
        if numpy.isinf(logprob):
            raise NumberError("Log probability of a sequence is +/- infinity.")

        return logprob
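
The reshaping at the top of this method only turns each input vector into a single-column matrix, so the sequence dimension has size one. A self-contained numpy sketch of the same transformation:

import numpy

word_ids = numpy.array([1, 7, 9, 5])              # a single sequence as a vector
column = numpy.transpose(word_ids[numpy.newaxis])
print(column.shape)   # (4, 1): indexed by time step and (one) sequence
print(column[:, 0])   # [1 7 9 5]: the original vector is the only column
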
Example #5
    def score_sequence(self, word_ids, class_ids, membership_probs):
        """Computes the log probability of a word sequence.

        :type word_ids: ndarray
        :param word_ids: a vector of word IDs

        :type class_ids: list of ints
        :param class_ids: corresponding class IDs

        :type membership_probs: list of floats
        :param membership_probs: list of class membership probabilities

        :rtype: float
        :returns: log probability of the sentence
        """

        # Create 2-dimensional matrices representing the transposes of the
        # vectors.
        word_ids = numpy.transpose(word_ids[numpy.newaxis])
        class_ids = numpy.array([[x] for x in class_ids], numpy.int64)
        membership_probs = numpy.array([[x] for x in membership_probs
                                        ]).astype(theano.config.floatX)
        # Mask used by the network is all ones.
        mask = numpy.ones(word_ids.shape, numpy.int8)

        logprobs = self.score_function(word_ids, class_ids, mask)
        # Add logprobs from the class membership of the predicted word at each
        # time step of each sequence.
        logprobs += numpy.log(membership_probs[1:])
        # If requested, predict <unk> with a constant score.
        if self.unk_penalty is not None:
            logprobs[word_ids[1:] == self.unk_id] = self.unk_penalty
        # If requested, zero out logprobs that predict the <unk> token.
        if self.ignore_unk:
            logprobs[word_ids[1:] == self.unk_id] = 0

        logprob = logprobs.sum()
        if numpy.isnan(logprob):
            raise NumberError("Sentence logprob has NaN value.")
        return logprob
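
Adding `numpy.log(membership_probs[1:])` reflects the class-based factorization P(w | h) = P(class(w) | h) * P(w | class(w)); the word logprob is the class logprob plus the membership logprob. A toy numeric check (made-up probabilities):

import numpy

class_prob = 0.2        # P(class(w) | history), as predicted by the network
membership_prob = 0.5   # P(w | class(w)), the class membership probability

word_logprob = numpy.log(class_prob) + numpy.log(membership_prob)
assert numpy.isclose(word_logprob, numpy.log(class_prob * membership_prob))
print(word_logprob)     # log(0.1), about -2.303
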
Example #6
    def train(self):
        while self.stopper.start_new_epoch():
            for word_ids, file_ids, mask in self.training_iter:
                self.update_number += 1
                self.total_updates += 1

                class_ids = self.vocabulary.word_id_to_class_id[word_ids]
                self.optimizer.update_minibatch(word_ids, class_ids, file_ids,
                                                mask)

                if (self.log_update_interval >= 1) and \
                   (self.total_updates % self.log_update_interval == 0):
                    self._log_update()

                if self._is_scheduled(self.options['validation_frequency']):
                    perplexity = self.scorer.compute_perplexity(
                        self.validation_iter)
                    if numpy.isnan(perplexity) or numpy.isinf(perplexity):
                        raise NumberError(
                            "Validation set perplexity computation resulted "
                            "in a numerical error.")
                else:
                    perplexity = None
                self._validate(perplexity)

                if not self.stopper.start_new_minibatch():
                    break

            message = "Finished training epoch {}.".format(self.epoch_number)
            best_cost = self.candidate_cost()
            if best_cost is not None:
                message += " Best validation perplexity {:.2f}.".format(
                    best_cost)
            print(message)

            self.epoch_number += 1
            self.update_number = 0

        print("Training finished.")
Example #7
    def _validate(self):
        """If at or just before the actual validation point, computes perplexity
        and adds to the list of samples. At the actual validation point we have
        `self._samples_per_validation` values and combine them using
        `self._statistic_function`. If the model performance has improved, the
        state at the center of the validation samples will be saved using
        `self._set_candidate_state()`.

        :type perplexity: float
        :param perplexity: computed perplexity at a validation point, None
                           elsewhere
        """

        if self._validation_iter is None:
            return  # Validation has not been configured.

        if not self._is_scheduled(self._options['validation_frequency'],
                                  self._samples_per_validation - 1):
            return  # We don't have to validate now.

        perplexity = self._scorer.compute_perplexity(self._validation_iter)
        if numpy.isnan(perplexity) or numpy.isinf(perplexity):
            raise NumberError("Validation set perplexity computation resulted "
                              "in a numerical error.")

        self._local_perplexities.append(perplexity)
        if len(self._local_perplexities) == 1:
            logging.debug("[%d] First validation sample, perplexity %.2f.",
                          self.update_number, perplexity)

        # The rest of the function will be executed only at and after the center
        # of sampling points.
        if not self._is_scheduled(self._options['validation_frequency'],
                                  self._samples_per_validation // 2):
            return

        # The first sampling point within samples_per_validation / 2 of the
        # actual validation point is the center of the sampling points. This
        # will be saved in case the model performance has improved.
        if self._validation_state is None:
            logging.debug("[%d] Center of validation, perplexity %.2f.",
                          self.update_number, perplexity)
            self._validation_state = h5py.File(name='validation-state',
                                               driver='core',
                                               backing_store=False)
            self.get_state(self._validation_state)

        # The rest of the function will be executed only at the final sampling
        # point.
        if not self._is_scheduled(self._options['validation_frequency']):
            return
        logging.debug("[%d] Last validation sample, perplexity %.2f.",
                      self.update_number, perplexity)

        if len(self._local_perplexities) < self._samples_per_validation:
            # After restoring a previous validation state, which is at the
            # center of the sampling points, the trainer will collect again half
            # of the samples. Don't take that as a validation.
            logging.debug(
                "[%d] Only %d samples collected. Ignoring this "
                "validation.", self.update_number,
                len(self._local_perplexities))
            self._local_perplexities = []
            self._validation_state.close()
            self._validation_state = None
            return

        statistic = self._statistic_function(self._local_perplexities)
        self._cost_history = numpy.append(self._cost_history, statistic)
        if self._has_improved():
            # Take the state at the actual validation point and replace the cost
            # history with the current cost history that also includes this
            # latest statistic.
            h5_cost_history = self._validation_state['trainer/cost_history']
            h5_cost_history.resize(self._cost_history.shape)
            h5_cost_history[:] = self._cost_history
            self._set_candidate_state(self._validation_state)

        self._log_validation()

        if (self._options['patience'] >= 0) and \
           (self.validations_since_candidate() > self._options['patience']):
            # Too many validations without finding a new candidate state.

            # If any validations have been done, the best state has been found
            # and saved. If training has been started from previous state,
            # _candidate_state has been set to the initial state.
            assert self._candidate_state is not None

            self._decrease_learning_rate()

        self._local_perplexities = []
        self._validation_state.close()
        self._validation_state = None
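
The validation state above lives in an in-memory HDF5 file (`driver='core'`, `backing_store=False`), and the cost history is kept in a resizable dataset so it can grow with each validation. A minimal sketch of that pattern with h5py (the dataset name mirrors the one used above; the values are made up):

import h5py
import numpy

# An in-memory HDF5 file: nothing is written to disk.
state = h5py.File('validation-state', 'w', driver='core', backing_store=False)

# A resizable 1-dimensional dataset for the cost history.
state.create_dataset('trainer/cost_history', shape=(0,), maxshape=(None,),
                     dtype='float64')

cost_history = numpy.array([120.5, 112.3, 109.8])
h5_cost_history = state['trainer/cost_history']
h5_cost_history.resize(cost_history.shape)
h5_cost_history[:] = cost_history

print(h5_cost_history[:])  # [120.5 112.3 109.8]
state.close()
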