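# Imports assumed for this listing (a best-effort reconstruction, not part of
# the original snippet). The project-specific pieces -- LSTMReadDefinitions,
# MeanPoolReadDefinitions, MeanPoolCombiner, DebugLSTM, RetrievalOp,
# WordToIdOp, WordToCountOp, Perplexity, EMBEDDINGS, unk_ratio, flip01,
# flip12 and masked_root_mean_square -- live in the repository's own modules,
# whose import paths are not shown here.
from collections import OrderedDict

import theano
from theano import tensor
from theano.gradient import disconnected_grad

from blocks.bricks import (Initializable, Linear, MLP, NDimensionalSoftmax,
                           Rectifier)
from blocks.bricks.base import application
from blocks.bricks.lookup import LookupTable
from blocks.bricks.recurrent import LSTM, Bidirectional
from blocks.initialization import Constant, IsotropicGaussian
from blocks.roles import add_role
from blocks.select import Selector
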
class ExtractiveQAModel(Initializable):
    """The dictionary-equipped extractive QA model.

    Parameters
    ----------
    dim : int
        The default dimensionality for the components.
    emd_dim : int
        The dimensionality for the embeddings. If 0, `dim` is used.
    coattention : bool
        Use the coattention mechanism.
    num_input_words : int
        The number of input words. If 0, `vocab.size()` is used.
        The vocabulary object.
    use_definitions : bool
        Triggers the use of definitions.
    reuse_word_embeddings : bool
    compose_type : str

    """
    def __init__(self, dim, emb_dim, readout_dims, num_input_words,
                 def_num_input_words, vocab, use_definitions, def_word_gating,
                 compose_type, coattention, def_reader, reuse_word_embeddings,
                 random_unk, **kwargs):
        self._vocab = vocab
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        self._coattention = coattention
        self._num_input_words = num_input_words
        self._use_definitions = use_definitions
        self._random_unk = random_unk
        self._reuse_word_embeddings = reuse_word_embeddings

        lookup_num_words = num_input_words
        if reuse_word_embeddings:
            lookup_num_words = max(num_input_words, def_num_input_words)
        if random_unk:
            lookup_num_words = vocab.size()

        # Dima: we can have slightly less copy-paste here if we
        # copy the RecurrentFromFork class from my other projects.
        children = []
        self._lookup = LookupTable(lookup_num_words, emb_dim)
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._question_transform = Linear(dim, dim, name='question_transform')
        self._bidir_fork = Linear(3 * dim if coattention else 2 * dim,
                                  4 * dim,
                                  name='bidir_fork')
        self._bidir = Bidirectional(LSTM(dim), name='bidir')
        children.extend([
            self._lookup, self._encoder_fork, self._encoder_rnn,
            self._question_transform, self._bidir, self._bidir_fork
        ])

        activations = [Rectifier()] * len(readout_dims) + [None]
        readout_dims = [2 * dim] + readout_dims + [1]
        self._begin_readout = MLP(activations,
                                  readout_dims,
                                  name='begin_readout')
        self._end_readout = MLP(activations, readout_dims, name='end_readout')
        self._softmax = NDimensionalSoftmax()
        children.extend(
            [self._begin_readout, self._end_readout, self._softmax])

        if self._use_definitions:
            # A potential bug here: we pass the same vocab to the def reader.
            # If a different token is reserved for UNK in the text and in the
            # definitions, the two lookups become inconsistent.
            def_reader_class = eval(def_reader)
            def_reader_kwargs = dict(
                num_input_words=def_num_input_words,
                dim=dim,
                emb_dim=emb_dim,
                vocab=vocab,
                lookup=self._lookup if reuse_word_embeddings else None)
            if def_reader_class == MeanPoolReadDefinitions:
                def_reader_kwargs.update(dict(normalize=True, translate=False))
            self._def_reader = def_reader_class(**def_reader_kwargs)
            self._combiner = MeanPoolCombiner(dim=dim,
                                              emb_dim=emb_dim,
                                              def_word_gating=def_word_gating,
                                              compose_type=compose_type)
            children.extend([self._def_reader, self._combiner])

        super(ExtractiveQAModel, self).__init__(children=children, **kwargs)

        # create default input variables
        self.contexts = tensor.lmatrix('contexts')
        self.context_mask = tensor.matrix('contexts_mask')
        self.questions = tensor.lmatrix('questions')
        self.question_mask = tensor.matrix('questions_mask')
        self.answer_begins = tensor.lvector('answer_begins')
        self.answer_ends = tensor.lvector('answer_ends')
        input_vars = [
            self.contexts, self.context_mask, self.questions,
            self.question_mask, self.answer_begins, self.answer_ends
        ]
        if self._use_definitions:
            self.defs = tensor.lmatrix('defs')
            self.def_mask = tensor.matrix('def_mask')
            self.contexts_def_map = tensor.lmatrix('contexts_def_map')
            self.questions_def_map = tensor.lmatrix('questions_def_map')
            input_vars.extend([
                self.defs, self.def_mask, self.contexts_def_map,
                self.questions_def_map
            ])
        self.input_vars = OrderedDict([(var.name, var) for var in input_vars])

    def set_embeddings(self, embeddings):
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def embeddings_var(self):
        return self._lookup.parameters[0]

    def def_reading_parameters(self):
        # Use list() so that this also works under Python 3, where .values()
        # returns a view without an extend() method.
        parameters = list(
            Selector(self._def_reader).get_parameters().values())
        parameters.extend(Selector(self._combiner).get_parameters().values())
        if self._reuse_word_embeddings:
            lookup_parameters = Selector(
                self._lookup).get_parameters().values()
            parameters = [p for p in parameters if p not in lookup_parameters]
        return parameters

    @application
    def _encode(self,
                application_call,
                text,
                mask,
                def_embs=None,
                def_map=None,
                text_name=None):
        if not self._random_unk:
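            # Shortlisting: word ids beyond the input vocabulary are replaced
            # by the UNK id before the lookup (e.g. with num_input_words == 5
            # and a UNK id of 0, the ids [2, 7, 4] become [2, 0, 4]).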
            text = (tensor.lt(text, self._num_input_words) * text +
                    tensor.ge(text, self._num_input_words) * self._vocab.unk)
        if text_name:
            application_call.add_auxiliary_variable(
                unk_ratio(text, mask, self._vocab.unk),
                name='{}_unk_ratio'.format(text_name))
        embs = self._lookup.apply(text)
        if self._random_unk:
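            # With random_unk, the lookup covers the full vocabulary, but rows
            # for out-of-shortlist words flow through disconnected_grad: they
            # are used in the forward pass yet receive no gradient, so they
            # keep their random initialization.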
            embs = (tensor.lt(text, self._num_input_words)[:, :, None] * embs +
                    tensor.ge(text, self._num_input_words)[:, :, None] *
                    disconnected_grad(embs))
        if def_embs:
            embs = self._combiner.apply(embs, mask, def_embs, def_map)
        add_role(embs, EMBEDDINGS)
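        # flip01 swaps the batch and time axes: this model keeps
        # (batch, time, features) tensors while the RNN bricks consume
        # (time, batch, features).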
        encoded = flip01(
            self._encoder_rnn.apply(self._encoder_fork.apply(flip01(embs)),
                                    mask=mask.T)[0])
        return encoded

    @application
    def apply(self,
              application_call,
              contexts,
              contexts_mask,
              questions,
              questions_mask,
              answer_begins,
              answer_ends,
              defs=None,
              def_mask=None,
              contexts_def_map=None,
              questions_def_map=None):
        def_embs = None
        if self._use_definitions:
            def_embs = self._def_reader.apply(defs, def_mask)

        context_enc = self._encode(contexts, contexts_mask, def_embs,
                                   contexts_def_map, 'context')
        question_enc_pre = self._encode(questions, questions_mask, def_embs,
                                        questions_def_map, 'question')
        question_enc = tensor.tanh(
            self._question_transform.apply(question_enc_pre))

        # should be (batch size, context length, question length)
        affinity = tensor.batched_dot(context_enc, flip12(question_enc))
        affinity_mask = contexts_mask[:, :, None] * questions_mask[:, None, :]
        affinity = affinity * affinity_mask - 1000.0 * (1 - affinity_mask)
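        # The large negative constant acts as an approximate -inf, so the
        # softmaxes below assign near-zero weight to padded positions.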
        # soft-aligns every position in the context to positions in the question
        d2q_att_weights = self._softmax.apply(affinity, extra_ndim=1)
        application_call.add_auxiliary_variable(d2q_att_weights.copy(),
                                                name='d2q_att_weights')
        # soft-aligns every position in the question to positions in the document
        q2d_att_weights = self._softmax.apply(flip12(affinity), extra_ndim=1)
        application_call.add_auxiliary_variable(q2d_att_weights.copy(),
                                                name='q2d_att_weights')

        # question encoding "in the view of the document"
        question_enc_informed = tensor.batched_dot(q2d_att_weights,
                                                   context_enc)
        question_enc_concatenated = tensor.concatenate(
            [question_enc, question_enc_informed], 2)
        # document encoding "in the view of the question"
        context_enc_informed = tensor.batched_dot(d2q_att_weights,
                                                  question_enc_concatenated)
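        # Shapes at this point (B = batch, Lc/Lq = context/question length):
        # question_enc_informed is (B, Lq, dim), question_enc_concatenated is
        # (B, Lq, 2 * dim), context_enc_informed is (B, Lc, 2 * dim).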

        if self._coattention:
            context_enc_concatenated = tensor.concatenate(
                [context_enc, context_enc_informed], 2)
        else:
            question_repr_repeated = tensor.repeat(question_enc[:, [-1], :],
                                                   context_enc.shape[1],
                                                   axis=1)
            context_enc_concatenated = tensor.concatenate(
                [context_enc, question_repr_repeated], 2)

        # note: forward and backward LSTMs share the
        # input weights in the current impl
        bidir_states = flip01(
            self._bidir.apply(self._bidir_fork.apply(
                flip01(context_enc_concatenated)),
                              mask=contexts_mask.T)[0])

        begin_readouts = self._begin_readout.apply(bidir_states)[:, :, 0]
        begin_readouts = begin_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        begin_costs = self._softmax.categorical_cross_entropy(
            answer_begins, begin_readouts)

        end_readouts = self._end_readout.apply(bidir_states)[:, :, 0]
        end_readouts = end_readouts * contexts_mask - 1000.0 * (1 -
                                                                contexts_mask)
        end_costs = self._softmax.categorical_cross_entropy(
            answer_ends, end_readouts)

        predicted_begins = begin_readouts.argmax(axis=-1)
        predicted_ends = end_readouts.argmax(axis=-1)
        exact_match = (tensor.eq(predicted_begins, answer_begins) *
                       tensor.eq(predicted_ends, answer_ends))
        application_call.add_auxiliary_variable(predicted_begins,
                                                name='predicted_begins')
        application_call.add_auxiliary_variable(predicted_ends,
                                                name='predicted_ends')
        application_call.add_auxiliary_variable(exact_match,
                                                name='exact_match')

        return begin_costs + end_costs

    def apply_with_default_vars(self):
        return self.apply(*self.input_vars.values())
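
# A minimal construction sketch for ExtractiveQAModel (illustrative only: the
# hyperparameter values and the `vocab` object are assumptions, not taken from
# the original code).
#
#     vocab = Vocabulary(...)  # hypothetical vocabulary with .size() and .unk
#     qa_model = ExtractiveQAModel(
#         dim=200, emb_dim=300, readout_dims=[100], num_input_words=10000,
#         def_num_input_words=0, vocab=vocab, use_definitions=False,
#         def_word_gating='none', compose_type='sum', coattention=True,
#         def_reader='LSTMReadDefinitions', reuse_word_embeddings=False,
#         random_unk=False, weights_init=IsotropicGaussian(0.1),
#         biases_init=Constant(0.))
#     qa_model.initialize()
#     cost = qa_model.apply_with_default_vars()  # begin + end cross-entropies
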
class LanguageModel(Initializable):
    """The dictionary-equipped language model.

    Parameters
    ----------
    emb_dim: int
        The dimension of word embeddings (including for def model if standalone)
    dim : int
        The dimension of the RNNs states (including for def model if standalone)
    num_input_words : int
        The size of the LM's input vocabulary.
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.
    retrieval
        The dictionary retrieval algorithm. If `None`, the language model
        does not use any dictionary.
    def_reader: either 'LSTM' or 'mean'
    standalone_def_rnn : bool
        If `True`, a standalone RNN with separate word embeddings is used
        to embed definition. If `False` the language model is reused.
    disregard_word_embeddings : bool
        If `True`, the word embeddings are not used, only the information
        from the definitions is used.
    compose_type : str
        If 'sum', the definition and word embeddings are averaged
        If 'fully_connected_linear', a learned perceptron compose the 2
        embeddings linearly
        If 'fully_connected_relu', ...
        If 'fully_connected_tanh', ...

    """
    def __init__(self,
                 emb_dim,
                 emb_def_dim,
                 dim,
                 num_input_words,
                 def_num_input_words,
                 num_output_words,
                 vocab,
                 retrieval=None,
                 def_reader='LSTM',
                 standalone_def_lookup=True,
                 standalone_def_rnn=True,
                 disregard_word_embeddings=False,
                 compose_type='sum',
                 very_rare_threshold=[10],
                 cache_size=0,
                 **kwargs):
        # TODO(tombosc): document
        if emb_dim == 0:
            emb_dim = dim
        if emb_def_dim == 0:
            emb_def_dim = emb_dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        if (num_input_words !=
                def_num_input_words) and (not standalone_def_lookup):
            raise NotImplementedError()

        self._very_rare_threshold = very_rare_threshold
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._retrieval = retrieval
        self._disregard_word_embeddings = disregard_word_embeddings
        self._compose_type = compose_type

        self._word_to_id = WordToIdOp(self._vocab)
        self._word_to_count = WordToCountOp(self._vocab)

        children = []
        self._cache = None
        if cache_size > 0:
            # TODO(tombosc): do we implement the cache as a LookupTable or as
            # a theano matrix?
            # self._cache = theano.shared(np.zeros((def_num_input_words, emb_dim)))
            self._cache = LookupTable(cache_size,
                                      emb_dim,
                                      name='cache_def_embeddings')
            children.append(self._cache)

        if self._retrieval:
            self._retrieve = RetrievalOp(retrieval)

        self._main_lookup = LookupTable(self._num_input_words,
                                        emb_dim,
                                        name='main_lookup')
        self._main_fork = Linear(emb_dim, 4 * dim, name='main_fork')
        self._main_rnn = DebugLSTM(
            dim, name='main_rnn')  # TODO(tombosc): use regular LSTM?
        children.extend([self._main_lookup, self._main_fork, self._main_rnn])
        if self._retrieval:
            if standalone_def_lookup:
                lookup = None
            else:
                if emb_dim != emb_def_dim:
                    raise ValueError(
                        "emb_dim != emb_def_dim: cannot share lookup")
                lookup = self._main_lookup

            if def_reader == 'LSTM':
                if standalone_def_rnn:
                    fork_and_rnn = None
                else:
                    fork_and_rnn = (self._main_fork, self._main_rnn)
                self._def_reader = LSTMReadDefinitions(def_num_input_words,
                                                       emb_def_dim,
                                                       dim,
                                                       vocab,
                                                       lookup,
                                                       fork_and_rnn,
                                                       cache=self._cache)

            elif def_reader == 'mean':
                self._def_reader = MeanPoolReadDefinitions(
                    def_num_input_words,
                    emb_def_dim,
                    dim,
                    vocab,
                    lookup,
                    translate=(emb_def_dim != dim),
                    normalize=False)
            else:
                raise Exception("def reader not understood")

            self._combiner = MeanPoolCombiner(dim=dim,
                                              emb_dim=emb_dim,
                                              compose_type=compose_type)

            children.extend([self._def_reader, self._combiner])

        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        super(LanguageModel, self).__init__(children=children, **kwargs)

    def _push_initialization_config(self):
        super(LanguageModel, self)._push_initialization_config()
        if self._cache:
            self._cache.weights_init = Constant(0.)

    def set_def_embeddings(self, embeddings):
        self._def_reader._def_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_params(self):
        return self._def_reader._def_lookup.parameters[0]

    def get_cache_params(self):
        return self._cache.W

    def add_perplexity_measure(self, application_call, minus_logs, mask, name):
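        # costs is the per-sequence sum of token negative log-likelihoods; the
        # reported perplexity is exp(total NLL / total number of tokens). The
        # same numerator and denominator are handed to the Perplexity
        # aggregation scheme so the measure can be aggregated over batches.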
        costs = (minus_logs * mask).sum(axis=0)
        perplexity = tensor.exp(costs.sum() / mask.sum())
        perplexity.tag.aggregation_scheme = Perplexity(costs.sum(), mask.sum())
        full_name = "perplexity_" + name
        application_call.add_auxiliary_variable(perplexity, name=full_name)
        return costs

    @application
    def apply(self, application_call, words, mask):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of time
            step, B is the batch size. Note that this order of the axis is
            different from what all RNN bricks consume, hence and the axis
            should be transposed at some point.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.

        """
        if self._retrieval:
            defs, def_mask, def_map = self._retrieve(words)
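            # defs and def_mask are padded id/mask matrices over the retrieved
            # definitions; def_map ties each definition back to a position in
            # `words` (its first two columns are used as (batch, time) indices
            # below).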
            def_embeddings = self._def_reader.apply(defs, def_mask)

            # Auxiliary variable for debugging
            application_call.add_auxiliary_variable(def_embeddings.shape[0],
                                                    name="num_definitions")

        word_ids = self._word_to_id(words)

        # shortlisting
        input_word_ids = (
            tensor.lt(word_ids, self._num_input_words) * word_ids +
            tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (
            tensor.lt(word_ids, self._num_output_words) * word_ids +
            tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)
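        # Both shortlists map out-of-range ids to UNK: the input shortlist
        # bounds the embedding lookup, the output shortlist bounds the softmax
        # targets.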

        application_call.add_auxiliary_variable(unk_ratio(
            input_word_ids, mask, self._vocab.unk),
                                                name='unk_ratio')

        # Run the main rnn with combined inputs
        word_embs = self._main_lookup.apply(input_word_ids)
        application_call.add_auxiliary_variable(masked_root_mean_square(
            word_embs, mask),
                                                name='word_emb_RMS')

        if self._retrieval:
            rnn_inputs, updated, positions = self._combiner.apply(
                word_embs, mask, def_embeddings, def_map)
        else:
            rnn_inputs = word_embs

        updates = []
        if self._cache:
            flat_word_ids = word_ids.flatten()
            flat_word_ids_to_update = flat_word_ids[positions]
            # computing updates for cache
            updates = [
                (self._cache.W,
                 tensor.set_subtensor(self._cache.W[flat_word_ids_to_update],
                                      updated))
            ]
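            # The updates are returned from apply() and only take effect once
            # they are passed to theano.function (or a blocks algorithm) as
            # graph updates.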

        application_call.add_auxiliary_variable(masked_root_mean_square(
            word_embs, mask),
                                                name='main_rnn_in_RMS')

        main_rnn_states = self._main_rnn.apply(tensor.transpose(
            self._main_fork.apply(rnn_inputs), (1, 0, 2)),
                                               mask=mask.T)[0]

        # The first token is not predicted
        logits = self._pre_softmax.apply(main_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(out_softmax.copy(),
                                                name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(targets,
                                                             logits,
                                                             extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                                            targets_mask, "")

        missing_embs = tensor.eq(input_word_ids,
                                 self._vocab.unk).astype('int32')  # (bs, L)
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * missing_embs.T[:-1],
                                    "after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * (1 - missing_embs.T[:-1]),
                                    "after_word_embs")

        word_counts = self._word_to_count(words)
        very_rare_masks = []
        for threshold in self._very_rare_threshold:
            very_rare_mask = tensor.lt(word_counts, threshold).astype('int32')
            very_rare_mask = targets_mask * (very_rare_mask.T[:-1])
            very_rare_masks.append(very_rare_mask)
            self.add_perplexity_measure(application_call, minus_logs,
                                        very_rare_mask,
                                        "after_very_rare_" + str(threshold))

        if self._retrieval:
            has_def = tensor.zeros_like(output_word_ids)
            has_def = tensor.inc_subtensor(
                has_def[def_map[:, 0], def_map[:, 1]], 1)
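            # has_def counts retrieved definitions per (batch, time) position;
            # the mask below keeps only the predictions made right after a
            # token for which at least one definition was retrieved (hence
            # the "after_def_embs" name).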
            mask_targets_has_def = has_def.T[:-1] * targets_mask  # (L-1, bs)
            self.add_perplexity_measure(application_call, minus_logs,
                                        mask_targets_has_def, "after_def_embs")

            for thresh, very_rare_mask in zip(self._very_rare_threshold,
                                              very_rare_masks):
                self.add_perplexity_measure(
                    application_call, minus_logs,
                    very_rare_mask * mask_targets_has_def,
                    "after_def_very_rare_" + str(thresh))

            application_call.add_auxiliary_variable(mask_targets_has_def.T,
                                                    name='mask_def_emb')

        return costs, updates
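
# A minimal construction sketch for LanguageModel (illustrative only: the
# hyperparameter values, the `vocab` object and the choice of retrieval=None
# are assumptions, not taken from the original code).
#
#     vocab = Vocabulary(...)  # hypothetical vocabulary with .size() and .unk
#     lm = LanguageModel(
#         emb_dim=300, emb_def_dim=0, dim=500, num_input_words=10000,
#         def_num_input_words=0, num_output_words=10000, vocab=vocab,
#         retrieval=None, weights_init=IsotropicGaussian(0.1),
#         biases_init=Constant(0.))
#     lm.initialize()
#     words = tensor.lmatrix('words')          # (batch, time) integer matrix
#     words_mask = tensor.matrix('words_mask')  # (batch, time) padding mask
#     costs, updates = lm.apply(words, words_mask)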