class ExtractiveQAModel(Initializable):
    """The dictionary-equipped extractive QA model.

    Parameters
    ----------
    dim : int
        The default dimensionality for the components.
    emb_dim : int
        The dimensionality for the embeddings. If 0, `dim` is used.
    readout_dims : list of int
        The hidden layer dimensionalities of the begin/end readout MLPs.
    num_input_words : int
        The number of input words. If 0, `vocab.size()` is used.
    def_num_input_words : int
        The number of input words for the definition reader. If 0,
        `num_input_words` is used.
    vocab
        The vocabulary object.
    use_definitions : bool
        Triggers the use of definitions.
    def_word_gating : str
        The gating mechanism for definition words (passed to the combiner).
    compose_type : str
        How word and definition embeddings are composed (passed to the
        combiner).
    coattention : bool
        Use the coattention mechanism.
    def_reader : str
        The name of the definition reader class to instantiate.
    reuse_word_embeddings : bool
        If `True`, the definition reader reuses the main lookup table
        instead of having its own word embeddings.
    random_unk : bool
        If `True`, words outside the input shortlist keep their randomly
        initialized embeddings (with gradients blocked) instead of being
        mapped to UNK.

    """

    def __init__(self, dim, emb_dim, readout_dims, num_input_words,
                 def_num_input_words, vocab, use_definitions, def_word_gating,
                 compose_type, coattention, def_reader, reuse_word_embeddings,
                 random_unk, **kwargs):
        self._vocab = vocab
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        self._coattention = coattention
        self._num_input_words = num_input_words
        self._use_definitions = use_definitions
        self._random_unk = random_unk
        self._reuse_word_embeddings = reuse_word_embeddings

        lookup_num_words = num_input_words
        if reuse_word_embeddings:
            lookup_num_words = max(num_input_words, def_num_input_words)
        if random_unk:
            lookup_num_words = vocab.size()

        # Dima: we can have slightly less copy-paste here if we
        # copy the RecurrentFromFork class from my other projects.
        children = []
        self._lookup = LookupTable(lookup_num_words, emb_dim)
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._question_transform = Linear(dim, dim, name='question_transform')
        self._bidir_fork = Linear(3 * dim if coattention else 2 * dim,
                                  4 * dim, name='bidir_fork')
        self._bidir = Bidirectional(LSTM(dim), name='bidir')
        children.extend([
            self._lookup, self._encoder_fork, self._encoder_rnn,
            self._question_transform, self._bidir, self._bidir_fork
        ])

        activations = [Rectifier()] * len(readout_dims) + [None]
        readout_dims = [2 * dim] + readout_dims + [1]
        self._begin_readout = MLP(activations, readout_dims,
                                  name='begin_readout')
        self._end_readout = MLP(activations, readout_dims, name='end_readout')
        self._softmax = NDimensionalSoftmax()
        children.extend(
            [self._begin_readout, self._end_readout, self._softmax])

        if self._use_definitions:
            # A potential bug here: we pass the same vocab to the def reader.
            # If a different token is reserved for UNK in text and in the
            # definitions, we can be screwed.
            def_reader_class = eval(def_reader)
            def_reader_kwargs = dict(
                num_input_words=def_num_input_words,
                dim=dim,
                emb_dim=emb_dim,
                vocab=vocab,
                lookup=self._lookup if reuse_word_embeddings else None)
            if def_reader_class == MeanPoolReadDefinitions:
                def_reader_kwargs.update(dict(normalize=True, translate=False))
            self._def_reader = def_reader_class(**def_reader_kwargs)
            self._combiner = MeanPoolCombiner(dim=dim,
                                              emb_dim=emb_dim,
                                              def_word_gating=def_word_gating,
                                              compose_type=compose_type)
            children.extend([self._def_reader, self._combiner])

        super(ExtractiveQAModel, self).__init__(children=children, **kwargs)

        # create default input variables
        self.contexts = tensor.lmatrix('contexts')
        self.context_mask = tensor.matrix('contexts_mask')
        self.questions = tensor.lmatrix('questions')
        self.question_mask = tensor.matrix('questions_mask')
        self.answer_begins = tensor.lvector('answer_begins')
        self.answer_ends = tensor.lvector('answer_ends')
        input_vars = [
            self.contexts, self.context_mask, self.questions,
            self.question_mask, self.answer_begins, self.answer_ends
        ]
        if self._use_definitions:
            self.defs = tensor.lmatrix('defs')
            self.def_mask = tensor.matrix('def_mask')
            self.contexts_def_map = tensor.lmatrix('contexts_def_map')
            self.questions_def_map = tensor.lmatrix('questions_def_map')
            input_vars.extend([
                self.defs, self.def_mask, self.contexts_def_map,
                self.questions_def_map
            ])
        self.input_vars = OrderedDict([(var.name, var) for var in input_vars])

    def set_embeddings(self, embeddings):
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def embeddings_var(self):
        return self._lookup.parameters[0]

    def def_reading_parameters(self):
        parameters = list(Selector(self._def_reader).get_parameters().values())
        parameters.extend(Selector(self._combiner).get_parameters().values())
        if self._reuse_word_embeddings:
            lookup_parameters = Selector(
                self._lookup).get_parameters().values()
            parameters = [p for p in parameters if p not in lookup_parameters]
        return parameters

    @application
    def _encode(self, application_call, text, mask, def_embs=None,
                def_map=None, text_name=None):
        if not self._random_unk:
            text = (tensor.lt(text, self._num_input_words) * text +
                    tensor.ge(text, self._num_input_words) * self._vocab.unk)
        if text_name:
            application_call.add_auxiliary_variable(
                unk_ratio(text, mask, self._vocab.unk),
                name='{}_unk_ratio'.format(text_name))
        embs = self._lookup.apply(text)
        if self._random_unk:
            embs = (tensor.lt(text, self._num_input_words)[:, :, None] * embs +
                    tensor.ge(text, self._num_input_words)[:, :, None] *
                    disconnected_grad(embs))
        if def_embs is not None:
            embs = self._combiner.apply(embs, mask, def_embs, def_map)
        add_role(embs, EMBEDDINGS)
        encoded = flip01(
            self._encoder_rnn.apply(self._encoder_fork.apply(flip01(embs)),
                                    mask=mask.T)[0])
        return encoded

    @application
    def apply(self, application_call, contexts, contexts_mask, questions,
              questions_mask, answer_begins, answer_ends, defs=None,
              def_mask=None, contexts_def_map=None, questions_def_map=None):
        def_embs = None
        if self._use_definitions:
            def_embs = self._def_reader.apply(defs, def_mask)

        context_enc = self._encode(contexts, contexts_mask, def_embs,
                                   contexts_def_map, 'context')
        question_enc_pre = self._encode(questions, questions_mask, def_embs,
                                        questions_def_map, 'question')
        question_enc = tensor.tanh(
            self._question_transform.apply(question_enc_pre))

        # should be (batch size, context length, question length)
        affinity = tensor.batched_dot(context_enc, flip12(question_enc))
        affinity_mask = contexts_mask[:, :, None] * questions_mask[:, None, :]
        affinity = affinity * affinity_mask - 1000.0 * (1 - affinity_mask)
        # soft-aligns every position in the context to positions in the question
        d2q_att_weights = self._softmax.apply(affinity, extra_ndim=1)
        application_call.add_auxiliary_variable(d2q_att_weights.copy(),
                                                name='d2q_att_weights')
        # soft-aligns every position in the question to positions in the document
        q2d_att_weights = self._softmax.apply(flip12(affinity), extra_ndim=1)
        application_call.add_auxiliary_variable(q2d_att_weights.copy(),
                                                name='q2d_att_weights')

        # question encoding "in the view of the document"
        question_enc_informed = tensor.batched_dot(q2d_att_weights,
                                                   context_enc)
        question_enc_concatenated = tensor.concatenate(
            [question_enc, question_enc_informed], 2)
        # document encoding "in the view of the question"
        context_enc_informed = tensor.batched_dot(d2q_att_weights,
                                                  question_enc_concatenated)

        if self._coattention:
            context_enc_concatenated = tensor.concatenate(
                [context_enc, context_enc_informed], 2)
        else:
            question_repr_repeated = tensor.repeat(question_enc[:, [-1], :],
                                                   context_enc.shape[1],
                                                   axis=1)
            context_enc_concatenated = tensor.concatenate(
                [context_enc, question_repr_repeated], 2)

        # note: forward and backward LSTMs share the
        # input weights in the current impl
        bidir_states = flip01(
            self._bidir.apply(self._bidir_fork.apply(
                flip01(context_enc_concatenated)),
                              mask=contexts_mask.T)[0])

        begin_readouts = self._begin_readout.apply(bidir_states)[:, :, 0]
        begin_readouts = begin_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        begin_costs = self._softmax.categorical_cross_entropy(
            answer_begins, begin_readouts)

        end_readouts = self._end_readout.apply(bidir_states)[:, :, 0]
        end_readouts = end_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        end_costs = self._softmax.categorical_cross_entropy(
            answer_ends, end_readouts)

        predicted_begins = begin_readouts.argmax(axis=-1)
        predicted_ends = end_readouts.argmax(axis=-1)
        exact_match = (tensor.eq(predicted_begins, answer_begins) *
                       tensor.eq(predicted_ends, answer_ends))
        application_call.add_auxiliary_variable(predicted_begins,
                                                name='predicted_begins')
        application_call.add_auxiliary_variable(predicted_ends,
                                                name='predicted_ends')
        application_call.add_auxiliary_variable(exact_match,
                                                name='exact_match')

        return begin_costs + end_costs

    def apply_with_default_vars(self):
        return self.apply(*self.input_vars.values())
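
# Illustrative usage sketch (an assumption, not part of the original model
# code): shows one way the QA cost graph might be assembled from the model's
# default input variables. The `vocab` object, the pretrained `embeddings`
# matrix (shape: num_input_words x emb_dim), the hyperparameter values and the
# Blocks initialization kwargs (e.g. `weights_init`, `biases_init`) are all
# hypothetical and must be supplied by the caller.
def _example_qa_cost_graph(vocab, embeddings, **init_kwargs):
    """Build the symbolic begin/end cross-entropy cost for a toy configuration."""
    qa_model = ExtractiveQAModel(
        dim=128, emb_dim=300, readout_dims=[128],
        num_input_words=10000, def_num_input_words=0, vocab=vocab,
        use_definitions=False, def_word_gating='none',
        compose_type='sum', coattention=True,
        def_reader='LSTMReadDefinitions',
        reuse_word_embeddings=False, random_unk=False,
        **init_kwargs)
    qa_model.initialize()
    qa_model.set_embeddings(embeddings)
    # One cost per example in the batch: sum of the begin and end NLLs.
    return qa_model.apply_with_default_vars()
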
class LanguageModel(Initializable):
    """The dictionary-equipped language model.

    Parameters
    ----------
    emb_dim : int
        The dimension of word embeddings (including for the def model
        if standalone).
    emb_def_dim : int
        The dimension of the definition reader's word embeddings.
        If 0, `emb_dim` is used.
    dim : int
        The dimension of the RNN states (including for the def model
        if standalone).
    num_input_words : int
        The size of the LM's input vocabulary. If 0, `vocab.size()` is used.
    def_num_input_words : int
        The size of the definition reader's input vocabulary.
        If 0, `num_input_words` is used.
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.
    retrieval
        The dictionary retrieval algorithm. If `None`, the language model
        does not use any dictionary.
    def_reader : str
        Either 'LSTM' or 'mean'.
    standalone_def_lookup : bool
        If `True`, the definition reader has its own lookup table;
        if `False`, the language model's lookup table is reused.
    standalone_def_rnn : bool
        If `True`, a standalone RNN is used to embed definitions.
        If `False`, the language model's fork and RNN are reused.
    disregard_word_embeddings : bool
        If `True`, the word embeddings are not used, only the information
        from the definitions is used.
    compose_type : str
        If 'sum', the definition and word embeddings are averaged.
        If 'fully_connected_linear', a learned perceptron composes the two
        embeddings linearly.
        If 'fully_connected_relu', ...
        If 'fully_connected_tanh', ...
    very_rare_threshold : list of int
        Word-count thresholds below which words are reported in the
        "very rare" perplexity measures.
    cache_size : int
        The size of the cache of definition embeddings. If 0, no cache
        is used.

    """

    def __init__(self, emb_dim, emb_def_dim, dim, num_input_words,
                 def_num_input_words, num_output_words, vocab,
                 retrieval=None,
                 def_reader='LSTM',
                 standalone_def_lookup=True,
                 standalone_def_rnn=True,
                 disregard_word_embeddings=False,
                 compose_type='sum',
                 very_rare_threshold=[10],
                 cache_size=0,
                 **kwargs):
        if emb_dim == 0:
            emb_dim = dim
        if emb_def_dim == 0:
            emb_def_dim = emb_dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        if (num_input_words != def_num_input_words) and (
                not standalone_def_lookup):
            raise NotImplementedError()

        self._very_rare_threshold = very_rare_threshold
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._retrieval = retrieval
        self._disregard_word_embeddings = disregard_word_embeddings
        self._compose_type = compose_type

        self._word_to_id = WordToIdOp(self._vocab)
        self._word_to_count = WordToCountOp(self._vocab)

        children = []
        self._cache = None
        if cache_size > 0:
            # TODO(tombosc): do we implement the cache as a LookupTable
            # or as a theano matrix?
            # self._cache = theano.shared(np.zeros((def_num_input_words, emb_dim)))
            self._cache = LookupTable(cache_size,
                                      emb_dim,
                                      name='cache_def_embeddings')
            children.append(self._cache)

        if self._retrieval:
            self._retrieve = RetrievalOp(retrieval)

        self._main_lookup = LookupTable(self._num_input_words,
                                        emb_dim,
                                        name='main_lookup')
        self._main_fork = Linear(emb_dim, 4 * dim, name='main_fork')
        self._main_rnn = DebugLSTM(
            dim, name='main_rnn')  # TODO(tombosc): use regular LSTM?
        children.extend([self._main_lookup, self._main_fork, self._main_rnn])

        if self._retrieval:
            if standalone_def_lookup:
                lookup = None
            else:
                if emb_dim != emb_def_dim:
                    raise ValueError(
                        "emb_dim != emb_def_dim: cannot share lookup")
                lookup = self._main_lookup

            if def_reader == 'LSTM':
                if standalone_def_rnn:
                    fork_and_rnn = None
                else:
                    fork_and_rnn = (self._main_fork, self._main_rnn)
                self._def_reader = LSTMReadDefinitions(def_num_input_words,
                                                       emb_def_dim,
                                                       dim,
                                                       vocab,
                                                       lookup,
                                                       fork_and_rnn,
                                                       cache=self._cache)
            elif def_reader == 'mean':
                self._def_reader = MeanPoolReadDefinitions(
                    def_num_input_words,
                    emb_def_dim,
                    dim,
                    vocab,
                    lookup,
                    translate=(emb_def_dim != dim),
                    normalize=False)
            else:
                raise Exception("def reader not understood")

            self._combiner = MeanPoolCombiner(dim=dim,
                                              emb_dim=emb_dim,
                                              compose_type=compose_type)

            children.extend([self._def_reader, self._combiner])

        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        super(LanguageModel, self).__init__(children=children, **kwargs)

    def _push_initialization_config(self):
        super(LanguageModel, self)._push_initialization_config()
        if self._cache:
            self._cache.weights_init = Constant(0.)

    def set_def_embeddings(self, embeddings):
        self._def_reader._def_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_params(self):
        return self._def_reader._def_lookup.parameters[0]

    def get_cache_params(self):
        return self._cache.W

    def add_perplexity_measure(self, application_call, minus_logs, mask,
                               name):
        costs = (minus_logs * mask).sum(axis=0)
        perplexity = tensor.exp(costs.sum() / mask.sum())
        perplexity.tag.aggregation_scheme = Perplexity(costs.sum(), mask.sum())
        full_name = "perplexity_" + name
        application_call.add_auxiliary_variable(perplexity, name=full_name)
        return costs

    @application
    def apply(self, application_call, words, mask):
        """Compute the negative log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of time
            steps and B is the batch size. Note that this order of the axes
            is different from what all RNN bricks consume, hence the axes
            should be transposed at some point.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.

        """
        if self._retrieval:
            defs, def_mask, def_map = self._retrieve(words)
            def_embeddings = self._def_reader.apply(defs, def_mask)
            # Auxiliary variable for debugging
            application_call.add_auxiliary_variable(def_embeddings.shape[0],
                                                    name="num_definitions")

        word_ids = self._word_to_id(words)

        # shortlisting
        input_word_ids = (
            tensor.lt(word_ids, self._num_input_words) * word_ids +
            tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (
            tensor.lt(word_ids, self._num_output_words) * word_ids +
            tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)

        application_call.add_auxiliary_variable(unk_ratio(
            input_word_ids, mask, self._vocab.unk),
                                                name='unk_ratio')

        # Run the main rnn with combined inputs
        word_embs = self._main_lookup.apply(input_word_ids)
        application_call.add_auxiliary_variable(masked_root_mean_square(
            word_embs, mask),
                                                name='word_emb_RMS')

        if self._retrieval:
            rnn_inputs, updated, positions = self._combiner.apply(
                word_embs, mask, def_embeddings, def_map)
        else:
            rnn_inputs = word_embs

        updates = []
        if self._cache:
            flat_word_ids = word_ids.flatten()
            flat_word_ids_to_update = flat_word_ids[positions]
            # computing updates for cache
            updates = [
                (self._cache.W,
                 tensor.set_subtensor(self._cache.W[flat_word_ids_to_update],
                                      updated))
            ]

        application_call.add_auxiliary_variable(masked_root_mean_square(
            rnn_inputs, mask),
                                                name='main_rnn_in_RMS')

        main_rnn_states = self._main_rnn.apply(tensor.transpose(
            self._main_fork.apply(rnn_inputs), (1, 0, 2)),
                                               mask=mask.T)[0]

        # The first token is not predicted
        logits = self._pre_softmax.apply(main_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(out_softmax.copy(),
                                                name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(targets,
                                                             logits,
                                                             extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                                            targets_mask, "")

        missing_embs = tensor.eq(input_word_ids,
                                 self._vocab.unk).astype('int32')  # (bs, L)
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * missing_embs.T[:-1],
                                    "after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * (1 - missing_embs.T[:-1]),
                                    "after_word_embs")

        word_counts = self._word_to_count(words)
        very_rare_masks = []
        for threshold in self._very_rare_threshold:
            very_rare_mask = tensor.lt(word_counts, threshold).astype('int32')
            very_rare_mask = targets_mask * (very_rare_mask.T[:-1])
            very_rare_masks.append(very_rare_mask)
            self.add_perplexity_measure(application_call, minus_logs,
                                        very_rare_mask,
                                        "after_very_rare_" + str(threshold))

        if self._retrieval:
            has_def = tensor.zeros_like(output_word_ids)
            has_def = tensor.inc_subtensor(
                has_def[def_map[:, 0], def_map[:, 1]], 1)
            mask_targets_has_def = has_def.T[:-1] * targets_mask  # (L-1, bs)
            self.add_perplexity_measure(application_call, minus_logs,
                                        mask_targets_has_def,
                                        "after_def_embs")

            for thresh, very_rare_mask in zip(self._very_rare_threshold,
                                              very_rare_masks):
                self.add_perplexity_measure(
                    application_call, minus_logs,
                    very_rare_mask * mask_targets_has_def,
                    "after_def_very_rare_" + str(thresh))

            application_call.add_auxiliary_variable(mask_targets_has_def.T,
                                                    name='mask_def_emb')

        return costs, updates