def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             question_encoder: Seq2SeqEncoder,
             passage_encoder: Seq2SeqEncoder,
             r: float = 0.8,
             dropout: float = 0.1,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(EvidenceExtraction, self).__init__(vocab, regularizer)
    self._embedder = embedder
    self._question_encoder = question_encoder
    self._passage_encoder = passage_encoder

    # size: 2H
    encoding_dim = question_encoder.get_output_dim()

    self._gru_cell = nn.GRUCell(2 * encoding_dim, encoding_dim)
    self._gate = nn.Linear(2 * encoding_dim, 2 * encoding_dim)
    self._match_layer_1 = nn.Linear(2 * encoding_dim, encoding_dim)
    self._match_layer_2 = nn.Linear(encoding_dim, 1)

    self._question_attention_for_passage = Attention(NonlinearSimilarity(encoding_dim))
    self._question_attention_for_question = Attention(NonlinearSimilarity(encoding_dim))
    self._passage_attention_for_answer = Attention(NonlinearSimilarity(encoding_dim),
                                                   normalize=False)
    self._passage_attention_for_ranking = Attention(NonlinearSimilarity(encoding_dim))
    self._passage_self_attention = Attention(NonlinearSimilarity(encoding_dim))

    self._self_gru_cell = nn.GRUCell(2 * encoding_dim, encoding_dim)
    self._self_gate = nn.Linear(2 * encoding_dim, encoding_dim)

    self._answer_net = nn.GRUCell(encoding_dim, encoding_dim)
    self._v_r_Q = nn.Parameter(torch.rand(encoding_dim))
    self._r = r

    self._span_start_accuracy = CategoricalAccuracy()
    self._span_end_accuracy = CategoricalAccuracy()
    self._span_accuracy = BooleanAccuracy()
    self._squad_metrics = SquadEmAndF1()

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x

    initializer(self)
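# Hedged sketch (illustrative, not part of the model above): the `_gate` +
# `_gru_cell` pairing in EvidenceExtraction matches an R-NET-style gated
# attention recurrence. Names and dimensions below are made up for the demo:
#   g_t = sigmoid(W_g [u_t; c_t]);  v_t = GRUCell(g_t * [u_t; c_t], v_{t-1})
import torch
import torch.nn as nn

encoding_dim = 4
gate = nn.Linear(2 * encoding_dim, 2 * encoding_dim)
gru_cell = nn.GRUCell(2 * encoding_dim, encoding_dim)

u_t = torch.randn(1, encoding_dim)     # current passage token encoding
c_t = torch.randn(1, encoding_dim)     # attended question vector
v_prev = torch.zeros(1, encoding_dim)  # previous match state

gru_input = torch.cat([u_t, c_t], dim=-1)
gru_input = torch.sigmoid(gate(gru_input)) * gru_input  # elementwise input gate
v_t = gru_cell(gru_input, v_prev)                       # next match state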
def __init__(self, vocab_size, max_len, embed_size, hidden_size,
             sos_id=2, eos_id=3, n_layers=1, rnn_cell='GRU',
             input_dropout_p=0, dropout_p=0, use_attention=False):
    super(Decoder, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.input_dropout = nn.Dropout(p=input_dropout_p)
    if rnn_cell == 'LSTM':
        self.rnn_cell = nn.LSTM
    elif rnn_cell == 'GRU':
        self.rnn_cell = nn.GRU
    else:
        raise ValueError("Unsupported RNN Cell: {0}".format(rnn_cell))
    self.rnn = self.rnn_cell(embed_size, hidden_size, n_layers,
                             batch_first=True, dropout=dropout_p)

    self.output_size = vocab_size
    self.max_length = max_len
    self.use_attention = use_attention
    self.eos_id = eos_id
    self.sos_id = sos_id
    self.init_input = None

    self.embedding = nn.Embedding(self.output_size, embed_size)
    if use_attention:
        self.attention = Attention(self.hidden_size)
    self.out = nn.Linear(self.hidden_size, self.output_size)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             seq2seq_encoder: Seq2SeqEncoder,
             initializer: InitializerApplicator) -> None:
    super(ProLocalModel, self).__init__(vocab)
    self.text_field_embedder = text_field_embedder
    self.seq2seq_encoder = seq2seq_encoder

    self.attention_layer = \
        Attention(similarity_function=BilinearSimilarity(2 * seq2seq_encoder.get_output_dim(),
                                                         seq2seq_encoder.get_output_dim()),
                  normalize=True)

    self.num_types = self.vocab.get_vocab_size("state_change_type_labels")
    self.aggregate_feedforward = Linear(seq2seq_encoder.get_output_dim(), self.num_types)

    # By default, "O" is ignored in the metric computation.
    self.span_metric = SpanBasedF1Measure(vocab, tag_namespace="state_change_tags")
    self.num_tags = self.vocab.get_vocab_size("state_change_tags")

    self.tag_projection_layer = TimeDistributed(
        Linear(self.seq2seq_encoder.get_output_dim() + 2, self.num_tags))

    self._type_accuracy = CategoricalAccuracy()
    self.type_f1_metrics = {}
    self.type_labels_vocab = self.vocab.get_index_to_token_vocabulary("state_change_type_labels")
    for type_label in self.type_labels_vocab.values():
        self.type_f1_metrics["type_" + type_label] = F1Measure(
            self.vocab.get_token_index(type_label, "state_change_type_labels"))

    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)
def __init__(self,
             encoder_output_dim: int,
             action_embedding_dim: int,
             attention_function: SimilarityFunction,
             checklist_size: int = None) -> None:
    super(NlvrDecoderStep, self).__init__()
    self._input_attention = Attention(attention_function)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize
    # the hidden state of the decoder with the final hidden state of the encoder.
    output_dim = encoder_output_dim
    input_dim = output_dim
    # Our decoder input will be the concatenation of the decoder hidden state and the previous
    # action embedding, and we'll project that down to the decoder's `input_dim`, which we
    # arbitrarily set to be the same as `output_dim`.
    self._input_projection_layer = Linear(output_dim + action_embedding_dim, input_dim)
    # Before making a prediction, we'll compute an attention over the input given our updated
    # hidden state, and optionally a difference between the current checklist vector and its
    # target, if we are training to maximize coverage using a checklist. Then we concatenate
    # those with the decoder state and project to `action_embedding_dim` to make a prediction.
    if checklist_size is None:
        self._output_projection_layer = Linear(output_dim + encoder_output_dim,
                                               action_embedding_dim)
    else:
        self._output_projection_layer = Linear(output_dim + encoder_output_dim + checklist_size,
                                               action_embedding_dim)
    # TODO(pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(input_dim, output_dim)
def from_params(cls, vocab, params: Params) -> 'WikiTablesMmlSemanticParser':
    question_embedder = TextFieldEmbedder.from_params(vocab, params.pop("question_embedder"))
    action_embedding_dim = params.pop_int("action_embedding_dim")
    encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
    entity_encoder = Seq2VecEncoder.from_params(params.pop('entity_encoder'))
    max_decoding_steps = params.pop_int("max_decoding_steps")
    mixture_feedforward_type = params.pop('mixture_feedforward', None)
    if mixture_feedforward_type is not None:
        mixture_feedforward = FeedForward.from_params(mixture_feedforward_type)
    else:
        mixture_feedforward = None
    decoder_beam_search = BeamSearch.from_params(params.pop("decoder_beam_search"))
    input_attention = Attention.from_params(params.pop("attention"))
    training_beam_size = params.pop_int('training_beam_size', None)
    use_neighbor_similarity_for_linking = params.pop_bool('use_neighbor_similarity_for_linking',
                                                          False)
    dropout = params.pop_float('dropout', 0.0)
    num_linking_features = params.pop_int('num_linking_features', 10)
    tables_directory = params.pop('tables_directory', '/wikitables/')
    rule_namespace = params.pop('rule_namespace', 'rule_labels')
    params.assert_empty(cls.__name__)
    return cls(vocab,
               question_embedder=question_embedder,
               action_embedding_dim=action_embedding_dim,
               encoder=encoder,
               entity_encoder=entity_encoder,
               mixture_feedforward=mixture_feedforward,
               decoder_beam_search=decoder_beam_search,
               max_decoding_steps=max_decoding_steps,
               input_attention=input_attention,
               training_beam_size=training_beam_size,
               use_neighbor_similarity_for_linking=use_neighbor_similarity_for_linking,
               dropout=dropout,
               num_linking_features=num_linking_features,
               tables_directory=tables_directory,
               rule_namespace=rule_namespace)
def __init__(self,
             encoder_output_dim: int,
             action_embedding_dim: int,
             attention_function: SimilarityFunction,
             dropout: float = 0.0,
             use_coverage: bool = False) -> None:
    super(NlvrDecoderStep, self).__init__()
    self._input_attention = Attention(attention_function)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize
    # the hidden state of the decoder with the final hidden state of the encoder.
    output_dim = encoder_output_dim
    input_dim = output_dim
    # Our decoder input will be the concatenation of the decoder hidden state and the previous
    # action embedding, and we'll project that down to the decoder's `input_dim`, which we
    # arbitrarily set to be the same as `output_dim`.
    self._input_projection_layer = Linear(output_dim + action_embedding_dim, input_dim)
    # Before making a prediction, we'll compute an attention over the input given our updated
    # hidden state. Then we concatenate those with the decoder state and project to
    # `action_embedding_dim` to make a prediction.
    self._output_projection_layer = Linear(output_dim + encoder_output_dim,
                                           action_embedding_dim)
    if use_coverage:
        # This is a multiplicative factor used to add the embeddings of yet-to-be-produced
        # actions to the predicted embedding and bias it.
        self._checklist_embedding_multiplier = Parameter(torch.FloatTensor([1.0]))
    # TODO(pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(input_dim, output_dim)
    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             target_namespace: str = "target_tags",
             target_embedding_dim: int = None,
             attention_function: SimilarityFunction = None,
             scheduled_sampling_ratio: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(SimpleSeq2SeqCrf, self).__init__(vocab, regularizer)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # the end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    self._crf = ConditionalRandomField(num_classes)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize
    # the hidden state of the decoder with that of the final hidden states of the encoder.
    # Also, if we're using attention with ``DotProductSimilarity``, this is needed.
    self._decoder_output_dim = self._encoder.get_output_dim()
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    if self._attention_function:
        self._decoder_attention = Attention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
    else:
        self._decoder_input_dim = target_embedding_dim
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    # self._decoder_cell = GRUCell(self._decoder_input_dim, self._decoder_output_dim, bias=False)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
    self.metrics = {
        "accuracy": CategoricalAccuracy(),
        "accuracy3": CategoricalAccuracy(top_k=3)
    }
    self.span_metric = SpanBasedF1Measure(
        vocab,
        tag_namespace=target_namespace,
        ignore_classes=[START_SYMBOL[2:], END_SYMBOL[2:]])
    initializer(self)  # Initialize forget gate
def test_masked(self):
    attention = Attention()
    # Testing general masked non-batched case.
    vector = Variable(torch.FloatTensor([[0.3, 0.1, 0.5]]))
    matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1],
                                          [0.15, 0.5, 0.2],
                                          [0.1, 0.4, 0.3]]]))
    mask = Variable(torch.FloatTensor([[1.0, 0.0, 1.0]]))
    result = attention(vector, matrix, mask).data.numpy()
    assert_almost_equal(result, numpy.array([[0.52248482, 0.0, 0.47751518]]))
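# Hedged sketch (pure torch, independent of AllenNLP): what the masked test above
# checks. Scores are dot products of the vector with each matrix row; masked
# positions are dropped from the softmax and come back as exactly 0. With
# ``normalize=False`` (see the next test) the raw scores would be returned instead.
import torch

vector = torch.tensor([0.3, 0.1, 0.5])
matrix = torch.tensor([[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.1, 0.4, 0.3]])
mask = torch.tensor([1.0, 0.0, 1.0])

scores = matrix @ vector  # [0.31, 0.195, 0.22]
weights = torch.softmax(scores.masked_fill(mask == 0, float('-inf')), dim=-1)
# weights ~= [0.5225, 0.0, 0.4775], matching the assertion above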
def __init__(self,
             encoder_output_dim: int,
             action_embedding_dim: int,
             attention_function: SimilarityFunction,
             num_start_types: int,
             num_entity_types: int,
             mixture_feedforward: FeedForward = None,
             dropout: float = 0.0,
             unlinked_terminal_indices: List[int] = None) -> None:
    super(WikiTablesDecoderStep, self).__init__()
    self._mixture_feedforward = mixture_feedforward
    self._entity_type_embedding = Embedding(num_entity_types, action_embedding_dim)
    self._input_attention = Attention(attention_function)

    self._num_start_types = num_start_types
    self._start_type_predictor = Linear(encoder_output_dim, num_start_types)

    # Decoder output dim needs to be the same as the encoder output dim since we initialize
    # the hidden state of the decoder with the final hidden state of the encoder.
    output_dim = encoder_output_dim
    input_dim = output_dim
    # Our decoder input will be the concatenation of the decoder hidden state and the previous
    # action embedding, and we'll project that down to the decoder's `input_dim`, which we
    # arbitrarily set to be the same as `output_dim`.
    self._input_projection_layer = Linear(output_dim + action_embedding_dim, input_dim)
    # Before making a prediction, we'll compute an attention over the input given our updated
    # hidden state, and optionally a difference between the current checklist vector and its
    # target, if we are training to maximize coverage using a checklist. Then we concatenate
    # those with the decoder state and project to `action_embedding_dim` to make a prediction.
    if unlinked_terminal_indices is None:
        self._output_projection_layer = Linear(output_dim + encoder_output_dim,
                                               action_embedding_dim)
    else:
        unlinked_checklist_size = len(unlinked_terminal_indices)
        self._output_projection_layer = Linear(
            output_dim + encoder_output_dim + unlinked_checklist_size, action_embedding_dim)

    self._unlinked_terminal_indices = unlinked_terminal_indices
    # TODO(pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(input_dim, output_dim)

    if mixture_feedforward is not None:
        check_dimensions_match(output_dim, mixture_feedforward.get_input_dim(),
                               "hidden state embedding dim", "mixture feedforward input dim")
        check_dimensions_match(mixture_feedforward.get_output_dim(), 1,
                               "mixture feedforward output dim", "dimension for scalar value")

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
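# Hedged sketch of the prediction head wired above, without the checklist branch;
# shapes and names are illustrative. The decoder state and attended question are
# projected to a predicted action embedding, which is scored against candidate
# action embeddings by dot product.
import torch
import torch.nn as nn

output_dim, encoder_output_dim, action_embedding_dim, num_actions = 8, 8, 6, 5
output_projection_layer = nn.Linear(output_dim + encoder_output_dim, action_embedding_dim)

hidden_state = torch.randn(1, output_dim)
attended_question = torch.randn(1, encoder_output_dim)
action_embeddings = torch.randn(num_actions, action_embedding_dim)

predicted_action_embedding = output_projection_layer(
    torch.cat([hidden_state, attended_question], dim=-1))
action_logits = predicted_action_embedding @ action_embeddings.t()  # (1, num_actions)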
def test_non_normalized_attention_works(self):
    attention = Attention(normalize=False)
    sentence_tensor = Variable(torch.FloatTensor([[[-1, 0, 4],
                                                   [1, 1, 1],
                                                   [-1, 0, 4],
                                                   [-1, 0, -1]]]))
    query_tensor = Variable(torch.FloatTensor([[.1, .8, .5]]))
    result = attention(query_tensor, sentence_tensor).data.numpy()
    assert_almost_equal(result, [[1.9, 1.4, 1.9, -.6]])
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             spans_per_word: float,
             target_namespace: str = "tokens",
             target_embedding_dim: int = None,
             attention_function: SimilarityFunction = None,
             scheduled_sampling_ratio: float = 0.0,
             spans_extractor: SpanExtractor = None,
             spans_scorer_feedforward: FeedForward = None) -> None:
    super(SpanAe, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # the end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize
    # the hidden state of the decoder with that of the final hidden states of the encoder.
    # Also, if we're using attention with ``DotProductSimilarity``, this is needed.
    self._decoder_output_dim = self._encoder.get_output_dim() + 1
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    if self._attention_function:
        self._decoder_attention = Attention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
    else:
        self._decoder_input_dim = target_embedding_dim
    self._decoder_cell = LSTMCell(self._decoder_input_dim + 1, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)

    self._span_extractor = spans_extractor
    feedforward_scorer = torch.nn.Sequential(
        TimeDistributed(spans_scorer_feedforward),
        TimeDistributed(torch.nn.Linear(spans_scorer_feedforward.get_output_dim(), 1)))
    self._span_pruner = SpanPruner(feedforward_scorer)
    self._spans_per_word = spans_per_word
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             question_encoder: Seq2SeqEncoder,
             passage_encoder: Seq2SeqEncoder,
             feed_forward: FeedForward,
             dropout: float = 0.1,
             num_decoding_steps: int = 40,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(AnswerSynthesis, self).__init__(vocab, regularizer)
    self._vocab = vocab
    self._vocab_size = vocab.get_vocab_size()  # default: tokens
    self._num_decoding_steps = num_decoding_steps
    self._start_token_index = self._vocab.get_token_index(START_SYMBOL)
    self._end_token_index = self._vocab.get_token_index(END_SYMBOL)

    self._embedder = embedder
    self._question_encoder = question_encoder
    self._passage_encoder = passage_encoder
    encoding_dim = question_encoder.get_output_dim()
    embedding_dim = embedder.get_output_dim()

    self._span_start_embedding = nn.Embedding(2, 50)
    self._span_end_embedding = nn.Embedding(2, 50)
    self._gru_decoder = nn.GRUCell(encoding_dim + embedding_dim, encoding_dim)
    self._feed_forward = feed_forward

    self._attention = Attention(NonlinearSimilarity(encoding_dim))

    self._W_r = nn.Linear(embedding_dim, encoding_dim, bias=False)
    self._U_r = nn.Linear(encoding_dim, encoding_dim, bias=False)
    self._V_r = nn.Linear(encoding_dim, encoding_dim, bias=False)
    self._max_out = Maxout(encoding_dim, num_layers=1,
                           output_dims=int(encoding_dim / 2), pool_sizes=2)
    self._W_o = nn.Linear(int(encoding_dim / 2), self._vocab_size, bias=False)

    self._squad_metrics = SquadEmAndF1()
    # self._predict_acc = CategoricalAccuracy()

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x

    initializer(self)
    self._num_iter = 0
def test_batched_no_mask(self):
    attention = Attention()
    # Testing general batched case.
    vector = Variable(torch.FloatTensor([[0.3, 0.1, 0.5], [0.3, 0.1, 0.5]]))
    matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2]],
                                         [[0.6, 0.8, 0.1], [0.15, 0.5, 0.2]]]))
    result = attention(vector, matrix).data.numpy()
    assert_almost_equal(result, numpy.array([[0.52871835, 0.47128162],
                                             [0.52871835, 0.47128162]]))
def from_params(cls, vocab, params: Params) -> 'WikiTablesErmSemanticParser':
    question_embedder = TextFieldEmbedder.from_params(vocab, params.pop("question_embedder"))
    action_embedding_dim = params.pop_int("action_embedding_dim")
    encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
    entity_encoder = Seq2VecEncoder.from_params(params.pop('entity_encoder'))
    mixture_feedforward_type = params.pop('mixture_feedforward', None)
    if mixture_feedforward_type is not None:
        mixture_feedforward = FeedForward.from_params(mixture_feedforward_type)
    else:
        mixture_feedforward = None
    input_attention = Attention.from_params(params.pop("attention"))
    decoder_beam_size = params.pop_int("decoder_beam_size")
    decoder_num_finished_states = params.pop_int("decoder_num_finished_states", None)
    max_decoding_steps = params.pop_int("max_decoding_steps")
    normalize_beam_score_by_length = params.pop("normalize_beam_score_by_length", False)
    use_neighbor_similarity_for_linking = params.pop_bool("use_neighbor_similarity_for_linking",
                                                          False)
    dropout = params.pop_float('dropout', 0.0)
    num_linking_features = params.pop_int('num_linking_features', 10)
    tables_directory = params.pop('tables_directory', '/wikitables/')
    rule_namespace = params.pop('rule_namespace', 'rule_labels')
    checklist_cost_weight = params.pop_float("checklist_cost_weight", 0.6)
    mml_model_file = params.pop('mml_model_file', None)
    params.assert_empty(cls.__name__)
    return cls(vocab,
               question_embedder=question_embedder,
               action_embedding_dim=action_embedding_dim,
               encoder=encoder,
               entity_encoder=entity_encoder,
               mixture_feedforward=mixture_feedforward,
               input_attention=input_attention,
               decoder_beam_size=decoder_beam_size,
               decoder_num_finished_states=decoder_num_finished_states,
               max_decoding_steps=max_decoding_steps,
               normalize_beam_score_by_length=normalize_beam_score_by_length,
               checklist_cost_weight=checklist_cost_weight,
               use_neighbor_similarity_for_linking=use_neighbor_similarity_for_linking,
               dropout=dropout,
               num_linking_features=num_linking_features,
               tables_directory=tables_directory,
               rule_namespace=rule_namespace,
               initial_mml_model_file=mml_model_file)
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             target_namespace: str = "tokens",
             target_embedding_dim: int = None,
             attention_function: SimilarityFunction = None,
             scheduled_sampling_ratio: float = 0.0) -> None:
    super(SimpleSeq2Seq, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # the end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    self.num_classes = num_classes
    # Decoder output dim needs to be the same as the encoder output dim since we initialize
    # the hidden state of the decoder with that of the final hidden states of the encoder.
    # Also, if we're using attention with ``DotProductSimilarity``, this is needed.
    self._decoder_output_dim = self._encoder.get_output_dim()
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    if self._attention_function:
        self._decoder_attention = Attention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
    else:
        self._decoder_input_dim = target_embedding_dim
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
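# Hedged sketch (pure torch) of the decoder wiring described in the comments
# above, for a single time step: with attention, the LSTMCell input is
# [attended_context; embedded_target]; without it, just the embedding.
# The shapes and the dot-product attention here are illustrative assumptions.
import torch
import torch.nn as nn

batch, src_len, enc_dim, tgt_emb_dim = 2, 5, 8, 6
encoder_outputs = torch.randn(batch, src_len, enc_dim)
y_embed = torch.randn(batch, tgt_emb_dim)  # embedded previous target token
h = torch.zeros(batch, enc_dim)            # decoder hidden, initialized from the encoder
c = torch.zeros(batch, enc_dim)

decoder_cell = nn.LSTMCell(enc_dim + tgt_emb_dim, enc_dim)

# Dot-product attention of the decoder state over encoder outputs.
weights = torch.softmax(torch.bmm(encoder_outputs, h.unsqueeze(-1)).squeeze(-1), dim=-1)
context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)

h, c = decoder_cell(torch.cat([context, y_embed], dim=-1), (h, c))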
def test_batched_masked(self):
    attention = Attention()
    # Testing general batched masked case.
    vector = Variable(torch.FloatTensor([[0.3, 0.1, 0.5], [0.3, 0.1, 0.5]]))
    matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]],
                                         [[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]]]))
    mask = Variable(torch.FloatTensor([[1.0, 1.0, 0.0], [1.0, 0.0, 1.0]]))
    result = attention(vector, matrix, mask).data.numpy()
    assert_almost_equal(result, numpy.array([[0.52871835, 0.47128162, 0.0],
                                             [0.50749944, 0.0, 0.49250056]]))

    # Test the case where a mask is all 0s and an input is all 0s.
    vector = Variable(torch.FloatTensor([[0.0, 0.0, 0.0], [0.3, 0.1, 0.5]]))
    matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]],
                                         [[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]]]))
    mask = Variable(torch.FloatTensor([[1.0, 1.0, 0.0], [0.0, 0.0, 0.0]]))
    result = attention(vector, matrix, mask).data.numpy()
    assert_almost_equal(result, numpy.array([[0.5, 0.5, 0.0], [0.0, 0.0, 0.0]]))
def from_params(cls, vocab, params: Params) -> 'NlvrDirectSemanticParser':
    sentence_embedder_params = params.pop("sentence_embedder")
    sentence_embedder = TextFieldEmbedder.from_params(vocab, sentence_embedder_params)
    action_embedding_dim = params.pop_int('action_embedding_dim')
    encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
    dropout = params.pop_float('dropout', 0.0)
    input_attention = Attention.from_params(params.pop("attention"))
    decoder_beam_search = BeamSearch.from_params(params.pop("decoder_beam_search"))
    max_decoding_steps = params.pop_int("max_decoding_steps")
    params.assert_empty(cls.__name__)
    return cls(vocab,
               sentence_embedder=sentence_embedder,
               action_embedding_dim=action_embedding_dim,
               encoder=encoder,
               input_attention=input_attention,
               decoder_beam_search=decoder_beam_search,
               max_decoding_steps=max_decoding_steps,
               dropout=dropout)
def from_params(cls, vocab, params: Params) -> 'NlvrCoverageSemanticParser':
    sentence_embedder_params = params.pop("sentence_embedder")
    sentence_embedder = TextFieldEmbedder.from_params(vocab, sentence_embedder_params)
    action_embedding_dim = params.pop_int('action_embedding_dim')
    encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
    dropout = params.pop_float('dropout', 0.0)
    input_attention = Attention.from_params(params.pop("attention"))
    beam_size = params.pop_int('beam_size')
    max_num_finished_states = params.pop_int('max_num_finished_states', None)
    normalize_beam_score_by_length = params.pop_bool('normalize_beam_score_by_length', False)
    max_decoding_steps = params.pop_int("max_decoding_steps")
    checklist_cost_weight = params.pop_float("checklist_cost_weight", 0.6)
    dynamic_cost_weight = params.pop("dynamic_cost_weight", None)
    penalize_non_agenda_actions = params.pop_bool("penalize_non_agenda_actions", False)
    initial_mml_model_file = params.pop("initial_mml_model_file", None)
    params.assert_empty(cls.__name__)
    return cls(vocab,
               sentence_embedder=sentence_embedder,
               action_embedding_dim=action_embedding_dim,
               encoder=encoder,
               input_attention=input_attention,
               beam_size=beam_size,
               max_num_finished_states=max_num_finished_states,
               dropout=dropout,
               max_decoding_steps=max_decoding_steps,
               normalize_beam_score_by_length=normalize_beam_score_by_length,
               checklist_cost_weight=checklist_cost_weight,
               dynamic_cost_weight=dynamic_cost_weight,
               penalize_non_agenda_actions=penalize_non_agenda_actions,
               initial_mml_model_file=initial_mml_model_file)
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             target_namespace: str = "tokens",
             target_embedder: TextFieldEmbedder = None,
             attention_function: SimilarityFunction = None,
             scheduled_sampling_ratio: float = 0.25,
             pointer_gen: bool = True,
             language_model: bool = True,
             max_oovs: int = None) -> None:
    super(PointerGenerator, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    self._pointer_gen = pointer_gen
    self._language_model = language_model
    if self._pointer_gen:
        self._max_oovs = max_oovs
        self.vocab.set_max_oovs(self._max_oovs)
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # the end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    self._target_embedder = target_embedder or source_embedder
    # NOTE: attention is over the decoder output, not the decoder input.
    self._decoder_input_dim = self._target_embedder.get_output_dim()
    # The decoder is a unidirectional LSTM while the encoder is bidirectional,
    # so the decoder hidden size is half the encoder output size.
    self._decoder_hidden_dim = self._encoder.get_output_dim() // 2
    # Projection layers producing the decoder's initial h0 and c0 from the final encoder output.
    self.decode_h0_projection_layer = Linear(self._encoder.get_output_dim(),
                                             self._decoder_hidden_dim)
    self.decode_c0_projection_layer = Linear(self._encoder.get_output_dim(),
                                             self._decoder_hidden_dim)
    self._decoder_attention = Attention(self._attention_function)
    # The output of attention, a weighted average over encoder outputs, will be
    # concatenated to the decoder hidden state at each time step: V[s_t, h*_t] + b.
    self._decoder_output_dim = self._decoder_hidden_dim + self._encoder.get_output_dim()  # [s_t, h*_t]
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_hidden_dim)
    self._output_attention_layer = Linear(self._decoder_output_dim,
                                          self._decoder_hidden_dim)  # V[s_t, h*_t] + b
    self._output_projection_layer = Linear(self._decoder_hidden_dim, num_classes)  # num_classes -> V'
    # Generation probability.
    if self._pointer_gen:
        self._pointer_gen_layer = Linear(
            self._decoder_hidden_dim + self._encoder.get_output_dim() + self._decoder_input_dim, 1)
    # Metrics.
    self.metrics = {
        "ROUGE-1": Rouge(1),
        "ROUGE-2": Rouge(2),
    }
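# Hedged sketch of the generation-probability head built above (pointer-generator
# in the style of See et al. 2017); dimensions are illustrative. p_gen gates
# between generating from the vocabulary and copying from the source, and is a
# sigmoid over [decoder_hidden; attended_context; decoder_input].
import torch
import torch.nn as nn

decoder_hidden_dim, encoder_output_dim, decoder_input_dim = 8, 16, 6
pointer_gen_layer = nn.Linear(decoder_hidden_dim + encoder_output_dim + decoder_input_dim, 1)

s_t = torch.randn(2, decoder_hidden_dim)       # decoder state
h_star_t = torch.randn(2, encoder_output_dim)  # attention-weighted encoder context
x_t = torch.randn(2, decoder_input_dim)        # embedded decoder input

p_gen = torch.sigmoid(pointer_gen_layer(torch.cat([s_t, h_star_t, x_t], dim=-1)))
# Final distribution, schematically: p_gen * P_vocab + (1 - p_gen) * attention_copy_dist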
def from_params(cls, vocab: Vocabulary, params: Params) -> 'MtGan':  # type: ignore
    # pylint: disable=arguments-differ
    vocab_namespace_A = params.pop("vocab_namespace_A", "vocab_A")
    vocab_namespace_B = params.pop("vocab_namespace_B", "vocab_B")
    num_classes_A = vocab.get_vocab_size(namespace=vocab_namespace_A)
    num_classes_B = vocab.get_vocab_size(namespace=vocab_namespace_B)

    params_generators = params.pop("generators")
    if params_generators.pop("type") == "rnn2rnn":
        generators_embedding_dim = params_generators.pop("embedding_dim")
        embedding_A_generator = Embedding(num_embeddings=num_classes_A,
                                          embedding_dim=generators_embedding_dim)
        embedding_B_generator = Embedding(num_embeddings=num_classes_B,
                                          embedding_dim=generators_embedding_dim)
        params_encoder_generators = params_generators.pop("encoder")
        generator_A_to_B_encoder = Seq2SeqEncoder.from_params(params_encoder_generators.duplicate())
        generator_B_to_A_encoder = Seq2SeqEncoder.from_params(params_encoder_generators.duplicate())
        generator_attention_params = params_generators.pop("attention")
        attention_generator_A_to_B = Attention.from_params(generator_attention_params.duplicate())
        attention_generator_B_to_A = Attention.from_params(generator_attention_params.duplicate())
        generators_max_decoding_steps = params_generators.pop("max_decoding_steps")
        generator_A_to_B = Rnn2Rnn(vocab=vocab,
                                   source_embedding=embedding_A_generator,
                                   target_embedding=embedding_B_generator,
                                   encoder=generator_A_to_B_encoder,
                                   max_decoding_steps=generators_max_decoding_steps,
                                   target_namespace=vocab_namespace_B,
                                   attention=attention_generator_A_to_B)
        generator_B_to_A = Rnn2Rnn(vocab=vocab,
                                   source_embedding=embedding_B_generator,
                                   target_embedding=embedding_A_generator,
                                   encoder=generator_B_to_A_encoder,
                                   max_decoding_steps=generators_max_decoding_steps,
                                   target_namespace=vocab_namespace_A,
                                   attention=attention_generator_B_to_A)
    else:
        raise ConfigurationError(message="This generators model type is not supported")

    discriminators_params = params.pop("discriminators")
    if discriminators_params.pop("type") == "seq2prob":
        params_encoder_discriminators = discriminators_params.pop("encoder")
        discriminator_A_encoder = Seq2VecEncoder.from_params(params_encoder_discriminators.duplicate())
        discriminator_B_encoder = Seq2VecEncoder.from_params(params_encoder_discriminators.duplicate())
        discriminators_embedding_dim = discriminators_params.pop("embedding_dim")
        embedding_A_discriminator = Embedding(num_classes_A, discriminators_embedding_dim)
        embedding_B_discriminator = Embedding(num_classes_B, discriminators_embedding_dim)
        discriminator_A = Seq2Prob(vocab=vocab, encoder=discriminator_A_encoder,
                                   embedding=embedding_A_discriminator)
        discriminator_B = Seq2Prob(vocab=vocab, encoder=discriminator_B_encoder,
                                   embedding=embedding_B_discriminator)
    else:
        raise ConfigurationError(message="This discriminators model type is not supported")

    return cls(vocab=vocab,
               generator_A_to_B=generator_A_to_B,
               generator_B_to_A=generator_B_to_A,
               discriminator_A=discriminator_A,
               discriminator_B=discriminator_B,
               vocab_namespace_A=vocab_namespace_A,
               vocab_namespace_B=vocab_namespace_B)
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             target_namespace: str = "tokens",
             target_embedder: TextFieldEmbedder = None,
             attention_function: SimilarityFunction = None,
             scheduled_sampling_ratio: float = 0.25) -> None:
    super(PointerGeneratorPattern, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    self._pattern_pos = ['@@np@@', '@@ns@@', '@@ni@@', '@@nz@@', '@@m@@',
                         '@@i@@', '@@id@@', '@@t@@', '@@j@@']
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    self._target_embedder = target_embedder or source_embedder
    # NOTE: attention is over the decoder output, not the decoder input.
    self._decoder_input_dim = self._target_embedder.get_output_dim()
    # The decoder is a unidirectional LSTM while the encoder is bidirectional.
    self._decoder_hidden_dim = self._encoder.get_output_dim()
    # Projection layers producing the decoder's initial h0 and c0 from the final encoder output.
    self.decode_h0_projection_layer = Linear(self._encoder.get_output_dim(),
                                             self._decoder_hidden_dim)
    self.decode_c0_projection_layer = Linear(self._encoder.get_output_dim(),
                                             self._decoder_hidden_dim)
    self._decoder_attention = Attention(self._attention_function)
    # The output of attention, a weighted average over encoder outputs, will be
    # concatenated to the decoder hidden state at each time step: V[s_t, h*_t] + b.
    self._decoder_output_dim = self._decoder_hidden_dim + self._encoder.get_output_dim()  # [s_t, h*_t]
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_hidden_dim)
    self._output_attention_layer = Linear(self._decoder_output_dim,
                                          self._decoder_hidden_dim)  # V[s_t, h*_t] + b
    self._output_projection_layer = Linear(self._decoder_hidden_dim, num_classes)  # num_classes -> V'
    # Generation probability.
    self._pointer_gen_layer = Linear(
        self._decoder_hidden_dim + self._encoder.get_output_dim() + self._decoder_input_dim, 1)
    # Metrics.
    self.metrics = {
        "ROUGE-1": Rouge(1),
        "ROUGE-2": Rouge(2),
    }
import pytest
import torch

from allennlp.modules import Attention
from allennlp.modules.attention import BilinearAttention, AdditiveAttention, LinearAttention


@pytest.mark.parametrize("attention_type", Attention.list_available())
def test_all_attention_works_the_same(attention_type: str):
    module_cls = Attention.by_name(attention_type)

    vector = torch.FloatTensor([[-7, -8, -9]])
    matrix = torch.FloatTensor([[[1, 2, 3], [4, 5, 6]]])

    if module_cls in {BilinearAttention, AdditiveAttention, LinearAttention}:
        module = module_cls(vector.size(-1), matrix.size(-1))
    else:
        module = module_cls()

    output = module(vector, matrix)
    assert tuple(output.size()) == (1, 2)
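# Hedged usage note: the registry calls exercised by the test above can also be
# used directly; `Attention.by_name("dot_product")()` (the same call that appears
# in the transition-function tests below) builds a parameter-free dot-product
# attention. Inputs here are illustrative.
import torch
from allennlp.modules import Attention

attention = Attention.by_name("dot_product")()
vector = torch.FloatTensor([[0.3, 0.1, 0.5]])
matrix = torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2]]])
weights = attention(vector, matrix)  # normalized attention weights, shape (1, 2)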
def test_can_build_from_params(self):
    params = Params({'similarity_function': {'type': 'cosine'}, 'normalize': False})
    attention = Attention.from_params(params)
    # pylint: disable=protected-access
    assert attention._similarity_function.__class__.__name__ == 'CosineSimilarity'
    assert attention._normalize is False
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             use_attention: bool,
             seq2seq_encoder: Seq2SeqEncoder,
             seq2vec_encoder: Seq2VecEncoder,
             span_end_encoder_after: Seq2SeqEncoder,
             use_decoder_trainer: bool,
             decoder_beam_search: BeamSearch,
             kb_configs: dict,
             other_configs: dict,
             initializer: InitializerApplicator) -> None:
    super(ProStructModel, self).__init__(vocab)
    self.text_field_embedder = text_field_embedder
    # The number of actions is hardcoded here; they are defined in the Action enum
    # in propara_dataset_reader.py.
    self.num_actions = len(Action)
    self.other_configs = other_configs

    # kb_coefficient * kb_score + (1 - kb_coefficient) * model_score
    self.kb_coefficient = torch.nn.Parameter(
        torch.ones(1).mul(kb_configs.get('kb_coefficient', 0.5)))

    self.use_attention = use_attention
    self.use_decoder_trainer = use_decoder_trainer

    if self.use_attention:
        self.seq2seq_encoder = seq2seq_encoder
        self.time_distributed_seq2seq_encoder = TimeDistributed(
            TimeDistributed(self.seq2seq_encoder))
        self.time_distributed_attention_layer = \
            TimeDistributed(TimeDistributed(
                Attention(similarity_function=BilinearSimilarity(
                    2 * seq2seq_encoder.get_output_dim(),
                    seq2seq_encoder.get_output_dim()),
                          normalize=True)))
        self.aggregate_feedforward = Linear(seq2seq_encoder.get_output_dim(),
                                            self.num_actions)
    else:
        self.seq2vec_encoder = seq2vec_encoder
        self.time_distributed_seq2vec_encoder = TimeDistributed(
            TimeDistributed(self.seq2vec_encoder))
        self.aggregate_feedforward = Linear(seq2vec_encoder.get_output_dim(),
                                            self.num_actions)

    self.span_end_encoder_after = span_end_encoder_after  # per step, per participant
    self.time_distributed_encoder_span_end_after = TimeDistributed(
        TimeDistributed(self.span_end_encoder_after))

    # Fixme: dimensions
    self._span_start_predictor_after = TimeDistributed(TimeDistributed(
        torch.nn.Linear(2 + 2 * seq2seq_encoder.get_output_dim(), 1)))
    self._span_end_predictor_after = TimeDistributed(TimeDistributed(
        torch.nn.Linear(span_end_encoder_after.get_output_dim(), 1)))

    self._type_accuracy = BooleanAccuracy()  # Fixme WRONG. Categorical accuracy should be right!
    self._loss = torch.nn.CrossEntropyLoss(ignore_index=-1)  # Fixme: This is less robust. If the masking value
    # Fixme: add a metric for location span strings
    self.span_metric = SquadEmAndF1()

    if self.use_decoder_trainer:
        self.decoder_trainer = MaximumMarginalLikelihood()
        if kb_configs['kb_to_use'] == 'lexicalkb':
            kb = KBLexical(
                lexical_kb_path=kb_configs['lexical_kb_path'],
                fullgrid_prompts_load_path=kb_configs['fullgrid_prompts_load_path'])

        # Makeshift arrangement to get the number of participants in tiny.tsv.
        self.commonsense_based_action_generator = CommonsenseBasedActionGenerator(
            self.num_actions)
        self.rules_activated = [
            int(rule_val.strip()) > 0
            for rule_val in self.other_configs.get('constraint_rules_to_turn_on',
                                                   '0,0,0,1').split(",")
        ]
        self.rule_2_fraction_participants = self.other_configs.get(
            'rule_2_fraction_participants', 0.5)
        self.rule_3_fraction_steps = self.other_configs.get('rule_3_fraction_steps', 0.5)
        self.commonsense_based_action_generator.set_rules_used(
            self.rules_activated,
            self.rule_2_fraction_participants,
            self.rule_3_fraction_steps)
        # [self.rules_activated[0],  # C/D/C/D cannot happen
        #  self.rules_activated[1],  # > 1/2 participants
        #  self.rules_activated[2],  # > 1/2 steps cannot change
        #  self.rules_activated[3],  # until mentioned
        # ]
        self.decoder_step = ProParaDecoderStep(
            KBBasedActionScorer(kb=kb, kb_coefficient=self.kb_coefficient),
            valid_action_generator=self.commonsense_based_action_generator)

    self.beam_search = decoder_beam_search
    initializer(self)
def setUp(self):
    super().setUp()
    self.decoder_step = BasicTransitionFunction(encoder_output_dim=2,
                                                action_embedding_dim=2,
                                                input_attention=Attention.by_name('dot_product')(),
                                                num_start_types=3,
                                                add_action_bias=False)
    batch_indices = [0, 1, 0]
    action_history = [[1], [3, 4], []]
    score = [torch.FloatTensor([x]) for x in [.1, 1.1, 2.2]]
    hidden_state = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    memory_cell = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    previous_action_embedding = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    attended_question = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    # This maps non-terminals to valid actions, where the valid actions are grouped by _type_.
    # We have "global" actions, which are from the global grammar, and "linked" actions, which
    # are instance-specific and are generated based on question attention. Each action type
    # has a tuple which is (input representation, output representation, action ids).
    valid_actions = {
        'e': {
            'global': (torch.FloatTensor([[0, 0], [-1, -1], [-2, -2]]),
                       torch.FloatTensor([[-1, -1], [-2, -2], [-3, -3]]),
                       [0, 1, 2]),
            'linked': (torch.FloatTensor([[.1, .2, .3], [.4, .5, .6]]),
                       torch.FloatTensor([[3, 3], [4, 4]]),
                       [3, 4])
        },
        'd': {
            'global': (torch.FloatTensor([[0, 0]]),
                       torch.FloatTensor([[-1, -1]]),
                       [0]),
            'linked': (torch.FloatTensor([[-.1, -.2, -.3], [-.4, -.5, -.6], [-.7, -.8, -.9]]),
                       torch.FloatTensor([[5, 5], [6, 6], [7, 7]]),
                       [1, 2, 3])
        }
    }
    grammar_state = [GrammarStatelet([nonterminal], valid_actions, is_nonterminal)
                     for _, nonterminal in zip(batch_indices, ['e', 'd', 'e'])]
    self.encoder_outputs = torch.FloatTensor([[[1, 2], [3, 4], [5, 6]],
                                              [[10, 11], [12, 13], [14, 15]]])
    self.encoder_output_mask = torch.FloatTensor([[1, 1, 1], [1, 1, 0]])
    self.possible_actions = [[('e -> f', False, None),
                              ('e -> g', True, None),
                              ('e -> h', True, None),
                              ('e -> i', True, None),
                              ('e -> j', True, None)],
                             [('d -> q', True, None),
                              ('d -> g', True, None),
                              ('d -> h', True, None),
                              ('d -> i', True, None)]]
    rnn_state = []
    for i in range(len(batch_indices)):
        rnn_state.append(RnnStatelet(hidden_state[i],
                                     memory_cell[i],
                                     previous_action_embedding[i],
                                     attended_question[i],
                                     self.encoder_outputs,
                                     self.encoder_output_mask))
    self.state = GrammarBasedState(batch_indices=batch_indices,
                                   action_history=action_history,
                                   score=score,
                                   rnn_state=rnn_state,
                                   grammar_state=grammar_state,
                                   possible_actions=self.possible_actions)
def setUp(self):
    super().setUp()
    self.decoder_step = BasicTransitionFunction(
        encoder_output_dim=2,
        action_embedding_dim=2,
        input_attention=Attention.by_name("dot_product")(),
        add_action_bias=False,
    )
    batch_indices = [0, 1, 0]
    action_history = [[1], [3, 4], []]
    score = [torch.FloatTensor([x]) for x in [0.1, 1.1, 2.2]]
    hidden_state = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    memory_cell = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    previous_action_embedding = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    attended_question = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    # This maps non-terminals to valid actions, where the valid actions are grouped by _type_.
    # We have "global" actions, which are from the global grammar, and "linked" actions, which
    # are instance-specific and are generated based on question attention. Each action type
    # has a tuple which is (input representation, output representation, action ids).
    valid_actions = {
        "e": {
            "global": (
                torch.FloatTensor([[0, 0], [-1, -1], [-2, -2]]),
                torch.FloatTensor([[-1, -1], [-2, -2], [-3, -3]]),
                [0, 1, 2],
            ),
            "linked": (
                torch.FloatTensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]),
                torch.FloatTensor([[3, 3], [4, 4]]),
                [3, 4],
            ),
        },
        "d": {
            "global": (torch.FloatTensor([[0, 0]]), torch.FloatTensor([[-1, -1]]), [0]),
            "linked": (
                torch.FloatTensor([[-0.1, -0.2, -0.3], [-0.4, -0.5, -0.6], [-0.7, -0.8, -0.9]]),
                torch.FloatTensor([[5, 5], [6, 6], [7, 7]]),
                [1, 2, 3],
            ),
        },
    }
    grammar_state = [
        GrammarStatelet([nonterminal], valid_actions, is_nonterminal)
        for _, nonterminal in zip(batch_indices, ["e", "d", "e"])
    ]
    self.encoder_outputs = torch.FloatTensor([[[1, 2], [3, 4], [5, 6]],
                                              [[10, 11], [12, 13], [14, 15]]])
    self.encoder_output_mask = torch.FloatTensor([[1, 1, 1], [1, 1, 0]])
    self.possible_actions = [
        [
            ("e -> f", False, None),
            ("e -> g", True, None),
            ("e -> h", True, None),
            ("e -> i", True, None),
            ("e -> j", True, None),
        ],
        [
            ("d -> q", True, None),
            ("d -> g", True, None),
            ("d -> h", True, None),
            ("d -> i", True, None),
        ],
    ]
    rnn_state = []
    for i in range(len(batch_indices)):
        rnn_state.append(
            RnnStatelet(
                hidden_state[i],
                memory_cell[i],
                previous_action_embedding[i],
                attended_question[i],
                self.encoder_outputs,
                self.encoder_output_mask,
            ))
    self.state = GrammarBasedState(
        batch_indices=batch_indices,
        action_history=action_history,
        score=score,
        rnn_state=rnn_state,
        grammar_state=grammar_state,
        possible_actions=self.possible_actions,
    )
def setUp(self):
    super().setUp()
    self.decoder_step = BasicTransitionFunction(
        encoder_output_dim=2,
        action_embedding_dim=2,
        input_attention=Attention.by_name('dot_product')(),
        num_start_types=3,
        add_action_bias=False)
    batch_indices = [0, 1, 0]
    action_history = [[1], [3, 4], []]
    score = [torch.FloatTensor([x]) for x in [.1, 1.1, 2.2]]
    hidden_state = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    memory_cell = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    previous_action_embedding = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    attended_question = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
    # This maps non-terminals to valid actions, where the valid actions are grouped by _type_.
    # We have "global" actions, which are from the global grammar, and "linked" actions, which
    # are instance-specific and are generated based on question attention. Each action type
    # has a tuple which is (input representation, output representation, action ids).
    valid_actions = {
        'e': {
            'global': (torch.FloatTensor([[0, 0], [-1, -1], [-2, -2]]),
                       torch.FloatTensor([[-1, -1], [-2, -2], [-3, -3]]),
                       [0, 1, 2]),
            'linked': (torch.FloatTensor([[.1, .2, .3], [.4, .5, .6]]),
                       torch.FloatTensor([[3, 3], [4, 4]]),
                       [3, 4])
        },
        'd': {
            'global': (torch.FloatTensor([[0, 0]]),
                       torch.FloatTensor([[-1, -1]]),
                       [0]),
            'linked': (torch.FloatTensor([[-.1, -.2, -.3], [-.4, -.5, -.6], [-.7, -.8, -.9]]),
                       torch.FloatTensor([[5, 5], [6, 6], [7, 7]]),
                       [1, 2, 3])
        }
    }
    grammar_state = [
        GrammarState([nonterminal], {}, valid_actions, {}, is_nonterminal)
        for _, nonterminal in zip(batch_indices, ['e', 'd', 'e'])
    ]
    self.encoder_outputs = torch.FloatTensor([[[1, 2], [3, 4], [5, 6]],
                                              [[10, 11], [12, 13], [14, 15]]])
    self.encoder_output_mask = torch.FloatTensor([[1, 1, 1], [1, 1, 0]])
    self.possible_actions = [[('e -> f', False, None),
                              ('e -> g', True, None),
                              ('e -> h', True, None),
                              ('e -> i', True, None),
                              ('e -> j', True, None)],
                             [('d -> q', True, None),
                              ('d -> g', True, None),
                              ('d -> h', True, None),
                              ('d -> i', True, None)]]
    rnn_state = []
    for i in range(len(batch_indices)):
        rnn_state.append(
            RnnState(hidden_state[i],
                     memory_cell[i],
                     previous_action_embedding[i],
                     attended_question[i],
                     self.encoder_outputs,
                     self.encoder_output_mask))
    self.state = GrammarBasedDecoderState(
        batch_indices=batch_indices,
        action_history=action_history,
        score=score,
        rnn_state=rnn_state,
        grammar_state=grammar_state,
        possible_actions=self.possible_actions)