Example #1
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 question_encoder: Seq2SeqEncoder,
                 passage_encoder: Seq2SeqEncoder,
                 r: float = 0.8,
                 dropout: float = 0.1,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(EvidenceExtraction, self).__init__(vocab, regularizer)

        self._embedder = embedder

        self._question_encoder = question_encoder
        self._passage_encoder = passage_encoder

        # size: 2H
        encoding_dim = question_encoder.get_output_dim()

        self._gru_cell = nn.GRUCell(2 * encoding_dim, encoding_dim)

        self._gate = nn.Linear(2 * encoding_dim, 2 * encoding_dim)

        self._match_layer_1 = nn.Linear(2 * encoding_dim, encoding_dim)
        self._match_layer_2 = nn.Linear(encoding_dim, 1)

        self._question_attention_for_passage = Attention(
            NonlinearSimilarity(encoding_dim))
        self._question_attention_for_question = Attention(
            NonlinearSimilarity(encoding_dim))
        self._passage_attention_for_answer = Attention(
            NonlinearSimilarity(encoding_dim), normalize=False)
        self._passage_attention_for_ranking = Attention(
            NonlinearSimilarity(encoding_dim))

        self._passage_self_attention = Attention(
            NonlinearSimilarity(encoding_dim))
        self._self_gru_cell = nn.GRUCell(2 * encoding_dim, encoding_dim)
        self._self_gate = nn.Linear(2 * encoding_dim, encoding_dim)

        self._answer_net = nn.GRUCell(encoding_dim, encoding_dim)

        self._v_r_Q = nn.Parameter(torch.rand(encoding_dim))
        self._r = r

        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._squad_metrics = SquadEmAndF1()

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x

        initializer(self)
Example #2
    def __init__(self, vocab_size, max_len, embed_size, hidden_size, sos_id=2, eos_id=3, n_layers=1, rnn_cell='GRU',
            input_dropout_p=0, dropout_p=0, use_attention=False):
        super(Decoder, self).__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.input_dropout = nn.Dropout(p=input_dropout_p)
        if rnn_cell == 'LSTM':
            self.rnn_cell = nn.LSTM
        elif rnn_cell == 'GRU':
            self.rnn_cell = nn.GRU
        else:
            raise ValueError("Unsupported RNN Cell: {0}".format(rnn_cell))
        self.rnn = self.rnn_cell(embed_size, hidden_size, n_layers, batch_first=True, dropout=dropout_p)

        self.output_size = vocab_size
        self.max_length = max_len
        self.use_attention = use_attention
        self.eos_id = eos_id
        self.sos_id = sos_id

        self.init_input = None

        self.embedding = nn.Embedding(self.output_size, embed_size)
        if use_attention:
            self.attention = Attention(self.hidden_size)

        self.out = nn.Linear(self.hidden_size, self.output_size)
Example #3
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 seq2seq_encoder: Seq2SeqEncoder,
                 initializer: InitializerApplicator) -> None:
        super(ProLocalModel, self).__init__(vocab)

        self.text_field_embedder = text_field_embedder
        self.seq2seq_encoder = seq2seq_encoder

        self.attention_layer = \
            Attention(similarity_function=BilinearSimilarity(2 * seq2seq_encoder.get_output_dim(),
                                                             seq2seq_encoder.get_output_dim()), normalize=True)

        self.num_types = self.vocab.get_vocab_size("state_change_type_labels")
        self.aggregate_feedforward = Linear(seq2seq_encoder.get_output_dim(),
                                            self.num_types)

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace="state_change_tags")  # by default "O" is ignored in metric computation
        self.num_tags = self.vocab.get_vocab_size("state_change_tags")

        self.tag_projection_layer = TimeDistributed(
            Linear(self.seq2seq_encoder.get_output_dim() + 2, self.num_tags))
        self._type_accuracy = CategoricalAccuracy()

        self.type_f1_metrics = {}
        self.type_labels_vocab = self.vocab.get_index_to_token_vocabulary("state_change_type_labels")
        for type_label in self.type_labels_vocab.values():
            self.type_f1_metrics["type_" + type_label] = F1Measure(self.vocab.get_token_index(type_label, "state_change_type_labels"))

        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)
Example #4
    def __init__(self,
                 encoder_output_dim: int,
                 action_embedding_dim: int,
                 attention_function: SimilarityFunction,
                 checklist_size: int = None) -> None:
        super(NlvrDecoderStep, self).__init__()
        self._input_attention = Attention(attention_function)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        output_dim = encoder_output_dim
        input_dim = output_dim
        # Our decoder input will be the concatenation of the decoder hidden state and the previous
        # action embedding, and we'll project that down to the decoder's `input_dim`, which we
        # arbitrarily set to be the same as `output_dim`.
        self._input_projection_layer = Linear(
            output_dim + action_embedding_dim, input_dim)
        # Before making a prediction, we'll compute an attention over the input given our updated
        # hidden state, and optionally a difference between the current checklist vector and its
        # target, if we are training to maximize coverage using a checklist. Then we concatenate
        # those with the decoder state and project to `action_embedding_dim` to make a prediction.
        if checklist_size is None:
            self._output_projection_layer = Linear(
                output_dim + encoder_output_dim, action_embedding_dim)
        else:
            self._output_projection_layer = Linear(
                output_dim + encoder_output_dim + checklist_size,
                action_embedding_dim)

        # TODO(pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(input_dim, output_dim)
Example #5
 def from_params(cls, vocab, params: Params) -> 'WikiTablesMmlSemanticParser':
     question_embedder = TextFieldEmbedder.from_params(vocab, params.pop("question_embedder"))
     action_embedding_dim = params.pop_int("action_embedding_dim")
     encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
     entity_encoder = Seq2VecEncoder.from_params(params.pop('entity_encoder'))
     max_decoding_steps = params.pop_int("max_decoding_steps")
     mixture_feedforward_type = params.pop('mixture_feedforward', None)
     if mixture_feedforward_type is not None:
         mixture_feedforward = FeedForward.from_params(mixture_feedforward_type)
     else:
         mixture_feedforward = None
     decoder_beam_search = BeamSearch.from_params(params.pop("decoder_beam_search"))
     input_attention = Attention.from_params(params.pop("attention"))
     training_beam_size = params.pop_int('training_beam_size', None)
     use_neighbor_similarity_for_linking = params.pop_bool('use_neighbor_similarity_for_linking', False)
     dropout = params.pop_float('dropout', 0.0)
     num_linking_features = params.pop_int('num_linking_features', 10)
     tables_directory = params.pop('tables_directory', '/wikitables/')
     rule_namespace = params.pop('rule_namespace', 'rule_labels')
     params.assert_empty(cls.__name__)
     return cls(vocab,
                question_embedder=question_embedder,
                action_embedding_dim=action_embedding_dim,
                encoder=encoder,
                entity_encoder=entity_encoder,
                mixture_feedforward=mixture_feedforward,
                decoder_beam_search=decoder_beam_search,
                max_decoding_steps=max_decoding_steps,
                input_attention=input_attention,
                training_beam_size=training_beam_size,
                use_neighbor_similarity_for_linking=use_neighbor_similarity_for_linking,
                dropout=dropout,
                num_linking_features=num_linking_features,
                tables_directory=tables_directory,
                rule_namespace=rule_namespace)
Example #6
    def __init__(self,
                 encoder_output_dim: int,
                 action_embedding_dim: int,
                 attention_function: SimilarityFunction,
                 dropout: float = 0.0,
                 use_coverage: bool = False) -> None:
        super(NlvrDecoderStep, self).__init__()
        self._input_attention = Attention(attention_function)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        output_dim = encoder_output_dim
        input_dim = output_dim
        # Our decoder input will be the concatenation of the decoder hidden state and the previous
        # action embedding, and we'll project that down to the decoder's `input_dim`, which we
        # arbitrarily set to be the same as `output_dim`.
        self._input_projection_layer = Linear(
            output_dim + action_embedding_dim, input_dim)
        # Before making a prediction, we'll compute an attention over the input given our updated
        # hidden state. Then we concatenate those with the decoder state and project to
        # `action_embedding_dim` to make a prediction.
        self._output_projection_layer = Linear(output_dim + encoder_output_dim,
                                               action_embedding_dim)
        if use_coverage:
            # This is a multiplicative factor that is used to add the embeddings of yet to be
            # produced actions to the predicted embedding and bias it.
            self._checklist_embedding_multiplier = Parameter(
                torch.FloatTensor([1.0]))
        # TODO(pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(input_dim, output_dim)
        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
Example #7
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "target_tags",
                 target_embedding_dim: int = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SimpleSeq2SeqCrf, self).__init__(vocab, regularizer)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        self._crf = ConditionalRandomField(num_classes)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim()
        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        if self._attention_function:
            self._decoder_attention = Attention(self._attention_function)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time step.
            self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
        else:
            self._decoder_input_dim = target_embedding_dim
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)
        # self._decoder_cell = GRUCell(self._decoder_input_dim, self._decoder_output_dim, bias=False)
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self.span_metric = SpanBasedF1Measure(
            vocab,
            tag_namespace=target_namespace,
            ignore_classes=[START_SYMBOL[2:], END_SYMBOL[2:]])
        initializer(self)

Example #8
 def test_masked(self):
     attention = Attention()
     # Testing general masked non-batched case.
     vector = Variable(torch.FloatTensor([[0.3, 0.1, 0.5]]))
     matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.1, 0.4, 0.3]]]))
     mask = Variable(torch.FloatTensor([[1.0, 0.0, 1.0]]))
     result = attention(vector, matrix, mask).data.numpy()
     assert_almost_equal(result, numpy.array([[0.52248482, 0.0, 0.47751518]]))
Example #9
    def __init__(self,
                 encoder_output_dim: int,
                 action_embedding_dim: int,
                 attention_function: SimilarityFunction,
                 num_start_types: int,
                 num_entity_types: int,
                 mixture_feedforward: FeedForward = None,
                 dropout: float = 0.0,
                 unlinked_terminal_indices: List[int] = None) -> None:
        super(WikiTablesDecoderStep, self).__init__()
        self._mixture_feedforward = mixture_feedforward
        self._entity_type_embedding = Embedding(num_entity_types,
                                                action_embedding_dim)
        self._input_attention = Attention(attention_function)

        self._num_start_types = num_start_types
        self._start_type_predictor = Linear(encoder_output_dim,
                                            num_start_types)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        output_dim = encoder_output_dim
        input_dim = output_dim
        # Our decoder input will be the concatenation of the decoder hidden state and the previous
        # action embedding, and we'll project that down to the decoder's `input_dim`, which we
        # arbitrarily set to be the same as `output_dim`.
        self._input_projection_layer = Linear(
            output_dim + action_embedding_dim, input_dim)
        # Before making a prediction, we'll compute an attention over the input given our updated
        # hidden state, and optionally a difference between the current checklist vector and its
        # target, if we are training to maximize coverage using a checklist. Then we concatenate
        # those with the decoder state and project to `action_embedding_dim` to make a prediction.
        if unlinked_terminal_indices is None:
            self._output_projection_layer = Linear(
                output_dim + encoder_output_dim, action_embedding_dim)
        else:
            unlinked_checklist_size = len(unlinked_terminal_indices)
            self._output_projection_layer = Linear(
                output_dim + encoder_output_dim + unlinked_checklist_size,
                action_embedding_dim)

        self._unlinked_terminal_indices = unlinked_terminal_indices
        # TODO(pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(input_dim, output_dim)

        if mixture_feedforward is not None:
            check_dimensions_match(output_dim,
                                   mixture_feedforward.get_input_dim(),
                                   "hidden state embedding dim",
                                   "mixture feedforward input dim")
            check_dimensions_match(mixture_feedforward.get_output_dim(), 1,
                                   "mixture feedforward output dim",
                                   "dimension for scalar value")

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
Example #10
 def test_non_normalized_attention_works(self):
     attention = Attention(normalize=False)
     sentence_tensor = Variable(torch.FloatTensor([[[-1, 0, 4],
                                                    [1, 1, 1],
                                                    [-1, 0, 4],
                                                    [-1, 0, -1]]]))
     query_tensor = Variable(torch.FloatTensor([[.1, .8, .5]]))
     result = attention(query_tensor, sentence_tensor).data.numpy()
     assert_almost_equal(result, [[1.9, 1.4, 1.9, -.6]])
Example #11
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 spans_per_word: float,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.0,
                 spans_extractor: SpanExtractor = None,
                 spans_scorer_feedforward: FeedForward = None) -> None:
        super(SpanAe, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim() + 1
        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        if self._attention_function:
            self._decoder_attention = Attention(self._attention_function)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time step.
            self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
        else:
            self._decoder_input_dim = target_embedding_dim
        self._decoder_cell = LSTMCell(self._decoder_input_dim + 1,
                                      self._decoder_output_dim)
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)

        self._span_extractor = spans_extractor

        feedforward_scorer = torch.nn.Sequential(
            TimeDistributed(spans_scorer_feedforward),
            TimeDistributed(
                torch.nn.Linear(spans_scorer_feedforward.get_output_dim(), 1)))
        self._span_pruner = SpanPruner(feedforward_scorer)

        self._spans_per_word = spans_per_word
Example #12
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 question_encoder: Seq2SeqEncoder,
                 passage_encoder: Seq2SeqEncoder,
                 feed_forward: FeedForward,
                 dropout: float = 0.1,
                 num_decoding_steps: int = 40,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(AnswerSynthesis, self).__init__(vocab, regularizer)
        self._vocab = vocab
        self._vocab_size = vocab.get_vocab_size()  # default: tokens
        self._num_decoding_steps = num_decoding_steps
        self._start_token_index = self._vocab.get_token_index(START_SYMBOL)
        self._end_token_index = self._vocab.get_token_index(END_SYMBOL)

        self._embedder = embedder
        self._question_encoder = question_encoder
        self._passage_encoder = passage_encoder

        encoding_dim = question_encoder.get_output_dim()
        embedding_dim = embedder.get_output_dim()

        self._span_start_embedding = nn.Embedding(2, 50)
        self._span_end_embedding = nn.Embedding(2, 50)
        self._gru_decoder = nn.GRUCell(encoding_dim + embedding_dim,
                                       encoding_dim)
        self._feed_forward = feed_forward

        self._attention = Attention(NonlinearSimilarity(encoding_dim))

        self._W_r = nn.Linear(embedding_dim, encoding_dim, bias=False)
        self._U_r = nn.Linear(encoding_dim, encoding_dim, bias=False)
        self._V_r = nn.Linear(encoding_dim, encoding_dim, bias=False)

        self._max_out = Maxout(encoding_dim,
                               num_layers=1,
                               output_dims=int(encoding_dim / 2),
                               pool_sizes=2)
        self._W_o = nn.Linear(int(encoding_dim / 2),
                              self._vocab_size,
                              bias=False)

        self._squad_metrics = SquadEmAndF1()
        #self._predict_acc = CategoricalAccuracy()

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x

        initializer(self)
        self._num_iter = 0
Example #13
    def test_batched_no_mask(self):
        attention = Attention()

        # Testing general batched case.
        vector = Variable(torch.FloatTensor([[0.3, 0.1, 0.5], [0.3, 0.1, 0.5]]))
        matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2]],
                                             [[0.6, 0.8, 0.1], [0.15, 0.5, 0.2]]]))

        result = attention(vector, matrix).data.numpy()
        assert_almost_equal(result, numpy.array([[0.52871835, 0.47128162],
                                                 [0.52871835, 0.47128162]]))
Example #14
def test_all_attention_works_the_same(attention_type: str):
    module_cls = Attention.by_name(attention_type)

    vector = torch.FloatTensor([[-7, -8, -9]])
    matrix = torch.FloatTensor([[[1, 2, 3], [4, 5, 6]]])

    if module_cls in {BilinearAttention, AdditiveAttention, LinearAttention}:
        module = module_cls(vector.size(-1), matrix.size(-1))
    else:
        module = module_cls()

    output = module(vector, matrix)
    assert tuple(output.size()) == (1, 2)
Example #15
 def from_params(cls, vocab,
                 params: Params) -> 'WikiTablesErmSemanticParser':
     question_embedder = TextFieldEmbedder.from_params(
         vocab, params.pop("question_embedder"))
     action_embedding_dim = params.pop_int("action_embedding_dim")
     encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
     entity_encoder = Seq2VecEncoder.from_params(
         params.pop('entity_encoder'))
     mixture_feedforward_type = params.pop('mixture_feedforward', None)
     if mixture_feedforward_type is not None:
         mixture_feedforward = FeedForward.from_params(
             mixture_feedforward_type)
     else:
         mixture_feedforward = None
     input_attention = Attention.from_params(params.pop("attention"))
     decoder_beam_size = params.pop_int("decoder_beam_size")
     decoder_num_finished_states = params.pop_int(
         "decoder_num_finished_states", None)
     max_decoding_steps = params.pop_int("max_decoding_steps")
     normalize_beam_score_by_length = params.pop(
         "normalize_beam_score_by_length", False)
     use_neighbor_similarity_for_linking = params.pop_bool(
         "use_neighbor_similarity_for_linking", False)
     dropout = params.pop_float('dropout', 0.0)
     num_linking_features = params.pop_int('num_linking_features', 10)
     tables_directory = params.pop('tables_directory', '/wikitables/')
     rule_namespace = params.pop('rule_namespace', 'rule_labels')
     checklist_cost_weight = params.pop_float("checklist_cost_weight", 0.6)
     mml_model_file = params.pop('mml_model_file', None)
     params.assert_empty(cls.__name__)
     return cls(
         vocab,
         question_embedder=question_embedder,
         action_embedding_dim=action_embedding_dim,
         encoder=encoder,
         entity_encoder=entity_encoder,
         mixture_feedforward=mixture_feedforward,
         input_attention=input_attention,
         decoder_beam_size=decoder_beam_size,
         decoder_num_finished_states=decoder_num_finished_states,
         max_decoding_steps=max_decoding_steps,
         normalize_beam_score_by_length=normalize_beam_score_by_length,
         checklist_cost_weight=checklist_cost_weight,
          use_neighbor_similarity_for_linking=use_neighbor_similarity_for_linking,
         dropout=dropout,
         num_linking_features=num_linking_features,
         tables_directory=tables_directory,
         rule_namespace=rule_namespace,
         initial_mml_model_file=mml_model_file)
Example #16
 def __init__(self,
              vocab: Vocabulary,
              source_embedder: TextFieldEmbedder,
              encoder: Seq2SeqEncoder,
              max_decoding_steps: int,
              target_namespace: str = "tokens",
              target_embedding_dim: int = None,
              attention_function: SimilarityFunction = None,
              scheduled_sampling_ratio: float = 0.0) -> None:
     super(SimpleSeq2Seq, self).__init__(vocab)
     self._source_embedder = source_embedder
     self._encoder = encoder
     self._max_decoding_steps = max_decoding_steps
     self._target_namespace = target_namespace
     self._attention_function = attention_function
     self._scheduled_sampling_ratio = scheduled_sampling_ratio
     # We need the start symbol to provide as the input at the first timestep of decoding, and
     # end symbol as a way to indicate the end of the decoded sequence.
     self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                    self._target_namespace)
     self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                  self._target_namespace)
     num_classes = self.vocab.get_vocab_size(self._target_namespace)
     self.num_classes = num_classes
     # Decoder output dim needs to be the same as the encoder output dim since we initialize the
     # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
     # we're using attention with ``DotProductSimilarity``, this is needed.
     self._decoder_output_dim = self._encoder.get_output_dim()
      target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
     self._target_embedder = Embedding(num_classes, target_embedding_dim)
     if self._attention_function:
         self._decoder_attention = Attention(self._attention_function)
         # The output of attention, a weighted average over encoder outputs, will be
         # concatenated to the input vector of the decoder at each time step.
          self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
     else:
         self._decoder_input_dim = target_embedding_dim
     # TODO (pradeep): Do not hardcode decoder cell type.
     self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                   self._decoder_output_dim)
     self._output_projection_layer = Linear(self._decoder_output_dim,
                                            num_classes)
Example #17
    def test_batched_masked(self):
        attention = Attention()

        # Testing general masked non-batched case.
        vector = Variable(torch.FloatTensor([[0.3, 0.1, 0.5], [0.3, 0.1, 0.5]]))
        matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]],
                                             [[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]]]))
        mask = Variable(torch.FloatTensor([[1.0, 1.0, 0.0], [1.0, 0.0, 1.0]]))
        result = attention(vector, matrix, mask).data.numpy()
        assert_almost_equal(result, numpy.array([[0.52871835, 0.47128162, 0.0],
                                                 [0.50749944, 0.0, 0.49250056]]))

        # Test the case where a mask is all 0s and an input is all 0s.
        vector = Variable(torch.FloatTensor([[0.0, 0.0, 0.0], [0.3, 0.1, 0.5]]))
        matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]],
                                             [[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]]]))
        mask = Variable(torch.FloatTensor([[1.0, 1.0, 0.0], [0.0, 0.0, 0.0]]))
        result = attention(vector, matrix, mask).data.numpy()
        assert_almost_equal(result, numpy.array([[0.5, 0.5, 0.0],
                                                 [0.0, 0.0, 0.0]]))
Example #18
 def from_params(cls, vocab, params: Params) -> 'NlvrDirectSemanticParser':
     sentence_embedder_params = params.pop("sentence_embedder")
     sentence_embedder = TextFieldEmbedder.from_params(
         vocab, sentence_embedder_params)
     action_embedding_dim = params.pop_int('action_embedding_dim')
     encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
     dropout = params.pop_float('dropout', 0.0)
     input_attention = Attention.from_params(params.pop("attention"))
     decoder_beam_search = BeamSearch.from_params(
         params.pop("decoder_beam_search"))
     max_decoding_steps = params.pop_int("max_decoding_steps")
     params.assert_empty(cls.__name__)
     return cls(vocab,
                sentence_embedder=sentence_embedder,
                action_embedding_dim=action_embedding_dim,
                encoder=encoder,
                input_attention=input_attention,
                decoder_beam_search=decoder_beam_search,
                max_decoding_steps=max_decoding_steps,
                dropout=dropout)
Example #19
 def from_params(cls, vocab,
                 params: Params) -> 'NlvrCoverageSemanticParser':
     sentence_embedder_params = params.pop("sentence_embedder")
     sentence_embedder = TextFieldEmbedder.from_params(
         vocab, sentence_embedder_params)
     action_embedding_dim = params.pop_int('action_embedding_dim')
     encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
     dropout = params.pop_float('dropout', 0.0)
     input_attention = Attention.from_params(params.pop("attention"))
     beam_size = params.pop_int('beam_size')
     max_num_finished_states = params.pop_int('max_num_finished_states',
                                              None)
     normalize_beam_score_by_length = params.pop_bool(
         'normalize_beam_score_by_length', False)
     max_decoding_steps = params.pop_int("max_decoding_steps")
     checklist_cost_weight = params.pop_float("checklist_cost_weight", 0.6)
     dynamic_cost_weight = params.pop("dynamic_cost_weight", None)
     penalize_non_agenda_actions = params.pop_bool(
         "penalize_non_agenda_actions", False)
     initial_mml_model_file = params.pop("initial_mml_model_file", None)
     params.assert_empty(cls.__name__)
     return cls(
         vocab,
         sentence_embedder=sentence_embedder,
         action_embedding_dim=action_embedding_dim,
         encoder=encoder,
         input_attention=input_attention,
         beam_size=beam_size,
         max_num_finished_states=max_num_finished_states,
         dropout=dropout,
         max_decoding_steps=max_decoding_steps,
         normalize_beam_score_by_length=normalize_beam_score_by_length,
         checklist_cost_weight=checklist_cost_weight,
         dynamic_cost_weight=dynamic_cost_weight,
         penalize_non_agenda_actions=penalize_non_agenda_actions,
         initial_mml_model_file=initial_mml_model_file)
Example #20
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "tokens",
                 target_embedder: TextFieldEmbedder = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.25,
                 pointer_gen: bool = True,
                 language_model: bool = True,
                 max_oovs: int = None) -> None:
        super(PointerGenerator, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._pointer_gen = pointer_gen
        self._language_model = language_model
        if self._pointer_gen:
            self._max_oovs = max_oovs
            self.vocab.set_max_oovs(self._max_oovs)
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.

        self._target_embedder = target_embedder or source_embedder
        #!!! attention on decoder output, not on decoder input !!!#
        self._decoder_input_dim = self._target_embedder.get_output_dim()

        # the decoder uses a UniLSTM while the encoder uses a BiLSTM
        self._decoder_hidden_dim = self._encoder.get_output_dim() // 2

        # decoder: h0 c0 projection_layer from final_encoder_output
        self.decode_h0_projection_layer = Linear(self._encoder.get_output_dim(), self._decoder_hidden_dim)
        self.decode_c0_projection_layer = Linear(self._encoder.get_output_dim(), self._decoder_hidden_dim)

        self._decoder_attention = Attention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the decoder_hidden of the decoder at each time step.
        # V[s_t, h*_t] + b
        self._decoder_output_dim = self._decoder_hidden_dim + self._encoder.get_output_dim() #[s_t, h*_t]
        
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_hidden_dim)
        self._output_attention_layer = Linear(self._decoder_output_dim, self._decoder_hidden_dim)
        #V[s_t, h*_t] + b
        self._output_projection_layer = Linear(self._decoder_hidden_dim, num_classes)
        # num_classes -> V'
        # generation probability
        if self._pointer_gen:
            self._pointer_gen_layer = Linear(
                self._decoder_hidden_dim + self._encoder.get_output_dim() + self._decoder_input_dim, 1)
        # metrics
        self.metrics = {
                "ROUGE-1": Rouge(1),
                "ROUGE-2": Rouge(2),
        }
Example #21
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'MtGan':  # type: ignore
        # pylint: disable=arguments-differ
        vocab_namespace_A = params.pop("vocab_namespace_A", "vocab_A")
        vocab_namespace_B = params.pop("vocab_namespace_B", "vocab_B")

        num_classes_A = vocab.get_vocab_size(namespace=vocab_namespace_A)
        num_classes_B = vocab.get_vocab_size(namespace=vocab_namespace_B)

        params_generators = params.pop("generators")
        if params_generators.pop("type") == "rnn2rnn":
            generators_embedding_dim = params_generators.pop("embedding_dim")
            embedding_A_generator = Embedding(num_embeddings=num_classes_A, embedding_dim=generators_embedding_dim)
            embedding_B_generator = Embedding(num_embeddings=num_classes_B, embedding_dim=generators_embedding_dim)

            params_encoder_generators = params_generators.pop("encoder")
            generator_A_to_B_encoder = Seq2SeqEncoder.from_params(params_encoder_generators.duplicate())
            generator_B_to_A_encoder = Seq2SeqEncoder.from_params(params_encoder_generators.duplicate())

            generator_attention_params = params_generators.pop("attention")
            attention_generator_A_to_B = Attention.from_params(generator_attention_params.duplicate())
            attention_generator_B_to_A = Attention.from_params(generator_attention_params.duplicate())

            generators_max_decoding_steps = params_generators.pop("max_decoding_steps")

            generator_A_to_B = Rnn2Rnn(vocab=vocab,
                                       source_embedding=embedding_A_generator,
                                       target_embedding=embedding_B_generator,
                                       encoder=generator_A_to_B_encoder,
                                       max_decoding_steps=generators_max_decoding_steps,
                                       target_namespace=vocab_namespace_B,
                                       attention=attention_generator_A_to_B)

            generator_B_to_A = Rnn2Rnn(vocab=vocab,
                                       source_embedding=embedding_B_generator,
                                       target_embedding=embedding_A_generator,
                                       encoder=generator_B_to_A_encoder,
                                       max_decoding_steps=generators_max_decoding_steps,
                                       target_namespace=vocab_namespace_A,
                                       attention=attention_generator_B_to_A)
        else:
            raise ConfigurationError(message="This generators model type is not supported")

        discriminators_params = params.pop("discriminators")
        if discriminators_params.pop("type") == "seq2prob":
            params_encoder_discriminators = discriminators_params.pop("encoder")
            discriminator_A_encoder = Seq2VecEncoder.from_params(params_encoder_discriminators.duplicate())
            discriminator_B_encoder = Seq2VecEncoder.from_params(params_encoder_discriminators.duplicate())

            discriminators_embedding_dim = discriminators_params.pop("embedding_dim")
            embedding_A_discriminator = Embedding(num_classes_A, discriminators_embedding_dim)
            embedding_B_discriminator = Embedding(num_classes_B, discriminators_embedding_dim)

            discriminator_A = Seq2Prob(vocab=vocab, encoder=discriminator_A_encoder, embedding=embedding_A_discriminator)
            discriminator_B = Seq2Prob(vocab=vocab, encoder=discriminator_B_encoder, embedding=embedding_B_discriminator)
        else:
            raise ConfigurationError(message="This discriminators model type is not supported")

        return cls(vocab=vocab,
                   generator_A_to_B=generator_A_to_B,
                   generator_B_to_A=generator_B_to_A,
                   discriminator_A=discriminator_A,
                   discriminator_B=discriminator_B,
                   vocab_namespace_A=vocab_namespace_A,
                   vocab_namespace_B=vocab_namespace_B)
Example #22
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "tokens",
                 target_embedder: TextFieldEmbedder = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.25) -> None:
        super(PointerGeneratorPattern, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._pattern_pos = [
            '@@np@@', '@@ns@@', '@@ni@@', '@@nz@@', '@@m@@', '@@i@@', '@@id@@',
            '@@t@@', '@@j@@'
        ]
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        self._target_embedder = target_embedder or source_embedder
        #!!! attention on decoder output, not on decoder input !!!#
        self._decoder_input_dim = self._target_embedder.get_output_dim()

        # the decoder uses a UniLSTM while the encoder uses a BiLSTM
        self._decoder_hidden_dim = self._encoder.get_output_dim()

        # decoder: h0 c0 projection_layer from final_encoder_output
        self.decode_h0_projection_layer = Linear(
            self._encoder.get_output_dim(), self._decoder_hidden_dim)
        self.decode_c0_projection_layer = Linear(
            self._encoder.get_output_dim(), self._decoder_hidden_dim)

        self._decoder_attention = Attention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the decoder_hidden of the decoder at each time step.
        # V[s_t, h*_t] + b
        self._decoder_output_dim = self._decoder_hidden_dim + self._encoder.get_output_dim()  # [s_t, h*_t]

        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_hidden_dim)
        self._output_attention_layer = Linear(self._decoder_output_dim,
                                              self._decoder_hidden_dim)
        #V[s_t, h*_t] + b
        self._output_projection_layer = Linear(self._decoder_hidden_dim,
                                               num_classes)
        # num_classes -> V'
        # generation probability
        self._pointer_gen_layer = Linear(
            self._decoder_hidden_dim + self._encoder.get_output_dim() +
            self._decoder_input_dim, 1)
        # metrics
        self.metrics = {
            "ROUGE-1": Rouge(1),
            "ROUGE-2": Rouge(2),
        }
Example #23
import pytest
import torch

from allennlp.modules import Attention
from allennlp.modules.attention import BilinearAttention, AdditiveAttention, LinearAttention


@pytest.mark.parametrize("attention_type", Attention.list_available())
def test_all_attention_works_the_same(attention_type: str):
    module_cls = Attention.by_name(attention_type)

    vector = torch.FloatTensor([[-7, -8, -9]])
    matrix = torch.FloatTensor([[[1, 2, 3], [4, 5, 6]]])

    if module_cls in {BilinearAttention, AdditiveAttention, LinearAttention}:
        module = module_cls(vector.size(-1), matrix.size(-1))
    else:
        module = module_cls()

    output = module(vector, matrix)
    assert tuple(output.size()) == (1, 2)
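A minimal usage sketch (not one of the collected examples; names, shapes, and values are illustrative assumptions) showing how the normalized weights produced by an Attention module are typically combined with allennlp.nn.util.weighted_sum to build a context vector:

import torch
from allennlp.modules.attention import DotProductAttention
from allennlp.nn import util

# (batch_size, embedding_dim) query vector, e.g. a decoder hidden state
vector = torch.rand(2, 4)
# (batch_size, num_rows, embedding_dim) matrix to attend over, e.g. encoder outputs
matrix = torch.rand(2, 5, 4)

attention = DotProductAttention()             # normalize=True by default
weights = attention(vector, matrix)           # (batch_size, num_rows), each row sums to 1
context = util.weighted_sum(matrix, weights)  # (batch_size, embedding_dim)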
Example #24
 def test_can_build_from_params(self):
     params = Params({'similarity_function': {'type': 'cosine'}, 'normalize': False})
     attention = Attention.from_params(params)
     # pylint: disable=protected-access
     assert attention._similarity_function.__class__.__name__ == 'CosineSimilarity'
     assert attention._normalize is False
Example #25
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder, use_attention: bool,
                 seq2seq_encoder: Seq2SeqEncoder,
                 seq2vec_encoder: Seq2VecEncoder,
                 span_end_encoder_after: Seq2SeqEncoder,
                 use_decoder_trainer: bool, decoder_beam_search: BeamSearch,
                 kb_configs: dict, other_configs: dict,
                 initializer: InitializerApplicator) -> None:
        super(ProStructModel, self).__init__(vocab)

        self.text_field_embedder = text_field_embedder
        self.num_actions = len(Action)  # number of actions is hardcoded here.
        # They are defined in Action enum in propara_dataset_reader.py
        self.other_configs = other_configs

        # kb_coefficient * kb_score + (1-kb_coefficient) * model_score
        self.kb_coefficient = torch.nn.Parameter(
            torch.ones(1).mul(kb_configs.get('kb_coefficient', 0.5)))

        self.use_attention = use_attention
        self.use_decoder_trainer = use_decoder_trainer
        if self.use_attention:
            self.seq2seq_encoder = seq2seq_encoder
            self.time_distributed_seq2seq_encoder = TimeDistributed(
                TimeDistributed(self.seq2seq_encoder))
            self.time_distributed_attention_layer = \
                TimeDistributed(TimeDistributed(
                    Attention(similarity_function=BilinearSimilarity(2 * seq2seq_encoder.get_output_dim(),
                                                                     seq2seq_encoder.get_output_dim()),
                              normalize=True)))
            self.aggregate_feedforward = Linear(
                seq2seq_encoder.get_output_dim(), self.num_actions)
        else:
            self.seq2vec_encoder = seq2vec_encoder
            self.time_distributed_seq2vec_encoder = TimeDistributed(
                TimeDistributed(self.seq2vec_encoder))
            self.aggregate_feedforward = Linear(
                seq2vec_encoder.get_output_dim(), self.num_actions)

        self.span_end_encoder_after = span_end_encoder_after
        # per step per participant
        self.time_distributed_encoder_span_end_after = TimeDistributed(
            TimeDistributed(self.span_end_encoder_after))

        # Fixme: dimensions

        self._span_start_predictor_after = TimeDistributed(
            TimeDistributed(
                torch.nn.Linear(2 + 2 * seq2seq_encoder.get_output_dim(), 1)))

        self._span_end_predictor_after = TimeDistributed(
            TimeDistributed(
                torch.nn.Linear(span_end_encoder_after.get_output_dim(), 1)))

        self._type_accuracy = BooleanAccuracy()  # Fixme WRONG. Categorical accuracy should be right!
        self._loss = torch.nn.CrossEntropyLoss(
            ignore_index=-1)  # Fixme: This is less robust. If the masking value

        # Fixme: add a metric for location span strings
        self.span_metric = SquadEmAndF1()

        if self.use_decoder_trainer:
            self.decoder_trainer = MaximumMarginalLikelihood()
            if kb_configs['kb_to_use'] == 'lexicalkb':
                kb = KBLexical(lexical_kb_path=kb_configs['lexical_kb_path'],
                               fullgrid_prompts_load_path=kb_configs[
                                   'fullgrid_prompts_load_path'])

            # Makeshift arrangement to get number of participants in tiny.tsv .
            self.commonsense_based_action_generator = CommonsenseBasedActionGenerator(
                self.num_actions)
            self.rules_activated = [
                int(rule_val.strip()) > 0
                for rule_val in self.other_configs.get(
                    'constraint_rules_to_turn_on', '0,0,0,1').split(",")
            ]
            self.rule_2_fraction_participants = self.other_configs.get(
                'rule_2_fraction_participants', 0.5)
            self.rule_3_fraction_steps = self.other_configs.get(
                'rule_3_fraction_steps', 0.5)

            self.commonsense_based_action_generator.set_rules_used(
                self.rules_activated, self.rule_2_fraction_participants,
                self.rule_3_fraction_steps)
            # [self.rules_activated[0],  # C/D/C/D cannot happen
            #  self.rules_activated[1],  # > 1/2 partic
            #  self.rules_activated[2],  # > 1/2 steps cannot change
            #  self.rules_activated[3]  # until mentioned
            #  ])
            self.decoder_step = ProParaDecoderStep(
                KBBasedActionScorer(kb=kb, kb_coefficient=self.kb_coefficient),
                valid_action_generator=self.commonsense_based_action_generator)

        self.beam_search = decoder_beam_search
        initializer(self)
Example #26
    def setUp(self):
        super().setUp()
        self.decoder_step = BasicTransitionFunction(encoder_output_dim=2,
                                                    action_embedding_dim=2,
                                                    input_attention=Attention.by_name('dot_product')(),
                                                    num_start_types=3,
                                                    add_action_bias=False)

        batch_indices = [0, 1, 0]
        action_history = [[1], [3, 4], []]
        score = [torch.FloatTensor([x]) for x in [.1, 1.1, 2.2]]
        hidden_state = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
        memory_cell = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
        previous_action_embedding = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
        attended_question = torch.FloatTensor([[i, i] for i in range(len(batch_indices))])
        # This maps non-terminals to valid actions, where the valid actions are grouped by _type_.
        # We have "global" actions, which are from the global grammar, and "linked" actions, which
        # are instance-specific and are generated based on question attention.  Each action type
        # has a tuple which is (input representation, output representation, action ids).
        valid_actions = {
                'e': {
                        'global': (torch.FloatTensor([[0, 0], [-1, -1], [-2, -2]]),
                                   torch.FloatTensor([[-1, -1], [-2, -2], [-3, -3]]),
                                   [0, 1, 2]),
                        'linked': (torch.FloatTensor([[.1, .2, .3], [.4, .5, .6]]),
                                   torch.FloatTensor([[3, 3], [4, 4]]),
                                   [3, 4])
                },
                'd': {
                        'global': (torch.FloatTensor([[0, 0]]),
                                   torch.FloatTensor([[-1, -1]]),
                                   [0]),
                        'linked': (torch.FloatTensor([[-.1, -.2, -.3], [-.4, -.5, -.6], [-.7, -.8, -.9]]),
                                   torch.FloatTensor([[5, 5], [6, 6], [7, 7]]),
                                   [1, 2, 3])
                }
        }
        grammar_state = [GrammarStatelet([nonterminal], valid_actions, is_nonterminal)
                         for _, nonterminal in zip(batch_indices, ['e', 'd', 'e'])]
        self.encoder_outputs = torch.FloatTensor([[[1, 2], [3, 4], [5, 6]], [[10, 11], [12, 13], [14, 15]]])
        self.encoder_output_mask = torch.FloatTensor([[1, 1, 1], [1, 1, 0]])
        self.possible_actions = [[('e -> f', False, None),
                                  ('e -> g', True, None),
                                  ('e -> h', True, None),
                                  ('e -> i', True, None),
                                  ('e -> j', True, None)],
                                 [('d -> q', True, None),
                                  ('d -> g', True, None),
                                  ('d -> h', True, None),
                                  ('d -> i', True, None)]]

        rnn_state = []
        for i in range(len(batch_indices)):
            rnn_state.append(RnnStatelet(hidden_state[i],
                                         memory_cell[i],
                                         previous_action_embedding[i],
                                         attended_question[i],
                                         self.encoder_outputs,
                                         self.encoder_output_mask))
        self.state = GrammarBasedState(batch_indices=batch_indices,
                                       action_history=action_history,
                                       score=score,
                                       rnn_state=rnn_state,
                                       grammar_state=grammar_state,
                                       possible_actions=self.possible_actions)
Example #27
    def setUp(self):
        super().setUp()
        self.decoder_step = BasicTransitionFunction(
            encoder_output_dim=2,
            action_embedding_dim=2,
            input_attention=Attention.by_name("dot_product")(),
            add_action_bias=False,
        )

        batch_indices = [0, 1, 0]
        action_history = [[1], [3, 4], []]
        score = [torch.FloatTensor([x]) for x in [0.1, 1.1, 2.2]]
        hidden_state = torch.FloatTensor([[i, i]
                                          for i in range(len(batch_indices))])
        memory_cell = torch.FloatTensor([[i, i]
                                         for i in range(len(batch_indices))])
        previous_action_embedding = torch.FloatTensor(
            [[i, i] for i in range(len(batch_indices))])
        attended_question = torch.FloatTensor(
            [[i, i] for i in range(len(batch_indices))])
        # This maps non-terminals to valid actions, where the valid actions are grouped by _type_.
        # We have "global" actions, which are from the global grammar, and "linked" actions, which
        # are instance-specific and are generated based on question attention.  Each action type
        # has a tuple which is (input representation, output representation, action ids).
        valid_actions = {
            "e": {
                "global": (
                    torch.FloatTensor([[0, 0], [-1, -1], [-2, -2]]),
                    torch.FloatTensor([[-1, -1], [-2, -2], [-3, -3]]),
                    [0, 1, 2],
                ),
                "linked": (
                    torch.FloatTensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]),
                    torch.FloatTensor([[3, 3], [4, 4]]),
                    [3, 4],
                ),
            },
            "d": {
                "global":
                (torch.FloatTensor([[0, 0]]),
                 torch.FloatTensor([[-1, -1]]),
                 [0]),
                "linked": (
                    torch.FloatTensor([[-0.1, -0.2, -0.3], [-0.4, -0.5, -0.6],
                                       [-0.7, -0.8, -0.9]]),
                    torch.FloatTensor([[5, 5], [6, 6], [7, 7]]),
                    [1, 2, 3],
                ),
            },
        }
        grammar_state = [
            GrammarStatelet([nonterminal], valid_actions, is_nonterminal)
            for _, nonterminal in zip(batch_indices, ["e", "d", "e"])
        ]
        self.encoder_outputs = torch.FloatTensor([[[1, 2], [3, 4], [5, 6]],
                                                  [[10, 11], [12, 13],
                                                   [14, 15]]])
        self.encoder_output_mask = torch.FloatTensor([[1, 1, 1], [1, 1, 0]])
        self.possible_actions = [
            [
                ("e -> f", False, None),
                ("e -> g", True, None),
                ("e -> h", True, None),
                ("e -> i", True, None),
                ("e -> j", True, None),
            ],
            [
                ("d -> q", True, None),
                ("d -> g", True, None),
                ("d -> h", True, None),
                ("d -> i", True, None),
            ],
        ]

        rnn_state = []
        for i in range(len(batch_indices)):
            rnn_state.append(
                RnnStatelet(
                    hidden_state[i],
                    memory_cell[i],
                    previous_action_embedding[i],
                    attended_question[i],
                    self.encoder_outputs,
                    self.encoder_output_mask,
                ))
        self.state = GrammarBasedState(
            batch_indices=batch_indices,
            action_history=action_history,
            score=score,
            rnn_state=rnn_state,
            grammar_state=grammar_state,
            possible_actions=self.possible_actions,
        )
Example #28
    def setUp(self):
        super().setUp()
        self.decoder_step = BasicTransitionFunction(
            encoder_output_dim=2,
            action_embedding_dim=2,
            input_attention=Attention.by_name('dot_product')(),
            num_start_types=3,
            add_action_bias=False)

        batch_indices = [0, 1, 0]
        action_history = [[1], [3, 4], []]
        score = [torch.FloatTensor([x]) for x in [.1, 1.1, 2.2]]
        hidden_state = torch.FloatTensor([[i, i]
                                          for i in range(len(batch_indices))])
        memory_cell = torch.FloatTensor([[i, i]
                                         for i in range(len(batch_indices))])
        previous_action_embedding = torch.FloatTensor(
            [[i, i] for i in range(len(batch_indices))])
        attended_question = torch.FloatTensor(
            [[i, i] for i in range(len(batch_indices))])
        # This maps non-terminals to valid actions, where the valid actions are grouped by _type_.
        # We have "global" actions, which are from the global grammar, and "linked" actions, which
        # are instance-specific and are generated based on question attention.  Each action type
        # has a tuple which is (input representation, output representation, action ids).
        valid_actions = {
            'e': {
                'global': (torch.FloatTensor([[0, 0], [-1, -1], [-2, -2]]),
                           torch.FloatTensor([[-1, -1], [-2, -2],
                                              [-3, -3]]), [0, 1, 2]),
                'linked': (torch.FloatTensor([[.1, .2, .3], [.4, .5, .6]]),
                           torch.FloatTensor([[3, 3], [4, 4]]), [3, 4])
            },
            'd': {
                'global': (torch.FloatTensor([[0, 0]]),
                           torch.FloatTensor([[-1, -1]]),
                           [0]),
                'linked': (torch.FloatTensor([[-.1, -.2, -.3], [-.4, -.5, -.6], [-.7, -.8, -.9]]),
                           torch.FloatTensor([[5, 5], [6, 6], [7, 7]]),
                           [1, 2, 3])
            }
        }
        grammar_state = [
            GrammarState([nonterminal], {}, valid_actions, {}, is_nonterminal)
            for _, nonterminal in zip(batch_indices, ['e', 'd', 'e'])
        ]
        self.encoder_outputs = torch.FloatTensor([[[1, 2], [3, 4], [5, 6]],
                                                  [[10, 11], [12, 13],
                                                   [14, 15]]])
        self.encoder_output_mask = torch.FloatTensor([[1, 1, 1], [1, 1, 0]])
        self.possible_actions = [[('e -> f', False, None),
                                  ('e -> g', True, None),
                                  ('e -> h', True, None),
                                  ('e -> i', True, None),
                                  ('e -> j', True, None)],
                                 [('d -> q', True, None),
                                  ('d -> g', True, None),
                                  ('d -> h', True, None),
                                  ('d -> i', True, None)]]

        rnn_state = []
        for i in range(len(batch_indices)):
            rnn_state.append(
                RnnState(hidden_state[i], memory_cell[i],
                         previous_action_embedding[i], attended_question[i],
                         self.encoder_outputs, self.encoder_output_mask))
        self.state = GrammarBasedDecoderState(
            batch_indices=batch_indices,
            action_history=action_history,
            score=score,
            rnn_state=rnn_state,
            grammar_state=grammar_state,
            possible_actions=self.possible_actions)