Example no. 1
 def __init__(self,
              vocab: Vocabulary,
              source_embedder: TextFieldEmbedder,
              encoder: Seq2SeqEncoder,
              max_decoding_steps: int,
              target_namespace: str = "tokens",
              target_embedding_dim: int = None,
              attention_function: SimilarityFunction = None,
              scheduled_sampling_ratio: float = 0.0) -> None:
     super(SimpleSeq2Seq, self).__init__(vocab)
     self._source_embedder = source_embedder
     self._encoder = encoder
     self._max_decoding_steps = max_decoding_steps
     self._target_namespace = target_namespace
     self._attention_function = attention_function
     self._scheduled_sampling_ratio = scheduled_sampling_ratio
     # We need the start symbol to provide as the input at the first timestep of decoding, and
     # end symbol as a way to indicate the end of the decoded sequence.
     self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
     self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
     num_classes = self.vocab.get_vocab_size(self._target_namespace)
     # Decoder output dim needs to be the same as the encoder output dim since we initialize the
     # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
     # we're using attention with ``DotProductSimilarity``, this is needed.
     self._decoder_output_dim = self._encoder.get_output_dim()
     target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
     self._target_embedder = Embedding(num_classes, target_embedding_dim)
     if self._attention_function:
         self._decoder_attention = Attention(self._attention_function)
         # The output of attention, a weighted average over encoder outputs, will be
         # concatenated to the input vector of the decoder at each time step.
         self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
     else:
         self._decoder_input_dim = target_embedding_dim
     # TODO (pradeep): Do not hardcode decoder cell type.
     self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
     self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
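
The constructor above only wires modules together; the decoding step is what actually uses them. Below is a minimal, self-contained PyTorch sketch (not the AllenNLP implementation; the sizes and the dot-product attention are illustrative assumptions) of one step with this wiring: attend over the encoder outputs with the current hidden state, concatenate the context with the previous target embedding, advance the LSTMCell, and project to vocabulary logits.

import torch
from torch.nn import Embedding, LSTMCell, Linear

batch, src_len, enc_dim, emb_dim, vocab = 2, 7, 16, 10, 50
target_embedder = Embedding(vocab, emb_dim)
decoder_cell = LSTMCell(enc_dim + emb_dim, enc_dim)  # input = attention context + previous embedding
output_projection = Linear(enc_dim, vocab)

encoder_outputs = torch.randn(batch, src_len, enc_dim)  # produced by the encoder
hidden = encoder_outputs[:, -1, :]                      # decoder hidden initialised from final encoder state
cell = torch.zeros(batch, enc_dim)
prev_tokens = torch.zeros(batch, dtype=torch.long)      # indices of the start symbol

# One decoding step.
prev_emb = target_embedder(prev_tokens)                                 # (batch, emb_dim)
scores = torch.bmm(encoder_outputs, hidden.unsqueeze(-1)).squeeze(-1)   # dot-product attention scores
context = torch.bmm(torch.softmax(scores, -1).unsqueeze(1), encoder_outputs).squeeze(1)
hidden, cell = decoder_cell(torch.cat([context, prev_emb], dim=-1), (hidden, cell))
logits = output_projection(hidden)                                      # (batch, vocab)
print(logits.shape)  # torch.Size([2, 50])
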
Example no. 2
    def __init__(self,
                 args,
                 vocab_indexer,
                 vocab,
                 decoder_hidden_size=600,
                 emb_size=128,
                 num_classes=None,
                 start_idx=1,
                 end_idx=2,
                 padding_idx=0,
                 typ='lstm',
                 max_decoding_steps=120,
                 sampling_scheme: str = "first_word",
                 line_separator_symbol: str = "<eos>",
                 reverse_each_line: bool = False,
                 n_lines_per_sample: int = 14,
                 tie_weights: bool = True,
                 dropout_ratio: float = 0.3,
                 phoneme_embeddings_dim: int = 128,
                 encoder_type: str = None,
                 encoder_input_size: int = 100,
                 encoder_hidden_size: int = 100,
                 encoder_n_layers: int = 1,
                 n_lines_to_gen: int = 4):

        super(VanillaLM, self).__init__()

        self.args = args
        self.vocab_indexer = vocab_indexer
        self.vocab = vocab

        self._scheduled_sampling_ratio = 0.0

        self._max_decoding_steps = max_decoding_steps
        decoder_input_size = emb_size

        self._decoder_input_dim = decoder_input_size
        self._decoder_output_dim = decoder_hidden_size

        self._target_embedder = nn.Embedding(num_classes, emb_size)

        self._context_embedder = nn.Embedding(
            num_classes, phoneme_embeddings_dim
        )  ## TODO: Not clear why this is phoneme_embeddings_dim

        self.padding_idx = padding_idx
        self.start_idx = start_idx
        self.end_idx = end_idx

        self.type = typ
        self.use_cuda = args.use_cuda  #True

        decoder_embedding_dim = emb_size
        self._target_embedding_dim = decoder_embedding_dim

        assert self.type == "lstm", "Incorrect decoder type"
        self._lm_cell = LSTMCell(self._decoder_input_dim,
                                 self._decoder_output_dim)

        self._intermediate_projection_layer = Linear(
            self._decoder_output_dim,
            self._target_embedding_dim)  # , bias=False)
        self._activation = torch.tanh
        self._num_classes = num_classes
        self._output_projection_layer = Linear(self._target_embedding_dim,
                                               self._num_classes)

        self._dropout_ratio = dropout_ratio
        self._dropout = nn.Dropout(p=dropout_ratio, inplace=False)
        self._lockdropout = LockedDropout()

        self._encoder_type = encoder_type

        if self._encoder_type is not None:
            self._encoder_input_size = encoder_input_size
            self._encoder_hidden_size = encoder_hidden_size
            # self._encoder_namespace = encoder_namespace  # `encoder_namespace` is not defined in this constructor
            self._encoder = nn.LSTM(input_size=self._encoder_input_size,
                                    hidden_size=self._encoder_hidden_size,
                                    batch_first=True,
                                    bias=False,
                                    num_layers=encoder_n_layers,
                                    bidirectional=False)

        if tie_weights:
            # assert self._target_embedding_dim == self._target_embedder.token_embedder_tokens.get_output_dim(), "Dimension mis-match!"
            self._output_projection_layer.weight = self._target_embedder.weight

        # in the config, make these options consistent with those in the reader
        self._sampling_scheme = sampling_scheme  # "first_sentence" # "first_word"
        self.line_separator = line_separator_symbol
        self.reverse_each_line = reverse_each_line
        self.n_lines_per_sample = n_lines_per_sample

        self._n_lines_to_gen = n_lines_to_gen

        self._attention = False
        self.END_SYMBOL = line_separator_symbol
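
The `tie_weights` branch shares the output-projection matrix with the target embedding, which requires the projection to map from the embedding dimension to the vocabulary size (here guaranteed by `_intermediate_projection_layer` projecting back down to `_target_embedding_dim`). A small standalone sketch of the idea, with made-up sizes:

import torch.nn as nn

vocab_size, emb_size = 1000, 128
embedder = nn.Embedding(vocab_size, emb_size)   # weight shape (vocab_size, emb_size)
projection = nn.Linear(emb_size, vocab_size)    # weight shape (vocab_size, emb_size)

# Tying is a plain parameter share: both modules now read and update the same tensor.
projection.weight = embedder.weight
assert projection.weight.data_ptr() == embedder.weight.data_ptr()
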
Example no. 3
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 attention: Attention,
                 schema_path: str = None,
                 missing_alignment_int: int = 0,
                 indexfield_padding_index: int = -1,
                 beam_size: int = None,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 scheduled_sampling_ratio: float = 0.,
                 use_bleu: bool = True,
                 emb_dropout: float = 0.0,
                 dec_dropout: float = 0.0,
                 attn_loss_lambda: float = 0.5,
                 token_based_metric: Metric = None) -> None:
        super(AttnSupSeq2Seq, self).__init__(vocab)
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._indexfield_padding_index = indexfield_padding_index
        self._missing_alignment_int = missing_alignment_int

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                   self._target_namespace)  # pylint: disable=protected-access
            self._bleu = BLEU(exclude_indices={
                pad_index, self._end_index, self._start_index
            })
        else:
            self._bleu = None

        if token_based_metric:
            self._token_based_metric = token_based_metric
        else:
            self._token_based_metric = TokenSequenceAccuracy()
        # log attention supervision CE loss as a metric
        self._attn_sup_loss = Average()
        self._sql_metrics = schema_path is not None
        if self._sql_metrics:
            # SQL specific metrics: match between the templates free of schema constants,
            # and match between the schema constants
            self._schema_free_match = GlobalTemplAccuracy(
                schema_path=schema_path)
            self._kb_match = KnowledgeBaseConstsAccuracy(
                schema_path=schema_path)

        # At prediction time, we use a beam search to find the most likely sequence of target tokens.
        beam_size = beam_size or 1
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        # Dense embedding of source vocab tokens.
        self._source_embedder = source_embedder
        self._emb_dropout = Dropout(p=emb_dropout)
        self._dec_dropout = Dropout(p=dec_dropout)
        self._attn_loss_lambda = attn_loss_lambda
        # Encodes the sequence of source embeddings into a sequence of hidden states.
        self._encoder = encoder

        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        # Attention mechanism applied to the encoder output for each step.
        self._attention = attention
        self._attention._normalize = False

        # Dense embedding of vocab words in the target space.
        target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim(
        )
        self._target_embedder = Embedding(num_classes, target_embedding_dim)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        self._encoder_output_dim = self._encoder.get_output_dim()
        self._decoder_output_dim = self._encoder_output_dim

        # A weighted average over encoder outputs will be concatenated to the previous target embedding
        # to form the input to the decoder at each time step.
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim

        # We'll use an LSTM cell as the recurrent cell that produces a hidden state
        # for the decoder at each time step.
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)

        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
Example no. 4
    def __init__(
        self,
        vocab: Vocabulary,
        input_dim: int,
        decoder_hidden_size: int,
        max_decoding_steps: int,
        output_proj_input_dim: int,
        target_namespace: str = "targets",
        target_embedding_dim: int = None,
        attention: str = "none",
        dropout: float = 0.0,
        scheduled_sampling_ratio: float = 0.0,
    ) -> None:
        super(Seq2SeqDecoder, self).__init__(vocab)

        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self._unk_index = self.vocab.get_token_index("@@UNKNOWN@@",
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._encoder_output_dim = input_dim
        self._decoder_hidden_dim = decoder_hidden_size
        if self._encoder_output_dim != self._decoder_hidden_dim:
            self._projection_encoder_out = Linear(self._encoder_output_dim,
                                                  self._decoder_hidden_dim)
        else:
            self._projection_encoder_out = lambda x: x
        self._decoder_output_dim = self._decoder_hidden_dim
        self._output_proj_input_dim = output_proj_input_dim
        self._target_embedding_dim = target_embedding_dim
        self._target_embedder = Embedding(num_classes,
                                          self._target_embedding_dim)

        # Used to get an initial hidden state from the encoder states
        self._sent_pooler = Pooler(project=True,
                                   d_inp=input_dim,
                                   d_proj=decoder_hidden_size)

        if attention == "Bahdanau":
            self._decoder_attention = BahdanauAttention(
                decoder_hidden_size + target_embedding_dim, input_dim)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time
            # step.
            self._decoder_input_dim = input_dim + target_embedding_dim
        elif attention == "bilinear":
            self._decoder_attention = BilinearAttention(
                decoder_hidden_size + target_embedding_dim, input_dim)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time
            # step.
            self._decoder_input_dim = input_dim + target_embedding_dim
        elif attention == "none":
            self._decoder_attention = None
            self._decoder_input_dim = target_embedding_dim
        else:
            raise Exception("attention not implemented {}".format(attention))

        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_hidden_dim)
        # Allow for a bottleneck layer between encoder outputs and distribution over vocab
        # The bottleneck layer consists of a linear transform and helps to reduce
        # number of parameters
        if self._output_proj_input_dim != self._decoder_output_dim:
            self._projection_bottleneck = Linear(self._decoder_output_dim,
                                                 self._output_proj_input_dim)
        else:
            self._projection_bottleneck = lambda x: x
        self._output_projection_layer = Linear(self._output_proj_input_dim,
                                               num_classes)
        self._dropout = torch.nn.Dropout(p=dropout)
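
The optional `_projection_bottleneck` above replaces a single `decoder_output -> vocab` projection with `decoder_output -> output_proj_input_dim -> vocab`, which, as the comment says, mainly saves parameters. A rough parameter count with assumed sizes (hypothetical numbers, not taken from this model):

from torch.nn import Linear

hidden, bottleneck, vocab = 1024, 256, 50000

direct = [Linear(hidden, vocab)]
bottlenecked = [Linear(hidden, bottleneck), Linear(bottleneck, vocab)]

count = lambda layers: sum(p.numel() for layer in layers for p in layer.parameters())
print(count(direct))        # 51,250,000 parameters
print(count(bottlenecked))  # 13,112,400 parameters
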
Example no. 5
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 attention: Attention,
                 beam_size: int,
                 max_decoding_steps: int,
                 target_embedding_dim: int = 30,
                 copy_token: str = "@COPY@",
                 source_namespace: str = "source_tokens",
                 target_namespace: str = "target_tokens",
                 metric: Metric = BLEU()) -> None:
        super(CopyNet, self).__init__(vocab)
        self._metric = metric
        self._source_namespace = source_namespace
        self._target_namespace = target_namespace
        self._src_start_index = self.vocab.get_token_index(START_SYMBOL, self._source_namespace)
        self._src_end_index = self.vocab.get_token_index(END_SYMBOL, self._source_namespace)
        self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
        self._oov_index = self.vocab.get_token_index(self.vocab._oov_token, self._target_namespace)  # pylint: disable=protected-access
        self._pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
        self._copy_index = self.vocab.get_token_index(copy_token, self._target_namespace)
        if self._copy_index == self._oov_index:
            raise ConfigurationError(f"Special copy token {copy_token} missing from target vocab namespace. "
                                     f"You can ensure this token is added to the target namespace with the "
                                     f"vocabulary parameter 'tokens_to_add'.")

        self._target_vocab_size = self.vocab.get_vocab_size(self._target_namespace)

        # Encoding modules.
        self._source_embedder = source_embedder
        self._encoder = encoder

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        # We arbitrarily set the decoder's input dimension to be the same as the output dimension.
        self.encoder_output_dim = self._encoder.get_output_dim()
        self.decoder_output_dim = self.encoder_output_dim
        self.decoder_input_dim = self.decoder_output_dim

        target_vocab_size = self.vocab.get_vocab_size(self._target_namespace)

        # The decoder input will be a function of the embedding of the previous predicted token,
        # an attended encoder hidden state called the "attentive read", and another
        # weighted sum of the encoder hidden state called the "selective read".
        # While the weights for the attentive read are calculated by an `Attention` module,
        # the weights for the selective read are simply the predicted probabilities
        # corresponding to each token in the source sentence from the previous timestep.
        self._target_embedder = Embedding(target_vocab_size, target_embedding_dim)
        self._attention = attention
        self._input_projection_layer = Linear(
                target_embedding_dim + self.encoder_output_dim * 2,
                self.decoder_input_dim)

        # We then run the projected decoder input through an LSTM cell to produce
        # the next hidden state.
        self._decoder_cell = LSTMCell(self.decoder_input_dim, self.decoder_output_dim)

        # We create a "generation" score for each token in the target vocab
        # with a linear projection of the decoder hidden state.
        self._output_generation_layer = Linear(self.decoder_output_dim, target_vocab_size)

        # We create a "copying" score for each source token by applying a non-linearity
        # (tanh) to a linear projection of the encoded hidden state for that token,
        # and then taking the dot product of the result with the decoder hidden state.
        self._output_copying_layer = Linear(self.encoder_output_dim, self.decoder_output_dim)

        # At prediction time, we'll use a beam search to find the best target sequence.
        self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
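
`_output_generation_layer` and `_output_copying_layer` produce CopyNet's two score sets. The sketch below shows, with assumed shapes, how such layers are typically combined for a single timestep; it omits the source-padding mask and the merging of repeated source tokens that the real model performs:

import torch
from torch.nn import Linear

batch, src_len, hidden, vocab = 2, 6, 32, 100
generation_layer = Linear(hidden, vocab)   # decoder state -> scores over the target vocabulary
copying_layer = Linear(hidden, hidden)     # encoder states -> copy projections

decoder_hidden = torch.randn(batch, hidden)
encoder_outputs = torch.randn(batch, src_len, hidden)

generation_scores = generation_layer(decoder_hidden)                                 # (batch, vocab)
copy_projections = torch.tanh(copying_layer(encoder_outputs))                        # (batch, src_len, hidden)
copy_scores = torch.bmm(copy_projections, decoder_hidden.unsqueeze(-1)).squeeze(-1)  # (batch, src_len)

# One softmax over both score sets gives a joint distribution over
# "generate target token v" and "copy source position i".
probs = torch.softmax(torch.cat([generation_scores, copy_scores], dim=-1), dim=-1)
print(probs.shape)  # torch.Size([2, 106])
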
Example no. 6
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 attention: Attention = None,
                 attention_function: SimilarityFunction = None,
                 beam_size: int = None,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 scheduled_sampling_ratio: float = 0.,
                 use_bleu: bool = True,
                 emb_dropout: float = 0.5) -> None:
        super(Seq2Seq, self).__init__(vocab)
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
            self._bleu = BLEU(exclude_indices={pad_index, self._end_index, self._start_index})
        else:
            self._bleu = None

        self._token_based_metric = TokenSequenceAccuracy()

        # At prediction time, we use a beam search to find the most likely sequence of target tokens.
        beam_size = beam_size or 1
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)

        # Dense embedding of source vocab tokens.
        self._source_embedder = source_embedder
        self._emb_dropout = Dropout(p=emb_dropout)

        # Encodes the sequence of source embeddings into a sequence of hidden states.
        self._encoder = encoder

        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        # Attention mechanism applied to the encoder output for each step.
        if attention:
            if attention_function:
                raise ConfigurationError("You can only specify an attention module or an "
                                         "attention function, but not both.")
            self._attention = attention
        elif attention_function:
            self._attention = LegacyAttention(attention_function)
        else:
            self._attention = None

        # Dense embedding of vocab words in the target space.
        target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        self._encoder_output_dim = self._encoder.get_output_dim()
        self._decoder_output_dim = self._encoder_output_dim

        if self._attention:
            # If using attention, a weighted average over encoder outputs will be concatenated
            # to the previous target embedding to form the input to the decoder at each
            # time step.
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
        else:
            # Otherwise, the input to the decoder is just the previous target embedding.
            self._decoder_input_dim = target_embedding_dim

        # We'll use an LSTM cell as the recurrent cell that produces a hidden state
        # for the decoder at each time step.
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)

        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
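
Most of these constructors store a `scheduled_sampling_ratio` without using it here; during training it controls whether each decoder step is fed the gold previous token or the model's own previous prediction. A hedged sketch of that choice (a per-token variant for illustration; some implementations instead draw a single coin flip per timestep for the whole batch):

import torch

scheduled_sampling_ratio = 0.25
gold_prev_tokens = torch.tensor([4, 17, 9])       # teacher-forced inputs for this step
predicted_prev_tokens = torch.tensor([4, 12, 9])  # argmax of the previous step's logits

# With probability `scheduled_sampling_ratio`, feed the model its own prediction instead of the gold token.
use_prediction = torch.rand(gold_prev_tokens.shape) < scheduled_sampling_ratio
decoder_input_tokens = torch.where(use_prediction, predicted_prev_tokens, gold_prev_tokens)
print(decoder_input_tokens)
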
Example no. 7
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 target_embedder: TextFieldEmbedder,
                 source_encoder: Seq2VecEncoder,
                 target_encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 attention: Attention = None,
                 beam_size: int = None,
                 target_namespace: str = "tokens",
                 scheduled_sampling_ratio: float = 0.,
                 use_bleu: bool = True) -> None:
        super(AssociativeSeq2SeqHiddenDiff, self).__init__(vocab)
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                   self._target_namespace)  # pylint: disable=protected-access
            self._bleu = BLEU(exclude_indices={
                pad_index, self._end_index, self._start_index
            })
        else:
            self._bleu = None

        # At prediction time, we use a beam search to find the most likely sequence of target tokens.
        beam_size = beam_size or 1
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        # Dense embedding of source vocab tokens.
        self._source_embedder = source_embedder

        # Encodes the sequence of source embeddings into a sequence of hidden states.
        self._source_encoder = source_encoder
        self._target_encoder = target_encoder

        self._encoder_output_dim = self._target_encoder.get_output_dim()
        self._decoder_output_dim = self._encoder_output_dim
        target_embedding_dim = source_embedder.get_output_dim()

        if attention:
            self._attention = attention
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim

        else:
            self._attention = None
            self._decoder_input_dim = target_embedding_dim + self._source_encoder.get_output_dim(
            )

        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        self._target_embedder = target_embedder

        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
Example no. 8
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder_1: TextFieldEmbedder,
                 source_encoder_1: Seq2SeqEncoder,
                 beam_size: int,
                 max_decoding_steps: int,
                 decoder_output_dim: int,
                 target_embedding_dim: int = 30,
                 namespace: str = "tokens",
                 tensor_based_metric: Metric = None,
                 align_embeddings: bool = True,
                 source_embedder_2: TextFieldEmbedder = None,
                 source_encoder_2: Seq2SeqEncoder = None) -> None:
        super().__init__(vocab)
        self._source_embedder_1 = source_embedder_1
        self._source_embedder_2 = source_embedder_2 or self._source_embedder_1
        self._source_encoder_1 = source_encoder_1
        self._source_encoder_2 = source_encoder_2 or self._source_encoder_1

        self._source_namespace = namespace
        self._target_namespace = namespace

        self.encoder_output_dim_1 = self._source_encoder_1.get_output_dim()
        self.encoder_output_dim_2 = self._source_encoder_2.get_output_dim()
        self.cated_encoder_out_dim = self.encoder_output_dim_1 + self.encoder_output_dim_2
        self.decoder_output_dim = decoder_output_dim

        # TODO: AllenNLP's AdditiveAttention implementation may not include a bias term
        self._attention_1 = AdditiveAttention(self.decoder_output_dim,
                                              self.encoder_output_dim_1)
        self._attention_2 = AdditiveAttention(self.decoder_output_dim,
                                              self.encoder_output_dim_2)

        if not align_embeddings:
            self.target_embedding_dim = target_embedding_dim
            self._target_vocab_size = self.vocab.get_vocab_size(
                namespace=self._target_namespace)
            self._target_embedder = Embedding(self._target_vocab_size,
                                              target_embedding_dim)
        else:
            self._target_embedder = self._source_embedder_1._token_embedders[
                "tokens"]
            self._target_vocab_size = self.vocab.get_vocab_size(
                namespace=self._target_namespace)
            self.target_embedding_dim = self._target_embedder.get_output_dim()

        self.decoder_input_dim = self.encoder_output_dim_1 + self.encoder_output_dim_2 + \
                                 self.target_embedding_dim

        self._decoder_cell = LSTMCell(self.decoder_input_dim,
                                      self.decoder_output_dim)

        # Project the concatenated final hidden states of the two encoders into the decoder's initial state
        self._encoder_out_projection_layer = torch.nn.Linear(
            in_features=self.cated_encoder_out_dim,
            out_features=self.decoder_output_dim
        )  # TODO: bias - true or false?

        # Soft-gate parameters, used to compute lambda
        self._gate_projection_layer = torch.nn.Linear(
            in_features=self.decoder_output_dim + self.decoder_input_dim,
            out_features=1,
            bias=False)

        self._start_index = self.vocab.get_token_index(START_SYMBOL, namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL, namespace)
        self._pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                     namespace)
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        self._tensor_based_metric = tensor_based_metric or \
            BLEU(exclude_indices={self._pad_index, self._end_index, self._start_index})
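
`_gate_projection_layer` in the example above is a bias-free linear layer from `[decoder_hidden; decoder_input]` to a single logit. The constructor alone does not show what the resulting lambda mixes, so the sketch below is an assumption: a sigmoid gate blending the two attended encoder contexts.

import torch
from torch.nn import Linear

batch, dec_hidden, dec_input, enc_dim = 2, 64, 96, 32
gate_layer = Linear(dec_hidden + dec_input, 1, bias=False)

decoder_hidden = torch.randn(batch, dec_hidden)
decoder_input = torch.randn(batch, dec_input)
context_1 = torch.randn(batch, enc_dim)   # attended summary of encoder 1
context_2 = torch.randn(batch, enc_dim)   # attended summary of encoder 2

# lambda in (0, 1) decides how much weight each encoder's context receives.
lam = torch.sigmoid(gate_layer(torch.cat([decoder_hidden, decoder_input], dim=-1)))  # (batch, 1)
mixed_context = lam * context_1 + (1.0 - lam) * context_2
print(mixed_context.shape)  # torch.Size([2, 32])
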
Example no. 9
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        vecoder: Seq2VecEncoder,
        sen_encoder: Seq2VecEncoder,
        max_decoding_steps: int = 32,
        attention: Attention = None,
        beam_size: int = None,
        target_namespace: str = "tokens",
        scheduled_sampling_ratio: float = 0.5,
    ) -> None:
        super().__init__(vocab)
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio  # Maybe we can try
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self.pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                    self._target_namespace)
        self._max_decoding_steps = max_decoding_steps
        self.vocab = vocab
        # dimension-related setup
        self.sen_num = 10
        # with open('../data/0510/cy/kg_and_train.pk', 'rb') as f:
        with open('cy/openkg.pk', 'rb') as f:
            self.kg_mat = torch.tensor(pickle.load(f)).float()
        self.symp_mat = torch.nn.Parameter(self.kg_mat.cuda())
        self.evovl_mat = torch.zeros(len(self.kg_mat), len(self.kg_mat)).cuda()

        # with open('../data/0510/cy/comp_topic2num.pk', 'rb') as f:
        with open('cy/comp_topic2num.pk', 'rb') as f:
            self.word_idx = pickle.load(f)
        self.idx_word = {v: k for k, v in self.word_idx.items()}
        self.vocab_to_idx = {}
        self.idx_to_vocab_list = []
        self.vocab_list = []
        for word, k in self.word_idx.items():
            self.vocab_to_idx[vocab.get_token_index(word.strip())] = k
            self.idx_to_vocab_list.append(vocab.get_token_index(word.strip()))

        self.symp_size = len(self.symp_mat) + self.sen_num
        self.topic = len(self.symp_mat)
        self._encoder = encoder
        self._vecoder = vecoder
        self._sen_encoder = sen_encoder

        self.outfeature = self._sen_encoder.get_output_dim()
        # graph-related setup
        self.symp_state = torch.nn.Parameter(
            torch.Tensor(self.symp_size, self.outfeature))
        torch.nn.init.xavier_uniform_(self.symp_state, gain=1.414)
        self.predict_layer = torch.nn.Parameter(
            torch.Tensor(self.symp_size, self.outfeature))
        self.predict_bias = torch.nn.Parameter(torch.Tensor(self.symp_size))
        torch.nn.init.kaiming_uniform_(self.predict_layer)
        torch.nn.init.uniform_(self.predict_bias, -1 / self.symp_size**0.5,
                               1 / self.symp_size**0.5)

        self.attn_one = GATAttention(self.outfeature, self.outfeature, 1)
        self.attn_two = GATAttention(self.outfeature, self.outfeature, 1)
        self.attn_three = GATAttention(self.outfeature, self.outfeature, 1)

        # Metric
        self.kd_metric = KD_Metric()
        self.bleu_aver = NLTK_BLEU(ngram_weights=(0.25, 0.25, 0.25, 0.25))
        self.bleu1 = NLTK_BLEU(ngram_weights=(1, 0, 0, 0))
        self.bleu2 = NLTK_BLEU(ngram_weights=(0, 1, 0, 0))
        self.bleu4 = NLTK_BLEU(ngram_weights=(0, 0, 0, 1))
        self.topic_acc = Average()
        # module-related setup
        self._source_embedder = source_embedder
        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        target_embedding_dim = source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        self._encoder_output_dim = self._encoder.get_output_dim(
        )  # 600; maybe just replace both of these dims with outfeature
        self._decoder_output_dim = self._encoder_output_dim * 2
        self._decoder_input_dim = target_embedding_dim
        self._attention = None
        if attention:
            self._attention = attention
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim

        # Try fusing that embedding in here?
        self.before_linear = Linear(2 * self.outfeature, self.outfeature)
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)

        self._output_projection_layer = Linear(self.outfeature * 2,
                                               num_classes)

        self.linear_all = Linear(self.outfeature * 3 + self._decoder_input_dim,
                                 1)
        self.attention_linear = Linear(self.outfeature, self.outfeature)
        self.decoder_linear = Linear(self.outfeature * 2, self.outfeature)

        self.get_attn = Linear(self.outfeature, 1, bias=False)
        self.topic_acc = MyAverage()
        self.topic_rec = MyAverage()
        self.topic_f1 = F1()
        self.dink1 = Distinct1()
        self.dink2 = Distinct2()
        self.last_sen = 2
Example no. 10
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,  # just Embedding layer
        encoder1: Seq2SeqEncoder,  # user encoder
        encoder2: Seq2SeqEncoder,  # system encoder
        attention: Attention,  # decoding attention
        max_decoding_steps: int = 200,  # max timesteps of decoder
        beam_size: int = 3,  # beam search parameter
        target_namespace: str = "target_tokens",  # two separate vocabulary
        target_embedding_dim: int = None,  # target word embedding dimension
        scheduled_sampling_ratio: float = 0.,  # maybe unnecessary
        projection_dim: int = None,  #
        use_coverage: bool = False,  # coverage penalty, optional
        coverage_loss_weight: float = None,
        domain_lambda: float = 0.5,  # the penalty weight in the final loss function; needs to be tuned
        initializer: InitializerApplicator = InitializerApplicator()
    ) -> None:

        super(SPNet, self).__init__(vocab)

        # General variables
        # target_namespace: target_tokens; source_namespace: tokens;
        self._target_namespace = target_namespace
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self._source_unk_index = self.vocab.get_token_index(DEFAULT_OOV_TOKEN)
        self._target_unk_index = self.vocab.get_token_index(
            DEFAULT_OOV_TOKEN, self._target_namespace)
        self._source_vocab_size = self.vocab.get_vocab_size()
        self._target_vocab_size = self.vocab.get_vocab_size(
            self._target_namespace)

        # Encoder setting
        self._source_embedder = source_embedder
        self._encoder1 = encoder1
        self._encoder2 = encoder2
        # We assume that the 2 encoders have the same hidden state size
        self._encoder_output_dim = self._encoder1.get_output_dim()

        # Decoder setting
        self._target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim(
        )
        self._num_classes = self.vocab.get_vocab_size(self._target_namespace)
        self._target_embedder = Embedding(self._num_classes,
                                          self._target_embedding_dim)
        self._decoder_input_dim = self._encoder_output_dim * 2  # default as the decoder_output_dim
        # input projection of decoder: [context_attn, target_emb] -> [decoder_input_dim]
        self._input_projection_layer = Linear(
            self._target_embedding_dim + self._encoder_output_dim * 2,
            self._decoder_input_dim)
        self._decoder_output_dim = self._encoder_output_dim * 2
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)
        self._projection_dim = projection_dim or self._source_embedder.get_output_dim(
        )
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               self._num_classes)
        self._p_gen_layer = Linear(
            self._encoder_output_dim * 2 + self._decoder_output_dim * 2 +
            self._decoder_input_dim, 1)
        self._attention = attention

        # coverage penalty setting
        self._use_coverage = use_coverage
        self._coverage_loss_weight = coverage_loss_weight
        self._eps = 1e-45

        # Decoding strategy setting
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        # multitasking of domain classification
        self._domain_penalty = domain_lambda  # penalty term = 0.5 as default
        self._classifier_params = Params({
            "input_dim": self._decoder_output_dim,
            "hidden_dims": [128, 7],
            "activations": ["relu", "linear"],
            "dropout": [0.2, 0.0],
            "num_layers": 2
        })
        self._domain_classifier = FeedForward.from_params(
            self._classifier_params)

        initializer(self)
Example no. 11
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "tokens",
                 target_embedder: TextFieldEmbedder = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.25) -> None:
        super(PointerGeneratorPattern, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._pattern_pos = [
            '@@np@@', '@@ns@@', '@@ni@@', '@@nz@@', '@@m@@', '@@i@@', '@@id@@',
            '@@t@@', '@@j@@'
        ]
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        self._target_embedder = target_embedder or source_embedder
        #!!! attention on decoder output, not on decoder input !!!#
        self._decoder_input_dim = self._target_embedder.get_output_dim()

        # the decoder uses a unidirectional LSTM while the encoder uses a BiLSTM
        self._decoder_hidden_dim = self._encoder.get_output_dim()

        # decoder: h0 c0 projection_layer from final_encoder_output
        self.decode_h0_projection_layer = Linear(
            self._encoder.get_output_dim(), self._decoder_hidden_dim)
        self.decode_c0_projection_layer = Linear(
            self._encoder.get_output_dim(), self._decoder_hidden_dim)

        self._decoder_attention = Attention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the decoder_hidden of the decoder at each time step.
        # V[s_t, h*_t] + b
        self._decoder_output_dim = self._decoder_hidden_dim + self._encoder.get_output_dim(
        )  #[s_t, h*_t]

        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_hidden_dim)
        self._output_attention_layer = Linear(self._decoder_output_dim,
                                              self._decoder_hidden_dim)
        #V[s_t, h*_t] + b
        self._output_projection_layer = Linear(self._decoder_hidden_dim,
                                               num_classes)
        # num_classes -> V'
        # generation probability
        self._pointer_gen_layer = Linear(
            self._decoder_hidden_dim + self._encoder.get_output_dim() +
            self._decoder_input_dim, 1)
        # metrics
        self.metrics = {
            "ROUGE-1": Rouge(1),
            "ROUGE-2": Rouge(2),
        }
Example no. 12
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 target_namespace: str,
                 encoder: Seq2SeqEncoder,
                 decoder: Dict,
                 max_decoding_steps: int,
                 target_embedding_dim: int = None,
                 attention: Dict = None,
                 beam_size: int = None,
                 scheduled_sampling_ratio: float = 0.,
                 use_bleu: bool = True,
                 visualize_attention: bool = True) -> None:
        super(NmtSeq2Seq, self).__init__(vocab)

        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._target_namespace = target_namespace
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                   self._target_namespace)  # pylint: disable=protected-access
            self._bleu = BLEU(exclude_indices={
                pad_index, self._end_index, self._start_index
            })
        else:
            self._bleu = None

        # At prediction time, we use a beam search to find the most likely sequence of target tokens.
        beam_size = beam_size or 1
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        # Dense embedding of source vocab tokens.
        self._source_embedder = source_embedder

        # Encodes the sequence of source embeddings into a sequence of hidden states.
        self._encoder = encoder

        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        # Attention mechanism params applied to the encoder output for each step.
        self._attention = attention

        self._visualize_attention = visualize_attention

        # Dense embedding of vocab words in the target space.
        target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim(
        )
        self._target_embedder = Embedding(num_classes, target_embedding_dim)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        self._encoder_output_dim = self._encoder.get_output_dim()
        # self._decoder_output_dim = self._encoder_output_dim

        self._decoder_input_dim = decoder["input_size"]
        # If using attention make sure the .jsonnet params reflect this architecture:
        # input_to_decoder_rnn = [prev_word + attended_context_vector]
        self._decoder_output_dim = decoder['hidden_size']

        # We'll use an RNN cell as the recurrent cell that produces a hidden state
        # for the decoder at each time step.
        decoder_cell_type = decoder["type"]

        if decoder_cell_type == "gru":
            self._decoder_cell = GRUCell(self._decoder_input_dim,
                                         self._decoder_output_dim)
        elif decoder_cell_type == "lstm":
            self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                          self._decoder_output_dim)
        else:
            raise ValueError(
                "Decoder cell of type {} not supported yet!".format(
                    decoder_cell_type))

        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
Example no. 13
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 extra_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 extra_encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 beam_size: int = None,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 scheduled_sampling_ratio: float = 0.,
                 use_bleu: bool = True) -> None:
        super(InformedSeq2Seq, self).__init__(vocab)
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                   self._target_namespace)  # pylint: disable=protected-access
            self._bleu = BLEU(exclude_indices={
                pad_index, self._end_index, self._start_index
            })
        else:
            self._bleu = None

        # At prediction time, we use a beam search to find the most likely sequence of target tokens.
        beam_size = beam_size or 1
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        # Dense embedding of source vocab tokens.
        self._source_embedder = source_embedder
        self._extra_embedder = extra_embedder

        # Encodes the sequence of source embeddings into a sequence of hidden states.
        self._encoder = encoder
        self._extra_encoder = extra_encoder

        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        # Dense embedding of vocab words in the target space.
        # TODO: target_embedding_dim should be size of the concatenated vector
        target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim(
        )
        self._target_embedder = Embedding(num_classes, target_embedding_dim)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        # TODO: encoder_output_dim should be size of the concatenated vector
        self._encoder_output_dim = self._encoder.get_output_dim()
        self._decoder_output_dim = self._encoder_output_dim
        self._decoder_input_dim = target_embedding_dim

        # We'll use an LSTM cell as the recurrent cell that produces a hidden state
        # for the decoder at each time step.
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)

        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
Example no. 14
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedding: Embedding,
                 target_embedding: Embedding,
                 encoder: Seq2SeqEncoder,
                 target_namespace: str,
                 max_decoding_steps: int,
                 attention: Attention = None,
                 attention_function: SimilarityFunction = None,
                 beam_size: int = None,
                 scheduled_sampling_ratio: float = 0.) -> None:
        super().__init__(vocab)
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        # Dense embedding of source vocab tokens.
        self._source_embedding = source_embedding

        # Encodes the sequence of source embeddings into a sequence of hidden states.
        self._encoder = encoder

        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        # Attention mechanism applied to the encoder output for each step.
        if attention:
            if attention_function:
                raise ConfigurationError(
                    "You can only specify an attention module or an "
                    "attention function, but not both.")
            self._attention = attention
        elif attention_function:
            self._attention = LegacyAttention(attention_function)
        else:
            self._attention = None

        # Dense embedding of vocab words in the target space.

        self._target_embedding = target_embedding
        target_embedding_dim = self._target_embedding.get_output_dim()

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        self._encoder_output_dim = self._encoder.get_output_dim()
        self._decoder_output_dim = self._encoder_output_dim

        if self._attention:
            # If using attention, a weighted average over encoder output will be concatenated
            # to the previous target embedding to form the input to the decoder at each
            # time step.
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
        else:
            # Otherwise, the input to the decoder is just the previous target embedding.
            self._decoder_input_dim = target_embedding_dim

        # We'll use an LSTM cell as the recurrent cell that produces a hidden state
        # for the decoder at each time step.
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)

        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)

        # At prediction time, we can use a beam search to find the most likely sequence of target tokens.
        # If the beam_size parameter is not given, we'll just use a greedy search (equivalent to beam_size = 1).
        self._max_decoding_steps = max_decoding_steps
        if beam_size is not None:
            self._beam_search = BeamSearch(self._end_index,
                                           max_steps=max_decoding_steps,
                                           beam_size=beam_size)
        else:
            self._beam_search = None
Example no. 15
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 attention: Attention,
                 max_decoding_steps: int,
                 beam_size: int = None,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 scheduled_sampling_ratio: float = 0.,
                 projection_dim: int = None,
                 use_coverage: bool = False,
                 coverage_shift: float = 0.,
                 coverage_loss_weight: float = None,
                 embed_attn_to_output: bool = False) -> None:
        super(PointerGeneratorNetwork, self).__init__(vocab)

        self._target_namespace = target_namespace
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     target_namespace)
        self._unk_index = self.vocab.get_token_index(DEFAULT_OOV_TOKEN,
                                                     target_namespace)
        self._vocab_size = self.vocab.get_vocab_size(target_namespace)
        assert self._vocab_size > 2, \
            "Target vocabulary is empty. Make sure 'target_namespace' option of the model is correct."

        # Encoder
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._encoder_output_dim = self._encoder.get_output_dim()

        # Decoder
        self._target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim(
        )
        self._num_classes = self.vocab.get_vocab_size(target_namespace)
        self._target_embedder = Embedding(self._num_classes,
                                          self._target_embedding_dim)

        self._decoder_input_dim = self._encoder_output_dim + self._target_embedding_dim
        self._decoder_output_dim = self._encoder_output_dim
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)

        self._projection_dim = projection_dim or self._source_embedder.get_output_dim(
        )
        hidden_projection_dim = self._decoder_output_dim if not embed_attn_to_output else self._decoder_output_dim * 2
        self._hidden_projection_layer = Linear(hidden_projection_dim,
                                               self._projection_dim)
        self._output_projection_layer = Linear(self._projection_dim,
                                               self._num_classes)

        self._p_gen_layer = Linear(
            self._decoder_output_dim * 3 + self._decoder_input_dim, 1)
        self._attention = attention
        self._use_coverage = use_coverage
        self._coverage_loss_weight = coverage_loss_weight
        self._eps = 1e-31
        self._embed_attn_to_output = embed_attn_to_output
        self._coverage_shift = coverage_shift

        # Metrics
        self._p_gen_sum = 0.0
        self._p_gen_iterations = 0
        self._coverage_loss_sum = 0.0
        self._coverage_iterations = 0

        # Decoding
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size or 1)
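
Example 15 sets up a `_p_gen_layer` that produces a scalar generation probability from concatenated decoder quantities. A rough sketch of how such a p_gen is typically used to mix a vocabulary distribution with a copy distribution over source positions; note the gate in this model is sized for more inputs (3 × decoder output dim plus the decoder input dim), so the three-input version, shapes, and names below are simplifying assumptions for illustration only:

import torch
from torch.nn import Linear

batch_size, src_len, hidden_dim, vocab_size = 2, 6, 16, 40

decoder_state = torch.randn(batch_size, hidden_dim)
attn_context = torch.randn(batch_size, hidden_dim)
decoder_input = torch.randn(batch_size, hidden_dim)
attn_weights = torch.softmax(torch.randn(batch_size, src_len), dim=-1)
vocab_dist = torch.softmax(torch.randn(batch_size, vocab_size), dim=-1)
# Hypothetical source-token ids, already mapped into the target vocabulary.
source_token_ids = torch.randint(0, vocab_size, (batch_size, src_len))

p_gen_layer = Linear(hidden_dim * 3, 1)
p_gen = torch.sigmoid(p_gen_layer(torch.cat((decoder_state, attn_context, decoder_input), dim=-1)))

# Scale the generation distribution, then scatter-add the copy weights
# onto the vocabulary positions of the corresponding source tokens.
final_dist = p_gen * vocab_dist
final_dist = final_dist.scatter_add(1, source_token_ids, (1 - p_gen) * attn_weights)
print(final_dist.sum(dim=-1))  # each row sums to ~1
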
Esempio n. 16
    def __init__(self,
                 vocab: Vocabulary,
                 bert_model: BertQA,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 attention: Attention,
                 beam_size: int,
                 max_decoding_steps: int,
                 target_embedding_dim: int = 30,
                 copy_token: str = "@COPY@",
                 source_namespace: str = "source_tokens",
                 target_namespace: str = "target_tokens",
                 tensor_based_metric: Metric = None,
                 token_based_metric: Metric = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 dropout: float = 0.0) -> None:
        super().__init__(vocab)

        self.bert_model = bert_model
        self._source_namespace = source_namespace
        self._target_namespace = target_namespace
        self._src_start_index = self.vocab.get_token_index(
            START_SYMBOL, self._source_namespace)
        self._src_end_index = self.vocab.get_token_index(
            END_SYMBOL, self._source_namespace)
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self._oov_index = self.vocab.get_token_index(self.vocab._oov_token,
                                                     self._target_namespace)  # pylint: disable=protected-access
        self._pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                     self._target_namespace)  # pylint: disable=protected-access
        self._copy_index = self.vocab.add_token_to_namespace(
            copy_token, self._target_namespace)

        self._tensor_based_metric = tensor_based_metric or \
            BLEU(exclude_indices={self._pad_index, self._end_index, self._start_index})
        self._token_based_metric = token_based_metric
        self._action_accuracy = CategoricalAccuracy()

        self._target_vocab_size = self.vocab.get_vocab_size(
            self._target_namespace)

        # Encoding modules.
        self._source_embedder = source_embedder
        self._encoder = encoder

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        # We arbitrarily set the decoder's input dimension to be the same as the output dimension.
        self.encoder_output_dim = self._encoder.get_output_dim()
        self.decoder_output_dim = self.encoder_output_dim
        self.decoder_input_dim = self.decoder_output_dim

        embedding_dim = self.bert_model._text_field_embedder.get_output_dim()
        self._action_predictor = Linear(embedding_dim, 4)
        self._init_decoder_projection = Linear(self.encoder_output_dim,
                                               self.decoder_output_dim)

        target_vocab_size = self.vocab.get_vocab_size(self._target_namespace)

        # The decoder input will be a function of the embedding of the previous predicted token,
        # an attended encoder hidden state called the "attentive read", and another
        # weighted sum of the encoder hidden state called the "selective read".
        # While the weights for the attentive read are calculated by an `Attention` module,
        # the weights for the selective read are simply the predicted probabilities
        # corresponding to each token in the source sentence that matches the target
        # token from the previous timestep.
        self._target_embedder = Embedding(target_vocab_size,
                                          target_embedding_dim)
        self._attention = attention
        self._input_projection_layer = Linear(
            target_embedding_dim + self.encoder_output_dim * 2,
            self.decoder_input_dim)

        # We then run the projected decoder input through an LSTM cell to produce
        # the next hidden state.
        self._decoder_cell = LSTMCell(self.decoder_input_dim,
                                      self.decoder_output_dim)

        # We create a "generation" score for each token in the target vocab
        # with a linear projection of the decoder hidden state.
        self._output_generation_layer = Linear(self.decoder_output_dim,
                                               target_vocab_size)

        # We create a "copying" score for each source token by applying a non-linearity
        # (tanh) to a linear projection of the encoded hidden state for that token,
        # and then taking the dot product of the result with the decoder hidden state.
        self._output_copying_layer = Linear(self.encoder_output_dim,
                                            self.decoder_output_dim)

        # At prediction time, we'll use a beam search to find the best target sequence.
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        initializer(self)
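
The comments in Example 16 describe the copying score as a dot product between the decoder hidden state and a tanh of a linear projection of each encoded source token. A small stand-alone version of just that scoring step (batch size, lengths, and dimensions are made up):

import torch
from torch.nn import Linear

batch_size, src_len, encoder_dim, decoder_dim = 2, 5, 12, 12

encoder_outputs = torch.randn(batch_size, src_len, encoder_dim)
decoder_hidden = torch.randn(batch_size, decoder_dim)

output_copying_layer = Linear(encoder_dim, decoder_dim)

# (batch, src_len, decoder_dim): non-linear projection of each encoded source token.
copy_projection = torch.tanh(output_copying_layer(encoder_outputs))
# Dot product with the decoder hidden state gives one copy score per source token.
copy_scores = torch.bmm(copy_projection, decoder_hidden.unsqueeze(-1)).squeeze(-1)
print(copy_scores.shape)  # (batch, src_len)
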
Esempio n. 17
    def __init__(
        self,
        task: str,
        vocab: Vocabulary,
        input_dim: int,
        max_decoding_steps: int,
        loss_weight: float = 1.0,
        attention: Attention = None,
        beam_size: int = None,
        target_namespace: str = "target_tokens",
        target_embedding_dim: int = None,
        scheduled_sampling_ratio: float = 0.0,
        use_bleu: bool = True,
        bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25, 0.25),
        target_decoder_layers: int = 1,
        **kwargs,
    ) -> None:

        super().__init__(vocab, **kwargs)

        self.task = task
        self.vocab = vocab
        self.loss_weight = loss_weight
        self._target_namespace = task + '_target_words'
        self._target_decoder_layers = target_decoder_layers
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                   self._target_namespace)
            self._bleu = BLEU(bleu_ngram_weights,
                              exclude_indices={
                                  pad_index, self._end_index, self._start_index
                              })
        else:
            self._bleu = None
        self.metrics = {"bleu": self._bleu}

        # At prediction time, we use a beam search to find the most likely sequence of target tokens.
        beam_size = beam_size or 1
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        num_classes = self.vocab.get_vocab_size(
            namespace=self._target_namespace)

        # Attention mechanism applied to the encoder output for each step.
        self._attention = attention

        # The input to the decoder is just the previous target embedding.
        # Note: default to input_dim here; self._encoder_output_dim is only assigned further below.
        target_embedding_dim = target_embedding_dim or input_dim
        self._decoder_input_dim = target_embedding_dim

        # Dense embedding of vocab words in the target space.
        self._target_embedder = Embedding(num_embeddings=num_classes,
                                          embedding_dim=target_embedding_dim)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        self._encoder_output_dim = input_dim
        self._decoder_output_dim = self._encoder_output_dim

        if self._attention:
            # If using attention, a weighted average over encoder outputs will be concatenated
            # to the previous target embedding to form the input to the decoder at each
            # time step.
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
        else:
            # Otherwise, the input to the decoder is just the previous target embedding.
            self._decoder_input_dim = target_embedding_dim

        # We'll use an LSTM cell as the recurrent cell that produces a hidden state
        # for the decoder at each time step.
        if self._target_decoder_layers > 1:
            self._decoder_cell = LSTM(
                self._decoder_input_dim,
                self._decoder_output_dim,
                self._target_decoder_layers,
            )
        else:
            self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                          self._decoder_output_dim)
        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
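
Example 17 picks a multi-layer `LSTM` when `target_decoder_layers > 1` and an `LSTMCell` otherwise. The two keep their state in different shapes, which matters when a decoding loop threads the hidden state through each step; a minimal comparison with arbitrary dimensions:

import torch
from torch.nn import LSTM, LSTMCell

batch_size, input_dim, hidden_dim, num_layers = 2, 10, 16, 2

step_input = torch.randn(batch_size, input_dim)

# LSTMCell: state is a pair of (batch, hidden) tensors, one call per time step.
cell = LSTMCell(input_dim, hidden_dim)
h, c = torch.zeros(batch_size, hidden_dim), torch.zeros(batch_size, hidden_dim)
h, c = cell(step_input, (h, c))

# Multi-layer LSTM: state carries a leading num_layers dimension, and the input
# needs an explicit sequence dimension even when decoding one step at a time.
lstm = LSTM(input_dim, hidden_dim, num_layers)
h0 = torch.zeros(num_layers, batch_size, hidden_dim)
c0 = torch.zeros(num_layers, batch_size, hidden_dim)
output, (hn, cn) = lstm(step_input.unsqueeze(0), (h0, c0))  # seq_len = 1
print(h.shape, hn.shape)  # (batch, hidden) vs (num_layers, batch, hidden)
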
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        max_decoding_steps: int,
        target_namespace: str = "tokens",
        target_embedding_dim: int = None,
        attention_function: SimilarityFunction = None,
        scheduled_sampling_ratio: float = 0.0,
        weight_function="softmax",
        gumbel_tau: float = 0.66,
        gumbel_hard: bool = True,
        gumbel_eps: float = 1e-10,
        infer_with: str = "distribution",
        self_feed_with: str = "argmax_distribution",
    ) -> None:
        super(Rnn2RnnDifferentiableNll, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim()
        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim(
        )
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        if self._attention_function:
            self._decoder_attention = LegacyAttention(self._attention_function)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time step.
            self._decoder_input_dim = self._encoder.get_output_dim(
            ) + target_embedding_dim
        else:
            self._decoder_input_dim = target_embedding_dim
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)

        self._weights_calculation_function = weight_function

        self._gumbel_tau = gumbel_tau
        self._gumbel_hard = gumbel_hard
        self._gamble_eps = gumbel_eps

        if self_feed_with not in {
                "distribution", "argmax_logits", "argmax_distribution",
                "detach_distribution"
        }:
            raise ValueError(
                "Allowed values for self_feed_with are {distribution, argmax_logits, argmax_distribution, detach_distribution}"
            )

        if infer_with not in {
                "distribution", "argmax_logits", "argmax_distribution"
        }:
            raise ValueError(
                "Allowed values for infer_with are {distribution, argmax_logits, argmax_distribution}"
            )

        self._infer_with = infer_with
        self._self_feed_with = self_feed_with
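
The `Rnn2RnnDifferentiableNll` constructor above exposes Gumbel-softmax settings (`gumbel_tau`, `gumbel_hard`, `gumbel_eps`) and a `self_feed_with` mode, suggesting the decoder can feed itself a relaxed distribution over the vocabulary rather than a hard argmax token. A sketch of that idea using PyTorch's built-in `gumbel_softmax`; how the real model wires this into its decoding loop is not shown in the snippet, so this is only an illustration:

import torch
import torch.nn.functional as F
from torch.nn import Embedding

batch_size, vocab_size, emb_dim = 2, 30, 8
target_embedder = Embedding(vocab_size, emb_dim)

logits = torch.randn(batch_size, vocab_size, requires_grad=True)

# Hard Gumbel-softmax: one-hot in the forward pass, but gradients flow through
# the underlying soft distribution (straight-through estimator).
one_hot = F.gumbel_softmax(logits, tau=0.66, hard=True)

# Instead of embedding an argmax index, take a weighted sum of all target
# embeddings, which keeps the self-fed input differentiable w.r.t. the logits.
next_input = one_hot @ target_embedder.weight   # (batch, emb_dim)
next_input.sum().backward()
print(logits.grad is not None)  # True: gradients reach the logits
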
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 seq_metrics: Metric,
                 attention: Attention,
                 beam_size: int = None,
                 source_namespace: str = 'source_tokens',
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 scheduled_sampling_ratio: float = 0.,
                 use_bleu: bool = False,
                 encoder_input_dropout: int = 0.0,
                 encoder_output_dropout: int = 0.0,
                 dropout=0.0,
                 feed_output_attention_to_decoder: bool = False,
                 keep_decoder_output_dim_same_as_encoder: bool = True,
                 initializer: InitializerApplicator = InitializerApplicator()) -> None:

        super(RecombinationSeq2SeqWithCopy, self).__init__(vocab)
        self._source_namespace = source_namespace
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
        self._pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                     self._target_namespace)  # pylint: disable=protected-access

        # Evaluation Metrics
        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
            self._bleu = BLEU(exclude_indices={pad_index, self._end_index, self._start_index})
        else:
            self._bleu = None
        self._seq_metric = seq_metrics

        # At prediction time, we use a beam search to find the most likely sequence of target tokens.
        beam_size = beam_size or 1
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)

        # Dense embedding of source vocab tokens.
        self._source_embedder = source_embedder

        # Encoder

        # Encodes the sequence of source embeddings into a sequence of hidden states.
        self._encoder = encoder
        self._encoder_output_dim = self._encoder.get_output_dim()

        # Attention mechanism applied to the encoder output for each step.
        self._attention = attention
        self._feed_output_attention_to_decoder = feed_output_attention_to_decoder
        # Default the target embedding size before it is used to size the decoder input below.
        target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
        if self._feed_output_attention_to_decoder:
            # If using attention, a weighted average over encoder outputs will be concatenated
            # to the previous target embedding to form the input to the decoder at each
            # time step.
            self._decoder_input_dim = self._encoder_output_dim + target_embedding_dim
        else:
            # Otherwise, the input to the decoder is just the previous target embedding.
            self._decoder_input_dim = target_embedding_dim

        # Decoder

        # Dense embedding of vocab words in the target space.
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        self._num_classes = num_classes
        self._target_embedder = Embedding(num_classes, target_embedding_dim)

        # TODO: relax this assumption
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        self._keep_decoder_output_dim_same_as_encoder = keep_decoder_output_dim_same_as_encoder
        if not self._keep_decoder_output_dim_same_as_encoder:
            self._decoder_output_dim = int(self._encoder_output_dim / 2) if encoder.is_bidirectional() \
                else self._encoder_output_dim
        else:
            self._decoder_output_dim = self._encoder_output_dim

        self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)

        self._transform_decoder_init_state = torch.nn.Sequential(
            torch.nn.Linear(self._encoder_output_dim, self._decoder_output_dim),
            torch.nn.Tanh()
        )

        # Generate Score
        self._output_projection_layer = Linear(self._decoder_output_dim + self._encoder_output_dim, num_classes)

        # Dropout Layers
        self._encoder_input_dropout = torch.nn.Dropout(p=encoder_input_dropout)
        self._encoder_output_dropout = torch.nn.Dropout(p=encoder_output_dropout)
        self._output_dropout = torch.nn.Dropout(p=dropout)
        self._embedded_dropout = torch.nn.Dropout(p=dropout)

        initializer(self)
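
When `keep_decoder_output_dim_same_as_encoder` is False, the example above shrinks the decoder hidden size to half of a bidirectional encoder's output and bridges the final encoder state through `_transform_decoder_init_state` (a Linear layer followed by Tanh). A minimal sketch of that bridge with made-up sizes:

import torch
from torch.nn import Linear, Sequential, Tanh

batch_size = 2
encoder_output_dim = 32                       # e.g. a bidirectional encoder
decoder_output_dim = encoder_output_dim // 2  # decoder kept unidirectional

final_encoder_output = torch.randn(batch_size, encoder_output_dim)

# Linear + Tanh bridge, as in _transform_decoder_init_state above.
bridge = Sequential(Linear(encoder_output_dim, decoder_output_dim), Tanh())

decoder_hidden = bridge(final_encoder_output)                  # initial h_0 for the decoder cell
decoder_context = torch.zeros(batch_size, decoder_output_dim)  # c_0 usually starts at zero
print(decoder_hidden.shape, decoder_context.shape)
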
Esempio n. 20
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 attention: Attention,
                 beam_size: int,
                 max_decoding_steps: int,
                 binary_pred_feature_dim: int = 0,
                 language_flag_dim: int = 0,
                 number_of_languages: int = 2,
                 target_embedding_dim: int = 100,
                 copy_token: str = "@COPY@",
                 source_namespace: str = "source_tokens",
                 target_namespace: str = "target_tokens",
                 tensor_based_metric: Metric = None,
                 token_based_metric: Metric = None) -> None:
        super().__init__(vocab)
        self._source_namespace = source_namespace
        self._target_namespace = target_namespace
        self._src_start_index = self.vocab.get_token_index(
            START_SYMBOL, self._source_namespace)
        self._src_end_index = self.vocab.get_token_index(
            END_SYMBOL, self._source_namespace)
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self._oov_index = self.vocab.get_token_index(self.vocab._oov_token,
                                                     self._target_namespace)  # pylint: disable=protected-access
        self._pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                     self._target_namespace)  # pylint: disable=protected-access
        self._copy_index = self.vocab.add_token_to_namespace(
            copy_token, self._target_namespace)

        self._tensor_based_metric = tensor_based_metric or \
            BLEU(exclude_indices={self._pad_index, self._end_index, self._start_index})
        self._token_based_metric = token_based_metric

        self._target_vocab_size = self.vocab.get_vocab_size(
            self._target_namespace)

        # There are exactly two features for the verb predicate embedding (Verb or Non-Verb).
        if binary_pred_feature_dim > 0:
            self._binary_feature_embedding = Embedding(
                2, binary_pred_feature_dim)
        else:
            self._binary_feature_embedding = None

        # Language Token Embeddings!
        if language_flag_dim > 0:
            self._language_embedding = Embedding(number_of_languages,
                                                 language_flag_dim)
        else:
            self._language_embedding = None

        # Encoding modules
        self._source_embedder = source_embedder
        self._encoder = encoder

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        # We arbitrarily set the decoder's input dimension to be the same as the output dimension.
        self.encoder_output_dim = self._encoder.get_output_dim()
        self.decoder_output_dim = self.encoder_output_dim
        self.decoder_input_dim = self.decoder_output_dim

        target_vocab_size = self.vocab.get_vocab_size(self._target_namespace)

        # The decoder input will be a function of the embedding of the previous predicted token,
        # an attended encoder hidden state called the "attentive read", and another
        # weighted sum of the encoder hidden state called the "selective read".
        # While the weights for the attentive read are calculated by an `Attention` module,
        # the weights for the selective read are simply the predicted probabilities
        # corresponding to each token in the source sentence that matches the target
        # token from the previous timestep.
        self._target_embedder = Embedding(target_vocab_size,
                                          target_embedding_dim)
        self._attention = attention
        self._input_projection_layer = Linear(
            target_embedding_dim + language_flag_dim +
            self.encoder_output_dim * 2, self.decoder_input_dim)

        self._language_dec_indicator = None
        self._beam_size = beam_size

        # We then run the projected decoder input through an LSTM cell to produce
        # the next hidden state.
        self._decoder_cell = LSTMCell(self.decoder_input_dim,
                                      self.decoder_output_dim)

        # We create a "generation" score for each token in the target vocab
        # with a linear projection of the decoder hidden state.
        self._output_generation_layer = Linear(self.decoder_output_dim,
                                               target_vocab_size)

        # We create a "copying" score for each source token by applying a non-linearity
        # (tanh) to a linear projection of the encoded hidden state for that token,
        # and then taking the dot product of the result with the decoder hidden state.
        self._output_copying_layer = Linear(self.encoder_output_dim,
                                            self.decoder_output_dim)

        # At prediction time, we'll use a beam search to find the best target sequence.
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "tokens",
                 target_embedder: TextFieldEmbedder = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.25,
                 pointer_gen: bool = True,
                 language_model: bool = True,
                 max_oovs: int = None) -> None:
        super(PointerGenerator, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._pointer_gen = pointer_gen
        self._language_model = language_model
        if self._pointer_gen:
            self._max_oovs = max_oovs
            self.vocab.set_max_oovs(self._max_oovs)
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.

        self._target_embedder = target_embedder or source_embedder
        #!!! attention on decoder output, not on decoder input !!!#
        self._decoder_input_dim = self._target_embedder.get_output_dim()
                
        # The decoder uses a unidirectional LSTM while the encoder uses a BiLSTM.
        self._decoder_hidden_dim = self._encoder.get_output_dim()//2
        
        # decoder: h0 and c0 are projected from the final encoder output
        self.decode_h0_projection_layer = Linear(self._encoder.get_output_dim(), self._decoder_hidden_dim)
        self.decode_c0_projection_layer = Linear(self._encoder.get_output_dim(), self._decoder_hidden_dim)

        self._decoder_attention = Attention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the decoder_hidden of the decoder at each time step.
        # V[s_t, h*_t] + b
        self._decoder_output_dim = self._decoder_hidden_dim + self._encoder.get_output_dim() #[s_t, h*_t]
        
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_hidden_dim)
        self._output_attention_layer = Linear(self._decoder_output_dim, self._decoder_hidden_dim)
        #V[s_t, h*_t] + b
        self._output_projection_layer = Linear(self._decoder_hidden_dim, num_classes)
        # num_classes -> V'
        # generation probability
        if self._pointer_gen:
            self._pointer_gen_layer = Linear(self._decoder_hidden_dim+self._encoder.get_output_dim()+self._decoder_input_dim, 1)
        # metrics
        self.metrics = {
                "ROUGE-1": Rouge(1),
                "ROUGE-2": Rouge(2),
        }
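
The comments in this pointer-generator example describe the output layer as V[s_t, h*_t] + b: the decoder state s_t and the attention context h*_t are concatenated, projected back down to the decoder hidden size, and only then projected to vocabulary logits. A small sketch of that two-stage projection (dimensions are assumptions):

import torch
from torch.nn import Linear

batch_size, decoder_hidden_dim, encoder_output_dim, vocab_size = 2, 16, 32, 40

s_t = torch.randn(batch_size, decoder_hidden_dim)       # decoder hidden state
h_star_t = torch.randn(batch_size, encoder_output_dim)  # attention context vector

output_attention_layer = Linear(decoder_hidden_dim + encoder_output_dim, decoder_hidden_dim)
output_projection_layer = Linear(decoder_hidden_dim, vocab_size)

# V[s_t, h*_t] + b, then a second projection into the vocabulary.
combined = output_attention_layer(torch.cat((s_t, h_star_t), dim=-1))
logits = output_projection_layer(combined)
print(logits.shape)  # (batch, vocab_size)
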
Esempio n. 22
class SimpleSeq2Seq(Model):
    """
    This ``SimpleSeq2Seq`` class is a :class:`Model` which takes a sequence, encodes it, and then
    uses the encoded representations to decode another sequence.  You can use this as the basis for
    a neural machine translation system, an abstractive summarization system, or any other common
    seq2seq problem.  The model here is simple, but should be a decent starting place for
    implementing recent models for these tasks.

    This ``SimpleSeq2Seq`` model takes an encoder (:class:`Seq2SeqEncoder`) as an input, and
    implements the functionality of the decoder.  In this implementation, the decoder uses the
    encoder's outputs in two ways. The hidden state of the decoder is initialized with the output
    from the final time-step of the encoder, and when using attention, a weighted average of the
    outputs from the encoder is concatenated to the inputs of the decoder at every timestep.

    Parameters
    ----------
    vocab : ``Vocabulary``, required
        Vocabulary containing source and target vocabularies. They may be under the same namespace
        (``tokens``) or the target tokens can have a different namespace, in which case it needs to
        be specified as ``target_namespace``.
    source_embedder : ``TextFieldEmbedder``, required
        Embedder for source side sequences
    encoder : ``Seq2SeqEncoder``, required
        The encoder of the "encoder/decoder" model
    max_decoding_steps : int, required
        Length of decoded sequences
    target_namespace : str, optional (default = 'tokens')
        If the target side vocabulary is different from the source side's, you need to specify the
        target's namespace here. If not, we'll assume it is "tokens", which is also the default
        choice for the source side, and this might cause them to share vocabularies.
    target_embedding_dim : int, optional (default = source_embedding_dim)
        You can specify an embedding dimensionality for the target side. If not, we'll use the same
        value as the source embedder's.
    attention_function: ``SimilarityFunction``, optional (default = None)
        If you want to use attention to get a dynamic summary of the encoder outputs at each step
        of decoding, this is the function used to compute similarity between the decoder hidden
        state and encoder outputs.
    scheduled_sampling_ratio: float, optional (default = 0.0)
        At each timestep during training, we sample a random number between 0 and 1, and if it is
        not less than this value, we use the ground truth labels for the whole batch. Else, we use
        the predictions from the previous time step for the whole batch. If this value is 0.0
        (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not
        using target side ground truth labels.  See the following paper for more information:
        Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. Bengio et al.,
        2015.
    """
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "target_tags",
                 target_embedding_dim: int = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SimpleSeq2Seq, self).__init__(vocab, regularizer)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim()
        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim(
        )
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        if self._attention_function:
            self._decoder_attention = Attention(self._attention_function)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time step.
            self._decoder_input_dim = self._encoder.get_output_dim(
            ) + target_embedding_dim
        else:
            self._decoder_input_dim = target_embedding_dim
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)
        # self._decoder_cell = GRUCell(self._decoder_input_dim, self._decoder_output_dim, bias=False)
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self.span_metric = SpanBasedF1Measure(
            vocab,
            tag_namespace=target_namespace,
            ignore_classes=[START_SYMBOL[2:], END_SYMBOL[2:]])
        initializer(self)

        # Initialize forget gate
        encoder_parameters = self._encoder.state_dict()
        for pname in encoder_parameters:
            if 'bias_' in pname:
                print(pname)
                b = encoder_parameters[pname]
                l = len(b)
                b[l // 4:l // 2] = 1.0
        decoder_parameters = self._decoder_cell.state_dict()
        for pname in decoder_parameters:
            if 'bias_' in pname:
                print(pname)
                b = decoder_parameters[pname]
                l = len(b)
                b[l // 4:l // 2] = 1.0

    def _examine_source_indices(self, preindices):
        if not isinstance(preindices, numpy.ndarray):
            preindices = preindices.data.cpu().numpy()
        all_predicted_tokens = []
        for indices in preindices:
            predicted_tokens = [
                self.vocab.get_token_from_index(x, namespace="source_tokens")
                for x in list(indices)
            ]
            all_predicted_tokens.append(predicted_tokens)
        return all_predicted_tokens

    def _examine_target_indices(self, preindices):
        if not isinstance(preindices, numpy.ndarray):
            preindices = preindices.data.cpu().numpy()
        all_predicted_tokens = []
        for indices in preindices:
            indices = list(indices)
            # Collect indices till the first end_symbol
            # if self._end_index in indices:
            #     indices = indices[:indices.index(self._end_index)]
            predicted_tokens = [
                self.vocab.get_token_from_index(
                    x, namespace=self._target_namespace) for x in indices
            ]
            all_predicted_tokens.append(predicted_tokens)
        return all_predicted_tokens

    def _print_source_target_triplets(self, src, tgt, true_tgt):
        src = self._examine_source_indices(src)
        true_tgt = self._examine_target_indices(true_tgt)
        tgt = self._examine_target_indices(tgt)
        for i in [0, int(len(src) / 2), -1]:
            print('Source:      ', ' '.join(src[i]))
            print('Target:      ', ' '.join(tgt[i]))
            print('True target: ', ' '.join(true_tgt[i][1:]))
        print('')

    @overrides
    def forward(
        self,  # type: ignore
        source_tokens: Dict[str, torch.LongTensor],
        target_tokens: Dict[str, torch.LongTensor] = None
    ) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Decoder logic for producing the entire target sequence.

        Parameters
        ----------
        source_tokens : Dict[str, torch.LongTensor]
           The output of ``TextField.as_array()`` applied on the source ``TextField``. This will be
           passed through a ``TextFieldEmbedder`` and then through an encoder.
        target_tokens : Dict[str, torch.LongTensor], optional (default = None)
           Output of ``Textfield.as_array()`` applied on target ``TextField``. We assume that the
           target tokens are also represented as a ``TextField``.
        """
        # embed()
        # (batch_size, input_sequence_length, encoder_output_dim)
        embedded_input = self._source_embedder(source_tokens)
        batch_size, _, _ = embedded_input.size()
        source_mask = get_text_field_mask(source_tokens)
        encoder_outputs = self._encoder(embedded_input, source_mask)
        final_encoder_output = encoder_outputs[:, -1]  # (batch_size, encoder_output_dim)
        if target_tokens:
            targets = target_tokens["tokens"]
            target_sequence_length = targets.size()[1]
            # The last input from the target is either padding or the end symbol. Either way, we
            # don't have to process it.
            num_decoding_steps = target_sequence_length - 1
        else:
            num_decoding_steps = self._max_decoding_steps
        decoder_hidden = final_encoder_output
        decoder_context = Variable(encoder_outputs.data.new().resize_(
            batch_size, self._decoder_output_dim).fill_(0))
        last_predictions = None
        step_logits = []
        step_probabilities = []
        step_predictions = []
        for timestep in range(num_decoding_steps):
            if self.training and all(
                    torch.rand(1) >= self._scheduled_sampling_ratio):
                input_choices = targets[:, timestep]
            else:
                if timestep == 0:
                    # For the first timestep, when we do not have targets, we input start symbols.
                    # (batch_size,)
                    input_choices = Variable(
                        source_mask.data.new().resize_(batch_size).fill_(
                            self._start_index))
                else:
                    input_choices = last_predictions
            decoder_input = self._prepare_decode_step_input(
                input_choices, decoder_hidden, encoder_outputs, source_mask)
            decoder_hidden, decoder_context = self._decoder_cell(
                decoder_input, (decoder_hidden, decoder_context))
            # (batch_size, num_classes)
            output_projections = self._output_projection_layer(decoder_hidden)
            # list of (batch_size, 1, num_classes)
            step_logits.append(output_projections.unsqueeze(1))
            class_probabilities = F.softmax(output_projections, dim=-1)
            _, predicted_classes = torch.max(class_probabilities, 1)
            step_probabilities.append(class_probabilities.unsqueeze(1))
            last_predictions = predicted_classes
            # (batch_size, 1)
            step_predictions.append(last_predictions.unsqueeze(1))
        # step_logits is a list containing tensors of shape (batch_size, 1, num_classes)
        # This is (batch_size, num_decoding_steps, num_classes)
        logits = torch.cat(step_logits, 1)
        class_probabilities = torch.cat(step_probabilities, 1)
        all_predictions = torch.cat(step_predictions, 1)
        output_dict = {
            "logits": logits,
            "class_probabilities": class_probabilities,
            "predictions": all_predictions
        }
        if target_tokens:
            target_mask = get_text_field_mask(target_tokens)
            loss = self._get_loss(logits, targets, target_mask)
            output_dict["loss"] = loss
            # TODO: Define metrics
            relevant_targets = targets[:, 1:].contiguous()  # (batch_size, num_decoding_steps)
            relevant_mask = target_mask[:, 1:].contiguous()
            for metric in self.metrics.values():
                metric(logits, relevant_targets, relevant_mask.float())
            class_probabilities = logits * 0.
            for i, instance_tags in enumerate(
                    all_predictions.cpu().data.numpy()):
                for j, tag_id in enumerate(instance_tags):
                    class_probabilities[i, j, tag_id] = 1
            # embed()
            self.span_metric(class_probabilities, relevant_targets,
                             relevant_mask)
            self._print_source_target_triplets(source_tokens['tokens'],
                                               all_predictions,
                                               target_tokens['tokens'])
        return output_dict

    def _prepare_decode_step_input(
            self,
            input_indices: torch.LongTensor,
            decoder_hidden_state: torch.LongTensor = None,
            encoder_outputs: torch.LongTensor = None,
            encoder_outputs_mask: torch.LongTensor = None) -> torch.LongTensor:
        """
        Given the input indices for the current timestep of the decoder, and all the encoder
        outputs, compute the input at the current timestep.  Note: This method is agnostic to
        whether the indices are gold indices or the predictions made by the decoder at the last
        timestep. So, this can be used even if we're doing some kind of scheduled sampling.

        If we're not using attention, the output of this method is just an embedding of the input
        indices.  If we are, the output will be a concatentation of the embedding and an attended
        average of the encoder inputs.

        Parameters
        ----------
        input_indices : torch.LongTensor
            Indices of either the gold inputs to the decoder or the predicted labels from the
            previous timestep.
        decoder_hidden_state : torch.LongTensor, optional (not needed if no attention)
            Output of from the decoder at the last time step. Needed only if using attention.
        encoder_outputs : torch.LongTensor, optional (not needed if no attention)
            Encoder outputs from all time steps. Needed only if using attention.
        encoder_outputs_mask : torch.LongTensor, optional (not needed if no attention)
            Masks on encoder outputs. Needed only if using attention.
        """
        # input_indices : (batch_size,)  since we are processing these one timestep at a time.
        # (batch_size, target_embedding_dim)
        embedded_input = self._target_embedder(input_indices)
        if self._attention_function:
            # encoder_outputs : (batch_size, input_sequence_length, encoder_output_dim)
            # Ensuring mask is also a FloatTensor. Or else the multiplication within attention will
            # complain.
            encoder_outputs_mask = encoder_outputs_mask.float()
            # (batch_size, input_sequence_length)
            input_weights = self._decoder_attention(decoder_hidden_state,
                                                    encoder_outputs,
                                                    encoder_outputs_mask)
            # (batch_size, encoder_output_dim)
            attended_input = weighted_sum(encoder_outputs, input_weights)
            # (batch_size, encoder_output_dim + target_embedding_dim)
            return torch.cat((attended_input, embedded_input), -1)
        else:
            return embedded_input

    @staticmethod
    def _get_loss(logits: torch.LongTensor, targets: torch.LongTensor,
                  target_mask: torch.LongTensor) -> torch.LongTensor:
        """
        Takes logits (unnormalized outputs from the decoder) of size (batch_size,
        num_decoding_steps, num_classes), target indices of size (batch_size, num_decoding_steps+1)
        and corresponding masks of size (batch_size, num_decoding_steps+1) steps and computes cross
        entropy loss while taking the mask into account.

        The length of ``targets`` is expected to be greater than that of ``logits`` because the
        decoder does not need to compute the output corresponding to the last timestep of
        ``targets``. This method aligns the inputs appropriately to compute the loss.

        During training, we want the logit corresponding to timestep i to be similar to the target
        token from timestep i + 1. That is, the targets should be shifted by one timestep for
        appropriate comparison.  Consider a single example where the target has 3 words, and
        padding is to 7 tokens.
           The complete sequence would correspond to <S> w1  w2  w3  <E> <P> <P>
           and the mask would be                     1   1   1   1   1   0   0
           and let the logits be                     l1  l2  l3  l4  l5  l6
        We actually need to compare:
           the sequence           w1  w2  w3  <E> <P> <P>
           with masks             1   1   1   1   0   0
           against                l1  l2  l3  l4  l5  l6
           (where the input was)  <S> w1  w2  w3  <E> <P>
        """
        relevant_targets = targets[:, 1:].contiguous()  # (batch_size, num_decoding_steps)
        relevant_mask = target_mask[:, 1:].contiguous()  # (batch_size, num_decoding_steps)
        loss = sequence_cross_entropy_with_logits(logits, relevant_targets,
                                                  relevant_mask)
        return loss
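
The docstring above walks through the one-token shift between targets and logits. The same alignment can be checked with plain tensors; the sketch below mirrors the idea with ordinary cross entropy and an explicit mask rather than AllenNLP's `sequence_cross_entropy_with_logits`, and all token ids are made up:

import torch
import torch.nn.functional as F

batch_size, num_steps_plus_one, vocab_size = 1, 7, 10

# <S> w1 w2 w3 <E> <P> <P>  ->  targets has num_decoding_steps + 1 entries.
targets = torch.tensor([[1, 4, 5, 6, 2, 0, 0]])
target_mask = torch.tensor([[1, 1, 1, 1, 1, 0, 0]])
logits = torch.randn(batch_size, num_steps_plus_one - 1, vocab_size)  # l1 .. l6

# Drop <S> from the targets so that logit i is compared with the token at i + 1.
relevant_targets = targets[:, 1:]           # w1 w2 w3 <E> <P> <P>
relevant_mask = target_mask[:, 1:].float()  # 1  1  1  1   0   0

per_token_loss = F.cross_entropy(
    logits.reshape(-1, vocab_size), relevant_targets.reshape(-1), reduction="none"
).reshape(batch_size, -1)

# Average only over non-padding positions.
loss = (per_token_loss * relevant_mask).sum() / relevant_mask.sum()
print(loss)
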

    @overrides
    def decode(
            self, output_dict: Dict[str,
                                    torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        This method overrides ``Model.decode``, which gets called after ``Model.forward``, at test
        time, to finalize predictions. The logic for the decoder part of the encoder-decoder lives
        within the ``forward`` method.

        This method trims the output predictions to the first end symbol, replaces indices with
        corresponding tokens, and adds a field called ``predicted_tokens`` to the ``output_dict``.
        """
        predicted_indices = output_dict["predictions"]
        if not isinstance(predicted_indices, numpy.ndarray):
            predicted_indices = predicted_indices.data.cpu().numpy()
        all_predicted_tokens = []
        for indices in predicted_indices:
            indices = list(indices)
            # Collect indices till the first end_symbol
            if self._end_index in indices:
                indices = indices[:indices.index(self._end_index)]
            predicted_tokens = [
                self.vocab.get_token_from_index(
                    x, namespace=self._target_namespace) for x in indices
            ]
            all_predicted_tokens.append(predicted_tokens)
        output_dict["predicted_tokens"] = all_predicted_tokens
        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        accs = {
            metric_name: metric.get_metric(reset)
            for metric_name, metric in self.metrics.items()
        }
        metric_dict = self.span_metric.get_metric(reset=reset)
        f1 = {x: y for x, y in metric_dict.items() if "overall" in x}
        return {**f1, **accs}

    @classmethod
    def from_params(cls, vocab, params: Params) -> 'SimpleSeq2Seq':
        source_embedder_params = params.pop("source_embedder")
        source_embedder = TextFieldEmbedder.from_params(
            vocab, source_embedder_params)
        encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
        max_decoding_steps = params.pop("max_decoding_steps")
        target_namespace = params.pop("target_namespace", "target_tags")
        # If no attention function is specified, we should not use attention, not attention with
        # default similarity function.
        attention_function_type = params.pop("attention_function", None)
        if attention_function_type is not None:
            attention_function = SimilarityFunction.from_params(
                attention_function_type)
        else:
            attention_function = None
        scheduled_sampling_ratio = params.pop_float("scheduled_sampling_ratio",
                                                    0.0)
        initializer = InitializerApplicator.from_params(
            params.pop('initializer', []))
        regularizer = RegularizerApplicator.from_params(
            params.pop('regularizer', []))
        return cls(vocab,
                   source_embedder=source_embedder,
                   encoder=encoder,
                   max_decoding_steps=max_decoding_steps,
                   target_namespace=target_namespace,
                   attention_function=attention_function,
                   scheduled_sampling_ratio=scheduled_sampling_ratio,
                   initializer=initializer,
                   regularizer=regularizer)
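
Both this example and Example 24 end their constructors by setting a slice of every LSTM bias vector to 1.0. PyTorch packs the four gate biases in the order input, forget, cell, output, so `b[l // 4 : l // 2]` is exactly the forget-gate block; initializing it to 1 is the usual trick for encouraging the LSTM to retain information early in training. A stand-alone version of the same initialization:

import torch
from torch.nn import LSTMCell

hidden_dim = 8
cell = LSTMCell(4, hidden_dim)

# Each bias vector has 4 * hidden_dim entries, ordered input/forget/cell/output.
for name, param in cell.named_parameters():
    if "bias_" in name:
        with torch.no_grad():
            n = param.numel()
            param[n // 4 : n // 2] = 1.0   # forget-gate slice

print(cell.bias_ih[hidden_dim:2 * hidden_dim])  # all ones
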
Esempio n. 23
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,
        encoder: Seq2VecEncoder,
        kg_encoder: Seq2VecEncoder,
        max_decoding_steps: int = 64,
        attention: Attention = None,
        target_namespace: str = "tokens",
        scheduled_sampling_ratio: float = 0.4,
    ) -> None:
        super().__init__(vocab)
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio  # Maybe we can try
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self.pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                    self._target_namespace)
        self.hidden_dim = 300
        self._max_decoding_steps = max_decoding_steps
        self.kd_metric = KD_Metric()
        self.bleu_aver = NLTK_BLEU(ngram_weights=(0.25, 0.25, 0.25, 0.25))
        self.bleu1 = NLTK_BLEU(ngram_weights=(1, 0, 0, 0))
        self.bleu2 = NLTK_BLEU(ngram_weights=(0, 1, 0, 0))
        self.bleu4 = NLTK_BLEU(ngram_weights=(0, 0, 0, 1))
        self.topic_acc = Average()
        self.distinct1 = Distinct1()
        self.distinct2 = Distinct2()
        # anything about module
        self._source_embedder = source_embedder
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        target_embedding_dim = source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        self._encoder = encoder
        self._kg_encoder = kg_encoder
        self._encoder_output_dim = self._encoder.get_output_dim()
        self._decoder_output_dim = self._encoder_output_dim
        # self.select_entity_num = 3
        self._decoder_input_dim = self.hidden_dim * 2 + total_entiy  #self.select_entity_num
        self._attention = None
        if attention:
            self._attention = attention
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim

        self._decoder_cell = LSTMCell(self.hidden_dim * 2,
                                      self._decoder_output_dim)
        self._output_projection_layer = Linear(self.hidden_dim, num_classes)
        # with open('cy/comp_topic2num.pk', 'rb') as f:
        with open('fd/word2idx.pk', 'rb') as f:
            self.word_idx = pickle.load(f)
        self.vocab_to_idx = {}
        self.idx_to_vocab_list = []
        for word, k in self.word_idx.items():
            self.vocab_to_idx[vocab.get_token_index(word.strip())] = k
            self.idx_to_vocab_list.append(vocab.get_token_index(word.strip()))
        self.entity_size = total_entiy
        self.entity_embedding = torch.nn.Parameter(
            torch.Tensor(self.entity_size, self.hidden_dim))
        torch.nn.init.xavier_uniform_(self.entity_embedding, gain=1.414)
        self.entity_linear = Linear(self.hidden_dim * 2, self.entity_size)
        self.gen_linear = Linear(self.hidden_dim, 1)
        self.clac_num = 0
Esempio n. 24
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "target_tags",
                 target_embedding_dim: int = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SimpleSeq2Seq, self).__init__(vocab, regularizer)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim()
        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim(
        )
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        if self._attention_function:
            self._decoder_attention = Attention(self._attention_function)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time step.
            self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
        else:
            self._decoder_input_dim = target_embedding_dim
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)
        # self._decoder_cell = GRUCell(self._decoder_input_dim, self._decoder_output_dim, bias=False)
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self.span_metric = SpanBasedF1Measure(
            vocab,
            tag_namespace=target_namespace,
            ignore_classes=[START_SYMBOL[2:], END_SYMBOL[2:]])
        initializer(self)

        # Initialize the forget-gate biases to 1.0. PyTorch packs recurrent biases as
        # [input, forget, cell, output] gates, so the second quarter of each bias vector
        # is the forget gate.
        encoder_parameters = self._encoder.state_dict()
        for pname in encoder_parameters:
            if 'bias_' in pname:
                print(pname)
                b = encoder_parameters[pname]
                l = len(b)
                b[l // 4:l // 2] = 1.0
        decoder_parameters = self._decoder_cell.state_dict()
        for pname in decoder_parameters:
            if 'bias_' in pname:
                print(pname)
                b = decoder_parameters[pname]
                l = len(b)
                b[l // 4:l // 2] = 1.0
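
The bias loop at the end of the constructor above works because PyTorch concatenates recurrent biases gate by gate as [input, forget, cell, output]; slicing from l // 4 to l // 2 therefore selects exactly the forget-gate portion, and setting it to 1.0 is a common trick for easing gradient flow early in training. A minimal sketch of the same initialization on a bare LSTMCell (the dimensions are illustrative):

import torch
from torch.nn import LSTMCell

cell = LSTMCell(input_size=100, hidden_size=200)
for name, param in cell.named_parameters():
    if "bias" in name:                        # matches bias_ih and bias_hh
        n = param.size(0)                     # 4 * hidden_size
        with torch.no_grad():
            param[n // 4:n // 2].fill_(1.0)   # forget-gate slice set to 1.0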
Example 25
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
                 attention: Attention = None,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 scheduled_sampling_ratio: float = 0.0,
                 use_bleu: bool = True,
                 bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25,
                                                        0.25),
                 target_pretrain_file: str = None,
                 target_decoder_layers: int = 1,
                 **kwargs) -> None:
        super().__init__(vocab)
        self._target_namespace = target_namespace
        self._target_decoder_layers = target_decoder_layers
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                   self._target_namespace)
            self._bleu = BLEU(
                bleu_ngram_weights,
                exclude_indices={
                    pad_index, self._end_index, self._start_index
                },
            )
        else:
            self._bleu = None

        # At prediction time, we'll use a beam search to find the best target sequence.
        # For backwards compatibility, check whether beam_size or max_decoding_steps were passed
        # in as kwargs. If so, fold them into the BeamSearch arguments before constructing it and
        # issue a DeprecationWarning.
        deprecation_warning = (
            "The parameter {} has been deprecated."
            " Provide this parameter as argument to beam_search instead.")
        beam_search_extras = {}
        if "beam_size" in kwargs:
            beam_search_extras["beam_size"] = kwargs["beam_size"]
            warnings.warn(deprecation_warning.format("beam_size"),
                          DeprecationWarning)
        if "max_decoding_steps" in kwargs:
            beam_search_extras["max_steps"] = kwargs["max_decoding_steps"]
            warnings.warn(deprecation_warning.format("max_decoding_steps"),
                          DeprecationWarning)
        self._beam_search = beam_search.construct(end_index=self._end_index,
                                                  vocab=self.vocab,
                                                  **beam_search_extras)

        # Dense embedding of source vocab tokens.
        self._source_embedder = source_embedder

        # Encodes the sequence of source embeddings into a sequence of hidden states.
        self._encoder = encoder

        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        # Attention mechanism applied to the encoder output for each step.
        self._attention = attention

        # Dense embedding of vocab words in the target space.
        target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
        if not target_pretrain_file:
            self._target_embedder = Embedding(
                num_embeddings=num_classes, embedding_dim=target_embedding_dim)
        else:
            self._target_embedder = Embedding(
                embedding_dim=target_embedding_dim,
                pretrained_file=target_pretrain_file,
                vocab_namespace=self._target_namespace,
                vocab=self.vocab,
            )

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        self._encoder_output_dim = self._encoder.get_output_dim()
        self._decoder_output_dim = self._encoder_output_dim

        if self._attention:
            # If using attention, a weighted average over encoder outputs will be concatenated
            # to the previous target embedding to form the input to the decoder at each
            # time step.
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
        else:
            # Otherwise, the input to the decoder is just the previous target embedding.
            self._decoder_input_dim = target_embedding_dim

        # We'll use an LSTM cell as the recurrent cell that produces a hidden state
        # for the decoder at each time step.
        # TODO (pradeep): Do not hardcode decoder cell type.
        if self._target_decoder_layers > 1:
            self._decoder_cell = LSTM(
                self._decoder_input_dim,
                self._decoder_output_dim,
                self._target_decoder_layers,
            )
        else:
            self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                          self._decoder_output_dim)

        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
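
Example 25 keeps backward compatibility by intercepting the deprecated beam_size and max_decoding_steps keyword arguments, warning the caller, and folding them into the lazily constructed BeamSearch. The same pattern in isolation, with nothing AllenNLP-specific (the function name and the returned configuration dict are only illustrative):

import warnings

def build_search_options(**kwargs):
    """Map deprecated kwargs onto the newer beam-search configuration."""
    deprecation_warning = ("The parameter {} has been deprecated."
                           " Provide this parameter as argument to beam_search instead.")
    extras = {}
    if "beam_size" in kwargs:
        extras["beam_size"] = kwargs["beam_size"]
        warnings.warn(deprecation_warning.format("beam_size"), DeprecationWarning)
    if "max_decoding_steps" in kwargs:
        extras["max_steps"] = kwargs["max_decoding_steps"]
        warnings.warn(deprecation_warning.format("max_decoding_steps"), DeprecationWarning)
    return extras

# e.g. build_search_options(beam_size=5) -> {'beam_size': 5}, plus a DeprecationWarning.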
Example 26
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        max_decoding_steps: int,
        attention: Attention = None,
        beam_size: int = None,
        target_namespace: str = "tokens",
        target_embedding_dim: int = None,
        scheduled_sampling_ratio: float = 0.0,
        use_bleu: bool = True,
        bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25, 0.25),
        target_pretrain_file: str = None,
        target_decoder_layers: int = 1,
    ) -> None:
        super().__init__(vocab)
        self._target_namespace = target_namespace
        self._target_decoder_layers = target_decoder_layers
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                   self._target_namespace)
            self._bleu = BLEU(
                bleu_ngram_weights,
                exclude_indices={
                    pad_index, self._end_index, self._start_index
                },
            )
        else:
            self._bleu = None

        # At prediction time, we use a beam search to find the most likely sequence of target tokens.
        beam_size = beam_size or 1
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        # Dense embedding of source vocab tokens.
        self._source_embedder = source_embedder

        # Encodes the sequence of source embeddings into a sequence of hidden states.
        self._encoder = encoder

        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        # Attention mechanism applied to the encoder output for each step.
        self._attention = attention

        # Dense embedding of vocab words in the target space.
        target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
        if not target_pretrain_file:
            self._target_embedder = Embedding(
                num_embeddings=num_classes, embedding_dim=target_embedding_dim)
        else:
            self._target_embedder = Embedding(
                embedding_dim=target_embedding_dim,
                pretrained_file=target_pretrain_file,
                vocab_namespace=self._target_namespace,
                vocab=self.vocab,
            )

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        self._encoder_output_dim = self._encoder.get_output_dim()
        self._decoder_output_dim = self._encoder_output_dim

        if self._attention:
            # If using attention, a weighted average over encoder outputs will be concatenated
            # to the previous target embedding to form the input to the decoder at each
            # time step.
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
        else:
            # Otherwise, the input to the decoder is just the previous target embedding.
            self._decoder_input_dim = target_embedding_dim

        # We'll use an LSTM cell as the recurrent cell that produces a hidden state
        # for the decoder at each time step.
        # TODO (pradeep): Do not hardcode decoder cell type.
        if self._target_decoder_layers > 1:
            self._decoder_cell = LSTM(
                self._decoder_input_dim,
                self._decoder_output_dim,
                self._target_decoder_layers,
            )
        else:
            self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                          self._decoder_output_dim)

        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
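
In this example (and the previous one), the decoder input dimension is target_embedding_dim alone without attention, or decoder_output_dim + target_embedding_dim with attention, because the attended context vector is concatenated to the previous target embedding at every step. A minimal sketch of one such decoder step in plain PyTorch; the tensors and the simple dot-product attention are illustrative stand-ins, not the original Attention module:

import torch
from torch.nn import LSTMCell

batch, src_len = 4, 7
encoder_dim, embed_dim = 200, 100

encoder_outputs = torch.randn(batch, src_len, encoder_dim)
prev_target_embedding = torch.randn(batch, embed_dim)
hidden = torch.zeros(batch, encoder_dim)       # decoder_output_dim == encoder_output_dim
cell_state = torch.zeros(batch, encoder_dim)

# Dot-product attention over the encoder outputs, using the current decoder hidden state.
scores = torch.bmm(encoder_outputs, hidden.unsqueeze(-1)).squeeze(-1)    # (batch, src_len)
weights = torch.softmax(scores, dim=-1)
attended = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)   # (batch, encoder_dim)

# With attention: decoder input is [attended context ; previous target embedding].
decoder_input = torch.cat([attended, prev_target_embedding], dim=-1)
decoder_cell = LSTMCell(encoder_dim + embed_dim, encoder_dim)
hidden, cell_state = decoder_cell(decoder_input, (hidden, cell_state))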
Example 27
    def __init__(
        self,
        vocab: Vocabulary,
        attention: Attention,
        beam_size: int,
        max_decoding_steps: int,
        target_embedding_dim: int = 30,
        copy_token: str = "@COPY@",
        source_namespace: str = "bert",
        target_namespace: str = "target_tokens",
        tensor_based_metric: Metric = None,
        token_based_metric: Metric = None,
        initializer: InitializerApplicator = InitializerApplicator(),
    ) -> None:
        super().__init__(vocab)
        self._source_namespace = source_namespace
        self._target_namespace = target_namespace
        self._src_start_index = self.vocab.get_token_index(
            START_SYMBOL, self._source_namespace)
        self._src_end_index = self.vocab.get_token_index(
            END_SYMBOL, self._source_namespace)
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self._oov_index = self.vocab.get_token_index(self.vocab._oov_token,
                                                     self._target_namespace)
        self._pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                     self._target_namespace)
        self._copy_index = self.vocab.add_token_to_namespace(
            copy_token, self._target_namespace)

        self._tensor_based_metric = tensor_based_metric or BLEU(
            exclude_indices={
                self._pad_index, self._end_index, self._start_index
            })
        self._token_based_metric = token_based_metric

        self._target_vocab_size = self.vocab.get_vocab_size(
            self._target_namespace)

        # Encoding modules.
        bert_token_embedding = PretrainedBertEmbedder('bert-base-uncased',
                                                      requires_grad=True)

        self._source_embedder = bert_token_embedding
        self._encoder = PassThroughEncoder(
            input_dim=self._source_embedder.get_output_dim())

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        # We arbitrarily set the decoder's input dimension to be the same as the output dimension.
        self.encoder_output_dim = self._encoder.get_output_dim()
        self.decoder_output_dim = self.encoder_output_dim
        self.decoder_input_dim = self.decoder_output_dim

        target_vocab_size = self.vocab.get_vocab_size(self._target_namespace)

        # The decoder input will be a function of the embedding of the previous predicted token,
        # an attended encoder hidden state called the "attentive read", and another
        # weighted sum of the encoder hidden state called the "selective read".
        # While the weights for the attentive read are calculated by an `Attention` module,
        # the weights for the selective read are simply the predicted probabilities
        # corresponding to each token in the source sentence that matches the target
        # token from the previous timestep.
        self._target_embedder = Embedding(target_vocab_size,
                                          target_embedding_dim)
        self._attention = attention
        self._input_projection_layer = Linear(
            target_embedding_dim + self.encoder_output_dim * 2,
            self.decoder_input_dim)

        # We then run the projected decoder input through an LSTM cell to produce
        # the next hidden state.
        self._decoder_cell = LSTMCell(self.decoder_input_dim,
                                      self.decoder_output_dim)

        # We create a "generation" score for each token in the target vocab
        # with a linear projection of the decoder hidden state.
        self._output_generation_layer = Linear(self.decoder_output_dim,
                                               target_vocab_size)

        # We create a "copying" score for each source token by applying a non-linearity
        # (tanh) to a linear projection of the encoded hidden state for that token,
        # and then taking the dot product of the result with the decoder hidden state.
        self._output_copying_layer = Linear(self.encoder_output_dim,
                                            self.decoder_output_dim)

        # At prediction time, we'll use a beam search to find the best target sequence.
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        initializer(self)
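
As the comments in the example above describe, the generation score is a plain projection of the decoder hidden state onto the target vocabulary, while the copying score projects each encoded source token, applies tanh, and takes the dot product with the decoder hidden state. A standalone sketch of those two score computations; all sizes are illustrative:

import torch
from torch.nn import Linear

batch, src_len, hidden_dim, vocab_size = 4, 9, 768, 1000

encoder_outputs = torch.randn(batch, src_len, hidden_dim)
decoder_hidden = torch.randn(batch, hidden_dim)

output_generation_layer = Linear(hidden_dim, vocab_size)
output_copying_layer = Linear(hidden_dim, hidden_dim)

# Generation score: one logit per target-vocabulary token.
generation_scores = output_generation_layer(decoder_hidden)                  # (batch, vocab_size)

# Copying score: tanh of a projection of each source token, dotted with the decoder state.
copy_projection = torch.tanh(output_copying_layer(encoder_outputs))          # (batch, src_len, hidden_dim)
copy_scores = copy_projection.bmm(decoder_hidden.unsqueeze(-1)).squeeze(-1)  # (batch, src_len)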
Example 28
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 attention: Attention,
                 max_decoding_steps: int,
                 beam_size: int = None,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 scheduled_sampling_ratio: float = 0.,
                 projection_dim: int = None,
                 use_coverage: bool = False,
                 coverage_loss_weight: float = None) -> None:
        super(PointerGeneratorNetwork, self).__init__(vocab)

        self._target_namespace = target_namespace
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self._source_unk_index = self.vocab.get_token_index(DEFAULT_OOV_TOKEN)
        self._target_unk_index = self.vocab.get_token_index(
            DEFAULT_OOV_TOKEN, self._target_namespace)
        self._source_vocab_size = self.vocab.get_vocab_size()
        self._target_vocab_size = self.vocab.get_vocab_size(
            self._target_namespace)

        # Encoder
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._encoder_output_dim = self._encoder.get_output_dim()

        # Decoder
        self._target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
        self._num_classes = self.vocab.get_vocab_size(self._target_namespace)
        self._target_embedder = Embedding(self._num_classes,
                                          self._target_embedding_dim)
        self._decoder_input_dim = self._encoder_output_dim + self._target_embedding_dim
        self._decoder_output_dim = self._encoder_output_dim
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)
        self._projection_dim = projection_dim or self._source_embedder.get_output_dim()
        self._hidden_projection_layer = Linear(self._decoder_output_dim,
                                               self._projection_dim)
        self._output_projection_layer = Linear(self._projection_dim,
                                               self._num_classes)
        self._p_gen_layer = Linear(
            self._decoder_output_dim * 3 + self._decoder_input_dim, 1)
        self._attention = attention
        self._use_coverage = use_coverage
        self._coverage_loss_weight = coverage_loss_weight
        self._eps = 1e-31

        # Decoding
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size or 1)