Example #1
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.

        Args:
          targets: target values for the output sequence.
            int tensor with shape [batch_size, target_length]
          encoder_outputs: continuous representation of input sequence.
            float tensor with shape [batch_size, input_length, hidden_size]
          attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

        Returns:
          float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            decoder_inputs = self.embedding_softmax_layer(targets)
            with tf.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(length, self.params["hidden_size"])
            if self.train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, 1 - self.params["layer_postprocess_dropout"])

            # Run values
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(length)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs, decoder_self_attention_bias, attention_bias)
            logits = self.embedding_softmax_layer.linear(outputs)
            return logits
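
The shift_targets block above can be checked on a toy tensor. A minimal standalone sketch (toy values, not part of the example): padding one step of zeros at the front of the time axis and dropping the last step turns the embedded targets into right-shifted decoder inputs.

import tensorflow as tf

# Toy embedded targets: [batch_size=1, target_length=3, hidden_size=2].
toy_targets = tf.constant([[[1., 1.], [2., 2.], [3., 3.]]])
shifted = tf.pad(toy_targets, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
print(shifted.numpy())
# [[[0. 0.]
#   [1. 1.]
#   [2. 2.]]]  -> position t sees only targets before t.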
Example #2
    def _get_symbols_to_logits_fn(self, max_decode_length, training):
        """Returns a decoding function that calculates logits of the next tokens."""

        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1, self.params["hidden_size"])
        timing_signal = tf.cast(timing_signal, self.params["dtype"])
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length, dtype=self.params["dtype"])

        # TODO(b/139770046): Refactor code with better naming of i.
        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.

            Args:
              ids: Current decoded sequences. int tensor with shape [batch_size *
                beam_size, i + 1].
              i: Loop index.
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.

            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated IDs
            decoder_input = ids[:, -1:]

            # Preprocess decoder input by getting embeddings and adding timing signal.
            decoder_input = self.embedding_softmax_layer(decoder_input)

            if self.params["padded_decode"]:
                timing_signal_shape = timing_signal.shape.as_list()
                decoder_input += tf.slice(timing_signal, [i, 0],
                                          [1, timing_signal_shape[1]])

                bias_shape = decoder_self_attention_bias.shape.as_list()
                self_attention_bias = tf.slice(
                    decoder_self_attention_bias, [0, 0, i, 0],
                    [bias_shape[0], bias_shape[1], 1, bias_shape[3]])
            else:
                decoder_input += timing_signal[i:i + 1]

                self_attention_bias = decoder_self_attention_bias[
                    :, :, i:i + 1, :i + 1]

            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"),
                self_attention_bias,
                cache.get("encoder_decoder_attention_bias"),
                training=training,
                cache=cache,
                decode_loop_step=i if self.params["padded_decode"] else None)
            logits = self.embedding_softmax_layer(decoder_outputs,
                                                  mode="linear")
            logits = tf.squeeze(logits, axis=[1])
            return logits, cache

        return symbols_to_logits_fn
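
A side note on the padded_decode branch above, as a minimal sketch (illustrative only, not the model's code): when the decode loop is compiled for padded decoding, the loop index i is a tensor, so the ordinary Python slicing of the else branch is replaced by tf.slice with a static size of 1 along the step axis.

import tensorflow as tf

signal = tf.random.normal([10, 4])          # stand-in for the timing signal
i = tf.constant(3)                          # loop index as a tensor
row = tf.slice(signal, [i, 0], [1, 4])      # slice size is static: [1, 4]
print(row.shape)                            # (1, 4)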
Example #3
  def decode(self, _, inputs, encoder_outputs, attention_bias):
    """Generate logits for each value in the target sequence.

    Args:
      inputs:
        int tensor (old dst sentence) with shape [batch_size, input_length].
      encoder_outputs: continuous representation of diff sequence.
        float tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("decode"):
      # Prepare inputs to decoder layers by adding positional
      # encoding and applying dropout.
      decoder_inputs = self.embedding_softmax_layer(inputs)

      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(decoder_inputs)[1]
        decoder_inputs += model_utils.get_position_encoding(
            length, self.params["hidden_size"])
      if self.train:
        decoder_inputs = tf.nn.dropout(
            decoder_inputs, 1 - self.params["layer_postprocess_dropout"])
      
      # Run values
      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
          length)
      outputs = self.decoder_stack(
          decoder_inputs, encoder_outputs, decoder_self_attention_bias,
          attention_bias)
      logits = self.embedding_softmax_layer.linear(outputs)
      return logits
Example #4
    def decode(self, start_tokens, targets, encoder_outputs, attention_bias):
        with tf.name_scope("decode"):
            with tf.name_scope("shift_targets"):
                decoder_inputs = tf.concat(
                    [tf.expand_dims(start_tokens, axis=1), targets[:, :-1]],
                    axis=1)
            decoder_inputs = self.decoder_embedding_layer(decoder_inputs)
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params["hidden_size"])
            if self.train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    1 - self.params["layer_postprocess_dropout"])

            # Run values
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            decoder_outputs = self.decoder_stack(decoder_inputs,
                                                 encoder_outputs,
                                                 decoder_self_attention_bias,
                                                 attention_bias)
            outputs = self.output_embedding_layer(decoder_outputs)
            return outputs
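
A minimal standalone sketch (toy ids; the start-token value is hypothetical) of the shift used above: prepend a start token and drop the last target, so the decoder at position t only sees targets before t.

import tensorflow as tf

start_tokens = tf.constant([0, 0])                       # hypothetical <s> id per batch element
targets = tf.constant([[11, 12, 13], [21, 22, 23]])
decoder_inputs = tf.concat(
    [tf.expand_dims(start_tokens, axis=1), targets[:, :-1]], axis=1)
print(decoder_inputs.numpy())                            # [[ 0 11 12], [ 0 21 22]]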
    def test_get_decoder_self_attention_bias(self):
        length = 5
        bias = model_utils.get_decoder_self_attention_bias(length)
        with self.test_session() as sess:
            bias = sess.run(bias)

        self.assertAllEqual(
            [[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
               [0, 0, NEG_INF, NEG_INF, NEG_INF], [0, 0, 0, NEG_INF, NEG_INF],
               [0, 0, 0, 0, NEG_INF], [0, 0, 0, 0, 0]]]], bias)
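
The matrix asserted in the test above is a standard causal mask. Below is a minimal sketch of how such a bias can be built; this is an assumption about what model_utils.get_decoder_self_attention_bias computes, not its actual implementation.

import tensorflow as tf

NEG_INF = -1e9  # hypothetical stand-in for the test's NEG_INF constant

def causal_attention_bias(length, neg_inf=NEG_INF):
    # 1.0 on and below the diagonal (attention allowed), 0.0 above it.
    allowed = tf.linalg.band_part(tf.ones([length, length]), -1, 0)
    bias = neg_inf * (1.0 - allowed)          # 0 where allowed, NEG_INF where masked
    return tf.reshape(bias, [1, 1, length, length])

print(causal_attention_bias(5)[0, 0].numpy())  # matches the matrix asserted above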
    def _get_symbols_to_logits_fn(self, max_decode_length):
        """Returns a decoding function that calculates logits of the next tokens."""
        # Returns a decoding function that computes the logits of the next token.

        # Timing signal, shape [length, hidden_size].
        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1, self.params["hidden_size"])
        # Causal self-attention bias, shape [1, 1, length, length].
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length)

        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.
            Given the ids of the tokens predicted so far, this function uses the
            decoder together with the stored encoder information to predict the
            next token. ids holds the tokens predicted so far; i is the position
            currently being predicted. The cache exists because decoding runs once
            during training but many times during inference (one token at a time),
            and every inference step reuses the same encoder outputs, so they are
            stored up front.

            Args:
              ids: Current decoded sequences.
                int tensor with shape [batch_size * beam_size, i + 1]. Ignoring
                batch_size, these are not the ids of a whole sentence but the ids
                of the candidate tokens from the start up to position i.
              i: Loop index
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.

            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated IDs, giving shape
            # [batch_size * beam_size, 1].
            decoder_input = ids[:, -1:]

            # Preprocess decoder input by getting embeddings and adding timing signal.
            # The embedding has shape [batch_size * beam_size, 1, hidden_size]: during
            # inference the decoder input is just the most recently predicted token,
            # treated as a length-1 sequence.
            decoder_input = self.embedding_softmax_layer(decoder_input)
            decoder_input += timing_signal[i:i + 1]  # timing signal for position i

            # Causal self-attention bias for this step, shape [1, 1, 1, i + 1].
            self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
            # Run the decoder; the output has the same shape as decoder_input,
            # i.e. [batch_size * beam_size, 1, hidden_size].
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            # Project from [batch_size * beam_size, 1, hidden_size] to
            # [batch_size * beam_size, 1, vocab_size].
            logits = self.embedding_softmax_layer.linear(decoder_outputs)
            # Drop the length-1 middle dimension:
            # [batch_size * beam_size, 1, vocab_size] -> [batch_size * beam_size, vocab_size].
            logits = tf.squeeze(logits, axis=[1])
            return logits, cache

        return symbols_to_logits_fn
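
A minimal standalone sketch (stand-in tensors; shapes taken from the comments above) of the per-step slicing used during inference: at loop index i the decoder consumes one token, the i-th row of the position encoding, and a [1, 1, 1, i + 1] slice of the causal bias.

import tensorflow as tf

length, hidden_size = 6, 8
timing_signal = tf.random.normal([length + 1, hidden_size])      # stand-in for get_position_encoding
decoder_self_attention_bias = tf.zeros([1, 1, length, length])   # stand-in for the causal bias

i = 3
step_signal = timing_signal[i:i + 1]                             # [1, hidden_size]
step_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]   # [1, 1, 1, i + 1]
print(step_signal.shape, step_bias.shape)                        # (1, 8) (1, 1, 1, 4)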
    def _get_symbols_to_logits_fn(self, max_decode_length):
        """Returns a decoding function that calculates logits of the next tokens."""

        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1, self.params["hidden_size"])
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length)

        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.

            Args:
              ids: Current decoded sequences.
                int tensor with shape [batch_size * beam_size, i + 1]
              i: Loop index
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.

            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated IDs
            decoder_input = ids[:, -1:]

            ### domyounglee 2020.2.12
            cls_dec_bias = model_utils.get_cls_dec_attention_bias(
                tf.cast(tf.equal(decoder_input, 2), tf.int64))
            #self.cls_attention_bias=None
            # Preprocess decoder input by getting embeddings and adding timing signal.
            decoder_input = self.embedding_softmax_layer(decoder_input)
            decoder_input += timing_signal[i:i + 1]

            self_attention_bias = decoder_self_attention_bias[:, :,
                                                              i:i + 1, :i + 1]
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"),
                self_attention_bias,
                cache.get("encoder_decoder_attention_bias"),
                cls_attention_bias=None,
                cls_dec_attention_bias=None,
                identity_mask=None,
                cache=cache)
            logits = self.embedding_softmax_layer.linear(decoder_outputs)
            logits = tf.squeeze(logits, axis=[1])
            return logits, cache

        return symbols_to_logits_fn
    def decode(self, targets, encoder_outputs, attention_bias, training):
        """Generate logits for each value in the target sequence.

        Args:
          targets: target values for the output sequence. int tensor with shape
            [batch_size, target_length]
          encoder_outputs: continuous representation of input sequence. float tensor
            with shape [batch_size, input_length, hidden_size]
          attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
          training: boolean, whether in training mode or not.

        Returns:
          float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            decoder_inputs = self.embedding_softmax_layer(targets)
            decoder_inputs = tf.cast(decoder_inputs, self.params["dtype"])
            attention_bias = tf.cast(attention_bias, self.params["dtype"])
            with tf.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                # Pad one step at the front of the time dimension and drop the last
                # token (EOS); evidently the targets carry no explicit BOS.
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                pos_encoding = model_utils.get_position_encoding(
                    length, self.params["hidden_size"])
                pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
                decoder_inputs += pos_encoding
            if training:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    rate=self.params["layer_postprocess_dropout"])

            # Run values
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length, dtype=self.params["dtype"])
            outputs = self.decoder_stack(decoder_inputs,
                                         encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias,
                                         training=training)
            logits = self.embedding_softmax_layer(outputs, mode="linear")
            logits = tf.cast(logits, tf.float32)
            return logits
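
One detail worth noting in the example above: tf.nn.dropout is called with rate= (the TF2 signature), while the earlier examples pass 1 - dropout as the second positional argument (the TF1 keep_prob). A minimal sketch of the TF2 call style, with a hypothetical dropout value:

import tensorflow as tf

layer_postprocess_dropout = 0.1                         # hypothetical value
x = tf.ones([2, 3])
y = tf.nn.dropout(x, rate=layer_postprocess_dropout)    # TF2: rate is the drop probability
# The equivalent TF1-style call seen in the earlier examples: tf.nn.dropout(x, 1 - rate).
print(y.shape)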
Example #9
    def predict(self, start_tokens, encoder_outputs,
                encoder_decoder_attention_bias):
        """Return predicted sequence."""
        with tf.name_scope('decode'):
            batch_size = tf.shape(encoder_outputs)[0]
            max_decode_length = self.params['sequence_length']
            timing_signal = model_utils.get_position_encoding(
                max_decode_length, self.params['hidden_size'])
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                max_decode_length)

            # Create cache storing decoder attention values for each layer.
            cache = {
                'layer_%d' % layer: {
                    'k': tf.zeros([batch_size, 0, self.params['hidden_size']]),
                    'v': tf.zeros([batch_size, 0, self.params['hidden_size']])
                }
                for layer in range(self.params['num_hidden_layers'])
            }

            # Add encoder output and attention bias to the cache.
            cache['encoder_outputs'] = encoder_outputs
            cache[
                'encoder_decoder_attention_bias'] = encoder_decoder_attention_bias

            # Forward decoder_inputs to decoder_stack max_decode_length times instead of applying beam search.
            decoder_outputs = tf.zeros(
                [batch_size, 0, self.params['output_size']])
            decoder_inputs = tf.expand_dims(start_tokens, axis=1)
            for i in range(max_decode_length):
                decoder_inputs = self.decoder_embedding_layer(decoder_inputs)
                decoder_inputs += timing_signal[i:i + 1]
                self_attention_bias = decoder_self_attention_bias[
                    :, :, i:i + 1, :i + 1]
                decoder_inputs = self.decoder_stack(
                    decoder_inputs, cache.get('encoder_outputs'),
                    self_attention_bias,
                    cache.get('encoder_decoder_attention_bias'), cache)
                decoder_inputs = self.output_embedding_layer(decoder_inputs)
                decoder_outputs = tf.concat([decoder_outputs, decoder_inputs],
                                            axis=1)
        return decoder_outputs
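
A minimal sketch (illustrative only, not the model's code) of the cache pattern initialised above: per-layer keys and values start with a zero-length time axis and grow by one step per decoded position, which is what lets each step attend to everything decoded so far.

import tensorflow as tf

batch_size, hidden_size = 2, 4
k = tf.zeros([batch_size, 0, hidden_size])                   # empty cache, as in the 'k'/'v' entries above
for step in range(3):
    new_k = tf.random.normal([batch_size, 1, hidden_size])   # this step's key
    k = tf.concat([k, new_k], axis=1)
print(k.shape)                                               # (2, 3, 4)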
  def _get_symbols_to_logits_fn(self, max_decode_length):
    """Returns a decoding function that calculates logits of the next tokens."""

    timing_signal = model_utils.get_position_encoding(
        max_decode_length + 1, self.params.hidden_size)
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length)

    def symbols_to_logits_fn(ids, i, cache):
      """Generate logits for next potential IDs.

      Args:
        ids: Current decoded sequences.
          int tensor with shape [batch_size * beam_size, i + 1]
        i: Loop index
        cache: dictionary of values storing the encoder output, encoder-decoder
          attention bias, and previous decoder attention values.

      Returns:
        Tuple of
          (logits with shape [batch_size * beam_size, vocab_size],
           updated cache values)
      """
      # Set decoder input to the last generated IDs
      decoder_input = ids[:, -1:]

      # Preprocess decoder input by getting embeddings and adding timing signal.
      decoder_input = self.embedding_softmax_layer(decoder_input)
      decoder_input += timing_signal[i:i + 1]

      self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
      decoder_outputs = self.decoder_stack(
          decoder_input, cache.get("encoder_outputs"), self_attention_bias,
          cache.get("encoder_decoder_attention_bias"), cache)
      logits = self.embedding_softmax_layer.linear(decoder_outputs)
      logits = tf.squeeze(logits, axis=[1])
      return logits, cache
    return symbols_to_logits_fn
  def decode(self, targets, encoder_outputs, attention_bias):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence.
        int tensor with shape [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence.
        float tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("decode"):
      # Prepare inputs to decoder layers by shifting targets, adding positional
      # encoding and applying dropout.
      decoder_inputs = self.embedding_softmax_layer(targets)
      with tf.name_scope("shift_targets"):
        # Shift targets to the right, and remove the last element
        decoder_inputs = tf.pad(
            decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(decoder_inputs)[1]
        decoder_inputs += model_utils.get_position_encoding(
            length, self.params.hidden_size)
      if self.train:
        decoder_inputs = tf.nn.dropout(
            decoder_inputs, 1 - self.params.layer_postprocess_dropout)

      # Run values
      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
          length)
      outputs = self.decoder_stack(
          decoder_inputs, encoder_outputs, decoder_self_attention_bias,
          attention_bias)
      logits = self.embedding_softmax_layer.linear(outputs)
      return logits
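
All of the examples above rely on model_utils.get_position_encoding. Below is a minimal sketch of the standard sinusoidal position encoding; it is an assumption about that helper's behaviour, not necessarily its exact implementation.

import numpy as np

def position_encoding(length, hidden_size, min_timescale=1.0, max_timescale=1.0e4):
    # Standard Transformer sinusoids: sin on the first half of the channels,
    # cos on the second half, with geometrically spaced timescales.
    position = np.arange(length, dtype=np.float32)
    num_timescales = hidden_size // 2
    log_timescale_increment = (
        np.log(max_timescale / min_timescale) / max(num_timescales - 1, 1))
    inv_timescales = min_timescale * np.exp(
        np.arange(num_timescales, dtype=np.float32) * -log_timescale_increment)
    scaled_time = position[:, None] * inv_timescales[None, :]
    return np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)

print(position_encoding(4, 8).shape)  # (4, 8), i.e. [length, hidden_size]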