Example #1
    def decode(self, targets, encoder_outputs, attention_bias, training):
        """Generate logits for each value in the target sequence
        targets: [batch_size, target_length]
        encoder_outputs: [batch_size, input_length, hidden_size]
        attention_bias: [batch_size, 1, 1, input_length]
        return: [batch_size, target_length, vocab_size]
        """
        with tf.name_scope('decode'):
            decoder_inputs = self.target_embedding_layer(targets)
            decoder_inputs = tf.cast(decoder_inputs, self.params['dtype'])
            attention_bias = tf.cast(attention_bias, self.params['dtype'])
            with tf.name_scope('shift_targets'):
                decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope('add_pos_encoding'):
                length = tf.shape(decoder_inputs)[1]
                pos_encoding = model_utils.get_position_encoding(
                    length, self.params['hidden_size'])
                pos_encoding = tf.cast(pos_encoding, self.params['dtype'])
                decoder_inputs += pos_encoding
            if training:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, rate=self.params['layer_postprocess_dropout'])

            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length, dtype=self.params['dtype'])
            outputs = self.decoder_stack(decoder_inputs,
                                         encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias,
                                         training=training)
            logits = self.target_embedding_layer(outputs, mode='linear')
            logits = tf.cast(logits, tf.float32)
            return logits
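The shift_targets block in the example above prepends a zero vector along the length axis and drops the last position, so the embedding fed at step t never contains the token the model has to predict at that step. A minimal, self-contained sketch of just that padding trick (toy shapes, nothing from the example assumed):

import tensorflow as tf

# Toy batch: 2 sequences, length 4, hidden size 3.
decoder_inputs = tf.reshape(tf.range(24, dtype=tf.float32), [2, 4, 3])

# Prepend a zero "start" embedding along the length axis, then drop the
# last position so the tensor keeps its original length.
shifted = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]

print(shifted[0, 0])  # all zeros: the first decoder step sees no target token
print(shifted[0, 1])  # equals decoder_inputs[0, 0]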
Example #2
    def _get_symbols_to_logits_fn(self, max_decode_length):
        """Returns a decoding function that calculates logits of the next tokens."""

        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1, self.params["hidden_size"])
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length)  # triangular matrix, shape (1, 1, length, length)

        def symbols_to_logits_fn(ids, i, cache):
            # Set decoder input to the last generated IDs
            if i == 0:
                decoder_input = tf.zeros(
                    [ids.shape[0], 1, self.params["hidden_size"]])
            else:
                decoder_input = ids[:, -1:]  # (batch, 1)
                decoder_input = self.embedding_softmax_layer_decoder(
                    decoder_input)  # (batch, 1, 256)

            decoder_input += timing_signal[i:i + 1]

            # In translation, this bias slice is all zeros (its length equals the current
            # decoded length i); it has no real effect, since adding it to the attention
            # logits leaves them unchanged.
            self_attention_bias = decoder_self_attention_bias[:, :,
                                                              i:i + 1, :i + 1]

            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            logits = self.embedding_softmax_layer_decoder.linear(
                decoder_outputs)
            return logits, cache

        return symbols_to_logits_fn
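As the comment above notes, the slice decoder_self_attention_bias[:, :, i:i + 1, :i + 1] taken at step i is a row of zeros, because row i of the causal bias only masks positions greater than i, and those are cut off by the :i + 1 slice. A small standalone check, using -1e9 as the masking constant like the rest of these examples:

import tensorflow as tf

length = 4
# Lower-triangular causal bias: 0 where attention is allowed, -1e9 elsewhere.
allowed = tf.linalg.band_part(tf.ones([length, length]), -1, 0)
bias = tf.reshape(-1e9 * (1.0 - allowed), [1, 1, length, length])

i = 2
print(bias[:, :, i:i + 1, :i + 1])  # [[[[0. 0. 0.]]]] -- nothing is masked at step i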
Example #3
    def _get_symbols_to_logits_fn(self, max_decode_length, training):
        """Returns a decoding function that calculates logits of the next tokens."""
        timing_signal = model_utils.get_position_encoding(max_decode_length + 1, self.params['hidden_size'])
        timing_signal = tf.cast(timing_signal, self.params['dtype'])
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length, dtype=self.params['dtype'])

        def symbols_to_logits(ids, i, cache):
            """Generate logits for next potential IDs.
            ids: [batch_size * beam_size, i + 1]
            i: Loop index
            return: [batch_size * beam_size, vocab_size]
            """
            decoder_input = ids[:, -1:]
            decoder_input = self.target_embedding_layer(decoder_input)
            decoder_input += timing_signal[i: i+1]
            self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get('encoder_outputs'),
                self_attention_bias,
                cache.get('encoder_decoder_attention_bias'),
                training=training,
                cache=cache)
            logits = self.target_embedding_layer(decoder_outputs, mode='linear')
            logits = tf.squeeze(logits, axis=[1])
            return logits, cache

        return symbols_to_logits
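The returned closure is meant to be driven by an external search loop (beam search in the Transformer this code comes from). Below is a minimal greedy driver, shown only as a sketch: the model object, its _get_symbols_to_logits_fn method, and a prepared cache (holding encoder_outputs, encoder_decoder_attention_bias, and per-layer attention state) are assumed to exist as in the example above.

import tensorflow as tf

def greedy_decode(model, cache, batch_size, max_decode_length, bos_id=0):
    # Hypothetical driver: feed the last generated ID back in until the
    # maximum decode length is reached.
    symbols_to_logits = model._get_symbols_to_logits_fn(max_decode_length,
                                                        training=False)
    ids = tf.fill([batch_size, 1], bos_id)
    for i in range(max_decode_length):
        logits, cache = symbols_to_logits(ids, i, cache)  # [batch, vocab_size]
        next_id = tf.argmax(logits, axis=-1, output_type=tf.int32)
        ids = tf.concat([ids, next_id[:, tf.newaxis]], axis=1)
    return ids[:, 1:]  # drop the BOS column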
Example #4
 def decoder_train(self, x, y):
     ## x: (batch_size, enc_len) , y: (batch_size, dec_len)
     dec_bias = model_utils.get_decoder_self_attention_bias(
         self.max_dec_len)
     attention_bias = model_utils.get_padding_bias(x)
     # Encoder
     encoder_emb_inp = self.build_embed(x, encoder=True, reuse=False)
     encoder_outputs = self.build_encoder(x,
                                          encoder_emb_inp,
                                          attention_bias,
                                          reuse=False)
     # Decoder
     batch_size = tf.shape(x)[0]
     start_tokens = tf.fill([batch_size, 1], self.bos_idx)  # 2: <s> ID
     target_slice_last_1 = tf.slice(y, [0, 0],
                                    [batch_size, self.max_dec_len - 1])
     decoder_inputs = tf.concat([start_tokens, target_slice_last_1],
                                axis=1)  ## shift to right
     decoder_emb_inp = self.build_embed(decoder_inputs,
                                        encoder=False,
                                        reuse=True)
     decoder_outputs = self.build_decoder(decoder_emb_inp,
                                          encoder_outputs,
                                          dec_bias,
                                          attention_bias,
                                          reuse=False)
     train_prob = self.build_output(decoder_outputs, reuse=False)
     return encoder_outputs, decoder_inputs, train_prob
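get_padding_bias is not shown in these snippets; judging from how its output is used (shape [batch_size, 1, 1, input_length], -1e9 at padded source positions, 0 elsewhere), a plausible reconstruction looks like the sketch below. Treat it as an assumption about the helper, not the repository's exact code; pad_id=0 is also assumed.

import tensorflow as tf

_NEG_INF = -1e9

def get_padding_bias(x, pad_id=0):
    # 1.0 where x is padding, 0.0 elsewhere.
    padding = tf.cast(tf.equal(x, pad_id), tf.float32)
    bias = padding * _NEG_INF
    # Broadcastable against attention logits of shape
    # [batch_size, num_heads, query_length, input_length].
    return bias[:, tf.newaxis, tf.newaxis, :]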
Example #5
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.
    Args:
      targets: target values for the output sequence.
        int tensor with shape [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence.
        float tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
        with tf.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            decoder_inputs = self.embedding_softmax_layer(targets)
            with tf.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params.hidden_size)
            if self.train:
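                # TF1-style dropout: the second positional argument is keep_prob,
                # hence 1 - layer_postprocess_dropout.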
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, 1 - self.params.layer_postprocess_dropout)

            # Run values
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            logits = self.embedding_softmax_layer.linear(outputs)
            return logits
Example #6
  def _get_symbols_to_logits_fn(self, max_decode_length, training):
    """Returns a decoding function that calculates logits of the next tokens."""

    timing_signal = model_utils.get_position_encoding(
        max_decode_length + 1, self.params["hidden_size"])
    timing_signal = tf.cast(timing_signal, self.params["dtype"])
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length, dtype=self.params["dtype"])

    # TODO(b/139770046): Refactor code with better naming of i.
    def symbols_to_logits_fn(ids, i, cache):
      """Generate logits for next potential IDs.

      Args:
        ids: Current decoded sequences. int tensor with shape [batch_size *
          beam_size, i + 1].
        i: Loop index.
        cache: dictionary of values storing the encoder output, encoder-decoder
          attention bias, and previous decoder attention values.

      Returns:
        Tuple of
          (logits with shape [batch_size * beam_size, vocab_size],
           updated cache values)
      """
      # Set decoder input to the last generated IDs
      decoder_input = ids[:, -1:]

      # Preprocess decoder input by getting embeddings and adding timing signal.
      decoder_input = self.embedding_softmax_layer(decoder_input)

      if self.params["padded_decode"]:
        timing_signal_shape = timing_signal.shape.as_list()
        decoder_input += tf.slice(timing_signal, [i, 0],
                                  [1, timing_signal_shape[1]])

        bias_shape = decoder_self_attention_bias.shape.as_list()
        self_attention_bias = tf.slice(
            decoder_self_attention_bias, [0, 0, i, 0],
            [bias_shape[0], bias_shape[1], 1, bias_shape[3]])
      else:
        decoder_input += timing_signal[i:i + 1]

        self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]

      decoder_outputs = self.decoder_stack(
          decoder_input,
          cache.get("encoder_outputs"),
          self_attention_bias,
          cache.get("encoder_decoder_attention_bias"),
          training=training,
          cache=cache,
          decode_loop_step=i if self.params["padded_decode"] else None)
      logits = self.embedding_softmax_layer(decoder_outputs, mode="linear")
      logits = tf.squeeze(logits, axis=[1])
      return logits, cache

    return symbols_to_logits_fn
Example #7
    def test_get_decoder_self_attention_bias(self):
        length = 5
        bias = model_utils.get_decoder_self_attention_bias(length)
        with self.test_session() as sess:
            bias = sess.run(bias)

        self.assertAllEqual(
            [[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
               [0, 0, NEG_INF, NEG_INF, NEG_INF], [0, 0, 0, NEG_INF, NEG_INF],
               [0, 0, 0, 0, NEG_INF], [0, 0, 0, 0, 0]]]], bias)
Example #8
  def test_get_decoder_self_attention_bias(self):
    length = 5
    bias = model_utils.get_decoder_self_attention_bias(length)

    self.assertAllEqual([[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
                           [0, 0, NEG_INF, NEG_INF, NEG_INF],
                           [0, 0, 0, NEG_INF, NEG_INF],
                           [0, 0, 0, 0, NEG_INF],
                           [0, 0, 0, 0, 0]]]],
                        bias)
Example #9
  def test_get_decoder_self_attention_bias(self):
    length = 5
    bias = model_utils.get_decoder_self_attention_bias(length)
    with self.test_session() as sess:
      bias = sess.run(bias)

    self.assertAllEqual([[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
                           [0, 0, NEG_INF, NEG_INF, NEG_INF],
                           [0, 0, 0, NEG_INF, NEG_INF],
                           [0, 0, 0, 0, NEG_INF],
                           [0, 0, 0, 0, 0]]]],
                        bias)
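The tests above pin down the expected output exactly: a [1, 1, length, length] tensor that is 0 on and below the diagonal and NEG_INF above it. A minimal implementation that satisfies them, assuming NEG_INF = -1e9 (the constant itself is not shown in these snippets):

import tensorflow as tf

NEG_INF = -1e9  # assumed value of the test constant

def get_decoder_self_attention_bias(length, dtype=tf.float32):
    # 1s on and below the diagonal: the positions each step may attend to.
    valid_locs = tf.linalg.band_part(tf.ones([length, length], dtype=dtype), -1, 0)
    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
    # 0 where attention is allowed, NEG_INF where it must be blocked.
    return NEG_INF * (1.0 - valid_locs)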
Example #10
  def decode(self, targets, encoder_outputs, attention_bias, training):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence. int tensor with shape
        [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence. float tensor
        with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
      training: boolean, whether in training mode or not.

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("decode"):
      # Prepare inputs to decoder layers by shifting targets, adding positional
      # encoding and applying dropout.
      decoder_inputs = self.embedding_softmax_layer(targets)
      decoder_inputs = tf.cast(decoder_inputs, self.params["dtype"])
      attention_bias = tf.cast(attention_bias, self.params["dtype"])
      with tf.name_scope("shift_targets"):
        # Shift targets to the right, and remove the last element
        decoder_inputs = tf.pad(decoder_inputs,
                                [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(decoder_inputs)[1]
        pos_encoding = model_utils.get_position_encoding(
            length, self.params["hidden_size"])
        pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
        decoder_inputs += pos_encoding
      if training:
        decoder_inputs = tf.nn.dropout(
            decoder_inputs, rate=self.params["layer_postprocess_dropout"])

      # Run values
      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
          length, dtype=self.params["dtype"])
      outputs = self.decoder_stack(
          decoder_inputs,
          encoder_outputs,
          decoder_self_attention_bias,
          attention_bias,
          training=training)
      logits = self.embedding_softmax_layer(outputs, mode="linear")
      logits = tf.cast(logits, tf.float32)
      return logits
Example #11
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.

        targets: target-language tokens. shape=[batch_size, target_length]. Used to compute the loss.
        encoder_outputs: encoder output that the decoder attends over. [batch_size, input_length, hidden_size]
        attention_bias: padded positions are marked with -1e9, all other positions with 0. shape=[batch_size, 1, 1, input_length]

        Returns shape = [batch_size, target_length, vocab_size]; the last dimension equals the vocabulary size.
        """
        with tf.name_scope("decode"):
            # After embedding: shape=(batch_size, length, embedding_dim)
            decoder_inputs = self.embedding_softmax_layer_decoder(targets)
            # print("decoder_inputs.shape =", decoder_inputs)

            # Prepend an all-zero vector at the first position along the length dimension; the shape is unchanged.
            with tf.name_scope("shift_targets"):
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            # print("&&", decoder_inputs[0, 0:2, :10])

            # Add the positional encoding
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params["hidden_size"])

            if self.train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    rate=self.params["layer_postprocess_dropout"])

            # shape=(1, 1, length, length): the main diagonal and lower triangle are 0, all other entries are a very large negative number.
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)

            # Decode. Two biases are passed in here:
            # decoder_self_attention_bias is a triangular matrix encoding the causal dependencies in self-attention
            # attention_bias marks padded positions of the encoder (source-language) input with -1e9. Input shape=(batch, length_decoder, dim)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)

            # This output layer shares its weights with the embedding layer. shape=(batch, length_decoder, vocab_size)
            logits = self.embedding_softmax_layer_decoder.linear(outputs)
            return logits
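get_position_encoding is the standard sinusoidal encoding from "Attention Is All You Need": a [length, hidden_size] float tensor whose first half of channels are sines and second half cosines at geometrically spaced frequencies. The sketch below shows that shape and behaviour; it is an approximation of the helper, not necessarily the repository's exact code, and assumes hidden_size >= 4.

import math
import tensorflow as tf

def get_position_encoding(length, hidden_size,
                          min_timescale=1.0, max_timescale=1.0e4):
    # Sinusoidal position encoding, shape [length, hidden_size].
    position = tf.cast(tf.range(length), tf.float32)
    num_timescales = hidden_size // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (float(num_timescales) - 1))
    inv_timescales = min_timescale * tf.exp(
        tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
    scaled_time = position[:, tf.newaxis] * inv_timescales[tf.newaxis, :]
    # First half of the channels are sines, second half cosines.
    return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)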
Example #12
    def decoder_infer(self, x):
        dec_bias = model_utils.get_decoder_self_attention_bias(
            self.max_dec_len)
        attention_bias = model_utils.get_padding_bias(x)
        # Encoder
        encoder_emb_inp = self.build_embed(x, encoder=True, reuse=True)
        encoder_outputs = self.build_encoder(x,
                                             encoder_emb_inp,
                                             attention_bias,
                                             reuse=True)
        # Decoder
        batch_size = tf.shape(x)[0]
        start_tokens = tf.fill([batch_size, 1], self.bos_idx)  # 2: <s> ID
        next_decoder_inputs = tf.concat([
            start_tokens,
            tf.zeros([batch_size, self.max_dec_len - 1], dtype=tf.int32)
        ],
                                        axis=1)  ## batch_size, dec_len
        # predict output with loop. [encoder_outputs, decoder_inputs (filled next token)]
        for i in range(1, self.max_dec_len):
            decoder_emb_inp = self.build_embed(next_decoder_inputs,
                                               encoder=False,
                                               reuse=True)
            decoder_outputs = self.build_decoder(decoder_emb_inp,
                                                 encoder_outputs,
                                                 dec_bias,
                                                 attention_bias,
                                                 reuse=True)
            logits = self.build_output(decoder_outputs, reuse=True)
            next_decoder_inputs = self._filled_next_token(
                next_decoder_inputs, logits, i)

        # slice start_token
        decoder_input_start_1 = tf.slice(next_decoder_inputs, [0, 1],
                                         [batch_size, self.max_dec_len - 1])
        output_token = tf.concat(
            [decoder_input_start_1,
             tf.zeros([batch_size, 1], dtype=tf.int32)],
            axis=1)
        return output_token
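_filled_next_token is not shown; from how it is called (running decoder inputs, full-sequence logits, loop index i), a plausible version takes the argmax prediction from position i - 1 and writes it into column i of the running decoder input. The helper below is a hypothetical sketch, not the repository's code.

import tensorflow as tf

def _filled_next_token(decoder_inputs, logits, decode_step):
    # Greedy pick for the token at position decode_step, predicted from the
    # decoder output at the previous position.
    next_token = tf.argmax(logits[:, decode_step - 1, :], axis=-1,
                           output_type=tf.int32)
    # Overwrite column decode_step, keep every other column unchanged.
    return tf.concat([decoder_inputs[:, :decode_step],
                      next_token[:, tf.newaxis],
                      decoder_inputs[:, decode_step + 1:]], axis=1)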
Example #13
 def predict(self, is_training):
     # initializer = tf.variance_scaling_initializer(
     #     self.params["initializer_gain"], mode="fan_avg", distribution="uniform")
     with tf.variable_scope("tf_inference", reuse=tf.AUTO_REUSE):
         self.transformer = transformer.Transformer(self.config,
                                                    is_training)
         self.attention_bias = model_utils.get_decoder_self_attention_bias(
             self.max_len)
         encoder_outputs = self.transformer.encode(self.inp,
                                                   self.attention_bias)
         logits = self.transformer.embedding_softmax_layer.linear(
             encoder_outputs)
         loss = model_utils.soft_cross_entropy_loss(
             logits, self.inp, self.config['label_smoothing'],
             self.config['vocab_size'])
         weights = tf.sequence_mask(self.inp_len,
                                    self.max_len,
                                    dtype=tf.int32)
         loss = loss * tf.to_float(weights)
         loss = tf.reduce_sum(loss, axis=1)
         loss = loss / tf.to_float(self.inp_len)
     return loss
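The masking at the end of this example zeroes the per-position loss beyond each true sequence length before averaging. A toy, standalone illustration of the same pattern:

import tensorflow as tf

max_len = 5
per_token_loss = tf.ones([2, max_len])   # pretend every position has loss 1.0
lengths = tf.constant([3, 5])

weights = tf.sequence_mask(lengths, max_len, dtype=tf.float32)
masked = per_token_loss * weights        # positions past the true length contribute 0
per_example = tf.reduce_sum(masked, axis=1) / tf.cast(lengths, tf.float32)
print(per_example)                       # [1. 1.]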
Example #14
    def _get_symbols_to_logits_fn(self, max_decode_length):
        """Returns a decoding function that calculates logits of the next tokens."""

        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1, self.params["hidden_size"])
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length)

        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.
            Args:
              ids: Current decoded sequences.
                int tensor with shape [batch_size * beam_size, i + 1]
              i: Loop index
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.
            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated IDs
            decoder_input = ids[:, -1:]

            # Preprocess decoder input by getting embeddings and adding timing signal.
            decoder_input = self.embedding_softmax_layer(decoder_input)
            decoder_input += timing_signal[i:i + 1]

            self_attention_bias = decoder_self_attention_bias[:, :,
                                                              i:i + 1, :i + 1]
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            logits = self.embedding_softmax_layer.linear(decoder_outputs)
            logits = tf.squeeze(logits, axis=[1])
            return logits, cache

        return symbols_to_logits_fn
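All of these symbols_to_logits_fn variants read encoder_outputs and encoder_decoder_attention_bias out of cache, and decoder_stack stores per-layer attention state in the same dict. Below is a minimal sketch of how such a cache is typically initialized before the decode loop; the per-layer key names and the zero-length k/v tensors are assumptions, not code from these repositories.

import tensorflow as tf

def init_decode_cache(encoder_outputs, encoder_decoder_attention_bias,
                      batch_size, hidden_size, num_layers):
    cache = {
        "encoder_outputs": encoder_outputs,
        "encoder_decoder_attention_bias": encoder_decoder_attention_bias,
    }
    # Empty key/value tensors that grow by one position per decode step.
    for layer in range(num_layers):
        cache["layer_%d" % layer] = {
            "k": tf.zeros([batch_size, 0, hidden_size]),
            "v": tf.zeros([batch_size, 0, hidden_size]),
        }
    return cache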