Code example #1
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.

            Args:
              targets: target values for the output sequence.
                int tensor with shape [batch_size, target_length]
              encoder_outputs: continuous representation of input sequence.
                float tensor with shape [batch_size, input_length, hidden_size]
              attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

            Returns:
              float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        decoder_inputs = self.embedding_softmax_layer(targets)
        # Shift the targets one position to the right and drop the last element.
        # nd.pad only operates on 4-D/5-D tensors, so a leading axis is added
        # before padding and removed again afterwards.
        decoder_inputs = nd.expand_dims(decoder_inputs, axis=0)
        decoder_inputs = nd.pad(data=decoder_inputs,
                                mode="constant",
                                constant_value=0,
                                pad_width=(0, 0, 0, 0, 1, 0, 0, 0))
        decoder_inputs = nd.reshape(data=decoder_inputs,
                                    shape=decoder_inputs.shape[1:])[:, :-1, :]

        length = decoder_inputs.shape[1]
        decoder_inputs = decoder_inputs + model_utils.get_position_encoding(
            length, self.param.hidden_size, targets.context)
        if self.train:
            decoder_inputs = self.dropout_output(decoder_inputs)

        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            length, targets.context)
        outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                     decoder_self_attention_bias,
                                     attention_bias)
        logits = self.embedding_softmax_layer.linear(outputs)
        return logits
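Every snippet in this collection adds a timing signal produced by model_utils.get_position_encoding(length, hidden_size) before running the decoder stack, but the helper itself is not shown. As a point of reference, here is a minimal NumPy sketch of the standard sinusoidal encoding from the Transformer paper, which is what this function is assumed to compute (the MXNet variant above additionally takes a device context and returns an NDArray); the name and defaults below are illustrative, not the repo's actual code.

import numpy as np

def get_position_encoding_sketch(length, hidden_size,
                                 min_timescale=1.0, max_timescale=1.0e4):
    """Sinusoidal position encoding of shape [length, hidden_size] (sketch)."""
    position = np.arange(length, dtype=np.float32)
    num_timescales = hidden_size // 2
    # Geometric progression of wavelengths from min_timescale to max_timescale.
    log_timescale_increment = (np.log(max_timescale / min_timescale) /
                               max(num_timescales - 1, 1))
    inv_timescales = min_timescale * np.exp(
        np.arange(num_timescales, dtype=np.float32) * -log_timescale_increment)
    scaled_time = position[:, np.newaxis] * inv_timescales[np.newaxis, :]
    # First half of the channels use sin, the second half cos.
    return np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)

In the decode methods the full [length, hidden_size] signal is broadcast-added to decoder inputs of shape [batch, target_length, hidden_size]; in the symbols_to_logits_fn variants only the single row timing_signal[i:i + 1] for the current step is added.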
Code example #2
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence."""
        with tf.name_scope("decode"):
            decoder_inputs = self.decoder_embedding_layer(
                targets, not ModeKeys.is_predict_one(self.mode))
            with tf.name_scope("shift_targets"):
                decoder_inputs = tf.pad(
                    decoder_inputs,
                    [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # [batch, tgt_seq_len, embed_size]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params.hidden_size)
            if self.is_train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, 1 - self.params.layer_postprocess_dropout)

            # Run values
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            logits = self.decoder_softmax_layer.linear(outputs)
            return logits
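The shift_targets block implements teacher forcing: it pads one zero vector at the front of the time axis and drops the last position, so position t of the decoder input holds the embedding of target t - 1. A tiny NumPy illustration of the pad-then-slice trick, with made-up values:

import numpy as np

# Toy "embeddings": batch of 1, three time steps, embed_size 2.
decoder_inputs = np.array([[[1., 1.], [2., 2.], [3., 3.]]])

# Equivalent of tf.pad(x, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]:
# prepend a zero vector on the time axis, then drop the last step.
shifted = np.pad(decoder_inputs, [(0, 0), (1, 0), (0, 0)],
                 mode="constant")[:, :-1, :]

print(shifted)
# [[[0. 0.]
#   [1. 1.]
#   [2. 2.]]]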
Code example #3
    def _get_symbols_to_logits_fn(self, max_decode_length):
        """Returns a decoding function that calculates logits of the next tokens."""
        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1, self.param.hidden_size, mx.gpu())
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length, mx.gpu())

        def symbols_to_logits_fn(ids, i, cache):
            decoder_input = ids[:, -1:]
            # The decoder input is the last token of the current decoded sequences.

            decoder_input = self.embedding_softmax_layer(decoder_input)
            decoder_input = decoder_input + timing_signal[i:i + 1]

            self_attention_bias = decoder_self_attention_bias[:, :,
                                                              i:i + 1, :i + 1]
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            logits = self.embedding_softmax_layer.linear(decoder_outputs)

            logits = nd.squeeze(logits, axis=1)
            return logits, cache

        return symbols_to_logits_fn
Code example #4
    def decode(self, targets, encoder_outputs, attention_bias):
        with tf.name_scope("decode"):
            decoder_inputs = self.decoder_embedding_layer(
                targets, not ModeKeys.is_predict_one(self.mode))
            # done
            with tf.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params.hidden_size)
            if self.is_train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, 1 - self.params.layer_postprocess_dropout)

            # Run values
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            # !!!
            # logits = self.embedding_softmax_layer.linear(outputs)
            logits = self.decoder_softmax_layer.linear(outputs)
            # done
            return logits
Code example #5
    def test_get_decoder_self_attention_bias(self):
        length = 5
        bias = model_utils.get_decoder_self_attention_bias(length)
        with self.test_session() as sess:
            bias = sess.run(bias)

        self.assertAllEqual(
            [[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
               [0, 0, NEG_INF, NEG_INF, NEG_INF], [0, 0, 0, NEG_INF, NEG_INF],
               [0, 0, 0, 0, NEG_INF], [0, 0, 0, 0, 0]]]], bias)
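This test pins down the contract of get_decoder_self_attention_bias: a [1, 1, length, length] tensor that is 0 on and below the diagonal and a large negative constant above it, so that adding it to the attention logits lets each position attend only to itself and earlier positions. A NumPy sketch that reproduces the expected value (the real helper builds the same mask with TF ops, and NEG_INF is assumed to be -1e9 as in the test's constants):

import numpy as np

NEG_INF = -1e9  # assumed to match the constant used by the test

def get_decoder_self_attention_bias_sketch(length):
    """Causal bias of shape [1, 1, length, length]: 0 where attention is
    allowed (on/below the diagonal), NEG_INF where it is masked out."""
    valid = np.tril(np.ones((length, length), dtype=np.float32))
    bias = NEG_INF * (1.0 - valid)
    return bias[np.newaxis, np.newaxis, :, :]

print(get_decoder_self_attention_bias_sketch(5)[0, 0])
# Row i has zeros in columns 0..i and NEG_INF afterwards, matching the
# expected value asserted above.

The symbols_to_logits_fn examples later slice decoder_self_attention_bias[:, :, i:i + 1, :i + 1] out of this tensor, i.e. the single row for step i, so the current token attends to positions 0 through i.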
Code example #6
    def _get_symbols_to_logits_fn(self, max_decode_length):
        if ModeKeys.is_predict_one(self.mode):
            timing_signal = model_utils.get_position_encoding(
                self.params.max_length, self.params.hidden_size)
            timing_signal = tf.slice(
                timing_signal, [0, 0],
                [max_decode_length + 1, self.params.hidden_size],
                name='slice_timing_signal')
        else:
            timing_signal = model_utils.get_position_encoding(
                max_decode_length + 1, self.params.hidden_size
            )  # [max_decode_length + 1, hidden_size]

        if ModeKeys.is_predict_one(self.mode):
            decoder_self_attention_bias = None
        else:
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                max_decode_length
            )  # [1, 1, max_decode_length, max_decode_length]

        def symbols_to_logits_fn(ids, i, cache):

            decoder_input = ids[:, -1:]  # [batch, 1]

            decoder_input = self.decoder_embedding_layer(
                decoder_input, not ModeKeys.is_predict_one(
                    self.mode))  # [batch, 1, hidden_size]
            if ModeKeys.is_predict_one(self.mode):
                # Step 0 has no previous token, so zero out the embedding to
                # mimic the zero-padded <BOS> position used during training.
                decoder_input = decoder_input * (1 -
                                                 tf.to_float(tf.equal(i, 0)))

            slice_pos_encoding = tf.slice(
                timing_signal, [i, 0], [1, self.params.hidden_size],
                name='slice_pos_encoding')  # [1, hidden_size]
            decoder_input += slice_pos_encoding

            if decoder_self_attention_bias is None:
                self_attention_bias = None
            else:
                self_attention_bias = decoder_self_attention_bias[:, :,
                                                                  i:i + 1, :i +
                                                                  1]  # [1, 1, 1, time_step]
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            logits = self.decoder_softmax_layer.linear(decoder_outputs)
            logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
            return logits, cache

        return symbols_to_logits_fn
Code example #7
    def _get_symbols_to_logits_fn(self, max_decode_length):
        """Returns a decoding function that calculates logits of the next tokens."""

        # shape: (max_decode_length + 1, hidden_size)
        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1, self.params["hidden_size"])
        # shape: (1, 1, max_decode_length, max_decode_length)
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length)

        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.

      Args:
        ids: Current decoded sequences.
          int tensor with shape [batch_size * beam_size, i + 1]
        i: Loop index
        cache: dictionary of values storing the encoder output, encoder-decoder
          attention bias, and previous decoder attention values.

      Returns:
        Tuple of
          (logits with shape [batch_size * beam_size, vocab_size],
           updated cache values)
      """
            # Set decoder input to the last generated IDs
            # shape: (batch_size * beam_size, 1)
            decoder_input = ids[:, -1:]

            # Preprocess decoder input by getting embeddings and adding timing signal.
            # shape: (batch_size * beam_size, 1, hidden_size)
            decoder_input = self.embedding_softmax_layer(decoder_input)
            decoder_input += timing_signal[i:i + 1]

            # decoder self attention bias
            # shape: (1, 1, 1, i+1)
            self_attention_bias = decoder_self_attention_bias[:, :,
                                                              i:i + 1, :i + 1]
            # shape: (batch_size * beam_size, 1, hidden_size)
            # Each query position attends and produces one attention output.
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            # shape: (batch_size * beam_size, 1, vocab_size)
            logits = self.embedding_softmax_layer.linear(decoder_outputs)
            logits = tf.squeeze(logits, axis=[1])
            return logits, cache

        return symbols_to_logits_fn
Code example #8
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.

        Args:
          targets: target values for the output sequence.
            int tensor with shape [batch_size, target_length]
          encoder_outputs: continuous representation of input sequence.
            float tensor with shape [batch_size, input_length, hidden_size]
          attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

        Returns:
          float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            # !!!
            # decoder_inputs = self.embedding_softmax_layer(targets)

            decoder_inputs = self.decoder_embedding_layer(
                targets, not ModeKeys.is_predict_one(self.mode))
            # done
            with tf.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                decoder_inputs = tf.pad(
                    decoder_inputs,
                    [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # [batch, tgt_seq_len, embed_size]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params.hidden_size)
            if self.is_train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, 1 - self.params.layer_postprocess_dropout)

            # Run values
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            # !!!
            # logits = self.embedding_softmax_layer.linear(outputs)
            logits = self.decoder_softmax_layer.linear(outputs)
            # done
            return logits
Code example #9
  def decode(self, targets, encoder_outputs, attention_bias):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence.
        int tensor with shape [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence.
        float tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.compat.v1.name_scope("decode"):
      # Prepare inputs to decoder layers by shifting targets, adding positional
      # encoding and applying dropout.
      decoder_inputs = self.embedding_softmax_layer(targets)
      with tf.compat.v1.name_scope("shift_targets"):
        # Shift targets to the right, and remove the last element
        decoder_inputs = tf.pad(
            tensor=decoder_inputs, paddings=[[0, 0], [1, 0], [0, 0]])[:, :-1, :]
      with tf.compat.v1.name_scope("add_pos_encoding"):
        length = tf.shape(input=decoder_inputs)[1]
        decoder_inputs += model_utils.get_position_encoding(
            length, self.params.hidden_size)
      if self.train:
        mlperf_log.transformer_print(
            key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
            value=self.params.layer_postprocess_dropout)
        decoder_inputs = tf.nn.dropout(
            decoder_inputs, 1 - (1 - self.params.layer_postprocess_dropout))

      with tf.compat.v1.tpu.bfloat16_scope():
        decoder_inputs = tf.cast(decoder_inputs, tf.bfloat16)
        #encoder_outputs = tf.cast(encoder_outputs, tf.bfloat16)
        #attention_bias = tf.cast(attention_bias, tf.bfloat16)
        # Run values
        decoder_self_attention_bias = tf.cast(model_utils.get_decoder_self_attention_bias(
          length), tf.bfloat16)
        outputs = self.decoder_stack(
          decoder_inputs, encoder_outputs, decoder_self_attention_bias,
          attention_bias)
        logits = self.embedding_softmax_layer.linear(outputs)
        logits = tf.cast(logits, tf.float32)
      return logits
Code example #10
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence.
        int tensor with shape [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence.
        float tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
        with tf.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            # shape: (batch_size, target_length, hidden_size)
            decoder_inputs = self.embedding_softmax_layer(targets)
            with tf.name_scope("shift_targets"):
                # Shift decoder_inputs one position to the right: prepend an
                # all-zero embedding in place of <BOS> and drop the last
                # element (<EOS>).
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params["hidden_size"])
            if self.train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    1 - self.params["layer_postprocess_dropout"])

            # Run values
            # shape: [1, 1, target_length, target_length]
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            # shape: (batch_size, target_length, hidden_size)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            # shape: (batch_size, target_length, vocab_size)
            logits = self.embedding_softmax_layer.linear(outputs)
            return logits
Code example #11
File: ut.py  Project: lichundi/universal_transformer
    def _decode(self, encoder_outputs, targets, attention_bias):
        decoder_inputs = self.embedding_layer(targets)
        decoder_inputs = tf.pad(decoder_inputs,
                                [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
        # add positional encoding
        length = tf.shape(decoder_inputs)[1]
        decoder_inputs += model_utils.get_position_encoding(
            length, self.hparams['num_units'])

        if self.is_train:
            decoder_inputs = self.decoder_embedding_dropout(decoder_inputs)

        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            length)
        outputs, dec_ponders, dec_remainders = self.decoder_stack(
            decoder_inputs, encoder_outputs, decoder_self_attention_bias,
            attention_bias)
        logits = self.embedding_layer.linear(outputs)
        return logits, dec_ponders, dec_remainders
Code example #12
File: transformer.py  Project: cybermaster/reference
  def _get_symbols_to_logits_fn(self, max_decode_length):
    """Returns a decoding function that calculates logits of the next tokens."""

    timing_signal = model_utils.get_position_encoding(
        max_decode_length + 1, self.params.hidden_size)
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length)

    def symbols_to_logits_fn(ids, i, cache):
      """Generate logits for next potential IDs.

      Args:
        ids: Current decoded sequences.
          int tensor with shape [batch_size * beam_size, i + 1]
        i: Loop index
        cache: dictionary of values storing the encoder output, encoder-decoder
          attention bias, and previous decoder attention values.

      Returns:
        Tuple of
          (logits with shape [batch_size * beam_size, vocab_size],
           updated cache values)
      """
      # Set decoder input to the last generated IDs
      decoder_input = ids[:, -1:]

      # Preprocess decoder input by getting embeddings and adding timing signal.
      decoder_input = self.embedding_softmax_layer(decoder_input)
      decoder_input += timing_signal[i:i + 1]

      self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
      decoder_outputs = self.decoder_stack(
          decoder_input, cache.get("encoder_outputs"), self_attention_bias,
          cache.get("encoder_decoder_attention_bias"), cache)
      logits = self.embedding_softmax_layer.linear(decoder_outputs)
      logits = tf.squeeze(logits, axis=[1])
      return logits, cache
    return symbols_to_logits_fn
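In the original code symbols_to_logits_fn is handed to beam search, which appends each chosen ID to ids, increments i, and threads the cache (per-layer k/v buffers plus encoder_outputs and encoder_decoder_attention_bias, as assembled in Code example #16) back through every call. The sketch below is a simplified greedy driver, not the repo's actual search loop; the dummy logits function and the bos_id/eos_id parameters are placeholders so it can run stand-alone.

import numpy as np

def greedy_decode(symbols_to_logits_fn, cache, max_decode_length,
                  batch_size, bos_id=0, eos_id=1):
    """Drive a symbols_to_logits_fn step by step, always taking the argmax."""
    ids = np.full((batch_size, 1), bos_id, dtype=np.int64)  # start from <BOS>
    for i in range(max_decode_length):
        logits, cache = symbols_to_logits_fn(ids, i, cache)  # [batch, vocab]
        next_ids = np.argmax(logits, axis=-1)[:, np.newaxis]
        ids = np.concatenate([ids, next_ids], axis=1)
        if np.all(next_ids == eos_id):
            break
    return ids[:, 1:]  # strip the <BOS> column

# Dummy stand-in so the driver can be exercised without a trained model.
def dummy_symbols_to_logits_fn(ids, i, cache):
    rng = np.random.default_rng(i)
    return rng.standard_normal((ids.shape[0], 8)), cache

print(greedy_decode(dummy_symbols_to_logits_fn, cache={},
                    max_decode_length=4, batch_size=2))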
Code example #13
File: transformer.py  Project: cybermaster/reference
  def decode(self, targets, encoder_outputs, attention_bias):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence.
        int tensor with shape [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence.
        float tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("decode"):
      # Prepare inputs to decoder layers by shifting targets, adding positional
      # encoding and applying dropout.
      decoder_inputs = self.embedding_softmax_layer(targets)
      with tf.name_scope("shift_targets"):
        # Shift targets to the right, and remove the last element
        decoder_inputs = tf.pad(
            decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(decoder_inputs)[1]
        decoder_inputs += model_utils.get_position_encoding(
            length, self.params.hidden_size)
      if self.train:
        decoder_inputs = tf.nn.dropout(
            decoder_inputs, 1 - self.params.layer_postprocess_dropout)

      # Run values
      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
          length)
      outputs = self.decoder_stack(
          decoder_inputs, encoder_outputs, decoder_self_attention_bias,
          attention_bias)
      logits = self.embedding_softmax_layer.linear(outputs)
      return logits
Code example #14
    def decode(self, targets, encoder_outputs, attention_bias):
        """
        :param targets:  [batch_size, target_length]
        :param encoder_outputs: [batch_size, input_length, hidden_size]
        :param attention_bias:  [batch_size, 1, 1, input_length]
        :return: [batch_size, target_length, vocab_size]
        """
        with tf.name_scope('decode'):
            #   [batch_size, target_length, hidden_size]
            decoder_inputs = self.embedding_layer(targets)
            with tf.name_scope('shift_targets'):
                #   Pad an all-zero embedding at the head of the sequence and
                #   drop the last (eos) position.
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope('add_pos_embedding'):
                length = tf.shape(decoder_inputs)[1]
                position_decode = model_utils.get_position_encoding(
                    length, self.params.get('hidden_size'))
                decoder_inputs = tf.add(decoder_inputs, position_decode)

            if self.train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    1. - self.params.get('encoder_decoder_dropout'))

            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)

            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)

            #   [batch_size, target_length, vocab_size]
            logits = self.embedding_layer.linear(outputs)

            return logits
Code example #15
    def _get_symbols_to_logits_fn(self, max_decode_length):
        """Returns a decoding function that calculates logits of the next tokens."""
        if ModeKeys.is_predict_one(self.mode):
            timing_signal = model_utils.get_position_encoding(
                self.params.max_length, self.params.hidden_size)
            timing_signal = tf.slice(
                timing_signal, [0, 0],
                [max_decode_length + 1, self.params.hidden_size],
                name='slice_timing_signal')
        else:
            timing_signal = model_utils.get_position_encoding(
                max_decode_length + 1, self.params.hidden_size
            )  # [max_decode_length + 1, hidden_size]

        if ModeKeys.is_predict_one(self.mode):
            decoder_self_attention_bias = None
        else:
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                max_decode_length
            )  # [1, 1, max_decode_length, max_decode_length]

        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.

            Args:
              ids: Current decoded sequences.
                int tensor with shape [batch_size * beam_size, i + 1]
              i: Loop index
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.

            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated IDs
            decoder_input = ids[:, -1:]  # [batch, 1]

            # decoder_input = ids[:, :]     # [batch, 1]
            # print("decoder_input:", decoder_input.shape)

            # Preprocess decoder input by getting embeddings and adding timing signal.
            # !!!!!!!!
            decoder_input = self.decoder_embedding_layer(
                decoder_input, not ModeKeys.is_predict_one(
                    self.mode))  # [batch, 1, hidden_size]
            # !!!!!!!!
            if ModeKeys.is_predict_one(self.mode):
                decoder_input = decoder_input * (1 -
                                                 tf.to_float(tf.equal(i, 0)))

            # add position embedding
            # decoder_input += timing_signal[i:i + 1]
            slice_pos_encoding = tf.slice(
                timing_signal, [i, 0], [1, self.params.hidden_size],
                name='slice_pos_encoding')  # [1, hidden_size]
            decoder_input += slice_pos_encoding

            if decoder_self_attention_bias is None:
                self_attention_bias = None
            else:
                self_attention_bias = decoder_self_attention_bias[:, :,
                                                                  i:i + 1, :i +
                                                                  1]  # [1, 1, 1, time_step]
                # self_attention_bias = decoder_self_attention_bias[:, :, :i+1, :i+1] # [1, 1, 1, time_step]
            # print("attention bias:", self_attention_bias.shape)
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            logits = self.decoder_softmax_layer.linear(decoder_outputs)
            # logits = tf.squeeze(logits, axis=[1])
            logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
            return logits, cache

        return symbols_to_logits_fn
Code example #16
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    params = model_params.TransformerBaseParams()
    x_inputs = tf.constant([[1, 2, 3, 0, 0], [3, 4, 5, 6, 8]], dtype=tf.int32)

    Enc_Embedding = embedding_layer.EmbeddingWeights(params.source_vocab_size,
                                                     params.hidden_size,
                                                     "source_embedding")
    embedded_inputs = Enc_Embedding(
        x_inputs, not ModeKeys.is_predict_one(ModeKeys.TRAIN))
    print(embedded_inputs.shape)
    attention_bias = model_utils.get_padding_bias(x_inputs)
    print(attention_bias.shape)
    encoder_stack = EncoderStack(params, is_train=True, mode=ModeKeys.TRAIN)
    enc_out = encoder_stack(embedded_inputs, attention_bias, None)
    print(enc_out.shape)
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        10)
    self_attention_bias = decoder_self_attention_bias[:, :, 0:1, :1]
    print(self_attention_bias)
    attention_bias = model_utils.get_padding_bias(x_inputs)
    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros([2, 0, params.hidden_size]),
            "v": tf.zeros([2, 0, params.hidden_size]),
        }
        for layer in range(params.num_hidden_layers)
    }
    dec_input = tf.constant([[2], [3]], dtype=tf.int32)
    decoder_stack = DecoderStack(params, is_train=True, mode=ModeKeys.TRAIN)
    dec_out = decoder_stack(dec_input, enc_out, self_attention_bias,
                            attention_bias, cache)
    print(dec_out.shape)