    def __init__(self,
                 params,
                 mode,
                 name=None,
                 verbose=True):
        """ Initializes decoder parameters.

        Args:
            params: A dictionary of parameters to construct the
              decoder architecture.
            mode: A value of `ModeKeys`, i.e. TRAIN, EVAL or INFER.
            name: The name of this decoder.
            verbose: Print decoder parameters if set True.
        """
        super(TransformerDecoder, self).__init__(params, mode, name, verbose)

        self._self_attention_layers = []
        self._encdec_attention_layers = []
        for layer in range(self.params["num_layers"]):
            self._self_attention_layers.append(
                MultiHeadAttention(self.params["selfattention.params"], self.mode))
            self._encdec_attention_layers.append(
                MultiHeadAttention(self.params["attention.params"], self.mode))
        if self.mode == ModeKeys.TRAIN:
            self._DecoderOutputSpec = namedtuple(
                "TransformerOutput", "decoder_hidden")
        elif self.mode == ModeKeys.EVAL:
            self._DecoderOutputSpec = namedtuple(
                "TransformerOutput", "decoder_hidden decoder_self_attention encoder_decoder_attention")
        else:
            self._DecoderOutputSpec = namedtuple(
                "TransformerOutput", "decoder_hidden encoder_decoder_attention")
    def encode(self, features, feature_length, **kwargs):
        """ Encodes the inputs.

        Args:
            features: A Tensor, [batch_size, max_features_length, dim].
            feature_length: A Tensor, [batch_size, ].
            **kwargs: Optional keyword arguments passed through to `_transform()`.

        Returns: An instance of `collections.namedtuple` whose fields depend on
          the mode (see `__init__()`).
        """
        with tf.variable_scope(self.name) as vs:
            # [batch_size, 1, 1, timesteps], FLOAT_MIN for padding, 0.0 for non-padding
            encoder_attention_bias = MultiHeadAttention.attention_length_to_bias(
                tf.shape(features)[1], feature_length)
            outputs, enc_self_attention = self._transform(
                features, encoder_attention_bias, scope=vs, **kwargs)
            if self.mode == ModeKeys.TRAIN:
                encoder_output = self.encoder_output_tuple_type(
                    # [batch_size, timesteps, dim]
                    outputs=outputs,
                    attention_values=outputs,
                    attention_length=feature_length)
            else:
                encoder_output = self.encoder_output_tuple_type(
                    # [batch_size, timesteps, dim]
                    outputs=outputs,
                    attention_values=outputs,
                    attention_length=feature_length,
                    # a list of Tensors, [batch_size, num_heads, length_q, length_k]
                    encoder_self_attention=enc_self_attention)
            return encoder_output
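
    # A rough, self-contained sketch of what the padding bias above looks like
    # (assumed equivalent; the actual helper is
    # MultiHeadAttention.attention_length_to_bias, and FLOAT_MIN is the large
    # negative constant used by this codebase):
    #
    #   mask = tf.sequence_mask(feature_length,
    #                           maxlen=tf.shape(features)[1],
    #                           dtype=tf.float32)     # [batch_size, timesteps]
    #   bias = (1.0 - mask) * FLOAT_MIN               # FLOAT_MIN at padding, 0.0 elsewhere
    #   bias = bias[:, None, None, :]                 # [batch_size, 1, 1, timesteps]
    #
    # Adding this bias to the attention logits before the softmax effectively
    # masks out the padded positions.
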
    def __init__(self, params, mode, name=None, verbose=True):
        """ Initializes the parameters of the encoder.

        Args:
            params: A dictionary of parameters to construct the
              encoder architecture.
            mode: A value of `ModeKeys`, i.e. TRAIN, EVAL or INFER.
            name: The name of this encoder.
            verbose: Print encoder parameters if set True.
        """
        super(TransformerEncoder, self).__init__(params=params,
                                                 mode=mode,
                                                 name=name,
                                                 verbose=verbose)

        self._self_attention_layers = []
        for layer in range(self.params["num_layers"]):
            self._self_attention_layers.append(
                MultiHeadAttention(self.params["selfattention.params"],
                                   self.mode))

        if self.mode == ModeKeys.TRAIN:
            self.encoder_output_tuple_type = namedtuple(
                "EncoderOutput", "outputs attention_values attention_length")
        else:
            self.encoder_output_tuple_type = namedtuple(
                "EncoderOutput",
                "outputs attention_values attention_length encoder_self_attention"
            )
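
    # Minimal construction sketch (parameter values are placeholders, not the
    # library's defaults):
    #
    #   params = {
    #       "num_layers": 6,
    #       "selfattention.params": {...},   # per-layer MultiHeadAttention params
    #   }
    #   encoder = TransformerEncoder(params, ModeKeys.TRAIN)
    #   encoder_output = encoder.encode(features, feature_length)
    #
    # Only the keys actually read in this file ("num_layers",
    # "selfattention.params") are shown; other keys may be required by the
    # base class.
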
    def prepare(self, encoder_output, bridge, helper):
        """ Prepares for `step()` function.
        It does the following:
            1. acquires attention information from `encoder_output`.

        Args:
            encoder_output: An instance of `collections.namedtuple`
              from `Encoder.encode()`.
            bridge: None.
            helper: An instance of `Feedback` that samples next
              symbols from logits.

        Returns: A dict containing the decoding states, pre-projected attention
          keys and values, the attention memory and its bias, which will be
          passed to the `step()` function.
        """
        _ = bridge
        attention_values = encoder_output.attention_values
        attention_length = encoder_output.attention_length
        if hasattr(encoder_output, "attention_bias"):
            attention_bias = encoder_output.attention_bias
        else:
            attention_bias = MultiHeadAttention.attention_length_to_bias(
                tf.shape(attention_values)[1], attention_length)

        # initialize cache
        if self.mode == ModeKeys.INFER:
            decoding_states = {}
            batch_size = tf.shape(attention_values)[0]
            depth = self._self_attention_layers[0].attention_value_depth
            if depth < 0:
                # TODO: check under what condition this branch is reached
                depth = tf.shape(attention_values)[2]
            # initialize decoder self attention keys/values
            for l in range(self.params["num_layers"]):
                keys = tf.zeros([batch_size, 0, depth])
                values = tf.zeros([batch_size, 0, depth])
                # Ensure shape invariance for tf.while_loop.
                keys.set_shape([None, None, depth])
                values.set_shape([None, None, depth])
                with tf.variable_scope("layer_%d" % l):
                    with tf.variable_scope("encdec_attention"):
                        with tf.variable_scope(self._encdec_attention_layers[l].name):
                            preproj_keys, preproj_values = self._encdec_attention_layers[l] \
                                .compute_kv(attention_values)
                decoding_states["layer_{}".format(l)] = {
                    "self_attention": {"keys": keys, "values": values},
                    "encdec_attention": {"attention_keys": preproj_keys,
                                         "attention_values": preproj_values}}
        else:
            decoding_states = None

        init_cache = initialize_cache(
            decoding_states=decoding_states,
            memory=attention_values,
            memory_bias=attention_bias)
        return init_cache
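
    # Sketch of how the per-layer cache initialized above is typically used
    # during incremental decoding (illustrative only; the real update happens
    # inside the attention layers / the `step()` function):
    #
    #   layer_cache = decoding_states["layer_0"]["self_attention"]
    #   layer_cache["keys"] = tf.concat([layer_cache["keys"], new_keys], axis=1)
    #   layer_cache["values"] = tf.concat([layer_cache["values"], new_values], axis=1)
    #
    # Starting from a [batch_size, 0, depth] tensor and concatenating along the
    # time axis is why the shape invariant [None, None, depth] is set above.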