import tensorflow as tf

# LightConv, MultiHeadAttentionLayer, FeedForwardLayer, PrePostProcessingWrapper
# and LayerNormalization are defined elsewhere in this repo; the base class
# below is an assumption, since only the constructors appear in this excerpt.

class DecoderStack(tf.keras.layers.Layer):
    """Transformer decoder stack: LightConv in the self-attention position,
    encoder-decoder attention, and a feed-forward network per layer."""

    def __init__(self, params):
        super(DecoderStack, self).__init__()
        self.params = params
        self.layers = []
        for _ in range(params.decoder_num_layers):
            # Create sublayers for each layer.
            self_attention_layer = LightConv(params, padding='VALID')
            enc_dec_attention_layer = MultiHeadAttentionLayer(
                params.num_heads, params.hidden_size, params.keep_prob)
            feed_forward_network = FeedForwardLayer(params.hidden_size,
                                                    params.ff_size,
                                                    params.keep_prob)

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params),
                PrePostProcessingWrapper(enc_dec_attention_layer, params),
                PrePostProcessingWrapper(feed_forward_network, params)
            ])

        # Create final layer normalization layer.
        self.output_normalization = LayerNormalization(params.hidden_size)
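
# Illustrative sketch only: the real PrePostProcessingWrapper is defined
# elsewhere in this repo. This hypothetical stand-in assumes the standard
# pre-norm Transformer wrapper (layer norm -> sublayer -> dropout -> residual)
# and uses Keras layer normalization instead of this repo's LayerNormalization.
class PrePostProcessingWrapperSketch(tf.keras.layers.Layer):
    """Hypothetical wrapper showing what PrePostProcessingWrapper likely does."""

    def __init__(self, layer, params):
        super(PrePostProcessingWrapperSketch, self).__init__()
        self.layer = layer
        # keep_prob is the probability of keeping a unit, so the dropout
        # rate is its complement.
        self.dropout_rate = 1.0 - params.keep_prob
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x, *args, training=False, **kwargs):
        y = self.layer_norm(x)              # pre-normalization
        y = self.layer(y, *args, **kwargs)  # the wrapped sublayer
        if training:
            y = tf.nn.dropout(y, rate=self.dropout_rate)
        return x + y                        # residual connection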

class EncoderStack(tf.keras.layers.Layer):
    """Transformer encoder stack: self-attention plus a feed-forward network
    per layer, each wrapped in a PrePostProcessingWrapper."""

    def __init__(self, params):
        super(EncoderStack, self).__init__()
        self.params = params
        self.layers = []
        for _ in range(params.encoder_num_layers):
            # Create sublayers for each layer.
            self_attention_layer = SelfAttentionLayer(params.num_heads,
                                                      params.hidden_size,
                                                      params.keep_prob)
            feed_forward_network = FeedForwardLayer(params.hidden_size,
                                                    params.ff_size,
                                                    params.keep_prob)

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params),
                PrePostProcessingWrapper(feed_forward_network, params)
            ])

        # Create final layer normalization layer.
        self.output_normalization = LayerNormalization(params.hidden_size)
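
# Hypothetical usage sketch: the attribute names below are exactly those read
# by the constructors above, but the values are illustrative defaults, and the
# sublayer classes (LightConv, SelfAttentionLayer, etc.) must be importable
# in this module for construction to succeed.
if __name__ == "__main__":
    from types import SimpleNamespace

    params = SimpleNamespace(
        encoder_num_layers=6,
        decoder_num_layers=6,
        num_heads=8,
        hidden_size=512,
        ff_size=2048,
        keep_prob=0.9,
    )

    encoder = EncoderStack(params)
    decoder = DecoderStack(params)
    # Two wrapped sublayers per encoder layer; three per decoder layer.
    assert len(encoder.layers) == params.encoder_num_layers
    assert len(decoder.layers) == params.decoder_num_layers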