def build(self, input_shape): """Builds the encoder stack.""" params = self.params for _ in range(params["num_hidden_layers"]): # Create sublayers for each layer. self_attention_layer = attention_layer.SelfAttention( params["hidden_size"], params["num_heads"], params["attention_dropout"]) feed_forward_network = ffn_layer.FeedForwardNetwork( params["hidden_size"], params["filter_size"], params["relu_dropout"]) self.layers.append([ PrePostProcessingWrapper(self_attention_layer, params), PrePostProcessingWrapper(feed_forward_network, params) ]) # Create final layer normalization layer. self.output_normalization = LayerNormalization(params["hidden_size"]) super(EncoderStack, self).build(input_shape)
def test_feed_forward_network(self):
  hidden_size = 64
  filter_size = 32
  relu_dropout = 0.5
  layer = ffn_layer.FeedForwardNetwork(hidden_size, filter_size, relu_dropout)
  self.assertDictEqual(
      layer.get_config(), {
          "hidden_size": hidden_size,
          "filter_size": filter_size,
          "relu_dropout": relu_dropout,
      })
  length = 2
  x = tf.ones([1, length, hidden_size])
  y = layer(x, training=True)
  self.assertEqual(y.shape, (1, length, hidden_size))
def build(self, input_shape): """Builds the decoder stack.""" params = self.params for _ in range(params["num_hidden_layers"]): self_attention_layer = attention_layer.SelfAttention( params["hidden_size"], params["num_heads"], params["attention_dropout"]) enc_dec_attention_layer = attention_layer.Attention( params["hidden_size"], params["num_heads"], params["attention_dropout"]) feed_forward_network = ffn_layer.FeedForwardNetwork( params["hidden_size"], params["filter_size"], params["relu_dropout"]) self.layers.append([ PrePostProcessingWrapper(self_attention_layer, params), PrePostProcessingWrapper(enc_dec_attention_layer, params), PrePostProcessingWrapper(feed_forward_network, params) ]) self.output_normalization = tf.keras.layers.LayerNormalization( epsilon=1e-6, dtype="float32") super(DecoderStack, self).build(input_shape)