def build(self, input_shape):
  """Builds the encoder stack."""
  params = self.params
  for _ in range(params["num_hidden_layers"]):
    # Create sublayers for each layer.
    self_attention_layer = attention_layer.SelfAttention(
        params["hidden_size"], params["num_heads"],
        params["attention_dropout"])
    feed_forward_network = ffn_layer.FeedForwardNetwork(
        params["hidden_size"], params["filter_size"], params["relu_dropout"])

    self.layers.append([
        PrePostProcessingWrapper(self_attention_layer, params),
        PrePostProcessingWrapper(feed_forward_network, params)
    ])

  # Create final layer normalization layer.
  self.output_normalization = tf.keras.layers.LayerNormalization(
      epsilon=1e-6, dtype="float32")
  super(EncoderStack, self).build(input_shape)
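
# Sketch only (not part of this file): one plausible way the layer list built
# above is consumed in the encoder's forward pass. Each PrePostProcessingWrapper
# is expected to apply layer normalization and dropout around its sublayer; the
# argument names `encoder_inputs` and `attention_bias` are assumptions here.
def call(self, encoder_inputs, attention_bias, training):
  for n, layer in enumerate(self.layers):
    self_attention_layer, feed_forward_network = layer
    with tf.name_scope("layer_%d" % n):
      # Self-attention sublayer followed by the feed-forward sublayer.
      encoder_inputs = self_attention_layer(
          encoder_inputs, attention_bias, training=training)
      encoder_inputs = feed_forward_network(encoder_inputs, training=training)
  # Final normalization created in build() above.
  return self.output_normalization(encoder_inputs)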
def test_attention_layer(self):
  hidden_size = 64
  num_heads = 4
  dropout = 0.5
  dim_per_head = hidden_size // num_heads
  layer = attention_layer.SelfAttention(hidden_size, num_heads, dropout)
  self.assertDictEqual(
      layer.get_config(), {
          "hidden_size": hidden_size,
          "num_heads": num_heads,
          "attention_dropout": dropout,
      })
  length = 2
  x = tf.ones([1, length, hidden_size])
  bias = tf.ones([1])
  cache = {
      "k": tf.zeros([1, 0, num_heads, dim_per_head]),
      "v": tf.zeros([1, 0, num_heads, dim_per_head]),
  }
  y = layer(x, bias, training=True, cache=cache)
  self.assertEqual(y.shape, (1, length, 64))
  self.assertEqual(cache["k"].shape, (1, length, num_heads, dim_per_head))
  self.assertEqual(cache["v"].shape, (1, length, num_heads, dim_per_head))
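
# Illustrative sketch (not part of the test above): the empty-length cache in
# the test models incremental decoding, where each call appends the new keys
# and values along the time axis. The import path and the step loop below are
# assumptions for illustration, not taken from this file.
import tensorflow as tf
from official.nlp.transformer import attention_layer  # assumed import path

def _cache_growth_sketch():
  hidden_size, num_heads, dropout = 64, 4, 0.0
  dim_per_head = hidden_size // num_heads
  layer = attention_layer.SelfAttention(hidden_size, num_heads, dropout)
  bias = tf.zeros([1])  # no masking for this sketch
  cache = {
      "k": tf.zeros([1, 0, num_heads, dim_per_head]),
      "v": tf.zeros([1, 0, num_heads, dim_per_head]),
  }
  for step in range(3):
    x_step = tf.ones([1, 1, hidden_size])   # one new token per step
    _ = layer(x_step, bias, training=False, cache=cache)
    # The cache's time dimension grows by the query length of each step.
    assert cache["k"].shape[1] == step + 1
    assert cache["v"].shape[1] == step + 1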