def __init__(self, params, train, name=None):
    """Encoder stack: three unrolled (self-attention -> feed-forward) layers
    plus a final layer normalization.

    Each sublayer is wrapped with PrePostProcessingWrapper, which applies
    layer_norm -> sublayer -> dropout -> residual connection.
    """
    super(EncoderStack, self).__init__(name=name)
    # The three layers are kept as individually named attributes
    # (self_attention_wrapper_0..2, feed_forward_wrapper_0..2); setattr
    # assigns them exactly as the unrolled form would.
    for idx in range(3):
        attn = attention_layer.SelfAttention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train,
            n="Encoder self-attention %d" % idx,
            name="enc-selfatt-%d" % idx)
        ffn = ffn_layer.FeedFowardNetwork(
            params["hidden_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"],
            name="enc-ffn-%d" % idx)
        setattr(self, "self_attention_wrapper_%d" % idx,
                PrePostProcessingWrapper(attn, params, train,
                                         name="enc-selfattwrap-%d" % idx))
        setattr(self, "feed_forward_wrapper_%d" % idx,
                PrePostProcessingWrapper(ffn, params, train,
                                         name="enc-ffnwrap-%d" % idx))
    # Layer-norm applied to the final encoder output.
    self.output_normalization = layer_norm.LayerNormalization(
        params["hidden_size"], name="enc-norm")
def __init__(self, params, train, name=None):
    """Decoder stack: three unrolled layers, each consisting of
    self-attention, encoder-decoder attention, and a feed-forward network.

    Every sublayer is wrapped with PrePostProcessingWrapper
    (layer_norm -> sublayer -> dropout -> residual connection).
    """
    super(DecoderStack, self).__init__(name=name)
    # Named attributes (self_attention_wrapper_0..2, enc_dec_attention_wrapper_0..2,
    # feed_forward_wrapper_0..2) are assigned via setattr, matching the
    # unrolled original exactly.
    for idx in range(3):
        self_attn = attention_layer.SelfAttention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train,
            n="Decoder self-attention %d" % idx,
            name="dec-selfatt-%d" % idx)
        enc_dec_attn = attention_layer.Attention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train,
            n="Decoder-encoder attention %d" % idx,
            name="dec-enc-%d" % idx)
        ffn = ffn_layer.FeedFowardNetwork(
            params["hidden_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"],
            name="dec-ffn-%d" % idx)
        setattr(self, "self_attention_wrapper_%d" % idx,
                PrePostProcessingWrapper(self_attn, params, train,
                                         name="dec-selfattwrap-%d" % idx))
        setattr(self, "enc_dec_attention_wrapper_%d" % idx,
                PrePostProcessingWrapper(enc_dec_attn, params, train,
                                         name="dec-encwrap-%d" % idx))
        setattr(self, "feed_forward_wrapper_%d" % idx,
                PrePostProcessingWrapper(ffn, params, train,
                                         name="dec-ffnwrap-%d" % idx))
    # NOTE(review): the original left the final output_normalization line
    # commented out, so no output norm is created here — confirm that the
    # caller normalizes the decoder output itself.
def build(self, input_shape):
    """Builds the encoder stack.

    Creates one wrapped self-attention sublayer, one wrapped feed-forward
    sublayer, and the final output layer normalization.
    """
    p = self.params
    attn = attention_layer.SelfAttention(
        p["hidden_size"], p["num_heads"], p["attention_dropout"])
    ffn = feed_forward_layer.FeedForwardNetwork(
        p["hidden_size"], p["filter_size"], p["relu_dropout"], p['train'])
    # PrePostProcessingWrapper applies layer norm / dropout / residual
    # around each sublayer.
    self.self_attention_layer = PrePostProcessingWrapper(attn, p)
    self.feed_forward_network = PrePostProcessingWrapper(ffn, p)
    # Normalization applied to the stack's final output.
    self.output_normalization = LayerNormalization(p["hidden_size"])
    super(EncoderStack, self).build(input_shape)
def __init__(self, train):
    """Encoder stack built from module-level configuration values.

    NOTE(review): hiddenlayers / hiddensize / numhead / attentiondropout /
    filtersize / reludropout are not defined in this block — presumably
    module-level constants; verify they exist at import time.
    """
    super(EncoderStack, self).__init__()

    def _sublayer_pair():
        # One (self-attention, feed-forward) pair, each wrapped with the
        # pre/post processing (norm, dropout, residual) wrapper.
        attn = attention_layer.SelfAttention(
            hiddensize, numhead, attentiondropout, train)
        ffn = ffn_layer.FeedFowardNetwork(
            hiddensize, filtersize, reludropout, train, True)
        return [PrePostProcessingWrapper(attn, train),
                PrePostProcessingWrapper(ffn, train)]

    self.layers = [_sublayer_pair() for _ in range(hiddenlayers)]
    # Final layer normalization over the encoder output.
    self.output_normalization = LayerNormalization(hiddensize)
def build(self, input_shape):
    """Builds the encoder stack's per-layer sublayers and final norm."""
    p = self.params
    for _ in range(p['num_hidden_layers']):
        attn = attention_layer.SelfAttention(
            p['hidden_size'], p['num_heads'], p['attention_dropout'])
        # NOTE(review): the feed-forward network comes from embedding_layer,
        # not an ffn module — confirm that module really defines it.
        ffn = embedding_layer.FeedForwardNetwork(
            p['hidden_size'], p['filter_size'], p['relu_dropout'])
        self.layers.append([
            PrePostProcessingWrapper(attn, p),
            PrePostProcessingWrapper(ffn, p),
        ])
    # Final layer norm kept in float32, presumably for stability under
    # mixed precision — original used dtype="float32" explicitly.
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super(EncoderStack, self).build(input_shape)
def build(self, input_shape):
    """Builds the encoder stack.

    Creates ``num_hidden_layers`` pairs of wrapped (self-attention,
    feed-forward) sublayers plus a final layer normalization.

    Args:
      input_shape: shape of the input tensor, forwarded to the base build.
    """
    params = self.params
    for _ in range(params['num_hidden_layers']):
        self_attention_layer = attention_layer.SelfAttention(
            params['hidden_size'], params['num_heads'],
            params['attention_dropout'])
        # Fixed class-name typo: FeedForwardNetword -> FeedForwardNetwork,
        # matching the name used by the other stacks in this file.
        feed_forward_network = ffn_layer.FeedForwardNetwork(
            params['hidden_size'], params['filter_size'],
            params['relu_dropout'])
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params),
            PrePostProcessingWrapper(feed_forward_network, params),
        ])
    # float32 norm, presumably for numerical stability under mixed precision.
    self.output_normalization = layers.LayerNormalization(
        epsilon=1e-6, dtype=tf.float32)
    super().build(input_shape)
def __init__(self, params, train):
    """Encoder stack of ``params.num_hidden_layers`` identical layers.

    Each layer holds a wrapped self-attention sublayer and a wrapped
    feed-forward sublayer; a final layer norm is applied to the output.
    """
    super(EncoderStack, self).__init__()

    def _encoder_layer():
        attn = attention_layer.SelfAttention(
            params.hidden_size, params.num_heads,
            params.attention_dropout, train)
        ffn = ffn_layer.FeedFowardNetwork(
            params.hidden_size, params.filter_size, params.relu_dropout,
            train, params.allow_ffn_pad)
        return [PrePostProcessingWrapper(attn, params, train),
                PrePostProcessingWrapper(ffn, params, train)]

    self.layers = [_encoder_layer()
                   for _ in range(params.num_hidden_layers)]
    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params.hidden_size)
def test_attention_layer(self):
    """Checks output and cache shapes of SelfAttention in dict-input form."""
    embedding_size, hidden_size = 64, 512
    num_heads, dropout = 4, 0.5
    length = 2
    layer = attention_layer.SelfAttention(
        embedding_size, hidden_size, num_heads, dropout)
    x = tf.ones([1, length, embedding_size])
    cache = {
        "k": tf.zeros([1, 0, embedding_size]),
        "v": tf.zeros([1, 0, embedding_size]),
    }
    inputs = {
        'query_input': x,
        'source_input': x,
        'bias': tf.ones([1]),
        'training': True,
        'cache': cache,
    }
    y, cache = layer(inputs)
    self.assertEqual(y.shape, (1, length, 512))
    self.assertEqual(cache["k"].shape, (1, length, embedding_size))
    self.assertEqual(cache["v"].shape, (1, length, embedding_size))
def build(self, input_shape):
    """Builds the encoder stack: wrapped sublayers plus a final norm."""
    hparams = self.params
    for _ in range(hparams["num_hidden_layers"]):
        # One (self-attention, feed-forward) sublayer pair per hidden layer.
        pair = [
            PrePostProcessingWrapper(
                attention_layer.SelfAttention(
                    hparams["hidden_size"], hparams["num_heads"],
                    hparams["attention_dropout"]),
                hparams),
            PrePostProcessingWrapper(
                ffn_layer.FeedForwardNetwork(
                    hparams["hidden_size"], hparams["filter_size"],
                    hparams["relu_dropout"]),
                hparams),
        ]
        self.layers.append(pair)
    # Final layer norm kept in float32 (original passes dtype="float32"),
    # presumably for stability under mixed precision.
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super(EncoderStack, self).build(input_shape)
def build(self, input_shape):
    """Builds the decoder stack.

    Each decoder layer holds three wrapped sublayers: self-attention,
    encoder-decoder attention, and a feed-forward network.
    """
    hparams = self.params
    for _ in range(hparams["num_hidden_layers"]):
        self.layers.append([
            PrePostProcessingWrapper(
                attention_layer.SelfAttention(
                    hparams["hidden_size"], hparams["num_heads"],
                    hparams["attention_dropout"]),
                hparams),
            PrePostProcessingWrapper(
                attention_layer.Attention(
                    hparams["hidden_size"], hparams["num_heads"],
                    hparams["attention_dropout"]),
                hparams),
            PrePostProcessingWrapper(
                ffn_layer.FeedForwardNetwork(
                    hparams["hidden_size"], hparams["filter_size"],
                    hparams["relu_dropout"]),
                hparams),
        ])
    # Norm applied to the decoder's final output.
    self.output_normalization = LayerNormalization(hparams["hidden_size"])
    super(DecoderStack, self).build(input_shape)
def test_attention_layer(self):
    """Checks SelfAttention output shape without a decoding cache."""
    hidden_size, num_heads, dropout = 512, 4, 0.5
    length = 2
    layer = attention_layer.SelfAttention(hidden_size, num_heads, dropout,
                                          True, 0.02)
    x = tf.ones([1, length, hidden_size])
    inputs = {'query_input': x, 'source_input': x, 'bias': tf.ones([1])}
    y = layer(inputs)
    self.assertEqual(y.shape, (1, length, 512))
def __init__(self, params, train):
    """Decoder stack of ``params["num_hidden_layers"]`` identical layers.

    Each layer: wrapped self-attention, wrapped encoder-decoder attention,
    and a wrapped feed-forward network; a final layer norm follows.
    """
    super(DecoderStack, self).__init__()

    def _decoder_layer():
        self_attn = attention_layer.SelfAttention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train)
        enc_dec_attn = attention_layer.Attention(
            params["hidden_size"], params["num_heads"],
            params["attention_dropout"], train)
        ffn = ffn_layer.FeedFowardNetwork(
            params["hidden_size"], params["filter_size"],
            params["relu_dropout"], train, params["allow_ffn_pad"])
        return [PrePostProcessingWrapper(self_attn, params, train),
                PrePostProcessingWrapper(enc_dec_attn, params, train),
                PrePostProcessingWrapper(ffn, params, train)]

    self.layers = [_decoder_layer()
                   for _ in range(params["num_hidden_layers"])]
    self.output_normalization = LayerNormalization(params["hidden_size"])
def test_attention_layer(self):
    """Checks SelfAttention config round-trip plus output/cache shapes."""
    hidden_size, num_heads, dropout = 64, 4, 0.5
    dim_per_head = hidden_size // num_heads
    length = 2
    layer = attention_layer.SelfAttention(hidden_size, num_heads, dropout)
    expected_config = {
        "hidden_size": hidden_size,
        "num_heads": num_heads,
        "attention_dropout": dropout,
    }
    self.assertDictEqual(layer.get_config(), expected_config)
    x = tf.ones([1, length, hidden_size])
    bias = tf.ones([1])
    # Cache starts empty along the length axis and is filled in place.
    cache = {
        "k": tf.zeros([1, 0, num_heads, dim_per_head]),
        "v": tf.zeros([1, 0, num_heads, dim_per_head]),
    }
    y = layer(x, bias, training=True, cache=cache)
    self.assertEqual(y.shape, (1, length, 64))
    self.assertEqual(cache["k"].shape, (1, length, num_heads, dim_per_head))
    self.assertEqual(cache["v"].shape, (1, length, num_heads, dim_per_head))