def __init__(self, params, train):
  super(DecoderStack, self).__init__()
  self.layers = []
  for _ in range(params["num_hidden_layers"]):
    self_attention_layer = attention_layer.SelfAttention(
        params["hidden_size"], params["num_heads"],
        params["attention_dropout"], train)
    enc_dec_attention_layer = attention_layer.Attention(
        params["hidden_size"], params["num_heads"],
        params["attention_dropout"], train)
    # The FFN input is the hidden state concatenated with the latent
    # vector, so it maps hidden_size + latent_size back down to hidden_size.
    feed_forward_network = ffn_layer.FeedFowardNetwork(
        params["hidden_size"] + params["latent_size"], params["filter_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"],
        output_size=params["hidden_size"])
    self.layers.append([
        PrePostProcessingWrapper(self_attention_layer, params, train),
        PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
        PrePostProcessingWrapper(
            feed_forward_network, params, train,
            input_hidden_size=params["hidden_size"] + params["latent_size"],
            output_hidden_size=params["hidden_size"])
    ])
  self.output_normalization = LayerNormalization(params["hidden_size"])
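# For reference, a toy sketch (values below are hypothetical, not from the
# source) of how the FFN widths work out in the latent-augmented decoder
# above: the sublayer consumes hidden_size + latent_size features and
# projects back to hidden_size.
params_example = {
    "hidden_size": 512,
    "latent_size": 64,
    "filter_size": 2048,
}
ffn_input_size = params_example["hidden_size"] + params_example["latent_size"]
ffn_output_size = params_example["hidden_size"]
print(ffn_input_size, params_example["filter_size"], ffn_output_size)
# -> 576 2048 512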
def __init__(self, params, train):
  super(DecoderStack, self).__init__()
  self.layers = []
  # N decoder layers.
  for _ in range(params["num_hidden_layers"]):
    # Decoder-side self-attention.
    self_attention_layer = attention_layer.SelfAttention(
        params["hidden_size"], params["num_heads"],
        params["attention_dropout"], train)
    # Source-target (encoder-decoder) attention.
    enc_dec_attention_layer = attention_layer.Attention(
        params["hidden_size"], params["num_heads"],
        params["attention_dropout"], train)
    # Feed-forward network.
    feed_forward_network = ffn_layer.FeedFowardNetwork(
        params["hidden_size"], params["filter_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"])
    # PrePostProcessingWrapper likewise applies layer norm and dropout
    # around each sublayer.
    self.layers.append([
        PrePostProcessingWrapper(self_attention_layer, params, train),
        PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
        PrePostProcessingWrapper(feed_forward_network, params, train)
    ])
  self.output_normalization = LayerNormalization(params["hidden_size"])
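# Several comments here mention PrePostProcessingWrapper; below is a
# minimal framework-agnostic sketch (an assumption-laden illustration, not
# the source's implementation) of the usual pre-norm behavior:
# y = x + dropout(sublayer(layer_norm(x))).
import numpy as np

def layer_norm(x, epsilon=1e-6):
  # Normalize over the hidden (last) dimension.
  mean = x.mean(axis=-1, keepdims=True)
  var = x.var(axis=-1, keepdims=True)
  return (x - mean) / np.sqrt(var + epsilon)

def pre_post_process(x, sublayer, dropout_rate=0.1, train=True):
  # Pre-norm wrapper: normalize, apply the sublayer, dropout, residual add.
  y = sublayer(layer_norm(x))
  if train:
    keep = np.random.rand(*y.shape) >= dropout_rate
    y = y * keep / (1.0 - dropout_rate)
  return x + y

x = np.random.randn(2, 5, 8)                         # (batch, length, hidden)
print(pre_post_process(x, lambda t: t * 0.5).shape)  # -> (2, 5, 8)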
def __init__(self, params, train):
  super(EncoderStack, self).__init__()
  self.return_attention_scores = params['return_attention_scores']
  self.layers = []
  # Give the FFN wrapper a copy of params with the scores flag disabled,
  # since only the self-attention sublayer can return attention scores.
  # (Requires `import copy` at module level.)
  no_scores_params = copy.deepcopy(params)
  no_scores_params.update({'return_attention_scores': False})
  for _ in range(params["num_hidden_layers"]):
    # Create sublayers for each layer.
    self_attention_layer = attention_layer.SelfAttention(
        params["hidden_size"], params["num_heads"],
        params["attention_dropout"], params["return_attention_scores"],
        train)
    feed_forward_network = ffn_layer.FeedFowardNetwork(
        params["hidden_size"], params["filter_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"])
    self.layers.append([
        PrePostProcessingWrapper(self_attention_layer, params, train),
        PrePostProcessingWrapper(feed_forward_network, no_scores_params,
                                 train)
    ])
  # Create final layer normalization layer.
  self.output_normalization = LayerNormalization(params["hidden_size"])
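# The deepcopy above is the standard way to hand one sublayer a variant of
# the shared params without mutating them; a runnable toy demonstration
# (toy values, not from the source):
import copy

shared = {"return_attention_scores": True, "hidden_size": 512}
no_scores = copy.deepcopy(shared)
no_scores.update({"return_attention_scores": False})
print(shared["return_attention_scores"], no_scores["return_attention_scores"])
# -> True False  (the original dict is untouched)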
def __init__(self, params, train):
  super(DecoderStack, self).__init__()
  self.layers = []
  for i in range(params["num_hidden_layers"]):
    # Flag-based choice of the self-attention variant.
    if 'dec-self' in params["concrete_heads"]:
      print("*** Decoder Concrete ***")
      self_attention_layer = attention_layer.SelfAttentionConcrete(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train, {'l0_penalty': 1.0},
          concrete_coef=params["concrete_coef"])
    elif not params["alive_heads_dec_self"]:
      print("*** Decoder Plain ***")
      self_attention_layer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train)
    else:
      print("*** Decoder Fixed Alive ***")
      print("The fixed gates used for decoder self-attention are: {}".format(
          params['alive_heads_dec_self']))
      self_attention_layer = attention_layer.SelfAttentionFixedAliveHeads(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train,
          head_gate=params["alive_heads_dec_self"][i])
    # Flag-based choice of the encoder-decoder attention variant.
    if 'enc-dec' in params["concrete_heads"]:
      print("*** Enc-Dec Concrete ***")
      enc_dec_attention_layer = attention_layer.AttentionConcrete(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train, {'l0_penalty': 1.0},
          concrete_coef=params["concrete_coef"])
    elif not params["alive_heads_enc_dec"]:
      print("*** Enc-Dec Plain ***")
      enc_dec_attention_layer = attention_layer.Attention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train)
    else:
      print("*** Enc-Dec Fixed Alive ***")
      print("The fixed gates used for encoder-decoder attention are: {}".format(
          params['alive_heads_enc_dec']))
      enc_dec_attention_layer = attention_layer.AttentionFixedAliveHeads(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train,
          head_gate=params["alive_heads_enc_dec"][i])
    # Feed-forward layer.
    feed_forward_network = ffn_layer.FeedFowardNetwork(
        params["hidden_size"], params["filter_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"])
    self.layers.append([
        PrePostProcessingWrapper(self_attention_layer, params, train),
        PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
        PrePostProcessingWrapper(feed_forward_network, params, train)])
  self.output_normalization = LayerNormalization(params["hidden_size"])
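# The *Concrete variants gate each head with a trainable hard-concrete (L0)
# relaxation, in the spirit of Louizos et al. (2018) / Voita et al. (2019).
# A minimal NumPy sketch of sampling such a gate; all constants and names
# here are hypothetical, not taken from attention_layer:
import numpy as np

def hard_concrete_gate(log_alpha, temperature=0.33,
                       limit_l=-0.1, limit_r=1.1, train=True):
  # log_alpha is the trainable per-head parameter; negative values push a
  # head toward being pruned (gate 0), positive toward staying alive.
  if train:
    u = np.random.uniform(1e-6, 1 - 1e-6, size=np.shape(log_alpha))
    s = 1.0 / (1.0 + np.exp(-(np.log(u) - np.log(1 - u) + log_alpha)
                            / temperature))
  else:
    s = 1.0 / (1.0 + np.exp(-np.asarray(log_alpha, dtype=float)))
  # Stretch beyond [0, 1] and clip so the gate can be exactly 0 or 1.
  return np.clip(s * (limit_r - limit_l) + limit_l, 0.0, 1.0)

print(hard_concrete_gate(np.array([4.0, -4.0, 0.0]), train=False))
# -> [1.0, 0.0, 0.5]: head 0 alive, head 1 pruned, head 2 soft-gated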
def __init__(self, params, train):
  super(EncoderStack, self).__init__()
  self.layers = []
  for _ in range(params["num_hidden_layers"]):
    # Create sublayers for each layer.
    self_attention_layer = attention_layer.SelfAttention(
        params["hidden_size"], params["num_heads"],
        params["attention_dropout"], train)
    feed_forward_network = ffn_layer.FeedFowardNetwork(
        params["hidden_size"], params["filter_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"])
    self.layers.append([
        PrePostProcessingWrapper(self_attention_layer, params, train),
        PrePostProcessingWrapper(feed_forward_network, params, train)
    ])
  # Note: unlike the other variants, this one defines no final
  # output_normalization.
def __init__(self, params, train):
  super(EncoderStack, self).__init__()
  self.layers = []
  for _ in range(params.num_hidden_layers):
    # Create sublayers for each layer.
    self_attention_layer = attention_layer.SelfAttention(
        params.hidden_size, params.num_heads, params.attention_dropout,
        train)
    feed_forward_network = ffn_layer.FeedFowardNetwork(
        params.hidden_size, params.filter_size, params.relu_dropout, train)
    self.layers.append([
        PrePostProcessingWrapper(self_attention_layer, params, train),
        PrePostProcessingWrapper(feed_forward_network, params, train)])
  # Create final layer normalization layer.
  self.output_normalization = LayerNormalization(params.hidden_size)
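# This older variant reads params as attributes (HParams-style) rather than
# dict keys; a minimal stand-in object for experimentation (hypothetical,
# not the original hyperparameter class):
class Params(object):
  def __init__(self, **kwargs):
    self.__dict__.update(kwargs)

params = Params(num_hidden_layers=6, hidden_size=512, num_heads=8,
                attention_dropout=0.1, filter_size=2048, relu_dropout=0.1)
print(params.hidden_size)  # -> 512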
def __init__(self, params, train):
  super(EncoderStack, self).__init__()
  self.layers = []
  # Trainable concrete gates and fixed gates are mutually exclusive for
  # each attention type.
  assert not ('enc-self' in params["concrete_heads"] and
              params["alive_heads_enc_self"]), \
      "enc-self was given both trainable concrete head gates and fixed gates"
  assert not ('dec-self' in params["concrete_heads"] and
              params["alive_heads_dec_self"]), \
      "dec-self was given both trainable concrete head gates and fixed gates"
  assert not ('dec-enc' in params["concrete_heads"] and
              params["alive_heads_dec_enc"]), \
      "dec-enc was given both trainable concrete head gates and fixed gates"
  for i in range(params["num_hidden_layers"]):
    # Create sublayers for each layer, choosing the self-attention variant
    # from the head-gating flags.
    if 'enc-self' in params["concrete_heads"]:
      print("*** Encoder Concrete ***")
      self_attention_layer = attention_layer.SelfAttentionConcrete(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train, {'l0_penalty': 1.0},
          concrete_coef=params["concrete_coef"])
    elif not params["alive_heads_enc_self"]:
      print("*** Encoder Plain ***")
      self_attention_layer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train)
    else:
      print("*** Encoder Fixed Alive ***")
      print("The fixed gates used for encoder self-attention are: {}".format(
          params['alive_heads_enc_self']))
      self_attention_layer = attention_layer.SelfAttentionFixedAliveHeads(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train,
          head_gate=params["alive_heads_enc_self"][i])
    feed_forward_network = ffn_layer.FeedFowardNetwork(
        params["hidden_size"], params["filter_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"])
    self.layers.append([
        PrePostProcessingWrapper(self_attention_layer, params, train),
        PrePostProcessingWrapper(feed_forward_network, params, train)])
  # Create final layer normalization layer.
  self.output_normalization = LayerNormalization(params["hidden_size"])
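# The FixedAliveHeads variants mask heads with a constant 0/1 vector per
# layer. A toy sketch of applying such a gate to per-head outputs (shapes
# and names are assumptions, not the attention_layer internals):
import numpy as np

def apply_head_gate(head_outputs, head_gate):
  # head_outputs: (batch, num_heads, length, depth); head_gate: (num_heads,)
  gate = np.asarray(head_gate, dtype=head_outputs.dtype)
  return head_outputs * gate[None, :, None, None]

heads = np.random.randn(2, 4, 5, 8)
gated = apply_head_gate(heads, [1, 0, 1, 1])   # zero out head index 1
print(gated.shape, np.abs(gated[:, 1]).max())  # -> (2, 4, 5, 8) 0.0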
def __init__(self, params, train):
  super(EncoderStack, self).__init__()
  self.layers = []
  # The layers are stored as a plain list.
  # Define N layers with identical structure but independent parameters,
  # each a self-attention sublayer followed by a feed-forward sublayer.
  for _ in range(params["num_hidden_layers"]):
    # Create sublayers for each layer: one self-attention and one FFN.
    self_attention_layer = attention_layer.SelfAttention(
        params["hidden_size"], params["num_heads"],
        params["attention_dropout"], train)
    feed_forward_network = ffn_layer.FeedFowardNetwork(
        params["hidden_size"], params["filter_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"])
    self.layers.append([
        PrePostProcessingWrapper(self_attention_layer, params, train),
        PrePostProcessingWrapper(feed_forward_network, params, train)
    ])
  # Create final layer normalization layer.
  self.output_normalization = LayerNormalization(params["hidden_size"])
def __init__(self, params, train):
  super(EncoderStack, self).__init__()  # Initialize the base layer class.
  # Layers are stored in a list; each entry is a pair of wrapped sublayers.
  self.layers = []
  # Build N independent layers; num_hidden_layers is set to 6 here.
  for _ in range(params["num_hidden_layers"]):
    # Create sublayers for each layer.
    self_attention_layer = attention_layer.SelfAttention(  # self-attention sublayer
        params["hidden_size"], params["num_heads"],
        params["attention_dropout"], train)
    feed_forward_network = ffn_layer.FeedFowardNetwork(  # feed-forward sublayer
        params["hidden_size"], params["filter_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"])
    self.layers.append([
        # Every sublayer goes through layer normalization and dropout via
        # the wrapper.
        PrePostProcessingWrapper(self_attention_layer, params, train),
        PrePostProcessingWrapper(feed_forward_network, params, train)
    ])
  # Create final layer normalization layer; it takes effect on the stack's
  # output in call() (see the sketch below).
  self.output_normalization = LayerNormalization(params["hidden_size"])
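# To answer the question above: in the TensorFlow official Transformer this
# constructor is paired with a call() along the lines of the simplified
# sketch below (variable scopes omitted). output_normalization is applied
# once to the final output, because the pre-norm wrappers only normalize
# each sublayer's *input*.
def call(self, encoder_inputs, attention_bias, inputs_padding):
  for self_attention_layer, feed_forward_network in self.layers:
    encoder_inputs = self_attention_layer(encoder_inputs, attention_bias)
    encoder_inputs = feed_forward_network(encoder_inputs, inputs_padding)
  return self.output_normalization(encoder_inputs)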