def __init__(self, params, train, **kwargs):
    super(DecoderStack, self).__init__(**kwargs)
    self.param = params
    with self.name_scope():
        self.layer = nn.Sequential()
        with self.layer.name_scope():
            for i in range(params.num_hidden_layers):
                self_attention_layer = attention_layer.SelfAttention(
                    params.hidden_size, params.num_heads,
                    params.attention_dropout, train)
                enc_dec_attention_layer = attention_layer.Attention(
                    params.hidden_size, params.num_heads,
                    params.attention_dropout, train)
                feed_forward_network = fnn_layer.FeedForwardNetwork(
                    params.hidden_size, params.filter_size,
                    params.relu_dropout, train)
                self.layer.add(
                    PrePostProcessingWrapper(self_attention_layer, params, train),
                    PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
                    PrePostProcessingWrapper(feed_forward_network, params, train))
        self.output_normalization = nn.LayerNorm(axis=-1, epsilon=1e-6)

def __init__(self, params, is_train, mode):
    super(DecoderStack, self).__init__()
    self.mode = mode
    self.predict_one = ModeKeys.is_predict_one(self.mode)
    self.layers = []
    for _ in range(params.num_hidden_layers):
        self_attention_layer = attention_layer.SelfAttention(
            params.hidden_size, params.num_heads, params.attention_dropout,
            is_train, self.predict_one)
        if self.mode == ModeKeys.PREDICT_ONE_DECODER:
            enc_dec_attention_layer = attention_layer.EncDecPredictOneAttention(
                params.hidden_size, params.num_heads, params.attention_dropout,
                is_train, self.predict_one)
        else:
            enc_dec_attention_layer = attention_layer.Attention(
                params.hidden_size, params.num_heads, params.attention_dropout,
                is_train, self.predict_one)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params.hidden_size, params.filter_size, params.relu_dropout,
            is_train, self.predict_one)
        # Each decoder layer has three sub-modules: self-attention,
        # enc-dec attention, and the feed-forward network. Each one is
        # wrapped with layer normalization and dropout.
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, is_train),
            PrePostProcessingWrapper(enc_dec_attention_layer, params, is_train),
            PrePostProcessingWrapper(feed_forward_network, params, is_train)
        ])
    self.output_normalization = LayerNormalization(params.hidden_size)

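# PrePostProcessingWrapper is referenced by every variant in this section but
# defined elsewhere. The sketch below is an assumption, not the repository's
# implementation: it shows the usual pre-norm recipe (layer normalization on
# the input, dropout on the sublayer output during training, then a residual
# connection). The "Sketch" class name and the parameter name
# layer_postprocess_dropout are hypothetical; tf and LayerNormalization are
# assumed to come from the surrounding module.
import tensorflow as tf


class PrePostProcessingWrapperSketch(object):
    """Hedged illustration of a pre/post-processing wrapper for one sublayer."""

    def __init__(self, layer, params, train):
        self.layer = layer
        self.postprocess_dropout = params.layer_postprocess_dropout  # assumed name
        self.train = train
        self.layer_norm = LayerNormalization(params.hidden_size)

    def __call__(self, x, *args, **kwargs):
        y = self.layer_norm(x)              # normalize the sublayer input (pre-norm)
        y = self.layer(y, *args, **kwargs)  # attention or feed-forward sublayer
        if self.train:
            # TF1-style dropout takes a keep probability, not a drop rate
            y = tf.nn.dropout(y, 1.0 - self.postprocess_dropout)
        return x + y                        # residual connection around the sublayer
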
def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    self.batch_size = params.batch_size
    self.beam_size = params.beam_size
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
                                 value=params.num_hidden_layers)
    for _ in range(params.num_hidden_layers):
        self_attention_layer = attention_layer.SelfAttention(
            params.hidden_size, params.num_heads, params.attention_dropout, train)
        enc_dec_attention_layer = attention_layer.Attention(
            params.hidden_size, params.num_heads, params.attention_dropout, train)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params.hidden_size, params.filter_size, params.relu_dropout, train)
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
            PrePostProcessingWrapper(feed_forward_network, params, train)
        ])
    self.output_normalization = LayerNormalization(params.hidden_size)
    self.encdec_cache = {}
    self.enc_out_cache = {}

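# Note on encdec_cache / enc_out_cache above: their use is not shown in this
# section. A plausible reading (an assumption, not taken from this code) is
# that, during beam-search decoding, quantities derived from the encoder
# output -- for example the projected keys/values of each enc-dec attention
# layer and the encoder outputs themselves -- are computed once per source
# batch, stored in these dicts keyed by layer, and reused at every decoding
# step instead of being recomputed for each generated token.
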
def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    for _ in range(params.num_hidden_layers):
        self_attention_layer = attention_layer.SelfAttention(
            params.hidden_size, params.num_heads, params.attention_dropout, train)
        enc_dec_attention_layer = attention_layer.Attention(
            params.hidden_size, params.num_heads, params.attention_dropout, train)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params.hidden_size, params.filter_size, params.relu_dropout, train)
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
            PrePostProcessingWrapper(feed_forward_network, params, train)])
    self.output_normalization = LayerNormalization(params.hidden_size)

def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
        value=params.num_hidden_layers)
    for _ in range(params.num_hidden_layers):
        # SSY 3.1 transformer/model/attention_layer.py Dense and matmul
        self_attention_layer = attention_layer.SelfAttention(
            params.hidden_size, params.num_heads, params.attention_dropout, train)
        # SSY 3.2 transformer/model/attention_layer.py Dense and matmul
        enc_dec_attention_layer = attention_layer.Attention(
            params.hidden_size, params.num_heads, params.attention_dropout, train)
        # SSY 3.3 transformer/model/ffn_layer.py only Dense
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params.hidden_size, params.filter_size, params.relu_dropout, train)
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, train),
            PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
            PrePostProcessingWrapper(feed_forward_network, params, train)])
    self.output_normalization = LayerNormalization(params.hidden_size)

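# All five constructors build the same structure: per layer, a
# [self-attention, enc-dec attention, feed-forward] triple wrapped in
# PrePostProcessingWrapper, plus one final output normalization. The function
# below is a minimal sketch of how such a layer list is typically consumed in
# the decoder's forward pass. It is written against the list-based variants
# above, not copied from any of them; decode_sketch, the bias arguments, and
# the "layer_%d" cache layout are assumptions.
def decode_sketch(decoder_inputs, encoder_outputs, decoder_self_attention_bias,
                  attention_bias, layers, output_normalization, cache=None):
    for n, (self_attention_layer, enc_dec_attention_layer,
            feed_forward_network) in enumerate(layers):
        # per-layer cache holds previously computed keys/values during decoding
        layer_cache = cache["layer_%d" % n] if cache is not None else None
        # 1) masked self-attention over the tokens generated so far
        decoder_inputs = self_attention_layer(
            decoder_inputs, decoder_self_attention_bias, cache=layer_cache)
        # 2) attention over the encoder outputs
        decoder_inputs = enc_dec_attention_layer(
            decoder_inputs, encoder_outputs, attention_bias)
        # 3) position-wise feed-forward network
        decoder_inputs = feed_forward_network(decoder_inputs)
    # a single layer normalization after the last decoder layer
    return output_normalization(decoder_inputs)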