Example #1
    def __init__(self, params, train, **kwargs):
        super(DecoderStack, self).__init__(**kwargs)
        self.param = params
        with self.name_scope():
            self.layer = nn.Sequential()
            with self.layer.name_scope():
                for i in range(params.num_hidden_layers):
                    self_attention_layer = attention_layer.SelfAttention(
                        params.hidden_size, params.num_heads,
                        params.attention_dropout, train)
                    enc_dec_attention_layer = attention_layer.Attention(
                        params.hidden_size, params.num_heads,
                        params.attention_dropout, train)
                    feed_forward_network = fnn_layer.FeedForwardNetwork(
                        params.hidden_size, params.filter_size,
                        params.relu_dropout, train)

                    self.layer.add(
                        PrePostProcessingWrapper(self_attention_layer, params,
                                                 train),
                        PrePostProcessingWrapper(enc_dec_attention_layer,
                                                 params, train),
                        PrePostProcessingWrapper(feed_forward_network, params,
                                                 train))
            self.output_normalization = nn.LayerNorm(axis=-1, epsilon=1e-6)
Example #2
    def __init__(self, params, is_train, mode):
        super(DecoderStack, self).__init__()
        self.mode = mode
        self.predict_one = ModeKeys.is_predict_one(self.mode)
        self.layers = []
        for _ in range(params.num_hidden_layers):
            self_attention_layer = attention_layer.SelfAttention(
                params.hidden_size, params.num_heads, params.attention_dropout,
                is_train, self.predict_one)
            if self.mode == ModeKeys.PREDICT_ONE_DECODER:
                enc_dec_attention_layer = attention_layer.EncDecPredictOneAttention(
                    params.hidden_size, params.num_heads,
                    params.attention_dropout, is_train, self.predict_one)
            else:
                enc_dec_attention_layer = attention_layer.Attention(
                    params.hidden_size, params.num_heads,
                    params.attention_dropout, is_train, self.predict_one)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params.hidden_size, params.filter_size, params.relu_dropout,
                is_train, self.predict_one)
            # The decoder block has three sublayers: self-attention, enc-dec
            # attention, and a feed-forward network, each wrapped with
            # layer normalization and dropout.
            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params,
                                         is_train),
                PrePostProcessingWrapper(enc_dec_attention_layer, params,
                                         is_train),
                PrePostProcessingWrapper(feed_forward_network, params,
                                         is_train)
            ])

        self.output_normalization = LayerNormalization(params.hidden_size)
Example #3
    def __init__(self, params, train):
        super(DecoderStack, self).__init__()
        self.layers = []
        self.batch_size = params.batch_size
        self.beam_size = params.beam_size
        mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
                                     value=params.num_hidden_layers)
        for _ in range(params.num_hidden_layers):
            self_attention_layer = attention_layer.SelfAttention(
                params.hidden_size, params.num_heads, params.attention_dropout,
                train)
            enc_dec_attention_layer = attention_layer.Attention(
                params.hidden_size, params.num_heads, params.attention_dropout,
                train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params.hidden_size, params.filter_size, params.relu_dropout,
                train)

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(enc_dec_attention_layer, params,
                                         train),
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])

        self.output_normalization = LayerNormalization(params.hidden_size)
        self.encdec_cache = {}
        self.enc_out_cache = {}
Example #4
  def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    for _ in range(params.num_hidden_layers):
      self_attention_layer = attention_layer.SelfAttention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      enc_dec_attention_layer = attention_layer.Attention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params.hidden_size, params.filter_size, params.relu_dropout, train)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    self.output_normalization = LayerNormalization(params.hidden_size)
Example #5
  def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
        value=params.num_hidden_layers)
    for _ in range(params.num_hidden_layers):
      # SSY 3.1  transformer/model/attention_layer.py Dense and matmul
      self_attention_layer = attention_layer.SelfAttention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      # SSY 3.2  transformer/model/attention_layer.py Dense and matmul
      enc_dec_attention_layer = attention_layer.Attention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      # SSY 3.3 transformer/model/ffn_layer.py only Dense
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params.hidden_size, params.filter_size, params.relu_dropout, train)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    self.output_normalization = LayerNormalization(params.hidden_size)
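
All of the examples above hand each sublayer to a PrePostProcessingWrapper. For reference, the following is a minimal sketch of what such a wrapper conventionally does in Transformer implementations: layer-normalize the input, run the sublayer, apply dropout while training, and add a residual connection. It is not taken from any of the repos above; the class name PrePostProcessingWrapperSketch and the hidden_size and dropout_rate arguments are placeholders, and NumPy stands in for the frameworks' tensors.

import numpy as np

class PrePostProcessingWrapperSketch:
    """Hypothetical pre-norm / dropout / residual wrapper around a sublayer."""

    def __init__(self, layer, hidden_size, dropout_rate, train):
        self.layer = layer
        self.dropout_rate = dropout_rate
        self.train = train
        # Per-feature scale and bias for the pre-layer normalization.
        self.gamma = np.ones(hidden_size)
        self.beta = np.zeros(hidden_size)

    def _layer_norm(self, x, epsilon=1e-6):
        # Normalize over the last (hidden) dimension.
        mean = x.mean(axis=-1, keepdims=True)
        var = x.var(axis=-1, keepdims=True)
        return self.gamma * (x - mean) / np.sqrt(var + epsilon) + self.beta

    def __call__(self, x, *args):
        # Pre-norm, then the wrapped sublayer (extra args pass through).
        y = self.layer(self._layer_norm(x), *args)
        if self.train:
            # Inverted dropout, applied only in training mode.
            keep = np.random.rand(*y.shape) >= self.dropout_rate
            y = y * keep / (1.0 - self.dropout_rate)
        # Residual connection back to the wrapper input.
        return x + y

# Toy usage: wrap a stand-in feed-forward sublayer and run a dummy batch.
ffn = lambda x: np.maximum(x, 0.0)
wrapper = PrePostProcessingWrapperSketch(ffn, hidden_size=8,
                                         dropout_rate=0.1, train=False)
out = wrapper(np.random.randn(2, 5, 8))   # (batch, length, hidden_size)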