    def __init__(self, params, train):
        super(DecoderStack, self).__init__()
        self.layers = []
        for _ in range(params["num_hidden_layers"]):
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            enc_dec_attention_layer = attention_layer.Attention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                #params["hidden_size"], params["filter_size"],
                #params["hidden_size"] * 2, params["filter_size"],
                params["hidden_size"] + params["latent_size"],
                params["filter_size"],
                params["relu_dropout"],
                train,
                params["allow_ffn_pad"],
                output_size=params["hidden_size"])

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(enc_dec_attention_layer, params,
                                         train),
                PrePostProcessingWrapper(
                    feed_forward_network,
                    params,
                    train,
                    input_hidden_size=params["hidden_size"] +
                    params["latent_size"],
                    output_hidden_size=params["hidden_size"])
            ])

        self.output_normalization = LayerNormalization(params["hidden_size"])
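This first variant concatenates a latent vector onto the decoder hidden state, so the feed-forward sublayer takes hidden_size + latent_size inputs and projects back down to hidden_size. Below is a minimal sketch of how it might be instantiated; the key names are read straight from the constructor above, but the values and the train flag are illustrative assumptions, not the repository's configuration.

# Hypothetical params dict; the values below are illustrative assumptions.
params = {
    "num_hidden_layers": 6,
    "hidden_size": 512,
    "latent_size": 64,
    "filter_size": 2048,
    "num_heads": 8,
    "attention_dropout": 0.1,
    "relu_dropout": 0.1,
    "layer_postprocess_dropout": 0.1,  # assumed to be read by PrePostProcessingWrapper
    "allow_ffn_pad": True,
}
decoder_stack = DecoderStack(params, train=True)

Because the FFN input and output widths differ here, the wrapper is also given input_hidden_size and output_hidden_size, presumably so its normalization and residual logic can cope with the mismatch.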
Example #2
    def __init__(self, params, train):
        super(DecoderStack, self).__init__()
        self.layers = []
        # N decoder layers
        for _ in range(params["num_hidden_layers"]):
            # decoder-side self-attention
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            # source-target attention
            enc_dec_attention_layer = attention_layer.Attention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            # ffn
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])
            # PrePostProcessingWrapper again applies layer norm and dropout
            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(enc_dec_attention_layer, params,
                                         train),
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])

        self.output_normalization = LayerNormalization(params["hidden_size"])
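Example #2 is the stock TensorFlow Transformer decoder stack. Every sublayer is wrapped in PrePostProcessingWrapper, which is not shown on this page; in the reference implementation it layer-normalizes the input, runs the sublayer, applies dropout during training, and adds the residual connection. The sketch below reconstructs that wrapper under those assumptions (TF1-style, reusing the module's LayerNormalization); it is not necessarily this repository's exact code.

import tensorflow as tf

class PrePostProcessingWrapper(object):
    """Sketch: pre-norm -> sublayer -> dropout -> residual add."""

    def __init__(self, layer, params, train):
        self.layer = layer
        self.postprocess_dropout = params["layer_postprocess_dropout"]  # assumed key
        self.train = train
        self.layer_norm = LayerNormalization(params["hidden_size"])

    def __call__(self, x, *args, **kwargs):
        y = self.layer_norm(x)                # pre-processing: layer normalization
        y = self.layer(y, *args, **kwargs)    # the wrapped sublayer
        if self.train:
            y = tf.nn.dropout(y, 1.0 - self.postprocess_dropout)  # TF1 keep_prob form
        return x + y                          # post-processing: residual connection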
Example #3
    def __init__(self, params, train):
        super(EncoderStack, self).__init__()
        self.return_attention_scores = params['return_attention_scores']
        self.layers = []

        no_scores_params = copy.deepcopy(params)
        no_scores_params.update({'return_attention_scores': False})

        for _ in range(params["num_hidden_layers"]):
            # Create sublayers for each layer.
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], params["return_attention_scores"],
                train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(feed_forward_network,
                                         no_scores_params, train)
            ])

        # Create final layer normalization layer.
        self.output_normalization = LayerNormalization(params["hidden_size"])
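Example #3 extends the encoder so each self-attention sublayer can also return its attention scores; the deepcopy exists so the feed-forward wrapper sees return_attention_scores=False while the attention wrapper keeps the original flag. A hedged sketch of a matching call loop follows: the argument names and variable scopes mimic the reference implementation, tensorflow is assumed imported as tf, and the wrapped sublayer is assumed to propagate an (output, scores) pair when the flag is set.

    def call(self, encoder_inputs, attention_bias, inputs_padding):
        attention_scores = []
        for n, layer in enumerate(self.layers):
            self_attention_layer, feed_forward_network = layer
            with tf.variable_scope("layer_%d" % n):
                with tf.variable_scope("self_attention"):
                    outputs = self_attention_layer(encoder_inputs, attention_bias)
                    if self.return_attention_scores:
                        encoder_inputs, scores = outputs   # sublayer returns a pair
                        attention_scores.append(scores)
                    else:
                        encoder_inputs = outputs
                with tf.variable_scope("ffn"):
                    encoder_inputs = feed_forward_network(encoder_inputs, inputs_padding)

        output = self.output_normalization(encoder_inputs)
        if self.return_attention_scores:
            return output, attention_scores
        return output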
    def __init__(self, params, train):
        super(DecoderStack, self).__init__()
        self.layers = []
        for i in range(params["num_hidden_layers"]):

            # Flag based calling of Self Attention
            if 'dec-self' in params["concrete_heads"]:
                print("*** Decoder Concrete ***")
                self_attention_layer = attention_layer.SelfAttentionConcrete(
                    params["hidden_size"], params["num_heads"],
                    params["attention_dropout"], train, {'l0_penalty': 1.0},
                    concrete_coef=params["concrete_coef"])
            elif not params["alive_heads_dec_self"]:
                print("*** Decoder Plain ***")
                self_attention_layer = attention_layer.SelfAttention(
                    params["hidden_size"], params["num_heads"],
                    params["attention_dropout"], train)
            else:
                print("*** Decoder Fixed Alive ***")
                print("The fixed gates used for decoder self attention are : {}".format(
                    params['alive_heads_dec_self']))
                self_attention_layer = attention_layer.SelfAttentionFixedAliveHeads(
                    params["hidden_size"], params["num_heads"],
                    params["attention_dropout"], train,
                    head_gate=params["alive_heads_dec_self"][i])

            # Flag based calling of encoder-decoder Attention
            if 'enc-dec' in params["concrete_heads"]:
                print("*** Enc-Dec Concrete ***")
                enc_dec_attention_layer = attention_layer.AttentionConcrete(
                    params["hidden_size"], params["num_heads"],
                    params["attention_dropout"], train, {'l0_penalty': 1.0},
                    concrete_coef=params["concrete_coef"])
            elif not params["alive_heads_enc_dec"]:
                print("*** Enc-Dec Plain ***")
                enc_dec_attention_layer = attention_layer.Attention(
                    params["hidden_size"], params["num_heads"],
                    params["attention_dropout"], train)
            else:
                print("*** Enc-Dec Fixed Alive ***")
                print("The fixed gates used for encoder decoder attention are : {}".format(
                    params['alive_heads_enc_dec']))
                enc_dec_attention_layer = attention_layer.AttentionFixedAliveHeads(
                    params["hidden_size"], params["num_heads"],
                    params["attention_dropout"], train,
                    head_gate=params["alive_heads_enc_dec"][i])

            # Feed Forward layer
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
                PrePostProcessingWrapper(feed_forward_network, params, train)])

        self.output_normalization = LayerNormalization(params["hidden_size"])
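This decoder variant switches, per flag, between plain attention, attention with trainable concrete head gates, and attention with a fixed 0/1 gate per head (one gate vector per layer, indexed by i). The gated classes are not shown on this page; conceptually, a fixed gate just zeroes out pruned heads before their outputs are recombined, as in the rough sketch below. The tensor layout and the helper name apply_head_gate are assumptions, not the repository's code.

import tensorflow as tf

def apply_head_gate(per_head_outputs, head_gate):
    """per_head_outputs: [batch, num_heads, length, depth_per_head];
    head_gate: list of num_heads fixed 0/1 values (not trained)."""
    gate = tf.constant(head_gate, dtype=per_head_outputs.dtype)
    gate = tf.reshape(gate, [1, -1, 1, 1])  # broadcast over batch, length, depth
    return per_head_outputs * gate          # pruned heads contribute nothing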
    def __init__(self, params, train):
        super(EncoderStack, self).__init__()
        self.layers = []
        for _ in range(params["num_hidden_layers"]):
            # Create sublayers for each layer.
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])
  def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    for _ in range(params.num_hidden_layers):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params.hidden_size, params.filter_size, params.relu_dropout, train)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params.hidden_size)
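Most examples on this page end by applying LayerNormalization to the stack output, but the class itself is defined elsewhere in the module. In the reference implementation it normalizes over the last (hidden) dimension with a learned scale and bias; the following minimal sketch assumes that behavior and TF1-style variable creation.

import tensorflow as tf

class LayerNormalization(tf.layers.Layer):
    """Sketch: normalize the last dimension, then scale and shift."""

    def __init__(self, hidden_size):
        super(LayerNormalization, self).__init__()
        self.hidden_size = hidden_size

    def build(self, _):
        self.scale = tf.get_variable("layer_norm_scale", [self.hidden_size],
                                     initializer=tf.ones_initializer())
        self.bias = tf.get_variable("layer_norm_bias", [self.hidden_size],
                                    initializer=tf.zeros_initializer())
        self.built = True

    def call(self, x, epsilon=1e-6):
        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
        norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
        return norm_x * self.scale + self.bias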
    def __init__(self, params, train):
        super(EncoderStack, self).__init__()
        self.layers = []

        assert not ('enc-self' in params["concrete_heads"] and params["alive_heads_enc_self"]), \
            "enc-self is passed as both with trainable concrete gates heads and fixed gates"
        assert not ('dec-self' in params["concrete_heads"] and params["alive_heads_dec_self"]), \
            "dec-self is passed as both with trainable concrete gates heads and fixed gates"
        assert not ('dec-enc' in params["concrete_heads"] and params["alive_heads_dec_enc"]), \
            "dec-enc is passed as both with trainable concrete gates heads and fixed gates"

        for i in range(params["num_hidden_layers"]):
            # Create sublayers for each layer.

            if 'enc-self' in params["concrete_heads"]:
                print("*** Encoder Concrete ***")
                self_attention_layer = attention_layer.SelfAttentionConcrete(
                    params["hidden_size"], params["num_heads"],
                    params["attention_dropout"], train, {'l0_penalty': 1.0},
                    concrete_coef=params["concrete_coef"])
            elif not params["alive_heads_enc_self"]:
                print("*** Encoder Plain ***")
                self_attention_layer = attention_layer.SelfAttention(
                    params["hidden_size"], params["num_heads"],
                    params["attention_dropout"], train)
            else:
                print("*** Encoder Fixed Alive ***")
                print("The fixed gates used for encoder self attention are : {}".format(
                    params['alive_heads_enc_self']))
                self_attention_layer = attention_layer.SelfAttentionFixedAliveHeads(
                    params["hidden_size"], params["num_heads"],
                    params["attention_dropout"], train,
                    head_gate=params["alive_heads_enc_self"][i])

            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(feed_forward_network, params, train)])

        # Create final layer normalization layer.
        self.output_normalization = LayerNormalization(params["hidden_size"])
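The *Concrete variants above attach trainable per-head gates with an L0 penalty (the {'l0_penalty': 1.0} argument). The sketch below shows the usual hard-concrete gate from Louizos et al. that such layers typically build on; the shapes, the function name, and the beta/gamma/zeta constants are assumptions, not this repository's code.

import math
import tensorflow as tf

def hard_concrete_gates(log_alpha, beta=2.0 / 3.0, gamma=-0.1, zeta=1.1, train=True):
    """log_alpha: trainable [num_heads] tensor; returns gates in [0, 1] and an L0 term."""
    if train:
        u = tf.random_uniform(tf.shape(log_alpha), minval=1e-6, maxval=1.0 - 1e-6)
        s = tf.sigmoid((tf.log(u) - tf.log(1.0 - u) + log_alpha) / beta)
    else:
        s = tf.sigmoid(log_alpha)
    gates = tf.clip_by_value(s * (zeta - gamma) + gamma, 0.0, 1.0)
    # Expected number of open gates, to be scaled by the layer's l0_penalty coefficient
    # and added to the training loss.
    l0_term = tf.reduce_sum(tf.sigmoid(log_alpha - beta * math.log(-gamma / zeta)))
    return gates, l0_term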
Example #8
    def __init__(self, params, train):
        super(EncoderStack, self).__init__()
        self.layers = []  # the sublayers are kept in a plain Python list
        # Build N self-attention + feed-forward layers with identical structure but separate parameters.
        for _ in range(params["num_hidden_layers"]):
            # Create sublayers for each layer: one self-attention and one FFN per iteration.
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])

        # Create final layer normalization layer.
        self.output_normalization = LayerNormalization(params["hidden_size"])
    def __init__(self, params, train):
        super(EncoderStack, self).__init__()  # initialize the parent Layer class
        self.layers = []  # store the sublayers in a list; each element holds a self-attention wrapper and a feed-forward wrapper
        for _ in range(params["num_hidden_layers"]):  # build N independent layers; this parameter is set to 6
            # Create sublayers for each layer.
            self_attention_layer = attention_layer.SelfAttention(  # self-attention layer
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(  # feed-forward network
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])

            self.layers.append([
                PrePostProcessingWrapper(
                    self_attention_layer, params,
                    train),  # every sublayer goes through layer normalization and dropout
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])

        # Create final layer normalization layer.
        self.output_normalization = LayerNormalization(
            params["hidden_size"])  # 怎么起作用的