def __init__(self, params, train):
        super(EncoderOutputLayer, self).__init__()
        input_hidden_size = 2 * params["hidden_size"]
        output_hidden_size = params["hidden_size"]

        self.feed_foward_layer = ffn_layer.FeedFowardNetwork(
            #input_hidden_size, params["hidden_size"],
            #input_hidden_size, params["filter_size"],
            input_hidden_size,
            output_hidden_size,
            params["relu_dropout"],
            train,
            params["allow_ffn_pad"],
            output_size=output_hidden_size,
            activation=tf.nn.relu)
        #activation=tf.nn.tanh)

        self.feed_foward_layer = PrePostProcessingWrapper(
            self.feed_foward_layer,
            params,
            train,
            input_hidden_size=input_hidden_size,
            output_hidden_size=output_hidden_size)

        self.output_norm_layer = LayerNormalization(output_hidden_size)
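# For reference, a minimal sketch of the LayerNormalization helper these snippets
# rely on, in the style of the TF1 official Transformer layers API. The actual
# imported class is not shown in this listing, so treat this as an assumption;
# it presumes `import tensorflow as tf` (TF1).
class LayerNormalization(tf.layers.Layer):
    """Applies layer normalization over the last (hidden) dimension."""

    def __init__(self, hidden_size):
        super(LayerNormalization, self).__init__()
        self.hidden_size = hidden_size

    def build(self, _):
        self.scale = tf.get_variable("layer_norm_scale", [self.hidden_size],
                                     initializer=tf.ones_initializer())
        self.bias = tf.get_variable("layer_norm_bias", [self.hidden_size],
                                    initializer=tf.zeros_initializer())
        self.built = True

    def call(self, x, epsilon=1e-6):
        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
        norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
        return norm_x * self.scale + self.bias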
    def __init__(self, params, train):
        super(DecoderStack, self).__init__()
        self.layers = []
        for _ in range(params["num_hidden_layers"]):
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            enc_dec_attention_layer = attention_layer.Attention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                #params["hidden_size"], params["filter_size"],
                #params["hidden_size"] * 2, params["filter_size"],
                params["hidden_size"] + params["latent_size"],
                params["filter_size"],
                params["relu_dropout"],
                train,
                params["allow_ffn_pad"],
                output_size=params["hidden_size"])

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(enc_dec_attention_layer, params,
                                         train),
                PrePostProcessingWrapper(
                    feed_forward_network,
                    params,
                    train,
                    input_hidden_size=params["hidden_size"] +
                    params["latent_size"],
                    output_hidden_size=params["hidden_size"])
            ])

        self.output_normalization = LayerNormalization(params["hidden_size"])
    def __init__(self, params, train):
        super(EncoderStack, self).__init__()
        self.return_attention_scores = params['return_attention_scores']
        self.layers = []

        no_scores_params = copy.deepcopy(params)
        no_scores_params.update({'return_attention_scores': False})

        for _ in range(params["num_hidden_layers"]):
            # Create sublayers for each layer.
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], params["return_attention_scores"],
                train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(feed_forward_network,
                                         no_scores_params, train)
            ])

        # Create final layer normalization layer.
        self.output_normalization = LayerNormalization(params["hidden_size"])
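# For reference, a minimal sketch of the standard PrePostProcessingWrapper
# (pre layer-norm, sublayer, dropout, residual) from the TF1 official Transformer.
# Several snippets here pass extra arguments (input_hidden_size, output_hidden_size,
# norm/drop/residual flags), so the variant actually used adds a projection and
# switches that this sketch does not implement; assumes `import tensorflow as tf`.
class PrePostProcessingWrapper(object):
    """Wraps a sublayer with layer normalization, dropout, and a residual add."""

    def __init__(self, layer, params, train):
        self.layer = layer
        self.postprocess_dropout = params["layer_postprocess_dropout"]
        self.train = train
        self.layer_norm = LayerNormalization(params["hidden_size"])

    def __call__(self, x, *args, **kwargs):
        y = self.layer_norm(x)              # pre-processing: layer normalization
        y = self.layer(y, *args, **kwargs)  # the wrapped sublayer
        if self.train:
            y = tf.nn.dropout(y, 1.0 - self.postprocess_dropout)
        return x + y                        # post-processing: residual connection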
def compute_bow_loss(latent_sample, targets, params, train):
    """
    Args:
      latent_variable: size [batch_size, hidden_size]
      targets: size [batch_size, length]
  """
    with tf.variable_scope("bow_decoder"):
        # feed forward
        bow_ffn_layer = ffn_layer.FeedFowardNetwork(
            params["latent_size"],
            params["filter_size"],
            params["relu_dropout"],
            train,
            params["allow_ffn_pad"],
            output_size=params["vocab_size"],
            activation=tf.nn.relu)
        expd_lv = tf.expand_dims(latent_sample,
                                 axis=1)  # get [batch_size, 1, latent_size]
        bow_logits = bow_ffn_layer(
            expd_lv, padding=None)  # get [batch_size, 1, vocab_size]
        length = tf.shape(targets)[1]
        tile_bow_logits = tf.tile(
            bow_logits, [1, length, 1])  # get [batch_size, length, vocab_size]

        # compute loss
        xentropy, weights = metrics.padded_cross_entropy_loss(
            tile_bow_logits, targets, params["label_smoothing"],
            params["vocab_size"])

        # sum the cross-entropy over all tokens and the batch
        # (no per-sentence or per-batch averaging is applied here)
        bow_predict_loss = tf.reduce_sum(xentropy)

        return bow_predict_loss
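# Hedged usage sketch for compute_bow_loss. The hyperparameter values and tensor
# shapes below are placeholder assumptions, not taken from the snippets.
_bow_params = {
    "latent_size": 64, "filter_size": 512, "relu_dropout": 0.1,
    "allow_ffn_pad": True, "vocab_size": 32000, "label_smoothing": 0.1,
}
_latent_sample = tf.zeros([8, _bow_params["latent_size"]])   # [batch_size, latent_size]
_targets = tf.ones([8, 20], dtype=tf.int32)                  # [batch_size, length]
_bow_loss = compute_bow_loss(_latent_sample, _targets, _bow_params, train=True)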
    def __init__(self, params, train):
        super(LatentVariableLayer, self).__init__()
        self.train = train

        # The recognition (posterior) branch is only built at training time.
        if self.train:
            input_hidden_size = 2 * params["hidden_size"]
            output_hidden_size = 2 * params["hidden_size"]
            self.feed_foward_layer = ffn_layer.FeedFowardNetwork(
                #input_hidden_size, params["hidden_size"],
                input_hidden_size,
                params["filter_size"],
                params["relu_dropout"],
                train,
                params["allow_ffn_pad"],
                output_size=output_hidden_size,
                activation=tf.nn.relu)
            #activation=tf.nn.tanh)

            self.feed_foward_layer = PrePostProcessingWrapper(
                self.feed_foward_layer,
                params,
                train,
                input_hidden_size=input_hidden_size,
                output_hidden_size=output_hidden_size)
    def __init__(self, params, train):
        super(DecoderStack, self).__init__()
        self.layers = []
        # N decoder layers
        for _ in range(params["num_hidden_layers"]):
            # decoder-side self-attention
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            # source-target attention
            enc_dec_attention_layer = attention_layer.Attention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            # ffn
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])
            # wrap each sublayer with PrePostProcessingWrapper for layer norm and dropout
            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(enc_dec_attention_layer, params,
                                         train),
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])

        self.output_normalization = LayerNormalization(params["hidden_size"])
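    # For reference, the call method that typically accompanies this __init__ in the
    # TF1 official Transformer (reproduced from memory as a sketch; this variant's
    # signature and caching details may differ).
    def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias,
             attention_bias, cache=None):
        for n, layer in enumerate(self.layers):
            self_attention_layer = layer[0]
            enc_dec_attention_layer = layer[1]
            feed_forward_network = layer[2]

            layer_name = "layer_%d" % n
            layer_cache = cache[layer_name] if cache is not None else None
            with tf.variable_scope(layer_name):
                with tf.variable_scope("self_attention"):
                    decoder_inputs = self_attention_layer(
                        decoder_inputs, decoder_self_attention_bias, cache=layer_cache)
                with tf.variable_scope("encdec_attention"):
                    decoder_inputs = enc_dec_attention_layer(
                        decoder_inputs, encoder_outputs, attention_bias)
                with tf.variable_scope("ffn"):
                    decoder_inputs = feed_forward_network(decoder_inputs)

        return self.output_normalization(decoder_inputs)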
  def __init__(self, params, train):
    super(LatentVariableLayer, self).__init__()
    self.train = train
    #output_hidden_size = 2 * params["hidden_size"] # use hidden_size as latent_size
    output_hidden_size = 2 * params["latent_size"]
    self.norm = True
    self.drop = True
    self.residual = False
    if params["use_std"]:
      self.residual = True
    self.params = params

    self.prior_ffl = ffn_layer.FeedFowardNetwork(
        params["hidden_size"], params["filter_size"],
        params["relu_dropout"], train, params["allow_ffn_pad"],
        output_size = output_hidden_size,
        activation=tf.nn.relu)
    self.prior_ffl = PrePostProcessingWrapper(
        self.prior_ffl, params, train,
        input_hidden_size = params["hidden_size"],
        output_hidden_size = output_hidden_size,
        norm=self.norm, drop=self.drop, residual=self.residual) 
        #norm=True, drop=False, residual=False) 
    if self.params["use_std"]:
        self.prior_mu_layer = tf.layers.Dense(
            params["latent_size"], use_bias=False, activation=tf.tanh, name="mu_layer")
        self.prior_std_layer = tf.layers.Dense(
            params["latent_size"], use_bias=False, activation=tf.sigmoid, name="std_layer")

    if self.train:
      input_hidden_size =  2 * params["hidden_size"]
      self.recog_ffl = ffn_layer.FeedFowardNetwork(
          input_hidden_size, params["filter_size"],
          params["relu_dropout"], train, params["allow_ffn_pad"],
          output_size = output_hidden_size,
          activation=tf.nn.relu)
      self.recog_ffl = PrePostProcessingWrapper(
          self.recog_ffl, params, train,
          input_hidden_size = input_hidden_size,
          output_hidden_size = output_hidden_size,
          norm=self.norm, drop=self.drop, residual=self.residual) 
          #norm=True, drop=False, residual=False) 
      if self.params["use_std"]:
          self.recog_mu_layer = tf.layers.Dense(
              params["latent_size"], use_bias=False, activation=tf.tanh, name="mu_layer")
          self.recog_std_layer = tf.layers.Dense(
              params["latent_size"], use_bias=False, activation=tf.sigmoid, name="std_layer")
 def __init__(self, params, train):
     super(DecoderStack, self).__init__()
     self.layers = []
     for i in range(params["num_hidden_layers"]):
         
         # Flag based calling of Self Attention
         if 'dec-self' in params["concrete_heads"]:
             print("*** Decoder Concrete ***")
             self_attention_layer = attention_layer.SelfAttentionConcrete(
                 params["hidden_size"], params["num_heads"],
                 params["attention_dropout"], train, {'l0_penalty': 1.0}, concrete_coef=params["concrete_coef"])
         elif not params["alive_heads_dec_self"]:
             print("*** Decoder Plain ***")
             self_attention_layer = attention_layer.SelfAttention(
                 params["hidden_size"], params["num_heads"],
                 params["attention_dropout"], train)
         else:
             print("*** Decoder Fixed Alive ***")
             print("The fixed gates used for decoder self attention are : {}".format(params['alive_heads_dec_self']))
             self_attention_layer = attention_layer.SelfAttentionFixedAliveHeads(
                 params["hidden_size"], params["num_heads"],
                 params["attention_dropout"], train,
                 head_gate=params["alive_heads_dec_self"][i])
         
         # Flag based calling of encoder-decoder Attention
         if 'enc-dec' in params["concrete_heads"]:
             print("*** Enc-Dec Concrete ***")
             enc_dec_attention_layer = attention_layer.AttentionConcrete(
                 params["hidden_size"], params["num_heads"],
                 params["attention_dropout"], train, {'l0_penalty': 1.0}, concrete_coef=params["concrete_coef"])
         elif not params["alive_heads_enc_dec"]:
             print("*** Enc-Dec Plain ***")
             enc_dec_attention_layer = attention_layer.Attention(
                 params["hidden_size"], params["num_heads"],
                 params["attention_dropout"], train)
         else:
             print("*** Enc-Dec Fixed Alive ***")
             print("The fixed gates used for encoder decoder attention are : {}".format(params['alive_heads_enc_dec']))
             enc_dec_attention_layer = attention_layer.AttentionFixedAliveHeads(
                 params["hidden_size"], params["num_heads"],
                 params["attention_dropout"], train,
                 head_gate=params["alive_heads_enc_dec"][i])
         
         # Feed Forward layer
         feed_forward_network = ffn_layer.FeedFowardNetwork(
             params["hidden_size"], params["filter_size"],
             params["relu_dropout"], train, params["allow_ffn_pad"])
         
         self.layers.append([
             PrePostProcessingWrapper(self_attention_layer, params, train),
             PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
             PrePostProcessingWrapper(feed_forward_network, params, train)])
     
     self.output_normalization = LayerNormalization(params["hidden_size"])
 def __init__(self, params, train):
   super(SentenceEmbeddingLayer, self).__init__()
   self.sent_attention_layer = ffn_layer.FeedFowardNetwork(  # 2 sub-layers: a feed-forward projection with activation, then a linear output
       params["hidden_size"], params["hidden_size"],
       params["relu_dropout"], train, params["allow_ffn_pad"],
       output_size = 1, activation=tf.nn.relu,
       #use_bias_output=True)
       use_bias_output=False)
       #output_size = 1, activation=tf.nn.tanh)
   self.sent_attention_layer = PrePostProcessingWrapper(
       self.sent_attention_layer, params, train,
       input_hidden_size = params["hidden_size"],
       output_hidden_size = 1, norm=False)  # encoder_stack already normalizes; do not re-normalize
  def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    for _ in range(params.num_hidden_layers):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params.hidden_size, params.filter_size, params.relu_dropout, train)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params.hidden_size)
    def __init__(self, params, train):
        super(EncoderStack, self).__init__()
        self.layers = []
        for _ in range(params["num_hidden_layers"]):
            # Create sublayers for each layer.
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])
 def __init__(self, params, train):
     super(SentenceEmbeddingLayer, self).__init__()
      self.sent_attention_layer = ffn_layer.FeedFowardNetwork(  # 2 sub-layers: a feed-forward projection with activation, then a linear output
         params["hidden_size"],
         params["hidden_size"],
         params["relu_dropout"],
         train,
         params["allow_ffn_pad"],
         output_size=1,
         activation=tf.nn.relu)
     #output_size = 1, activation=tf.nn.tanh)
     self.sent_attention_layer = PrePostProcessingWrapper(
         self.sent_attention_layer,
         params,
         train,
         input_hidden_size=params["hidden_size"],
         output_hidden_size=1)
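 # Hedged sketch: how the scoring FFN above might be used to pool token states into
 # a single sentence embedding. Only __init__ is shown in this snippet, so the method
 # name, masking convention (padding == 1.0 at padded positions), and shapes are assumptions.
 def _pool_sentence_sketch(self, encoder_outputs, padding):
     # encoder_outputs: [batch_size, length, hidden_size]; padding: [batch_size, length]
     scores = self.sent_attention_layer(encoder_outputs, padding=padding)  # [batch_size, length, 1]
     scores += tf.expand_dims(padding, axis=-1) * -1e9  # mask out padded positions
     weights = tf.nn.softmax(scores, axis=1)
     return tf.reduce_sum(weights * encoder_outputs, axis=1)  # [batch_size, hidden_size]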
    def __init__(self, params, train, input_size, as_standard_norm=False):
        # helping vars
        self.train = train
        self.params = params
        self.as_standard_norm = as_standard_norm

        if as_standard_norm:
            return

        self.input_size = input_size
        self.output_size = 2 * params["latent_size"]
        assert params["num_latent_layers"] >= 1
        self.layers = []
        for i in range(params["num_latent_layers"]):
            temp_input_size = self.input_size
            if i > 0: temp_input_size = self.output_size
            ffl = ffn_layer.FeedFowardNetwork(temp_input_size,
                                              params["filter_size"],
                                              params["relu_dropout"],
                                              train,
                                              params["allow_ffn_pad"],
                                              output_size=self.output_size,
                                              activation=tf.tanh)
            #output_size = self.output_size, activation=tf.nn.relu)
            ffl = PrePostProcessingWrapper(ffl,
                                           params,
                                           train,
                                           input_hidden_size=temp_input_size,
                                           output_hidden_size=self.output_size,
                                           norm=True,
                                           drop=True,
                                           residual=(i > 0))
            #norm=True, drop=True, residual=(i>0 and i<params["num_latent_layers"]-1))
            self.layers.append(ffl)

        # if use_std, do another feed forward.
        if self.params["use_std"]:
            self.mu_layer = tf.layers.Dense(params["latent_size"],
                                            use_bias=False,
                                            activation=tf.tanh,
                                            name="mu_layer")
            self.std_layer = tf.layers.Dense(params["latent_size"],
                                             use_bias=False,
                                             activation=tf.sigmoid,
                                             name="std_layer")
 def __init__(self, params, train):
     super(EncoderStack, self).__init__()
     self.layers = []
     
     assert not ('enc-self' in params["concrete_heads"] and params["alive_heads_enc_self"]), \
         "enc-self is passed as both with trainable concrete gates heads and fixed gates"
     assert not ('dec-self' in params["concrete_heads"] and params["alive_heads_dec_self"]), \
         "dec-self is passed as both with trainable concrete gates heads and fixed gates"
     assert not ('dec-enc' in params["concrete_heads"] and params["alive_heads_dec_enc"]), \
         "dec-enc is passed as both with trainable concrete gates heads and fixed gates"
     
     for i in range(params["num_hidden_layers"]):
         # Create sublayers for each layer.
         
         if 'enc-self' in params["concrete_heads"]:
             print("*** Encoder Concrete ***")
             self_attention_layer = attention_layer.SelfAttentionConcrete(
                 params["hidden_size"], params["num_heads"],
                 params["attention_dropout"], train, {'l0_penalty': 1.0}, concrete_coef=params["concrete_coef"])
         elif not params["alive_heads_enc_self"]:
             print("*** Encoder Plain ***")
             self_attention_layer = attention_layer.SelfAttention(
                 params["hidden_size"], params["num_heads"],
                 params["attention_dropout"], train)
         else:
             print("*** Encoder Fixed Alive ***")
             print("The fixed gates used for encoder self attention are : {}".format(params['alive_heads_enc_self']))
             self_attention_layer = attention_layer.SelfAttentionFixedAliveHeads(
                 params["hidden_size"], params["num_heads"],
                 params["attention_dropout"], train,
                 head_gate=params["alive_heads_enc_self"][i])
         
         feed_forward_network = ffn_layer.FeedFowardNetwork(
             params["hidden_size"], params["filter_size"],
             params["relu_dropout"], train, params["allow_ffn_pad"])
         
         self.layers.append([
             PrePostProcessingWrapper(self_attention_layer, params, train),
             PrePostProcessingWrapper(feed_forward_network, params, train)])
     
     # Create final layer normalization layer.
     self.output_normalization = LayerNormalization(params["hidden_size"])
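# Hypothetical params fragment for the flag-based head selection used in the
# encoder/decoder stacks above. The keys come from the snippets (note the decoder
# reads "enc-dec"/"alive_heads_enc_dec" while the assertions here use
# "dec-enc"/"alive_heads_dec_enc"); every value shown is an assumption.
_head_gate_params = {
    "concrete_heads": ["enc-self"],  # sublayers that train concrete (L0-style) head gates
    "concrete_coef": 1.0,            # weight on the concrete-gate penalty
    "alive_heads_enc_self": [],      # empty, or a per-layer list of fixed head gates
    "alive_heads_dec_self": [],
    "alive_heads_enc_dec": [],
    "alive_heads_dec_enc": [],
}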
    def __init__(self, params, train):
        super(EncoderStack, self).__init__()
        self.layers = []  # the layers are stored as a list
        # Build N layers with identical structure but separate parameters,
        # each a self-attention + feed-forward pair.
        for _ in range(params["num_hidden_layers"]):
            # Create sublayers for each layer: one self-attention and one FFN per layer.
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])

        # Create final layer normalization layer.
        self.output_normalization = LayerNormalization(params["hidden_size"])
    def __init__(self, params, train):
        super(EncoderStack, self).__init__()  # initialize the parent layer class
        self.layers = []  # store the layers in a list; each element is a [self-attention, ffn] pair
        for _ in range(params["num_hidden_layers"]):  # build N independent layers; this parameter is set to 6
            # Create sublayers for each layer.
            self_attention_layer = attention_layer.SelfAttention(  # self-attention layer
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(  # feed-forward layer
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])

            self.layers.append([
                PrePostProcessingWrapper(
                    self_attention_layer, params,
                    train),  # every sublayer goes through layer normalization and dropout
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])

        # Create final layer normalization layer.
        self.output_normalization = LayerNormalization(
            params["hidden_size"])  # how does this take effect? (see the call sketch below)
    def __init__(self, params, train):
        super(EncoderStack, self).__init__()
        self.layers = []
        for _ in range(params["num_hidden_layers"]):
            # Create sublayers for each layer.
            self_attention_layer_one = attention2_layer.SelfAttentionOne(
                params["hidden_size"], params["num_heads"],
                params["num_vir_entities"], params["attention_dropout"], train)
            self_attention_layer_two = attention2_layer.SelfAttentionTwo(
                params["hidden_size"], params["d2_model"], params["num_heads"],
                params["num_vir_entities"], params["attention_dropout"], train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], params["allow_ffn_pad"], train)  # note: train and allow_ffn_pad are ordered differently from the other snippets

            self.layers.append([
                self_attention_layer_one, self_attention_layer_two,
                feed_forward_network
            ])

        self.train = train
        self.postprocess_dropout = params["layer_postprocess_dropout"]
        self.d2_model = params["d2_model"]
        self.hidden_size = params["hidden_size"]
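    # Hedged sketch of the manual post-processing this variant presumably applies
    # around each sublayer, given that it stores postprocess_dropout instead of using
    # PrePostProcessingWrapper. The method name and exact placement are assumptions.
    def _layer_postprocess_sketch(self, x, y):
        # x: sublayer input (residual branch); y: sublayer output
        if self.train:
            y = tf.nn.dropout(y, 1.0 - self.postprocess_dropout)
        return x + y  # residual connection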