Example #1
 def sub_layer_multi_head_attention(self,
                                    layer_index,
                                    Q,
                                    K_s,
                                    type,
                                    mask=None):  # COMMON FUNCTION
     """
     multi head attention as sub layer
     :param layer_index: index of layer number
     :param Q: shape should be: [batch_size*sequence_length,embed_size]
     :param k_s: shape should be: [batch_size*sequence_length,embed_size]
     :param type: encoder,decoder or encoder_decoder_attention
     :param use_mask:whether to use mask or not.if use mask,illegal connection will be mask as huge big negative value.so it's possiblitity will become zero.
     input of this layer's shape:[batch_size,sequence_length]
     :return: output of multi head attention.shape:[batch_size,sequence_length,d_model]
     """
     print(
         "===================================>base_model.sub_layer_multi_head_attention.type:",
         type)
     with tf.variable_scope("base_model_sub_layer_multi_head_attention_" +
                            type + str(layer_index)):
         # below is to handle attention for encoder and decoder with different lengths:
         length = self.decoder_sent_length if (
             type != 'encoder'
             and self.sequence_length != self.decoder_sent_length
         ) else self.sequence_length
         #1. get V as learned parameters
         V_s = tf.get_variable(
             "V_s",
             shape=(self.batch_size * length, self.d_model),
             initializer=self.initializer
         )  # TODO: the variable's shape should not include batch_size.
         #2. call function of multi head attention to get result
         multi_head_attention_class = MultiHeadAttention(
             Q,
             K_s,
             V_s,
             self.d_model,
             self.d_k,
             self.d_v,
             self.sequence_length,
             self.h,
             mask=mask)
         sub_layer_multi_head_attention_output = multi_head_attention_class.multi_head_attention_fn(
         )  # [batch_size*sequence_length,d_model]
     return sub_layer_multi_head_attention_output  # [batch_size*sequence_length,d_model]
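
The mask behavior described in the docstring (illegal connections pushed to a huge negative score so that softmax gives them near-zero probability) is the standard scaled dot-product attention trick. The MultiHeadAttention class itself is defined elsewhere in the repository; the snippet below is only a minimal single-head NumPy sketch of that mechanism, assuming the usual Transformer formulation. All names, shapes and the causal mask are illustrative.

import numpy as np

def scaled_dot_product_attention(Q, K, V, mask=None):
    # Q, K, V: [sequence_length, d_k]; mask: [sequence_length, sequence_length], 1 = allowed
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)                          # [seq_len, seq_len]
    if mask is not None:
        # masked positions get a huge negative score, so softmax is ~0 there
        scores = np.where(mask == 0, -1e9, scores)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights = weights / weights.sum(axis=-1, keepdims=True)  # row-wise softmax
    return weights @ V                                       # [seq_len, d_k]

# toy usage with a causal (lower-triangular) mask, as in decoder self-attention
seq_len, d_k = 4, 8
rng = np.random.default_rng(0)
Q, K, V = (rng.normal(size=(seq_len, d_k)) for _ in range(3))
out = scaled_dot_product_attention(Q, K, V, mask=np.tril(np.ones((seq_len, seq_len))))
print(out.shape)  # (4, 8)
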
 def sub_layer_multi_head_attention(
         self,
         layer_index,
         Q,
         K_s,
         type,
         mask=None,
         is_training=None,
         dropout_keep_prob=None):  # COMMON FUNCTION
     """
     multi head attention as sub layer
     :param layer_index: index of layer number
     :param Q: shape should be: [batch_size,sequence_length,embed_size]
     :param k_s: shape should be: [batch_size,sequence_length,embed_size]
     :param type: encoder,decoder or encoder_decoder_attention
     :param mask: when use mask,illegal connection will be mask as huge big negative value.so it's possiblitity will become zero.
     :return: output of multi head attention.shape:[batch_size,sequence_length,d_model]
     """
     with tf.variable_scope("base_mode_sub_layer_multi_head_attention_" +
                            type + str(layer_index)):
         # below is to handle attention for encoder and decoder with different lengths:
         #length=self.decoder_sent_length if (type!='encoder' and self.sequence_length!=self.decoder_sent_length) else self.sequence_length #TODO this may be useful
         length = self.sequence_length
         #1. get V as learned parameters
         V_s = tf.get_variable("V_s",
                               shape=(self.batch_size, length,
                                      self.d_model),
                               initializer=self.initializer)
         #2. call function of multi head attention to get result
         multi_head_attention_class = MultiHeadAttention(
             Q,
             K_s,
             V_s,
             self.d_model,
             self.d_k,
             self.d_v,
             self.sequence_length,
             self.h,
             type=type,
             is_training=is_training,
             mask=mask,
             dropout_rate=(1.0 - dropout_keep_prob))
         sub_layer_multi_head_attention_output = multi_head_attention_class.multi_head_attention_fn(
         )  # [batch_size,sequence_length,d_model]
     return sub_layer_multi_head_attention_output  # [batch_size,sequence_length,d_model]
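
Both variants delegate the actual computation to MultiHeadAttention(Q, K_s, V_s, d_model, d_k, d_v, sequence_length, h, ...), whose source is not shown on this page. Below is a hedged NumPy sketch of what a multi-head attention function with this parameterization conventionally computes: per-head Q/K/V projections, scaled dot-product attention, concatenation of the heads, and an output projection back to d_model. The random projection matrices and the example values d_model=512, d_k=d_v=64, h=8 are illustrative assumptions, not the repository's configuration.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def multi_head_attention(Q, K, V, d_model, d_k, d_v, h, rng):
    # Q, K, V: [sequence_length, d_model]
    heads = []
    for _ in range(h):
        W_q = rng.normal(size=(d_model, d_k)) / np.sqrt(d_model)
        W_k = rng.normal(size=(d_model, d_k)) / np.sqrt(d_model)
        W_v = rng.normal(size=(d_model, d_v)) / np.sqrt(d_model)
        q, k, v = Q @ W_q, K @ W_k, V @ W_v
        scores = q @ k.T / np.sqrt(d_k)                   # [seq_len, seq_len]
        heads.append(softmax(scores) @ v)                 # [seq_len, d_v]
    concat = np.concatenate(heads, axis=-1)               # [seq_len, h*d_v]
    W_o = rng.normal(size=(h * d_v, d_model)) / np.sqrt(h * d_v)
    return concat @ W_o                                   # [seq_len, d_model]

rng = np.random.default_rng(0)
x = rng.normal(size=(6, 512))                             # one sequence, d_model=512
out = multi_head_attention(x, x, x, d_model=512, d_k=64, d_v=64, h=8, rng=rng)
print(out.shape)  # (6, 512)
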
Example #4
    def inference(self):
        """main computation graph here: 1.Word Encoder. 2.Word Attention. 3.Sentence Encoder 4.Sentence Attention 5.linear classifier"""
        # 1.Word Encoder
        # 1.1 embedding of words
        input_x = tf.split(
            self.input_x, self.num_sentences, axis=1
        )  # a list. length:num_sentences.each element is:[None,self.sequence_length/num_sentences]
        input_x = tf.stack(
            input_x, axis=1
        )  # shape:[None,self.num_sentences,self.sequence_length/num_sentences]
        self.embedded_words = tf.nn.embedding_lookup(
            self.Embedding,
            input_x)  # [None,num_sentences,sentence_length,embed_size]
        embedded_words_reshaped = tf.reshape(
            self.embedded_words,
            shape=[-1, self.sequence_length, self.embed_size
                   ])  # [batch_size*num_sentences,sentence_length,embed_size]
        # 1.2 forward gru
        hidden_state_forward_list = self.gru_forward_word_level(
            embedded_words_reshaped
        )  # a list,length is sentence_length, each element is [batch_size*num_sentences,hidden_size]
        # 1.3 backward gru
        hidden_state_backward_list = self.gru_backward_word_level(
            embedded_words_reshaped
        )  # a list,length is sentence_length, each element is [batch_size*num_sentences,hidden_size]
        # 1.4 concat forward hidden state and backward hidden state. hidden_state: a list.len:sentence_length,element:[batch_size*num_sentences,hidden_size*2]
        self.hidden_state = [
            tf.concat([h_forward, h_backward], axis=1)
            for h_forward, h_backward in zip(hidden_state_forward_list,
                                             hidden_state_backward_list)
        ]  # hidden_state:list,len:sentence_length,element:[batch_size*num_sentences,hidden_size*2]

        # 2.Word Attention
        # for each sentence.
        sentence_representation = self.attention_word_level(
            self.hidden_state
        )  # output:[batch_size*num_sentences,hidden_size*2]
        sentence_representation = tf.reshape(
            sentence_representation,
            shape=[-1, self.num_sentences, self.hidden_size * 2
                   ])  # shape:[batch_size,num_sentences,hidden_size*2]

        # 2.====================== Multi Head Attention + Layer normalization================================
        V_sentence = tf.get_variable(
            "V_sentence",
            shape=sentence_representation.get_shape().as_list(),
            initializer=self.initializer)
        multi_head_attention_class = MultiHeadAttention(
            sentence_representation,
            sentence_representation,
            V_sentence,
            200,
            25,
            25,
            self.sequence_length,
            8,
            type='word_level')
        sentence_representation = multi_head_attention_class.multi_head_attention_fn(
        )  # shape:[batch_size,num_sentences,d_model]

        #postion_wise_feed_forward = PositionWiseFeedFoward(sentence_representation,'word_level',d_model=sentence_representation.get_shape()[-1])
        #sentence_representation = postion_wise_feed_forward.position_wise_feed_forward_fn()

        #sentence_representation=self.layer_normalization(sentence_representation,"sentence_representation") #add layer_normalization
        #with tf.name_scope("dropout"):#
        #    sentence_representation = tf.nn.dropout(sentence_representation,keep_prob=self.dropout_keep_prob)  # shape:[None,hidden_size*4]

        # 3.Sentence Encoder
        # 3.1) forward gru for sentence
        hidden_state_forward_sentences = self.gru_forward_sentence_level(
            sentence_representation
        )  # a list.length is sentence_length, each element is [None,hidden_size]
        # 3.2) backward gru for sentence
        hidden_state_backward_sentences = self.gru_backward_sentence_level(
            sentence_representation
        )  # a list,length is sentence_length, each element is [None,hidden_size]
        # 3.3) concat forward hidden state and backward hidden state
        # below hidden_state_sentence is a list,len:sentence_length,element:[None,hidden_size*2]
        self.hidden_state_sentence = [
            tf.concat([h_forward, h_backward], axis=1)
            for h_forward, h_backward in zip(hidden_state_forward_sentences,
                                             hidden_state_backward_sentences)
        ]

        # 3. ======================Multi-head Attention + Layer Normalization================================
        #####self.hidden_state_sentence=tf.stack(self.hidden_state_sentence,axis=1) #[batch,sentence_length,hidden_size*2]
        #####print("self.hidden_state_sentence0:", self.hidden_state_sentence)  # 8, 6, 200
        ####V_encoder= tf.get_variable("V_encoder", shape=self.hidden_state_sentence.get_shape().as_list(),initializer=self.initializer)
        #####multi_head_attention_class = MultiHeadAttention(self.hidden_state_sentence, self.hidden_state_sentence, V_encoder, 400, 50, 50, self.sequence_length, 8,type='word_encoder')
        ####hidden_state_sentence=multi_head_attention_class.multi_head_attention_fn() #shape:[sequence_length,d_model]

        ######postion_wise_feed_forward = PositionWiseFeedFoward(hidden_state_sentence,'sentence_level',d_model=hidden_state_sentence.get_shape()[-1])
        ######hidden_state_sentence = postion_wise_feed_forward.position_wise_feed_forward_fn()

        ######hidden_state_sentence=self.layer_normalization(self.hidden_state_sentence,"hidden_state_sentence") #add layer_normalization
        ######print("hidden_state_sentence:",hidden_state_sentence) #[8, 6, 200]
        ######self.hidden_state_sentence=[tf.squeeze(e,axis=1) for e in tf.split(hidden_state_sentence,hidden_state_sentence.get_shape().as_list()[1],axis=1)]

        # 4.Sentence Attention
        print("self.hidden_state_sentence:",
              self.hidden_state_sentence)  # 8, 6, 200
        document_representation = self.attention_sentence_level(
            self.hidden_state_sentence)  # shape:[None,hidden_size*4]
        document_representation = self.layer_normalization(
            document_representation,
            "document_representation")  # add layer_normalization
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(
                document_representation,
                keep_prob=self.dropout_keep_prob)  # shape:[None,hidden_size*4]
        # 5. logits(use linear layer)and predictions(argmax)
        with tf.name_scope("output"):
            logits = tf.matmul(
                self.h_drop, self.W_projection
            ) + self.b_projection  # shape:[None,self.num_classes]==tf.matmul([None,hidden_size*2],[hidden_size*2,self.num_classes])
        return logits
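
attention_word_level and attention_sentence_level are defined elsewhere in the model; the sketch below shows the attention they are assumed to implement, following the standard Hierarchical Attention Network formulation (a one-layer MLP, a learned context vector, softmax weights, and a weighted sum of the bi-GRU states). The weight names and the 8 x 6 x 200 example dimensions (taken from the debug print above) are illustrative assumptions only.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def han_attention(hidden_states, W, b, context_vector):
    # hidden_states: [batch, seq_len, hidden_size*2] (concatenated bi-GRU states)
    u = np.tanh(hidden_states @ W + b)                     # [batch, seq_len, hidden_size*2]
    scores = u @ context_vector                            # [batch, seq_len]
    alpha = softmax(scores, axis=-1)                       # attention weights
    return (alpha[..., None] * hidden_states).sum(axis=1)  # weighted sum: [batch, hidden_size*2]

rng = np.random.default_rng(0)
batch, num_sentences, hidden2 = 8, 6, 200
h = rng.normal(size=(batch, num_sentences, hidden2))       # e.g. sentence-level hidden states
W = rng.normal(size=(hidden2, hidden2)) / np.sqrt(hidden2)
b = np.zeros(hidden2)
u_w = rng.normal(size=(hidden2,))                          # learned context vector
doc_vec = han_attention(h, W, b, u_w)
print(doc_vec.shape)  # (8, 200)
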