    def inference(self):
        """ Building blocks:
        encoder: 6 layers. Each layer has two sub-layers: the first is a multi-head self-attention mechanism;
                 the second is a position-wise fully connected feed-forward network.
                 For each sub-layer, use LayerNorm(x + Sublayer(x)). All dimensions = 512.
        decoder: 6 layers. Each layer has three sub-layers; the second performs multi-head attention over the output of the encoder stack.
                 For each sub-layer, use LayerNorm(x + Sublayer(x)).
        """
        # 1.embedding for encoder input & decoder input
        # 1.1 position embedding for encoder input
        input_x_embeded = tf.nn.embedding_lookup(
            self.Embedding, self.input_x)  #[None,sequence_length, embed_size]
        input_x_embeded = tf.multiply(
            input_x_embeded, tf.sqrt(tf.cast(self.d_model, dtype=tf.float32)))
        input_mask = tf.get_variable("input_mask", [self.sequence_length, 1],
                                     initializer=self.initializer)  # learned position embedding
        input_x_embeded = tf.add(
            input_x_embeded,
            input_mask)  #[None,sequence_length,embed_size]. position embedding is broadcast over embed_size.

        # 2. encoder
        encoder_class = Encoder(self.d_model,
                                self.d_k,
                                self.d_v,
                                self.sequence_length,
                                self.h,
                                self.batch_size,
                                self.num_layer,
                                input_x_embeded,
                                input_x_embeded,
                                dropout_keep_prob=self.dropout_keep_prob,
                                use_residual_conn=self.use_residual_conn)
        Q_encoded, K_encoded = encoder_class.encoder_fn()  #K_v_encoder

        Q_encoded = tf.reshape(
            Q_encoded,
            shape=(self.batch_size, -1))  #[batch_size,sequence_length*d_model]
        with tf.variable_scope("output"):
            logits = tf.matmul(
                Q_encoded, self.W_projection
            ) + self.b_projection  #logits shape:[batch_size,self.num_classes]
        print("logits:", logits)
        return logits
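
This first variant is encoder-only: Q_encoded is flattened to [batch_size, sequence_length*d_model] and projected directly to class logits, which implies W_projection has shape [sequence_length*d_model, num_classes]. The sketch below shows one way such projection variables could be declared; the function name make_projection_variables is hypothetical, chosen for illustration, and is not taken from the source.

import tensorflow as tf  # TF 1.x API, as in the snippets in this listing


def make_projection_variables(sequence_length, d_model, num_classes, initializer):
    # Sketch only (assumed shapes): W_projection maps the flattened encoder output
    # [batch_size, sequence_length*d_model] to class logits [batch_size, num_classes].
    with tf.variable_scope("projection"):
        W_projection = tf.get_variable(
            "W_projection",
            shape=[sequence_length * d_model, num_classes],
            initializer=initializer)
        b_projection = tf.get_variable("b_projection", shape=[num_classes])
    return W_projection, b_projection
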
    def inference(self):
        """ building blocks:
        encoder:6 layers.each layers has two   sub-layers. the first is multi-head self-attention mechanism; the second is position-wise fully connected feed-forward network.
               for each sublayer. use LayerNorm(x+Sublayer(x)). all dimension=512.
        decoder:6 layers.each layers has three sub-layers. the second layer is performs multi-head attention over the ouput of the encoder stack.
               for each sublayer. use LayerNorm(x+Sublayer(x)).
        """
        # 1.embedding for encoder input & decoder input
        # 1.1 position embedding for encoder input
        input_x_embeded = tf.nn.embedding_lookup(self.Embedding,self.input_x)  #[None,sequence_length, embed_size]
        input_x_embeded=tf.multiply(input_x_embeded,tf.sqrt(tf.cast(self.d_model,dtype=tf.float32)))
        input_mask=tf.get_variable("input_mask",[self.sequence_length,1],initializer=self.initializer)
        input_x_embeded=tf.add(input_x_embeded,input_mask) #[None,sequence_length,embed_size].position embedding.
        # 1.2 position embedding for decoder input
        decoder_input_embedded = tf.nn.embedding_lookup(self.Embedding_label, self.decoder_input) #[None,decoder_sent_length,embed_size]
        decoder_input_embedded = tf.multiply(decoder_input_embedded, tf.sqrt(tf.cast(self.d_model,dtype=tf.float32)))
        decoder_input_mask=tf.get_variable("decoder_input_mask",[self.decoder_sent_length,1],initializer=self.initializer)
        decoder_input_embedded=tf.add(decoder_input_embedded,decoder_input_mask)

        # 2. encoder
        encoder_class = Encoder(self.d_model, self.d_k, self.d_v, self.sequence_length, self.h, self.batch_size,
                                self.num_layer, input_x_embeded, input_x_embeded,
                                dropout_keep_prob=self.dropout_keep_prob)
        Q_encoded,K_encoded = encoder_class.encoder_fn() #K_v_encoder

        # 3. decoder with attention ==> take the decoder output (hidden states) ==> prepare to get logits
        mask = self.get_mask(self.decoder_sent_length)  # mask for decoder self-attention
        # Decoder signature: d_model, d_k, d_v, sequence_length, h, batch_size, Q, K_s, K_v_encoder,
        #                    decoder_sent_length, num_layer=6, type='decoder', is_training=True, mask=None
        decoder = Decoder(self.d_model, self.d_k, self.d_v, self.sequence_length, self.h, self.batch_size,
                          decoder_input_embedded, decoder_input_embedded, K_encoded,self.decoder_sent_length,
                          num_layer=self.num_layer,is_training=self.is_training,mask=mask,dropout_keep_prob=self.dropout_keep_prob) #,extract_word_vector_fn=extract_word_vector_fn
        Q_decoded, K_decoded=decoder.decoder_fn() #[batch_size,decoder_sent_length,d_model]
        K_decoded=tf.reshape(K_decoded,shape=(-1,self.d_model))
        with tf.variable_scope("output"):
            print("self.W_projection2:",self.W_projection," ;K_decoded:",K_decoded)
            logits = tf.matmul(K_decoded, self.W_projection) + self.b_projection #logits shape:[batch_size*decoder_sent_length,self.num_classes]
            logits=tf.reshape(logits,shape=(self.batch_size,self.decoder_sent_length,self.num_classes)) #logits shape:[batch_size,decoder_sent_length,self.num_classes]
        return logits
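
The decoder path above calls self.get_mask(self.decoder_sent_length), which is not shown in these snippets. A common choice for a decoder self-attention mask, and a plausible stand-in here, is a causal (lower-triangular) additive mask; the sketch below is an assumption for illustration, not the repository's actual get_mask.

import tensorflow as tf  # TF 1.x API, as in the snippets in this listing


def get_causal_mask(decoder_sent_length):
    # Lower-triangular matrix of ones: position i may attend only to positions <= i.
    ones = tf.ones([decoder_sent_length, decoder_sent_length])
    lower_triangle = tf.linalg.band_part(ones, -1, 0)
    # Large negative values above the diagonal; added to the attention scores before
    # softmax so that future positions receive (near) zero weight.
    return -1e9 * (1.0 - lower_triangle)
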
    def inference(self):
        """ building blocks:
        encoder:6 layers.each layers has two   sub-layers. the first is multi-head self-attention mechanism; the second is position-wise fully connected feed-forward network.
               for each sublayer. use LayerNorm(x+Sublayer(x)). all dimension=512.
        decoder:6 layers.each layers has three sub-layers. the second layer is performs multi-head attention over the ouput of the encoder stack.
               for each sublayer. use LayerNorm(x+Sublayer(x)).
        """
        # 1.embedding for encoder input & decoder input
        # 1.1 position embedding for encoder input
        input_x_embeded = tf.nn.embedding_lookup(
            self.Embedding, self.input_x)  #[None,sequence_length, embed_size]
        input_x_embeded = tf.multiply(
            input_x_embeded, tf.sqrt(tf.cast(self.d_model, dtype=tf.float32)))
        input_mask = tf.get_variable("input_mask", [self.sequence_length, 1],
                                     initializer=self.initializer)
        input_x_embeded = tf.add(
            input_x_embeded,
            input_mask)  #[None,sequence_length,embed_size].position embedding.
        input_x_embeded = tf.reshape(
            input_x_embeded,
            shape=(-1, self.embed_size))  #[batch_size*sequence_length,embed_size]
        # 1.2 position embedding for decoder input
        decoder_input_embedded = tf.nn.embedding_lookup(
            self.Embedding_label,
            self.decoder_input)  #[None,decoder_sent_length,embed_size]
        decoder_input_embedded = tf.multiply(
            decoder_input_embedded,
            tf.sqrt(tf.cast(self.d_model, dtype=tf.float32)))
        decoder_input_mask = tf.get_variable("decoder_input_mask",
                                             [self.decoder_sent_length, 1],
                                             initializer=self.initializer)
        decoder_input_embedded = tf.add(decoder_input_embedded,
                                        decoder_input_mask)
        decoder_input_embedded = tf.reshape(
            decoder_input_embedded,
            shape=(-1, self.embed_size))  #[batch_size*decoder_sent_length,embed_size]

        # 2. encoder
        encoder_class = Encoder(self.d_model, self.d_k, self.d_v,
                                self.sequence_length, self.h, self.batch_size,
                                self.num_layer, input_x_embeded,
                                input_x_embeded)
        Q_encoded, K_encoded = encoder_class.encoder_fn()  #K_v_encoder

        # 3. decoder with attention ==> take the decoder output (hidden states) ==> prepare to get logits
        # check the two parameters below: Q should come from the decoder; K_s should be the output of the encoder
        decoder = Decoder(self.d_model, self.d_k, self.d_v,
                          self.sequence_length, self.h, self.batch_size,
                          decoder_input_embedded, decoder_input_embedded,
                          decoder_input_embedded, K_encoded,
                          self.decoder_sent_length)
        Q_decoded, K_decoded = decoder.decoder_fn()  #[batch_size*decoder_sent_length,d_model]
        with tf.variable_scope("output"):
            print("self.W_projection2:", self.W_projection)
            logits = tf.matmul(
                K_decoded, self.W_projection
            ) + self.b_projection  #logits shape:[batch_size*decoder_sent_length,self.num_classes]
            logits = tf.reshape(
                logits,
                shape=(self.batch_size, self.decoder_sent_length,
                       self.num_classes)
            )  #logits shape:[batch_size,decoder_sent_length,self.num_classes]
        return logits
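
The logits returned here have shape [batch_size, decoder_sent_length, num_classes]. One typical way to train on such per-position logits is a sparse cross-entropy loss averaged over positions; the sketch below assumes an integer label tensor decoder_output of shape [batch_size, decoder_sent_length], a hypothetical name chosen for illustration rather than taken from the source.

import tensorflow as tf  # TF 1.x API, as in the snippets in this listing


def sequence_loss_sketch(logits, decoder_output):
    # logits:         [batch_size, decoder_sent_length, num_classes]
    # decoder_output: [batch_size, decoder_sent_length], integer class ids (assumed)
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=decoder_output, logits=logits)  # [batch_size, decoder_sent_length]
    return tf.reduce_mean(losses)
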