def inference(self): """ building blocks: encoder:6 layers.each layers has two sub-layers. the first is multi-head self-attention mechanism; the second is position-wise fully connected feed-forward network. for each sublayer. use LayerNorm(x+Sublayer(x)). all dimension=512. decoder:6 layers.each layers has three sub-layers. the second layer is performs multi-head attention over the ouput of the encoder stack. for each sublayer. use LayerNorm(x+Sublayer(x)). """ # 1.embedding for encoder input & decoder input # 1.1 position embedding for encoder input input_x_embeded = tf.nn.embedding_lookup( self.Embedding, self.input_x) #[None,sequence_length, embed_size] input_x_embeded = tf.multiply( input_x_embeded, tf.sqrt(tf.cast(self.d_model, dtype=tf.float32))) input_mask = tf.get_variable("input_mask", [self.sequence_length, 1], initializer=self.initializer) input_x_embeded = tf.add( input_x_embeded, input_mask) #[None,sequence_length,embed_size].position embedding. # 2. encoder encoder_class = Encoder(self.d_model, self.d_k, self.d_v, self.sequence_length, self.h, self.batch_size, self.num_layer, input_x_embeded, input_x_embeded, dropout_keep_prob=self.dropout_keep_prob, use_residual_conn=self.use_residual_conn) Q_encoded, K_encoded = encoder_class.encoder_fn() #K_v_encoder Q_encoded = tf.reshape( Q_encoded, shape=(self.batch_size, -1)) #[batch_size,sequence_length*d_model] with tf.variable_scope("output"): logits = tf.matmul( Q_encoded, self.W_projection ) + self.b_projection #logits shape:[batch_size*decoder_sent_length,self.num_classes] print("logits:", logits) return logits
def inference(self): """ building blocks: encoder:6 layers.each layers has two sub-layers. the first is multi-head self-attention mechanism; the second is position-wise fully connected feed-forward network. for each sublayer. use LayerNorm(x+Sublayer(x)). all dimension=512. decoder:6 layers.each layers has three sub-layers. the second layer is performs multi-head attention over the ouput of the encoder stack. for each sublayer. use LayerNorm(x+Sublayer(x)). """ # 1.embedding for encoder input & decoder input # 1.1 position embedding for encoder input input_x_embeded = tf.nn.embedding_lookup(self.Embedding,self.input_x) #[None,sequence_length, embed_size] input_x_embeded=tf.multiply(input_x_embeded,tf.sqrt(tf.cast(self.d_model,dtype=tf.float32))) input_mask=tf.get_variable("input_mask",[self.sequence_length,1],initializer=self.initializer) input_x_embeded=tf.add(input_x_embeded,input_mask) #[None,sequence_length,embed_size].position embedding. # 2. encoder encoder_class=Encoder(self.d_model,self.d_k,self.d_v,self.sequence_length,self.h,self.batch_size,self.num_layer,input_x_embeded,input_x_embeded,dropout_keep_prob=self.dropout_keep_prob,use_residual_conn=self.use_residual_conn) Q_encoded,K_encoded = encoder_class.encoder_fn() #K_v_encoder Q_encoded=tf.reshape(Q_encoded,shape=(self.batch_size,-1)) #[batch_size,sequence_length*d_model] with tf.variable_scope("output"): logits = tf.matmul(Q_encoded, self.W_projection) + self.b_projection #logits shape:[batch_size*decoder_sent_length,self.num_classes] print("logits:",logits) return logits
def inference(self): """ building blocks: encoder:6 layers.each layers has two sub-layers. the first is multi-head self-attention mechanism; the second is position-wise fully connected feed-forward network. for each sublayer. use LayerNorm(x+Sublayer(x)). all dimension=512. decoder:6 layers.each layers has three sub-layers. the second layer is performs multi-head attention over the ouput of the encoder stack. for each sublayer. use LayerNorm(x+Sublayer(x)). """ # 1.embedding for encoder input & decoder input # 1.1 position embedding for encoder input input_x_embeded = tf.nn.embedding_lookup(self.Embedding,self.input_x) #[None,sequence_length, embed_size] input_x_embeded=tf.multiply(input_x_embeded,tf.sqrt(tf.cast(self.d_model,dtype=tf.float32))) input_mask=tf.get_variable("input_mask",[self.sequence_length,1],initializer=self.initializer) input_x_embeded=tf.add(input_x_embeded,input_mask) #[None,sequence_length,embed_size].position embedding. # 1.2 position embedding for decoder input decoder_input_embedded = tf.nn.embedding_lookup(self.Embedding_label, self.decoder_input) #[None,decoder_sent_length,embed_size] decoder_input_embedded = tf.multiply(decoder_input_embedded, tf.sqrt(tf.cast(self.d_model,dtype=tf.float32))) decoder_input_mask=tf.get_variable("decoder_input_mask",[self.decoder_sent_length,1],initializer=self.initializer) decoder_input_embedded=tf.add(decoder_input_embedded,decoder_input_mask) # 2. encoder encoder_class=Encoder(self.d_model,self.d_k,self.d_v,self.sequence_length,self.h,self.batch_size,self.num_layer,input_x_embeded,input_x_embeded,dropout_keep_prob=self.dropout_keep_prob) Q_encoded,K_encoded = encoder_class.encoder_fn() #K_v_encoder # 3. decoder with attention ==>get last of output(hidden state)====>prepare to get logits mask = self.get_mask(self.decoder_sent_length) #d_model, d_k, d_v, sequence_length, h, batch_size, Q, K_s, K_v_encoder, decoder_sent_length, #num_layer = 6, type = 'decoder', is_training = True, mask = None decoder = Decoder(self.d_model, self.d_k, self.d_v, self.sequence_length, self.h, self.batch_size, decoder_input_embedded, decoder_input_embedded, K_encoded,self.decoder_sent_length, num_layer=self.num_layer,is_training=self.is_training,mask=mask,dropout_keep_prob=self.dropout_keep_prob) #,extract_word_vector_fn=extract_word_vector_fn Q_decoded, K_decoded=decoder.decoder_fn() #[batch_size,decoder_sent_length,d_model] K_decoded=tf.reshape(K_decoded,shape=(-1,self.d_model)) with tf.variable_scope("output"): print("self.W_projection2:",self.W_projection," ;K_decoded:",K_decoded) logits = tf.matmul(K_decoded, self.W_projection) + self.b_projection #logits shape:[batch_size*decoder_sent_length,self.num_classes] logits=tf.reshape(logits,shape=(self.batch_size,self.decoder_sent_length,self.num_classes)) #logits shape:[batch_size,decoder_sent_length,self.num_classes] return logits
def inference(self): """ building blocks: encoder:6 layers.each layers has two sub-layers. the first is multi-head self-attention mechanism; the second is position-wise fully connected feed-forward network. for each sublayer. use LayerNorm(x+Sublayer(x)). all dimension=512. decoder:6 layers.each layers has three sub-layers. the second layer is performs multi-head attention over the ouput of the encoder stack. for each sublayer. use LayerNorm(x+Sublayer(x)). """ # 1.embedding for encoder input & decoder input # 1.1 position embedding for encoder input input_x_embeded = tf.nn.embedding_lookup( self.Embedding, self.input_x) #[None,sequence_length, embed_size] input_x_embeded = tf.multiply( input_x_embeded, tf.sqrt(tf.cast(self.d_model, dtype=tf.float32))) input_mask = tf.get_variable("input_mask", [self.sequence_length, 1], initializer=self.initializer) input_x_embeded = tf.add( input_x_embeded, input_mask) #[None,sequence_length,embed_size].position embedding. input_x_embeded = tf.reshape( input_x_embeded, shape=(-1, self.embed_size)) #[batch_size*sequence_length,embed_size] # 1.2 position embedding for decoder input decoder_input_embedded = tf.nn.embedding_lookup( self.Embedding_label, self.decoder_input) #[None,decoder_sent_length,embed_size] decoder_input_embedded = tf.multiply( decoder_input_embedded, tf.sqrt(tf.cast(self.d_model, dtype=tf.float32))) decoder_input_mask = tf.get_variable("decoder_input_mask", [self.decoder_sent_length, 1], initializer=self.initializer) decoder_input_embedded = tf.add(decoder_input_embedded, decoder_input_mask) decoder_input_embedded = tf.reshape( decoder_input_embedded, shape=(-1, self.embed_size )) ##[batch_size*decoder_sent_length,embed_size] # 2. encoder encoder_class = Encoder(self.d_model, self.d_k, self.d_v, self.sequence_length, self.h, self.batch_size, self.num_layer, input_x_embeded, input_x_embeded) Q_encoded, K_encoded = encoder_class.encoder_fn() #K_v_encoder # 3. decoder with attention ==>get last of output(hidden state)====>prepare to get logits #check two parameters below: Q: should from decoder; K_s: should be the output of encoder decoder = Decoder(self.d_model, self.d_k, self.d_v, self.sequence_length, self.h, self.batch_size, decoder_input_embedded, decoder_input_embedded, decoder_input_embedded, K_encoded, self.decoder_sent_length) Q_decoded, K_decoded = decoder.decoder_fn( ) #[batch_size*decoder_sent_length,d_model] with tf.variable_scope("output"): print("self.W_projection2:", self.W_projection) logits = tf.matmul( K_decoded, self.W_projection ) + self.b_projection #logits shape:[batch_size*decoder_sent_length,self.num_classes] logits = tf.reshape( logits, shape=(self.batch_size, self.decoder_sent_length, self.num_classes) ) #logits shape:[batch_size,decoder_sent_length,self.num_classes] return logits