def init_placeholders(self): """初始化训练、预测所需的变量 """ # 编码器输入,shape=(batch_size, time_step) # 有 batch_size 句话,每句话是最大长度为 time_step 的 index 表示 self.encoder_inputs = tf.placeholder(dtype=tf.int32, shape=(self.batch_size, None), name='encoder_inputs') # 编码器长度输入,shape=(batch_size, 1) # 指的是 batch_size 句话每句话的长度 self.encoder_inputs_length = tf.placeholder( dtype=tf.int32, shape=(self.batch_size, ), name='encoder_inputs_length') # 编码器输入,shape=(batch_size, time_step) # 有 batch_size 句话,每句话是最大长度为 time_step 的 index 表示 self.x = tf.placeholder(dtype=tf.int32, shape=(self.batch_size, None), name='x') # 编码器长度输入,shape=(batch_size, 1) # 指的是 batch_size 句话每句话的长度 self.xl = tf.placeholder(dtype=tf.int32, shape=(self.batch_size, ), name='xl') # 编码器的embedding with tf.device(_get_embed_device(self.input_vocab_size)): self.encoder_embeddings = tf.get_variable( name='embedding', shape=(self.input_vocab_size, self.embedding_size), initializer=self.initializer, ) if self.mode == 'train': self.targets = tf.placeholder(dtype=tf.int64, shape=(self.batch_size, ), name='target')
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder half of the graph.

    Args:
        encoder_outputs: outputs of the encoder RNN.
        encoder_state: final encoder state(s), used to initialize the
            decoder cell.

    In 'train' mode this defines self.loss, self.loss_rewards,
    self.loss_add, self.train_entropy and self.decoder_pred_train.
    In 'decode' mode it defines self.decoder_pred_decode (and
    self.beam_prob when beam search is enabled).
    """
    with tf.variable_scope('decoder') as decoder_scope:
        (
            self.decoder_cell,
            self.decoder_initial_state
        ) = self.build_decoder_cell(encoder_outputs, encoder_state)

        # Decoder embedding: shared with the encoder, loaded from a
        # pretrained matrix via a placeholder/assign pair, or trained
        # from scratch.  Device chosen by vocabulary size.
        with tf.device(_get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrained_embedding:
                self.decoder_embeddings = tf.Variable(
                    tf.constant(0.0,
                                shape=(self.target_vocab_size,
                                       self.embedding_size)),
                    trainable=True,
                    name='embeddings'
                )
                self.decoder_embeddings_placeholder = \
                    tf.placeholder(tf.float32,
                                   (self.target_vocab_size,
                                    self.embedding_size))
                self.decoder_embeddings_init = \
                    self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embedding',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32
                )

        # Projection from cell output to vocabulary-sized logits.
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection')

        if self.mode == 'train':
            # FIX: attribute was misspelled 'decoder_inputs_embdedded';
            # renamed to match the sibling implementations.
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train
            )
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            # Helper that feeds ground-truth inputs (teacher forcing).
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper'
            )
            # output_layer is deliberately NOT applied here: projecting
            # once over the whole batch after decoding is faster.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state
            )
            # Maximum number of decoder time steps in this batch.
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length
            )
            (
                outputs,
                self.final_state,
                _
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope
            )
            # One projection over the whole decoded sequence.
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output
            )
            # Mask out padded time steps when computing losses.
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks'
            )
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(decoder_logits_train,
                                                    (1, 0, 2))
            self.decoder_pred_train = tf.argmax(
                decoder_logits_train, axis=-1,
                name='decoder_pred_train'
            )
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,
                    logits=decoder_logits_train)
            # Reward-weighted mask for policy-gradient style training.
            self.masks_rewards = self.masks * self.rewards
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True
            )
            # FIX: was `self.add_loss = self.loss + self.add_loss`, which
            # clobbered the add_loss input; sibling implementations store
            # the combined value in loss_add.
            self.loss_add = self.loss + self.add_loss
        elif self.mode == 'decode':
            # Inference: start every batch element from the START token.
            start_token = tf.tile(
                [WordSequence.START],
                [self.batch_size]
            )
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Map token ids to their decoder embeddings."""
                return tf.nn.embedding_lookup(
                    self.decoder_embeddings,
                    inputs
                )

            if not self.use_beamsearch_decode:
                # Greedy decoding: argmax at each step.
                decoder_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_token,
                    end_token=end_token,
                    embedding=embed_and_input_proj
                )
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoder_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection
                )
            else:
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_token,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection
                )
            if self.max_decode_step is not None:
                max_decoder_step = self.max_decode_step
            else:
                # Default: allow up to 4x the longest input length.
                max_decoder_step = tf.round(tf.reduce_max(
                    self.encoder_inputs_length
                ) * 4)
            # FIX: seq2seq.dynamic_decode returns a 3-tuple
            # (final_outputs, final_state, final_sequence_lengths); the
            # original unpacked only two values, which raises ValueError
            # at graph-build time.
            (
                self.decoder_outputs_decode,
                self.final_state,
                _
            ) = seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=self.time_major,
                maximum_iterations=max_decoder_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope
            )
            if not self.use_beamsearch_decode:
                dod = self.decoder_outputs_decode
                # FIX: original read the still-undefined
                # self.decoder_pred_decode, stored the result in the
                # wrong attribute (decoder_pred_train) and transposed
                # unconditionally; it must take dod.sample_id and only
                # transpose for time-major output, as the sibling
                # implementations do.
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0)
                    )
            else:
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2)
                    )
                # Reorder to (batch, beam, time).
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode,
                    perm=[0, 2, 1]
                )
                # FIX: beam scores live on the decoder output structure,
                # not on the predicted-ids tensor; the original assigned
                # dod = self.decoder_pred_decode and then failed with
                # AttributeError on .beam_search_decoder_output.
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def build_encoder(self):
    """Build the encoder.

    :return: (encoder_outputs, encoder_state) of the (bi)directional RNN.
    """
    with tf.variable_scope('encoder'):
        encoder_cell = self.build_encoder_cell()
        # Embedding placed on CPU or GPU depending on vocabulary size.
        with tf.device(_get_embed_device(self.input_vocab_size)):
            if self.pretrained_embedding:
                # Zero-initialized variable, filled later by feeding
                # encoder_embeddings_placeholder and running
                # encoder_embeddings_init.
                self.encoder_embeddings = tf.Variable(
                    tf.constant(0.0,
                                shape=(self.input_vocab_size,
                                       self.embedding_size)),
                    trainable=True,
                    name='embeddings')
                self.encoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.input_vocab_size, self.embedding_size)
                )
                self.encoder_embeddings_init = \
                    self.encoder_embeddings.assign(
                        self.encoder_embeddings_placeholder)
            else:
                self.encoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.input_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)
        # Embedded inputs: (batch_size, time_step, embedding_size).
        self.encoder_inputs_embedded = tf.nn.embedding_lookup(
            params=self.encoder_embeddings,
            ids=self.encoder_inputs)
        if self.use_residual:
            # Project embeddings so residual additions match the cell size.
            # NOTE(review): sibling implementations use self.hidden_units
            # here; confirm this class really defines hidden_size.
            self.encoder_inputs_embedded = \
                layers.dense(self.encoder_inputs_embedded,
                             self.hidden_size,
                             use_bias=False,
                             name='encoder_residual_projection')
        inputs = self.encoder_inputs_embedded
        if self.time_major:
            # time-major layout: (time_step, batch_size, embedding_size)
            inputs = tf.transpose(inputs, (1, 0, 2))
        # NOTE(review): sibling implementations call this flag
        # self.bidirectional; confirm this class defines bidirection.
        if not self.bidirection:
            # Unidirectional RNN.
            (
                encoder_outputs,
                encoder_state
            ) = tf.nn.dynamic_rnn(
                cell=encoder_cell,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True)
        else:
            # Bidirectional RNN: needs an extra merge step.
            encoder_cell_bw = self.build_encoder_cell()
            (
                (encoder_fw_outputs, encoder_bw_outputs),
                (encoder_fw_state, encoder_bw_state)
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_bw=encoder_cell_bw,
                cell_fw=encoder_cell,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True)
            # Concatenate the two directions' outputs along the feature
            # axis.  NOTE(review): bw comes before fw here, while the
            # sibling implementations concatenate (fw, bw) — confirm the
            # intended order.
            encoder_outputs = tf.concat(
                (encoder_bw_outputs, encoder_fw_outputs), 2)
            # Interleave per-layer states as (fw0, bw0, fw1, bw1, ...).
            encoder_state = []
            for i in range(self.depth):
                encoder_state.append(encoder_fw_state[i])
                encoder_state.append(encoder_bw_state[i])
            encoder_state = tuple(encoder_state)
    return encoder_outputs, encoder_state
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder.

    Args:
        encoder_outputs: outputs of the encoder RNN.
        encoder_state: final encoder state(s), used to initialize the
            decoder cell.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        # Create the decoder cell and its initial state.
        (self.decoder_cell, self.decoder_initial_state)\
            = self.build_decoder_cell(encoder_outputs, encoder_state)
        # Decoder embedding; CPU or GPU chosen by vocabulary size.
        with tf.device(_get_embed_device(self.target_vocab_size)):
            # Share the encoder embedding, load a pretrained matrix,
            # or initialize a fresh one for training.
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            # Pretrained embedding: zero-init variable filled later via
            # the placeholder/assign pair below.
            elif self.pretrained_embedding:
                self.decoder_embeddings = tf.Variable(
                    tf.constant(
                        0.0,
                        shape=(self.target_vocab_size, self.embedding_size)),
                    trainable=True,  # still fine-tunable
                    name='embeddings')
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.target_vocab_size, self.embedding_size))
                self.decoder_embeddings_init = \
                    self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32
                )
        # Output projection of the decoder.
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,  # one logit per vocabulary entry
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection'
        )
        if self.mode == 'train':
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train  # defined at placeholder init
            )
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            # Helper that feeds ground-truth inputs during training
            # (teacher forcing).
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper'
            )
            # output_layer is NOT applied here on purpose: projecting at
            # every time step is slow; we project once after decoding.
            # For this trick to work the `scope` argument of
            # dynamic_decode below must be set.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,  # from build_decoder_cell
            )
            # Maximum decoder time_steps in the current batch.
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
            # Dynamic decoding loop.
            (outputs, self.final_state, _)\
                = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    # True: time (seq_length) is the leading axis;
                    # False: batch_size is the leading axis.
                    output_time_major=self.time_major,
                    # Track finished sequences: once a sequence finishes,
                    # subsequent step outputs are zeroed.
                    impute_finished=True,
                    # Upper bound on decoding steps (max words generated).
                    maximum_iterations=max_decoder_length,
                    # Parallelism of the underlying while_loop.
                    parallel_iterations=self.parallel_iterations,
                    # On OOM, allow swapping tensors from GPU to host memory.
                    swap_memory=True,
                    scope=decoder_scope)
            # Project all decoded outputs through the dense layer in one
            # go — recommended by the official NMT code for efficiency.
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output  # decoder outputs defined above
            )
            # masks: mark valid vs padded time steps.
            # tf.sequence_mask builds the per-sequence-length mask:
            """
            tf.sequence_mask([1,2], 4)
            --> [[ True False False False]
                 [ True  True False False]]
            """
            # shape: [batch_size, max_time_step + 1]
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks'
            )
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(decoder_logits_train,
                                                    (1,0,2))
            # Training-time prediction: of the vocab-sized logits, take
            # the argmax index as the predicted token id.
            self.decoder_pred_train = tf.argmax(
                decoder_logits_train, axis=-1,
                name='decoder_pred_train')
            # The variables below support special training schemes
            # (custom rewards applied through modified masks/losses).
            # train_entropy = cross entropy
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,    # labels
                    logits=decoder_logits_train)   # predictions
            self.masks_rewards = self.masks * self.rewards
            # seq2seq sequence loss: per-step losses summed and averaged,
            # weighted by the mask (padded positions get weight 0).
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,  # reward-weighted, unlike below
                average_across_timesteps=True,  # divide by total weight
                average_across_batch=True,      # divide by batch size
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss_add = self.loss + self.add_loss
        elif self.mode == 'decode':
            # Inference mode (no training).
            # Tile the START token across the batch; see
            # https://blog.csdn.net/tsyccnh/article/details/82459859
            start_tokens = tf.tile(
                [WordSequence.START],
                [self.batch_size]
            )
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Input projection wrapper:
                map token ids to their embeddings.
                """
                return tf.nn.embedding_lookup(
                    self.decoder_embeddings,
                    inputs
                )
            # Without beam search: define a greedy helper/decoder pair.
            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding:
                # uses the argmax of the output.
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,  # start token
                    end_token=end_token,        # end token
                    embedding=embed_and_input_proj  # id -> embedding fn
                )
                # Basic decoder performs greedy decoding at each time step
                # print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection
                )
            else:
                # Beam search decoding.
                # Beamsearch is used to approximately
                # find the most likely translation
                # print("building beamsearch decoder..")
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection,
                )
            # Usually an explicit maximum is supplied.
            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                # Default: decode up to 4x the longest input length.
                max_decode_step = tf.round(tf.reduce_max(
                    self.encoder_inputs_length) * 4)
            (
                self.decoder_outputs_decode,  # decoded outputs
                self.final_state,             # final state
                _  # self.decoder_outputs_length_decode
            ) = seq2seq.dynamic_decode(
                decoder=inference_decoder,  # greedy or beam, chosen above
                output_time_major=self.time_major,
                # impute_finished=True,   # error occurs
                maximum_iterations=max_decode_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope
            )
            # Greedy path (beam search disabled):
            # dynamic_decode yields a namedtuple (rnn_outputs, sample_id):
            #   rnn_output: [batch_size, decoder_targets_length, vocab_size],
            #     per-step per-token probabilities, usable for a loss
            #   sample_id: [batch_size, ...], tf.int32 — the decoded ids,
            #     i.e. the final answer
            if not self.use_beamsearch_decode:
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id  # the final answer
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            # Beam search path — see
            # https://blog.csdn.net/liuchonge/article/details/79021938
            # decoder_outputs_decode holds
            # (predicted_ids, beam_search_decoder_output):
            #   predicted_ids: [batch_size, decoder_targets_length, beam_size]
            #   beam_search_decoder_output: BeamSearchDecoderOutput
            #     namedtuple(scores, predicted_ids, parent_ids)
            # so returning predicted_ids (or sample_id) gives the result.
            else:
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode,
                    perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def build_encoder(self):
    """Build the encoder.

    Returns the encoder outputs and the final state of each layer.
    """
    # print("building encoder")
    with tf.variable_scope('encoder'):
        # Build the encoder cell.
        encoder_cell = self.build_encoder_cell()
        # Encoder embedding.  tf.device() pins the following ops to a
        # specific GPU/CPU, e.g. tf.device('/gpu:1') runs them on the
        # second GPU; _get_embed_device picks by vocabulary size.
        with tf.device(_get_embed_device(self.input_vocab_size)):
            # Load a pretrained embedding.
            if self.pretrained_embedding:
                self.encoder_embeddings = tf.Variable(
                    tf.constant(
                        0.0,
                        shape=(self.input_vocab_size, self.embedding_size)
                    ),
                    trainable=True,
                    name='embeddings'
                )
                self.encoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.input_vocab_size, self.embedding_size)
                )
                self.encoder_embeddings_init = \
                    self.encoder_embeddings.assign(self.encoder_embeddings_placeholder)
            else:
                # Fresh embedding initialized by self.initializer;
                # get_variable fetches or creates the variable.
                self.encoder_embeddings = tf.get_variable(
                    name='embedding',
                    shape=(self.input_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32
                )
        # Embedded input: shape = (batch_size, time_step, embedding_size)
        self.encoder_inputs_embedded = tf.nn.embedding_lookup(
            params=self.encoder_embeddings,
            ids=self.encoder_inputs
        )
        # With residual connections the input must first be projected
        # to the cell's hidden size.
        if self.use_residual:
            self.encoder_inputs_embedded = \
                layers.dense(self.encoder_inputs_embedded,
                             self.hidden_units,
                             use_bias=False,
                             name='encoder_residual_projection')
        # Encoder input.
        inputs = self.encoder_inputs_embedded
        # time_major changes which axis is leading.
        if self.time_major:
            inputs = tf.transpose(inputs, (1, 0, 2))
        if not self.bidirectional:
            # Unidirectional RNN.
            (
                encoder_outputs,
                encoder_state
            ) = tf.nn.dynamic_rnn(
                cell=encoder_cell,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True
            )
        else:
            # Bidirectional RNN is more involved.
            # Build the backward encoder cell; `outputs` are the per-step
            # outputs, `state` is e.g. the LSTM (c, h) pair.
            encoder_cell_bw = self.build_encoder_cell()
            (
                (encoder_fw_outputs, encoder_bw_outputs),
                (encoder_fw_state, encoder_bw_state)
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=encoder_cell,
                cell_bw=encoder_cell_bw,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True
            )
            # First merge the outputs of the two directions.
            encoder_outputs = tf.concat(
                (encoder_fw_outputs, encoder_bw_outputs), 2)
            encoder_state = []
            for i in range(self.depth):
                encoder_state.append(encoder_fw_state[i])  # forward
                encoder_state.append(encoder_bw_state[i])  # backward
            # encoder_state = (fw_state[0], bw_state[0], fw_state[1], ...)
            encoder_state = tuple(encoder_state)
    return encoder_outputs, encoder_state
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder.

    Args:
        encoder_outputs: outputs of the encoder RNN.
        encoder_state: final encoder state(s), used to initialize the
            decoder cell.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        # Decoder cell and its initial state.
        (self.decoder_cell,
         self.decoder_initial_state) = self.build_decoder_cell(
             encoder_outputs, encoder_state)
        # Decoder embedding: shared with the encoder, loaded from a
        # pretrained matrix via a placeholder/assign pair, or trained
        # from scratch.  Device chosen by vocabulary size.
        with tf.device(_get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrained_embedding:
                self.decoder_embeddings = tf.Variable(tf.constant(
                    0.0,
                    shape=(self.target_vocab_size, self.embedding_size)),
                    trainable=True,
                    name='embeddings')
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.target_vocab_size, self.embedding_size))
                self.decoder_embeddings_init = \
                    self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)
        # Projection from cell output to vocabulary-sized logits.
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection')
        if self.mode == 'train':
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            # Helper feeding ground-truth inputs (teacher forcing).
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper')
            # output_layer is NOT applied here on purpose: per-time-step
            # projection is slow; we project once after decoding.  For
            # this trick to work the `scope` argument of dynamic_decode
            # below must be set.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
            )
            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
            (
                outputs,
                self.final_state,  # contain attention
                _  # self.final_sequence_lengths
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)
            # Single projection over the whole decoded sequence.
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output)
            # masks: masking for valid and padded time steps,
            # [batch_size, max_time_step + 1]
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks')
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(
                    decoder_logits_train, (1, 0, 2))
            # Predicted token ids: argmax over the vocab axis.
            self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')
            # The variables below support special training schemes
            # (custom rewards folded into the masks/losses).
            # train_entropy = cross entropy
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,
                    logits=decoder_logits_train)
            self.masks_rewards = self.masks * self.rewards
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss_add = self.loss + self.add_loss
        elif self.mode == 'decode':
            # Inference mode (no training): start every batch element
            # from the START token.
            start_tokens = tf.tile([WordSequence.START],
                                   [self.batch_size])
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Input projection wrapper: token ids -> embeddings."""
                return tf.nn.embedding_lookup(self.decoder_embeddings,
                                              inputs)

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding:
                # uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step
                # print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection)
            else:
                # Beamsearch is used to approximately
                # find the most likely translation
                # print("building beamsearch decoder..")
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection,
                )
            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                # Default: decode up to 4x the longest input length.
                max_decode_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4)
            (
                self.decoder_outputs_decode,
                self.final_state,
                _  # self.decoder_outputs_length_decode
            ) = (
                seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=self.time_major,
                    # impute_finished=True,   # error occurs
                    maximum_iterations=max_decode_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope))
            if not self.use_beamsearch_decode:
                # Greedy path: sample_id holds the decoded token ids.
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            else:
                # Beam path: predicted_ids holds all beams; reorder to
                # (batch, beam, time) and expose the beam scores.
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode,
                    perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def build_encoder(self):
    """Build the encoder.

    Returns the encoder outputs and the final state of each layer.
    """
    # print("building encoder")
    with tf.variable_scope('encoder'):
        # Build the encoder cell.
        encoder_cell = self.build_encoder_cell()
        # Encoder embedding, placed on GPU/CPU by vocabulary size.
        with tf.device(_get_embed_device(self.input_vocab_size)):
            # Load a pretrained embedding: zero-init variable filled
            # later via the placeholder/assign pair below.
            if self.pretrained_embedding:
                self.encoder_embeddings = tf.Variable(tf.constant(
                    0.0,
                    shape=(self.input_vocab_size, self.embedding_size)),
                    trainable=True,
                    name='embeddings')
                self.encoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.input_vocab_size, self.embedding_size))
                self.encoder_embeddings_init = \
                    self.encoder_embeddings.assign(
                        self.encoder_embeddings_placeholder)
            else:
                self.encoder_embeddings = tf.get_variable(
                    name='embedding',
                    shape=(self.input_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)
        # Embedded input: shape = (batch_size, time_step, embedding_size)
        self.encoder_inputs_embedded = tf.nn.embedding_lookup(
            params=self.encoder_embeddings,
            ids=self.encoder_inputs)
        # With residual connections, project the input to the cell size.
        if self.use_residual:
            self.encoder_inputs_embedded = layers.dense(
                self.encoder_inputs_embedded,
                self.hidden_units,
                use_bias=False,
                name='encoder_residual_projection')
        inputs = self.encoder_inputs_embedded
        if self.time_major:
            # time-major layout: (time_step, batch_size, embedding_size)
            inputs = tf.transpose(inputs, (1, 0, 2))
        if not self.bidirectional:
            # Unidirectional RNN.
            (encoder_outputs,
             encoder_state) = tf.nn.dynamic_rnn(
                cell=encoder_cell,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True)
        else:
            # Bidirectional RNN.
            # encoder_cell_fw = self.build_encoder_cell()
            encoder_cell_bw = self.build_encoder_cell()  # backward cell
            ((encoder_fw_outputs, encoder_bw_outputs),
             (encoder_fw_state,
              encoder_bw_state)) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=encoder_cell,
                cell_bw=encoder_cell_bw,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True)
            # First merge the outputs of the two directions.
            encoder_outputs = tf.concat(
                (encoder_fw_outputs, encoder_bw_outputs), 2)
            # Interleave per-layer states as (fw0, bw0, fw1, bw1, ...).
            encoder_state = []
            for i in range(self.depth):
                encoder_state.append(encoder_fw_state[i])
                encoder_state.append(encoder_bw_state[i])
            encoder_state = tuple(encoder_state)
    return encoder_outputs, encoder_state
def build_decoder(self):
    """Build the decoder.

    Uses the decoder cell/initial state from self.build_decoder_cell().
    In 'train' mode this defines self.loss, self.loss_rewards,
    self.loss_add, self.train_entropy and self.decoder_pred_train.
    In 'decode' mode it defines self.decoder_pred_decode (and
    self.beam_prob when beam search is enabled).
    """
    with tf.variable_scope('decoder') as decoder_scope:
        # Building decoder_cell and decoder_initial_state
        (self.decoder_cell,
         self.decoder_initial_state) = self.build_decoder_cell()
        # Decoder embedding: shared with the encoder or trained fresh.
        if self.share_embedding:
            self.decoder_embeddings = self.encoder_embeddings
        else:
            with tf.device(_get_embed_device(self.target_vocab_size)):
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)
        # With residual connections, project the embeddings so their
        # width matches the cell's input/output dims.
        # On Using Very Large Target Vocabulary
        # for Neural Machine Translation
        # https://arxiv.org/pdf/1412.2007v2.pdf
        # ** Essential when use_residual=True to match input/output dims
        if self.use_residual:
            self.decoder_embeddings = tf.layers.dense(
                self.decoder_embeddings,
                self.hidden_units * 2)
        # FIX: removed a dead local (`hidden_units = self.hidden_units`,
        # doubled when bidirectional) that was computed but never used
        # anywhere in this method.
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            # use_bias=False,
            name='decoder_output_projection')
        if self.mode == 'train':
            # decoder_inputs_embedded:
            # [batch_size, max_time_step + 1, embedding_size]
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)
            # Helper to feed inputs for training:
            # read inputs from dense ground truth vectors
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length_train,
                time_major=self.time_major,
                name='training_helper')
            # output_layer is NOT applied here on purpose: per-time-step
            # projection is slow; we project once after decoding.  For
            # this trick to work the `scope` argument of dynamic_decode
            # below must be set.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                # output_layer=self.decoder_output_projection
            )
            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length_train)
            # decoder_outputs_train: BasicDecoderOutput
            # namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_train.rnn_output:
            #   if output_time_major=False:
            #     [batch_size, max_time_step + 1, num_decoder_symbols]
            #   if output_time_major=True:
            #     [max_time_step + 1, batch_size, num_decoder_symbols]
            # decoder_outputs_train.sample_id: [batch_size], tf.int32
            (
                outputs,
                self.final_state,  # contain attention
                _  # self.final_sequence_lengths
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)
            # More efficient to do the projection
            # on the batch-time-concatenated tensor
            # logits_train:
            # [batch_size, max_time_step + 1, num_decoder_symbols]
            # Projecting all outputs at once (the official NMT code
            # reports a 10~20% speedup).
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output)
            # masks: masking for valid and padded time steps,
            # [batch_size, max_time_step + 1]
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length_train,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks')
            # Computes per word average cross-entropy over a batch
            # Internally calls
            # 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(
                    decoder_logits_train, (1, 0, 2))
            self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')
            # The variables below support special training schemes
            # (custom rewards folded into the masks).
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_targets_train,
                    logits=decoder_logits_train)
            self.masks_rewards = self.masks * self.rewards
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_targets_train,
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_targets_train,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss_add = self.loss + self.add_loss
        elif self.mode == 'decode':
            # Inference mode (no training).
            start_tokens = tf.tile([WordSequence.START],
                                   [self.batch_size])
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Input projection wrapper: token ids -> embeddings."""
                return tf.nn.embedding_lookup(self.decoder_embeddings,
                                              inputs)

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding:
                # uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step
                # print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection)
            else:
                # Beamsearch is used to approximately
                # find the most likely translation
                # print("building beamsearch decoder..")
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection,
                )
            # Official docs suggest
            # maximum_iterations =
            #   tf.round(tf.reduce_max(source_sequence_length) * 2)
            # https://www.tensorflow.org/tutorials/seq2seq
            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                # Default: decode up to 4x the longest input length.
                max_decode_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4)
            # For GreedyDecoder, decoder_outputs_decode is a
            # BasicDecoderOutput namedtuple(rnn_outputs, sample_id);
            # for BeamSearchDecoder it is a FinalBeamSearchDecoderOutput
            # namedtuple(predicted_ids, beam_search_decoder_output).
            (
                self.decoder_outputs_decode,
                self.final_state,
                _  # self.decoder_outputs_length_decode
            ) = (
                seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=self.time_major,
                    # impute_finished=True,   # error occurs
                    maximum_iterations=max_decode_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope))
            if not self.use_beamsearch_decode:
                # decoder_outputs_decode.sample_id:
                # [batch_size, max_time_step] — the decoded token ids.
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            else:
                # Use beam search to approximately
                # find the most likely translation
                # decoder_pred_decode:
                # [batch_size, max_time_step, beam_width]
                # (output_major=False); reorder to (batch, beam, time)
                # and expose the beam scores.
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode,
                    perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def build_encoder(self):
    """Build the encoder sub-graph.

    Creates the encoder RNN cell and the input embedding table, embeds
    `self.encoder_inputs`, and runs either a unidirectional or a
    bidirectional dynamic RNN over the embedded inputs.

    Side effects (attributes set):
        self.encoder_cell            -- RNN cell from build_encoder_cell()
        self.encoder_embeddings      -- (input_vocab_size, embedding_size)
        self.encoder_inputs_embedded -- embedded input sequences
        self.encoder_outputs         -- top-layer RNN outputs
        self.encoder_last_state      -- final state(s); fw/bw merged per
                                        layer in the bidirectional case
    """
    # print("build encoder")
    with tf.variable_scope('encoder'):
        # Build the encoder cell
        self.encoder_cell = self.build_encoder_cell()

        # Encoder embedding table; placed on GPU or CPU depending on the
        # vocabulary size (see _get_embed_device)
        with tf.device(_get_embed_device(self.input_vocab_size)):
            self.encoder_embeddings = tf.get_variable(
                name='embedding',
                shape=(self.input_vocab_size, self.embedding_size),
                initializer=self.initializer,
                dtype=tf.float32)

        # Embedded inputs: shape = (batch_size, time_step, embedding_size)
        self.encoder_inputs_embedded = tf.nn.embedding_lookup(
            params=self.encoder_embeddings,
            ids=self.encoder_inputs)

        # Encode input sequences into context vectors:
        # encoder_outputs: [batch_size, max_time_step, cell_output_size]
        # encoder_state: [batch_size, cell_output_size]
        inputs = self.encoder_inputs_embedded
        if self.time_major:
            # Switch to time-major layout: (time_step, batch_size, dim)
            inputs = tf.transpose(inputs, (1, 0, 2))

        if not self.bidirectional:
            # Unidirectional RNN
            (self.encoder_outputs,
             self.encoder_last_state) = tf.nn.dynamic_rnn(
                cell=self.encoder_cell,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True)
        else:
            # Bidirectional RNN is more involved: run a second (backward)
            # cell and merge the two directions afterwards
            self.encoder_cell_bw = self.build_encoder_cell()
            ((encoder_fw_outputs, encoder_bw_outputs),
             (encoder_fw_state,
              encoder_bw_state)) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=self.encoder_cell,
                cell_bw=self.encoder_cell_bw,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True)

            # First, concatenate the outputs of the two directions along
            # the feature axis
            self.encoder_outputs = tf.concat(
                (encoder_fw_outputs, encoder_bw_outputs), 2)

            # Merge forward/backward states in the bidirectional case.
            # QHD
            # borrowed from
            # https://github.com/ematvey/tensorflow-seq2seq-tutorials/blob/master/model_new.py
            # Modified from the linked code, which did not handle
            # multi-layer cells (MultiRNNCell)
            if isinstance(encoder_fw_state[0], LSTMStateTuple):
                # LSTM cell: concatenate c and h separately per layer
                self.encoder_last_state = tuple([
                    LSTMStateTuple(
                        c=tf.concat(
                            (encoder_fw_state[i].c,
                             encoder_bw_state[i].c), 1),
                        h=tf.concat(
                            (encoder_fw_state[i].h,
                             encoder_bw_state[i].h), 1))
                    for i in range(len(encoder_fw_state))
                ])
            elif isinstance(encoder_fw_state[0], tf.Tensor):
                # GRU keeps a single state tensor per layer (tf.Tensor),
                # so a plain concat per layer is sufficient
                self.encoder_last_state = tuple([
                    tf.concat((encoder_fw_state[i], encoder_bw_state[i]),
                              1,
                              name='bidirectional_concat_{}'.format(i))
                    for i in range(len(encoder_fw_state))
                ])
def build_encoder(self):
    """Build the encoder sub-graph.

    Embeds `self.encoder_inputs` (optionally with a pretrained embedding
    table and/or a residual input projection) and runs a unidirectional
    or bidirectional dynamic RNN over the embedded inputs.

    :return: (encoder_outputs, encoder_state)
        encoder_outputs -- outputs of the top RNN layer,
            [batch_size, max_time_step, dim] (transposed when time_major)
        encoder_state   -- tuple of per-layer final states; in the
            bidirectional case forward/backward states are concatenated
            per layer along the feature axis
    """
    with tf.variable_scope('encoder'):
        encoder_cell = self.build_encoder_cell()

        # Encoder embedding table; device chosen by vocabulary size
        with tf.device(_get_embed_device(self.input_vocab_size)):
            if self.pretrained_embedding:
                # Pretrained mode: create a zero-initialized variable
                # that is filled later via encoder_embeddings_init /
                # encoder_embeddings_placeholder
                self.encoder_embeddings = tf.Variable(tf.constant(
                    0.0,
                    shape=(self.input_vocab_size, self.embedding_size)),
                    trainable=True,
                    name='embeddings')
                self.encoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.input_vocab_size, self.embedding_size))
                self.encoder_embeddings_init = \
                    self.encoder_embeddings.assign(
                        self.encoder_embeddings_placeholder)
            else:
                self.encoder_embeddings = tf.get_variable(
                    name='embedding',
                    shape=(self.input_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)

        # Embedded inputs: shape = (batch_size, time_step, embedding_size)
        self.encoder_inputs_embedded = tf.nn.embedding_lookup(
            params=self.encoder_embeddings,
            ids=self.encoder_inputs)

        # With residual connections, first project the embeddings to the
        # hidden size so that input and output dimensions match
        if self.use_residual:
            self.encoder_inputs_embedded = \
                layers.dense(self.encoder_inputs_embedded,
                             self.hidden_units,
                             use_bias=False,
                             name='encoder_residual_projection')

        inputs = self.encoder_inputs_embedded
        if self.time_major:
            inputs = tf.transpose(inputs, (1, 0, 2))

        if not self.bidirectional:
            (encoder_outputs, encoder_state) = tf.nn.dynamic_rnn(
                cell=encoder_cell,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True  # dynamic RNN may swap to host memory
            )
        else:
            encoder_cell_bw = self.build_encoder_cell()
            ((encoder_fw_outputs, encoder_bw_outputs),
             (encoder_fw_state,
              encoder_bw_state)) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=encoder_cell,
                cell_bw=encoder_cell_bw,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True)

            # Concatenate the two directions along the feature axis
            encoder_outputs = tf.concat(
                (encoder_fw_outputs, encoder_bw_outputs), 2)

            # Merge per-layer forward/backward final states.
            # FIX: the original unconditionally unpacked each layer state
            # as (c, h), which only works for LSTMStateTuple; a GRU layer
            # keeps a single tf.Tensor state and would raise at graph
            # construction.  Handle both, mirroring the isinstance-based
            # merge used by the other build_encoder in this file.
            merged_state = []
            for i in range(self.depth):
                fw = encoder_fw_state[i]
                bw = encoder_bw_state[i]
                if isinstance(fw, LSTMStateTuple):
                    # LSTM: concatenate c and h separately
                    merged_state.append(LSTMStateTuple(
                        c=tf.concat((fw.c, bw.c), axis=-1),
                        h=tf.concat((fw.h, bw.h), axis=-1)))
                else:
                    # GRU (single tensor state): concatenate directly
                    merged_state.append(tf.concat((fw, bw), axis=-1))
            encoder_state = tuple(merged_state)

    return encoder_outputs, encoder_state
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder sub-graph.

    In 'train' mode, builds a teacher-forced decoder
    (seq2seq.TrainingHelper) plus the training ops: logits,
    predictions, cross-entropy, and two sequence losses (one weighted
    by `self.rewards`, one plain, plus `self.add_loss`).
    In 'decode' mode, builds a greedy or beam-search inference decoder
    and stores predictions in `self.decoder_pred_decode` (and beam
    scores in `self.beam_prob` when beam search is enabled).

    :param encoder_outputs: encoder top-layer outputs (used to build
        the decoder cell / attention)
    :param encoder_state: encoder final state, used as the decoder's
        initial state
    """
    with tf.variable_scope('decoder') as decoder_scope:
        (self.decoder_cell,
         self.decoder_initial_state) = self.build_decoder_cell(
            encoder_outputs, encoder_state)

        # Build the decoder embedding table
        with tf.device(_get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                # Reuse the encoder's embedding table
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrained_embedding:
                # Pretrained mode: zero-initialized variable filled later
                # via decoder_embeddings_init / placeholder
                self.decoder_embeddings = tf.Variable(tf.constant(
                    0.0,
                    shape=(self.target_vocab_size, self.embedding_size)),
                    trainable=True,
                    name='embeddings')
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.target_vocab_size, self.embedding_size))
                self.decoder_embeddings_init = self.decoder_embeddings.assign(
                    self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)

        # Projection from cell output size to target vocabulary size
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection')

        if self.mode == 'train':
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))

            # TrainingHelper feeds the ground-truth token (teacher
            # forcing) as the next-step input
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper')

            # The output_layer is NOT applied here during training:
            # applying it inside the decoder projects at every time step,
            # which is slow.  Instead it is applied once on the full
            # rnn_output below.  For this trick to work the `scope`
            # argument of dynamic_decode must be set.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state
                # output_layer = self.decoder_output_projection
                # (output projection layer, rnn_size -> vocab_size)
            )

            # Maximum number of decoder time steps in the current batch
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)

            outputs, self.final_state, _ = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                # When True, copies the last state and zeroes the output
                # for finished sequences; makes the final state/output
                # correct and the backward pass ignore finished steps,
                # at some speed cost
                impute_finished=True,
                # Maximum decode steps; for training this is the batch's
                # decoder_inputs_length.  Decoding stops at <eos> or at
                # this step count
                maximum_iterations=max_decoder_length,
                # Number of loop iterations to run in parallel
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)

            # Single projection over the whole output (see note above)
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output)

            # Mask marking valid (non-padding) positions per sequence
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks')

            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(
                    decoder_logits_train, (1, 0, 2))

            self.decoder_pred_train = tf.argmax(
                decoder_logits_train, axis=-1,
                name='decoder_pred_train')

            self.train_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.decoder_inputs,  # ground-truth targets y
                logits=decoder_logits_train  # predicted logits y_
            )

            # Reward-weighted loss (mask scaled by per-position rewards)
            self.masks_rewards = self.masks * self.rewards
            self.loss_rewards = seq2seq.sequence_loss(
                # [batch_size, sequence_length, num_decoder_symbols]
                logits=decoder_logits_train,
                # [batch_size, sequence_length]; no one-hot needed
                targets=self.decoder_inputs,
                # [batch_size, sequence_length] mask; filters padding out
                # of the loss so it is computed accurately
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True)

            # Plain sequence loss
            self.loss = seq2seq.sequence_loss(
                # [batch_size, sequence_length, num_decoder_symbols]
                logits=decoder_logits_train,
                # [batch_size, sequence_length]; no one-hot needed
                targets=self.decoder_inputs,
                # mask; filters padding out of the loss computation
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True)

            self.loss_add = self.loss + self.add_loss

        elif self.mode == 'decode':
            # Every sequence starts with the START token and stops at END
            start_tokens = tf.tile([WordSequence.START],
                                   [self.batch_size])
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                # Embedding lookup used by the inference helpers
                return tf.nn.embedding_lookup(self.decoder_embeddings,
                                              inputs)

            if not self.use_beamsearch_decode:
                # Greedy decoding: feed the argmax token back in
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection)
            else:
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection)

            if self.max_decode_step is not None:
                max_decoder_step = self.max_decode_step
            else:
                # Default: decode up to 4x the input length
                max_decoder_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4)

            self.decoder_outputs_decode, self.final_state, _ = seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=self.time_major,
                maximum_iterations=max_decoder_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)

            if not self.use_beamsearch_decode:
                # sample_id: [batch_size, max_time_step]
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            else:
                # Beam search: predicted_ids is
                # [batch_size, max_time_step, beam_width]
                # (batch-major case)
                self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                # Reorder to [batch_size, beam_width, max_time_step]
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode,
                    perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def build_encoder(self):
    """Build the encoder sub-graph.

    Embeds `self.encoder_inputs`, optionally applies a residual input
    projection, and runs a unidirectional or bidirectional dynamic RNN.

    :return: (encoder_outputs, encoder_state)
        encoder_outputs -- top-layer RNN outputs (fw/bw concatenated on
            the feature axis when bidirectional)
        encoder_state   -- tuple of final layer states; when
            bidirectional the forward and backward states of each layer
            are appended alternately (fw0, bw0, fw1, bw1, ...), so the
            tuple has 2*depth entries of hidden_units each
    """
    # variable_scope: variable namespace, enables variable sharing
    with tf.variable_scope('encoder'):
        encoder_cell = self.build_encoder_cell()

        # Decide whether the embedding lives on GPU or CPU memory
        with tf.device(_get_embed_device(
                self.input_vocab_size)):
            if self.pretrained_embedding:
                # Pretrained mode: zero-initialized variable filled later
                # through the placeholder/assign pair below
                self.encoder_embeddings = tf.Variable(tf.constant(
                    0.0,
                    shape=(self.input_vocab_size, self.embedding_size)),
                    trainable=True,
                    name='embeddings')
                self.encoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.input_vocab_size, self.embedding_size))
                # Assign op: loads pretrained vectors into the variable
                self.encoder_embeddings_init = self.encoder_embeddings.assign(
                    self.encoder_embeddings_placeholder)
            else:
                self.encoder_embeddings = tf.get_variable(
                    name='embedding',
                    shape=(self.input_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)

        # embedding_lookup finds the representation of `ids` in `params`:
        # each id in the 2-D ids tensor selects one row of the 2-D
        # embedding table, giving [batch_size, time_step, embedding_size]
        self.encoder_inputs_embedded = tf.nn.embedding_lookup(
            params=self.encoder_embeddings,
            ids=self.encoder_inputs)

        if self.use_residual:
            # Fully-connected projection to hidden_units so residual
            # additions have matching dimensions
            self.encoder_inputs_embedded = layers.dense(
                self.encoder_inputs_embedded,
                self.hidden_units,
                use_bias=False,
                name='encoder_residual_projection')

        inputs = self.encoder_inputs_embedded
        if self.time_major:
            inputs = tf.transpose(inputs, (1, 0, 2))

        if not self.bidirectional:
            (encoder_outputs, encoder_state) = tf.nn.dynamic_rnn(
                cell=encoder_cell,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                # NOTE(review): swap_memory=False here, while every other
                # encoder variant in this file uses True -- confirm
                # whether this is intentional
                swap_memory=False)
        else:
            encoder_cell_bw = self.build_encoder_cell()
            # Dynamic multi-layer bidirectional RNN
            ((encoder_fw_outputs, encoder_bw_outputs),
             (encoder_fw_state, encoder_bw_state)
             ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=encoder_cell,
                cell_bw=encoder_cell_bw,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True)

            # Concatenate the two directions along the feature axis
            encoder_outputs = tf.concat(
                [encoder_fw_outputs,
                 encoder_bw_outputs], 2)

            # NOTE(review): unlike the concat-based merge used elsewhere
            # in this file, this interleaves fw/bw states, doubling the
            # number of state entries (2*depth) -- presumably the decoder
            # built alongside this variant expects that; verify
            encoder_state = []
            for i in range(self.depth):
                encoder_state.append(encoder_fw_state[i])
                encoder_state.append(encoder_bw_state[i])
            encoder_state = tuple(encoder_state)

    return encoder_outputs, encoder_state
def build_encoder(self):
    """Build the encoder sub-graph.

    Creates the encoder cell and embedding table, projects the embedded
    inputs through a Dense input layer (required to match dimensions
    when use_residual=True), and runs a unidirectional or bidirectional
    dynamic RNN.

    Side effects (attributes set):
        self.encoder_cell, self.encoder_embeddings,
        self.encoder_inputs_embedded, self.input_layer,
        self.encoder_outputs, self.encoder_last_state
    """
    # print("build encoder")
    with tf.variable_scope('encoder'):
        # Build the encoder cell
        self.encoder_cell = self.build_encoder_cell()

        # Encoder embedding table; device chosen by vocabulary size
        with tf.device(_get_embed_device(self.input_vocab_size)):
            self.encoder_embeddings = tf.get_variable(
                name='embedding',
                shape=(self.input_vocab_size, self.embedding_size),
                initializer=self.initializer,
                dtype=tf.float32)

        # Embedded inputs: shape = (batch_size, time_step, embedding_size)
        self.encoder_inputs_embedded = tf.nn.embedding_lookup(
            params=self.encoder_embeddings,
            ids=self.encoder_inputs)

        # Input projection layer to feed embedded inputs to the cell
        # ** Essential when use_residual=True to match input/output dims
        input_layer = layers.Dense(self.hidden_units,
                                   dtype=tf.float32,
                                   use_bias=False,
                                   name='input_projection')
        self.input_layer = input_layer

        # Embedded inputs having gone through input projection layer
        self.encoder_inputs_embedded = input_layer(
            self.encoder_inputs_embedded)

        # Encode input sequences into context vectors:
        # encoder_outputs: [batch_size, max_time_step, cell_output_size]
        # encoder_state: [batch_size, cell_output_size]
        inputs = self.encoder_inputs_embedded
        if self.time_major:
            inputs = tf.transpose(inputs, (1, 0, 2))

        if not self.bidirectional:
            # Unidirectional RNN
            (self.encoder_outputs,
             self.encoder_last_state) = tf.nn.dynamic_rnn(
                cell=self.encoder_cell,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True)
        else:
            # Bidirectional RNN: second (backward) cell, then merge
            self.encoder_cell_bw = self.build_encoder_cell()
            ((encoder_fw_outputs, encoder_bw_outputs),
             (encoder_fw_state,
              encoder_bw_state)) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=self.encoder_cell,
                cell_bw=self.encoder_cell_bw,
                inputs=inputs,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=self.time_major,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True)

            # Concatenate the two directions along the feature axis
            self.encoder_outputs = tf.concat(
                (encoder_fw_outputs, encoder_bw_outputs), 2)

            # Merge forward/backward states in the bidirectional case.
            # QHD
            # borrowed from
            # https://github.com/ematvey/tensorflow-seq2seq-tutorials/blob/master/model_new.py
            # Modified from the linked code, which did not handle
            # multi-layer cells (MultiRNNCell)
            if isinstance(encoder_fw_state[0], LSTMStateTuple):
                # LSTM cell: concatenate c and h separately per layer
                self.encoder_last_state = tuple([
                    LSTMStateTuple(
                        c=tf.concat(
                            (encoder_fw_state[i].c,
                             encoder_bw_state[i].c), 1),
                        h=tf.concat(
                            (encoder_fw_state[i].h,
                             encoder_bw_state[i].h), 1))
                    for i in range(len(encoder_fw_state))
                ])
            elif isinstance(encoder_fw_state[0], tf.Tensor):
                # GRU keeps a single state tensor per layer (tf.Tensor),
                # so a plain concat per layer is sufficient
                self.encoder_last_state = tuple([
                    tf.concat((encoder_fw_state[i], encoder_bw_state[i]),
                              1,
                              name='bidirectional_concat_{}'.format(i))
                    for i in range(len(encoder_fw_state))
                ])