def __call__(self, decoder_initial_states, encoder_output, encoder_padding_mask, **kwargs):
    """Run decoding with the strategy selected by ``self.decoding_strategy``.

    Args:
        decoder_initial_states: Initial states forwarded to ``dynamic_decode``.
        encoder_output: Encoder output tensor; tiled to ``[batch * beam, ...]``
            for beam search.
        encoder_padding_mask: Padding mask over encoder output; tiled likewise.
        **kwargs: Strategy-specific arguments. ``output_layer`` is popped and
            passed to the decoder as ``output_fn``; for beam search,
            ``beam_size`` (default 4) is also read.

    Returns:
        Tuple ``(decoder_output, decoder_final_state, dec_seq_lengths)`` as
        produced by ``layers.dynamic_decode(..., return_length=True)``.

    Raises:
        ValueError: If ``self.decoding_strategy`` is not one of
            ``train_greedy`` / ``infer_sample`` / ``infer_greedy`` /
            ``beam_search``. (Previously an unknown strategy crashed with a
            ``NameError`` on the unbound ``helper`` variable.)
    """
    output_layer = kwargs.pop("output_layer", None)
    is_beam_search = self.decoding_strategy == "beam_search"

    if is_beam_search:
        beam_size = kwargs.get("beam_size", 4)
        # Beam search decodes [batch * beam, ...]-shaped tensors, so the
        # encoder-side tensors must be tiled beam_size times along batch.
        encoder_output = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
            encoder_output, beam_size)
        encoder_padding_mask = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
            encoder_padding_mask, beam_size)
        decoder = layers.BeamSearchDecoder(
            cell=self.decoder_cell, output_fn=output_layer, **kwargs)
    else:
        if self.decoding_strategy == "train_greedy":
            # Teacher forcing for MLE pre-training.
            helper = layers.TrainingHelper(**kwargs)
        elif self.decoding_strategy == "infer_sample":
            helper = layers.SampleEmbeddingHelper(**kwargs)
        elif self.decoding_strategy == "infer_greedy":
            helper = layers.GreedyEmbeddingHelper(**kwargs)
        else:
            raise ValueError(
                "Unsupported decoding strategy: %s" % self.decoding_strategy)
        decoder = layers.BasicDecoder(
            self.decoder_cell, helper, output_fn=output_layer)

    (decoder_output, decoder_final_state,
     dec_seq_lengths) = layers.dynamic_decode(
         decoder,
         inits=decoder_initial_states,
         max_step_num=self.max_decoding_length,
         encoder_output=encoder_output,
         encoder_padding_mask=encoder_padding_mask,
         # Original note: impute_finished is disabled for beam search
         # ("for test coverage"); beam search also runs in test mode.
         impute_finished=not is_beam_search,
         is_test=is_beam_search,
         return_length=True)
    return decoder_output, decoder_final_state, dec_seq_lengths
def _build_decoder(self, enc_final_state, mode='train', beam_size=10):
    """Build the decoder sub-graph.

    Args:
        enc_final_state: Final encoder state used to initialize the decoder.
        mode: ``'train'`` for teacher-forced training, ``'beam_search'`` for
            inference-time beam search decoding.
        beam_size: Beam width used when ``mode == 'beam_search'``.

    Returns:
        Training logits (``mode='train'``) or beam-search output ids
        (``mode='beam_search'``).

    Raises:
        ValueError: For any other ``mode`` (previously this silently
            returned ``None``).
    """
    dec_cell = DecoderCell(self.num_layers, self.hidden_size, self.dropout,
                           self.init_scale)
    # Vocabulary projection shared by both modes; flattens all but the
    # last dimension so it works on rank-3 step/sequence outputs.
    output_layer = lambda x: layers.fc(
        x,
        size=self.tar_vocab_size,
        num_flatten_dims=len(x.shape) - 1,
        param_attr=fluid.ParamAttr(
            name="output_w",
            initializer=uniform_initializer(self.init_scale)),
        bias_attr=False)

    if mode == 'train':
        dec_output, dec_final_state = rnn(cell=dec_cell,
                                          inputs=self.tar_emb,
                                          initial_states=enc_final_state)
        dec_output = output_layer(dec_output)
        return dec_output
    elif mode == 'beam_search':
        beam_search_decoder = BeamSearchDecoder(dec_cell,
                                                self.beam_start_token,
                                                self.beam_end_token,
                                                beam_size,
                                                embedding_fn=self.tar_embeder,
                                                output_fn=output_layer)
        outputs, _ = dynamic_decode(beam_search_decoder,
                                    inits=enc_final_state,
                                    max_step_num=self.beam_max_step_num)
        return outputs
    raise ValueError("Unsupported decoder mode: %s" % mode)
def _build_decoder(self, enc_final_state, mode='train', beam_size=10):
    """Build the attention decoder sub-graph.

    Args:
        enc_final_state: Final encoder state, combined with a zero state to
            form the attention-decoder initial states.
        mode: ``'train'`` for teacher-forced training, ``'beam_search'`` for
            beam-search inference.
        beam_size: Beam width used when ``mode == 'beam_search'``.

    Returns:
        Training logits (``mode='train'``) or beam-search outputs
        (``mode='beam_search'``).

    Raises:
        ValueError: For any other ``mode`` (previously this crashed with a
            ``NameError`` on the unbound ``dec_output``).
    """
    dec_cell = AttentionDecoderCell(self.num_layers, self.hidden_size,
                                    self.dropout, self.init_scale)
    dec_initial_states = [
        enc_final_state,
        dec_cell.get_initial_states(batch_ref=self.enc_output,
                                    shape=[self.hidden_size])
    ]
    # Additive mask: 0 for real tokens, -1 for padding positions
    # (consumed by the attention score computation).
    max_src_seq_len = layers.shape(self.src)[1]
    src_mask = layers.sequence_mask(self.src_sequence_length,
                                    maxlen=max_src_seq_len,
                                    dtype='float32')
    enc_padding_mask = (src_mask - 1.0)

    if mode == 'train':
        output_layer = lambda x: layers.fc(
            x,
            size=self.tar_vocab_size,
            num_flatten_dims=len(x.shape) - 1,
            param_attr=fluid.ParamAttr(
                name="output_w",
                initializer=fluid.initializer.UniformInitializer(
                    low=-self.init_scale, high=self.init_scale)),
            bias_attr=False)
        dec_output, _ = rnn(cell=dec_cell,
                            inputs=self.tar_emb,
                            initial_states=dec_initial_states,
                            sequence_length=None,
                            enc_output=self.enc_output,
                            enc_padding_mask=enc_padding_mask)
        dec_output = output_layer(dec_output)
        return dec_output
    elif mode == 'beam_search':
        # Re-declares the projection by name only: "output_w" already exists
        # from training, so no initializer is needed here.
        output_layer = lambda x: layers.fc(
            x,
            size=self.tar_vocab_size,
            num_flatten_dims=len(x.shape) - 1,
            param_attr=fluid.ParamAttr(name="output_w"),
            bias_attr=False)
        beam_search_decoder = BeamSearchDecoder(dec_cell,
                                                self.beam_start_token,
                                                self.beam_end_token,
                                                beam_size,
                                                embedding_fn=self.tar_embeder,
                                                output_fn=output_layer)
        # Tile encoder-side tensors to [batch * beam, ...] for beam search.
        enc_output = beam_search_decoder.tile_beam_merge_with_batch(
            self.enc_output, beam_size)
        enc_padding_mask = beam_search_decoder.tile_beam_merge_with_batch(
            enc_padding_mask, beam_size)
        outputs, _ = dynamic_decode(beam_search_decoder,
                                    inits=dec_initial_states,
                                    max_step_num=self.beam_max_step_num,
                                    enc_output=enc_output,
                                    enc_padding_mask=enc_padding_mask)
        return outputs
    raise ValueError("Unsupported decoder mode: %s" % mode)
def decoder(encoder_output, encoder_output_proj, encoder_state, encoder_padding_mask, trg=None, is_train=True):
    """Decoder: GRU with Attention"""
    cell = DecoderCell(hidden_size=decoder_size)
    # Project the final encoder state into the decoder's initial state.
    init_states = layers.fc(encoder_state, size=decoder_size, act="tanh")

    def embed_trg(ids):
        # Target-side embedding lookup over the shared "trg_emb_table".
        return fluid.embedding(
            input=ids,
            size=[target_dict_size, hidden_dim],
            dtype="float32",
            param_attr=fluid.ParamAttr(name="trg_emb_table"))

    def project_to_vocab(x):
        # Map decoder states to target-vocabulary logits via "output_w".
        return layers.fc(
            x,
            size=target_dict_size,
            num_flatten_dims=len(x.shape) - 1,
            param_attr=fluid.ParamAttr(name="output_w"))

    if not is_train:
        # Inference: tile encoder-side tensors beam_size times along batch,
        # then run beam search decoding.
        tile = layers.BeamSearchDecoder.tile_beam_merge_with_batch
        tiled_output = tile(encoder_output, beam_size)
        tiled_proj = tile(encoder_output_proj, beam_size)
        tiled_mask = tile(encoder_padding_mask, beam_size)
        bs_decoder = layers.BeamSearchDecoder(
            cell=cell,
            start_token=bos_id,
            end_token=eos_id,
            beam_size=beam_size,
            embedding_fn=embed_trg,
            output_fn=project_to_vocab)
        predictions, _ = layers.dynamic_decode(
            decoder=bs_decoder,
            inits=init_states,
            max_step_num=max_length,
            output_time_major=False,
            encoder_output=tiled_output,
            encoder_output_proj=tiled_proj,
            encoder_padding_mask=tiled_mask)
        return predictions

    # Training: teacher-forced RNN over the embedded target sequence.
    states, _ = layers.rnn(
        cell=cell,
        inputs=embed_trg(trg),
        initial_states=init_states,
        time_major=False,
        encoder_output=encoder_output,
        encoder_output_proj=encoder_output_proj,
        encoder_padding_mask=encoder_padding_mask)
    return project_to_vocab(states)
def _build_net(self):
    """Build the generation network: attention decoder over ``self.feature``
    / ``self.token_feature``, teacher-forced in train/eval phases and
    beam-search decoded in the predict phase.

    Returns:
        ``[softmax logits]`` when not in predict phase, otherwise the
        beam-search inference ids (also stored on ``self.ret_infers``).
    """
    # Sequence lengths fed at runtime; squeezed to rank 1 for sequence_mask.
    self.seq_len = fluid.layers.data(name="seq_len",
                                     shape=[1],
                                     dtype='int64',
                                     lod_level=0)
    self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1])
    src_mask = fluid.layers.sequence_mask(self.seq_len_used,
                                          maxlen=self.max_seq_len,
                                          dtype='float32')
    # Additive attention mask: 0.0 on real tokens, -1.0 on padding.
    enc_padding_mask = (src_mask - 1.0)

    # Define decoder and initialize it.
    dec_cell = AttentionDecoderCell(self.num_layers, self.hidden_size,
                                    self.dropout)
    # Project the pooled feature into the decoder's initial hidden state.
    dec_init_hidden = fluid.layers.fc(
        input=self.feature,
        size=self.hidden_size,
        num_flatten_dims=1,
        param_attr=fluid.ParamAttr(
            name="dec_init_hidden_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(
            name="dec_init_hidden_b",
            initializer=fluid.initializer.Constant(0.)))
    # NOTE(review): the `* self.num_layers` replication makes every layer
    # share the SAME [hidden, zero-state] pair objects; presumably
    # intentional (all layers start from the same init hidden) — confirm.
    dec_initial_states = [[[
        dec_init_hidden,
        dec_cell.get_initial_states(batch_ref=self.feature,
                                    shape=[self.hidden_size])
    ]] * self.num_layers,
                          dec_cell.get_initial_states(
                              batch_ref=self.feature,
                              shape=[self.hidden_size])]
    tar_vocab_size = len(self._label_list)
    # Target-token embedding lookup, uniform-initialized in [-0.1, 0.1].
    tar_embeder = lambda x: fluid.embedding(
        input=x,
        size=[tar_vocab_size, self.hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(name='target_embedding',
                                   initializer=fluid.initializer.
                                   UniformInitializer(low=-0.1, high=0.1)))
    start_token_id = self._label_list.index(self.start_token)
    end_token_id = self._label_list.index(self.end_token)
    if not self.is_predict_phase:
        # Train/eval: teacher forcing over the fed decoder input ids.
        self.dec_input = fluid.layers.data(name="dec_input",
                                           shape=[self.max_seq_len],
                                           dtype='int64')
        tar_emb = tar_embeder(self.dec_input)
        dec_output, _ = rnn(cell=dec_cell,
                            inputs=tar_emb,
                            initial_states=dec_initial_states,
                            sequence_length=None,
                            enc_output=self.token_feature,
                            enc_padding_mask=enc_padding_mask)
        # Vocabulary projection ("output_w"), shared by name with the
        # predict-phase output_layer below.
        self.logits = fluid.layers.fc(
            dec_output,
            size=tar_vocab_size,
            num_flatten_dims=len(dec_output.shape) - 1,
            param_attr=fluid.ParamAttr(
                name="output_w",
                initializer=fluid.initializer.UniformInitializer(
                    low=-0.1, high=0.1)))
        # Greedy argmax ids flattened to [-1, 1] for metric computation.
        self.ret_infers = fluid.layers.reshape(x=fluid.layers.argmax(
            self.logits, axis=2),
                                               shape=[-1, 1])
        logits = self.logits
        logits = fluid.layers.softmax(logits)
        return [logits]
    else:
        # Predict: beam search. The projection reuses "output_w" by name
        # (already created during training), so no initializer is given.
        output_layer = lambda x: fluid.layers.fc(
            x,
            size=tar_vocab_size,
            num_flatten_dims=len(x.shape) - 1,
            param_attr=fluid.ParamAttr(name="output_w"))
        beam_search_decoder = BeamSearchDecoder(dec_cell,
                                                start_token_id,
                                                end_token_id,
                                                self.beam_size,
                                                embedding_fn=tar_embeder,
                                                output_fn=output_layer)
        # Tile encoder-side tensors to [batch * beam, ...] for beam search.
        enc_output = beam_search_decoder.tile_beam_merge_with_batch(
            self.token_feature, self.beam_size)
        enc_padding_mask = beam_search_decoder.tile_beam_merge_with_batch(
            enc_padding_mask, self.beam_size)
        self.ret_infers, _ = dynamic_decode(
            beam_search_decoder,
            inits=dec_initial_states,
            max_step_num=self.beam_max_step_num,
            enc_output=enc_output,
            enc_padding_mask=enc_padding_mask)
        return self.ret_infers
def _build_decoder(self, z_mean=None, z_log_var=None, enc_output=None, mode='train', beam_size=10):
    """Build the VAE decoder sub-graph conditioned on a latent code.

    Args:
        z_mean: Posterior mean, required when ``mode == 'train'``.
        z_log_var: Posterior log-variance, required when ``mode == 'train'``.
        enc_output: Unused here; kept for interface compatibility.
        mode: ``'train'`` (teacher forcing), ``'greedy'`` (argmax decoding)
            or ``'sampling'`` (decoding with sampled tokens).
        beam_size: Unused by the inference modes here (both decode with
            ``beam_size=1``); kept for interface compatibility.

    Returns:
        Training logits, or decoded outputs for the inference modes.

    Raises:
        ValueError: For an unsupported ``mode``. (Previously this printed
            "mode not supprt" and silently returned ``None`` after already
            building part of the graph.)
    """
    # Validate before any graph ops are added.
    if mode not in ('train', 'greedy', 'sampling'):
        raise ValueError("Unsupported decoder mode: %s" % mode)

    dec_input = layers.dropout(self.tar_emb,
                               dropout_prob=self.dec_dropout_in,
                               dropout_implementation="upscale_in_train")
    # `output_layer` projects decoder states to vocabulary logits; used
    # directly in train/greedy modes and inside BeamSearchDecoder.
    output_layer = lambda x: layers.fc(x,
                                       size=self.tar_vocab_size,
                                       num_flatten_dims=len(x.shape) - 1,
                                       name="output_w")
    # `sample_output_layer` samples an id from the softmax distribution
    # instead of argmax(logits), then re-encodes it one-hot so the beam
    # machinery can consume it; used in 'sampling' mode.
    sample_output_layer = lambda x: layers.unsqueeze(
        fluid.one_hot(layers.unsqueeze(
            layers.sampling_id(layers.softmax(
                layers.squeeze(output_layer(x), [1])),
                               dtype='int'), [1]),
                      depth=self.tar_vocab_size), [1])

    if mode == 'train':
        # Reparameterized sample from the posterior.
        latent_z = self._sampling(z_mean, z_log_var)
    else:
        # Inference: draw the latent code from the prior.
        latent_z = layers.gaussian_random_batch_size_like(
            self.tar, shape=[-1, self.latent_size])

    # Map the latent code to per-layer initial (hidden, cell) states.
    dec_first_hidden_cell = layers.fc(latent_z,
                                      2 * self.hidden_size * self.num_layers,
                                      name='fc_hc')
    dec_first_hidden, dec_first_cell = layers.split(dec_first_hidden_cell, 2)
    if self.num_layers > 1:
        dec_first_hidden = layers.split(dec_first_hidden, self.num_layers)
        dec_first_cell = layers.split(dec_first_cell, self.num_layers)
    else:
        dec_first_hidden = [dec_first_hidden]
        dec_first_cell = [dec_first_cell]
    dec_initial_states = [
        [h, c] for h, c in zip(dec_first_hidden, dec_first_cell)
    ]
    dec_cell = DecoderCell(self.num_layers, self.hidden_size, latent_z,
                           self.param_attr_initializer,
                           self.param_attr_scale, self.dec_dropout_out)

    if mode == 'train':
        dec_output, _ = rnn(cell=dec_cell,
                            inputs=dec_input,
                            initial_states=dec_initial_states,
                            sequence_length=self.tar_sequence_length)
        dec_output = output_layer(dec_output)
        return dec_output

    # 'greedy' and 'sampling' differ only in the output function; both
    # decode with beam_size=1.
    start_token = 1
    end_token = 2
    max_length = 100
    infer_output_fn = output_layer if mode == 'greedy' else sample_output_layer
    beam_search_decoder = BeamSearchDecoder(dec_cell,
                                            start_token,
                                            end_token,
                                            beam_size=1,
                                            embedding_fn=self.tar_embeder,
                                            output_fn=infer_output_fn)
    outputs, _ = dynamic_decode(beam_search_decoder,
                                inits=dec_initial_states,
                                max_step_num=max_length)
    return outputs
def decoder(encoder_output, encoder_output_proj, encoder_state, encoder_padding_mask, trg=None, is_train=True):
    """Decoder: GRU with attention.

    Args:
        encoder_output: Encoder output sequence attended over.
        encoder_output_proj: Pre-projected encoder output for attention.
        encoder_state: Final encoder state; projected into the decoder's
            initial state.
        encoder_padding_mask: Mask over encoder padding positions.
        trg: Target token ids, required when ``is_train`` is True.
        is_train: Teacher-forced training when True, beam search otherwise.

    Returns:
        Training logits or beam-search predictions.
    """
    # Components required by the RNN.
    decoder_cell = DecoderCell(hidden_size=decoder_size)
    decoder_initial_states = layers.fc(encoder_state,
                                       size=decoder_size,
                                       act="tanh")
    trg_embeder = lambda x: fluid.embedding(
        input=x,
        size=[target_dict_size, hidden_dim],
        dtype="float32",
        param_attr=fluid.ParamAttr(name="trg_emb_table"))
    # Vocabulary projection ("output_w"); flattens all but the last
    # dimension, which equals num_flatten_dims=2 on the rank-3 RNN output.
    output_layer = lambda x: layers.fc(input=x,
                                       size=target_dict_size,
                                       num_flatten_dims=len(x.shape) - 1,
                                       param_attr=fluid.ParamAttr(name=
                                                                  "output_w"))
    if is_train:  # training
        # Use `layers.rnn` to build the recurrent network defined by `cell`;
        # each step slices an input out of `inputs` and runs `cell.call`.
        # Shapes: [-1, -1, 512] outputs, [-1, 512] state.
        decoder_output, _ = layers.rnn(
            cell=decoder_cell,
            inputs=trg_embeder(trg),
            initial_states=decoder_initial_states,
            time_major=False,
            encoder_output=encoder_output,
            encoder_output_proj=encoder_output_proj,
            encoder_padding_mask=encoder_padding_mask)
        # Consistency fix: reuse `output_layer` instead of a duplicated
        # inline fc (same "output_w" param, same flatten dims).
        decoder_output = output_layer(decoder_output)
    else:
        # Beam-search prediction: tensors of shape [batch_size, ...] must be
        # expanded to [batch_size * beam_size, ...] before decoding.
        encoder_output = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
            encoder_output, beam_size)
        encoder_output_proj = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
            encoder_output_proj, beam_size)
        encoder_padding_mask = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
            encoder_padding_mask, beam_size)
        # BeamSearchDecoder defines the single decode step:
        # `cell.call` + `beam_search_step`.
        beam_search_decoder = layers.BeamSearchDecoder(
            cell=decoder_cell,
            start_token=bos_id,
            end_token=eos_id,
            beam_size=beam_size,
            embedding_fn=trg_embeder,
            output_fn=output_layer)
        # `layers.dynamic_decode` repeatedly calls `decoder.step()` until
        # every finished flag is True or `max_step_num` steps have run.
        decoder_output, _ = layers.dynamic_decode(
            decoder=beam_search_decoder,
            inits=decoder_initial_states,
            max_step_num=max_length,
            output_time_major=False,
            encoder_output=encoder_output,
            encoder_output_proj=encoder_output_proj,
            encoder_padding_mask=encoder_padding_mask)
    return decoder_output