def forward(self, features):
    """Score (title, comment) pairs with a two-tower bag-of-words model.

    Args:
        features: tuple of (title_ids, comment_ids) int64 id tensors;
            pad id is assumed to be 0.

    Returns:
        Sigmoid probabilities in PREDICT mode, raw scores otherwise
        (shape [batch, 1]).
    """
    def FC(inputs, name, i, act):
        # One tower layer; parameter names are keyed by tower name and depth
        # so title/comment towers do NOT share FC weights.
        return L.fc(inputs, self.hidden_size, act=act,
                    param_attr=F.ParamAttr(
                        name='%s.fc.w_%d' % (name, i),
                        initializer=F.initializer.XavierInitializer(
                            fan_in=self.hidden_size,
                            fan_out=self.hidden_size)),
                    bias_attr=F.ParamAttr(
                        name='%s.fc.b_%d' % (name, i),
                        initializer=F.initializer.Constant(0.)))

    title_ids, comment_ids = features
    # Single shared embedding table ('emb') for both towers.
    embedding_attr = F.ParamAttr(
        name='emb',
        initializer=F.initializer.XavierInitializer(
            fan_in=self.vocab_size, fan_out=self.embedding_size))
    title_encoded = L.embedding(title_ids, [self.vocab_size, self.embedding_size],
                                param_attr=embedding_attr)
    comment_encoded = L.embedding(comment_ids, [self.vocab_size, self.embedding_size],
                                  param_attr=embedding_attr)

    # Vsum: masked sum over the time dimension, then softsign squashing.
    zero = L.fill_constant(shape=[1], dtype='int64', value=0)
    # 1.0 where the token is not padding (id != 0), 0.0 at pad positions.
    title_pad = L.cast(L.logical_not(L.equal(title_ids, zero)), 'float32')
    comment_pad = L.cast(L.logical_not(L.equal(comment_ids, zero)), 'float32')

    title_encoded = L.reduce_sum(title_encoded * title_pad, dim=1)
    title_encoded = L.softsign(title_encoded)
    comment_encoded = L.reduce_sum(comment_encoded * comment_pad, dim=1)
    comment_encoded = L.softsign(comment_encoded)

    for i in range(self.num_layers):
        title_encoded = FC(title_encoded, 'title', i, 'tanh')

    for i in range(self.num_layers):
        comment_encoded = FC(comment_encoded, 'comment', i, 'tanh')

    # Scaled dot product between tower outputs (scaled like attention logits).
    score = L.reduce_sum(title_encoded * comment_encoded, dim=1,
                         keep_dim=True) / np.sqrt(self.hidden_size)
    if self.mode is propeller.RunMode.PREDICT:
        probs = L.sigmoid(score)
        return probs
    else:
        return score
def _grammar_step(self, logits, next_cell_states, decode_states, actions, gmr_mask):
    """Perform one beam-search decoding step under grammar constraints.

    Args:
        logits (Variable): shape = [batch_size, beam_size, vocab_size]
        next_cell_states (Variable): decoder cell states after this step
        decode_states (StateWrapper): running beam state (log probs,
            finished flags, lengths, valid-table mask)
        actions: grammar actions for this step
        gmr_mask: grammar mask restricting legal tokens

    Returns:
        (OutputWrapper, StateWrapper): step outputs and updated beam states.

    Raises:
        NULL
    """
    # Keep only token logits that are legal under the grammar.
    logits, valid_table_mask = self._output_layer(logits, actions, gmr_mask,
                                                  decode_states.valid_table_mask)

    # Cache vocab size (python int and as a tensor for elementwise div/mod).
    self._vocab_size = logits.shape[-1]
    self._vocab_size_tensor = layers.fill_constant(shape=[1], dtype='int64',
                                                   value=logits.shape[-1])

    # Log-probs, with finished beams masked so they do not grow.
    step_log_probs = layers.log(layers.softmax(logits))
    step_log_probs = self._mask_finished_probs(step_log_probs, decode_states.finished)

    # Flatten beams: pick top-k over beam_size * vocab_size candidates.
    scores = layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size])
    topk_scores, topk_indices = layers.topk(input=scores, k=self._beam_size)
    topk_scores = layers.reshape(topk_scores, shape=[-1])
    topk_indices = layers.reshape(topk_indices, shape=[-1])

    # Which beam each top-k entry came from ...
    beam_indices = layers.elementwise_floordiv(topk_indices, self._vocab_size_tensor)
    # ... and which token id it selects within that beam.
    token_indices = layers.elementwise_mod(topk_indices, self._vocab_size_tensor)

    # Re-gather step log probs according to the chosen top-k candidates.
    next_log_probs = nn_utils.batch_gather(
        layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size]),
        topk_indices)

    def _beam_gather(x, beam_indices):
        """Reshape x to expose the beam dim, then gather per-beam entries.

        Args:
            x (TYPE): NULL

        Returns:
            Variable
        """
        x = self.split_batch_beams(x)
        return nn_utils.batch_gather(x, beam_indices)

    next_cell_states = layers.utils.map_structure(
        lambda x: _beam_gather(x, beam_indices), next_cell_states)
    next_finished = _beam_gather(decode_states.finished, beam_indices)
    next_lens = _beam_gather(decode_states.lengths, beam_indices)
    # Only still-alive beams grow in length this step.
    next_lens = layers.elementwise_add(
        next_lens,
        layers.cast(layers.logical_not(next_finished), next_lens.dtype))
    # A beam finishes once it emits the end token.
    next_finished = layers.logical_or(
        next_finished, layers.equal(token_indices, self._end_token_tensor))

    decode_output = OutputWrapper(topk_scores, token_indices, beam_indices)
    decode_states = StateWrapper(next_cell_states, next_log_probs, next_finished,
                                 next_lens, valid_table_mask)

    return decode_output, decode_states
def is_finished(self, step_idx, source_length, alive_log_probs, finished_scores,
                finished_in_finished):
    """Decide whether beam-search decoding should continue.

    Returns a bool tensor that is True while (a) the best possible alive
    score could still beat the worst finished score, and (b) the step
    index is below the maximum decode length.

    Args:
        step_idx: current decoding step (int tensor).
        alive_log_probs: log probs of alive beams.
        finished_scores: scores of finished hypotheses.
        finished_in_finished: per-hypothesis flag marking real finished entries.
    """
    # GNMT-style length penalty ((5 + len) / 6) ** alpha evaluated at the
    # maximum length. NOTE(review): 55.0 is presumably 5.0 + 50 where 50 is
    # the max extra decode length used below — TODO confirm.
    base_1 = layers.cast(source_length, 'float32') + 55.0
    base_1 /= 6.0
    max_length_penalty = layers.pow(base_1, self.alpha)

    # Best achievable score of any alive beam after full-length penalty.
    flat_alive_log_probs = layers.reshape(alive_log_probs, [-1])
    lower_bound_alive_scores_1 = layers.gather(flat_alive_log_probs,
                                               [self.get_alive_index])
    lower_bound_alive_scores = lower_bound_alive_scores_1 / max_length_penalty

    # Worst score among genuinely finished hypotheses; if none are finished,
    # push the value to -INF so the bound can never be met.
    lowest_score_of_finished_in_finish = layers.reduce_min(
        finished_scores * finished_in_finished, dim=1)
    finished_in_finished = layers.cast(finished_in_finished, 'bool')
    lowest_score_of_finished_in_finish += \
        ((1.0 - layers.cast(layers.reduce_any(finished_in_finished, 1),
                            'float32')) * -INF)

    bound_is_met = layers.reduce_all(
        layers.greater_than(lowest_score_of_finished_in_finish,
                            lower_bound_alive_scores))

    # Hard cap: decode at most source_length + 50 steps.
    decode_length = source_length + 50
    length_cond = layers.less_than(x=step_idx, y=decode_length)
    return layers.logical_and(x=layers.logical_not(bound_is_met), y=length_cond)
def mask_fill(input, mask, value):
    """Write *value* into *input* wherever *mask* is True.

    Args:
        input: source tensor
        mask: boolean tensor, True at positions to overwrite
        value: scalar placed at masked positions

    Returns:
        Tensor equal to *input* where mask is False and *value* where
        mask is True.

    >>> input
    [
    [1, 2, 3],
    [4, 5, 6]
    ]
    >>> mask
    [
    [True, True, False],
    [True, False, False]
    ]
    >>> mask_fill(input, mask, 0)
    [
    [0, 0, 3],
    [0, 5, 6]
    ]
    """
    # keep == 1 where mask is False; fill == value where mask is True.
    keep = layers.cast(layers.logical_not(mask), input.dtype)
    fill = layers.cast(mask, input.dtype) * value
    return input * keep + fill
def _build_input_mask(self, src_ids):
    """Build a float attention mask from token ids (1.0 = real token, 0.0 = pad).

    Args:
        src_ids: int64 token-id tensor; pad id is assumed to be 0.

    Returns:
        float32 mask tensor with stop_gradient set.
    """
    zero = L.fill_constant([1], dtype='int64', value=0)
    input_mask = L.logical_not(L.equal(src_ids, zero))  # assume pad id == 0
    # FIX: cast to explicit 'float32'. The previous 'float' alias resolves to
    # float64 through numpy dtype conversion, while every other mask built in
    # this file (e.g. the ERNIE forward) is float32.
    input_mask = L.cast(input_mask, 'float32')
    input_mask.stop_gradient = True
    return input_mask
def forward(self, features):
    """Run ERNIE sequence classification.

    Args:
        features: tuple of (src_ids, sent_ids) int64 id tensors;
            pad id is assumed to be 0.

    Returns:
        Softmax probabilities in PREDICT mode, raw logits otherwise.
    """
    src_ids, sent_ids = features
    dtype = 'float16' if self.hparam['fp16'] else 'float32'
    zero = L.fill_constant([1], dtype='int64', value=0)
    input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), dtype)  # assume pad id == 0

    # Build position ids [0, 1, ..., seqlen-1] broadcast over the batch.
    d_shape = L.shape(src_ids)
    seqlen = d_shape[1]
    batch_size = d_shape[0]
    pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
    pos_ids = L.expand(pos_ids, [batch_size, 1])
    pos_ids = L.unsqueeze(pos_ids, axes=[2])
    pos_ids = L.cast(pos_ids, 'int64')
    pos_ids.stop_gradient = True
    input_mask.stop_gradient = True
    # Constant task ids; task embeddings are effectively unused at the moment.
    task_ids = L.zeros_like(src_ids) + self.hparam.task_id
    task_ids.stop_gradient = True

    bert = ErnieModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        task_ids=task_ids,
        input_mask=input_mask,
        config=self.hparam,
        use_fp16=self.hparam['fp16']
    )

    # Pooled [CLS] representation -> dropout -> linear classifier.
    cls_feats = bert.get_pooled_output()
    cls_feats = L.dropout(
        x=cls_feats,
        dropout_prob=0.1,
        dropout_implementation="upscale_in_train"
    )
    logits = L.fc(
        input=cls_feats,
        size=self.hparam['num_label'],
        param_attr=F.ParamAttr(
            name="cls_out_w",
            initializer=F.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=F.ParamAttr(
            name="cls_out_b",
            initializer=F.initializer.Constant(0.))
    )

    propeller.summary.histogram('pred', logits)

    if self.mode is propeller.RunMode.PREDICT:
        probs = L.softmax(logits)
        return probs
    else:
        return logits
def bow(ids):
    """Bag-of-words encoder: embed ids, zero out pads (id 0), sum over
    the time axis, and squash with softsign."""
    word_emb = L.embedding(
        input=ids,
        size=[self.config.vocab_size, self.config.emb_size],
        dtype=self._emb_dtype,
        param_attr=F.ParamAttr(name=self._word_emb_name,
                               initializer=self._param_initializer),
        is_sparse=False)
    pad_id = L.fill_constant(shape=[1], dtype='int64', value=0)
    # 1.0 for real tokens, 0.0 at pad positions.
    not_pad = L.cast(L.logical_not(L.equal(ids, pad_id)), 'float32')
    pooled = L.reduce_sum(word_emb * not_pad, dim=1)
    return L.softsign(pooled)
def _process_type_leaf(condition, decoder, grammar_stack, next_inputs, finished):
    """Process decoding state when the output type is LEAF.

    Pops the grammar stack, derives the next grammar action/mask from the
    popped symbol, and commits the new state only for instances where
    *condition* is True. Instances whose stack was already empty are marked
    finished.

    Args:
        condition (TYPE): bool tensor selecting instances to update
        decoder (TYPE): provides grammar_action / grammar_mask lookups
        grammar_stack (StackData): (gmr_stack_data, gmr_stack_pos)
        next_inputs (DecoderInputsWrapper): (input_var, action, grammar_mask)
        finished (TYPE): bool tensor updated in place

    Returns:
        None

    Raises:
        NULL
    """
    ## pop stack
    next_output, valid_pos, gmr_stack_tmp = data_structure.Stack.pop(
        grammar_stack, mask=True, in_place=False)
    valid_pos = fluider.squeeze(valid_pos, [1])

    ## update next grammar mask; zero rows for instances with empty stacks
    next_actions = layers.elementwise_mul(
        decoder.grammar_action(next_output),
        layers.cast(valid_pos, dtype=next_inputs.action.dtype),
        axis=0)
    next_gmr_mask = layers.elementwise_mul(
        decoder.grammar_mask(next_output),
        layers.cast(valid_pos, dtype=next_inputs.gmr_mask.dtype),
        axis=0)

    ## save result, while condition is True
    new_gmr_stack_data, new_gmr_stack_pos, new_actions, new_gmr_mask = nn_utils.ifelse(
        condition,
        [gmr_stack_tmp.data, gmr_stack_tmp.pos, next_actions, next_gmr_mask],
        [grammar_stack.data, grammar_stack.pos, next_inputs.action,
         next_inputs.gmr_mask])
    # FIX: assign the condition-gated `new_actions` (output of ifelse) rather
    # than the unconditional `next_actions`; previously `new_actions` was
    # computed but never used, so actions were overwritten even when
    # condition was False.
    layers.utils.map_structure(
        layers.assign,
        [new_gmr_stack_data, new_gmr_stack_pos, new_actions, new_gmr_mask],
        [grammar_stack.data, grammar_stack.pos, next_inputs.action,
         next_inputs.gmr_mask])
    # An instance is finished once it hits LEAF with an already-empty stack.
    layers.logical_or(finished,
                      layers.logical_and(condition,
                                         layers.logical_not(valid_pos)),
                      out=finished)
def pop(cls, stack_data, mask=True, in_place=True):
    """Pop the top element of each batched stack in stack_data.

    Args:
        stack_data (StackData): (data, pos) with shape
            ([batch_size, stack_len], [batch_size, 1])
        mask (bool): whether to zero out the value returned for stacks
            that were already empty. Defaults to True.
        in_place (bool): update stack_data in place. Defaults to True.

    Returns:
        (Variable1, Variable2, StackData)
        Variable1: popped values. dtype=stack_data.data.dtype shape=[-1]
        Variable2: whether each popped value is valid; False for stacks
            that were empty on entry. dtype=bool shape=[-1]

    Raises:
        NULL
    """
    data = stack_data.data
    pos = stack_data.pos

    # Only non-empty stacks may pop (are valid).
    valid_pos = layers.logical_not(cls.empty(stack_data))
    new_pos_delta = layers.cast(valid_pos, dtype=pos.dtype)
    # Decrement pos by 1 for valid stacks, by 0 for empty ones.
    new_pos = layers.elementwise_sub(pos, new_pos_delta)

    # shape = [batch_size]
    output = nn_utils.batch_gather(data, new_pos)

    # Mask the return value of empty stacks.
    if mask:
        # shape = [batch_size, 1]
        mask_tag = layers.cast(
            new_pos_delta,
            dtype=data.dtype) if data.dtype != pos.dtype else new_pos_delta
        mask_tag = layers.squeeze(mask_tag, [1])
        output = layers.elementwise_mul(output, mask_tag)

    # Zero the slot that was just popped.
    updates = layers.zeros_like(output)
    new_data = nn_utils.batch_scatter(data, new_pos, updates, overwrite=True,
                                      in_place=in_place)

    if in_place:
        layers.assign(new_pos, pos)
        return output, valid_pos, stack_data
    else:
        return output, valid_pos, StackData(new_data, new_pos)
def forward(self, words, feats):
    """Forward network of the biaffine dependency parser.

    Args:
        words: word-id tensor [batch_size, seq_len]; pad id == self.pad_index.
        feats: feature-id tensor aligned with words.

    Returns:
        (s_arc, s_rel): arc scores [batch, seq_len, seq_len] with padded
        target positions set to -1e5, and relation scores
        [batch, seq_len, seq_len, n_rels].
    """
    batch_size, seq_len = words.shape
    # get the mask and lengths of given batch
    mask = words != self.pad_index
    ext_words = words
    # set the indices larger than num_embeddings to unk_index
    if hasattr(self, 'pretrained'):
        ext_mask = words >= self.word_embed.weight.shape[0]
        ext_words = nn.mask_fill(words, ext_mask, self.unk_index)

    # get outputs from embedding layers
    word_embed = self.word_embed(ext_words)
    if hasattr(self, 'pretrained'):
        # add pretrained embeddings looked up with the original (unclipped) ids
        word_embed += self.pretrained(words)
    feat_embed = self.feat_embed(feats)
    word_embed, feat_embed = self.embed_dropout(word_embed, feat_embed)

    # concatenate the word and feat representations
    # embed.size = (batch, seq_len, n_embed * 2)
    embed = layers.concat((word_embed, feat_embed), axis=-1)

    if self.args.encoding_model == "lstm":
        x, _ = self.lstm(embed, mask, self.pad_index)
        x = self.lstm_dropout(x)
    else:
        _, x = self.transformer(words, word_emb=embed)

    # apply MLPs to the encoder output states
    arc_h = self.mlp_arc_h(x)
    arc_d = self.mlp_arc_d(x)
    rel_h = self.mlp_rel_h(x)
    rel_d = self.mlp_rel_d(x)

    # get arc and rel scores from the bilinear attention
    # [batch_size, seq_len, seq_len]
    s_arc = self.arc_attn(arc_d, arc_h)
    # [batch_size, seq_len, seq_len, n_rels]
    s_rel = layers.transpose(self.rel_attn(rel_d, rel_h), perm=(0, 2, 3, 1))
    # set the scores that exceed the length of each sentence to -1e5
    s_arc_mask = nn.unsqueeze(layers.logical_not(mask), 1)
    s_arc = nn.mask_fill(s_arc, s_arc_mask, -1e5)

    return s_arc, s_rel
def _check_finished(decoder, next_inputs, finished, outputs_array):
    """Check finished instances by next_inputs.action, update the finished
    tag in place, and write the END token to the output array for newly
    finished instances.

    Args:
        decoder (TYPE): provides the grammar's ACTION_STOP / END constants
        next_inputs (TYPE): wrapper whose .action is checked for STOP
        finished (TYPE): bool tensor, updated in place
        outputs_array (TYPE): array (data, pos), updated in place

    Returns:
        TODO

    Raises:
        NULL
    """
    act_stop = tensor.fill_constant_batch_size_like(
        next_inputs.action,
        shape=next_inputs.action.shape,
        value=decoder._grammar.ACTION_STOP,
        dtype='int64')
    # Newly finished = emitted STOP this step AND not already finished.
    new_finished = layers.logical_and(
        layers.equal(next_inputs.action, act_stop),
        layers.logical_not(finished))

    end_token_id = tensor.fill_constant_batch_size_like(
        outputs_array.data,
        shape=[-1],
        value=decoder._grammar.END,
        dtype=outputs_array.data.dtype)

    # Push END speculatively (out of place), then keep the pushed version
    # only for instances that just finished.
    out_data_tmp, out_pos_tmp = data_structure.Array.push(outputs_array,
                                                          end_token_id,
                                                          in_place=False)
    new_data, new_pos = nn_utils.ifelse(
        new_finished, [out_data_tmp, out_pos_tmp],
        [outputs_array.data, outputs_array.pos])

    layers.assign(new_data, outputs_array.data)
    layers.assign(new_pos, outputs_array.pos)
    layers.logical_or(finished, new_finished, out=finished)
def training_network(self, img, caption):
    """Build the training graph: encode the image, decode the caption with
    teacher forcing, and return the padding-normalized loss.

    Args:
        img: input image batch.
        caption: int64 caption ids [batch, sentence_length]; the decoder
            input is caption[:-1] and the target is caption[1:].

    Returns:
        Scalar loss averaged over non-padding target tokens.
    """
    # build caption and mask
    target = caption[:, 1:]
    source = caption[:, :-1]
    padding_filled = layers.fill_constant_batch_size_like(
        target,
        shape=[-1, decoder_config['sentence_length'] - 1],
        dtype='int64',
        value=config.dc['padding_idx'])
    # mask == 1.0 at real target tokens, 0.0 at padding.
    mask = layers.equal(target, padding_filled)
    mask = layers.cast(layers.logical_not(mask), 'float32')
    scale_factor = layers.reduce_sum(mask)
    mask.stop_gradient = True
    scale_factor.stop_gradient = True

    # mdl
    decoder = Decoder(decoder_config['hidden_dim'], rnn_layer=1)
    image_embed, global_image_feat = self._img2feature(img)  # [batch, k+1, hidden], [batch, hidden]

    # TODO: this needs rework — do the embedding either inside the RNN or
    # outside of it, not split like this.
    seq_out = decoder.call(global_image_feat, image_embed, embedding_function,
                           words=source)

    # Per-token loss, masked and normalized by the number of real tokens.
    loss = layers.squeeze(ImageCaptionModel.loss(target, seq_out), axes=[2])
    loss = layers.elementwise_mul(loss, mask)
    output_loss = layers.elementwise_div(layers.reduce_sum(loss), scale_factor,
                                         name='loss')
    return output_loss
def cond_func(step_idx, selected_ids, selected_scores, gather_idx, caches,
              trg_src_attn_bias):
    """Loop condition: keep decoding while under max_len and at least one
    beam is still alive (selected_ids not empty)."""
    under_limit = layers.less_than(x=step_idx, y=max_len)
    beams_alive = layers.logical_not(layers.is_empty(x=selected_ids))
    return layers.logical_and(x=under_limit, y=beams_alive)
def infilling_decode(self):
    """Build the ERNIE-GEN style infilling decoding graph with beam search.

    Constructs the data loader, the encoder pass, and a While loop that at
    each step feeds the previous token plus an [ATTN] query token, applies a
    length-penalized beam search, and finally decodes the finished beams.

    Returns:
        (pyreader, graph_vars) where graph_vars holds finished_ids,
        finished_scores and data_ids (all persistable).
    """
    # Dialog inputs carry an extra role/turn embedding stream.
    if self.task_type == "dialog":
        emb_num = 4
    else:
        emb_num = 3
    input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
        [[-1, self.max_seq_len, self.max_seq_len]]
    input_dtypes = ['int64'] * emb_num + ['float32']
    input_lod_levels = [0] * emb_num + [0]

    shapes = input_shapes + [[-1, self.max_seq_len, 1],
                             [-1, self.max_seq_len, 1], [-1, 1], [-1],
                             [-1, 1, self.max_seq_len], [-1, 1]]
    dtypes = input_dtypes + [
        'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
    ]
    lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

    # NOTE(review): 'to_ternsor' looks like a typo for 'to_tensor' but must
    # match the helper defined elsewhere in this class — confirm before renaming.
    inputs = self.to_ternsor(shapes, dtypes, lod_levels)
    pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                  capacity=50,
                                                  iterable=False)
    emb_ids = {}
    for key, value in zip(self.emb_keys, inputs[:emb_num]):
        emb_ids[key] = value
    input_mask = inputs[emb_num]
    tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
        -6:]

    # Encoder pass; decoding=True caches states gathered by parent_idx.
    ernie = ErnieModel(emb_ids=emb_ids,
                       input_mask=input_mask,
                       config=self.ernie_config,
                       use_fp16=self.use_fp16,
                       task_type=self.task_type,
                       decoding=True,
                       gather_idx=parent_idx)

    # Loop counters: step_idx indexes arrays, pos_idx is the position id of
    # the token being generated (starts at 1).
    max_len = layers.fill_constant(shape=[1],
                                   dtype=tgt_ids.dtype,
                                   value=self.max_dec_len,
                                   force_cpu=True)
    step_idx = layers.fill_constant(shape=[1],
                                    dtype=tgt_ids.dtype,
                                    value=0,
                                    force_cpu=True)
    pos_idx = layers.fill_constant(shape=[1],
                                   dtype=tgt_ids.dtype,
                                   value=1,
                                   force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)

    # Per-step state arrays: ids, position biases, scores, attention masks.
    ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
    pos_biases = layers.array_write(layers.reshape(tgt_pos, (-1, 1)), step_idx)
    scores = layers.array_write(init_scores, step_idx)
    tgt_masks = layers.array_write(tgt_input_mask, step_idx)

    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        pos_bias = layers.array_read(array=pos_biases, i=step_idx)
        pos_bias = layers.gather(input=pos_bias, index=parent_idx)
        tmp_mask = layers.array_read(tgt_masks, i=step_idx)

        def gen_batch_like(value, dtype="int64", shape=[-1, 1, 1],
                           is_scalar=True):
            # Broadcast a scalar (or a 1-D tensor when is_scalar=False) to
            # the current dynamic batch size derived from parent_idx.
            if is_scalar:
                return layers.fill_constant_batch_size_like(
                    input=parent_idx, value=value, shape=shape, dtype=dtype)
            else:
                return layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=parent_idx, value=1, shape=shape, dtype=dtype),
                    y=value,
                    axis=0)

        # Re-order the running mask to follow the surviving beams, then
        # extend it: previous tokens may not attend to the new slot (0),
        # the current query token may (1).
        tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
        append_0_mask = gen_batch_like(0.0, dtype=tmp_mask.dtype)
        append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
        tmp_mask = layers.concat([tmp_mask, append_1_mask], axis=2)
        pre_mask = layers.concat([tmp_mask, append_0_mask], axis=2)
        cur_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

        cur_ids = gen_batch_like(self.attn_id)
        pre_pos = gen_batch_like(step_idx, is_scalar=False)
        cur_pos = gen_batch_like(pos_idx, is_scalar=False)
        if self.continuous_position:
            pre_pos = pre_pos + pos_bias
            cur_pos = cur_pos + pos_bias

        # Feed the previous token and the [ATTN] query token together.
        dec_emb_ids = {
            "word_embedding": layers.concat([pre_ids, cur_ids], axis=1),
            "pos_embedding": layers.concat([pre_pos, cur_pos], axis=1)
        }
        if self.task_type == "dialog":
            role_ids = gen_batch_like(0)
            turn_ids = gen_batch_like(0)
            dec_emb_ids["role_embedding"] = layers.concat(
                [role_ids, role_ids], axis=1)
            dec_emb_ids["turn_embedding"] = layers.concat(
                [turn_ids, turn_ids], axis=1)
        else:
            sent_ids = gen_batch_like(self.tgt_type_id)
            dec_emb_ids["sent_embedding"] = layers.concat(
                [sent_ids, sent_ids], axis=1)

        dec_mask = layers.concat([pre_mask, cur_mask], axis=1)

        dec_out = ernie.encode(dec_emb_ids, dec_mask, parent_idx,
                               remove_query=True)
        # Logits come from the query ([ATTN]) position only.
        fc_out = self.cal_logit(dec_out[:, 1:, :], None)
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(fc_out), k=self.beam_size)

        # GNMT-style length penalty: undo the previous step's penalty and
        # apply the current step's.
        pre_lenpen = layers.pow(
            (5.0 + layers.cast(step_idx, pre_scores.dtype)) / 6.0,
            self.length_penalty)
        cur_lenpen = layers.pow(
            (5.0 + layers.cast(pos_idx, pre_scores.dtype)) / 6.0,
            self.length_penalty)
        accu_scores = layers.elementwise_add(
            x=layers.log(topk_scores), y=pre_scores * pre_lenpen,
            axis=0) / cur_lenpen

        # beam_search op uses lod to differentiate branches.
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=self.beam_size,
            end_id=self.eos_idx,
            return_parent_idx=True)

        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.increment(x=pos_idx, value=1.0, in_place=True)
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.array_write(tmp_mask, i=step_idx, array=tgt_masks)
        layers.array_write(pos_bias, i=step_idx, array=pos_biases)
        layers.assign(gather_idx, parent_idx)

        # Continue while under max_len and some beam is still alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

    graph_vars = {
        "finished_ids": finished_ids,
        "finished_scores": finished_scores,
        "data_ids": data_ids
    }
    for k, v in graph_vars.items():
        v.persistable = True
    return pyreader, graph_vars
def _run_paddle_logical_not(x):
    """Apply paddle's logical_not, first coercing *x* to bool if necessary."""
    return logical_not(cast_bool_if_necessary(x))
def beam_search():
    """Transformer inference beam search built with a fluid While loop.

    Returns:
        (finished_ids, finished_scores) decoded from the step arrays.
    """
    max_len = layers.fill_constant(shape=[1],
                                   dtype=start_tokens.dtype,
                                   value=max_out_len,
                                   force_cpu=True)
    step_idx = layers.fill_constant(shape=[1],
                                    dtype=start_tokens.dtype,
                                    value=0,
                                    force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)  # default force_cpu=True
    while_op = layers.While(cond)
    # array states will be stored for each step.
    ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)), step_idx)
    scores = layers.array_write(init_scores, step_idx)
    # cell states will be overwrited at each step.
    # caches contains states of history steps in decoder self-attention
    # and static encoder output projections in encoder-decoder attention
    # to reduce redundant computation.
    caches = [
        {
            "k":  # for self attention
            layers.fill_constant_batch_size_like(input=start_tokens,
                                                 shape=[-1, n_head, 0, d_key],
                                                 dtype=enc_output.dtype,
                                                 value=0),
            "v":  # for self attention
            layers.fill_constant_batch_size_like(input=start_tokens,
                                                 shape=[-1, n_head, 0, d_value],
                                                 dtype=enc_output.dtype,
                                                 value=0),
            "static_k":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_output.dtype),
            "static_v":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_output.dtype)
        } for i in range(n_layer)
    ]

    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        # Since beam_search_op dosen't enforce pre_ids' shape, we can do
        # inplace reshape here which actually change the shape of pre_ids.
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # gather cell states corresponding to selected parent
        pre_src_attn_bias = layers.gather(trg_src_attn_bias, index=parent_idx)
        # Position id of the current step, broadcast over the batch.
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_src_attn_bias,  # cann't use lod tensor here
                value=1,
                shape=[-1, 1, 1],
                dtype=pre_ids.dtype),
            y=step_idx,
            axis=0)
        logits = wrap_decoder(trg_vocab_size,
                              max_in_len,
                              n_layer,
                              n_head,
                              d_key,
                              d_value,
                              d_model,
                              d_inner_hid,
                              prepostprocess_dropout,
                              attention_dropout,
                              relu_dropout,
                              preprocess_cmd,
                              postprocess_cmd,
                              weight_sharing,
                              dec_inputs=(pre_ids, pre_pos, None,
                                          pre_src_attn_bias),
                              enc_output=enc_output,
                              caches=caches,
                              gather_idx=parent_idx,
                              bos_idx=bos_idx)
        # intra-beam topK
        topk_scores, topk_indices = layers.topk(input=layers.softmax(logits),
                                                k=beam_size)
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores,
                                             axis=0)
        # beam_search op uses lod to differentiate branches.
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        # topK reduction across beams, also contain special handle of
        # end beams and end sentences(batch reduction)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=eos_idx,
            return_parent_idx=True)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        # cell states(caches) have been updated in wrap_decoder,
        # only need to update beam search states here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(gather_idx, parent_idx)
        layers.assign(pre_src_attn_bias, trg_src_attn_bias)
        # Continue while under max_len and some beam is still alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=beam_size, end_id=eos_idx)
    return finished_ids, finished_scores
def beam_search():
    """Beam search with min-length forcing, GNMT length penalty roll-back,
    and optional trigram blocking, built with a fluid While loop.

    Returns:
        (finished_ids, finished_scores) decoded from the step arrays.
    """
    max_len = layers.fill_constant(shape=[1],
                                   dtype=start_tokens.dtype,
                                   value=self.max_out_len,
                                   force_cpu=True)
    min_len = layers.fill_constant(shape=[1],
                                   dtype=start_tokens.dtype,
                                   value=self.min_out_len)
    neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-INF)
    step_idx = layers.fill_constant(shape=[1],
                                    dtype=start_tokens.dtype,
                                    value=0,
                                    force_cpu=True)
    # step_next_idx leads step_idx by one; used for the current-step penalty.
    step_next_idx = layers.fill_constant(shape=[1],
                                         dtype=start_tokens.dtype,
                                         value=1,
                                         force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)  # default force_cpu=True
    while_op = layers.While(cond)
    # array states will be stored for each step.
    ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)), step_idx)
    scores = layers.array_write(init_scores, step_idx)
    # cell states will be overwrited at each step.
    # caches contains states of history steps in decoder self-attention
    # and static encoder output projections in encoder-decoder attention
    # to reduce redundant computation.
    caches = [
        {
            "k":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                dtype=enc_words_output.dtype,
                value=0),
            "v":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                dtype=enc_words_output.dtype,
                value=0),
            "static_k_word":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_words_output.dtype),
            "static_v_word":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_words_output.dtype),
            "static_k_sent":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_sents_output.dtype),
            "static_v_sent":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_sents_output.dtype)
        } for i in range(self._dec_n_layer)
    ]
    trigram_blocking = TrigramBlocking(start_tokens,
                                       self.tokenizer,
                                       use_fp16=self._use_fp16,
                                       beam_size=self.beam_size)

    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        # Since beam_search_op dosen't enforce pre_ids' shape, we can do
        # inplace reshape here which actually change the shape of pre_ids.
        # pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # gather cell states corresponding to selected parent
        pre_src_words_attn_bias = layers.gather(tgt_src_words_attn_bias,
                                                index=parent_idx)
        pre_src_sents_attn_bias = layers.gather(tgt_src_sents_attn_bias,
                                                index=parent_idx)
        pre_graph_attn_bias = layers.gather(graph_attn_bias, index=parent_idx)
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_src_sents_attn_bias,  # cann't use lod tensor here
                value=1,
                shape=[-1, 1, 1],
                dtype=pre_ids.dtype),
            y=step_idx,
            axis=0)

        logits = self.decode(
            dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias,
                       pre_src_sents_attn_bias, pre_graph_attn_bias),
            enc_words_output=enc_words_output,
            enc_sents_output=enc_sents_output,
            caches=caches,
            gather_idx=parent_idx)

        # prevent generating end token if length less than min_out_len:
        # add -INF to the EOS logit while step_idx < min_len.
        eos_index = layers.fill_constant(shape=[layers.shape(logits)[0]],
                                         dtype='int64',
                                         value=self.eos_idx)
        eos_index = fluid.one_hot(eos_index, depth=self.voc_size)
        less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len),
                                dtype='float32')
        less_val = layers.elementwise_mul(less_cond, neg_inf)
        eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
        revised_logits = layers.elementwise_add(logits, eos_val, axis=0)

        # topK reduction across beams, also contain special handle of
        # end beams and end sentences(batch reduction)
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(revised_logits), k=self.beam_size)

        # Roll-Back previous-scores for length-penalty
        # previous-scores has been length-penaltied, before this timestep
        # length-penalty, need roll-back
        # because of doing this, we need store the length-penaltied score in
        # `scores`, while calculating use the un-penaltied score
        # -> safe for step_idx == 0 (initialization state), because
        #    previous-score == 0
        pre_timestep_length_penalty = fluid.layers.pow(
            ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0),
            self.len_penalty)
        pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
            pre_scores, pre_timestep_length_penalty)

        # calc trigram-blocking delta scores for current alive sequence
        if self.block_trigram:
            trigram_blocking.update_seq(pre_ids, parent_idx)
            trigram_blocking.expand_cand_seq(topk_indices)
            fluid.layers.py_func(func=trigram_blocking.blocking_forward,
                                 x=[
                                     trigram_blocking.cand_seq,
                                     trigram_blocking.id2is_full_token
                                 ],
                                 out=trigram_blocking.delta_score_out,
                                 backward_func=None)
            layers.Print(trigram_blocking.delta_score_out,
                         summarize=100,
                         message="trigram_blocking.delta_score_out")
            pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                x=trigram_blocking.delta_score_out,
                y=pre_scores_wo_len_penalty,
                axis=0)

        # => [N, topk]
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores_wo_len_penalty,
                                             axis=0)
        # Apply the current-step length penalty.
        cur_timestep_length_penalty = layers.pow(
            ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0),
            self.len_penalty)
        curr_scores = layers.elementwise_div(accu_scores,
                                             cur_timestep_length_penalty)

        # beam_search op uses lod to differentiate branches.
        curr_scores = layers.lod_reset(curr_scores, pre_ids)
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=curr_scores,
            beam_size=self.beam_size,
            end_id=self.eos_idx,
            return_parent_idx=True)

        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.increment(x=step_next_idx, value=1.0, in_place=True)
        # cell states(caches) have been updated in wrap_decoder,
        # only need to update beam search states here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(gather_idx, parent_idx)
        layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias)
        layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias)
        layers.assign(pre_graph_attn_bias, graph_attn_bias)
        # Continue while under max_len and some beam is still alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)
    return finished_ids, finished_scores
def decoder(self, init_state):
    """Build the inference-time decoder graph.

    Constructs a fluid While loop that, starting from the encoder state
    `init_state`-era attributes on `self` (h, c, encoder_vec, encoder_vec_full,
    encoder_proj, event_embedding), repeatedly runs `self.copy_decoder` and
    keeps the top-`self.beam_size` tokens of each step.

    NOTE(review): the beam_search pruning call is commented out below and the
    top-k results are taken directly as `selected_ids`/`selected_scores`, so
    this is effectively greedy/top-k decoding, not a true beam search.

    Returns:
        (translation_ids, translation_scores): tensors produced by
        concatenating the per-step tensor arrays along axis 1.
    """
    # pd.Print(init_state)
    # define counter variable in the decoding
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=self.max_length)
    # `counter` advances each step; `static_count` stays 0 and is used to read
    # loop-invariant inputs (encoder outputs) from slot 0 of their arrays.
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
    static_count = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
    # define tensor array to save content at each time step, and write initial id, score and state
    state_h_array = pd.create_array('float32')
    pd.array_write(self.h, array=state_h_array, i=counter)
    state_c_array = pd.create_array('float32')
    pd.array_write(self.c, array=state_c_array, i=counter)
    src_indexes = fluid.layers.data(name='source_index', shape=[1], dtype='int64', lod_level=1)
    src_index_array = pd.create_array('int64')
    pd.array_write(src_indexes, array=src_index_array, i=counter)
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')
    init_ids = fluid.layers.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = fluid.layers.data(name="init_scores", shape=[1], dtype="float32", lod_level=2)
    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)
    # Loop-invariant encoder tensors are parked at index `static_count` (0)
    # so they can be re-read inside the While block at every step.
    encoder_vec_array = pd.create_array('float32')
    pd.array_write(self.encoder_vec, array=encoder_vec_array, i=static_count)
    encoder_vec_full_array = pd.create_array('float32')
    pd.array_write(self.encoder_vec_full, array=encoder_vec_full_array, i=static_count)
    encoder_proj_array = pd.create_array('float32')
    pd.array_write(self.encoder_proj, array=encoder_proj_array, i=static_count)
    event_embedding_array = pd.create_array('float32')
    pd.array_write(self.event_embedding, array=event_embedding_array, i=static_count)
    # define conditional variable to stop loop
    cond = pd.less_than(x=counter, y=array_len)
    # define while_op
    while_op = pd.While(cond=cond)
    with while_op.block():  # define the computing of each step
        # pd.Print(counter)
        # obtain input at present step of decoder, including id chosen at
        # previous step, corresponding score and state at previous step.
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_h_state = pd.array_read(array=state_h_array, i=counter)
        pre_c_state = pd.array_read(array=state_c_array, i=counter)
        # pre_score = pd.array_read(array=scores_array, i=counter)
        # NOTE(review): scores are read from slot 0 (static_count), not the
        # current step — the per-step scores written below are never re-read.
        pre_score = pd.array_read(array=scores_array, i=static_count)
        _encoder_input_ids = pd.array_read(array=src_index_array, i=static_count)
        event_embedding = pd.array_read(array=event_embedding_array, i=static_count)
        # print("pre_h_state", pre_h_state)
        encoder_vec = pd.array_read(array=encoder_vec_array, i=static_count)
        encoder_vec_full = pd.array_read(array=encoder_vec_full_array, i=static_count)
        encoder_proj = pd.array_read(array=encoder_proj_array, i=static_count)
        # # update input state as state correspondent with id chosen at previous step
        # pre_h_state_expanded = pd.sequence_expand(pre_h_state, pre_score)
        # pre_c_state_expanded = pd.sequence_expand(pre_c_state, pre_score)
        # computing logic of decoder under the same train mode, including input vector and computing unit of decoder
        # compute predicting probability of normalized word
        pre_ids_emb = pd.embedding(
            input=pre_ids,
            size=[self.target_dict_dim, self.embedding_dim],
            dtype='float32',
            param_attr=fluid.ParamAttr(name="trg_embedding"))
        # pd.Print(pre_ids_emb)
        att_context = self.simple_attention(encoder_vec, encoder_proj, pre_h_state)
        # print("att_context", att_context)
        # print("pre_ids_emb", pre_ids_emb)
        # pd.Print(att_context)
        # Copy-mechanism probability, broadcast to the encoder sequence length.
        prob_c = fluid.layers.sequence_expand_as(pre_score, encoder_vec)
        # pd.Print(prob_c)
        current_score, current_h, current_c, this_prob_c = self.copy_decoder(
            pre_ids_emb, encoder_vec, encoder_vec_full, encoder_proj,
            _encoder_input_ids, pre_ids, prob_c, att_context, pre_h_state,
            pre_c_state, event_embedding)
        # decoder_inputs = fluid.layers.concat(
        #     input=[att_context, pre_ids_emb], axis=1)
        # current_h, current_c = self.lstm_step(
        #     decoder_inputs, pre_h_state, pre_c_state, self.decoder_size)
        # # compute predicting probability of nomarlized word
        # current_score = fluid.layers.fc(input=current_h,
        #                                 size=self.target_dict_dim,
        #                                 act='softmax',
        #                                 param_attr=fluid.ParamAttr(name="out_softmax_w"),
        #                                 bias_attr=fluid.ParamAttr(name="out_softmax_b"))
        # # current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
        # #                       size=decoder_size,
        # #                       act='tanh')
        # current_state_with_lod = pd.lod_reset(x=current_h, y=pre_score)
        # current_score = pd.fc(input=current_state_with_lod,
        #                       size=self.target_dict_dim,
        #                       act='softmax',
        #                       param_attr=fluid.ParamAttr(name="out_softmax_w"),
        #                       bias_attr=fluid.ParamAttr(name="out_softmax_b"))
        # print(current_score)
        topk_scores, topk_indices = pd.topk(current_score, k=self.beam_size)
        # pd.Print(topk_indices)
        # pd.Print(topk_scores)
        # Top-k taken directly; the real beam_search pruning below is disabled.
        selected_ids, selected_scores = topk_indices, topk_scores
        # # compute accumulated score and perform beam search
        # accu_scores = pd.elementwise_add(
        #     x=pd.log(topk_scores), y=pd.reshape(pre_score, shape=[-1]), axis=0)
        # selected_ids, selected_scores = pd.beam_search(
        #     pre_ids,
        #     pre_score,
        #     topk_indices,
        #     accu_scores,
        #     self.beam_size,
        #     # end_id=self.end_id,
        #     end_id=999999,
        #     level=0)
        # pd.Print(selected_ids)
        # pd.Print(selected_scores)
        pd.increment(x=counter, value=1, in_place=True)
        # write search result and corresponding hidden layer into tensor array
        pd.array_write(current_h, array=state_h_array, i=counter)
        pd.array_write(current_c, array=state_c_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)
        # pd.Print(selected_ids)
        # pd.Print(selected_scores)
        # update condition to stop loop
        length_cond = pd.less_than(x=counter, y=array_len)
        finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
        pd.logical_and(x=length_cond, y=finish_cond, out=cond)
        # pd.Print(array_len)
    # translation_ids, translation_scores = pd.beam_search_decode(
    #     ids=ids_array, scores=scores_array, beam_size=self.beam_size, end_id=self.end_id)
    # pd.Print(translation_ids)
    # Stack the per-step arrays into dense output tensors (step axis = 1).
    translation_ids, translation_ids_index = pd.tensor_array_to_tensor(
        ids_array, axis=1)
    translation_scores, translation_scores_index = pd.tensor_array_to_tensor(
        scores_array, axis=1)
    return translation_ids, translation_scores
def _do_beam_search(trg_vocab_size, max_in_len, n_layer, n_head, d_key,
                    d_value, d_model, d_inner_hid, prepostprocess_dropout,
                    attention_dropout, relu_dropout, preprocess_cmd,
                    postprocess_cmd, weight_sharing, beam_size, max_len,
                    bos_idx, eos_idx, ids, scores, parent_idx,
                    trg_src_attn_bias, caches, enc_output, step_idx):
    """Build the transformer beam-search decode loop.

    Runs a fluid While loop that, at each step, calls `wrap_decoder` on the
    previous step's tokens, takes the intra-beam top-k, then applies the
    `beam_search` op across beams. State is carried in the tensor arrays
    `ids`/`scores` and in-graph via `assign` to `parent_idx` and
    `trg_src_attn_bias`; nothing is returned — callers read the updated
    arrays after the loop.
    """
    cond = layers.less_than(x=step_idx, y=max_len)  # default force_cpu=True
    while_op = layers.While(cond)
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        # Since beam_search_op dosen't enforce pre_ids' shape, we can do
        # inplace reshape here which actually change the shape of pre_ids.
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # gather cell states corresponding to selected parent
        pre_src_attn_bias = layers.gather(trg_src_attn_bias, index=parent_idx)
        # Position id for every alive beam is simply the current step index.
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_src_attn_bias,  # cann't use lod tensor here
                value=1,
                shape=[-1, 1, 1],
                dtype=pre_ids.dtype),
            y=step_idx,
            axis=0)
        logits = wrap_decoder(trg_vocab_size, max_in_len, n_layer, n_head,
                              d_key, d_value, d_model, d_inner_hid,
                              prepostprocess_dropout, attention_dropout,
                              relu_dropout, preprocess_cmd, postprocess_cmd,
                              weight_sharing,
                              dec_inputs=(pre_ids, pre_pos, None,
                                          pre_src_attn_bias),
                              enc_output=enc_output,
                              caches=caches,
                              gather_idx=parent_idx,
                              bos_idx=bos_idx)
        # intra-beam topK
        topk_scores, topk_indices = layers.topk(input=layers.softmax(logits),
                                                k=beam_size)
        # Accumulate log-probabilities on top of the parent beam's score.
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores, axis=0)
        # beam_search op uses lod to differentiate branches.
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        # topK reduction across beams, also contain special handle of
        # end beams and end sentences(batch reduction)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=eos_idx,
            return_parent_idx=True)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        # cell states(caches) have been updated in wrap_decoder,
        # only need to update beam search states here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(gather_idx, parent_idx)
        layers.assign(pre_src_attn_bias, trg_src_attn_bias)
        # Loop continues while under max_len and some beam is still alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)
def inference(self, model, inputs, outputs):
    """Run generation inference (beam search or sampling).

    Builds a fluid While loop that decodes step by step with
    `model._generation_network` / `model._calc_logits`, applying unk/mask
    penalties, a minimum-length eos penalty, and one of three strategies:
    "beam_search", "sampling*", or "topk_sampling*" (the sampling strategies
    run through the beam_search op with beam_size 1).

    Args:
        model(object): A generate model. Need to implement
            `_generation_network` and `_calc_logits`.
        inputs(dict): Its key is input name(str) and its value is a Variable.
        outputs: unused here; kept for interface compatibility.

    Returns:
        dict(str:Variable): Its key is output name(str) and its
            value is a Variable.
    """
    # prepare while loop
    max_len = layers.fill_constant(
        shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True)
    min_len = layers.fill_constant(
        shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True)
    step_idx = layers.fill_constant(
        shape=[1], dtype="int64", value=0, force_cpu=True)
    # Tensor arrays carrying per-step decode state; slot 0 = initial values.
    ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx)
    pos_biases = layers.array_write(layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
    scores = layers.array_write(inputs["init_score"], step_idx)
    tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"], step_idx)
    parent_idx = inputs["parent_idx"]
    if self.decoding_strategy == "beam_search":
        beam_size = self.beam_size
    else:
        # Sampling strategies still go through the beam_search op, with a
        # single beam.
        beam_size = 1
    # Additive -1e9 logit penalties: eos (until min_len), unk and mask.
    eos_penalty = np.zeros(self.vocab_size, dtype="float32")
    eos_penalty[self.eos_id] = -1e9
    eos_penalty = layers.assign(eos_penalty)
    token_penalty = np.zeros(self.vocab_size, dtype="float32")
    token_penalty[self.unk_id] = -1e9
    if self.mask_id >= 0:
        token_penalty[self.mask_id] = -1e9
    token_penalty = layers.assign(token_penalty)
    # start while loop
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        pos_bias = layers.array_read(array=pos_biases, i=step_idx)
        pos_bias = layers.gather(input=pos_bias, index=parent_idx)
        tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask, i=step_idx)
        dtype = tmp_tgt_generation_mask.dtype
        # Extend the attention mask by one column for the newly generated token,
        # then re-gather rows to follow each beam's parent.
        append_mask = layers.fill_constant_batch_size_like(
            input=pre_ids, value=1.0, shape=[-1, 1, 1], dtype=dtype)
        tmp_tgt_generation_mask = layers.concat(
            [tmp_tgt_generation_mask, append_mask], axis=2)
        pre_mask = tmp_tgt_generation_mask = layers.gather(
            input=tmp_tgt_generation_mask, index=parent_idx)
        pre_sent = layers.fill_constant_batch_size_like(
            input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype)
        if self.continuous_position:
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype),
                y=step_idx,
                axis=0) + pos_bias
        else:
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype),
                y=step_idx,
                axis=0)
        if self.use_role:
            pre_role = layers.fill_constant_batch_size_like(
                input=pre_mask, value=0, shape=[-1, 1, 1], dtype=pre_ids.dtype)
        else:
            pre_role = None
        dec_out, _ = model._generation_network(
            token_ids=pre_ids,
            type_ids=pre_sent,
            pos_ids=pre_pos,
            role_ids=pre_role,
            generation_mask=tmp_tgt_generation_mask,
            gather_idx=parent_idx)
        logits = model._calc_logits(dec_out)
        # ignore unk and mask token
        if self.ignore_unk:
            logits = layers.elementwise_add(logits, token_penalty, axis=1)
        # min dec length
        min_len_cond = layers.less_than(x=step_idx, y=min_len)
        def min_len_penalty():
            """Plus minimum length penalty."""
            return layers.elementwise_add(logits, eos_penalty, axis=1)
        def no_penalty():
            """No penalty."""
            return logits
        logits = layers.case([(min_len_cond, min_len_penalty)], default=no_penalty)
        # get probs
        probs = layers.softmax(logits / self.temperature)
        if self.decoding_strategy == "beam_search":
            topk_scores, topk_indices = layers.topk(
                input=probs, k=beam_size)
        else:
            if self.decoding_strategy.startswith("sampling"):
                sampling_ids = layers.sampling_id(probs, dtype="int")
            elif self.decoding_strategy.startswith("topk_sampling"):
                # Zero out everything below the k-th largest prob, renormalize
                # by the top-k mass, sample, then restore full probs for scoring.
                topk_probs, _ = layers.topk(input=probs, k=self.topk)
                ge_cond = layers.cast(
                    layers.greater_equal(
                        probs,
                        layers.unsqueeze(topk_probs[:, -1], [1])),
                    "float32")
                old_probs = probs
                probs = probs * ge_cond / layers.reduce_sum(
                    topk_probs, dim=-1, keep_dim=True)
                sampling_ids = layers.sampling_id(probs, dtype="int")
                probs = old_probs
            else:
                raise ValueError(self.decoding_strategy)
            # Encode the sampled id as a 1-column "top-k": its own prob for
            # the chosen token, a large negative score everywhere else.
            sampling_scores = layers.one_hot(
                layers.unsqueeze(sampling_ids, [1]), probs.shape[1]
            )
            sampling_scores = sampling_scores * probs - (1 - sampling_scores) * 1e3
            topk_scores, topk_indices = layers.topk(
                input=sampling_scores, k=1)
        pre_len = layers.cast(step_idx, "float32")
        layers.increment(x=step_idx, value=1.0, in_place=True)
        cur_len = layers.cast(step_idx, "float32")
        # update scores
        if self.length_average:
            # Running mean of log-probs over the generated length.
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores * pre_len, axis=0) / cur_len
        elif self.length_penalty > 0:
            # GNMT-style length penalty ((5 + len) / 6) ** alpha.
            pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
            cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores * pre_lp, axis=0) / cur_lp
        else:
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores, axis=0)
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=self.eos_id,
            return_parent_idx=True)
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
        layers.array_write(pos_bias, i=step_idx, array=pos_biases)
        layers.assign(gather_idx, parent_idx)
        # Continue while under max_len and at least one beam is alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)
    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=beam_size, end_id=self.eos_id)
    predictions = {
        "finished_ids": finished_ids,
        "finished_scores": finished_scores,
        "token_ids": inputs["token_ids"],
        "data_id": inputs["data_id"]
    }
    return predictions
def decode_with_grammar(decoder, inits, decode_vocab, max_step_num, **kwargs):
    """A modification of paddle.fluid.layers.dynamic_decode(...).

    Dynamic decoding performs :code:`decoder.step()` repeatedly until
    the returned Tensor indicating finished status contains all True
    values or the number of decoding step reachs to :attr:`max_step_num`.

    :code:`decoder.initialize()` would be called once before the decoding loop.
    If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
    would be called once after the decoding loop.

    Args:
        decoder(Decoder): An instance of `Decoder`.
        inits(tuple): Argument passed to `decoder.initialize`.
        decode_vocab(DecoderDynamicVocab): namedtuple(table table_len column
                column_len value value_len)
        max_step_num(int): The maximum number of steps.
        **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`.

    Returns:
        tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \
               outputs and states, both are Tensor or nested structure of Tensor. \
               `final_outputs` has the same structure and data types as \
               :code:`decoder.output_dtype` , and each Tenser in `final_outputs` \
               is the stacked of all decoding steps' outputs, which might be revised \
               by :code:`decoder.finalize` . `final_states` is the counterpart \
               at last time step of initial states returned by :code:`decoder.initialize` , \
               thus has the same structure with it and has tensors with same shapes \
               and data types.
    """
    step_cnt = tensor.fill_constant(shape=[1], dtype="int64", value=1)
    # NOTE(review): the step budget is max_step_num - 2 — presumably reserving
    # slots for special begin/end tokens; confirm against the caller.
    max_step_num_tensor = tensor.fill_constant(
        shape=[1], dtype="int64", value=max_step_num - 2)
    # shape = [batch_size, beam_size, ...]
    initial_inputs, initial_states, initial_finished = decoder.initialize(
        inits, decode_vocab)
    global_inputs, global_states, global_finished = (initial_inputs,
                                                     initial_states,
                                                     initial_finished)
    inputs = initial_inputs
    states = initial_states
    # Buffers holding the decoded outputs (data + write position per beam).
    outputs_arr_data = tensor.fill_constant_batch_size_like(
        inputs.input,
        shape=[-1, decoder.beam_size, max_step_num],
        dtype=decoder.output_dtype.predicted_ids,
        value=0)
    outputs_arr_pos = tensor.fill_constant_batch_size_like(
        inputs.input, shape=[-1, decoder.beam_size, 1], dtype='int64', value=0)
    outputs_array = data_structure.ArrayData(
        decoder.merge_batch_beams(outputs_arr_data),
        decoder.merge_batch_beams(outputs_arr_pos))
    sequence_lengths = tensor.cast(tensor.zeros_like(initial_finished), "int64")
    # Stack structures that track the grammar constraints during decoding.
    grammar_stack_dat = tensor.fill_constant_batch_size_like(
        inputs.input,
        shape=[-1, decoder.beam_size, max_step_num * STACK_EXPAND_TIMES],
        dtype='int64',
        value=0)
    grammar_stack_pos = tensor.fill_constant_batch_size_like(
        inputs.input, shape=[-1, decoder.beam_size, 1], dtype='int64', value=0)
    grammar_stack = data_structure.StackData(
        decoder.merge_batch_beams(grammar_stack_dat),
        decoder.merge_batch_beams(grammar_stack_pos))
    ############ decode in a loop until everything is finished ############
    # Termination: global_finished/next_finished all True, or step budget hit.
    cond = layers.logical_not((layers.reduce_all(initial_finished)))
    while_op = layers.While(cond)
    with while_op.block():
        # step_outputs --> OutputWrapper
        # next_states --> StateWrapper
        # next_inputs --> DecoderInputsWrapper
        step_outputs, next_states, next_inputs = decoder.step(
            inputs, states, **kwargs)
        predicted_ids = step_outputs.predicted_ids
        _save_predict_output(outputs_array, predicted_ids,
                             next_states.finished)
        # Branch on the grammar type of the predicted token (leaf vs middle).
        pred_gmr_type = decoder.grammar_type(predicted_ids)
        cond_type_leaf = layers.equal(pred_gmr_type, decoder.GMR_TYPE.LEAF)
        cond_type_midd = layers.equal(pred_gmr_type, decoder.GMR_TYPE.MID)
        _process_type_leaf(cond_type_leaf, decoder, grammar_stack, next_inputs,
                           next_states.finished)
        _process_type_midd(cond_type_midd, decoder, grammar_stack, next_inputs,
                           predicted_ids)
        ##next_sequence_lengths = layers.elementwise_add(sequence_lengths,
        ##        tensor.cast(layers.logical_not(global_finished), sequence_lengths.dtype))
        _check_finished(decoder, next_inputs, next_states.finished,
                        outputs_array)
        # Write this step's state back into the loop-carried variables.
        layers.utils.map_structure(tensor.assign, next_inputs, global_inputs)
        layers.utils.map_structure(tensor.assign, next_states, global_states)
        tensor.assign(next_states.finished, global_finished)
        ##tensor.assign(next_sequence_lengths, sequence_lengths)
        # update the loop condition
        layers.increment(x=step_cnt, value=1.0, in_place=True)
        layers.logical_and(
            layers.logical_not(layers.reduce_all(next_states.finished)),
            layers.less_equal(step_cnt, max_step_num_tensor), cond)
    final_outputs = outputs_array.data
    final_states = global_states
    final_outputs, final_states = decoder.finalize(final_outputs,
                                                   global_states,
                                                   sequence_lengths)
    return final_outputs, final_states
def gru_attention_infer(self, decoder_boot, max_length, char_num,
                        word_vector_dim, encoded_vector, encoded_proj,
                        decoder_size):
    """Greedy (beam_size=1) GRU-attention decoding for inference.

    Starting from the boot state `decoder_boot`, repeatedly attends over the
    encoder outputs, runs one gru_unit step, takes the top-1 character, and
    appends it to `full_ids` until `max_length` steps have run.

    Args:
        decoder_boot: initial decoder hidden state.
        max_length: maximum number of decode steps.
        char_num: size of the character vocabulary (output dimension).
        word_vector_dim: embedding size of the previous character.
        encoded_vector / encoded_proj: encoder outputs consumed by
            `self.simple_attention`.
        decoder_size: GRU hidden size.

    Returns:
        Variable `full_ids`: per-sample sequence of predicted character ids
        (first column is the constant 1 written at initialization).
    """
    init_state = decoder_boot
    beam_size = 1
    array_len = layers.fill_constant(
        shape=[1], dtype='int64', value=max_length)
    counter = layers.zeros(shape=[1], dtype='int64', force_cpu=True)
    # fill the first element with init_state
    state_array = layers.create_array('float32')
    layers.array_write(init_state, array=state_array, i=counter)
    # ids, scores as memory
    ids_array = layers.create_array('int64')
    scores_array = layers.create_array('float32')
    # Build a [0, 1, ..., batch_size] lod boundary so each sample is its own
    # one-element sequence for the lod-based ops below.
    rois_shape = layers.shape(init_state)
    batch_size = layers.slice(
        rois_shape, axes=[0], starts=[0], ends=[1]) + 1
    lod_level = layers.range(
        start=0, end=batch_size, step=1, dtype=batch_size.dtype)
    init_ids = layers.fill_constant_batch_size_like(
        input=init_state, shape=[-1, 1], value=0, dtype='int64')
    init_ids = layers.lod_reset(init_ids, lod_level)
    init_ids = layers.lod_append(init_ids, lod_level)
    init_scores = layers.fill_constant_batch_size_like(
        input=init_state, shape=[-1, 1], value=1, dtype='float32')
    init_scores = layers.lod_reset(init_scores, init_ids)
    layers.array_write(init_ids, array=ids_array, i=counter)
    layers.array_write(init_scores, array=scores_array, i=counter)
    # Dense accumulator of all chosen ids, grown by one column per step.
    full_ids = fluid.layers.fill_constant_batch_size_like(
        input=init_state, shape=[-1, 1], dtype='int64', value=1)
    cond = layers.less_than(x=counter, y=array_len)
    while_op = layers.While(cond=cond)
    with while_op.block():
        pre_ids = layers.array_read(array=ids_array, i=counter)
        pre_state = layers.array_read(array=state_array, i=counter)
        pre_score = layers.array_read(array=scores_array, i=counter)
        pre_ids_emb = layers.embedding(
            input=pre_ids,
            size=[char_num, word_vector_dim],
            dtype='float32')
        context = self.simple_attention(encoded_vector, encoded_proj,
                                        pre_state, decoder_size)
        # expand the recursive_sequence_lengths of pre_state
        # to be the same with pre_score
        pre_state_expanded = layers.sequence_expand(pre_state, pre_score)
        context_expanded = layers.sequence_expand(context, pre_score)
        fc_1 = layers.fc(input=context_expanded,
                         size=decoder_size * 3,
                         bias_attr=False,
                         name="rnn_fc1")
        fc_2 = layers.fc(input=pre_ids_emb,
                         size=decoder_size * 3,
                         bias_attr=False,
                         name="rnn_fc2")
        decoder_inputs = fc_1 + fc_2
        current_state, _, _ = layers.gru_unit(
            input=decoder_inputs,
            hidden=pre_state_expanded,
            size=decoder_size * 3)
        current_state_with_lod = layers.lod_reset(
            x=current_state, y=pre_score)
        # use score to do beam search
        current_score = layers.fc(input=current_state_with_lod,
                                  size=char_num,
                                  bias_attr=True,
                                  act='softmax',
                                  name="rnn_out_fc")
        topk_scores, topk_indices = layers.topk(current_score, k=beam_size)
        # Append this step's choice to the running id matrix.
        new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1)
        fluid.layers.assign(new_ids, full_ids)
        layers.increment(x=counter, value=1, in_place=True)
        # update the memories
        layers.array_write(current_state, array=state_array, i=counter)
        layers.array_write(topk_indices, array=ids_array, i=counter)
        layers.array_write(topk_scores, array=scores_array, i=counter)
        # update the break condition:
        # up to the max length or all candidates of
        # source sentences have ended.
        length_cond = layers.less_than(x=counter, y=array_len)
        finish_cond = layers.logical_not(layers.is_empty(x=topk_indices))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)
    return full_ids
def beam_search():
    """Build the transformer beam-search loop (closure over outer scope).

    Uses variables from the enclosing scope: start_tokens, init_scores,
    trg_src_attn_bias, enc_output, the model hyper-parameters passed to
    wrap_decoder, and the *_softmax_shape bookkeeping tensors.
    States are selected per step with sequence_expand (lod-based gather)
    rather than a parent-index gather.

    Returns:
        (finished_ids, finished_scores) from layers.beam_search_decode.
    """
    max_len = layers.fill_constant(
        shape=[1], dtype=start_tokens.dtype, value=max_out_len)
    step_idx = layers.fill_constant(
        shape=[1], dtype=start_tokens.dtype, value=0)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    # array states will be stored for each step.
    ids = layers.array_write(start_tokens, step_idx)
    scores = layers.array_write(init_scores, step_idx)
    # cell states will be overwrited at each step.
    # caches contains states of history steps to reduce redundant
    # computation in decoder.
    caches = [{
        "k": layers.fill_constant_batch_size_like(
            input=start_tokens,
            shape=[-1, 0, d_model],
            dtype=enc_output.dtype,
            value=0),
        "v": layers.fill_constant_batch_size_like(
            input=start_tokens,
            shape=[-1, 0, d_model],
            dtype=enc_output.dtype,
            value=0)
    } for i in range(n_layer)]
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # sequence_expand can gather sequences according to lod thus can be
        # used in beam search to sift states corresponding to selected ids.
        pre_src_attn_bias = layers.sequence_expand(
            x=trg_src_attn_bias, y=pre_scores)
        pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
        pre_caches = [{
            "k": layers.sequence_expand(
                x=cache["k"], y=pre_scores),
            "v": layers.sequence_expand(
                x=cache["v"], y=pre_scores),
        } for cache in caches]
        # Position id = step_idx + 1, broadcast to the (expanded) batch.
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_enc_output,  # cann't use pre_ids here since it has lod
                value=1,
                shape=[-1, 1],
                dtype=pre_ids.dtype),
            y=layers.increment(
                x=step_idx, value=1.0, in_place=False),
            axis=0)
        logits = wrap_decoder(
            trg_vocab_size,
            max_in_len,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            dropout_rate,
            weight_sharing,
            dec_inputs=(
                pre_ids, pre_pos, None, pre_src_attn_bias, trg_data_shape,
                slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape,
                src_attn_pre_softmax_shape, src_attn_post_softmax_shape),
            enc_output=pre_enc_output,
            caches=pre_caches)
        # intra-beam top-k over the next-token distribution
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(logits), k=beam_size)
        accu_scores = layers.elementwise_add(
            x=layers.log(topk_scores),
            y=layers.reshape(
                pre_scores, shape=[-1]),
            axis=0)
        # beam_search op uses lod to distinguish branches.
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        selected_ids, selected_scores = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=eos_idx)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        # update states
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(pre_src_attn_bias, trg_src_attn_bias)
        layers.assign(pre_enc_output, enc_output)
        for i in range(n_layer):
            layers.assign(pre_caches[i]["k"], caches[i]["k"])
            layers.assign(pre_caches[i]["v"], caches[i]["v"])
        # Advance the pre/post-softmax shape accumulators for the next step.
        layers.assign(
            layers.elementwise_add(
                x=slf_attn_pre_softmax_shape,
                y=attn_pre_softmax_shape_delta),
            slf_attn_pre_softmax_shape)
        layers.assign(
            layers.elementwise_add(
                x=slf_attn_post_softmax_shape,
                y=attn_post_softmax_shape_delta),
            slf_attn_post_softmax_shape)
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)
    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=beam_size, end_id=eos_idx)
    return finished_ids, finished_scores
def forward(self, x, y):
    """One TV-L1 optical-flow solve between two consecutive frames.

    Runs ``self.n_iter`` primal-dual iterations of the TV-L1 energy
    minimization for the flow (u1, u2) that warps frame ``x`` toward
    frame ``y``.

    Args:
        x: earlier frame tensor.
        y: later frame tensor (x and y are one frame apart).

    Returns:
        Tuple (u1, u2): the two flow components, same shape as ``x``.
    """
    u1 = zeros_like(x)
    u2 = zeros_like(x)
    l_t = self.l * self.t   # lambda * theta: data-term thresholding radius
    taut = self.a / self.t  # tau / theta: dual ascent step size

    # Image gradients of the target frame.
    grad2_x = self.conv_img_grad(y)
    # grad2_x[:, :, :, 0] = 0.5 * (x[:, :, :, 1] - x[:, :, :, 0])
    # grad2_x[:, :, :, -1] = 0.5 * (x[:, :, :, -1] - x[:, :, :, -2])
    grad2_y = self.conv_img_grad2(y)
    # grad2_y[:, :, 0, :] = 0.5 * (x[:, :, 1, :] - x[:, :, 0, :])
    # grad2_y[:, :, -1, :] = 0.5 * (x[:, :, -1, :] - x[:, :, -2, :])

    # Dual variables of the TV regularizer.
    p11 = zeros_like(x)
    p12 = zeros_like(x)
    p21 = zeros_like(x)
    p22 = zeros_like(x)

    gsqx = grad2_x**2
    gsqy = grad2_y**2
    # |grad I|^2; the epsilon keeps the division in the mask3 branch finite.
    grad = gsqx + gsqy + 1e-12
    # Constant part of the brightness-constancy residual (u1 == u2 == 0 here).
    rho_c = y - grad2_x * u1 - grad2_y * u2 - x

    for _ in range(self.n_iter):
        rho = rho_c + grad2_x * u1 + grad2_y * u2 + 1e-12

        # Three-way soft-thresholding cases of the L1 data term.
        mask1 = rho < -l_t * grad
        mask2 = rho > l_t * grad
        mask3 = logical_and(logical_not(logical_or(mask1, mask2)),
                            (grad > 1e-12))
        mask1 = cast(mask1, dtype='float32')
        mask2 = cast(mask2, dtype='float32')
        mask3 = cast(mask3, dtype='float32')
        # Masks are selection constants, not differentiable decisions.
        mask1.stop_gradient = True
        mask2.stop_gradient = True
        mask3.stop_gradient = True

        # v = u + thresholded step, i.e.
        #   v1 = u1 + l_t*grad2_x*mask1 - l_t*grad2_x*mask2
        #           - (rho/grad)*grad2_x*mask3
        v1 = elementwise_add(
            u1,
            elementwise_add(
                elementwise_mul(l_t * grad2_x, mask1),
                elementwise_add(
                    elementwise_mul(-l_t * grad2_x, mask2),
                    elementwise_mul(-elementwise_div(rho, grad),
                                    elementwise_mul(grad2_x, mask3)))))
        v2 = elementwise_add(
            u2,
            elementwise_add(
                elementwise_mul(l_t * grad2_y, mask1),
                elementwise_add(
                    elementwise_mul(-l_t * grad2_y, mask2),
                    elementwise_mul(-elementwise_div(rho, grad),
                                    elementwise_mul(grad2_y, mask3)))))
        del rho
        del mask1
        del mask2
        del mask3
        # BUGFIX: the original code executed `v1 += u1` and `v2 += u2` here,
        # but u1/u2 are already folded into v1/v2 by the elementwise_add(u1,
        # ...) above, so u was added twice per iteration — contradicting both
        # the TV-L1 update (v = u + delta) and the commented-out reference
        # formula in this function. The redundant additions are removed.
        # (Also removed: dead `v1 = zeros_like(x)` initializers and the no-op
        # `u1 = u1; u2 = u2` statements.)

        # Primal update: v plus theta times the divergence of the duals.
        u1 = v1 + self.t * self.divergence(p11, p12)
        u2 = v2 + self.t * self.divergence(p21, p22)
        del v1
        del v2

        # Dual update from the forward gradients of the refreshed flow.
        u1x, u1y = self.forward_grad(u1)
        u2x, u2y = self.forward_grad(u2)
        p11 = (p11 + taut * u1x) / (1. + taut * sqrt(u1x**2 + u1y**2 + 1e-12))
        p12 = (p12 + taut * u1y) / (1. + taut * sqrt(u1x**2 + u1y**2 + 1e-12))
        p21 = (p21 + taut * u2x) / (1. + taut * sqrt(u2x**2 + u2y**2 + 1e-12))
        p22 = (p22 + taut * u2y) / (1. + taut * sqrt(u2x**2 + u2y**2 + 1e-12))
        del u1x
        del u1y
        del u2x
        del u2y

    return u1, u2
def decoder_decode(context, is_sparse):
    """Build the inference-time beam-search decoder graph.

    At each step the previously selected candidate ids and the previous
    hidden state are fed through an fc cell; the resulting word distribution
    is pruned with top-k followed by the beam_search op. Decoding stops when
    `max_length` steps have run or no candidate sentence remains alive.

    Args:
        context: encoder output used as the initial decoder state.
        is_sparse: whether the target-embedding lookup uses sparse updates.

    Returns:
        (translation_ids, translation_scores) from pd.beam_search_decode.
    """
    init_state = context
    max_len_tensor = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    step_counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # Per-step memories: hidden state plus surviving candidate ids/scores.
    # Slot 0 of each array carries the initial values.
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=step_counter)

    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')
    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)
    pd.array_write(init_ids, array=ids_array, i=step_counter)
    pd.array_write(init_scores, array=scores_array, i=step_counter)

    cond = pd.less_than(x=step_counter, y=max_len_tensor)
    while_op = pd.While(cond=cond)
    with while_op.block():
        prev_ids = pd.array_read(array=ids_array, i=step_counter)
        prev_state = pd.array_read(array=state_array, i=step_counter)
        prev_score = pd.array_read(array=scores_array, i=step_counter)

        # Replicate the previous state so every surviving candidate beam
        # owns a copy (lod boundaries taken from prev_score).
        expanded_state = pd.sequence_expand(prev_state, prev_score)

        prev_ids_emb = pd.embedding(
            input=prev_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=is_sparse)

        # One recurrent step of the decoder cell.
        new_state = pd.fc(input=[expanded_state, prev_ids_emb],
                          size=decoder_size,
                          act='tanh')
        new_state_with_lod = pd.lod_reset(x=new_state, y=prev_score)

        # Word distribution for this step, then beam-search pruning.
        word_probs = pd.fc(input=new_state_with_lod,
                           size=target_dict_dim,
                           act='softmax')
        topk_scores, topk_indices = pd.topk(word_probs, k=beam_size)
        # Accumulate log-probabilities only for the top-k survivors to save
        # computation.
        accu_scores = pd.elementwise_add(
            x=pd.log(topk_scores),
            y=pd.reshape(prev_score, shape=[-1]),
            axis=0)
        selected_ids, selected_scores = pd.beam_search(
            prev_ids,
            prev_score,
            topk_indices,
            accu_scores,
            beam_size,
            end_id=10,
            level=0)

        pd.increment(x=step_counter, value=1, in_place=True)
        # Persist this step's state and surviving candidates.
        pd.array_write(new_state, array=state_array, i=step_counter)
        pd.array_write(selected_ids, array=ids_array, i=step_counter)
        pd.array_write(selected_scores, array=scores_array, i=step_counter)

        # Keep looping while under max length and some candidate is alive.
        pd.logical_and(
            x=pd.less_than(x=step_counter, y=max_len_tensor),
            y=pd.logical_not(pd.is_empty(x=selected_ids)),
            out=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)
    # return init_ids, init_scores
    return translation_ids, translation_scores
def fast_decode(self):
    """Build the inference (generation) graph: data loader + beam search.

    Declares the feed tensors, instantiates the UNIMO encoder in decoding
    mode, then unrolls a fluid while-loop performing beam search with
    GNMT-style length penalty, an optional minimum-length constraint on
    <eos>, and optional trigram blocking.

    Returns:
        (pyreader, graph_vars): the DataLoader feeding the graph, and a dict
        with persistable outputs `finished_ids`, `finished_scores`,
        `data_ids`.

    NOTE(review): depends on names defined elsewhere in this file/module
    (`fluid`, `layers`, `UNIMOModel`, `TrigramBlocking`) and on many
    attributes of `self` (max_seq_len, emb_keys, gene_config, task_type,
    beam_size, eos_id, vocab_size, length_penalty, block_trigram,
    max/min_out_len, tgt_type_id, tokenizer).
    """
    # Dialog adds role/turn embeddings, hence one extra int64 input.
    if self.task_type == "dialog":
        emb_num = 4
    else:
        emb_num = 3
    input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
        [[-1, self.max_seq_len, self.max_seq_len]]
    input_dtypes = ['int64'] * emb_num + ['float32']
    input_lod_levels = [0] * emb_num + [0]

    # Trailing six inputs: tgt_ids, tgt_pos, init_scores, parent_idx,
    # tgt_input_mask, data_ids (see the unpack below).
    shapes = input_shapes + [[-1, 1, 1], [-1, 1, 1], [-1, 1], [-1],
                             [-1, 1, self.max_seq_len], [-1, 1]]
    dtypes = input_dtypes + [
        'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
    ]
    # lod_level=2 on the first three: (sentence, beam candidate) nesting
    # required by the beam_search op.
    lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

    inputs = self.to_tensor(shapes, dtypes, lod_levels)
    pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                  capacity=70,
                                                  iterable=False)

    emb_ids = {}
    for key, value in zip(self.emb_keys, inputs[:emb_num]):
        emb_ids[key] = value
    input_mask = inputs[emb_num]
    tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
        -6:]

    # Encoder pass in decoding mode; gather_idx lets cached states be
    # reordered as beams are pruned.
    unimo = UNIMOModel(emb_ids=emb_ids,
                       input_mask=input_mask,
                       config=self.gene_config,
                       task_type=self.task_type,
                       decoding=True,
                       gather_idx=parent_idx)

    # Loop bounds and counters; force_cpu so the While condition is host-side.
    max_len = layers.fill_constant(shape=[1],
                                   dtype=tgt_ids.dtype,
                                   value=self.max_out_len,
                                   force_cpu=True)
    min_len = layers.fill_constant(shape=[1],
                                   dtype=tgt_ids.dtype,
                                   value=self.min_out_len,
                                   force_cpu=True)
    neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-1e18)
    step_idx = layers.fill_constant(shape=[1],
                                    dtype=tgt_ids.dtype,
                                    value=0,
                                    force_cpu=True)
    # step_next_idx tracks step_idx + 1; used for the current-step length
    # penalty before step_idx itself is incremented.
    step_next_idx = layers.fill_constant(shape=[1],
                                         dtype=tgt_ids.dtype,
                                         value=1,
                                         force_cpu=True)

    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)

    # Per-step tensor arrays holding the beam state across iterations.
    ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
    pos_biases = layers.array_write(tgt_pos, step_idx)
    scores = layers.array_write(init_scores, step_idx)
    tgt_masks = layers.array_write(tgt_input_mask, step_idx)

    trigram_blocking = TrigramBlocking(tgt_ids,
                                       self.tokenizer,
                                       beam_size=self.beam_size)

    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        pos_bias = layers.array_read(array=pos_biases, i=step_idx)
        # Reorder by parent_idx so rows track the surviving beams.
        pos_bias = layers.gather(input=pos_bias, index=parent_idx)

        def gen_batch_like(value,
                           dtype="int64",
                           shape=[-1, 1, 1],
                           is_scalar=True):
            """generate batch

            Broadcast `value` to the current batch size (taken from
            parent_idx): a constant fill when `is_scalar`, otherwise an
            elementwise product with a ones tensor.
            """
            if is_scalar:
                return layers.fill_constant_batch_size_like(
                    input=parent_idx, value=value, shape=shape, dtype=dtype)
            else:
                return layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=parent_idx, value=1, shape=shape, dtype=dtype),
                    y=value,
                    axis=0)

        # Extend the attention mask by one column of 1s for the new token.
        tmp_mask = layers.array_read(tgt_masks, i=step_idx)
        tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
        append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
        pre_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

        pre_pos = gen_batch_like(step_idx, is_scalar=False)
        pre_pos = pre_pos + pos_bias  ####################### pos start from 2
        pre_sent = gen_batch_like(self.tgt_type_id, dtype=pre_ids.dtype)

        dec_emb_ids = {"word_embedding": pre_ids, "pos_embedding": pre_pos}
        if self.task_type == "dialog":
            role_ids = gen_batch_like(0)
            turn_ids = gen_batch_like(0)
            dec_emb_ids["role_embedding"] = role_ids
            dec_emb_ids["turn_embedding"] = turn_ids
        else:
            dec_emb_ids["sent_embedding"] = pre_sent

        # One decoder step; caches inside the model are gathered by parent_idx.
        dec_out = unimo.encode(emb_ids=dec_emb_ids,
                               input_mask=pre_mask,
                               gather_idx=parent_idx)
        fc_out = self.cal_logit(dec_out, None)

        # prevent generating end token if length less than min_out_len:
        # while step_idx < min_len, add -1e18 to the <eos> logit only.
        eos_index = layers.fill_constant(shape=[layers.shape(fc_out)[0]],
                                         dtype='int64',
                                         value=self.eos_id)
        eos_index = fluid.one_hot(eos_index, depth=self.vocab_size)
        less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len),
                                dtype='float32')
        less_val = layers.elementwise_mul(less_cond, neg_inf)
        eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
        revised_logits = layers.elementwise_add(fc_out, eos_val, axis=0)

        # topK reduction across beams, also contain special handle of
        # end beams and end sentences(batch reduction)
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(revised_logits), k=self.beam_size)

        # Roll-Back previous-scores for length-penalty
        # previous-scores has been length-penaltied, before this timestep
        # length-penalty, need roll-back
        # because of doing this, we need store the length-penaltied score in
        # `scores` while calculating use the un-penaltied score
        # -> safe for step_idx == 0 (initialization state), because
        # previous-score == 0
        pre_timestep_length_penalty = fluid.layers.pow(
            ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0),
            self.length_penalty)
        pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
            pre_scores, pre_timestep_length_penalty)

        # calc trigram-blocking delta scores for current alive sequence
        if self.block_trigram:
            trigram_blocking.update_seq(pre_ids, parent_idx)
            trigram_blocking.expand_cand_seq(topk_indices)
            # py_func runs the blocking logic in Python; delta_score_out holds
            # a large negative penalty for candidates repeating a trigram.
            fluid.layers.py_func(func=trigram_blocking.blocking_forward,
                                 x=[
                                     trigram_blocking.cand_seq,
                                     trigram_blocking.id2is_full_token
                                 ],
                                 out=trigram_blocking.delta_score_out,
                                 backward_func=None)
            pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                x=trigram_blocking.delta_score_out,
                y=pre_scores_wo_len_penalty,
                axis=0)
        # => [N, topk]
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores_wo_len_penalty,
                                             axis=0)
        # Apply this step's length penalty (GNMT-style ((5+len)/6)^alpha).
        cur_timestep_length_penalty = layers.pow(
            ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0),
            self.length_penalty)
        curr_scores = layers.elementwise_div(accu_scores,
                                             cur_timestep_length_penalty)

        # beam_search op uses lod to differentiate branches.
        curr_scores = layers.lod_reset(curr_scores, pre_ids)
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=curr_scores,
            beam_size=self.beam_size,
            end_id=self.eos_id,
            return_parent_idx=True)

        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.increment(x=step_next_idx, value=1.0, in_place=True)
        # cell states(caches) have been updated in wrap_decoder,
        # only need to update beam search states here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.array_write(pre_mask, i=step_idx, array=tgt_masks)
        layers.array_write(pos_bias, i=step_idx, array=pos_biases)
        # Propagate beam reordering to the next iteration's gathers.
        layers.assign(gather_idx, parent_idx)

        # Continue while under max length AND some beam is still alive;
        # logical_and writes into `cond` in place to drive the While op.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    # Backtrack the arrays into complete finished sequences per beam.
    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=self.beam_size, end_id=self.eos_id)

    graph_vars = {
        "finished_ids": finished_ids,
        "finished_scores": finished_scores,
        "data_ids": data_ids
    }

    for k, v in graph_vars.items():
        v.persistable = True

    return pyreader, graph_vars