def simple_net(self): d0 = layers.data( "d0", shape=[10], append_batch_size=False, dtype='float32') d1 = layers.data( "d1", shape=[10], append_batch_size=False, dtype='float32') d2 = layers.data( "d2", shape=[10], append_batch_size=False, dtype='float32') # fill_constant npu op doesn't support int64 i = layers.zeros(shape=[1], dtype='int32') i = layers.cast(i, 'int64') i.stop_gradient = True init = layers.zeros(shape=[10], dtype='float32') mem_array = layers.array_write(x=init, i=i) data_array = layers.array_write(x=d0, i=i) i = layers.increment(i) layers.array_write(d1, i, array=data_array) i = layers.increment(i) layers.array_write(d2, i, array=data_array) i = layers.zeros(shape=[1], dtype='int32') i = layers.cast(i, 'int64') i.stop_gradient = True array_len = layers.fill_constant(shape=[1], dtype='int32', value=5) array_len = layers.cast(array_len, 'int64') array_len.stop_gradient = True cond = layers.ones(shape=[1], dtype='int32') cond = layers.cast(cond, 'bool') j = layers.fill_constant(shape=[1], dtype='int32', value=1) j = layers.cast(j, 'int64') j.stop_gradient = True array_len2 = layers.fill_constant(shape=[1], dtype='int32', value=3) array_len2 = layers.cast(array_len2, 'int64') array_len2.stop_gradient = True cond2 = layers.logical_or(x=j, y=array_len2) cond2 = layers.ones(shape=[1], dtype='int32') cond2 = layers.cast(cond2, 'bool') while_op = layers.While(cond=cond) while_op2 = layers.While(cond=cond2) with while_op.block(): d = layers.array_read(array=data_array, i=i) prev = layers.array_read(array=mem_array, i=i) result = layers.sums(input=[d, prev]) i = layers.increment(x=i, in_place=True) layers.array_write(result, i=i, array=mem_array) layers.less_than(x=i, y=array_len, cond=cond) with while_op2.block(): d2 = layers.array_read(array=data_array, i=j) prev2 = layers.array_read(array=mem_array, i=j) result2 = layers.sums(input=[d2, prev2]) j = layers.increment(x=j, in_place=True) layers.array_write(result2, i=j, array=mem_array) layers.less_than(x=j, y=array_len2, cond=cond2) sum_result = layers.array_read(array=mem_array, i=j) loss = layers.mean(sum_result) return loss, sum_result
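# The test above leans on the fluid While contract: `cond` must be a bool tensor of
# shape [1] that the loop body overwrites in place (here via layers.less_than(...,
# cond=cond) or layers.assign); a freshly created tensor would not be seen by the
# while_op and the loop would never terminate. A minimal, self-contained sketch of
# that pattern, assuming the same fluid 1.x layers API used above (names below are
# illustrative, not part of the original test):
import paddle.fluid as fluid
import paddle.fluid.layers as layers

def minimal_while_counter(limit=5):
    i = layers.fill_constant(shape=[1], dtype='int64', value=0)
    n = layers.fill_constant(shape=[1], dtype='int64', value=limit)
    cond = layers.less_than(x=i, y=n)
    while_op = layers.While(cond=cond)
    with while_op.block():
        i = layers.increment(x=i, in_place=True)
        layers.less_than(x=i, y=n, cond=cond)  # rewrite the same cond tensor
    return i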
def test_exceptions(self):
    i = layers.zeros(shape=[2], dtype='int64')
    array_len = layers.fill_constant(shape=[2], dtype='int64', value=1)
    cond = layers.less_than(x=i, y=array_len)
    with self.assertRaises(TypeError):
        layers.While(cond=cond)
    cond = layers.cast(cond, dtype='float64')
    with self.assertRaises(TypeError):
        layers.While(cond=cond)
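# Both failures above are the intended behaviour: layers.While type-checks its
# condition and raises TypeError unless `cond` is a bool Variable with exactly one
# element. The shape-[2] comparison (first case) and the float64 cast (second case)
# each violate that. For reference, a condition that While does accept looks like:
import paddle.fluid.layers as layers
i = layers.zeros(shape=[1], dtype='int64')
limit = layers.fill_constant(shape=[1], dtype='int64', value=1)
valid_cond = layers.less_than(x=i, y=limit)  # bool Variable of shape [1]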
def build_and_run_program(place, batch_size, beam_size, stop_gradient=False): fluid.default_startup_program().random_seed = 1 fluid.default_main_program().random_seed = 1 np.random.seed(2) x = layers.assign( np.random.rand(batch_size, beam_size, 32).astype("float32")) indices = fluid.data(shape=[None, beam_size], dtype="int64", name="indices") step_idx = layers.fill_constant( shape=[1], dtype="int64", value=0, force_cpu=True) max_len = layers.fill_constant( shape=[1], dtype="int64", value=10, force_cpu=True) cond = layers.less_than(x=step_idx, y=max_len) while_op = layers.While(cond) scores = layers.array_write(x, step_idx) with while_op.block(): bs = layers.cast(layers.shape(x)[0], "int64") for _ in range(20): bs = layers.cast(bs, 'int64') bs.stop_gradient = stop_gradient batch_pos = layers.expand( layers.unsqueeze( layers.range( 0, bs, 1, dtype=bs.dtype), [1]), [1, beam_size]) topk_coordinates = layers.stack([batch_pos, indices], axis=2) topk_coordinates.stop_gradient = stop_gradient score = layers.gather_nd(x, topk_coordinates) layers.increment(x=step_idx, value=1.0, in_place=True) layers.array_write(score, i=step_idx, array=scores) length_cond = layers.less_than(x=step_idx, y=max_len) layers.assign(length_cond, cond) out = layers.tensor_array_to_tensor(scores, axis=0, use_stack=True)[0] loss = layers.reduce_mean(out) opt = fluid.optimizer.Adam(0.01) opt.minimize(loss) exe = fluid.Executor(place) data = np.random.random_integers( low=0, high=beam_size - 1, size=(batch_size, beam_size)).astype("int64") loss_val, = exe.run(feed={"indices": data}, fetch_list=[loss]) return loss_val
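# layers.stack([batch_pos, indices], axis=2) above builds per-element (row, beam)
# coordinates so that layers.gather_nd selects one beam entry per slot, i.e.
# score[b, j] = x[b, indices[b, j]]. The NumPy equivalent of that gather, with
# illustrative shapes:
import numpy as np
x = np.random.rand(4, 3, 32).astype("float32")        # [batch, beam, hidden]
idx = np.random.randint(0, 3, size=(4, 3))             # [batch, beam]
score = x[np.arange(4)[:, None], idx]                  # same result as gather_nd
assert score.shape == (4, 3, 32)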
def _push_to_stack(gmr_desc, gmr_pos, gmr_lens, gmr_stack_info):
    """Push the grammar ids in gmr_desc between gmr_pos and gmr_lens onto gmr_stack,
    and update step_gmr_pos.

    Args:
        gmr_desc (TYPE): NULL
        gmr_pos (TYPE): NULL
        gmr_lens (TYPE): NULL
        gmr_stack_info (tuple): [in/out] (gmr_stack, gmr_stack_pos)

    Returns:
        tuple (gmr_stack, gmr_stack_pos)

    Raises: NULL
    """
    gmr_stack, gmr_stack_pos = gmr_stack_info
    mv_step = layers.cast(
        layers.greater_than(gmr_lens, layers.zeros_like(gmr_lens)),
        dtype=gmr_lens.dtype)
    gmr_mv_pos = layers.elementwise_sub(gmr_lens, mv_step)

    cond = layers.reduce_any(layers.greater_than(gmr_mv_pos, gmr_pos))
    while_op = layers.While(cond)
    with while_op.block():
        gmr_ids = nn_utils.batch_gather(gmr_desc, gmr_mv_pos)
        gmr_stack_tmp, gmr_stack_pos_tmp = data_structure.Stack.push(
            gmr_stack_info, gmr_ids, in_place=False)

        mv_cond = layers.greater_than(gmr_mv_pos, gmr_pos)
        gmr_mv_pos_tmp = fluider.elementwise_sub(gmr_mv_pos, mv_cond, force=True)
        new_gmr_stack, new_gmr_stack_pos = nn_utils.ifelse(
            mv_cond, [gmr_stack_tmp, gmr_stack_pos_tmp],
            [gmr_stack, gmr_stack_pos])

        layers.utils.map_structure(layers.assign,
                                   [new_gmr_stack, new_gmr_stack_pos],
                                   [gmr_stack, gmr_stack_pos])
        layers.assign(gmr_mv_pos_tmp, gmr_mv_pos)
        layers.assign(
            layers.reduce_any(layers.greater_than(gmr_mv_pos, gmr_pos)), cond)

    return gmr_stack, gmr_stack_pos
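# Per example, the loop above walks gmr_desc backwards from index gmr_lens - 1 down
# to gmr_pos + 1 and pushes each grammar id, so that later pops come out in forward
# order. A plain-Python sketch of that per-example behaviour (the real code runs
# batched over all beams; this helper is only illustrative):
def push_to_stack_single(gmr_desc, gmr_pos, gmr_len, stack):
    mv = gmr_len - 1 if gmr_len > 0 else gmr_len
    while mv > gmr_pos:
        stack.append(gmr_desc[mv])
        mv -= 1
    return stack

# push_to_stack_single([7, 8, 9], gmr_pos=0, gmr_len=3, stack=[]) -> [9, 8]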
def beam_search(): max_len = layers.fill_constant(shape=[1], dtype=start_tokens.dtype, value=max_out_len, force_cpu=True) step_idx = layers.fill_constant(shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True) cond = layers.less_than(x=step_idx, y=max_len) # default force_cpu=True while_op = layers.While(cond) # array states will be stored for each step. ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)), step_idx) scores = layers.array_write(init_scores, step_idx) # cell states will be overwrited at each step. # caches contains states of history steps in decoder self-attention # and static encoder output projections in encoder-decoder attention # to reduce redundant computation. caches = [ { "k": # for self attention layers.fill_constant_batch_size_like( input=start_tokens, shape=[-1, n_head, 0, d_key], dtype=enc_output.dtype, value=0), "v": # for self attention layers.fill_constant_batch_size_like( input=start_tokens, shape=[-1, n_head, 0, d_value], dtype=enc_output.dtype, value=0), "static_k": # for encoder-decoder attention layers.create_tensor(dtype=enc_output.dtype), "static_v": # for encoder-decoder attention layers.create_tensor(dtype=enc_output.dtype) } for i in range(n_layer) ] with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) # Since beam_search_op dosen't enforce pre_ids' shape, we can do # inplace reshape here which actually change the shape of pre_ids. pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) pre_scores = layers.array_read(array=scores, i=step_idx) # gather cell states corresponding to selected parent pre_src_attn_bias = layers.gather(trg_src_attn_bias, index=parent_idx) pre_pos = layers.elementwise_mul( x=layers.fill_constant_batch_size_like( input=pre_src_attn_bias, # cann't use lod tensor here value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype), y=step_idx, axis=0) logits = wrap_decoder(trg_vocab_size, max_in_len, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias), enc_output=enc_output, caches=caches, gather_idx=parent_idx, bos_idx=bos_idx) # intra-beam topK topk_scores, topk_indices = layers.topk( input=layers.softmax(logits), k=beam_size) accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores, axis=0) # beam_search op uses lod to differentiate branches. accu_scores = layers.lod_reset(accu_scores, pre_ids) # topK reduction across beams, also contain special handle of # end beams and end sentences(batch reduction) selected_ids, selected_scores, gather_idx = layers.beam_search( pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices, scores=accu_scores, beam_size=beam_size, end_id=eos_idx, return_parent_idx=True) layers.increment(x=step_idx, value=1.0, in_place=True) # cell states(caches) have been updated in wrap_decoder, # only need to update beam search states here. layers.array_write(selected_ids, i=step_idx, array=ids) layers.array_write(selected_scores, i=step_idx, array=scores) layers.assign(gather_idx, parent_idx) layers.assign(pre_src_attn_bias, trg_src_attn_bias) length_cond = layers.less_than(x=step_idx, y=max_len) finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) finished_ids, finished_scores = layers.beam_search_decode( ids, scores, beam_size=beam_size, end_id=eos_idx) return finished_ids, finished_scores
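# Each iteration above converts softmax probabilities into accumulated beam scores:
# accu_scores = log(topk_scores) + pre_scores, broadcast over the beam axis, and the
# loop stops once step_idx reaches max_len or beam_search leaves selected_ids empty.
# A tiny numeric check of the accumulation step (values are illustrative):
import numpy as np
pre_scores = np.array([-0.5, -1.2], dtype="float32")           # one score per beam
topk_probs = np.array([[0.6, 0.3], [0.5, 0.4]], dtype="float32")
accu_scores = np.log(topk_probs) + pre_scores[:, None]
# e.g. beam 0, candidate 0: log(0.6) - 0.5 ~= -1.01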
def call(self, global_img_feat, p_img_feat, embedding_fn, words=None):
    # image features
    img_feat = layers.fc(p_img_feat, self.hid_size, num_flatten_dims=2, act='tanh')  # [batch, k, hid]
    img_feat_emb = layers.fc(p_img_feat, self.hid_size, num_flatten_dims=2)
    if self.mode == 'eval':
        word = layers.fill_constant_batch_size_like(global_img_feat, [-1],
                                                    dtype='int64',
                                                    value=config.data['start_idx'])
    else:
        words = layers.transpose(words, [1, 0])  # [seq, batch]
        words.stop_gradient = True
    # LSTM state initialization
    hid, cell = create_zero_state(global_img_feat), create_zero_state(global_img_feat)

    # while-loop parameter initialization
    mx = decoder_config['sentence_length'] - 1 if self.mode == 'train' else decoder_config['infer_max_length']
    if self.mode == 'eval':
        mx = decoder_config['infer_max_length']
        while_op_output = layers.create_array('int64')
    else:
        while_op_output = layers.create_array('float32')
    max_step = layers.fill_constant(shape=[1], dtype='int64', value=mx)
    step = layers.fill_constant(shape=[1], dtype='int64', value=0)
    cond = layers.less_than(step, max_step)
    while_op = layers.While(cond)

    with while_op.block():
        if self.mode == 'train':
            st = layers.cast(step, 'int32')
            word = layers.slice(words, axes=[0], starts=st, ends=st + 1)
            word = layers.squeeze(word, [0])
            word.stop_gradient = True
        word_emb = embedding_fn(word)
        # maybe using + here would work better?
        xt = layers.concat([word_emb, global_img_feat], axis=-1)  # [batch, feat]
        h, c = layers.lstm_unit(xt, hid, cell,
                                param_attr=fluid.ParamAttr('lstm_w'),
                                bias_attr=fluid.ParamAttr('lstm_b'))
        p_word_emb = layers.fc(xt, size=self.hid_size)
        p_hidden = layers.fc(hid, size=self.hid_size)
        sentinel_gate = layers.sigmoid(p_word_emb + p_hidden)  # [batch, hidden]
        sentinel = layers.elementwise_mul(sentinel_gate, layers.tanh(c))  # [batch, hidden]

        layers.assign(h, hid)
        layers.assign(c, cell)

        k = layers.shape(p_img_feat)[1]
        p_hid = layers.fc(h, self.hid_size, act='tanh')
        # attention part
        # alpha
        hid_emb = layers.fc(p_hid, self.hid_size)  # [batch, hidden]
        exp_hid_emb = layers.expand(layers.unsqueeze(hid_emb, 1), [1, k + 1, 1])  # [batch, k+1, hidden]
        sentinel_emb = layers.unsqueeze(layers.fc(sentinel, self.hid_size), axes=1)  # [batch, 1, hidden]
        feat_emb = layers.concat([img_feat_emb, sentinel_emb], axis=1)  # [batch, k+1, hidden]
        z = layers.tanh(feat_emb + exp_hid_emb)  # [batch, k+1, 1]
        alpha = layers.fc(z, size=1, num_flatten_dims=2, act='softmax')  # [batch, k+1, 1]

        # context vector
        context = layers.concat([img_feat, layers.unsqueeze(sentinel, axes=1)], axis=1)  # [batch, k+1, hidden]
        context = layers.elementwise_mul(context, alpha, axis=0)
        context = layers.reduce_mean(context, dim=1)  # [batch, hidden]

        out = layers.fc(context + p_hid, self.hid_size, act='tanh')
        word_pred = weight_tying_fc(out)  # [batch, vocab]

        if self.mode == 'eval':
            next_word = layers.argmax(word_pred, axis=-1)
            layers.assign(next_word, word)
            next_word = layers.cast(next_word, 'float32')
            layers.array_write(next_word, step, array=while_op_output)
        else:
            layers.array_write(word_pred, step, array=while_op_output)

        layers.increment(step)
        layers.less_than(step, max_step, cond=cond)

    if self.mode == 'train':
        output_time_major, _ = layers.tensor_array_to_tensor(while_op_output, axis=0, use_stack=True)
        output = layers.transpose(output_time_major, [1, 0, 2])
    else:
        output_time_major = layers.tensor_array_to_tensor(while_op_output, axis=0, use_stack=True)[0]
        output = layers.transpose(output_time_major, [1, 0])
    return output
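# The loop body above is one step of adaptive attention: a visual sentinel
# s = sigmoid(W_x x + W_h h) * tanh(c) is appended to the k image features, softmax
# weights alpha are computed over the k + 1 slots, and the context is the
# alpha-weighted features reduced over that axis (reduce_mean in the code above).
# A NumPy sketch of the mixing step, with illustrative shapes and names:
import numpy as np
k, hid = 5, 8
img_feat = np.random.rand(k, hid).astype("float32")
sentinel = np.random.rand(1, hid).astype("float32")
alpha = np.random.rand(k + 1, 1).astype("float32")
alpha /= alpha.sum()                                    # stands in for the softmax
context = (np.concatenate([img_feat, sentinel], axis=0) * alpha).mean(axis=0)
assert context.shape == (hid,)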
def run_main(self, place, with_data_parallel): self.place = place self.with_data_parallel = with_data_parallel if not core.is_compiled_with_cuda() and isinstance( self.place, core.CUDAPlace): return if isinstance(self.place, core.CUDAPlace): device_cnt = core.get_cuda_device_count( ) if self.with_data_parallel else 1 else: device_cnt = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count()) ) if self.with_data_parallel else 1 d0 = layers.data("d0", shape=[10], append_batch_size=False, dtype='float32') d1 = layers.data("d1", shape=[10], append_batch_size=False, dtype='float32') d2 = layers.data("d2", shape=[10], append_batch_size=False, dtype='float32') i = layers.zeros(shape=[1], dtype='int64') i.stop_gradient = True init = layers.zeros(shape=[10], dtype='float32') mem_array = layers.array_write(x=init, i=i) data_array = layers.array_write(x=d0, i=i) i = layers.increment(i) layers.array_write(d1, i, array=data_array) i = layers.increment(i) layers.array_write(d2, i, array=data_array) i = layers.zeros(shape=[1], dtype='int64') i.stop_gradient = True array_len = layers.fill_constant(shape=[1], dtype='int64', value=1) array_len.stop_gradient = True cond = layers.less_than(x=i, y=array_len) j = layers.fill_constant(shape=[1], dtype='int64', value=1) j.stop_gradient = True array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3) array_len2.stop_gradient = True cond2 = layers.less_than(x=j, y=array_len2) while_op = layers.While(cond=cond) while_op2 = layers.While(cond=cond2) with while_op.block(): d = layers.array_read(array=data_array, i=i) prev = layers.array_read(array=mem_array, i=i) d = layers.reshape(d, shape=[10]) prev = layers.reshape(prev, shape=[10]) result = layers.sums(input=[d, prev]) i = layers.increment(x=i, in_place=True) layers.array_write(result, i=i, array=mem_array) layers.less_than(x=i, y=array_len, cond=cond) with while_op2.block(): d2 = layers.array_read(array=data_array, i=j) prev2 = layers.array_read(array=mem_array, i=j) d2 = layers.reshape(d2, shape=[10]) prev2 = layers.reshape(prev2, shape=[10]) result2 = layers.sums(input=[d2, prev2]) j = layers.increment(x=j, in_place=True) layers.array_write(result2, i=j, array=mem_array) layers.less_than(x=j, y=array_len2, cond=cond2) sum_result = layers.array_read(array=mem_array, i=j) sum_result.persistable = True tmp = layers.unsqueeze(sum_result, axes=[0]) tmp = layers.expand(tmp, expand_times=[10, 1]) fc = layers.fc(tmp, size=256) loss = layers.mean(sum_result) optim = fluid.optimizer.Adam(learning_rate=1e-3) optim.minimize(loss) exe = Executor(self.place) exe.run(fluid.default_startup_program()) prog = fluid.default_main_program() if self.with_data_parallel: prog = compiler.CompiledProgram( fluid.default_main_program()).with_data_parallel( loss_name=loss.name) for _ in range(5): d = [] for i in range(3): tmp = numpy.random.random(size=[10]).astype('float32') if not self.with_data_parallel: d.append(tmp) else: d.append(numpy.array([tmp] * device_cnt)) outs = exe.run(program=prog, feed={ 'd0': d[0], 'd1': d[1], 'd2': d[2] }, fetch_list=[sum_result]) self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
def _do_beam_search(trg_vocab_size, max_in_len, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, beam_size, max_len, bos_idx, eos_idx, ids, scores, parent_idx, trg_src_attn_bias, caches, enc_output, step_idx): """ do beam search """ cond = layers.less_than(x=step_idx, y=max_len) # default force_cpu=True while_op = layers.While(cond) with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) # Since beam_search_op dosen't enforce pre_ids' shape, we can do # inplace reshape here which actually change the shape of pre_ids. pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) pre_scores = layers.array_read(array=scores, i=step_idx) # gather cell states corresponding to selected parent pre_src_attn_bias = layers.gather(trg_src_attn_bias, index=parent_idx) pre_pos = layers.elementwise_mul( x=layers.fill_constant_batch_size_like( input=pre_src_attn_bias, # cann't use lod tensor here value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype), y=step_idx, axis=0) logits = wrap_decoder(trg_vocab_size, max_in_len, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias), enc_output=enc_output, caches=caches, gather_idx=parent_idx, bos_idx=bos_idx) # intra-beam topK topk_scores, topk_indices = layers.topk(input=layers.softmax(logits), k=beam_size) accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores, axis=0) # beam_search op uses lod to differentiate branches. accu_scores = layers.lod_reset(accu_scores, pre_ids) # topK reduction across beams, also contain special handle of # end beams and end sentences(batch reduction) selected_ids, selected_scores, gather_idx = layers.beam_search( pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices, scores=accu_scores, beam_size=beam_size, end_id=eos_idx, return_parent_idx=True) layers.increment(x=step_idx, value=1.0, in_place=True) # cell states(caches) have been updated in wrap_decoder, # only need to update beam search states here. layers.array_write(selected_ids, i=step_idx, array=ids) layers.array_write(selected_scores, i=step_idx, array=scores) layers.assign(gather_idx, parent_idx) layers.assign(pre_src_attn_bias, trg_src_attn_bias) length_cond = layers.less_than(x=step_idx, y=max_len) finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) layers.logical_and(x=length_cond, y=finish_cond, out=cond)
def beam_search(): max_len = layers.fill_constant( shape=[1], dtype=start_tokens.dtype, value=max_out_len) step_idx = layers.fill_constant( shape=[1], dtype=start_tokens.dtype, value=0) cond = layers.less_than(x=step_idx, y=max_len) while_op = layers.While(cond) # array states will be stored for each step. ids = layers.array_write(start_tokens, step_idx) scores = layers.array_write(init_scores, step_idx) # cell states will be overwrited at each step. # caches contains states of history steps to reduce redundant # computation in decoder. caches = [{ "k": layers.fill_constant_batch_size_like( input=start_tokens, shape=[-1, 0, d_model], dtype=enc_output.dtype, value=0), "v": layers.fill_constant_batch_size_like( input=start_tokens, shape=[-1, 0, d_model], dtype=enc_output.dtype, value=0) } for i in range(n_layer)] with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) pre_scores = layers.array_read(array=scores, i=step_idx) # sequence_expand can gather sequences according to lod thus can be # used in beam search to sift states corresponding to selected ids. pre_src_attn_bias = layers.sequence_expand( x=trg_src_attn_bias, y=pre_scores) pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores) pre_caches = [{ "k": layers.sequence_expand( x=cache["k"], y=pre_scores), "v": layers.sequence_expand( x=cache["v"], y=pre_scores), } for cache in caches] pre_pos = layers.elementwise_mul( x=layers.fill_constant_batch_size_like( input=pre_enc_output, # cann't use pre_ids here since it has lod value=1, shape=[-1, 1], dtype=pre_ids.dtype), y=layers.increment( x=step_idx, value=1.0, in_place=False), axis=0) logits = wrap_decoder( trg_vocab_size, max_in_len, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, dropout_rate, weight_sharing, dec_inputs=( pre_ids, pre_pos, None, pre_src_attn_bias, trg_data_shape, slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, src_attn_post_softmax_shape), enc_output=pre_enc_output, caches=pre_caches) topk_scores, topk_indices = layers.topk( input=layers.softmax(logits), k=beam_size) accu_scores = layers.elementwise_add( x=layers.log(topk_scores), y=layers.reshape( pre_scores, shape=[-1]), axis=0) # beam_search op uses lod to distinguish branches. topk_indices = layers.lod_reset(topk_indices, pre_ids) selected_ids, selected_scores = layers.beam_search( pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices, scores=accu_scores, beam_size=beam_size, end_id=eos_idx) layers.increment(x=step_idx, value=1.0, in_place=True) # update states layers.array_write(selected_ids, i=step_idx, array=ids) layers.array_write(selected_scores, i=step_idx, array=scores) layers.assign(pre_src_attn_bias, trg_src_attn_bias) layers.assign(pre_enc_output, enc_output) for i in range(n_layer): layers.assign(pre_caches[i]["k"], caches[i]["k"]) layers.assign(pre_caches[i]["v"], caches[i]["v"]) layers.assign( layers.elementwise_add( x=slf_attn_pre_softmax_shape, y=attn_pre_softmax_shape_delta), slf_attn_pre_softmax_shape) layers.assign( layers.elementwise_add( x=slf_attn_post_softmax_shape, y=attn_post_softmax_shape_delta), slf_attn_post_softmax_shape) length_cond = layers.less_than(x=step_idx, y=max_len) finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) finished_ids, finished_scores = layers.beam_search_decode( ids, scores, beam_size=beam_size, end_id=eos_idx) return finished_ids, finished_scores
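# This older variant keeps beams apart with LoD: sequence_expand replicates encoder
# states so every surviving hypothesis of a source sentence gets its own copy,
# instead of gathering by parent index as in the newer loops above. A rough NumPy
# analogue of that row replication (repeat counts stand in for the LoD of
# pre_scores; purely illustrative):
import numpy as np
enc_output = np.arange(6, dtype="float32").reshape(2, 3)      # 2 source sentences
beams_per_sentence = np.array([2, 3])                         # from pre_scores' LoD
expanded = np.repeat(enc_output, beams_per_sentence, axis=0)  # shape [5, 3]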
def test_simple_forward(self): d0 = layers.data("d0", shape=[10], append_batch_size=False, dtype='float32') d1 = layers.data("d1", shape=[10], append_batch_size=False, dtype='float32') d2 = layers.data("d2", shape=[10], append_batch_size=False, dtype='float32') i = layers.zeros(shape=[1], dtype='int64') i.stop_gradient = True init = layers.zeros(shape=[10], dtype='float32') mem_array = layers.array_write(x=init, i=i) data_array = layers.array_write(x=d0, i=i) i = layers.increment(i) layers.array_write(d1, i, array=data_array) i = layers.increment(i) layers.array_write(d2, i, array=data_array) i = layers.zeros(shape=[1], dtype='int64') i.stop_gradient = True array_len = layers.fill_constant(shape=[1], dtype='int64', value=1) array_len.stop_gradient = True cond = layers.less_than(x=i, y=array_len) j = layers.fill_constant(shape=[1], dtype='int64', value=1) j.stop_gradient = True array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3) array_len2.stop_gradient = True cond2 = layers.less_than(x=j, y=array_len2) while_op = layers.While(cond=cond) while_op2 = layers.While(cond=cond2) with while_op.block(): d = layers.array_read(array=data_array, i=i) prev = layers.array_read(array=mem_array, i=i) result = layers.sums(input=[d, prev]) i = layers.increment(x=i, in_place=True) layers.array_write(result, i=i, array=mem_array) layers.less_than(x=i, y=array_len, cond=cond) with while_op2.block(): d2 = layers.array_read(array=data_array, i=j) prev2 = layers.array_read(array=mem_array, i=j) result2 = layers.sums(input=[d2, prev2]) j = layers.increment(x=j, in_place=True) layers.array_write(result2, i=j, array=mem_array) layers.less_than(x=j, y=array_len2, cond=cond2) sum_result = layers.array_read(array=mem_array, i=j) loss = layers.mean(sum_result) append_backward(loss) cpu = core.CPUPlace() exe = Executor(cpu) d = [] for i in range(3): d.append(numpy.random.random(size=[10]).astype('float32')) outs = exe.run(feed={ 'd0': d[0], 'd1': d[1], 'd2': d[2] }, fetch_list=[sum_result]) self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
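# The assertion above holds because, with array_len = 1 and array_len2 = 3, the two
# chained loops add d0, d1 and d2 into mem_array one step at a time, so the fetched
# sum_result equals d0 + d1 + d2. A NumPy re-implementation of that reference
# computation (illustrative helper, not part of the test):
import numpy as np

def reference_sum(d0, d1, d2, len1=1, len2=3):
    data = [d0, d1, d2]
    mem = [np.zeros(10, dtype=np.float32) for _ in range(len2 + 1)]
    i = 0
    while i < len1:                  # mirrors the first while_op block
        mem[i + 1] = data[i] + mem[i]
        i += 1
    j = 1
    while j < len2:                  # mirrors the second while_op block
        mem[j + 1] = data[j] + mem[j]
        j += 1
    return mem[j]                    # sum_result is read at the final index j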
def gru_attention_infer(self, decoder_boot, max_length, char_num, word_vector_dim, encoded_vector, encoded_proj, decoder_size): init_state = decoder_boot beam_size = 1 array_len = layers.fill_constant( shape=[1], dtype='int64', value=max_length) counter = layers.zeros(shape=[1], dtype='int64', force_cpu=True) # fill the first element with init_state state_array = layers.create_array('float32') layers.array_write(init_state, array=state_array, i=counter) # ids, scores as memory ids_array = layers.create_array('int64') scores_array = layers.create_array('float32') rois_shape = layers.shape(init_state) batch_size = layers.slice( rois_shape, axes=[0], starts=[0], ends=[1]) + 1 lod_level = layers.range( start=0, end=batch_size, step=1, dtype=batch_size.dtype) init_ids = layers.fill_constant_batch_size_like( input=init_state, shape=[-1, 1], value=0, dtype='int64') init_ids = layers.lod_reset(init_ids, lod_level) init_ids = layers.lod_append(init_ids, lod_level) init_scores = layers.fill_constant_batch_size_like( input=init_state, shape=[-1, 1], value=1, dtype='float32') init_scores = layers.lod_reset(init_scores, init_ids) layers.array_write(init_ids, array=ids_array, i=counter) layers.array_write(init_scores, array=scores_array, i=counter) full_ids = fluid.layers.fill_constant_batch_size_like( input=init_state, shape=[-1, 1], dtype='int64', value=1) cond = layers.less_than(x=counter, y=array_len) while_op = layers.While(cond=cond) with while_op.block(): pre_ids = layers.array_read(array=ids_array, i=counter) pre_state = layers.array_read(array=state_array, i=counter) pre_score = layers.array_read(array=scores_array, i=counter) pre_ids_emb = layers.embedding( input=pre_ids, size=[char_num, word_vector_dim], dtype='float32') context = self.simple_attention(encoded_vector, encoded_proj, pre_state, decoder_size) # expand the recursive_sequence_lengths of pre_state # to be the same with pre_score pre_state_expanded = layers.sequence_expand(pre_state, pre_score) context_expanded = layers.sequence_expand(context, pre_score) fc_1 = layers.fc(input=context_expanded, size=decoder_size * 3, bias_attr=False, name="rnn_fc1") fc_2 = layers.fc(input=pre_ids_emb, size=decoder_size * 3, bias_attr=False, name="rnn_fc2") decoder_inputs = fc_1 + fc_2 current_state, _, _ = layers.gru_unit( input=decoder_inputs, hidden=pre_state_expanded, size=decoder_size * 3) current_state_with_lod = layers.lod_reset( x=current_state, y=pre_score) # use score to do beam search current_score = layers.fc(input=current_state_with_lod, size=char_num, bias_attr=True, act='softmax', name="rnn_out_fc") topk_scores, topk_indices = layers.topk(current_score, k=beam_size) new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1) fluid.layers.assign(new_ids, full_ids) layers.increment(x=counter, value=1, in_place=True) # update the memories layers.array_write(current_state, array=state_array, i=counter) layers.array_write(topk_indices, array=ids_array, i=counter) layers.array_write(topk_scores, array=scores_array, i=counter) # update the break condition: # up to the max length or all candidates of # source sentences have ended. length_cond = layers.less_than(x=counter, y=array_len) finish_cond = layers.logical_not(layers.is_empty(x=topk_indices)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) return full_ids
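# With beam_size = 1 the loop above degenerates to greedy decoding: at every step
# the single top-scoring character id is appended to full_ids. A compact NumPy
# sketch of that accumulation (shapes and step count are illustrative):
import numpy as np
batch, char_num = 2, 7
full_ids = np.ones((batch, 1), dtype="int64")             # the initial full_ids
for _ in range(3):                                         # three decode steps
    current_score = np.random.rand(batch, char_num)
    next_id = current_score.argmax(axis=1).reshape(-1, 1)  # topk with k = 1
    full_ids = np.concatenate([full_ids, next_id], axis=1)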
def forward(self, inputs, use_cache=False, cache=None): """ Args: inputs (dict): include src_ids. pos_ids, input_mask and max_dec_len are optional. """ ######### forward context ######### input_ids = inputs['src_ids'] position_ids = inputs['pos_ids'] if 'pos_ids' in inputs else None attention_mask = inputs[ 'input_mask'] if 'input_mask' in inputs else None causal_mask = paddle.tensor.triu(paddle.ones( (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1) if attention_mask is not None: tgt_pos = paddle.sum(attention_mask, axis=-1, keepdim=True).astype('int64') if len(attention_mask.shape) == 2: attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]) encode_mask = attention_mask + causal_mask else: encode_mask = causal_mask # if cached_kvs are assigned to next step in _prepare_qkv of MultiHeadAttention, # need to init the global caches here gen_caches = self._init_generation_caches(input_ids) logits, cached_kvs = self.model(input_ids, position_ids, encode_mask, use_cache=True, cache=gen_caches) next_id = paddle.argmax(logits[:, -1, :], axis=-1).reshape([-1, 1]) #################################### if 'max_dec_len' not in inputs: max_len = layers.fill_constant([1], dtype=int_type, value=self.max_dec_len, force_cpu=True) else: max_len = inputs['max_dec_len'] min_len = layers.fill_constant(shape=[1], dtype=int_type, value=self.min_dec_len, force_cpu=True) step_idx = layers.fill_constant(shape=[1], value=0, dtype='int64', force_cpu=True) placehold_ids = layers.fill_constant_batch_size_like( input=inputs["src_ids"], value=0, shape=[-1, 1], dtype=next_id.dtype) ids = layers.array_write(next_id, step_idx) if 'max_dec_len' in inputs: max_len = paddle.tensor.creation._memcpy(max_len, place=paddle.CPUPlace()) cond_int = paddle.full([1], 0, dtype=int_type, name="cond_int") cond = paddle.less_than(step_idx, max_len) if attention_mask is not None: append_mask = layers.fill_constant_batch_size_like( input=next_id, value=1, shape=[-1, 1, 1, 1], dtype=attention_mask.dtype) while_op = layers.While(cond, is_test=True) with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) if attention_mask: decode_mask = paddle.concat([attention_mask, append_mask], axis=-1) tgt_pos = tgt_pos + step_idx att_mask = (1 - decode_mask) * -1e4 else: att_mask = None tgt_pos = None layers.increment(x=step_idx, value=1.0, in_place=True) layers.array_write(placehold_ids, i=step_idx, array=ids) logits, decode_cached_kvs = self.model(pre_ids, tgt_pos, att_mask, use_cache=True, cache=cached_kvs) logits = paddle.reshape(logits, shape=(-1, self.vocab_size)) probs = F.softmax(logits / self.temperature) if self.decoding_strategy.startswith("sampling"): sampling_ids = layers.sampling_id(probs, dtype="int") elif self.decoding_strategy.startswith("topk_sampling"): probs, sampling_ids = self.topk_sampling(probs) elif self.decoding_strategy.startswith("topp_sampling"): probs, sampling_ids = self.topp_sampling(probs) else: raise ValueError(self.decoding_strategy) selected_ids = paddle.unsqueeze(sampling_ids, -1) layers.array_write(selected_ids, i=step_idx, array=ids) length_cond = paddle.less_than(x=step_idx, y=max_len, name="length_cond") finish_cond = paddle.logical_not(paddle.is_empty(x=selected_ids), name="finish_cond") paddle.logical_and(x=length_cond, y=finish_cond, out=cond, name="logical_and_cond") paddle.assign(layers.cast(cond, dtype='bool'), cond) if attention_mask: paddle.assign(decode_mask, attention_mask) for i in range(len(decode_cached_kvs)): if self._fuse: 
paddle.assign(decode_cached_kvs[i].kv, cached_kvs[i].kv) else: paddle.assign(decode_cached_kvs[i].k, cached_kvs[i].k) paddle.assign(decode_cached_kvs[i].v, cached_kvs[i].v) ids, _ = layers.tensor_array_to_tensor(ids) return ids
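# The sampling branch above draws the next token from the softmax distribution,
# optionally truncated to the top-k or top-p candidates. The model's own
# topk_sampling is not shown here; the helper below is only a generic NumPy sketch
# of top-k sampling to make the idea concrete:
import numpy as np

def topk_sample(probs, k=5):
    """probs: [batch, vocab]; keep the k largest entries, renormalise, sample."""
    idx = np.argpartition(-probs, k, axis=-1)[:, :k]
    top = np.take_along_axis(probs, idx, axis=-1)
    top = top / top.sum(axis=-1, keepdims=True)
    picks = np.array([np.random.choice(k, p=row) for row in top])
    return np.take_along_axis(idx, picks[:, None], axis=-1)   # sampled token ids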
def decode_with_grammar(decoder, inits, decode_vocab, max_step_num, **kwargs):
    """A modification of paddle.fluid.layers.dynamic_decode(...).

    Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned
    Tensor indicating finished status contains all True values or the number of
    decoding steps reaches :attr:`max_step_num`. :code:`decoder.initialize()` would
    be called once before the decoding loop. If the `decoder` has implemented the
    `finalize` method, :code:`decoder.finalize()` would be called once after the
    decoding loop.

    Args:
        decoder(Decoder): An instance of `Decoder`.
        inits(tuple): Argument passed to `decoder.initialize`.
        decode_vocab(DecoderDynamicVocab): namedtuple(table table_len column column_len value value_len)
        max_step_num(int): The maximum number of steps.
        **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`.

    Returns:
        tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \
            outputs and states, both are Tensor or nested structure of Tensor. \
            `final_outputs` has the same structure and data types as \
            :code:`decoder.output_dtype` , and each Tensor in `final_outputs` \
            is the stacked Tensor of all decoding steps' outputs, which might be revised \
            by :code:`decoder.finalize` . `final_states` is the counterpart \
            at the last time step of the initial states returned by :code:`decoder.initialize` , \
            thus has the same structure as it and has tensors with the same shapes \
            and data types.
    """
    step_cnt = tensor.fill_constant(shape=[1], dtype="int64", value=1)
    max_step_num_tensor = tensor.fill_constant(shape=[1], dtype="int64",
                                               value=max_step_num - 2)

    # shape = [batch_size, beam_size, ...]
    initial_inputs, initial_states, initial_finished = decoder.initialize(
        inits, decode_vocab)
    global_inputs, global_states, global_finished = (initial_inputs,
                                                     initial_states,
                                                     initial_finished)
    inputs = initial_inputs
    states = initial_states

    # buffers that save the decoding outputs
    outputs_arr_data = tensor.fill_constant_batch_size_like(
        inputs.input,
        shape=[-1, decoder.beam_size, max_step_num],
        dtype=decoder.output_dtype.predicted_ids,
        value=0)
    outputs_arr_pos = tensor.fill_constant_batch_size_like(
        inputs.input, shape=[-1, decoder.beam_size, 1], dtype='int64', value=0)
    outputs_array = data_structure.ArrayData(
        decoder.merge_batch_beams(outputs_arr_data),
        decoder.merge_batch_beams(outputs_arr_pos))

    sequence_lengths = tensor.cast(tensor.zeros_like(initial_finished), "int64")

    # constraint data structures for grammar-guided decoding
    grammar_stack_dat = tensor.fill_constant_batch_size_like(
        inputs.input,
        shape=[-1, decoder.beam_size, max_step_num * STACK_EXPAND_TIMES],
        dtype='int64',
        value=0)
    grammar_stack_pos = tensor.fill_constant_batch_size_like(
        inputs.input, shape=[-1, decoder.beam_size, 1], dtype='int64', value=0)
    grammar_stack = data_structure.StackData(
        decoder.merge_batch_beams(grammar_stack_dat),
        decoder.merge_batch_beams(grammar_stack_pos))

    ############ decode in a loop until every beam is finished ############
    # "finished" is decided by global_finished/next_finished && max_step_num
    cond = layers.logical_not((layers.reduce_all(initial_finished)))
    while_op = layers.While(cond)
    with while_op.block():
        # step_outputs --> OutputWrapper
        # next_states  --> StateWrapper
        # next_inputs  --> DecoderInputsWrapper
        step_outputs, next_states, next_inputs = decoder.step(inputs, states,
                                                              **kwargs)
        predicted_ids = step_outputs.predicted_ids
        _save_predict_output(outputs_array, predicted_ids,
                             next_states.finished)

        pred_gmr_type = decoder.grammar_type(predicted_ids)
        cond_type_leaf = layers.equal(pred_gmr_type, decoder.GMR_TYPE.LEAF)
        cond_type_midd = layers.equal(pred_gmr_type, decoder.GMR_TYPE.MID)

        _process_type_leaf(cond_type_leaf, decoder, grammar_stack, next_inputs,
                           next_states.finished)
        _process_type_midd(cond_type_midd, decoder, grammar_stack, next_inputs,
                           predicted_ids)

        ##next_sequence_lengths = layers.elementwise_add(sequence_lengths,
        ##        tensor.cast(layers.logical_not(global_finished), sequence_lengths.dtype))

        _check_finished(decoder, next_inputs, next_states.finished,
                        outputs_array)

        layers.utils.map_structure(tensor.assign, next_inputs, global_inputs)
        layers.utils.map_structure(tensor.assign, next_states, global_states)
        tensor.assign(next_states.finished, global_finished)
        ##tensor.assign(next_sequence_lengths, sequence_lengths)

        # update the loop condition
        layers.increment(x=step_cnt, value=1.0, in_place=True)
        layers.logical_and(
            layers.logical_not(layers.reduce_all(next_states.finished)),
            layers.less_equal(step_cnt, max_step_num_tensor), cond)

    final_outputs = outputs_array.data
    final_states = global_states

    final_outputs, final_states = decoder.finalize(final_outputs,
                                                   global_states,
                                                   sequence_lengths)

    return final_outputs, final_states
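# layers.utils.map_structure above applies tensor.assign pairwise over the nested
# input/state wrappers so that the updated values persist into the next iteration
# of the while_op. The same helper on plain Python data, just to show what it does
# (illustrative only):
from paddle.fluid.layers import utils
doubled = utils.map_structure(lambda v: v * 2, [1, (2, 3)])
# doubled == [2, (4, 6)]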
def beam_search(enc_output, enc_bias, source_length): """ beam_search """ max_len = layers.fill_constant( shape=[1], dtype='int64', value=max_out_len) step_idx = layers.fill_constant( shape=[1], dtype='int64', value=0) cond = layers.less_than(x=step_idx, y=max_len) while_op = layers.While(cond) caches_batch_size = batch_size * beam_size init_score = np.zeros([1, beam_size]).astype('float32') init_score[:, 1:] = -INF initial_log_probs = layers.assign(init_score) alive_log_probs = layers.expand(initial_log_probs, [batch_size, 1]) # alive seq [batch_size, beam_size, 1] initial_ids = layers.zeros([batch_size, 1, 1], 'float32') alive_seq = layers.expand(initial_ids, [1, beam_size, 1]) alive_seq = layers.cast(alive_seq, 'int64') enc_output = layers.unsqueeze(enc_output, axes=[1]) enc_output = layers.expand(enc_output, [1, beam_size, 1, 1]) enc_output = layers.reshape(enc_output, [caches_batch_size, -1, d_model]) tgt_src_attn_bias = layers.unsqueeze(enc_bias, axes=[1]) tgt_src_attn_bias = layers.expand(tgt_src_attn_bias, [1, beam_size, n_head, 1, 1]) enc_bias_shape = layers.shape(tgt_src_attn_bias) tgt_src_attn_bias = layers.reshape(tgt_src_attn_bias, [-1, enc_bias_shape[2], enc_bias_shape[3], enc_bias_shape[4]]) beam_search = BeamSearch(beam_size, batch_size, decode_alpha, trg_vocab_size, d_model) caches = [{ "k": layers.fill_constant( shape=[caches_batch_size, 0, d_model], dtype=enc_output.dtype, value=0), "v": layers.fill_constant( shape=[caches_batch_size, 0, d_model], dtype=enc_output.dtype, value=0) } for i in range(n_layer)] finished_seq = layers.zeros_like(alive_seq) finished_scores = layers.fill_constant([batch_size, beam_size], dtype='float32', value=-INF) finished_flags = layers.fill_constant([batch_size, beam_size], dtype='float32', value=0) with while_op.block(): pos = layers.fill_constant([caches_batch_size, 1, 1], dtype='int64', value=1) pos = layers.elementwise_mul(pos, step_idx, axis=0) alive_seq_1 = layers.reshape(alive_seq, [caches_batch_size, -1]) alive_seq_2 = alive_seq_1[:, -1:] alive_seq_2 = layers.unsqueeze(alive_seq_2, axes=[1]) logits = wrap_decoder( trg_vocab_size, max_in_len, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, embedding_sharing, dec_inputs=(alive_seq_2, alive_seq_2, pos, None, tgt_src_attn_bias), enc_output=enc_output, caches=caches, is_train=False, params_type=params_type) alive_seq_2, alive_log_probs_2, finished_seq_2, finished_scores_2, finished_flags_2, caches_2 = \ beam_search.inner_func(step_idx, logits, alive_seq_1, alive_log_probs, finished_seq, finished_scores, finished_flags, caches, enc_output, tgt_src_attn_bias) layers.increment(x=step_idx, value=1.0, in_place=True) finish_cond = beam_search.is_finished(step_idx, source_length, alive_log_probs_2, finished_scores_2, finished_flags_2) layers.assign(alive_seq_2, alive_seq) layers.assign(alive_log_probs_2, alive_log_probs) layers.assign(finished_seq_2, finished_seq) layers.assign(finished_scores_2, finished_scores) layers.assign(finished_flags_2, finished_flags) for i in xrange(len(caches_2)): layers.assign(caches_2[i]["k"], caches[i]["k"]) layers.assign(caches_2[i]["v"], caches[i]["v"]) layers.logical_and(x=cond, y=finish_cond, out=cond) finished_flags = layers.reduce_sum(finished_flags, dim=1, keep_dim=True) / beam_size finished_flags = layers.cast(finished_flags, 'bool') mask = layers.cast(layers.reduce_any(input=finished_flags, dim=1, keep_dim=True), 'float32') mask = 
layers.expand(mask, [1, beam_size]) mask2 = 1.0 - mask finished_seq = layers.cast(finished_seq, 'float32') alive_seq = layers.cast(alive_seq, 'float32') #print mask finished_seq = layers.elementwise_mul(finished_seq, mask, axis=0) + \ layers.elementwise_mul(alive_seq, mask2, axis = 0) finished_seq = layers.cast(finished_seq, 'int32') finished_scores = layers.elementwise_mul(finished_scores, mask, axis=0) + \ layers.elementwise_mul(alive_log_probs, mask2) finished_seq.persistable = True finished_scores.persistable = True return finished_seq, finished_scores
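# The final mask arithmetic above picks, per batch row, the finished hypothesis when
# any beam of that row has finished and falls back to the alive one otherwise:
# out = finished * mask + alive * (1 - mask). A tiny NumPy check of that select
# (shapes are illustrative):
import numpy as np
mask = np.array([[1.0], [0.0]], dtype="float32")       # row 0 finished, row 1 not
finished = np.full((2, 2), 7.0, dtype="float32")
alive = np.full((2, 2), 3.0, dtype="float32")
out = finished * mask + alive * (1.0 - mask)            # [[7., 7.], [3., 3.]]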
def decoder(self, init_state): """ implement decoder in inference mode """ # pd.Print(init_state) # define counter variable in the decoding array_len = pd.fill_constant(shape=[1], dtype='int64', value=self.max_length) counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) static_count = pd.zeros(shape=[1], dtype='int64', force_cpu=True) # define tensor array to save content at each time step, and write initial id, score and state state_h_array = pd.create_array('float32') pd.array_write(self.h, array=state_h_array, i=counter) state_c_array = pd.create_array('float32') pd.array_write(self.c, array=state_c_array, i=counter) src_indexes = fluid.layers.data(name='source_index', shape=[1], dtype='int64', lod_level=1) src_index_array = pd.create_array('int64') pd.array_write(src_indexes, array=src_index_array, i=counter) ids_array = pd.create_array('int64') scores_array = pd.create_array('float32') init_ids = fluid.layers.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) init_scores = fluid.layers.data(name="init_scores", shape=[1], dtype="float32", lod_level=2) pd.array_write(init_ids, array=ids_array, i=counter) pd.array_write(init_scores, array=scores_array, i=counter) encoder_vec_array = pd.create_array('float32') pd.array_write(self.encoder_vec, array=encoder_vec_array, i=static_count) encoder_vec_full_array = pd.create_array('float32') pd.array_write(self.encoder_vec_full, array=encoder_vec_full_array, i=static_count) encoder_proj_array = pd.create_array('float32') pd.array_write(self.encoder_proj, array=encoder_proj_array, i=static_count) event_embedding_array = pd.create_array('float32') pd.array_write(self.event_embedding, array=event_embedding_array, i=static_count) # define conditional variable to stop loop cond = pd.less_than(x=counter, y=array_len) # define while_op while_op = pd.While(cond=cond) with while_op.block(): # define the computing of each step # pd.Print(counter) # obtain input at present step of decoder, including id chosen at previous step, corresponding score and state at previous step. 
pre_ids = pd.array_read(array=ids_array, i=counter) pre_h_state = pd.array_read(array=state_h_array, i=counter) pre_c_state = pd.array_read(array=state_c_array, i=counter) # pre_score = pd.array_read(array=scores_array, i=counter) pre_score = pd.array_read(array=scores_array, i=static_count) _encoder_input_ids = pd.array_read(array=src_index_array, i=static_count) event_embedding = pd.array_read(array=event_embedding_array, i=static_count) # print("pre_h_state", pre_h_state) encoder_vec = pd.array_read(array=encoder_vec_array, i=static_count) encoder_vec_full = pd.array_read(array=encoder_vec_full_array, i=static_count) encoder_proj = pd.array_read(array=encoder_proj_array, i=static_count) # # update input state as state correspondent with id chosen at previous step # pre_h_state_expanded = pd.sequence_expand(pre_h_state, pre_score) # pre_c_state_expanded = pd.sequence_expand(pre_c_state, pre_score) # computing logic of decoder under the same train mode, including input vector and computing unit of decoder # compute predicting probability of normalized word pre_ids_emb = pd.embedding( input=pre_ids, size=[self.target_dict_dim, self.embedding_dim], dtype='float32', param_attr=fluid.ParamAttr(name="trg_embedding")) # pd.Print(pre_ids_emb) att_context = self.simple_attention(encoder_vec, encoder_proj, pre_h_state) # print("att_context", att_context) # print("pre_ids_emb", pre_ids_emb) # pd.Print(att_context) prob_c = fluid.layers.sequence_expand_as(pre_score, encoder_vec) # pd.Print(prob_c) current_score, current_h, current_c, this_prob_c = self.copy_decoder( pre_ids_emb, encoder_vec, encoder_vec_full, encoder_proj, _encoder_input_ids, pre_ids, prob_c, att_context, pre_h_state, pre_c_state, event_embedding) # decoder_inputs = fluid.layers.concat( # input=[att_context, pre_ids_emb], axis=1) # current_h, current_c = self.lstm_step( # decoder_inputs, pre_h_state, pre_c_state, self.decoder_size) # # compute predicting probability of nomarlized word # current_score = fluid.layers.fc(input=current_h, # size=self.target_dict_dim, # act='softmax', # param_attr=fluid.ParamAttr(name="out_softmax_w"), # bias_attr=fluid.ParamAttr(name="out_softmax_b")) # # current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb], # # size=decoder_size, # # act='tanh') # current_state_with_lod = pd.lod_reset(x=current_h, y=pre_score) # current_score = pd.fc(input=current_state_with_lod, # size=self.target_dict_dim, # act='softmax', # param_attr=fluid.ParamAttr(name="out_softmax_w"), # bias_attr=fluid.ParamAttr(name="out_softmax_b")) # print(current_score) topk_scores, topk_indices = pd.topk(current_score, k=self.beam_size) # pd.Print(topk_indices) # pd.Print(topk_scores) selected_ids, selected_scores = topk_indices, topk_scores # # compute accumulated score and perform beam search # accu_scores = pd.elementwise_add( # x=pd.log(topk_scores), y=pd.reshape(pre_score, shape=[-1]), axis=0) # selected_ids, selected_scores = pd.beam_search( # pre_ids, # pre_score, # topk_indices, # accu_scores, # self.beam_size, # # end_id=self.end_id, # end_id=999999, # level=0) # pd.Print(selected_ids) # pd.Print(selected_scores) pd.increment(x=counter, value=1, in_place=True) # write search result and corresponding hidden layer into tensor array pd.array_write(current_h, array=state_h_array, i=counter) pd.array_write(current_c, array=state_c_array, i=counter) pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) # pd.Print(selected_ids) # pd.Print(selected_scores) # update 
condition to stop loop length_cond = pd.less_than(x=counter, y=array_len) finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) pd.logical_and(x=length_cond, y=finish_cond, out=cond) # pd.Print(array_len) # translation_ids, translation_scores = pd.beam_search_decode( # ids=ids_array, scores=scores_array, beam_size=self.beam_size, end_id=self.end_id) # pd.Print(translation_ids) translation_ids, translation_ids_index = pd.tensor_array_to_tensor( ids_array, axis=1) translation_scores, translation_scores_index = pd.tensor_array_to_tensor( scores_array, axis=1) return translation_ids, translation_scores
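# pd.tensor_array_to_tensor(..., axis=1) above concatenates the per-step entries of
# ids_array / scores_array along axis 1, so each decoding step contributes one block
# of columns to translation_ids. A NumPy analogue of that concatenation
# (illustrative shapes):
import numpy as np
steps = [np.full((2, 1), t, dtype=np.int64) for t in range(4)]   # 4 decode steps
translation_ids = np.concatenate(steps, axis=1)                  # shape [2, 4]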
def infilling_decode(self): if self.task_type == "dialog": emb_num = 4 else: emb_num = 3 input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \ [[-1, self.max_seq_len, self.max_seq_len]] input_dtypes = ['int64'] * emb_num + ['float32'] input_lod_levels = [0] * emb_num + [0] shapes = input_shapes + [[-1, self.max_seq_len, 1], [-1, self.max_seq_len, 1], [-1, 1], [-1], [-1, 1, self.max_seq_len], [-1, 1]] dtypes = input_dtypes + [ 'int64', 'int64', 'float32', 'int32', 'float32', 'int64' ] lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0] inputs = self.to_ternsor(shapes, dtypes, lod_levels) pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=50, iterable=False) emb_ids = {} for key, value in zip(self.emb_keys, inputs[:emb_num]): emb_ids[key] = value input_mask = inputs[emb_num] tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[ -6:] ernie = ErnieModel(emb_ids=emb_ids, input_mask=input_mask, config=self.ernie_config, use_fp16=self.use_fp16, task_type=self.task_type, decoding=True, gather_idx=parent_idx) max_len = layers.fill_constant(shape=[1], dtype=tgt_ids.dtype, value=self.max_dec_len, force_cpu=True) step_idx = layers.fill_constant(shape=[1], dtype=tgt_ids.dtype, value=0, force_cpu=True) pos_idx = layers.fill_constant(shape=[1], dtype=tgt_ids.dtype, value=1, force_cpu=True) cond = layers.less_than(x=step_idx, y=max_len) while_op = layers.While(cond) ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx) pos_biases = layers.array_write(layers.reshape(tgt_pos, (-1, 1)), step_idx) scores = layers.array_write(init_scores, step_idx) tgt_masks = layers.array_write(tgt_input_mask, step_idx) with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) pre_scores = layers.array_read(array=scores, i=step_idx) pos_bias = layers.array_read(array=pos_biases, i=step_idx) pos_bias = layers.gather(input=pos_bias, index=parent_idx) tmp_mask = layers.array_read(tgt_masks, i=step_idx) def gen_batch_like(value, dtype="int64", shape=[-1, 1, 1], is_scalar=True): if is_scalar: return layers.fill_constant_batch_size_like( input=parent_idx, value=value, shape=shape, dtype=dtype) else: return layers.elementwise_mul( x=layers.fill_constant_batch_size_like( input=parent_idx, value=1, shape=shape, dtype=dtype), y=value, axis=0) tmp_mask = layers.gather(input=tmp_mask, index=parent_idx) append_0_mask = gen_batch_like(0.0, dtype=tmp_mask.dtype) append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype) tmp_mask = layers.concat([tmp_mask, append_1_mask], axis=2) pre_mask = layers.concat([tmp_mask, append_0_mask], axis=2) cur_mask = layers.concat([tmp_mask, append_1_mask], axis=2) cur_ids = gen_batch_like(self.attn_id) pre_pos = gen_batch_like(step_idx, is_scalar=False) cur_pos = gen_batch_like(pos_idx, is_scalar=False) if self.continuous_position: pre_pos = pre_pos + pos_bias cur_pos = cur_pos + pos_bias dec_emb_ids = { "word_embedding": layers.concat([pre_ids, cur_ids], axis=1), "pos_embedding": layers.concat([pre_pos, cur_pos], axis=1) } if self.task_type == "dialog": role_ids = gen_batch_like(0) turn_ids = gen_batch_like(0) dec_emb_ids["role_embedding"] = layers.concat( [role_ids, role_ids], axis=1) dec_emb_ids["turn_embedding"] = layers.concat( [turn_ids, turn_ids], axis=1) else: sent_ids = gen_batch_like(self.tgt_type_id) dec_emb_ids["sent_embedding"] = layers.concat( [sent_ids, sent_ids], axis=1) dec_mask = layers.concat([pre_mask, cur_mask], axis=1) dec_out = 
ernie.encode(dec_emb_ids, dec_mask, parent_idx, remove_query=True) fc_out = self.cal_logit(dec_out[:, 1:, :], None) topk_scores, topk_indices = layers.topk( input=layers.softmax(fc_out), k=self.beam_size) pre_lenpen = layers.pow( (5.0 + layers.cast(step_idx, pre_scores.dtype)) / 6.0, self.length_penalty) cur_lenpen = layers.pow( (5.0 + layers.cast(pos_idx, pre_scores.dtype)) / 6.0, self.length_penalty) accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores * pre_lenpen, axis=0) / cur_lenpen topk_indices = layers.lod_reset(topk_indices, pre_ids) accu_scores = layers.lod_reset(accu_scores, pre_ids) selected_ids, selected_scores, gather_idx = layers.beam_search( pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices, scores=accu_scores, beam_size=self.beam_size, end_id=self.eos_idx, return_parent_idx=True) layers.increment(x=step_idx, value=1.0, in_place=True) layers.increment(x=pos_idx, value=1.0, in_place=True) layers.array_write(selected_ids, i=step_idx, array=ids) layers.array_write(selected_scores, i=step_idx, array=scores) layers.array_write(tmp_mask, i=step_idx, array=tgt_masks) layers.array_write(pos_bias, i=step_idx, array=pos_biases) layers.assign(gather_idx, parent_idx) length_cond = layers.less_than(x=step_idx, y=max_len) finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) finished_ids, finished_scores = layers.beam_search_decode( ids, scores, beam_size=self.beam_size, end_id=self.eos_idx) graph_vars = { "finished_ids": finished_ids, "finished_scores": finished_scores, "data_ids": data_ids } for k, v in graph_vars.items(): v.persistable = True return pyreader, graph_vars
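# The rescoring above uses the GNMT-style length penalty ((5 + len) / 6) ** alpha:
# the previous score is first un-normalised with the old penalty, the new token
# log-probability is added, and the sum is re-normalised with the new penalty.
# Worked through with illustrative numbers (alpha stands in for self.length_penalty):
import numpy as np
alpha = 0.6
pre_len, cur_len = 3.0, 4.0
pre_lenpen = ((5.0 + pre_len) / 6.0) ** alpha
cur_lenpen = ((5.0 + cur_len) / 6.0) ** alpha
accu_score = (np.log(0.4) + (-1.25) * pre_lenpen) / cur_lenpen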
def _build_decoder(self, enc_last_hidden, enc_last_cell, mode='train', beam_size=10): softmax_weight = layers.create_parameter([self.hidden_size, self.tar_vocab_size], dtype="float32", name="softmax_weight", \ default_initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale)) if mode == 'train': dec_output, dec_last_hidden, dec_last_cell = basic_lstm( self.tar_emb, enc_last_hidden, enc_last_cell, \ self.hidden_size, num_layers=self.num_layers, \ batch_first=self.batch_first, \ dropout_prob=self.dropout, \ param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale) ), \ bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) )) dec_output = layers.matmul(dec_output, softmax_weight) return dec_output elif mode == 'beam_search' or mode == 'greedy_search': dec_unit_list = [] name = 'basic_lstm' for i in range(self.num_layers): new_name = name + "_layers_" + str(i) dec_unit_list.append( BasicLSTMUnit(new_name, self.hidden_size, dtype='float32')) def decoder_step(current_in, pre_hidden_array, pre_cell_array): new_hidden_array = [] new_cell_array = [] step_in = current_in for i in range(self.num_layers): pre_hidden = pre_hidden_array[i] pre_cell = pre_cell_array[i] new_hidden, new_cell = dec_unit_list[i](step_in, pre_hidden, pre_cell) new_hidden_array.append(new_hidden) new_cell_array.append(new_cell) step_in = new_hidden return step_in, new_hidden_array, new_cell_array if mode == 'beam_search': max_src_seq_len = layers.shape(self.src)[1] max_length = max_src_seq_len * 2 #max_length = layers.fill_constant( [1], dtype='int32', value = 10) pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1) full_ids = layers.fill_constant([1, 1], dtype='int64', value=1) score = layers.fill_constant([1], dtype='float32', value=0.0) #eos_ids = layers.fill_constant( [1, 1], dtype='int64', value=2) pre_hidden_array = [] pre_cell_array = [] pre_feed = layers.fill_constant([beam_size, self.hidden_size], dtype='float32', value=0) for i in range(self.num_layers): pre_hidden_array.append( layers.expand(enc_last_hidden[i], [beam_size, 1])) pre_cell_array.append( layers.expand(enc_last_cell[i], [beam_size, 1])) eos_ids = layers.fill_constant([beam_size], dtype='int64', value=2) init_score = np.zeros((beam_size)).astype('float32') init_score[1:] = -INF pre_score = layers.assign(init_score) #pre_score = layers.fill_constant( [1,], dtype='float32', value= 0.0) tokens = layers.fill_constant([beam_size, 1], dtype='int64', value=1) enc_memory = layers.expand(self.enc_output, [beam_size, 1, 1]) pre_tokens = layers.fill_constant([beam_size, 1], dtype='int64', value=1) finished_seq = layers.fill_constant([beam_size, 1], dtype='int64', value=0) finished_scores = layers.fill_constant([beam_size], dtype='float32', value=-INF) finished_flag = layers.fill_constant([beam_size], dtype='float32', value=0.0) step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0) cond = layers.less_than(x=step_idx, y=max_length) # default force_cpu=True parent_idx = layers.fill_constant([1], dtype='int32', value=0) while_op = layers.While(cond) def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags, beam_size, select_beam=None, generate_id=None): scores = layers.reshape(scores, shape=[1, -1]) _, topk_indexs = layers.topk(scores, k=beam_size) topk_indexs = layers.reshape(topk_indexs, shape=[-1]) # gather result top_seq = layers.gather(sequences, topk_indexs) topk_flags = layers.gather(flags, topk_indexs) 
topk_gather_scores = layers.gather(scores_to_gather, topk_indexs) if select_beam: topk_beam = layers.gather(select_beam, topk_indexs) else: topk_beam = select_beam if generate_id: topk_id = layers.gather(generate_id, topk_indexs) else: topk_id = generate_id return top_seq, topk_gather_scores, topk_flags, topk_beam, topk_id def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished, select_beam, generate_id): curr_scores += curr_finished * -INF return compute_topk_scores_and_seq(curr_seq, curr_scores, curr_log_probs, curr_finished, beam_size, select_beam, generate_id=generate_id) def grow_finished(finished_seq, finished_scores, finished_flag, curr_seq, curr_scores, curr_finished): finished_seq = layers.concat([ finished_seq, layers.fill_constant( [beam_size, 1], dtype='int64', value=1) ], axis=1) curr_scores += (1.0 - curr_finished) * -INF #layers.Print( curr_scores, message="curr scores") curr_finished_seq = layers.concat([finished_seq, curr_seq], axis=0) curr_finished_scores = layers.concat( [finished_scores, curr_scores], axis=0) curr_finished_flags = layers.concat( [finished_flag, curr_finished], axis=0) return compute_topk_scores_and_seq(curr_finished_seq, curr_finished_scores, curr_finished_scores, curr_finished_flags, beam_size) def is_finished(alive_log_prob, finished_scores, finished_in_finished): max_out_len = 200 max_length_penalty = layers.pow( layers.fill_constant([1], dtype='float32', value=((5.0 + max_out_len) / 6.0)), alpha) lower_bound_alive_score = layers.slice( alive_log_prob, starts=[0], ends=[1], axes=[0]) / max_length_penalty lowest_score_of_fininshed_in_finished = finished_scores * finished_in_finished lowest_score_of_fininshed_in_finished += ( 1.0 - finished_in_finished) * -INF lowest_score_of_fininshed_in_finished = layers.reduce_min( lowest_score_of_fininshed_in_finished) met = layers.less_than( lower_bound_alive_score, lowest_score_of_fininshed_in_finished) met = layers.cast(met, 'float32') bound_is_met = layers.reduce_sum(met) finished_eos_num = layers.reduce_sum(finished_in_finished) finish_cond = layers.less_than( finished_eos_num, layers.fill_constant([1], dtype='float32', value=beam_size)) return finish_cond def grow_top_k(step_idx, alive_seq, alive_log_prob, parant_idx): pre_ids = alive_seq dec_step_emb = layers.embedding( input=pre_ids, size=[self.tar_vocab_size, self.hidden_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='target_embedding', initializer=fluid.initializer.UniformInitializer( low=-self.init_scale, high=self.init_scale))) dec_att_out, new_hidden_array, new_cell_array = decoder_step( dec_step_emb, pre_hidden_array, pre_cell_array) projection = layers.matmul(dec_att_out, softmax_weight) logits = layers.softmax(projection) current_log = layers.elementwise_add(x=layers.log(logits), y=alive_log_prob, axis=0) base_1 = layers.cast(step_idx, 'float32') + 6.0 base_1 /= 6.0 length_penalty = layers.pow(base_1, alpha) len_pen = layers.pow( ((5. 
+ layers.cast(step_idx + 1, 'float32')) / 6.), alpha) current_log = layers.reshape(current_log, shape=[1, -1]) current_log = current_log / length_penalty topk_scores, topk_indices = layers.topk(input=current_log, k=beam_size) topk_scores = layers.reshape(topk_scores, shape=[-1]) topk_log_probs = topk_scores * length_penalty generate_id = layers.reshape( topk_indices, shape=[-1]) % self.tar_vocab_size selected_beam = layers.reshape( topk_indices, shape=[-1]) // self.tar_vocab_size topk_finished = layers.equal(generate_id, eos_ids) topk_finished = layers.cast(topk_finished, 'float32') generate_id = layers.reshape(generate_id, shape=[-1, 1]) pre_tokens_list = layers.gather(tokens, selected_beam) full_tokens_list = layers.concat( [pre_tokens_list, generate_id], axis=1) return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \ dec_att_out, new_hidden_array, new_cell_array with while_op.block(): topk_seq, topk_log_probs, topk_scores, topk_finished, topk_beam, topk_generate_id, attention_out, new_hidden_array, new_cell_array = \ grow_top_k( step_idx, pre_tokens, pre_score, parent_idx) alive_seq, alive_log_prob, _, alive_beam, alive_id = grow_alive( topk_seq, topk_scores, topk_log_probs, topk_finished, topk_beam, topk_generate_id) finished_seq_2, finished_scores_2, finished_flags_2, _, _ = grow_finished( finished_seq, finished_scores, finished_flag, topk_seq, topk_scores, topk_finished) finished_cond = is_finished(alive_log_prob, finished_scores_2, finished_flags_2) layers.increment(x=step_idx, value=1.0, in_place=True) layers.assign(alive_beam, parent_idx) layers.assign(alive_id, pre_tokens) layers.assign(alive_log_prob, pre_score) layers.assign(alive_seq, tokens) layers.assign(finished_seq_2, finished_seq) layers.assign(finished_scores_2, finished_scores) layers.assign(finished_flags_2, finished_flag) # update init_hidden, init_cell, input_feed new_feed = layers.gather(attention_out, parent_idx) layers.assign(new_feed, pre_feed) for i in range(self.num_layers): new_hidden_var = layers.gather(new_hidden_array[i], parent_idx) layers.assign(new_hidden_var, pre_hidden_array[i]) new_cell_var = layers.gather(new_cell_array[i], parent_idx) layers.assign(new_cell_var, pre_cell_array[i]) length_cond = layers.less_than(x=step_idx, y=max_length) layers.logical_and(x=length_cond, y=finished_cond, out=cond) tokens_with_eos = tokens all_seq = layers.concat([tokens_with_eos, finished_seq], axis=0) all_score = layers.concat([pre_score, finished_scores], axis=0) _, topk_index = layers.topk(all_score, k=beam_size) topk_index = layers.reshape(topk_index, shape=[-1]) final_seq = layers.gather(all_seq, topk_index) final_score = layers.gather(all_score, topk_index) return final_seq elif mode == 'greedy_search': max_src_seq_len = layers.shape(self.src)[1] max_length = max_src_seq_len * 2 #max_length = layers.fill_constant( [1], dtype='int32', value = 10) pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1) full_ids = layers.fill_constant([1, 1], dtype='int64', value=1) score = layers.fill_constant([1], dtype='float32', value=0.0) eos_ids = layers.fill_constant([1, 1], dtype='int64', value=2) pre_hidden_array = [] pre_cell_array = [] pre_feed = layers.fill_constant([1, self.hidden_size], dtype='float32', value=0) for i in range(self.num_layers): pre_hidden_array.append(enc_last_hidden[i]) pre_cell_array.append(enc_last_cell[i]) #pre_hidden_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0) ) #pre_cell_array.append( layers.fill_constant( [1, 
hidden_size], dtype='float32', value=0) ) step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0) cond = layers.less_than(x=step_idx, y=max_length) # default force_cpu=True while_op = layers.While(cond) with while_op.block(): dec_step_emb = layers.embedding( input=pre_ids, size=[self.tar_vocab_size, self.hidden_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='target_embedding', initializer=fluid.initializer.UniformInitializer( low=-self.init_scale, high=self.init_scale))) dec_att_out, new_hidden_array, new_cell_array = decoder_step( dec_step_emb, pre_hidden_array, pre_cell_array) projection = layers.matmul(dec_att_out, softmax_weight) logits = layers.softmax(projection) logits = layers.log(logits) current_log = layers.elementwise_add(logits, score, axis=0) topk_score, topk_indices = layers.topk(input=current_log, k=1) new_ids = layers.concat([full_ids, topk_indices]) layers.assign(new_ids, full_ids) #layers.Print( full_ids, message="ful ids") layers.assign(topk_score, score) layers.assign(topk_indices, pre_ids) layers.assign(dec_att_out, pre_feed) for i in range(self.num_layers): layers.assign(new_hidden_array[i], pre_hidden_array[i]) layers.assign(new_cell_array[i], pre_cell_array[i]) layers.increment(x=step_idx, value=1.0, in_place=True) eos_met = layers.not_equal(topk_indices, eos_ids) length_cond = layers.less_than(x=step_idx, y=max_length) layers.logical_and(x=length_cond, y=eos_met, out=cond) return full_ids raise Exception("error") else: print("mode not supprt", mode)
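The grow_top_k / grow_alive / grow_finished helpers above implement standard beam bookkeeping over a flattened [beam_size x vocab_size] score matrix with a GNMT-style length penalty. The NumPy sketch below reproduces a single expansion step to make the index arithmetic explicit; the function name, toy sizes, and the alpha value are illustrative assumptions, not part of the model code.

import numpy as np

INF = 1e9

def expand_beam_step(log_probs, alive_log_prob, step_idx, eos_id, beam_size, alpha=0.6):
    # log_probs: [beam_size, vocab_size] token log-probabilities for this step
    vocab_size = log_probs.shape[1]
    current_log = log_probs + alive_log_prob[:, None]        # accumulate per hypothesis
    length_penalty = ((5.0 + step_idx + 1) / 6.0) ** alpha   # GNMT-style penalty
    scores = (current_log / length_penalty).reshape(-1)      # flatten beams and vocab
    topk_idx = np.argsort(-scores)[:beam_size]
    topk_scores = scores[topk_idx]
    topk_log_probs = topk_scores * length_penalty            # un-penalized, for the next step
    selected_beam = topk_idx // vocab_size                   # which hypothesis each entry extends
    generate_id = topk_idx % vocab_size                      # which token it appends
    finished = (generate_id == eos_id)
    return topk_scores, topk_log_probs, selected_beam, generate_id, finished

np.random.seed(0)
beam, vocab = 4, 10
log_probs = np.log(np.random.dirichlet(np.ones(vocab), size=beam))
alive = np.zeros(beam)
alive[1:] = -INF          # only beam 0 is live at step 0, matching the init_score setup above
print(expand_beam_step(log_probs, alive, step_idx=0, eos_id=2, beam_size=beam))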
def inference(self, model, inputs, outputs): """ Run inference. Args: inputs(dict): Its key is input name(str) and its value is a Variable. model(object): A generate model. Need to implement `_generation_network` and `_calc_logits`. Returns: dict(str:Variable): Its key is output name(str) and its value is a Variable. """ # prepare while loop max_len = layers.fill_constant( shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True) min_len = layers.fill_constant( shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True) step_idx = layers.fill_constant( shape=[1], dtype="int64", value=0, force_cpu=True) ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx) pos_biases = layers.array_write(layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx) scores = layers.array_write(inputs["init_score"], step_idx) tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"], step_idx) parent_idx = inputs["parent_idx"] if self.decoding_strategy == "beam_search": beam_size = self.beam_size else: beam_size = 1 eos_penalty = np.zeros(self.vocab_size, dtype="float32") eos_penalty[self.eos_id] = -1e9 eos_penalty = layers.assign(eos_penalty) token_penalty = np.zeros(self.vocab_size, dtype="float32") token_penalty[self.unk_id] = -1e9 if self.mask_id >= 0: token_penalty[self.mask_id] = -1e9 token_penalty = layers.assign(token_penalty) # start while loop cond = layers.less_than(x=step_idx, y=max_len) while_op = layers.While(cond) with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) pre_scores = layers.array_read(array=scores, i=step_idx) pos_bias = layers.array_read(array=pos_biases, i=step_idx) pos_bias = layers.gather(input=pos_bias, index=parent_idx) tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask, i=step_idx) dtype = tmp_tgt_generation_mask.dtype append_mask = layers.fill_constant_batch_size_like( input=pre_ids, value=1.0, shape=[-1, 1, 1], dtype=dtype) tmp_tgt_generation_mask = layers.concat([tmp_tgt_generation_mask, append_mask], axis=2) pre_mask = tmp_tgt_generation_mask = layers.gather(input=tmp_tgt_generation_mask, index=parent_idx) pre_sent = layers.fill_constant_batch_size_like( input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype) if self.continuous_position: pre_pos = layers.elementwise_mul( x=layers.fill_constant_batch_size_like( input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype), y=step_idx, axis=0) + pos_bias else: pre_pos = layers.elementwise_mul( x=layers.fill_constant_batch_size_like( input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype), y=step_idx, axis=0) if self.use_role: pre_role = layers.fill_constant_batch_size_like( input=pre_mask, value=0, shape=[-1, 1, 1], dtype=pre_ids.dtype) else: pre_role = None dec_out, _ = model._generation_network( token_ids=pre_ids, type_ids=pre_sent, pos_ids=pre_pos, role_ids=pre_role, generation_mask=tmp_tgt_generation_mask, gather_idx=parent_idx) logits = model._calc_logits(dec_out) # ignore unk and mask token if self.ignore_unk: logits = layers.elementwise_add(logits, token_penalty, axis=1) # min dec length min_len_cond = layers.less_than(x=step_idx, y=min_len) def min_len_penalty(): """Plus minimum length penalty.""" return layers.elementwise_add(logits, eos_penalty, axis=1) def no_penalty(): """No penalty.""" return logits logits = layers.case([(min_len_cond, min_len_penalty)], default=no_penalty) # get probs probs = layers.softmax(logits / self.temperature) if 
self.decoding_strategy == "beam_search": topk_scores, topk_indices = layers.topk( input=probs, k=beam_size) else: if self.decoding_strategy.startswith("sampling"): sampling_ids = layers.sampling_id(probs, dtype="int") elif self.decoding_strategy.startswith("topk_sampling"): topk_probs, _ = layers.topk(input=probs, k=self.topk) ge_cond = layers.cast( layers.greater_equal( probs, layers.unsqueeze(topk_probs[:, -1], [1])), "float32") old_probs = probs probs = probs * ge_cond / layers.reduce_sum(topk_probs, dim=-1, keep_dim=True) sampling_ids = layers.sampling_id(probs, dtype="int") probs = old_probs else: raise ValueError(self.decoding_strategy) sampling_scores = layers.one_hot( layers.unsqueeze(sampling_ids, [1]), probs.shape[1] ) sampling_scores = sampling_scores * probs - (1 - sampling_scores) * 1e3 topk_scores, topk_indices = layers.topk( input=sampling_scores, k=1) pre_len = layers.cast(step_idx, "float32") layers.increment(x=step_idx, value=1.0, in_place=True) cur_len = layers.cast(step_idx, "float32") # update scores if self.length_average: accu_scores = layers.elementwise_add( x=layers.log(topk_scores), y=pre_scores * pre_len, axis=0) / cur_len elif self.length_penalty > 0: pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty) cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty) accu_scores = layers.elementwise_add( x=layers.log(topk_scores), y=pre_scores * pre_lp, axis=0) / cur_lp else: accu_scores = layers.elementwise_add( x=layers.log(topk_scores), y=pre_scores, axis=0) topk_indices = layers.lod_reset(topk_indices, pre_ids) accu_scores = layers.lod_reset(accu_scores, pre_ids) selected_ids, selected_scores, gather_idx = layers.beam_search( pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices, scores=accu_scores, beam_size=beam_size, end_id=self.eos_id, return_parent_idx=True) layers.array_write(selected_ids, i=step_idx, array=ids) layers.array_write(selected_scores, i=step_idx, array=scores) layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask) layers.array_write(pos_bias, i=step_idx, array=pos_biases) layers.assign(gather_idx, parent_idx) length_cond = layers.less_than(x=step_idx, y=max_len) finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) finished_ids, finished_scores = layers.beam_search_decode( ids, scores, beam_size=beam_size, end_id=self.eos_id) predictions = { "finished_ids": finished_ids, "finished_scores": finished_scores, "token_ids": inputs["token_ids"], "data_id": inputs["data_id"] } return predictions
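For the "topk_sampling" branch above, every probability below the k-th largest is masked out and the surviving mass is renormalized before sampling_id draws a token (the graph divides by the sum of the top-k values, which equals the surviving mass when there are no ties). A plain NumPy equivalent of that filtering step; the helper name and toy values are made up for illustration.

import numpy as np

def topk_filter_and_sample(probs, k, rng=np.random.default_rng(0)):
    # keep only probabilities >= the k-th largest in each row (the ge_cond mask above),
    # renormalize the surviving mass, then sample one id per row
    kth = np.sort(probs, axis=-1)[:, -k][:, None]
    keep = (probs >= kth).astype(probs.dtype)
    filtered = probs * keep
    filtered /= filtered.sum(axis=-1, keepdims=True)
    return np.array([rng.choice(len(row), p=row) for row in filtered])

probs = np.array([[0.5, 0.3, 0.1, 0.1],
                  [0.4, 0.3, 0.2, 0.1]])
print(topk_filter_and_sample(probs, k=2))   # ids drawn only from the two largest entries per row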
def beam_search(): """Beam search function""" max_len = layers.fill_constant(shape=[1], dtype=start_tokens.dtype, value=self.max_out_len, force_cpu=True) min_len = layers.fill_constant(shape=[1], dtype=start_tokens.dtype, value=self.min_out_len) neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-INF) step_idx = layers.fill_constant(shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True) step_next_idx = layers.fill_constant(shape=[1], dtype=start_tokens.dtype, value=1, force_cpu=True) cond = layers.less_than(x=step_idx, y=max_len) # default force_cpu=True while_op = layers.While(cond) # array states will be stored for each step. ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)), step_idx) scores = layers.array_write(init_scores, step_idx) # cell states will be overwrited at each step. # caches contains states of history steps in decoder self-attention # and static encoder output projections in encoder-decoder attention # to reduce redundant computation. caches = [ { "k": # for self attention layers.fill_constant_batch_size_like( input=start_tokens, shape=[-1, self._n_head, 0, self._emb_size // self._n_head], dtype=enc_words_output.dtype, value=0), "v": # for self attention layers.fill_constant_batch_size_like( input=start_tokens, shape=[-1, self._n_head, 0, self._emb_size // self._n_head], dtype=enc_words_output.dtype, value=0), "static_k_word": # for encoder-decoder attention layers.create_tensor(dtype=enc_words_output.dtype), "static_v_word": # for encoder-decoder attention layers.create_tensor(dtype=enc_words_output.dtype), "static_k_sent": # for encoder-decoder attention layers.create_tensor(dtype=enc_sents_output.dtype), "static_v_sent": # for encoder-decoder attention layers.create_tensor(dtype=enc_sents_output.dtype) } for i in range(self._dec_n_layer) ] trigram_blocking = TrigramBlocking(start_tokens, self.tokenizer, use_fp16=self._use_fp16, beam_size=self.beam_size) with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) # Since beam_search_op dosen't enforce pre_ids' shape, we can do # inplace reshape here which actually change the shape of pre_ids. 
# pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) pre_scores = layers.array_read(array=scores, i=step_idx) # gather cell states corresponding to selected parent pre_src_words_attn_bias = layers.gather( tgt_src_words_attn_bias, index=parent_idx) pre_src_sents_attn_bias = layers.gather( tgt_src_sents_attn_bias, index=parent_idx) pre_graph_attn_bias = layers.gather(graph_attn_bias, index=parent_idx) pre_pos = layers.elementwise_mul( x=layers.fill_constant_batch_size_like( input= pre_src_sents_attn_bias, # cann't use lod tensor here value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype), y=step_idx, axis=0) logits = self.decode( dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias, pre_src_sents_attn_bias, pre_graph_attn_bias), enc_words_output=enc_words_output, enc_sents_output=enc_sents_output, caches=caches, gather_idx=parent_idx) # prevent generating end token if length less than min_out_len eos_index = layers.fill_constant( shape=[layers.shape(logits)[0]], dtype='int64', value=self.eos_idx) eos_index = fluid.one_hot(eos_index, depth=self.voc_size) less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len), dtype='float32') less_val = layers.elementwise_mul(less_cond, neg_inf) eos_val = layers.elementwise_mul(eos_index, less_val, axis=0) revised_logits = layers.elementwise_add(logits, eos_val, axis=0) # topK reduction across beams, also contain special handle of # end beams and end sentences(batch reduction) topk_scores, topk_indices = layers.topk( input=layers.softmax(revised_logits), k=self.beam_size) # Roll-Back previous-scores for length-penalty # previous-scores has been length-penaltied, before this timestep length-penalty, need roll-back # because of doing this, we need store the length-penaltied score in `scores` # while calculating use the un-penaltied score # -> safe for step_idx == 0 (initialization state), because previous-score == 0 pre_timestep_length_penalty = fluid.layers.pow( ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0), self.len_penalty) pre_scores_wo_len_penalty = fluid.layers.elementwise_mul( pre_scores, pre_timestep_length_penalty) # calc trigram-blocking delta scores for current alive sequence if self.block_trigram: trigram_blocking.update_seq(pre_ids, parent_idx) trigram_blocking.expand_cand_seq(topk_indices) fluid.layers.py_func( func=trigram_blocking.blocking_forward, x=[ trigram_blocking.cand_seq, trigram_blocking.id2is_full_token ], out=trigram_blocking.delta_score_out, backward_func=None) layers.Print(trigram_blocking.delta_score_out, summarize=100, message="trigram_blocking.delta_score_out") pre_scores_wo_len_penalty = fluid.layers.elementwise_add( x=trigram_blocking.delta_score_out, y=pre_scores_wo_len_penalty, axis=0) # => [N, topk] accu_scores = layers.elementwise_add( x=layers.log(topk_scores), y=pre_scores_wo_len_penalty, axis=0) cur_timestep_length_penalty = layers.pow( ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0), self.len_penalty) curr_scores = layers.elementwise_div( accu_scores, cur_timestep_length_penalty) # beam_search op uses lod to differentiate branches. 
curr_scores = layers.lod_reset(curr_scores, pre_ids) topk_indices = layers.lod_reset(topk_indices, pre_ids) selected_ids, selected_scores, gather_idx = layers.beam_search( pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices, scores=curr_scores, beam_size=self.beam_size, end_id=self.eos_idx, return_parent_idx=True) layers.increment(x=step_idx, value=1.0, in_place=True) layers.increment(x=step_next_idx, value=1.0, in_place=True) # cell states(caches) have been updated in wrap_decoder, # only need to update beam search states here. layers.array_write(selected_ids, i=step_idx, array=ids) layers.array_write(selected_scores, i=step_idx, array=scores) layers.assign(gather_idx, parent_idx) layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias) layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias) layers.assign(pre_graph_attn_bias, graph_attn_bias) length_cond = layers.less_than(x=step_idx, y=max_len) finish_cond = layers.logical_not( layers.is_empty(x=selected_ids)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) finished_ids, finished_scores = layers.beam_search_decode( ids, scores, beam_size=self.beam_size, end_id=self.eos_idx) return finished_ids, finished_scores
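The "Roll-Back previous-scores" comments above describe the bookkeeping: `scores` stores length-penalized values, so each step multiplies the stored score back by the previous penalty, adds the new log-probability, and divides by the penalty for the new length. The arithmetic in isolation (plain Python, names illustrative):

def length_penalty(step, alpha):
    # GNMT-style penalty used above: ((5 + step) / 6) ** alpha
    return ((5.0 + step) / 6.0) ** alpha

def rescore(stored_score, step_log_prob, step_idx, alpha):
    # stored_score was divided by length_penalty(step_idx) when it was written to `scores`;
    # undo that, accumulate the new token's log-prob, then apply the penalty for step_idx + 1
    raw = stored_score * length_penalty(step_idx, alpha)
    return (raw + step_log_prob) / length_penalty(step_idx + 1, alpha)

# two steps of one hypothesis with alpha = 1.0 for readability
s0 = rescore(0.0, -0.5, step_idx=0, alpha=1.0)
s1 = rescore(s0, -0.7, step_idx=1, alpha=1.0)
print(s1, (-0.5 - 0.7) / length_penalty(2, 1.0))   # both give the directly-computed value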
def decoder_decode(context, is_sparse): init_state = context array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) # fill the first element with init_state state_array = pd.create_array('float32') pd.array_write(init_state, array=state_array, i=counter) # ids, scores as memory ids_array = pd.create_array('int64') scores_array = pd.create_array('float32') init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) init_scores = pd.data( name="init_scores", shape=[1], dtype="float32", lod_level=2) pd.array_write(init_ids, array=ids_array, i=counter) pd.array_write(init_scores, array=scores_array, i=counter) cond = pd.less_than(x=counter, y=array_len) while_op = pd.While(cond=cond) with while_op.block(): pre_ids = pd.array_read(array=ids_array, i=counter) pre_state = pd.array_read(array=state_array, i=counter) pre_score = pd.array_read(array=scores_array, i=counter) # expand the recursive_sequence_lengths of pre_state to be the same with pre_score pre_state_expanded = pd.sequence_expand(pre_state, pre_score) pre_ids_emb = pd.embedding( input=pre_ids, size=[dict_size, word_dim], dtype='float32', is_sparse=is_sparse) # use rnn unit to update rnn current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb], size=decoder_size, act='tanh') current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) # use score to do beam search current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') topk_scores, topk_indices = pd.topk(current_score, k=beam_size) # calculate accumulated scores after topk to reduce computation cost accu_scores = pd.elementwise_add( x=pd.log(topk_scores), y=pd.reshape( pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( pre_ids, pre_score, topk_indices, accu_scores, beam_size, end_id=10, level=0) pd.increment(x=counter, value=1, in_place=True) # update the memories pd.array_write(current_state, array=state_array, i=counter) pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) # update the break condition: up to the max length or all candidates of # source sentences have ended. length_cond = pd.less_than(x=counter, y=array_len) finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores return translation_ids, translation_scores
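`accu_scores` above adds `log(topk_scores)` to the reshaped `pre_score`: beam scores are accumulated in log space so the product of token probabilities along a path becomes a running sum, and the loop stops once the counter reaches `array_len` or `beam_search` prunes every candidate (the `is_empty` check). A two-hypothesis NumPy illustration of the score update:

import numpy as np

pre_score = np.array([-0.22, -1.61])               # log-prob of two partial translations
topk_probs = np.array([[0.6, 0.3],                 # top-2 next-token probabilities
                       [0.5, 0.4]])                # for each partial translation
accu_scores = np.log(topk_probs) + pre_score[:, None]
print(accu_scores)
# same numbers as taking the log of the product of probabilities along each path
print(np.log(np.exp(pre_score)[:, None] * topk_probs))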
def test_hybrid_parallel_inference_helper_mp1pp2(self): nranks = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) dev_id = int(os.getenv("FLAGS_selected_gpus", 0)) main_program = paddle.static.Program() startup_program = paddle.static.Program() device = "gpu" with paddle.static.program_guard(main_program, startup_program): with paddle.fluid.device_guard(f'{device}:0'): X = paddle.static.data( name='X', shape=[None, 2], dtype='float32') with paddle.fluid.device_guard(f'{device}:all'): max_len = layers.fill_constant( shape=[1], dtype="int64", value=2, force_cpu=False, name="n") step_idx = layers.fill_constant( shape=[1], dtype="int64", value=0, force_cpu=False, name="i") data = layers.array_write(X, step_idx) cond_int = layers.fill_constant( shape=[1], dtype="int64", value=0, force_cpu=False, name="cond_int") cond = layers.less_than(x=step_idx, y=max_len) while_op = layers.While(cond, is_test=True) with while_op.block(): with paddle.fluid.device_guard(f'{device}:all'): input = layers.array_read(array=data, i=step_idx) layers.increment(x=step_idx, value=1.0, in_place=True) layers.array_write(input, i=step_idx, array=data) with paddle.fluid.device_guard(f'{device}:0'): param_attr = paddle.ParamAttr( initializer=paddle.nn.initializer.Constant(1.0)) weight1 = paddle.static.create_parameter( shape=[2, 5], dtype='float32', attr=param_attr, is_bias=False) hidden1 = paddle.matmul(input, weight1) with paddle.fluid.device_guard(f'{device}:1'): param_attr = paddle.ParamAttr( initializer=paddle.nn.initializer.Constant(2.0)) weight2 = paddle.static.create_parameter( shape=[5, 2], dtype='float32', attr=param_attr, is_bias=False) hidden2 = paddle.matmul(hidden1, weight2) layers.array_write(hidden2, i=step_idx, array=data) # update cond and assign to cond_int, we will sync cond_int layers.less_than(x=step_idx, y=max_len, cond=cond) layers.assign(layers.cast(cond, dtype="int32"), cond_int) with paddle.fluid.device_guard(f'{device}:all'): # the code below must at end of while block and exists in device:all layers.assign(layers.cast(cond_int, dtype='bool'), cond) with paddle.fluid.device_guard(f'{device}:all'): out = layers.create_array(data.dtype) layers.assign(data, out) with paddle.fluid.device_guard(f'{device}:all'): # use a empty lod_tensor_array to clear lod_tensor_array layers.assign(layers.create_array(data.dtype), data) helper = HybridParallelInferenceHelper( startup_program, main_program, micro_batch_size=2, num_mp=1, num_pp=2, init_comm=nranks > 1, ) helper.gen_infer_program( ['array_write_0.out'], ['cond_int.tmp_0'], debug=True) exe = paddle.static.Executor(paddle.CUDAPlace(dev_id)) exe.run(startup_program) for step in range(2): init_data = np.random.uniform( low=0.0, high=1.0, size=[2, 2]).astype('float32') [res] = exe.run(main_program, feed={"X": init_data}, fetch_list=[out]) res_np = numpy_while(init_data) assert len(res) == len(res_np) for d1, d2 in zip(res, res_np): np.testing.assert_allclose(d1, d2)
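The assertions above compare against `numpy_while`, which is defined elsewhere; the sketch below is only an assumed reconstruction based on the graph built in the test (constant weights of 1.0 and 2.0 and two loop iterations), so the helper name and return layout are hypothetical.

import numpy as np

def numpy_while_reference(x, max_len=2):
    # hypothetical NumPy analogue of the while loop above: data[0] holds X and each
    # iteration appends (prev @ W1) @ W2, with W1 initialized to 1.0 and W2 to 2.0
    w1 = np.ones([2, 5], dtype='float32')
    w2 = 2.0 * np.ones([5, 2], dtype='float32')
    data = [x]
    for _ in range(max_len):
        data.append(np.matmul(np.matmul(data[-1], w1), w2))
    return data

x = np.random.uniform(0.0, 1.0, size=[2, 2]).astype('float32')
for step, value in enumerate(numpy_while_reference(x)):
    print(step, value.shape)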
def decoder_decode(context, is_sparse):
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # fill the first element with init_state
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)
    while_op = pd.While(cond=cond)
    with while_op.block():
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # expand the lod of pre_state to be the same with pre_score
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
        pre_ids_emb = pd.embedding(input=pre_ids,
                                   size=[dict_size, word_dim],
                                   dtype='float32',
                                   is_sparse=is_sparse)

        # use rnn unit to update rnn
        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
                              size=decoder_size,
                              act='tanh')
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
        # use score to do beam search
        current_score = pd.fc(input=current_state_with_lod,
                              size=target_dict_dim,
                              act='softmax')
        topk_scores, topk_indices = pd.topk(current_score, k=50)
        # beam_search expects (pre_ids, pre_scores, ids, scores, ...); it prunes the
        # candidates above down to beam_size surviving branches per source sentence
        selected_ids, selected_scores = pd.beam_search(pre_ids,
                                                       pre_score,
                                                       topk_indices,
                                                       topk_scores,
                                                       beam_size,
                                                       end_id=10,
                                                       level=0)

        pd.increment(x=counter, value=1, in_place=True)

        # update the memories
        pd.array_write(current_state, array=state_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)

        pd.less_than(x=counter, y=array_len, cond=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)

    return translation_ids, translation_scores
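`pd.sequence_expand(pre_state, pre_score)` repeats each source sentence's decoder state so it lines up with the variable number of surviving candidates recorded in `pre_score`'s LoD, and `lod_reset` stamps that LoD back onto the new state. A NumPy picture of the expansion; the lengths and helper name are invented for illustration.

import numpy as np

def expand_like_lod(states, candidates_per_source):
    # repeat states[i] once per surviving candidate of source sentence i,
    # mimicking sequence_expand against a LoD tensor with those lengths
    return np.repeat(states, candidates_per_source, axis=0)

pre_state = np.arange(6, dtype='float32').reshape(3, 2)   # one state row per source sentence
candidates_per_source = [2, 1, 3]                         # live beam entries per source
print(expand_like_lod(pre_state, candidates_per_source).shape)   # (6, 2)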
def fast_decode(self): """create model for inference""" if self.task_type == "dialog": emb_num = 4 else: emb_num = 3 input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \ [[-1, self.max_seq_len, self.max_seq_len]] input_dtypes = ['int64'] * emb_num + ['float32'] input_lod_levels = [0] * emb_num + [0] shapes = input_shapes + [[-1, 1, 1], [-1, 1, 1], [-1, 1], [-1], [-1, 1, self.max_seq_len], [-1, 1]] dtypes = input_dtypes + [ 'int64', 'int64', 'float32', 'int32', 'float32', 'int64' ] lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0] inputs = self.to_tensor(shapes, dtypes, lod_levels) pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=70, iterable=False) emb_ids = {} for key, value in zip(self.emb_keys, inputs[:emb_num]): emb_ids[key] = value input_mask = inputs[emb_num] tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[ -6:] unimo = UNIMOModel(emb_ids=emb_ids, input_mask=input_mask, config=self.gene_config, task_type=self.task_type, decoding=True, gather_idx=parent_idx) max_len = layers.fill_constant(shape=[1], dtype=tgt_ids.dtype, value=self.max_out_len, force_cpu=True) min_len = layers.fill_constant(shape=[1], dtype=tgt_ids.dtype, value=self.min_out_len, force_cpu=True) neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-1e18) step_idx = layers.fill_constant(shape=[1], dtype=tgt_ids.dtype, value=0, force_cpu=True) step_next_idx = layers.fill_constant(shape=[1], dtype=tgt_ids.dtype, value=1, force_cpu=True) cond = layers.less_than(x=step_idx, y=max_len) while_op = layers.While(cond) ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx) pos_biases = layers.array_write(tgt_pos, step_idx) scores = layers.array_write(init_scores, step_idx) tgt_masks = layers.array_write(tgt_input_mask, step_idx) trigram_blocking = TrigramBlocking(tgt_ids, self.tokenizer, beam_size=self.beam_size) with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) pre_scores = layers.array_read(array=scores, i=step_idx) pos_bias = layers.array_read(array=pos_biases, i=step_idx) pos_bias = layers.gather(input=pos_bias, index=parent_idx) def gen_batch_like(value, dtype="int64", shape=[-1, 1, 1], is_scalar=True): """generate batch""" if is_scalar: return layers.fill_constant_batch_size_like( input=parent_idx, value=value, shape=shape, dtype=dtype) else: return layers.elementwise_mul( x=layers.fill_constant_batch_size_like( input=parent_idx, value=1, shape=shape, dtype=dtype), y=value, axis=0) tmp_mask = layers.array_read(tgt_masks, i=step_idx) tmp_mask = layers.gather(input=tmp_mask, index=parent_idx) append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype) pre_mask = layers.concat([tmp_mask, append_1_mask], axis=2) pre_pos = gen_batch_like(step_idx, is_scalar=False) pre_pos = pre_pos + pos_bias ####################### pos start from 2 pre_sent = gen_batch_like(self.tgt_type_id, dtype=pre_ids.dtype) dec_emb_ids = {"word_embedding": pre_ids, "pos_embedding": pre_pos} if self.task_type == "dialog": role_ids = gen_batch_like(0) turn_ids = gen_batch_like(0) dec_emb_ids["role_embedding"] = role_ids dec_emb_ids["turn_embedding"] = turn_ids else: dec_emb_ids["sent_embedding"] = pre_sent dec_out = unimo.encode(emb_ids=dec_emb_ids, input_mask=pre_mask, gather_idx=parent_idx) fc_out = self.cal_logit(dec_out, None) # prevent generating end token if length less than min_out_len eos_index = layers.fill_constant(shape=[layers.shape(fc_out)[0]], dtype='int64', value=self.eos_id) 
eos_index = fluid.one_hot(eos_index, depth=self.vocab_size) less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len), dtype='float32') less_val = layers.elementwise_mul(less_cond, neg_inf) eos_val = layers.elementwise_mul(eos_index, less_val, axis=0) revised_logits = layers.elementwise_add(fc_out, eos_val, axis=0) # topK reduction across beams, also contain special handle of # end beams and end sentences(batch reduction) topk_scores, topk_indices = layers.topk( input=layers.softmax(revised_logits), k=self.beam_size) # Roll-Back previous-scores for length-penalty # previous-scores has been length-penaltied, before this timestep length-penalty, need roll-back # because of doing this, we need store the length-penaltied score in `scores` # while calculating use the un-penaltied score # -> safe for step_idx == 0 (initialization state), because previous-score == 0 pre_timestep_length_penalty = fluid.layers.pow( ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0), self.length_penalty) pre_scores_wo_len_penalty = fluid.layers.elementwise_mul( pre_scores, pre_timestep_length_penalty) # calc trigram-blocking delta scores for current alive sequence if self.block_trigram: trigram_blocking.update_seq(pre_ids, parent_idx) trigram_blocking.expand_cand_seq(topk_indices) fluid.layers.py_func(func=trigram_blocking.blocking_forward, x=[ trigram_blocking.cand_seq, trigram_blocking.id2is_full_token ], out=trigram_blocking.delta_score_out, backward_func=None) pre_scores_wo_len_penalty = fluid.layers.elementwise_add( x=trigram_blocking.delta_score_out, y=pre_scores_wo_len_penalty, axis=0) # => [N, topk] accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores_wo_len_penalty, axis=0) cur_timestep_length_penalty = layers.pow( ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0), self.length_penalty) curr_scores = layers.elementwise_div(accu_scores, cur_timestep_length_penalty) # beam_search op uses lod to differentiate branches. curr_scores = layers.lod_reset(curr_scores, pre_ids) topk_indices = layers.lod_reset(topk_indices, pre_ids) selected_ids, selected_scores, gather_idx = layers.beam_search( pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices, scores=curr_scores, beam_size=self.beam_size, end_id=self.eos_id, return_parent_idx=True) layers.increment(x=step_idx, value=1.0, in_place=True) layers.increment(x=step_next_idx, value=1.0, in_place=True) # cell states(caches) have been updated in wrap_decoder, # only need to update beam search states here. layers.array_write(selected_ids, i=step_idx, array=ids) layers.array_write(selected_scores, i=step_idx, array=scores) layers.array_write(pre_mask, i=step_idx, array=tgt_masks) layers.array_write(pos_bias, i=step_idx, array=pos_biases) layers.assign(gather_idx, parent_idx) length_cond = layers.less_than(x=step_idx, y=max_len) finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) finished_ids, finished_scores = layers.beam_search_decode( ids, scores, beam_size=self.beam_size, end_id=self.eos_id) graph_vars = { "finished_ids": finished_ids, "finished_scores": finished_scores, "data_ids": data_ids } for k, v in graph_vars.items(): v.persistable = True return pyreader, graph_vars
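`TrigramBlocking` above contributes `delta_score_out`, which pushes candidates that would repeat an already-seen trigram toward minus infinity before the scores are accumulated. A pure-Python sketch of that delta computation; the helper name and toy ids are illustrative, not the class's actual interface.

def trigram_blocking_delta(alive_seq, cand_token_ids, neg_inf=-1e18):
    # neg_inf for a candidate whose appended token would recreate a trigram
    # already present in the alive sequence, 0.0 otherwise
    seen = {tuple(alive_seq[i:i + 3]) for i in range(len(alive_seq) - 2)}
    deltas = []
    for tok in cand_token_ids:
        new_tri = tuple(alive_seq[-2:] + [tok]) if len(alive_seq) >= 2 else None
        deltas.append(neg_inf if new_tri in seen else 0.0)
    return deltas

alive = [5, 8, 9, 5, 8]                       # already contains the trigram (5, 8, 9)
print(trigram_blocking_delta(alive, [9, 3]))  # appending 9 is blocked, 3 is not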