def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
    """
    Scaled Dot-Product Attention.

    attn_bias is a [max_len, max_len] bias added before softmax:
    0 over the valid L x L block and -inf elsewhere, so padded
    positions get (near-)zero attention weight.
    """
    scaled_q = layers.scale(x=q, scale=d_key**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    if attn_bias:
        product += attn_bias
    weights = layers.softmax(product)

    ############################
    # add code
    layers.Print(attn_bias, message="The content of input layer:")
    attn_mask = attn_bias == 0
    attn_mask = layers.cast(attn_mask, 'float64')
    layers.Print(weights)
    weights = layers.elementwise_mul(attn_mask, weights)
    layers.Print(weights)
    # weights = layers.elementwise_mul(weights, attn_mask)
    ############################

    if dropout_rate:
        weights = layers.dropout(weights,
                                 dropout_prob=dropout_rate,
                                 dropout_implementation="upscale_in_train",
                                 is_test=False)
    out = layers.matmul(weights, v)
    return out
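# --- Illustration (not part of the snippet above) ---
# A minimal NumPy sketch of the 0/-inf attention-bias convention assumed by
# scaled_dot_product_attention: adding a large negative bias before softmax
# drives masked positions toward zero weight, and multiplying by (bias == 0)
# afterwards (as the debug block does) forces them to exactly zero.
import numpy as np

scores = np.array([[1.0, 2.0, 3.0, 4.0]])
bias = np.array([[0.0, 0.0, -1e9, -1e9]])   # 0 = keep, -1e9 ~ "-inf" = mask

logits = scores + bias
weights = np.exp(logits - logits.max())
weights = weights / weights.sum()
weights = weights * (bias == 0)             # exact zeros at masked positions
print(weights)                              # approx. [[0.269, 0.731, 0., 0.]]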
def build_network(self, only_forward, **kargs):
    x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
    x.stop_gradient = False
    layers.Print(input=x, **kargs)
    loss = layers.mean(x)
    append_backward(loss=loss)
    return loss
def static_func(x):
    x = fluid.layers.assign(x)
    iter_num = fluid.layers.fill_constant(shape=[1], value=3, dtype='int32')
    a = fluid.layers.create_array(dtype='float32')
    i = 0

    a = fluid.dygraph.dygraph_to_static.variable_trans_func.to_static_variable(a)
    i = fluid.dygraph.dygraph_to_static.variable_trans_func.to_static_variable(i)
    iter_num = (fluid.dygraph.dygraph_to_static.variable_trans_func.
                to_static_variable(iter_num))
    x = fluid.dygraph.dygraph_to_static.variable_trans_func.to_static_variable(x)

    def while_condition_0(a, i, iter_num, x):
        return i < iter_num

    def while_body_0(a, i, iter_num, x):
        fluid.layers.array_write(x=x, i=fluid.layers.array_length(a), array=a)
        i += 1
        return a, i, iter_num, x

    a, i, iter_num, x = fluid.layers.while_loop(while_condition_0,
                                                while_body_0,
                                                [a, i, iter_num, x])
    length = layers.array_length(a)
    layers.Print(length)
    return a[0]
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
    """
    Scaled Dot-Product Attention
    """
    product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5)
    if attn_bias:
        product += attn_bias
    weights = layers.softmax(product)
    layers.Print(weights)
    layers.Print(weights)
    if dropout_rate:
        weights = layers.dropout(weights,
                                 dropout_prob=dropout_rate,
                                 seed=dropout_seed,
                                 is_test=False)
    out = layers.matmul(weights, v)
    return out
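# --- Illustration (not part of the snippet above) ---
# Quick NumPy check of why this variant can fold the 1/sqrt(d_key) factor into
# matmul via alpha: scaling q before the product (as in the first variant
# above) and scaling the product itself give the same result.
import numpy as np

d_key = 8
q = np.random.rand(2, d_key)
k = np.random.rand(4, d_key)

scaled_first = (q * d_key ** -0.5) @ k.T     # scale q, then matmul
scaled_after = (q @ k.T) * d_key ** -0.5     # matmul, then scale (alpha-style)
assert np.allclose(scaled_first, scaled_after)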
def dygraph_func(x):
    x = fluid.dygraph.to_variable(x)
    iter_num = fluid.layers.fill_constant(shape=[1], value=3, dtype="int32")
    a = []
    i = 0
    while i < iter_num:
        a.append(x)
        i += 1
    length = layers.array_length(a)
    layers.Print(length)
    return a[0]
def lstmp_encoder(input_seq, gate_size, h_0, c_0, para_name, proj_size,
                  test_mode, args):
    # An LSTM encoder implementation with projection.
    # The linear transformation for the input gate, output gate, forget gate
    # and cell activation vectors needs to be done outside of dynamic_lstm,
    # so the output size is 4 times gate_size.
    if args.para_init:
        init = fluid.initializer.Constant(args.init1)
        init_b = fluid.initializer.Constant(0.0)
    else:
        init = None
        init_b = None
    input_seq = dropout(input_seq, test_mode, args)
    input_proj = layers.fc(input=input_seq,
                           param_attr=fluid.ParamAttr(
                               name=para_name + '_gate_w', initializer=init),
                           size=gate_size * 4,
                           act=None,
                           bias_attr=False)
    if args.debug:
        layers.Print(input_seq, message='input_seq', summarize=10)
        layers.Print(input_proj, message='input_proj', summarize=10)
    hidden, cell = layers.dynamic_lstmp(
        input=input_proj,
        size=gate_size * 4,
        proj_size=proj_size,
        h_0=h_0,
        c_0=c_0,
        use_peepholes=False,
        proj_clip=args.proj_clip,
        cell_clip=args.cell_clip,
        proj_activation="identity",
        param_attr=fluid.ParamAttr(initializer=init),
        bias_attr=fluid.ParamAttr(initializer=init_b))
    return hidden, cell, input_proj
def test_all_parameters(self):
    x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
    x.stop_gradient = False

    for print_tensor_name in [True, False]:
        for print_tensor_type in [True, False]:
            for print_tensor_shape in [True, False]:
                for print_tensor_lod in [True, False]:
                    layers.Print(
                        input=x,
                        print_tensor_name=print_tensor_name,
                        print_tensor_type=print_tensor_type,
                        print_tensor_shape=print_tensor_shape,
                        print_tensor_lod=print_tensor_lod,
                    )
    loss = layers.mean(x)
    append_backward(loss=loss)

    exe = Executor(self.place)
    outs = exe.run(feed={'x': self.x_tensor},
                   fetch_list=[loss],
                   return_numpy=False)
def run_boxps_preload(self, is_cpu=True):
    x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
    y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
    emb_x, emb_y = _pull_box_sparse([x, y], size=2)
    emb_xp = _pull_box_sparse(x, size=2)
    layers.Print(emb_xp)
    concat = layers.concat([emb_x, emb_y], axis=1)
    fc = layers.fc(input=concat,
                   name="fc",
                   size=1,
                   num_flatten_dims=1,
                   bias_attr=False)
    loss = layers.reduce_mean(fc)
    layers.Print(loss)
    place = fluid.CPUPlace(
    ) if is_cpu or not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    optimizer = fluid.optimizer.SGD(learning_rate=0.5)
    batch_size = 2

    def binary_print(slot, fout):
        fout.write(str(len(slot)) + " ")
        for e in slot:
            fout.write(str(e) + " ")

    batch1 = np.ones(
        (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
    filelist = []
    place_str = "cpu" if is_cpu else "gpu"
    for i in range(2):
        filelist.append("test_hdfs_" + place_str + "_" + str(i))
    for f in filelist:
        with open(f, "w") as fout:
            for ins in batch1:
                for slot in ins:
                    binary_print(slot, fout)
                fout.write("\n")

    def create_dataset():
        dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
        dataset.set_use_var([x, y])
        dataset.set_batch_size(2)
        dataset.set_thread(1)
        dataset.set_filelist(filelist)
        return dataset

    datasets = []
    datasets.append(create_dataset())
    datasets.append(create_dataset())
    optimizer.minimize(loss)
    exe.run(fluid.default_startup_program())
    datasets[0].load_into_memory()
    datasets[0].begin_pass()
    datasets[1].preload_into_memory()
    exe.train_from_dataset(program=fluid.default_main_program(),
                           dataset=datasets[0],
                           print_period=1)
    datasets[0].end_pass()
    datasets[1].wait_preload_done()
    datasets[1].begin_pass()
    exe.train_from_dataset(program=fluid.default_main_program(),
                           dataset=datasets[1],
                           print_period=1)
    datasets[1].end_pass()
    for f in filelist:
        os.remove(f)
def beam_search():
    """Beam search function"""

    max_len = layers.fill_constant(shape=[1],
                                   dtype=start_tokens.dtype,
                                   value=self.max_out_len,
                                   force_cpu=True)
    min_len = layers.fill_constant(shape=[1],
                                   dtype=start_tokens.dtype,
                                   value=self.min_out_len)
    neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-INF)
    step_idx = layers.fill_constant(shape=[1],
                                    dtype=start_tokens.dtype,
                                    value=0,
                                    force_cpu=True)
    step_next_idx = layers.fill_constant(shape=[1],
                                         dtype=start_tokens.dtype,
                                         value=1,
                                         force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)  # default force_cpu=True
    while_op = layers.While(cond)
    # array states will be stored for each step.
    ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)), step_idx)
    scores = layers.array_write(init_scores, step_idx)
    # cell states will be overwritten at each step.
    # caches contains states of history steps in decoder self-attention
    # and static encoder output projections in encoder-decoder attention
    # to reduce redundant computation.
    caches = [
        {
            "k":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                dtype=enc_words_output.dtype,
                value=0),
            "v":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                dtype=enc_words_output.dtype,
                value=0),
            "static_k_word":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_words_output.dtype),
            "static_v_word":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_words_output.dtype),
            "static_k_sent":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_sents_output.dtype),
            "static_v_sent":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_sents_output.dtype)
        } for i in range(self._dec_n_layer)
    ]

    trigram_blocking = TrigramBlocking(start_tokens,
                                       self.tokenizer,
                                       use_fp16=self._use_fp16,
                                       beam_size=self.beam_size)

    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        # Since beam_search_op doesn't enforce pre_ids' shape, we can do an
        # inplace reshape here, which actually changes the shape of pre_ids.
        # pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # gather cell states corresponding to the selected parent
        pre_src_words_attn_bias = layers.gather(tgt_src_words_attn_bias,
                                                index=parent_idx)
        pre_src_sents_attn_bias = layers.gather(tgt_src_sents_attn_bias,
                                                index=parent_idx)
        pre_graph_attn_bias = layers.gather(graph_attn_bias, index=parent_idx)
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_src_sents_attn_bias,  # can't use lod tensor here
                value=1,
                shape=[-1, 1, 1],
                dtype=pre_ids.dtype),
            y=step_idx,
            axis=0)

        logits = self.decode(
            dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias,
                       pre_src_sents_attn_bias, pre_graph_attn_bias),
            enc_words_output=enc_words_output,
            enc_sents_output=enc_sents_output,
            caches=caches,
            gather_idx=parent_idx)

        # prevent generating the end token if length is less than min_out_len
        eos_index = layers.fill_constant(shape=[layers.shape(logits)[0]],
                                         dtype='int64',
                                         value=self.eos_idx)
        eos_index = fluid.one_hot(eos_index, depth=self.voc_size)
        less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len),
                                dtype='float32')
        less_val = layers.elementwise_mul(less_cond, neg_inf)
        eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
        revised_logits = layers.elementwise_add(logits, eos_val, axis=0)

        # topK reduction across beams, which also contains special handling of
        # finished beams and finished sentences (batch reduction)
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(revised_logits), k=self.beam_size)

        # Roll back the previous scores for the length penalty: the previous
        # scores have already been length-penalized, so the penalty of the
        # previous timestep must be undone before applying this timestep's.
        # We therefore store the penalized score in `scores` but compute with
        # the un-penalized score.
        # -> safe for step_idx == 0 (initialization state), because the
        #    previous score == 0.
        pre_timestep_length_penalty = fluid.layers.pow(
            ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0),
            self.len_penalty)
        pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
            pre_scores, pre_timestep_length_penalty)

        # calc trigram-blocking delta scores for the current alive sequence
        if self.block_trigram:
            trigram_blocking.update_seq(pre_ids, parent_idx)
            trigram_blocking.expand_cand_seq(topk_indices)
            fluid.layers.py_func(func=trigram_blocking.blocking_forward,
                                 x=[
                                     trigram_blocking.cand_seq,
                                     trigram_blocking.id2is_full_token
                                 ],
                                 out=trigram_blocking.delta_score_out,
                                 backward_func=None)
            layers.Print(trigram_blocking.delta_score_out,
                         summarize=100,
                         message="trigram_blocking.delta_score_out")
            pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                x=trigram_blocking.delta_score_out,
                y=pre_scores_wo_len_penalty,
                axis=0)

        # => [N, topk]
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores_wo_len_penalty,
                                             axis=0)
        cur_timestep_length_penalty = layers.pow(
            ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0),
            self.len_penalty)
        curr_scores = layers.elementwise_div(accu_scores,
                                             cur_timestep_length_penalty)

        # beam_search op uses lod to differentiate branches.
        curr_scores = layers.lod_reset(curr_scores, pre_ids)
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=curr_scores,
            beam_size=self.beam_size,
            end_id=self.eos_idx,
            return_parent_idx=True)

        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.increment(x=step_next_idx, value=1.0, in_place=True)
        # cell states (caches) have been updated in wrap_decoder,
        # only beam search states need to be updated here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(gather_idx, parent_idx)
        layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias)
        layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias)
        layers.assign(pre_graph_attn_bias, graph_attn_bias)

        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

    return finished_ids, finished_scores
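# --- Illustration (not part of the snippet above) ---
# Plain-Python sketch of the GNMT-style length-penalty handling in the loop
# above: `scores` stores already-penalized values, so each step multiplies by
# the previous step's penalty to roll it back, adds the new log-probability,
# and then divides by the penalty for the next step's length.
def length_penalty(step, alpha):
    return ((5.0 + step) / 6.0) ** alpha

alpha = 0.6                  # plays the role of self.len_penalty
step_idx, step_next_idx = 3, 4
pre_score = -1.2             # already length-penalized, as stored in `scores`
topk_log_prob = -0.7         # log of the current candidate's probability

pre_score_wo_penalty = pre_score * length_penalty(step_idx, alpha)   # roll back
accu_score = topk_log_prob + pre_score_wo_penalty                    # accumulate
curr_score = accu_score / length_penalty(step_next_idx, alpha)       # re-penalize
print(curr_score)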
fc0 = fluid.layers.fc(image,
                      size=3,
                      act=None,
                      bias_attr=False,
                      param_attr=fluid.initializer.Constant(value=2.0))
fc1 = fluid.layers.fc(fc0,
                      size=3,
                      act=None,
                      bias_attr=False,
                      param_attr=fluid.initializer.TruncatedNormal(
                          loc=0.0, scale=0.02, seed=0))
#fc1 = fluid.layers.fc(fc0, size=cls_num, act='relu', bias_attr=False, param_attr=fluid.initializer.Constant(value=2.0))
if ASCEND == False:
    fc0 = layers.Print(fc0, message="fc0")
    fc1 = layers.Print(fc1, message="fc1")

# CLASS_NUM = 10
# fc1 = fluid.layers.fc(fc0, size=CLASS_NUM, bias_attr=False, param_attr=fluid.initializer.Constant(value=2.0))
# layers.Print(fc1)

cross_entropy = fluid.layers.softmax_with_cross_entropy(fc1, label)
if ASCEND == False:
    cross_entropy = layers.Print(cross_entropy, message="cross_entropy")

cost = fluid.layers.reduce_sum(cross_entropy)
#cost = fluid.layers.log(cost)
#cost = fluid.layers.tanh(cost)
#cost = fluid.layers.pow(cost, 2)
#cost = fluid.layers.sqrt(cost)
#cost = fluid.layers.mean(cost)
if ASCEND == False:
dtype="float64") attn_bias = fluid.layers.data(name='attn_bias', shape=[None, n_head, max_len, max_len], dtype="float64") # layers.Print(attn_bias) attn_bias1 = fluid.layers.data(name='attn_bias1', shape=[None, n_head, max_len, max_len], dtype="float64") output = multi_head_attention(q, k, v, attn_bias, d_model, d_model, d_model, n_head) output1 = multi_head_attention(q, k, v, attn_bias1, d_model, d_model, d_model, n_head) soft_max = layers.softmax(attn_bias1) layers.Print(soft_max) # layers.Print(output) # work INF = -2 ^ 32 + 1 input_data = np.random.rand(batch_size, max_len, d_model) attn_data = np.zeros((batch_size, n_head, max_len)) attn_data[:, :, 4:] = -INF attn_data = np.zeros( (batch_size, n_head, max_len, max_len)) + np.expand_dims(attn_data, axis=2) attn_data1 = np.zeros((batch_size, n_head, max_len)) attn_data1[:, :, 4:] = -INF a = np.expand_dims(attn_data1, axis=2) b = np.expand_dims(attn_data1, axis=3)
def attn_flow(q_enc, p_enc, p_ids_name):
    tag = p_ids_name + "::"
    drnn = layers.DynamicRNN()
    with drnn.block():
        h_cur = drnn.step_input(p_enc)
        u_all = drnn.static_input(q_enc)
        h_expd = layers.sequence_expand(x=h_cur, y=u_all)
        s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
        s_t1 = layers.reduce_sum(input=s_t_, dim=1)
        s_t = layers.sequence_softmax(input=s_t1)
        u_expr = layers.elementwise_mul(x=u_all, y=s_t, axis=0)
        u_expr = layers.sequence_pool(input=u_expr, pool_type='sum')
        if args.debug == True:
            '''
            layers.Print(h_expd, message='h_expd')
            layers.Print(h_cur, message='h_cur')
            layers.Print(u_all, message='u_all')
            layers.Print(s_t, message='s_t')
            layers.Print(s_t_, message='s_t_')
            layers.Print(u_expr, message='u_expr')
            '''
        drnn.output(u_expr)
    U_expr = drnn()
    #'''
    drnn2 = layers.DynamicRNN()
    with drnn2.block():
        h_cur = drnn2.step_input(p_enc)
        u_all = drnn2.static_input(q_enc)
        h_expd = layers.sequence_expand(x=h_cur, y=u_all)
        s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
        s_t2 = layers.reduce_sum(input=s_t_, dim=1, keep_dim=True)
        b_t = layers.sequence_pool(input=s_t2, pool_type='max')
        if args.debug == True:
            '''
            layers.Print(s_t2, message='s_t2')
            layers.Print(b_t, message='b_t')
            '''
        drnn2.output(b_t)
    b = drnn2()
    b_norm = layers.sequence_softmax(input=b)
    h_expr = layers.elementwise_mul(x=p_enc, y=b_norm, axis=0)
    h_expr = layers.sequence_pool(input=h_expr, pool_type='sum')

    H_expr = layers.sequence_expand(x=h_expr, y=p_enc)
    H_expr = layers.lod_reset(x=H_expr, y=p_enc)
    h_u = layers.elementwise_mul(x=H_expr, y=U_expr, axis=0)
    h_h = layers.elementwise_mul(x=H_expr, y=p_enc, axis=0)

    g = layers.concat(input=[H_expr, U_expr, h_u, h_h], axis=1)

    # fusion
    m = bi_lstm_encoder(input_seq=g, gate_size=embedding_dim)
    if args.debug == True:
        layers.Print(U_expr, message=tag + 'U_expr')
        layers.Print(H_expr, message=tag + 'H_expr')
        layers.Print(b, message=tag + 'b')
        layers.Print(b_norm, message=tag + 'b_norm')
        layers.Print(g, message=tag + 'g')
        layers.Print(m, message=tag + 'm')
        layers.Print(h_h, message=tag + 'h_h')
        layers.Print(q_enc, message=tag + 'q_enc')
        layers.Print(p_enc, message=tag + 'p_enc')
    return m, g
def bidaf(embedding_dim, encoder_size, decoder_size, source_dict_dim,
          target_dict_dim, max_length, args):
    def bi_lstm_encoder(input_seq, gate_size):
        # A bi-directional LSTM encoder implementation.
        # The linear transformation for the input gate, output gate, forget
        # gate and cell activation vectors needs to be done outside of
        # dynamic_lstm, so the output size is 4 times gate_size.
        input_forward_proj = layers.fc(input=input_seq,
                                       size=gate_size * 4,
                                       act='tanh',
                                       bias_attr=False)
        forward, _ = layers.dynamic_lstm(input=input_forward_proj,
                                         size=gate_size * 4,
                                         use_peepholes=False)
        input_reversed_proj = layers.fc(input=input_seq,
                                        size=gate_size * 4,
                                        act='tanh',
                                        bias_attr=False)
        reversed, _ = layers.dynamic_lstm(input=input_reversed_proj,
                                          size=gate_size * 4,
                                          is_reverse=True,
                                          use_peepholes=False)
        encoder_out = layers.concat(input=[forward, reversed], axis=1)
        return encoder_out

    def encoder(input_name):
        input_ids = layers.data(name=input_name,
                                shape=[1],
                                dtype='int64',
                                lod_level=1)
        input_embedding = layers.embedding(
            input=input_ids,
            size=[source_dict_dim, embedding_dim],
            dtype='float32',
            is_sparse=True)
        encoder_out = bi_lstm_encoder(input_seq=input_embedding,
                                      gate_size=embedding_dim)
        return encoder_out

    def attn_flow(q_enc, p_enc, p_ids_name):
        tag = p_ids_name + "::"
        drnn = layers.DynamicRNN()
        with drnn.block():
            h_cur = drnn.step_input(p_enc)
            u_all = drnn.static_input(q_enc)
            h_expd = layers.sequence_expand(x=h_cur, y=u_all)
            s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
            s_t1 = layers.reduce_sum(input=s_t_, dim=1)
            s_t = layers.sequence_softmax(input=s_t1)
            u_expr = layers.elementwise_mul(x=u_all, y=s_t, axis=0)
            u_expr = layers.sequence_pool(input=u_expr, pool_type='sum')
            if args.debug == True:
                '''
                layers.Print(h_expd, message='h_expd')
                layers.Print(h_cur, message='h_cur')
                layers.Print(u_all, message='u_all')
                layers.Print(s_t, message='s_t')
                layers.Print(s_t_, message='s_t_')
                layers.Print(u_expr, message='u_expr')
                '''
            drnn.output(u_expr)
        U_expr = drnn()
        #'''
        drnn2 = layers.DynamicRNN()
        with drnn2.block():
            h_cur = drnn2.step_input(p_enc)
            u_all = drnn2.static_input(q_enc)
            h_expd = layers.sequence_expand(x=h_cur, y=u_all)
            s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
            s_t2 = layers.reduce_sum(input=s_t_, dim=1, keep_dim=True)
            b_t = layers.sequence_pool(input=s_t2, pool_type='max')
            if args.debug == True:
                '''
                layers.Print(s_t2, message='s_t2')
                layers.Print(b_t, message='b_t')
                '''
            drnn2.output(b_t)
        b = drnn2()
        b_norm = layers.sequence_softmax(input=b)
        h_expr = layers.elementwise_mul(x=p_enc, y=b_norm, axis=0)
        h_expr = layers.sequence_pool(input=h_expr, pool_type='sum')

        H_expr = layers.sequence_expand(x=h_expr, y=p_enc)
        H_expr = layers.lod_reset(x=H_expr, y=p_enc)
        h_u = layers.elementwise_mul(x=H_expr, y=U_expr, axis=0)
        h_h = layers.elementwise_mul(x=H_expr, y=p_enc, axis=0)

        g = layers.concat(input=[H_expr, U_expr, h_u, h_h], axis=1)

        # fusion
        m = bi_lstm_encoder(input_seq=g, gate_size=embedding_dim)
        if args.debug == True:
            layers.Print(U_expr, message=tag + 'U_expr')
            layers.Print(H_expr, message=tag + 'H_expr')
            layers.Print(b, message=tag + 'b')
            layers.Print(b_norm, message=tag + 'b_norm')
            layers.Print(g, message=tag + 'g')
            layers.Print(m, message=tag + 'm')
            layers.Print(h_h, message=tag + 'h_h')
            layers.Print(q_enc, message=tag + 'q_enc')
            layers.Print(p_enc, message=tag + 'p_enc')
        return m, g

    def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
        def linear(inputs):
            return layers.fc(input=inputs, size=size, bias_attr=True)

        forget_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        input_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        output_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        cell_tilde = layers.tanh(x=linear([hidden_t_prev, x_t]))

        cell_t = layers.sums(input=[
            layers.elementwise_mul(x=forget_gate, y=cell_t_prev),
            layers.elementwise_mul(x=input_gate, y=cell_tilde)
        ])

        hidden_t = layers.elementwise_mul(x=output_gate,
                                          y=layers.tanh(x=cell_t))

        return hidden_t, cell_t

    # point network
    def point_network_decoder(p_vec, q_vec, decoder_size):
        random_attn = layers.gaussian_random(shape=[1, decoder_size])
        random_attn = layers.sequence_expand(x=random_attn, y=q_vec)
        random_attn = layers.fc(input=random_attn, size=decoder_size, act=None)
        U = layers.fc(input=q_vec, size=decoder_size, act=None) + random_attn
        U = layers.tanh(U)
        logits = layers.fc(input=U, size=1, act=None)
        scores = layers.sequence_softmax(input=logits)
        pooled_vec = layers.elementwise_mul(x=q_vec, y=scores, axis=0)
        pooled_vec = layers.sequence_pool(input=pooled_vec, pool_type='sum')

        init_state = layers.fc(input=pooled_vec, size=decoder_size, act=None)

        def custom_dynamic_rnn(p_vec, init_state, decoder_size):
            context = layers.fc(input=p_vec, size=decoder_size, act=None)

            drnn = layers.DynamicRNN()
            with drnn.block():
                H_s = drnn.step_input(p_vec)
                ctx = drnn.static_input(context)

                c_prev = drnn.memory(init=init_state, need_reorder=True)
                m_prev = drnn.memory(init=init_state, need_reorder=True)
                m_prev1 = layers.fc(input=m_prev, size=decoder_size, act=None)
                m_prev1 = layers.sequence_expand(x=m_prev1, y=ctx)

                Fk = ctx + m_prev1
                Fk = layers.fc(input=Fk, size=decoder_size, act='tanh')
                logits = layers.fc(input=Fk, size=1, act=None)

                scores = layers.sequence_softmax(input=logits)
                attn_ctx = layers.elementwise_mul(x=ctx, y=scores, axis=0)
                attn_ctx = layers.sequence_pool(input=attn_ctx,
                                                pool_type='sum')

                hidden_t, cell_t = lstm_step(attn_ctx,
                                             hidden_t_prev=m_prev1,
                                             cell_t_prev=c_prev,
                                             size=decoder_size)

                drnn.update_memory(ex_mem=m_prev, new_mem=hidden_t)
                drnn.update_memory(ex_mem=c_prev, new_mem=cell_t)

                drnn.output(scores)

            beta = drnn()
            return beta

        fw_outputs = custom_dynamic_rnn(p_vec, init_state, decoder_size)
        bw_outputs = custom_dynamic_rnn(p_vec, init_state, decoder_size)

        def sequence_slice(x, index):
            #offset = layers.fill_constant(shape=[1, args.batch_size], value=index, dtype='float32')
            #length = layers.fill_constant(shape=[1, args.batch_size], value=1, dtype='float32')
            #return layers.sequence_slice(x, offset, length)
            idx = layers.fill_constant(shape=[1], value=1, dtype='int32')
            idx.stop_gradient = True
            from paddle.fluid.layers.control_flow import lod_rank_table
            from paddle.fluid.layers.control_flow import lod_tensor_to_array
            from paddle.fluid.layers.control_flow import array_read
            from paddle.fluid.layers.control_flow import array_to_lod_tensor
            table = lod_rank_table(x, level=0)
            table.stop_gradient = True
            array = lod_tensor_to_array(x, table)
            slice_array = array_read(array=array, i=idx)
            return array_to_lod_tensor(slice_array, table)

        start_prob = layers.elementwise_mul(x=sequence_slice(fw_outputs, 0),
                                            y=sequence_slice(bw_outputs, 1),
                                            axis=0) / 2
        end_prob = layers.elementwise_mul(x=sequence_slice(fw_outputs, 1),
                                          y=sequence_slice(bw_outputs, 0),
                                          axis=0) / 2

        return start_prob, end_prob

    q_enc = encoder('q_ids')
    if args.single_doc:
        p_enc = encoder('p_ids')
        m, g = attn_flow(q_enc, p_enc, 'p_ids')
    else:
        p_ids_names = []
        ms = []
        gs = []
        for i in range(args.doc_num):
            p_ids_name = "pids_%d" % i
            p_ids_names.append(p_ids_name)
            p_enc = encoder(p_ids_name)
            m_i, g_i = attn_flow(q_enc, p_enc, p_ids_name)
            ms.append(m_i)
            gs.append(g_i)
        m = layers.sequence_concat(x=ms, axis=0)
        g = layers.sequence_concat(x=gs, axis=0)

    if args.simple_decode:
        m2 = bi_lstm_encoder(input_seq=m, gate_size=embedding_dim)
        gm1 = layers.concat(input=[g, m], axis=1)
        gm2 = layers.concat(input=[g, m2], axis=1)
        start_prob = layers.fc(input=gm1, size=1, act='softmax')
        end_prob = layers.fc(input=gm2, size=1, act='softmax')
    else:
        p_vec = layers.sequence_concat(x=m, axis=0)
        q_vec = bi_lstm_encoder(input_seq=q_enc, gate_size=embedding_dim)
        start_prob, end_prob = point_network_decoder(
            p_vec=p_vec, q_vec=q_vec, decoder_size=decoder_size)
        start_prob = layers.sequence_softmax(start_prob)
        end_prob = layers.sequence_softmax(end_prob)

    pred = layers.concat(input=[start_prob, end_prob], axis=0)
    #'''
    start_labels = layers.data(name="start_lables",
                               shape=[1],
                               dtype='float32',
                               lod_level=1)
    end_labels = layers.data(name="end_lables",
                             shape=[1],
                             dtype='float32',
                             lod_level=1)
    label = layers.concat(input=[start_labels, end_labels], axis=0)
    label.stop_gradient = True

    # compute loss
    cost = layers.cross_entropy(input=pred, label=label, soft_label=True)
    #cost = layers.cross_entropy(input=decode_out, label=end_labels, soft_label=True)
    cost = layers.reduce_sum(cost) / args.batch_size
    if args.debug == True:
        layers.Print(p1, message='p1')
        layers.Print(pred, message='pred')
        layers.Print(label, message='label')
        layers.Print(start_labels, message='start_labels')
        layers.Print(cost, message='cost')

    if args.single_doc:
        feeding_list = ['q_ids', "start_lables", "end_lables", 'p_ids']
    else:
        feeding_list = ['q_ids', "start_lables", "end_lables"] + p_ids_names

    return cost, feeding_list
def run_boxps_preload(self, is_cpu=True, random_with_lineid=False):
    program = fluid.Program()
    with fluid.program_guard(program):
        x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
        y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
        emb_x, emb_y = _pull_box_sparse([x, y], size=2)
        emb_xp = _pull_box_sparse(x, size=2)
        concat = layers.concat([emb_x, emb_y], axis=1)
        fc = layers.fc(input=concat,
                       name="fc",
                       size=1,
                       num_flatten_dims=1,
                       bias_attr=False)
        loss = layers.reduce_mean(fc)
        layers.Print(loss)
        place = fluid.CPUPlace(
        ) if is_cpu or not core.is_compiled_with_cuda(
        ) else fluid.CUDAPlace(0)
        exe = fluid.Executor(place)
        batch_size = 100

        def binary_print(slot, fout):
            fout.write(str(len(slot)) + " ")
            for e in slot:
                fout.write(str(e) + " ")

        batch1 = np.ones(
            (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
        filelist = []
        place_str = "cpu" if is_cpu else "gpu"
        for i in range(2):
            filelist.append("test_hdfs_" + place_str + "_" + str(i))
        for f in filelist:
            with open(f, "w") as fout:
                for ins in batch1:
                    for slot in ins:
                        binary_print(slot, fout)
                    fout.write("\n")

        def create_dataset():
            dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
            dataset.set_date("20190930")
            dataset.set_use_var([x, y])
            dataset.set_batch_size(2)
            dataset.set_thread(1)
            dataset.set_filelist(filelist)
            return dataset

        datasets = []
        datasets.append(create_dataset())
        datasets.append(create_dataset())
        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
        optimizer = fluid.optimizer.PipelineOptimizer(optimizer,
                                                      cut_list=[],
                                                      place_list=[place],
                                                      concurrency_list=[1],
                                                      queue_size=1,
                                                      sync_steps=-1)
        optimizer.minimize(loss)

        program._pipeline_opt["dump_fields"] = [
            "fc.tmp_0", "fc.tmp_0@GRAD", "hehe"
        ]
        program._pipeline_opt["dump_fields_path"] = "./dump_log/"
        program._pipeline_opt["dump_param"] = ["fc.w_0"]
        program._pipeline_opt["enable_random_dump"] = True
        program._pipeline_opt["dump_interval"] = 10
        program._pipeline_opt["random_with_lineid"] = random_with_lineid

        exe.run(fluid.default_startup_program())
        datasets[0].load_into_memory()
        datasets[0].begin_pass()
        datasets[1].preload_into_memory()
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=datasets[0],
                               print_period=1)
        datasets[0].end_pass(True)
        datasets[1].wait_preload_done()
        datasets[1].begin_pass()
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=datasets[1],
                               print_period=1,
                               debug=True)
        datasets[1].end_pass(False)
        for f in filelist:
            os.remove(f)
        if os.path.isdir("dump_log"):
            shutil.rmtree("dump_log")
def encoder(x,
            y,
            vocab_size,
            emb_size,
            init_hidden=None,
            init_cell=None,
            para_name='',
            custom_samples=None,
            custom_probabilities=None,
            test_mode=False,
            args=None):
    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, emb_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(name='embedding_para'))
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []
    for i in range(args.num_layers):
        rnn_input = dropout(rnn_input, test_mode, args)
        if init_hidden and init_cell:
            h0 = layers.squeeze(layers.slice(init_hidden,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
            c0 = layers.squeeze(layers.slice(init_cell,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, args.hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        rnn_out_ori = rnn_out
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out = dropout(rnn_out, test_mode, args)
        cell = dropout(cell, test_mode, args)
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
        rnn_input = rnn_out
        cells.append(cell)
        projs.append(input_proj)

    softmax_weight = layers.create_parameter([vocab_size, emb_size],
                                             dtype="float32",
                                             name="softmax_weight")
    softmax_bias = layers.create_parameter([vocab_size],
                                           dtype="float32",
                                           name='softmax_bias')
    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
    projection = layers.elementwise_add(projection, softmax_bias)
    projection = layers.reshape(projection, shape=[-1, vocab_size])

    if args.sample_softmax and (not test_mode):
        loss = layers.sampled_softmax_with_cross_entropy(
            logits=projection,
            label=y,
            num_samples=args.n_negative_samples_batch,
            seed=args.random_seed)
        if args.debug:
            layers.Print(loss, message='out_loss', summarize=100)
    else:
        label = layers.one_hot(input=y, depth=vocab_size)
        loss = layers.softmax_with_cross_entropy(logits=projection,
                                                 label=label,
                                                 soft_label=True)
    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
def build(self):
    args = self.args
    emb_size = args.embed_size
    proj_size = args.embed_size
    hidden_size = args.hidden_size
    batch_size = args.batch_size
    num_layers = args.num_layers
    num_steps = args.num_steps

    lstm_outputs = []

    x_f = layers.data(name="x", shape=[1], dtype='int64', lod_level=1)
    y_f = layers.data(name="y", shape=[1], dtype='int64', lod_level=1)

    x_b = layers.data(name="x_r", shape=[1], dtype='int64', lod_level=1)
    y_b = layers.data(name="y_r", shape=[1], dtype='int64', lod_level=1)

    init_hiddens_ = layers.data(name="init_hiddens",
                                shape=[1],
                                dtype='float32')
    init_cells_ = layers.data(name="init_cells", shape=[1], dtype='float32')
    if args.debug:
        layers.Print(init_cells_, message='init_cells_', summarize=10)
        layers.Print(init_hiddens_, message='init_hiddens_', summarize=10)

    init_hiddens = layers.reshape(init_hiddens_,
                                  shape=[2 * num_layers, -1, proj_size])
    init_cells = layers.reshape(init_cells_,
                                shape=[2 * num_layers, -1, hidden_size])

    init_hidden = layers.slice(init_hiddens,
                               axes=[0],
                               starts=[0],
                               ends=[num_layers])
    init_cell = layers.slice(init_cells,
                             axes=[0],
                             starts=[0],
                             ends=[num_layers])
    init_hidden_r = layers.slice(init_hiddens,
                                 axes=[0],
                                 starts=[num_layers],
                                 ends=[2 * num_layers])
    init_cell_r = layers.slice(init_cells,
                               axes=[0],
                               starts=[num_layers],
                               ends=[2 * num_layers])

    if args.use_custom_samples:
        custom_samples = layers.data(
            name="custom_samples",
            shape=[args.n_negative_samples_batch + 1],
            dtype='int64',
            lod_level=1)
        custom_samples_r = layers.data(
            name="custom_samples_r",
            shape=[args.n_negative_samples_batch + 1],
            dtype='int64',
            lod_level=1)
        custom_probabilities = layers.data(
            name="custom_probabilities",
            shape=[args.n_negative_samples_batch + 1],
            dtype='float32',
            lod_level=1)
    else:
        custom_samples = None
        custom_samples_r = None
        custom_probabilities = None

    forward, fw_hiddens, fw_hiddens_ori, fw_cells, fw_projs = encoder(
        x_f,
        y_f,
        self.vocab_size,
        emb_size,
        init_hidden,
        init_cell,
        para_name='fw_',
        custom_samples=custom_samples,
        custom_probabilities=custom_probabilities,
        test_mode=self.test_mode,
        args=args)
    backward, bw_hiddens, bw_hiddens_ori, bw_cells, bw_projs = encoder(
        x_b,
        y_b,
        self.vocab_size,
        emb_size,
        init_hidden_r,
        init_cell_r,
        para_name='bw_',
        custom_samples=custom_samples_r,
        custom_probabilities=custom_probabilities,
        test_mode=self.test_mode,
        args=args)

    losses = layers.concat([forward[-1], backward[-1]])
    self.loss = layers.reduce_mean(losses)
    self.loss.permissions = True
    self.loss.persistable = True

    if args.debug:
        x_emb, projection, loss = forward
        layers.Print(init_cells, message='init_cells', summarize=10)
        layers.Print(init_hiddens, message='init_hiddens', summarize=10)
        layers.Print(init_cell, message='init_cell', summarize=10)
        layers.Print(y_b, message='y_b', summarize=10)
        layers.Print(x_emb, message='x_emb', summarize=10)
        layers.Print(projection, message='projection', summarize=10)
        layers.Print(losses, message='losses', summarize=320)
        layers.Print(self.loss, message='loss', summarize=320)

    self.grad_vars = [x_f, y_f, x_b, y_b, self.loss]
    self.grad_vars_name = ['x', 'y', 'x_r', 'y_r', 'final_loss']
    fw_vars_name = ['x_emb', 'proj', 'loss'] + [
        'init_hidden', 'init_cell'
    ] + ['rnn_out', 'rnn_out2', 'cell', 'cell2', 'xproj', 'xproj2']
    bw_vars_name = ['x_emb_r', 'proj_r', 'loss_r'] + [
        'init_hidden_r', 'init_cell_r'
    ] + [
        'rnn_out_r', 'rnn_out2_r', 'cell_r', 'cell2_r', 'xproj_r', 'xproj2_r'
    ]
    fw_vars = forward + [init_hidden, init_cell
                         ] + fw_hiddens + fw_cells + fw_projs
    bw_vars = backward + [init_hidden_r, init_cell_r
                          ] + bw_hiddens + bw_cells + bw_projs

    for i in range(len(fw_vars_name)):
        self.grad_vars.append(fw_vars[i])
        self.grad_vars.append(bw_vars[i])
        self.grad_vars_name.append(fw_vars_name[i])
        self.grad_vars_name.append(bw_vars_name[i])

    if args.use_custom_samples:
        self.feed_order = [
            'x', 'y', 'x_r', 'y_r', 'custom_samples', 'custom_samples_r',
            'custom_probabilities'
        ]
    else:
        self.feed_order = ['x', 'y', 'x_r', 'y_r']

    self.last_hidden = [
        fluid.layers.sequence_last_step(input=x)
        for x in fw_hiddens_ori + bw_hiddens_ori
    ]
    self.last_cell = [
        fluid.layers.sequence_last_step(input=x)
        for x in fw_cells + bw_cells
    ]
    self.last_hidden = layers.concat(self.last_hidden, axis=0)
    self.last_hidden.persistable = True
    self.last_cell = layers.concat(self.last_cell, axis=0)
    self.last_cell.persistable = True
    if args.debug:
        layers.Print(self.last_cell, message='last_cell', summarize=10)
        layers.Print(self.last_hidden, message='last_hidden', summarize=10)