def attn_flow(q_enc, p_enc, p_ids_name, args):
    """Bidirectional Attention layer"""
    tag = p_ids_name + "__"
    drnn = layers.DynamicRNN()
    with drnn.block():
        # one passage step against the whole question
        h_cur = drnn.step_input(p_enc)
        u_all = drnn.static_input(q_enc)
        h_expd = layers.sequence_expand(x=h_cur, y=u_all)
        # similarity between the current passage step and every question word
        s_t_mul = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
        s_t_sum = layers.reduce_sum(input=s_t_mul, dim=1, keep_dim=True)
        s_t_re = layers.reshape(s_t_sum, shape=[-1, 0])
        s_t = layers.sequence_softmax(input=s_t_re)
        # context-to-query attention: weighted sum of question words
        u_expr = layers.elementwise_mul(x=u_all, y=s_t, axis=0)
        u_expr = layers.sequence_pool(input=u_expr, pool_type='sum')
        # max similarity per passage step, used for query-to-context attention
        b_t = layers.sequence_pool(input=s_t_sum, pool_type='max')
        drnn.output(u_expr, b_t)
    U_expr, b = drnn()

    # query-to-context attention: attend over passage steps with the max scores
    b_norm = layers.sequence_softmax(input=b)
    h_expr = layers.elementwise_mul(x=p_enc, y=b_norm, axis=0)
    h_expr = layers.sequence_pool(input=h_expr, pool_type='sum')
    H_expr = layers.sequence_expand(x=h_expr, y=p_enc)
    H_expr = layers.lod_reset(x=H_expr, y=p_enc)

    # G = [H; U~; H o U~; H o H~]
    h_u = layers.elementwise_mul(x=p_enc, y=U_expr, axis=0)
    h_h = layers.elementwise_mul(x=p_enc, y=H_expr, axis=0)
    g = layers.concat(input=[p_enc, U_expr, h_u, h_h], axis=1)
    return dropout(g, args)
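# The following is a minimal NumPy sketch (illustration only, not part of the
# Fluid graph above) of the attention-flow math that attn_flow builds: a
# similarity matrix between passage and question encodings, context-to-query
# attention U~, query-to-context attention H~, and the concatenation
# G = [H; U~; H o U~; H o H~]. All shapes and names here are assumptions.
import numpy as np

def attn_flow_reference(H, U):
    # H: passage encoding [T, d]; U: question encoding [J, d]
    S = H @ U.T                                            # similarity matrix [T, J]
    a = np.exp(S) / np.exp(S).sum(axis=1, keepdims=True)   # softmax over question words
    U_tilde = a @ U                                        # context-to-query attention [T, d]
    b = np.exp(S.max(axis=1))
    b = b / b.sum()                                        # softmax over passage positions
    h_tilde = b @ H                                        # query-to-context summary [d]
    H_tilde = np.tile(h_tilde, (H.shape[0], 1))            # broadcast back to every step
    return np.concatenate([H, U_tilde, H * U_tilde, H * H_tilde], axis=1)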
def decoder_train(context, is_sparse):
    # decoder
    trg_language_word = pd.data(
        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
    trg_embedding = pd.embedding(
        input=trg_language_word,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr=fluid.ParamAttr(name='vemb'))

    rnn = pd.DynamicRNN()
    with rnn.block():
        # one target word per step; the decoder state is seeded with the
        # encoder context
        current_word = rnn.step_input(trg_embedding)
        pre_state = rnn.memory(init=context)
        current_state = pd.fc(
            input=[current_word, pre_state], size=decoder_size, act='tanh')
        # per-step distribution over the target vocabulary
        current_score = pd.fc(
            input=current_state, size=target_dict_dim, act='softmax')
        rnn.update_memory(pre_state, current_state)
        rnn.output(current_score)

    return rnn()
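# A hedged usage sketch, not code from the snippet above: the decoder's
# per-step softmax scores are matched against the next target word with a
# cross-entropy loss. It assumes `context` comes from an encoder, `pd` aliases
# fluid.layers, and the reader feeds "target_language_next_word".
rnn_out = decoder_train(context, is_sparse=True)
label = pd.data(
    name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
cost = pd.cross_entropy(input=rnn_out, label=label)
avg_cost = pd.mean(cost)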
def gru_decoder_with_attention(self, target_embedding, encoder_vec,
                               encoder_proj, decoder_boot, decoder_size,
                               char_num):
    rnn = layers.DynamicRNN()
    with rnn.block():
        current_word = rnn.step_input(target_embedding)
        encoder_vec = rnn.static_input(encoder_vec)
        encoder_proj = rnn.static_input(encoder_proj)
        hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
        # attention context over the encoder outputs for the current state
        context = self.simple_attention(encoder_vec, encoder_proj, hidden_mem,
                                        decoder_size)
        # gru_unit expects an input of size 3 * decoder_size
        # (update gate, reset gate and candidate)
        fc_1 = layers.fc(
            input=context, size=decoder_size * 3, bias_attr=False,
            name="rnn_fc1")
        fc_2 = layers.fc(
            input=current_word, size=decoder_size * 3, bias_attr=False,
            name="rnn_fc2")
        decoder_inputs = fc_1 + fc_2
        h, _, _ = layers.gru_unit(
            input=decoder_inputs, hidden=hidden_mem, size=decoder_size * 3)
        rnn.update_memory(hidden_mem, h)
        # per-step distribution over the character set
        out = layers.fc(
            input=h, size=char_num, bias_attr=True, act='softmax',
            name="rnn_out_fc")
        rnn.output(out)
    return rnn()
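# A hedged sketch of how this decoder might be wired for training. Names such
# as `model`, `encoded_vector`, `encoded_proj`, `decoder_boot`, `decoder_size`,
# `char_num` and `word_vector_dim` are assumptions, not taken from the snippet:
# ground-truth characters are embedded as teacher-forcing inputs, and the
# per-step softmax output is trained against the next character.
label_in = layers.data(name='label_in', shape=[1], dtype='int64', lod_level=1)
label_out = layers.data(name='label_out', shape=[1], dtype='int64', lod_level=1)
trg_embedding = layers.embedding(
    input=label_in, size=[char_num, word_vector_dim], dtype='float32')
prediction = model.gru_decoder_with_attention(
    trg_embedding, encoded_vector, encoded_proj, decoder_boot,
    decoder_size, char_num)
avg_cost = layers.mean(layers.cross_entropy(input=prediction, label=label_out))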
def rc_model(hidden_size, vocab, args):
    emb_shape = [vocab.size(), vocab.embed_dim]
    start_labels = layers.data(
        name="start_lables", shape=[1], dtype='float32', lod_level=1)
    end_labels = layers.data(
        name="end_lables", shape=[1], dtype='float32', lod_level=1)

    # stage 1: encode
    q_id0 = get_data('q_id0', 1, args)
    q_ids = get_data('q_ids', 2, args)
    p_ids_name = 'p_ids'
    p_ids = get_data('p_ids', 2, args)
    p_embs = embedding(p_ids, emb_shape, args)
    q_embs = embedding(q_ids, emb_shape, args)

    drnn = layers.DynamicRNN()
    with drnn.block():
        p_emb = drnn.step_input(p_embs)
        q_emb = drnn.step_input(q_embs)
        p_enc = encoder(p_emb, 'p_enc', hidden_size, args)
        q_enc = encoder(q_emb, 'q_enc', hidden_size, args)

        # stage 2: match
        g_i = attn_flow(q_enc, p_enc, p_ids_name, args)
        # stage 3: fusion
        m_i = fusion(g_i, args)
        drnn.output(m_i, q_enc)

    ms, q_encs = drnn()
    p_vec = layers.lod_reset(x=ms, y=start_labels)
    q_vec = layers.lod_reset(x=q_encs, y=q_id0)

    # stage 4: decode
    start_probs, end_probs = point_network_decoder(
        p_vec=p_vec, q_vec=q_vec, hidden_size=hidden_size, args=args)

    cost0 = layers.sequence_pool(
        layers.cross_entropy(
            input=start_probs, label=start_labels, soft_label=True), 'sum')
    cost1 = layers.sequence_pool(
        layers.cross_entropy(
            input=end_probs, label=end_labels, soft_label=True), 'sum')
    cost0 = layers.mean(cost0)
    cost1 = layers.mean(cost1)
    cost = cost0 + cost1
    cost.persistable = True

    feeding_list = ["q_ids", "start_lables", "end_lables", "p_ids", "q_id0"]
    return cost, start_probs, end_probs, ms, feeding_list
def custom_dynamic_rnn(p_vec, init_state, decoder_size):
    # project the passage once; reused as a static input inside the RNN
    context = layers.fc(input=p_vec, size=decoder_size, act=None)

    drnn = layers.DynamicRNN()
    with drnn.block():
        H_s = drnn.step_input(p_vec)
        ctx = drnn.static_input(context)
        # LSTM cell and hidden memories, both seeded with the initial state
        c_prev = drnn.memory(init=init_state, need_reorder=True)
        m_prev = drnn.memory(init=init_state, need_reorder=True)
        # project the previous state and broadcast it over the passage
        m_prev1 = layers.fc(input=m_prev, size=decoder_size, act=None)
        m_prev1 = layers.sequence_expand(x=m_prev1, y=ctx)
        # additive attention over the passage positions
        Fk = ctx + m_prev1
        Fk = layers.fc(input=Fk, size=decoder_size, act='tanh')
        logits = layers.fc(input=Fk, size=1, act=None)
        scores = layers.sequence_softmax(input=logits)
        attn_ctx = layers.elementwise_mul(x=ctx, y=scores, axis=0)
        attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')
        # advance the LSTM state with the attended context
        hidden_t, cell_t = lstm_step(
            attn_ctx, hidden_t_prev=m_prev1, cell_t_prev=c_prev,
            size=decoder_size)
        drnn.update_memory(ex_mem=m_prev, new_mem=hidden_t)
        drnn.update_memory(ex_mem=c_prev, new_mem=cell_t)
        drnn.output(scores)

    beta = drnn()
    return beta
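# A minimal NumPy sketch (illustration only; weights and shapes are
# assumptions) of one attention step inside custom_dynamic_rnn: the previous
# decoder state is projected, broadcast over the passage, combined additively
# with the projected passage, and turned into a distribution over positions.
import numpy as np

def pointer_attention_step(ctx, m_prev, W_m, W_f, w_out):
    # ctx: projected passage [T, d]; m_prev: previous decoder state [d]
    combined = ctx + m_prev @ W_m              # state broadcast over all T positions
    f = np.tanh(combined @ W_f)                # fc(..., act='tanh')
    logits = f @ w_out                         # fc(..., size=1): one logit per position
    scores = np.exp(logits - logits.max())
    scores = scores / scores.sum()             # sequence_softmax over the passage
    attn_ctx = scores @ ctx                    # elementwise_mul + sequence_pool('sum')
    return scores, attn_ctx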
def dynamic_rnn_net(self):
    x = layers.data(
        shape=[BATCH_SIZE * SEQ_LEN, INPUT_DIM],
        dtype="float32",
        name="x",
        append_batch_size=False)
    x.stop_gradient = False

    rnn = layers.DynamicRNN()
    with rnn.block():
        x_t = rnn.step_input(x)
        # zero-initialized memory carried across time steps
        h_pre = rnn.memory(shape=[INPUT_DIM])
        h = layers.scale(
            x=layers.elementwise_add(x=h_pre, y=x_t), scale=self.scale)
        rnn.update_memory(h_pre, h)
        rnn.output(h)

    return layers.mean(rnn())
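# A hedged sketch of exercising this small network with an executor. It
# assumes `net` is an instance of the surrounding test class, that BATCH_SIZE,
# SEQ_LEN and INPUT_DIM are defined, and that `fluid` is imported; the
# DynamicRNN consumes a LoDTensor, so the feed is built with
# fluid.create_lod_tensor.
import numpy as np

place = fluid.CPUPlace()
exe = fluid.Executor(place)

loss = net.dynamic_rnn_net()
exe.run(fluid.default_startup_program())

x_np = np.random.random((BATCH_SIZE * SEQ_LEN, INPUT_DIM)).astype('float32')
x_lod = fluid.create_lod_tensor(x_np, [[SEQ_LEN] * BATCH_SIZE], place)
loss_val, = exe.run(fluid.default_main_program(),
                    feed={'x': x_lod},
                    fetch_list=[loss])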
def rc_model(hidden_size, vocab, args):
    """This function builds the whole BiDAF network."""
    emb_shape = [vocab.size(), vocab.embed_dim]
    start_labels = layers.data(
        name="start_lables", shape=[1], dtype='float32', lod_level=1)
    end_labels = layers.data(
        name="end_lables", shape=[1], dtype='float32', lod_level=1)

    # stage 1: set up input data, embedding table & encode
    """
    def get_data(input_name, lod_level, args):
        input_ids = layers.data(
            name=input_name, shape=[1], dtype='int64', lod_level=lod_level)
        return input_ids
    """
    q_id0 = get_data('q_id0', 1, args)
    q_ids = get_data('q_ids', 2, args)
    p_ids_name = 'p_ids'
    p_ids = get_data('p_ids', 2, args)
    """
    def embedding(input_ids, shape, args):
        # Embedding layer
        input_embedding = layers.embedding(
            input=input_ids,
            size=shape,
            dtype='float32',
            is_sparse=True,
            param_attr=fluid.ParamAttr(name='embedding_para'))
        return input_embedding
    """
    p_embs = embedding(p_ids, emb_shape, args)  # emb_shape = [vocab.size(), vocab.embed_dim]
    q_embs = embedding(q_ids, emb_shape, args)

    drnn = layers.DynamicRNN()
    with drnn.block():
        p_emb = drnn.step_input(p_embs)  # step_input() marks the sequence as a dynamic RNN input
        q_emb = drnn.step_input(q_embs)
        p_enc = encoder(p_emb, 'p_enc', hidden_size, args)  # BiLSTM
        q_enc = encoder(q_emb, 'q_enc', hidden_size, args)  # BiLSTM

        # stage 2: match
        g_i = attn_flow(q_enc, p_enc, p_ids_name, args)
        # stage 3: fusion
        m_i = fusion(g_i, args)
        drnn.output(m_i, q_enc)

    ms, q_encs = drnn()
    p_vec = layers.lod_reset(x=ms, y=start_labels)
    q_vec = layers.lod_reset(x=q_encs, y=q_id0)

    # stage 4: decode
    start_probs, end_probs = point_network_decoder(
        p_vec=p_vec, q_vec=q_vec, hidden_size=hidden_size, args=args)

    # calculate model loss
    cost0 = layers.sequence_pool(
        layers.cross_entropy(
            input=start_probs, label=start_labels, soft_label=True), 'sum')
    cost1 = layers.sequence_pool(
        layers.cross_entropy(
            input=end_probs, label=end_labels, soft_label=True), 'sum')
    cost0 = layers.mean(cost0)
    cost1 = layers.mean(cost1)
    cost = cost0 + cost1
    cost.persistable = True

    feeding_list = ["q_ids", "start_lables", "end_lables", "p_ids", "q_id0"]
    return cost, start_probs, end_probs, ms, feeding_list
def attn_flow(q_enc, p_enc, p_ids_name):
    # note: `args`, `embedding_dim` and `bi_lstm_encoder` are resolved from the
    # enclosing module scope in this version
    tag = p_ids_name + "::"

    # context-to-query attention
    drnn = layers.DynamicRNN()
    with drnn.block():
        h_cur = drnn.step_input(p_enc)
        u_all = drnn.static_input(q_enc)
        h_expd = layers.sequence_expand(x=h_cur, y=u_all)
        s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
        s_t1 = layers.reduce_sum(input=s_t_, dim=1)
        s_t = layers.sequence_softmax(input=s_t1)
        u_expr = layers.elementwise_mul(x=u_all, y=s_t, axis=0)
        u_expr = layers.sequence_pool(input=u_expr, pool_type='sum')
        if args.debug == True:
            '''
            layers.Print(h_expd, message='h_expd')
            layers.Print(h_cur, message='h_cur')
            layers.Print(u_all, message='u_all')
            layers.Print(s_t, message='s_t')
            layers.Print(s_t_, message='s_t_')
            layers.Print(u_expr, message='u_expr')
            '''
        drnn.output(u_expr)
    U_expr = drnn()

    # query-to-context attention: max similarity per passage step
    drnn2 = layers.DynamicRNN()
    with drnn2.block():
        h_cur = drnn2.step_input(p_enc)
        u_all = drnn2.static_input(q_enc)
        h_expd = layers.sequence_expand(x=h_cur, y=u_all)
        s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
        s_t2 = layers.reduce_sum(input=s_t_, dim=1, keep_dim=True)
        b_t = layers.sequence_pool(input=s_t2, pool_type='max')
        if args.debug == True:
            '''
            layers.Print(s_t2, message='s_t2')
            layers.Print(b_t, message='b_t')
            '''
        drnn2.output(b_t)
    b = drnn2()

    b_norm = layers.sequence_softmax(input=b)
    h_expr = layers.elementwise_mul(x=p_enc, y=b_norm, axis=0)
    h_expr = layers.sequence_pool(input=h_expr, pool_type='sum')
    H_expr = layers.sequence_expand(x=h_expr, y=p_enc)
    H_expr = layers.lod_reset(x=H_expr, y=p_enc)
    h_u = layers.elementwise_mul(x=H_expr, y=U_expr, axis=0)
    h_h = layers.elementwise_mul(x=H_expr, y=p_enc, axis=0)
    g = layers.concat(input=[H_expr, U_expr, h_u, h_h], axis=1)

    # fusion
    m = bi_lstm_encoder(input_seq=g, gate_size=embedding_dim)

    if args.debug == True:
        layers.Print(U_expr, message=tag + 'U_expr')
        layers.Print(H_expr, message=tag + 'H_expr')
        layers.Print(b, message=tag + 'b')
        layers.Print(b_norm, message=tag + 'b_norm')
        layers.Print(g, message=tag + 'g')
        layers.Print(m, message=tag + 'm')
        layers.Print(h_h, message=tag + 'h_h')
        layers.Print(q_enc, message=tag + 'q_enc')
        layers.Print(p_enc, message=tag + 'p_enc')

    return m, g
def rc_model(hidden_size, vocab, args):
    """This function builds the whole BiDAF network."""
    emb_shape = [vocab.size(), vocab.embed_dim]
    start_labels = layers.data(
        name="start_lables", shape=[1], dtype='float32', lod_level=1)
    end_labels = layers.data(
        name="end_lables", shape=[1], dtype='float32', lod_level=1)

    # stage 1: set up input data, embedding table & encode
    q_id0 = get_data('q_id0', 1, args)
    q_ids = get_data('q_ids', 2, args)
    p_ids_name = 'p_ids'
    p_ids = get_data('p_ids', 2, args)
    # no character-level embedding, only word-level embeddings
    p_embs = embedding(p_ids, emb_shape, args)
    q_embs = embedding(q_ids, emb_shape, args)

    drnn = layers.DynamicRNN()
    with drnn.block():
        p_emb = drnn.step_input(p_embs)
        q_emb = drnn.step_input(q_embs)
        # sentence-level (contextual) encodings
        p_enc = encoder(p_emb, 'p_enc', hidden_size, args)  # paragraph
        q_enc = encoder(q_emb, 'q_enc', hidden_size, args)  # query

        # Attention flow layer is responsible for linking and
        # fusing information from the context and the query words.
        # stage 2: match
        g_i = attn_flow(q_enc, p_enc, p_ids_name, args)
        # stage 3: fusion
        m_i = fusion(g_i, args)
        drnn.output(m_i, q_enc)

    ms, q_encs = drnn()
    p_vec = layers.lod_reset(x=ms, y=start_labels)
    q_vec = layers.lod_reset(x=q_encs, y=q_id0)

    # stage 4: decode
    start_probs, end_probs = point_network_decoder(
        p_vec=p_vec, q_vec=q_vec, hidden_size=hidden_size, args=args)

    # calculate model loss
    cost0 = layers.sequence_pool(
        layers.cross_entropy(
            input=start_probs, label=start_labels, soft_label=True), 'sum')
    cost1 = layers.sequence_pool(
        layers.cross_entropy(
            input=end_probs, label=end_labels, soft_label=True), 'sum')
    cost0 = layers.mean(cost0)
    cost1 = layers.mean(cost1)
    cost = cost0 + cost1
    cost.persistable = True

    feeding_list = ["q_ids", "start_lables", "end_lables", "p_ids", "q_id0"]
    return cost, start_probs, end_probs, ms, feeding_list
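# A hedged sketch of turning rc_model's outputs and feeding_list into a
# training step. The optimizer, learning rate and `train_reader` are
# assumptions, not part of the model code above.
cost, start_probs, end_probs, ms, feeding_list = rc_model(hidden_size, vocab, args)
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(cost)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

feeder = fluid.DataFeeder(feed_list=feeding_list, place=place)
for batch in train_reader():
    loss_val, = exe.run(fluid.default_main_program(),
                        feed=feeder.feed(batch),
                        fetch_list=[cost])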