def fluid_sequence_pad(input, pad_value, maxlen=None):
    """Pad a LoD sequence tensor into a dense, batch-major tensor.

    args:
        input: (batch*seq_len, dim)
    returns:
        (batch, max_seq_len, dim)
    """
    # Materialize the scalar pad value as a tensor and match the input dtype.
    pad_var = fluid.layers.assign(input=np.array([pad_value], 'float32'))
    pad_var = layers.cast(pad_var, input.dtype)
    # sequence_pad returns (padded, lengths); the lengths are not needed here.
    # Shapes: (batch, max_seq_len, 1) and (batch, 1).
    # TODO: maxlen=300 was once used to work around
    # https://github.com/PaddlePaddle/Paddle/issues/14164
    padded, _ = layers.sequence_pad(input, pad_var, maxlen=maxlen)
    return padded
def recv_func(msg):
    """Aggregate neighbour messages with stacked self-attention, then masked mean-pool."""
    zero = L.assign(input=np.array([0.0], dtype=np.float32))
    # Pad variable-length neighbour sequences to a fixed length of max_neigh.
    feat, seq_len = L.sequence_pad(msg, zero, maxlen=max_neigh)
    valid = L.sequence_mask(seq_len, dtype="float32", maxlen=max_neigh)
    valid = L.unsqueeze(valid, [2])
    # Pairwise attention bias: 0 for valid-valid pairs, -10000 where either
    # position is padding (so softmax assigns it ~zero weight).
    attn_bias = (L.matmul(valid, valid, transpose_y=True) - 1) * -10000
    feat_out = feat
    for idx in range(num_layers):
        feat_out = self_attention_and_residual(
            feat_out,
            hidden_size,
            attn_bias,
            name="cross_feat_%s" % idx,
            maxlen=max_neigh)
    # Mean over valid neighbour positions only.
    return L.reduce_sum(feat_out * valid, 1) / L.reduce_sum(valid, 1)
def knowledge_seq2seq(config):
    """Build the knowledge-grounded seq2seq program (PaddlePaddle fluid).

    Encodes the dialogue context (`enc_input`) and goal (`goal_input`),
    attends over an encoded knowledge (cue) memory, and then either computes
    training losses or runs beam-search decoding.

    Args:
        config: namespace-like object; fields read here include embed_size,
            hidden_size, num_layers, bidirectional, batch_size, vocab_size,
            run_type, use_posterior, use_bow, max_len, dropout, beam_size,
            bos_id, eos_id, unk_id, max_dec_len, length_average.

    Returns:
        run_type == "train": [bow_loss, kl_loss, nll_loss, final_loss]
        run_type == "test":  (final_score, final_ids, final_index), each of
            shape [max_dec_len, beam_size * batch_size].

    NOTE(review): the "train" path references `weight_target`/`target_query`
    (defined only under `config.use_posterior`) and `bow_loss` (defined only
    under `config.use_bow`), so training implicitly requires both flags to be
    true — confirm with callers.
    """
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    # --- Input placeholders (LoD tensors where lod_level=1). ---
    enc_input = layers.data(name="enc_input", shape=[1], dtype='int64', lod_level=1)  #enc_input --> goal
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(name="goal_input", shape=[1], dtype='int64', lod_level=1)  #goal_input --> x
    cue_input = layers.data(name="cue_input", shape=[1], dtype='int64', lod_level=1)  #cue_input --> kg
    #cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32')
    memory_mask = layers.data(name='memory_mask', shape=[-1, 1], dtype='float32')
    tar_input = layers.data(name='tar_input', shape=[1], dtype='int64', lod_level=1)  #tar_input --> y
    # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32')

    # Halve the per-direction hidden size so the bidirectional concat
    # still produces `hidden_size` features.
    rnn_hidden_size = hidden_size
    if bi_direc:
        rnn_hidden_size //= 2

    # --- Encoders for context and goal. ---
    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc, dropout=0.0,
                    batch_first=True, name="rnn_enc")
    goal_out, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc, dropout=0.0,
                    batch_first=True, name="rnn_enc1")

    # Fuse context and goal final states, project back to 2*rnn_hidden_size.
    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(context_goal_out, shape=[-1, 1, rnn_hidden_size * 4])
    # context_goal_out = layers.squeeze(context_goal_out, axes=[1])
    context_goal_out = fluid.layers.fc(context_goal_out, size=rnn_hidden_size * 2, bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    # Bridge: tanh projection that later becomes the decoder's initial hidden.
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)

    cue_last_mask = layers.data(name='cue_last_mask', shape=[-1], dtype='float32')

    # --- Knowledge (cue) encoder. ---
    knowledge_out, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc, dropout=0.0,
                    batch_first=True, last_mask=cue_last_mask,
                    name="knowledge_enc")

    # Prior attention query comes from the bridge (context+goal) state.
    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(knowledge_last_hidden, axes=[0], starts=[0], ends=[1])
    cue_memory = layers.reshape(cue_memory, shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])

    # Prior knowledge selection: attend over the cue memory.
    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)
    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
    if config.use_posterior:
        print("config.use_posterior", config.use_posterior)
        # Posterior path: condition knowledge selection on the target reply.
        target_out, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size,
                        batch_size, num_layers, bi_direc, dropout=0.0,
                        batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(target_goal_out, shape=[-1, 1, rnn_hidden_size * 4])
        # target_goal_out = layers.squeeze(target_goal_out, axes=[1])
        target_goal_out = fluid.layers.fc(target_goal_out, size=rnn_hidden_size * 2, bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])

        # get attenion
        # target_query = layers.slice(target_last_hidden, axes=[0], starts=[0], ends=[1])
        target_query = layers.slice(target_goal_out, axes=[0], starts=[0], ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(target_query, shape=[batch_size, -1, hidden_size])

        # Posterior attention over the same cue memory.
        weight_target, target_att = dot_attention(target_query, cue_memory, mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # add to output
        knowledge = weight_target

    enc_memory_mask = layers.data(name="enc_memory_mask", shape=[-1, 1], dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))
    enc_memory, origl_len_1 = layers.sequence_pad(x=enc_out, pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(input_size + hidden_size, hidden_size,
                        num_layers=num_layers, dropout=0.0,
                        name="decoder_gru_unit")
    cue_gru_unit = GRU_unit(hidden_size + hidden_size, hidden_size,
                            num_layers=num_layers, dropout=0.0,
                            name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        if config.use_bow:
            # Bag-of-words auxiliary loss: predict target words directly
            # from the selected knowledge vector.
            bow_logits = fc(knowledge, hidden_size, hidden_size, name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits, hidden_size, tgt_vocab_size, name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)

            bow_label = layers.data(name='bow_label', shape=[-1, config.max_len], dtype='int64')
            bow_mask = layers.data(name="bow_mask", shape=[-1, config.max_len], dtype='float32')

            # Same distribution is compared against every target position.
            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits, bow_label, soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])

            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input", shape=[-1, 1, 1], dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32')
        # NOTE(review): training decodes from the posterior knowledge
        # (`weight_target`), so this branch requires config.use_posterior.
        dec_knowledge = weight_target
        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(knowledge_goal_out, shape=[-1, 1, rnn_hidden_size * 4])
        # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1])
        knowledge_goal_out = fluid.layers.fc(knowledge_goal_out, size=rnn_hidden_size * 2, bias_attr=False)
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size,
                        hidden_size, num_layers, enc_memory, enc_memory_mask,
                        dec_knowledge, vocab_size,
                        init_hidden=dec_init_hidden, mask=dec_mask,
                        dropout=config.dropout)

        target_label = layers.data(name='target_label', shape=[-1, 1], dtype='int64')
        target_mask = layers.data(name='target_mask', shape=[-1, 1], dtype='float32')

        # Token-level NLL, masked and averaged over the batch.
        decoder_logits = layers.reshape(decoder_logits, shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])

        nll_loss = layers.cross_entropy(decoder_logits, target_label, soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)

        # KL(posterior || prior) over knowledge attention; posterior side is
        # detached so gradients only shape the prior.
        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True

        prior_attn = layers.log(prior_attn)

        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) - prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        # External schedule factor for KL/NLL annealing, fed at run time.
        kl_and_nll_factor = layers.data(name='kl_and_nll_factor', shape=[1], dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]
    elif run_type == "test":
        # --- Beam-search decoding, unrolled for max_dec_len steps. ---
        beam_size = config.beam_size
        batch_size = config.batch_size
        token = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                     value=config.bos_id, dtype='int64')

        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len

        dec_knowledge = knowledge
        INF = 100000000.0

        # Only the first beam of each example starts alive (score 0); the
        # rest start at -INF so step 1 expands a single hypothesis.
        init_score_np = np.ones([beam_size * batch_size], dtype='float32') * -INF
        for i in range(batch_size):
            init_score_np[i * beam_size] = 0.0

        pre_score = layers.assign(init_score_np)
        # Offset of each example's first beam in the flattened beam axis.
        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size
        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []

        # Tile encoder memory/mask/knowledge across beams.
        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(init_enc_mask, shape=[batch_size * beam_size, 1, -1])

        dec_knowledge = layers.reshape(dec_knowledge, shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge, shape=[batch_size * beam_size, -1, hidden_size])

        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden, shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id
        for i in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit, dec_emb, dec_init_hidden,
                             input_size, hidden_size, init_enc_memory,
                             init_enc_mask, init_dec_knowledge, mask=None)
            output_in_size = hidden_size + hidden_size

            rnnout = layers.dropout(dec_out, dropout_prob=config.dropout, is_test=True)
            rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])

            # Accumulate hypothesis scores; optionally keep a running
            # length-averaged score instead of a raw sum.
            if i > 1:
                if length_average:
                    log_softmax_output = layers.elementwise_add(
                        (log_softmax_output / i),
                        (pre_score * (1.0 - 1.0 / i)), axis=0)
                else:
                    log_softmax_output = layers.elementwise_add(
                        log_softmax_output, pre_score, axis=0)
            else:
                log_softmax_output = layers.elementwise_add(
                    log_softmax_output, pre_score, axis=0)

            # Top-k over (beam * vocab) candidates per example.
            log_softmax_output = layers.reshape(log_softmax_output, shape=[batch_size, -1])
            topk_score, topk_index = layers.topk(log_softmax_output, k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            # Decompose the flat index into (parent beam, token id).
            vocab_var = layers.fill_constant([1], dtype='int64', value=vocab_size)
            new_token = topk_index % vocab_var
            index = topk_index // vocab_var
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index

            score_array.append(topk_score)

            # Kill beams that emitted EOS or UNK by pushing their scores
            # to effectively -inf for subsequent steps.
            eos_ids = layers.fill_constant([beam_size * batch_size], dtype='int64', value=EOS)
            unk_ids = layers.fill_constant([beam_size * batch_size], dtype='int64', value=UNK)
            eos_eq = layers.cast(layers.equal(new_token, eos_ids), dtype='float32')

            topk_score += eos_eq * -100000000.0

            unk_eq = layers.cast(layers.equal(new_token, unk_ids), dtype='float32')
            topk_score += unk_eq * -100000000.0

            # update
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            # Reorder decoder state and memories to follow surviving beams.
            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        # Stack per-step results into [max_dec_len, beam * batch] tensors;
        # final_index holds parent-beam backpointers for trace-back.
        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
def __init__(self, embedding_dim, encoder_size, decoder_size,
             source_dict_dim, target_dict_dim, tag_dict_dim, is_generating,
             beam_size, max_length, source_entity_dim, source_pos_dim,
             embedding_entity_dim, embedding_pos_dim, end_id):
    """Build the encoder and decoder graph for the tagging seq2seq model.

    Encodes word, entity and POS embeddings with a bi-LSTM, pads the
    encoded sequence for later copy scoring, and then builds either the
    training decoder (cross-entropy loss) or the beam-search decoder,
    depending on `is_generating`.

    Fix: removed the dead store `self.source_dict_dim = target_dict_dim`,
    which was immediately overwritten by the correct assignment
    `self.source_dict_dim = source_dict_dim` below (it looked like a typo
    for `self.target_dict_dim`, which is also assigned correctly).
    """
    self.encoder_size = encoder_size
    self.decoder_size = decoder_size
    self.embedding_dim = embedding_dim
    self.is_generating = is_generating
    self.source_dict_dim = source_dict_dim
    self.target_dict_dim = target_dict_dim
    self.tag_dict_dim = tag_dict_dim
    self.max_length = max_length
    self.end_id = end_id
    self.beam_size = beam_size
    self.no_grad_set = []
    self.dropout_prob = 0.5

    # --- Encoder inputs: words, entities, POS tags (LoD sequences). ---
    src_word_idx = fluid.layers.data(name='source_sequence', shape=[1],
                                     dtype='int64', lod_level=1)
    self.src_word_idx = src_word_idx
    src_embedding = fluid.layers.embedding(
        input=src_word_idx,
        size=[source_dict_dim, embedding_dim],
        dtype='float32',
        param_attr=fluid.ParamAttr(name='emb'))
    src_entity_idx = fluid.layers.data(name='source_entities', shape=[1],
                                       dtype='int64', lod_level=1)
    entity_embedding = fluid.layers.embedding(
        input=src_entity_idx,
        size=[source_entity_dim, embedding_entity_dim],
        dtype='float32')
    src_pos_idx = fluid.layers.data(name='source_pos', shape=[1],
                                    dtype='int64', lod_level=1)
    pos_embedding = fluid.layers.embedding(
        input=src_pos_idx,
        size=[source_pos_dim, embedding_pos_dim],
        dtype='float32')

    # Concatenate the three feature embeddings along the feature axis.
    embeddings = fluid.layers.concat(
        input=[src_embedding, entity_embedding, pos_embedding], axis=1)

    src_forward, src_reversed = self.bi_lstm_encoder(
        input_seq=embeddings, gate_size=encoder_size)
    encoded_vector = fluid.layers.concat(
        input=[src_forward, src_reversed], axis=1)

    # Dense-padded copy of the encoder output, used for copy scoring.
    pad_zero = pd.fill_constant(shape=[self.encoder_size * 2],
                                dtype='float32', value=0)
    encoded_vector_full, encoded_vector_length = pd.sequence_pad(
        encoded_vector, pad_zero, maxlen=self.max_length,
        name="copy_score_padding")
    print(encoded_vector_full)  # debug output, kept to preserve behavior

    self.encoder_vec = encoded_vector
    self.encoder_vec_full = encoded_vector_full
    encoded_proj = fluid.layers.fc(input=encoded_vector, size=decoder_size,
                                   bias_attr=False)
    self.encoder_proj = encoded_proj

    # Decoder initial hidden state from the first step of the backward LSTM.
    backward_first = fluid.layers.sequence_pool(input=src_reversed,
                                                pool_type='first')
    decoder_boot = fluid.layers.fc(input=backward_first, size=decoder_size,
                                   bias_attr=False, act='tanh')
    # Constant initial cell state; not trained.
    cell_init = fluid.layers.fill_constant_batch_size_like(
        input=decoder_boot, value=1.0, shape=[-1, decoder_size],
        dtype='float32')
    cell_init.stop_gradient = True
    self.h = decoder_boot
    self.c = cell_init

    event_cla_id = fluid.layers.data(name='event_class', shape=[1],
                                     dtype='int64')
    self.event_embedding = fluid.layers.embedding(
        input=event_cla_id,
        size=[self.tag_dict_dim, embedding_entity_dim],
        dtype='float32')

    # --- Decoder: training loss or beam-search generation. ---
    label = fluid.layers.data(name='label_sequence', shape=[1],
                              dtype='int64', lod_level=1)
    if not is_generating:
        rnn_out = self.train_decoder(decoder_boot)
        predict_label = fluid.layers.argmax(x=rnn_out, axis=1)
        cost = fluid.layers.cross_entropy(input=rnn_out, label=label)
        avg_cost = fluid.layers.mean(x=cost)
        self.predict = rnn_out
        self.label = predict_label
        self.avg_cost = avg_cost
        feeding_list = [
            "source_sequence", "source_entities", "source_pos",
            "event_class", "source_index", "target_sequence",
            "label_sequence"
        ]
        self.feeding_list = feeding_list
    else:
        beam_search_out = self.decoder(decoder_boot)
        translation_ids, translation_scores = beam_search_out
        feeding_list = [
            "source_sequence", "source_entities", "source_pos",
            "event_class", "source_index", "label_sequence"
        ]
        self.translation_ids = translation_ids
        self.translation_scores = translation_scores
        self.feeding_list = feeding_list
    self.no_grad_set = set(self.no_grad_set)