def simple_rnn(rnn_input, init_hidden, hidden_size, kernel_param_attr=None,
               recurrent_param_attr=None, bias_attr=None, act='relu',
               sequence_length=None, name='simple_rnn'):
    # Transpose to (sequence x batch x hidden)
    rnn_input = layers.transpose(rnn_input, [1, 0, 2])

    # Generate mask
    mask = None
    if sequence_length:
        max_seq_len = layers.shape(rnn_input)[0]
        mask = layers.sequence_mask(sequence_length, maxlen=max_seq_len, dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    # Init
    simple_rnn = SimpleRNN_unit(rnn_input, hidden_size, kernel_param_attr,
                                recurrent_param_attr, bias_attr, act)

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(rnn_input)
        if mask:
            step_mask = rnn.step_input(mask)

        if init_hidden:
            pre_hidden = rnn.memory(init=init_hidden)
        else:
            pre_hidden = rnn.memory(batch_ref=rnn_input, shape=[-1, hidden_size])

        last_hidden = simple_rnn(step_in, pre_hidden)
        rnn.update_memory(pre_hidden, last_hidden)
        rnn.step_output(last_hidden)
        step_input = last_hidden

    rnn_out = rnn()
    last_hidden = rnn_out[-1]
    last_hidden = layers.reshape(last_hidden, shape=[1, -1, hidden_size])

    # Back to batch-major layout before returning.
    rnn_output = layers.transpose(rnn_out, [1, 0, 2])
    last_hidden = layers.transpose(last_hidden, [1, 0, 2])

    return rnn_output, last_hidden
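# Usage sketch (not part of the original snippet): how simple_rnn above might be
# wired into a fluid static-graph program. The placeholder names and shapes
# (feat, seq_len, feature dim 128, hidden_size 256) are illustrative, and a
# SimpleRNN_unit implementation is assumed to be defined elsewhere.
import paddle.fluid as fluid
import paddle.fluid.layers as layers

# Batch-major input: [batch, max_seq_len, feature_dim].
feat = fluid.data(name='feat', shape=[None, 20, 128], dtype='float32')
# Per-example valid lengths, used to build the padding mask.
seq_len = fluid.data(name='seq_len', shape=[None], dtype='int64')

# init_hidden=None lets the memory start from zeros via batch_ref.
rnn_output, last_hidden = simple_rnn(feat, init_hidden=None, hidden_size=256,
                                     sequence_length=seq_len)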
def get_single_direction_output(rnn_input, encode_hidden, unit_list, mask=None, direc_index=0):
    # NOTE: num_layers, init_hidden, hidden_size, dropout_prob and input come
    # from the enclosing scope.
    rnn = StaticRNN()
    # print(rnn_input.shape)
    with rnn.step():
        step_input = rnn.step_input(rnn_input)

        if mask:
            step_mask = rnn.step_input(mask)

        for i in range(num_layers):
            if init_hidden:
                pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
            else:
                pre_hidden = rnn.memory(batch_ref=rnn_input,
                                        shape=[-1, hidden_size],
                                        ref_batch_dim_idx=1)

            encode_h = encode_hidden[i]
            pre_encode_hidden = layers.concat([pre_hidden, encode_h], axis=1)
            new_hidden = unit_list[i](step_input, pre_encode_hidden)

            if mask:
                new_hidden = layers.elementwise_mul(
                    new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                        pre_hidden, (step_mask - 1), axis=0)

            rnn.update_memory(pre_hidden, new_hidden)
            rnn.step_output(new_hidden)

            step_input = new_hidden
            if dropout_prob is not None and dropout_prob > 0.0:
                step_input = layers.dropout(step_input, dropout_prob=dropout_prob)

        rnn.step_output(step_input)

    rnn_out = rnn()

    last_hidden_array = []
    all_hidden_array = []  # added to collect all of the hidden states
    rnn_output = rnn_out[-1]
    for i in range(num_layers):
        last_hidden = rnn_out[i]
        all_hidden_array.append(last_hidden)
        last_hidden = last_hidden[-1]
        last_hidden_array.append(last_hidden)

    all_hidden_array = layers.concat(all_hidden_array, axis=0)
    all_hidden_array = layers.reshape(
        all_hidden_array, shape=[num_layers, input.shape[0], -1, hidden_size])
    last_hidden_output = layers.concat(last_hidden_array, axis=0)
    last_hidden_output = layers.reshape(
        last_hidden_output, shape=[num_layers, -1, hidden_size])

    return rnn_output, last_hidden_output, all_hidden_array
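# The masking expression above, new_hidden * mask - pre_hidden * (mask - 1), is a
# select between the updated and the previous state: for a real step (mask = 1) it
# keeps new_hidden, for a padded step (mask = 0) it carries pre_hidden forward.
# A small NumPy sketch of the same arithmetic (illustration only):
import numpy as np

pre_hidden = np.array([[1.0, 1.0], [2.0, 2.0]])   # states from the previous step
new_hidden = np.array([[5.0, 5.0], [9.0, 9.0]])   # freshly computed states
mask = np.array([[1.0], [0.0]])                   # row 0 is real, row 1 is padding

out = new_hidden * mask - pre_hidden * (mask - 1)  # same formula as the step body
print(out)  # [[5. 5.]   -> updated state kept
            #  [2. 2.]]  -> previous state carried through the padded step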
def rnn_decoder(gru_unit, cue_gru_unit, input, input_size, hidden_size,
                num_layers, memory, memory_mask, knowledge, output_size,
                init_hidden=None, mask=None, dropout=0.0, batch_first=True,
                name="decoder"):
    """ rnn decoder """
    input_emb = get_embedding(input, input_size, output_size)
    if batch_first:
        input_emb = layers.transpose(input_emb, perm=[1, 0, 2])
    if mask:
        trans_mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input_emb)
        step_mask = None

        if mask:
            step_mask = rnn.step_input(trans_mask)

        # split pre_hidden
        pre_hidden_list = []

        pre_hidden = rnn.memory(init=init_hidden)
        real_out, last_hidden = \
            decoder_step(gru_unit, cue_gru_unit, step_in, pre_hidden,
                         input_size, hidden_size, memory, memory_mask,
                         knowledge, mask=step_mask)

        rnn.update_memory(pre_hidden, last_hidden)

        step_in = layers.squeeze(real_out, axes=[1])
        rnn.step_output(step_in)

    rnnout = rnn()
    rnnout = layers.transpose(rnnout, perm=[1, 0, 2])
    rnnout = layers.elementwise_mul(rnnout, mask, axis=0)

    output_in_size = hidden_size + hidden_size
    rnnout = layers.dropout(rnnout, dropout_prob=dropout)
    rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1')
    rnnout = fc(rnnout, hidden_size, output_size, name='dec_out_fc2')
    softmax_out = layers.softmax(rnnout)

    return softmax_out
def gru_rnn(input, input_size, hidden_size, init_hidden=None, batch_first=False,
            mask=None, num_layers=1, dropout=0.0, name="gru"):
    """ gru rnn """
    gru_unit = GRU_unit(input_size, hidden_size, num_layers=num_layers,
                        dropout=dropout, name=name + "_gru_unit")

    if batch_first:
        input = layers.transpose(x=input, perm=[1, 0, 2])
    if mask:
        mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input)
        step_mask = None

        if mask:
            step_mask = rnn.step_input(mask)

        pre_hidden = rnn.memory(init=init_hidden)
        new_hidden, last_hidden = gru_unit(step_in, pre_hidden, step_mask)
        rnn.update_memory(pre_hidden, last_hidden)
        step_in = new_hidden
        rnn.step_output(step_in)
        rnn.step_output(last_hidden)

    rnn_res = rnn()
    rnn_out = rnn_res[0]
    last_hidden = layers.slice(rnn_res[1], axes=[0], starts=[-1], ends=[1000000000])
    last_hidden = layers.reshape(last_hidden, shape=[num_layers, -1, hidden_size])

    if batch_first:
        rnn_out = layers.transpose(x=rnn_out, perm=[1, 0, 2])

    return rnn_out, last_hidden
def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
    weight_1_arr = []
    weight_2_arr = []
    bias_arr = []
    hidden_array = []
    cell_array = []
    mask_array = []
    for i in range(num_layers):
        weight_1 = layers.create_parameter(
            [hidden_size * 2, hidden_size * 4],
            dtype="float32",
            name="fc_weight1_" + str(i),
            default_initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale))
        weight_1_arr.append(weight_1)
        bias_1 = layers.create_parameter(
            [hidden_size * 4],
            dtype="float32",
            name="fc_bias1_" + str(i),
            default_initializer=fluid.initializer.Constant(0.0))
        bias_arr.append(bias_1)

        pre_hidden = layers.slice(
            init_hidden, axes=[0], starts=[i], ends=[i + 1])
        pre_cell = layers.slice(
            init_cell, axes=[0], starts=[i], ends=[i + 1])
        pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
        pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
        hidden_array.append(pre_hidden)
        cell_array.append(pre_cell)

    input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
    rnn = PaddingRNN()

    with rnn.step():
        input = rnn.step_input(input_embedding)
        for k in range(num_layers):
            pre_hidden = rnn.memory(init=hidden_array[k])
            pre_cell = rnn.memory(init=cell_array[k])
            weight_1 = weight_1_arr[k]
            bias = bias_arr[k]

            nn = layers.concat([input, pre_hidden], 1)
            gate_input = layers.matmul(x=nn, y=weight_1)
            gate_input = layers.elementwise_add(gate_input, bias)

            i = layers.slice(
                gate_input, axes=[1], starts=[0], ends=[hidden_size])
            j = layers.slice(
                gate_input, axes=[1], starts=[hidden_size],
                ends=[hidden_size * 2])
            f = layers.slice(
                gate_input, axes=[1], starts=[hidden_size * 2],
                ends=[hidden_size * 3])
            o = layers.slice(
                gate_input, axes=[1], starts=[hidden_size * 3],
                ends=[hidden_size * 4])

            c = pre_cell * layers.sigmoid(f) + layers.sigmoid(i) * layers.tanh(j)
            m = layers.tanh(c) * layers.sigmoid(o)

            rnn.update_memory(pre_hidden, m)
            rnn.update_memory(pre_cell, c)

            rnn.step_output(m)
            rnn.step_output(c)

            input = m
            if dropout is not None and dropout > 0.0:
                input = layers.dropout(
                    input,
                    dropout_prob=dropout,
                    dropout_implementation='upscale_in_train')

        rnn.step_output(input)

    rnnout = rnn()

    last_hidden_array = []
    last_cell_array = []
    real_res = rnnout[-1]
    for i in range(num_layers):
        m = rnnout[i * 2]
        c = rnnout[i * 2 + 1]
        m.stop_gradient = True
        c.stop_gradient = True
        last_h = layers.slice(
            m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
        last_hidden_array.append(last_h)
        last_c = layers.slice(
            c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
        last_cell_array.append(last_c)

    real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
    last_hidden = layers.concat(last_hidden_array, 0)
    last_cell = layers.concat(last_cell_array, 0)

    return real_res, last_hidden, last_cell
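# The step body of padding_rnn computes a standard LSTM cell by hand: one fused
# matmul produces the four gate pre-activations, which are sliced into i, j
# (candidate), f and o. A NumPy sketch of the per-step math for a single layer,
# ignoring dropout (illustration only; shapes are arbitrary):
import numpy as np

def lstm_step(x, pre_h, pre_c, w, b):
    """One step of the fused-gate LSTM used in padding_rnn."""
    gate_input = np.concatenate([x, pre_h], axis=1) @ w + b
    i, j, f, o = np.split(gate_input, 4, axis=1)      # same order as the slices above
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    c = pre_c * sigmoid(f) + sigmoid(i) * np.tanh(j)  # new cell state
    m = np.tanh(c) * sigmoid(o)                       # new hidden state
    return m, c

hidden_size, batch = 4, 2
w = np.random.randn(hidden_size * 2, hidden_size * 4).astype('float32')
b = np.zeros(hidden_size * 4, dtype='float32')
x = np.random.randn(batch, hidden_size).astype('float32')  # embedding dim == hidden_size here
h = np.zeros((batch, hidden_size), dtype='float32')
c = np.zeros((batch, hidden_size), dtype='float32')
h, c = lstm_step(x, h, c, w, b)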
def get_single_direction_output(rnn_input, unit_list, mask=None, direc_index=0):
    # NOTE: num_layers, init_hidden, init_cell, hidden_size and dropout_prob
    # come from the enclosing scope.
    rnn = StaticRNN()
    with rnn.step():
        step_input = rnn.step_input(rnn_input)

        if mask:
            step_mask = rnn.step_input(mask)

        for i in range(num_layers):
            if init_hidden:
                pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                pre_cell = rnn.memory(init=init_cell[i, direc_index])
            else:
                pre_hidden = rnn.memory(batch_ref=rnn_input, shape=[-1, hidden_size])
                pre_cell = rnn.memory(batch_ref=rnn_input, shape=[-1, hidden_size])

            new_hidden, new_cell = unit_list[i](step_input, pre_hidden, pre_cell)

            if mask:
                new_hidden = layers.elementwise_mul(
                    new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                        pre_hidden, (step_mask - 1), axis=0)
                new_cell = layers.elementwise_mul(
                    new_cell, step_mask, axis=0) - layers.elementwise_mul(
                        pre_cell, (step_mask - 1), axis=0)

            rnn.update_memory(pre_hidden, new_hidden)
            rnn.update_memory(pre_cell, new_cell)

            rnn.step_output(new_hidden)
            rnn.step_output(new_cell)

            step_input = new_hidden
            if dropout_prob is not None and dropout_prob > 0.0:
                step_input = layers.dropout(
                    step_input,
                    dropout_prob=dropout_prob,
                    dropout_implementation='upscale_in_train')

        rnn.step_output(step_input)

    rnn_out = rnn()

    last_hidden_array = []
    last_cell_array = []
    rnn_output = rnn_out[-1]
    for i in range(num_layers):
        last_hidden = rnn_out[i * 2]
        last_hidden = last_hidden[-1]
        last_hidden_array.append(last_hidden)
        last_cell = rnn_out[i * 2 + 1]
        last_cell = last_cell[-1]
        last_cell_array.append(last_cell)

    last_hidden_output = layers.concat(last_hidden_array, axis=0)
    last_hidden_output = layers.reshape(
        last_hidden_output, shape=[num_layers, -1, hidden_size])
    last_cell_output = layers.concat(last_cell_array, axis=0)
    last_cell_output = layers.reshape(
        last_cell_output, shape=[num_layers, -1, hidden_size])

    return rnn_output, last_hidden_output, last_cell_output
def _build_decoder(self, enc_last_hidden, enc_last_cell, mode='train', beam_size=10):
    dec_input = layers.transpose(self.tar_emb, [1, 0, 2])
    dec_unit_list = []
    for i in range(self.num_layers):
        new_name = "dec_layers_" + str(i)
        dec_unit_list.append(
            BasicLSTMUnit(
                new_name,
                self.hidden_size,
                ParamAttr(initializer=fluid.initializer.UniformInitializer(
                    low=-self.init_scale, high=self.init_scale)),
                ParamAttr(initializer=fluid.initializer.Constant(0.0))))

    attention_weight = layers.create_parameter(
        [self.hidden_size * 2, self.hidden_size],
        dtype="float32",
        name="attention_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-self.init_scale, high=self.init_scale))

    memory_weight = layers.create_parameter(
        [self.hidden_size, self.hidden_size],
        dtype="float32",
        name="memory_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-self.init_scale, high=self.init_scale))

    def dot_attention(query, memory, mask=None):
        attn = layers.matmul(query, memory, transpose_y=True)

        if mask:
            attn = layers.transpose(attn, [1, 0, 2])
            attn = layers.elementwise_add(attn, mask * 1000000000, -1)
            attn = layers.transpose(attn, [1, 0, 2])

        weight = layers.softmax(attn)
        weight_memory = layers.matmul(weight, memory)

        return weight_memory, weight

    max_src_seq_len = layers.shape(self.src)[1]
    src_mask = layers.sequence_mask(
        self.src_sequence_length, maxlen=max_src_seq_len, dtype='float32')

    softmax_weight = layers.create_parameter(
        [self.hidden_size, self.tar_vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-self.init_scale, high=self.init_scale))

    def decoder_step(current_in, pre_feed, pre_hidden_array, pre_cell_array,
                     enc_memory):
        new_hidden_array = []
        new_cell_array = []

        step_input = layers.concat([current_in, pre_feed], 1)

        for i in range(self.num_layers):
            pre_hidden = pre_hidden_array[i]
            pre_cell = pre_cell_array[i]

            new_hidden, new_cell = dec_unit_list[i](step_input, pre_hidden,
                                                    pre_cell)
            new_hidden_array.append(new_hidden)
            new_cell_array.append(new_cell)

            step_input = new_hidden

        memory_mask = src_mask - 1.0
        enc_memory = layers.matmul(enc_memory, memory_weight)
        att_in = layers.unsqueeze(step_input, [1])
        dec_att, _ = dot_attention(att_in, enc_memory)
        dec_att = layers.squeeze(dec_att, [1])
        concat_att_out = layers.concat([dec_att, step_input], 1)
        concat_att_out = layers.matmul(concat_att_out, attention_weight)

        return concat_att_out, new_hidden_array, new_cell_array

    if mode == "train":
        dec_rnn = StaticRNN()
        with dec_rnn.step():
            step_input = dec_rnn.step_input(dec_input)
            input_feed = dec_rnn.memory(
                batch_ref=dec_input, shape=[-1, self.hidden_size])
            step_input = layers.concat([step_input, input_feed], 1)

            for i in range(self.num_layers):
                pre_hidden = dec_rnn.memory(init=enc_last_hidden[i])
                pre_cell = dec_rnn.memory(init=enc_last_cell[i])
                new_hidden, new_cell = dec_unit_list[i](step_input, pre_hidden,
                                                        pre_cell)

                dec_rnn.update_memory(pre_hidden, new_hidden)
                dec_rnn.update_memory(pre_cell, new_cell)

                step_input = new_hidden

                if self.dropout is not None and self.dropout > 0.0:
                    print("using dropout", self.dropout)
                    step_input = fluid.layers.dropout(
                        step_input,
                        dropout_prob=self.dropout,
                        dropout_implementation='upscale_in_train')

            memory_mask = src_mask - 1.0
            enc_memory = layers.matmul(self.enc_output, memory_weight)
            att_in = layers.unsqueeze(step_input, [1])
            dec_att, _ = dot_attention(att_in, enc_memory, memory_mask)
            dec_att = layers.squeeze(dec_att, [1])
            concat_att_out = layers.concat([dec_att, step_input], 1)
            concat_att_out = layers.matmul(concat_att_out, attention_weight)
            #concat_att_out = layers.tanh( concat_att_out )

            dec_rnn.update_memory(input_feed, concat_att_out)
            dec_rnn.step_output(concat_att_out)

        dec_rnn_out = dec_rnn()
        dec_output = layers.transpose(dec_rnn_out, [1, 0, 2])
        dec_output = layers.matmul(dec_output, softmax_weight)

        return dec_output
    elif mode == 'beam_search':
        max_length = max_src_seq_len * 2
        #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
        pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
        full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)

        score = layers.fill_constant([1], dtype='float32', value=0.0)
        #eos_ids = layers.fill_constant( [1, 1], dtype='int64', value=2)

        pre_hidden_array = []
        pre_cell_array = []
        pre_feed = layers.fill_constant(
            [beam_size, self.hidden_size], dtype='float32', value=0)
        for i in range(self.num_layers):
            pre_hidden_array.append(
                layers.expand(enc_last_hidden[i], [beam_size, 1]))
            pre_cell_array.append(
                layers.expand(enc_last_cell[i], [beam_size, 1]))

        eos_ids = layers.fill_constant([beam_size], dtype='int64', value=2)
        init_score = np.zeros((beam_size)).astype('float32')
        init_score[1:] = -INF
        pre_score = layers.assign(init_score)
        #pre_score = layers.fill_constant( [1,], dtype='float32', value= 0.0)
        tokens = layers.fill_constant([beam_size, 1], dtype='int64', value=1)

        enc_memory = layers.expand(self.enc_output, [beam_size, 1, 1])

        pre_tokens = layers.fill_constant([beam_size, 1], dtype='int64', value=1)

        finished_seq = layers.fill_constant([beam_size, 1], dtype='int64', value=0)
        finished_scores = layers.fill_constant([beam_size], dtype='float32', value=-INF)
        finished_flag = layers.fill_constant([beam_size], dtype='float32', value=0.0)

        step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0)
        cond = layers.less_than(x=step_idx, y=max_length)  # default force_cpu=True

        parent_idx = layers.fill_constant([1], dtype='int32', value=0)
        while_op = layers.While(cond)

        def compute_topk_scores_and_seq(sequences, scores, scores_to_gather,
                                        flags, beam_size, select_beam=None,
                                        generate_id=None):
            scores = layers.reshape(scores, shape=[1, -1])
            _, topk_indexs = layers.topk(scores, k=beam_size)

            topk_indexs = layers.reshape(topk_indexs, shape=[-1])

            # gather result
            top_seq = layers.gather(sequences, topk_indexs)
            topk_flags = layers.gather(flags, topk_indexs)
            topk_gather_scores = layers.gather(scores_to_gather, topk_indexs)

            if select_beam:
                topk_beam = layers.gather(select_beam, topk_indexs)
            else:
                topk_beam = select_beam

            if generate_id:
                topk_id = layers.gather(generate_id, topk_indexs)
            else:
                topk_id = generate_id
            return top_seq, topk_gather_scores, topk_flags, topk_beam, topk_id

        def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished,
                       select_beam, generate_id):
            curr_scores += curr_finished * -INF
            return compute_topk_scores_and_seq(curr_seq, curr_scores,
                                               curr_log_probs, curr_finished,
                                               beam_size, select_beam,
                                               generate_id=generate_id)

        def grow_finished(finished_seq, finished_scores, finished_flag,
                          curr_seq, curr_scores, curr_finished):
            finished_seq = layers.concat([
                finished_seq,
                layers.fill_constant([beam_size, 1], dtype='int64', value=1)
            ], axis=1)
            curr_scores += (1.0 - curr_finished) * -INF
            #layers.Print( curr_scores, message="curr scores")
            curr_finished_seq = layers.concat([finished_seq, curr_seq], axis=0)
            curr_finished_scores = layers.concat(
                [finished_scores, curr_scores], axis=0)
            curr_finished_flags = layers.concat(
                [finished_flag, curr_finished], axis=0)

            return compute_topk_scores_and_seq(curr_finished_seq,
                                               curr_finished_scores,
                                               curr_finished_scores,
                                               curr_finished_flags, beam_size)
        def is_finished(alive_log_prob, finished_scores, finished_in_finished):
            max_out_len = 200
            max_length_penalty = layers.pow(
                layers.fill_constant(
                    [1], dtype='float32', value=((5.0 + max_out_len) / 6.0)),
                alpha)

            lower_bound_alive_score = layers.slice(
                alive_log_prob, starts=[0], ends=[1],
                axes=[0]) / max_length_penalty

            lowest_score_of_finished_in_finished = finished_scores * finished_in_finished
            lowest_score_of_finished_in_finished += (
                1.0 - finished_in_finished) * -INF
            lowest_score_of_finished_in_finished = layers.reduce_min(
                lowest_score_of_finished_in_finished)

            met = layers.less_than(lower_bound_alive_score,
                                   lowest_score_of_finished_in_finished)
            met = layers.cast(met, 'float32')
            bound_is_met = layers.reduce_sum(met)

            finished_eos_num = layers.reduce_sum(finished_in_finished)

            finish_cond = layers.less_than(
                finished_eos_num,
                layers.fill_constant([1], dtype='float32', value=beam_size))

            return finish_cond

        def grow_top_k(step_idx, alive_seq, alive_log_prob, parent_idx):
            pre_ids = alive_seq

            dec_step_emb = layers.embedding(
                input=pre_ids,
                size=[self.tar_vocab_size, self.hidden_size],
                dtype='float32',
                is_sparse=False,
                param_attr=fluid.ParamAttr(
                    name='target_embedding',
                    initializer=fluid.initializer.UniformInitializer(
                        low=-self.init_scale, high=self.init_scale)))

            dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                dec_step_emb, pre_feed, pre_hidden_array, pre_cell_array,
                enc_memory)

            projection = layers.matmul(dec_att_out, softmax_weight)

            logits = layers.softmax(projection)
            current_log = layers.elementwise_add(
                x=layers.log(logits), y=alive_log_prob, axis=0)
            base_1 = layers.cast(step_idx, 'float32') + 6.0
            base_1 /= 6.0
            length_penalty = layers.pow(base_1, alpha)

            len_pen = layers.pow(
                ((5. + layers.cast(step_idx + 1, 'float32')) / 6.), alpha)

            current_log = layers.reshape(current_log, shape=[1, -1])
            current_log = current_log / length_penalty
            topk_scores, topk_indices = layers.topk(input=current_log, k=beam_size)

            topk_scores = layers.reshape(topk_scores, shape=[-1])
            topk_log_probs = topk_scores * length_penalty

            generate_id = layers.reshape(
                topk_indices, shape=[-1]) % self.tar_vocab_size
            selected_beam = layers.reshape(
                topk_indices, shape=[-1]) // self.tar_vocab_size

            topk_finished = layers.equal(generate_id, eos_ids)
            topk_finished = layers.cast(topk_finished, 'float32')

            generate_id = layers.reshape(generate_id, shape=[-1, 1])

            pre_tokens_list = layers.gather(tokens, selected_beam)
            full_tokens_list = layers.concat(
                [pre_tokens_list, generate_id], axis=1)

            return full_tokens_list, topk_log_probs, topk_scores, topk_finished, \
                   selected_beam, generate_id, dec_att_out, new_hidden_array, \
                   new_cell_array

        with while_op.block():
            topk_seq, topk_log_probs, topk_scores, topk_finished, topk_beam, \
                topk_generate_id, attention_out, new_hidden_array, new_cell_array = \
                grow_top_k(step_idx, pre_tokens, pre_score, parent_idx)

            alive_seq, alive_log_prob, _, alive_beam, alive_id = grow_alive(
                topk_seq, topk_scores, topk_log_probs, topk_finished,
                topk_beam, topk_generate_id)

            finished_seq_2, finished_scores_2, finished_flags_2, _, _ = grow_finished(
                finished_seq, finished_scores, finished_flag, topk_seq,
                topk_scores, topk_finished)

            finished_cond = is_finished(alive_log_prob, finished_scores_2,
                                        finished_flags_2)

            layers.increment(x=step_idx, value=1.0, in_place=True)

            layers.assign(alive_beam, parent_idx)
            layers.assign(alive_id, pre_tokens)
            layers.assign(alive_log_prob, pre_score)
            layers.assign(alive_seq, tokens)

            layers.assign(finished_seq_2, finished_seq)
            layers.assign(finished_scores_2, finished_scores)
            layers.assign(finished_flags_2, finished_flag)

            # update init_hidden, init_cell, input_feed
            new_feed = layers.gather(attention_out, parent_idx)
            layers.assign(new_feed, pre_feed)
            for i in range(self.num_layers):
                new_hidden_var = layers.gather(new_hidden_array[i], parent_idx)
                layers.assign(new_hidden_var, pre_hidden_array[i])
                new_cell_var = layers.gather(new_cell_array[i], parent_idx)
                layers.assign(new_cell_var, pre_cell_array[i])

            length_cond = layers.less_than(x=step_idx, y=max_length)
            layers.logical_and(x=length_cond, y=finished_cond, out=cond)

        tokens_with_eos = tokens

        all_seq = layers.concat([tokens_with_eos, finished_seq], axis=0)
        all_score = layers.concat([pre_score, finished_scores], axis=0)
        _, topk_index = layers.topk(all_score, k=beam_size)
        topk_index = layers.reshape(topk_index, shape=[-1])
        final_seq = layers.gather(all_seq, topk_index)
        final_score = layers.gather(all_score, topk_index)

        return final_seq
    elif mode == 'greedy_search':
        max_length = max_src_seq_len * 2
        #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
        pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
        full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)

        score = layers.fill_constant([1], dtype='float32', value=0.0)

        eos_ids = layers.fill_constant([1, 1], dtype='int64', value=2)

        pre_hidden_array = []
        pre_cell_array = []
        pre_feed = layers.fill_constant(
            [1, self.hidden_size], dtype='float32', value=0)
        for i in range(self.num_layers):
            pre_hidden_array.append(enc_last_hidden[i])
            pre_cell_array.append(enc_last_cell[i])
            #pre_hidden_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0) )
            #pre_cell_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0) )

        step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0)
        cond = layers.less_than(x=step_idx, y=max_length)  # default force_cpu=True
        while_op = layers.While(cond)

        with while_op.block():
            dec_step_emb = layers.embedding(
                input=pre_ids,
                size=[self.tar_vocab_size, self.hidden_size],
                dtype='float32',
                is_sparse=False,
                param_attr=fluid.ParamAttr(
                    name='target_embedding',
                    initializer=fluid.initializer.UniformInitializer(
                        low=-self.init_scale, high=self.init_scale)))

            dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                dec_step_emb, pre_feed, pre_hidden_array, pre_cell_array,
                self.enc_output)

            projection = layers.matmul(dec_att_out, softmax_weight)
            logits = layers.softmax(projection)
            logits = layers.log(logits)

            current_log = layers.elementwise_add(logits, score, axis=0)

            topk_score, topk_indices = layers.topk(input=current_log, k=1)

            new_ids = layers.concat([full_ids, topk_indices])
            layers.assign(new_ids, full_ids)
            #layers.Print( full_ids, message="ful ids")

            layers.assign(topk_score, score)
            layers.assign(topk_indices, pre_ids)
            layers.assign(dec_att_out, pre_feed)
            for i in range(self.num_layers):
                layers.assign(new_hidden_array[i], pre_hidden_array[i])
                layers.assign(new_cell_array[i], pre_cell_array[i])

            layers.increment(x=step_idx, value=1.0, in_place=True)

            eos_met = layers.not_equal(topk_indices, eos_ids)
            length_cond = layers.less_than(x=step_idx, y=max_length)
            layers.logical_and(x=length_cond, y=eos_met, out=cond)

        return full_ids
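# In grow_top_k above, accumulated log-probabilities are divided by the length
# penalty ((5 + step + 1) / 6) ** alpha before the top-k selection and multiplied
# back afterwards, so beams of different lengths compete on a normalized score
# (the GNMT-style length penalty). A small illustration with an assumed alpha of
# 0.6 (the real alpha comes from the enclosing scope):
alpha = 0.6  # assumed value for illustration

def length_penalty(step):
    # same formula as in grow_top_k
    return ((5.0 + step + 1.0) / 6.0) ** alpha

log_prob_sum = -4.0  # same raw log-prob sum for a short and a long hypothesis
print(log_prob_sum / length_penalty(1))  # ~ -3.65 at step 1
print(log_prob_sum / length_penalty(5))  # ~ -2.78 at step 5: longer beams are penalized less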
def convlstm2d_rnn(rnn_input, init_hidden, init_cell, padding, hidden_h, hidden_w,
                   filters, filter_size, drop_out=None, sequence_length=None,
                   name='conv_lstm_2d'):
    # transpose : (sequence x batch x C x H x W)
    rnn_input = layers.transpose(rnn_input, [1, 0, 4, 2, 3])

    # generate mask
    mask = None
    if sequence_length:
        max_seq_len = layers.shape(rnn_input)[0]
        mask = layers.sequence_mask(sequence_length, maxlen=max_seq_len, dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    # init
    conv_lstm_2d = ConvLSTM2D_unit(filters, filter_size, padding)

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(rnn_input)
        if mask:
            step_mask = rnn.step_input(mask)

        if init_hidden and init_cell:
            pre_hidden = rnn.memory(init=init_hidden)
            pre_cell = rnn.memory(init=init_cell)
        else:
            pre_hidden = rnn.memory(batch_ref=rnn_input,
                                    shape=[-1, filters, hidden_h, hidden_w])
            pre_cell = rnn.memory(batch_ref=rnn_input,
                                  shape=[-1, filters, hidden_h, hidden_w])

        real_out, last_hidden, last_cell = conv_lstm_2d(step_in, pre_hidden, pre_cell)

        if mask:
            last_hidden = dot(last_hidden, step_mask, axis=0) - dot(
                pre_hidden, (step_mask - 1), axis=0)
            last_cell = dot(last_cell, step_mask, axis=0) - dot(
                pre_cell, (step_mask - 1), axis=0)

        rnn.update_memory(pre_hidden, last_hidden)
        rnn.update_memory(pre_cell, last_cell)

        rnn.step_output(last_hidden)
        rnn.step_output(last_cell)

        step_input = last_hidden
        if drop_out is not None and drop_out > 0.0:
            step_input = layers.dropout(
                step_input,
                dropout_prob=drop_out,
                dropout_implementation='upscale_in_train')

    rnn_res = rnn()
    rnn_out = rnn_res[0]
    last_hidden = layers.slice(rnn_res[1], axes=[0], starts=[-1], ends=[1000000000])

    rnn_out = layers.transpose(rnn_out, [1, 0, 3, 4, 2])
    last_hidden = layers.transpose(last_hidden, [1, 0, 3, 4, 2])
    # print('rnn_out ', rnn_out.shape)
    # print('last_hidden ', last_hidden.shape)

    return rnn_out, last_hidden
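# Usage sketch (not part of the original snippet): calling convlstm2d_rnn on a
# batch-major video-like tensor. A ConvLSTM2D_unit implementation and the dot
# helper (an elementwise-mul alias) are assumed to exist in the surrounding code;
# all shapes and the padding value below are illustrative.
import paddle.fluid as fluid

# Input clips: [batch, seq_len, H, W, C].
clips = fluid.data(name='clips', shape=[None, 10, 32, 32, 3], dtype='float32')
clip_len = fluid.data(name='clip_len', shape=[None], dtype='int64')

# 16 filters of size 3; padding=1 assumes the unit keeps the 32x32 spatial size.
out, last_h = convlstm2d_rnn(clips, init_hidden=None, init_cell=None, padding=1,
                             hidden_h=32, hidden_w=32, filters=16, filter_size=3,
                             sequence_length=clip_len)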