def grow_top_k(step_idx, alive_seq, alive_log_prob, parant_idx):
    """Run one decoder step and expand each alive beam into its top-k children.

    Args:
        step_idx: current decode step (0-based scalar tensor).
        alive_seq: token ids chosen so far for the alive beams.
        alive_log_prob: accumulated log-probabilities of the alive beams.
        parant_idx: unused here; name ("parent_idx" typo) kept so keyword
            callers keep working.

    Returns:
        (full token sequences, raw accumulated log probs, length-penalized
        scores, finished flags, selected parent-beam indices, generated ids,
        attention output, new hidden states, new cell states).

    NOTE(review): relies on enclosing-scope names (decoder_step, pre_feed,
    softmax_weight, eos_ids, tokens, beam_size, alpha, ...) — closure function.
    """
    pre_ids = alive_seq
    dec_step_emb = layers.embedding(
        input=pre_ids,
        size=[self.tar_vocab_size, self.hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='target_embedding',
            initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale)))
    dec_att_out, new_hidden_array, new_cell_array = decoder_step(
        dec_step_emb, pre_feed, pre_hidden_array, pre_cell_array, enc_memory)
    projection = layers.matmul(dec_att_out, softmax_weight)
    logits = layers.softmax(projection)
    current_log = layers.elementwise_add(
        x=layers.log(logits), y=alive_log_prob, axis=0)
    # GNMT length penalty ((5 + (step+1)) / 6) ** alpha, written as
    # (step + 6) / 6. A second, unused computation of the same quantity
    # (`len_pen`) was removed as dead code.
    base_1 = layers.cast(step_idx, 'float32') + 6.0
    base_1 /= 6.0
    length_penalty = layers.pow(base_1, alpha)
    current_log = layers.reshape(current_log, shape=[1, -1])
    current_log = current_log / length_penalty
    topk_scores, topk_indices = layers.topk(input=current_log, k=beam_size)
    topk_scores = layers.reshape(topk_scores, shape=[-1])
    # Undo the penalty so raw accumulated log probs are carried forward.
    topk_log_probs = topk_scores * length_penalty
    generate_id = layers.reshape(
        topk_indices, shape=[-1]) % self.tar_vocab_size
    selected_beam = layers.reshape(
        topk_indices, shape=[-1]) // self.tar_vocab_size
    topk_finished = layers.equal(generate_id, eos_ids)
    topk_finished = layers.cast(topk_finished, 'float32')
    generate_id = layers.reshape(generate_id, shape=[-1, 1])
    pre_tokens_list = layers.gather(tokens, selected_beam)
    full_tokens_list = layers.concat(
        [pre_tokens_list, generate_id], axis=1)
    return full_tokens_list, topk_log_probs, topk_scores, topk_finished, \
        selected_beam, generate_id, dec_att_out, new_hidden_array, \
        new_cell_array
def grow_topk(i, logits, alive_seq, alive_log_probs, states):
    """Expand each alive beam into the top 2*beam_size candidates.

    Args:
        i: current step index (Python int in this variant).
        logits: decoder output, reshaped to [batch_size, beam_size, vocab].
        alive_seq: [batch_size, beam_size, i+1] token ids so far.
        alive_log_probs: accumulated log probs per alive beam.
        states: decoder cell states, re-gathered per selected parent.

    Returns:
        (topk_seq [batch, 2*beam, i+2], topk_log_probs, topk_scores,
        topk_finished, states).
    """
    logits = layers.reshape(logits, [batch_size, beam_size, -1])
    candidate_log_probs = layers.log(layers.softmax(logits, axis=2))
    log_probs = layers.elementwise_add(candidate_log_probs, alive_log_probs, 0)
    # FIX: GNMT length penalty is ((5 + len) / 6) ** alpha. The previous
    # code computed 5.0 + (i + 1.0) / 6.0 (missing parentheses), which is
    # inconsistent with every other penalty computation in this file.
    length_penalty = np.power((5.0 + (i + 1.0)) / 6.0, alpha)
    curr_scores = log_probs / length_penalty
    flat_curr_scores = layers.reshape(curr_scores, [batch_size, -1])
    topk_scores, topk_ids = layers.topk(flat_curr_scores, k=beam_size * 2)
    # Undo the penalty so raw log probs propagate to the next step.
    topk_log_probs = topk_scores * length_penalty
    topk_beam_index = topk_ids // self.trg_vocab_size
    topk_ids = topk_ids % self.trg_vocab_size
    # use gather as gather_nd, TODO: use gather_nd
    topk_seq = gather_2d_by_gather(alive_seq, topk_beam_index,
                                   beam_size, batch_size)
    topk_seq = layers.concat(
        [topk_seq, layers.reshape(topk_ids, topk_ids.shape + [1])], axis=2)
    states = update_states(states, topk_beam_index, beam_size)
    eos = layers.fill_constant(shape=topk_ids.shape, dtype="int64",
                               value=eos_id)
    topk_finished = layers.cast(layers.equal(topk_ids, eos), "float32")
    # topk_seq: [batch_size, 2*beam_size, i+1]
    # topk_log_probs, topk_scores, topk_finished: [batch_size, 2*beam_size]
    return topk_seq, topk_log_probs, topk_scores, topk_finished, states
def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags,
                                beam_size, select_beam=None, generate_id=None):
    """Select the `beam_size` best rows by `scores` and gather aligned data.

    Args:
        sequences: candidate token sequences, one row per candidate.
        scores: selection scores (flattened to [1, -1] for topk).
        scores_to_gather: per-candidate scores to carry along.
        flags: per-candidate finished flags.
        beam_size: number of candidates to keep.
        select_beam: optional parent-beam indices to gather; `None` to skip.
        generate_id: optional generated token ids to gather; `None` to skip.

    Returns:
        (top sequences, gathered scores, gathered flags, gathered beam
        indices or None, gathered ids or None).
    """
    scores = layers.reshape(scores, shape=[1, -1])
    _, topk_indexs = layers.topk(scores, k=beam_size)
    topk_indexs = layers.reshape(topk_indexs, shape=[-1])
    # gather result
    top_seq = layers.gather(sequences, topk_indexs)
    topk_flags = layers.gather(flags, topk_indexs)
    topk_gather_scores = layers.gather(scores_to_gather, topk_indexs)
    # FIX: `None` is the absent-value sentinel here, so test identity with
    # `is not None` — truth-testing a graph Variable (`if select_beam:`) is
    # not a reliable None-check.
    if select_beam is not None:
        topk_beam = layers.gather(select_beam, topk_indexs)
    else:
        topk_beam = select_beam
    if generate_id is not None:
        topk_id = layers.gather(generate_id, topk_indexs)
    else:
        topk_id = generate_id
    return top_seq, topk_gather_scores, topk_flags, topk_beam, topk_id
def grow_finished(finished_seq, finished_scores, finished_flags, curr_seq,
                  curr_scores, curr_finished):
    """Merge newly finished candidates into the finished pool and keep the
    best `beam_size` of the combined (beam_size + 2*beam_size) candidates.
    Uses enclosing-scope `batch_size`, `beam_size`, `eos_id` and `inf`.
    """
    # Pad the existing finished sequences by one EOS so their length matches
    # the current-step candidates before concatenation.
    eos_pad = layers.fill_constant(
        shape=[batch_size, beam_size, 1], dtype="int64", value=eos_id)
    finished_seq = layers.concat([finished_seq, eos_pad], axis=2)
    # Unfinished current candidates must never win: drive their score to -inf.
    curr_scores += (1. - curr_finished) * -inf
    # Pool old finished and current candidates along the beam axis.
    merged_seq = layers.concat([finished_seq, curr_seq], axis=1)
    merged_scores = layers.concat([finished_scores, curr_scores], axis=1)
    merged_flags = layers.concat([finished_flags, curr_finished], axis=1)
    # Keep the best beam_size of the 3*beam_size pooled candidates.
    _, best_idx = layers.topk(merged_scores, k=beam_size)
    finished_seq = gather_2d_by_gather(merged_seq, best_idx,
                                       beam_size * 3, batch_size)
    finished_scores = gather_2d_by_gather(merged_scores, best_idx,
                                          beam_size * 3, batch_size)
    finished_flags = gather_2d_by_gather(merged_flags, best_idx,
                                         beam_size * 3, batch_size)
    return finished_seq, finished_scores, finished_flags
def test_topk(self):
    """layers.topk builds values/indices outputs inside a program."""
    prog = Program()
    with program_guard(prog):
        # 200-d float input; request the 5 largest entries per row.
        inp = layers.data(name="label", shape=[200], dtype="float32")
        top_vals, top_idx = layers.topk(inp, k=5)
        self.assertIsNotNone(top_vals)
        self.assertIsNotNone(top_idx)
        print(str(prog))
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """One beam-search step over a [B*W, V] logits tensor.

    Args:
        state: BeamSearchState(log_probs, lengths, finished), each [B, W].
        logits: decoder logits, shape [B*W, V].
        eos_id: end-of-sequence token id.
        beam_width: beam count W (re-derived from state.log_probs).
        is_first_step: if True, only beam 0 competes (all beams start equal).
        length_penalty: exponent passed to hyp_score.

    Returns:
        (BeamSearchOutput(scores, predicted_ids, beam_parent_ids),
         next BeamSearchState).
    """
    _, vocab_size = logits.shape
    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(
        F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64')  #[1, V]
    probs = L.log(L.softmax(logits))  #[B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  #[B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  #[B*W, V]
    # Finished beams stop growing; EOS itself does not add to length.
    not_finished = 1 - L.reshape(state.finished, [-1, 1])  #[B*W,1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  #[B*W,V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add
    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        allscore = L.reshape(
            allscore,
            [bsz, beam_width, -1])[:, 0, :]  # first step only consider beam 0
    scores, idx = L.topk(allscore, k=beam_width)  #[B, W]
    next_beam_id = idx // vocab_size  #[B, W]
    next_word_id = idx % vocab_size
    gather_idx = L.concat(
        [L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)
    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    # Gather new beam state according to the new parent-beam ids.
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx), state.finished.shape)
    # A beam stays finished once it emits EOS.
    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')
    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)
    return output, next_state
def body_func(step_idx, pre_ids, pre_scores, gather_idx, caches,
              trg_src_attn_bias):
    """While-loop body for beam-search decoding: gather parent states, run
    one decoder step, do intra-beam then cross-beam topk, and record the
    selected ids/scores for this step."""
    # Re-align cell states and source attention bias with the parents that
    # survived the previous beam_search step.
    gathered_caches = map_structure(
        lambda x: layers.gather(x, index=gather_idx), caches)
    gathered_src_bias = layers.gather(trg_src_attn_bias, index=gather_idx)
    # Position ids for this step: a column of step_idx per batch row.
    pre_pos = layers.elementwise_mul(
        x=layers.fill_constant_batch_size_like(
            input=gathered_src_bias,  # cann't use lod tensor here
            value=1,
            shape=[-1, 1],
            dtype=pre_ids.dtype),
        y=step_idx,
        axis=0)
    logits = wrap_decoder((pre_ids, pre_pos, None, gathered_src_bias),
                          trg_vocab_size, max_in_len, n_layer, n_head,
                          d_key, d_value, d_model, d_inner_hid,
                          prepostprocess_dropout, attention_dropout,
                          relu_dropout, preprocess_cmd, postprocess_cmd,
                          weight_sharing,
                          enc_output=enc_output,
                          caches=gathered_caches,
                          bos_idx=bos_idx)
    # intra-beam topK
    topk_scores, topk_indices = layers.topk(
        input=layers.softmax(logits), k=beam_size)
    accu_scores = layers.elementwise_add(
        x=layers.log(topk_scores), y=pre_scores, axis=0)
    # beam_search op uses lod to differentiate branches.
    accu_scores = layers.lod_reset(accu_scores, pre_ids)
    # topK reduction across beams, also contain special handle of
    # end beams and end sentences(batch reduction)
    selected_ids, selected_scores, gather_idx = layers.beam_search(
        pre_ids=pre_ids,
        pre_scores=pre_scores,
        ids=topk_indices,
        scores=accu_scores,
        beam_size=beam_size,
        end_id=eos_idx,
        return_parent_idx=True)
    step_idx = layers.increment(x=step_idx, value=1.0, in_place=False)
    # Persist this step's selections into the outer tensor arrays.
    layers.array_write(selected_ids, i=step_idx, array=ids)
    layers.array_write(selected_scores, i=step_idx, array=scores)
    return (step_idx, selected_ids, selected_scores, gather_idx,
            gathered_caches, gathered_src_bias)
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """One dygraph beam-search step over [B*W, V] logits, with [UNK]
    suppression for this hub module (batch size is fixed to 1, so the first
    logits dim equals beam_size).

    Returns:
        (BeamSearchOutput(scores, predicted_ids, beam_parent_ids),
         next BeamSearchState).
    """
    beam_size, vocab_size = logits.shape
    # FIX: the intent (per the original comment) is to make the [UNK]
    # probability ~0, but the old code set the *logit* to 0, which still
    # leaves [UNK] with positive softmax mass. Use a large negative logit.
    UNK_ID = 17963  # [UNK] id in this module's vocab — TODO confirm
    logits_np = logits.numpy()
    for i in range(beam_size):
        logits_np[i][UNK_ID] = -1e9  # suppress [UNK]
    logits = D.to_variable(logits_np)
    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(
        F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64')  #[1, V]
    probs = L.log(L.softmax(logits))  #[B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  #[B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  #[B*W, V]
    # Finished beams stop growing; EOS itself does not add to length.
    not_finished = 1 - L.reshape(state.finished, [-1, 1])  #[B*W,1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  #[B*W,V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add
    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        allscore = L.reshape(
            allscore,
            [bsz, beam_width, -1])[:, 0, :]  # first step only consider beam 0
    scores, idx = L.topk(allscore, k=beam_width)  #[B, W]
    next_beam_id = idx // vocab_size  #[B, W]
    next_word_id = idx % vocab_size
    gather_idx = L.concat(
        [L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)
    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    # Gather new beam state according to the new parent-beam ids.
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx), state.finished.shape)
    # A beam stays finished once it emits EOS.
    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')
    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)
    return output, next_state
def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished, states):
    """Keep the best `beam_size` *unfinished* candidates out of the
    2*beam_size grown ones. Uses enclosing-scope `beam_size`, `batch_size`,
    `inf`, `gather_2d_by_gather` and `update_states`."""
    # Finished candidates must not be selected as alive: sink their scores.
    curr_scores += curr_finished * -inf
    _, live_idx = layers.topk(curr_scores, k=beam_size)
    alive_seq = gather_2d_by_gather(curr_seq, live_idx,
                                    beam_size * 2, batch_size)
    alive_log_probs = gather_2d_by_gather(curr_log_probs, live_idx,
                                          beam_size * 2, batch_size)
    states = update_states(states, live_idx, beam_size * 2)
    return alive_seq, alive_log_probs, states
def _sampling(self, logits):
    """Implement top-k sampling: keep the top_k_num most likely tokens,
    renormalize, and draw one token id per row."""
    dist = layers.softmax(logits, axis=1)
    top_probs, top_ids = layers.topk(dist, self.top_k_num)
    # Renormalize so the truncated distribution sums to 1 per row.
    top_probs = top_probs / layers.reduce_sum(top_probs, dim=1, keep_dim=True)
    picks = [
        np.random.choice(ids, p=p)
        for p, ids in zip(top_probs.numpy(), top_ids.numpy())
    ]
    return fluid.dygraph.to_variable(np.array(picks, dtype="int64"))
def __call__(self, inputs, labels=None, mode=None):
    """Encode the input, then decode with attention.

    In "train" mode returns {'predict', 'decoded_out'} from teacher-forced
    decoding; otherwise returns {'decoded_out'} from greedy inference.
    """
    encoder_features = self.encoder(inputs)
    # "reshape" encoders expose a single feature map; others expose a list
    # whose concatenation feeds the decoder.
    if self.encoder_type == "reshape":
        encoder_input = encoder_features
        encoded_vector = encoder_features
    else:
        encoder_input = encoder_features[1]
        encoded_vector = layers.concat(encoder_features, axis=1)
    encoded_proj = layers.fc(input=encoded_vector,
                             size=self.decoder_size,
                             bias_attr=False,
                             name="encoded_proj_fc")
    # Bootstrap the decoder state from the first encoder timestep.
    backward_first = layers.sequence_pool(input=encoder_input,
                                          pool_type='first')
    decoder_boot = layers.fc(input=backward_first,
                             size=self.decoder_size,
                             bias_attr=False,
                             act="relu",
                             name='decoder_boot')
    if mode == "train":
        label_in = layers.cast(x=labels['label_in'], dtype='int64')
        label_out = labels['label_out']
        trg_embedding = layers.embedding(
            input=label_in,
            size=[self.char_num, self.word_vector_dim],
            dtype='float32')
        predict = self.gru_decoder_with_attention(
            trg_embedding, encoded_vector, encoded_proj, decoder_boot,
            self.decoder_size, self.char_num)
        # argmax via top-1; reuse the label's lod for the decoded sequence.
        _, decoded_out = layers.topk(input=predict, k=1)
        decoded_out = layers.lod_reset(decoded_out, y=label_out)
        return {'predict': predict, 'decoded_out': decoded_out}
    ids = self.gru_attention_infer(
        decoder_boot, self.max_length, self.char_num, self.word_vector_dim,
        encoded_vector, encoded_proj, self.decoder_size)
    return {'decoded_out': ids}
def grow_topk(self, i, logits, alive_seq, alive_log_probs, cache, enc_output,
              enc_bias):
    """Expand each alive beam into the top 2*beam_size candidates and
    re-gather the decoder cache accordingly.

    Returns:
        (topk_seq [batch, 2*beam, i+2], topk_log_probs, topk_scores,
        topk_finished, cache).
    """
    logits = layers.reshape(logits, [self.batch_size, self.beam_size, -1])
    candidate_log_probs = layers.log(layers.softmax(logits, axis=2))
    log_probs = candidate_log_probs + layers.unsqueeze(alive_log_probs,
                                                       axes=[2])
    # GNMT length penalty ((5 + (i+1)) / 6) ** alpha, via (i + 6) / 6.
    base_1 = layers.cast(i, 'float32') + 6.0
    base_1 /= 6.0
    length_penalty = layers.pow(base_1, self.alpha)
    curr_scores = log_probs / length_penalty
    flat_curr_scores = layers.reshape(
        curr_scores, [self.batch_size, self.beam_size * self.vocab_size])
    topk_scores, topk_ids = layers.topk(flat_curr_scores,
                                        k=self.beam_size * 2)
    # Undo the penalty so raw log probs propagate to the next step.
    topk_log_probs = topk_scores * length_penalty
    select_beam_index = topk_ids // self.vocab_size
    select_id = topk_ids % self.vocab_size
    # Offset beam indices into the flattened [batch*beam] layout.
    flat_select_beam_index = layers.reshape(
        select_beam_index, [-1]) + self.gather_top2k_append_index
    # NOTE(review): the index is wrapped in a list here; looks like it should
    # be passed directly as the index tensor — kept as-is, confirm intent.
    topk_seq = layers.gather(alive_seq, [flat_select_beam_index])
    topk_seq = layers.reshape(topk_seq,
                              [self.batch_size, 2 * self.beam_size, -1])
    # Append the newly generated ids to the gathered prefixes.
    topk_seq = layers.concat(
        [topk_seq, layers.unsqueeze(select_id, axes=[2])], axis=2)
    topk_finished = layers.cast(layers.equal(select_id, self.eos_id),
                                'float32')
    # Re-align the incremental decoder cache with the selected parents.
    self.gather_cache(cache, flat_select_beam_index)
    # topk_seq: [batch_size, 2*beam_size, i+1]
    # topk_log_probs, topk_scores, topk_finished: [batch_size, 2*beam_size]
    return topk_seq, topk_log_probs, topk_scores, topk_finished, cache
def eval():
    """Evaluate OCR attention model: greedy-decode each batch and print the
    fraction of sequences that exactly match the reference.

    Uses enclosing-scope `ocr_attention`, `test_reader`, `args`, etc.
    (Removed an unused `total_loss` accumulator and the unused topk scores.)
    """
    ocr_attention.eval()
    total_step = 0.0
    equal_size = 0
    for data in test_reader():
        data_dict = get_attention_feeder_data(data)
        label_in = to_variable(data_dict["label_in"])
        label_out = to_variable(data_dict["label_out"])
        label_out._stop_gradient = True
        label_out.trainable = False
        img = to_variable(data_dict["pixel"])
        prediction = ocr_attention(img, label_in)
        prediction = fluid.layers.reshape(
            prediction, [label_out.shape[0] * label_out.shape[1], -1],
            inplace=False)
        # Greedy decode: top-1 index per timestep.
        _, topk = layers.topk(prediction, 1)
        seq = topk.numpy().reshape((args.batch_size, -1))
        mask = data_dict['mask'].reshape((args.batch_size, -1))
        seq_len = np.sum(mask, -1)
        trans_ref = data_dict["label_out"].reshape((args.batch_size, -1))
        for i in range(args.batch_size):
            length = int(seq_len[i] - 1)
            # NOTE(review): slicing to length-1 on top of the length-1 above
            # drops two trailing positions — presumably EOS handling; confirm.
            trans = seq[i][:length - 1]
            ref = trans_ref[i][:length - 1]
            if np.array_equal(trans, ref):
                equal_size += 1
        total_step += args.batch_size
    print("eval cost", equal_size / total_step)
def decoder(self, init_state):
    """Implement the decoder in inference mode.

    Builds a static-graph while-loop that repeatedly reads the previous
    step's ids/states, runs attention + the copy decoder, takes the per-step
    top-k ids, and appends results to tensor arrays until `max_length` steps
    or an empty selection.

    Returns:
        (translation_ids, translation_scores): step-stacked tensors from the
        id and score arrays.

    NOTE: a `layers.beam_search`-based accumulation path existed here but was
    deliberately disabled in favor of plain per-step top-k; the dead
    commented-out code was removed.
    """
    # Loop bound and counters. `static_count` stays 0 and indexes the
    # write-once arrays holding the encoder outputs.
    array_len = pd.fill_constant(shape=[1], dtype='int64',
                                 value=self.max_length)
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
    static_count = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
    # Tensor arrays carrying per-step state; slot 0 holds the initial values.
    state_h_array = pd.create_array('float32')
    pd.array_write(self.h, array=state_h_array, i=counter)
    state_c_array = pd.create_array('float32')
    pd.array_write(self.c, array=state_c_array, i=counter)
    src_indexes = fluid.layers.data(name='source_index', shape=[1],
                                    dtype='int64', lod_level=1)
    src_index_array = pd.create_array('int64')
    pd.array_write(src_indexes, array=src_index_array, i=counter)
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')
    init_ids = fluid.layers.data(name="init_ids", shape=[1],
                                 dtype="int64", lod_level=2)
    init_scores = fluid.layers.data(name="init_scores", shape=[1],
                                    dtype="float32", lod_level=2)
    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)
    # Encoder outputs are loop-invariant; stored once at index 0.
    encoder_vec_array = pd.create_array('float32')
    pd.array_write(self.encoder_vec, array=encoder_vec_array, i=static_count)
    encoder_vec_full_array = pd.create_array('float32')
    pd.array_write(self.encoder_vec_full, array=encoder_vec_full_array,
                   i=static_count)
    encoder_proj_array = pd.create_array('float32')
    pd.array_write(self.encoder_proj, array=encoder_proj_array,
                   i=static_count)
    event_embedding_array = pd.create_array('float32')
    pd.array_write(self.event_embedding, array=event_embedding_array,
                   i=static_count)
    # Loop condition and while op.
    cond = pd.less_than(x=counter, y=array_len)
    while_op = pd.While(cond=cond)
    with while_op.block():
        # Read previous step's ids/states and the static encoder tensors.
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_h_state = pd.array_read(array=state_h_array, i=counter)
        pre_c_state = pd.array_read(array=state_c_array, i=counter)
        # Scores are read from slot 0 (initial scores), not per-step.
        pre_score = pd.array_read(array=scores_array, i=static_count)
        _encoder_input_ids = pd.array_read(array=src_index_array,
                                           i=static_count)
        event_embedding = pd.array_read(array=event_embedding_array,
                                        i=static_count)
        encoder_vec = pd.array_read(array=encoder_vec_array, i=static_count)
        encoder_vec_full = pd.array_read(array=encoder_vec_full_array,
                                         i=static_count)
        encoder_proj = pd.array_read(array=encoder_proj_array,
                                     i=static_count)
        # Embed previous ids and compute attention context, as in training.
        pre_ids_emb = pd.embedding(
            input=pre_ids,
            size=[self.target_dict_dim, self.embedding_dim],
            dtype='float32',
            param_attr=fluid.ParamAttr(name="trg_embedding"))
        att_context = self.simple_attention(encoder_vec, encoder_proj,
                                            pre_h_state)
        prob_c = fluid.layers.sequence_expand_as(pre_score, encoder_vec)
        current_score, current_h, current_c, this_prob_c = self.copy_decoder(
            pre_ids_emb, encoder_vec, encoder_vec_full, encoder_proj,
            _encoder_input_ids, pre_ids, prob_c, att_context, pre_h_state,
            pre_c_state, event_embedding)
        # Per-step top-k; the beam_search accumulation path is disabled.
        topk_scores, topk_indices = pd.topk(current_score, k=self.beam_size)
        selected_ids, selected_scores = topk_indices, topk_scores
        pd.increment(x=counter, value=1, in_place=True)
        # Persist this step's states and selections.
        pd.array_write(current_h, array=state_h_array, i=counter)
        pd.array_write(current_c, array=state_c_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)
        # Continue while under max length and selections are non-empty.
        length_cond = pd.less_than(x=counter, y=array_len)
        finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
        pd.logical_and(x=length_cond, y=finish_cond, out=cond)
    translation_ids, translation_ids_index = pd.tensor_array_to_tensor(
        ids_array, axis=1)
    translation_scores, translation_scores_index = pd.tensor_array_to_tensor(
        scores_array, axis=1)
    return translation_ids, translation_scores
def beam_search():
    """Static-graph Transformer beam search.

    Maintains id/score tensor arrays plus per-layer k/v caches, and loops the
    decoder with `layers.beam_search` until `max_out_len` steps or all beams
    end. Returns (finished_ids, finished_scores) via beam_search_decode.
    Closure over enc_output, trg_src_attn_bias, model hyperparameters, etc.
    """
    max_len = layers.fill_constant(
        shape=[1], dtype=start_tokens.dtype, value=max_out_len)
    step_idx = layers.fill_constant(
        shape=[1], dtype=start_tokens.dtype, value=0)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    # Step-indexed arrays of selected ids and scores; slot 0 = init values.
    ids = layers.array_write(start_tokens, step_idx)
    scores = layers.array_write(init_scores, step_idx)
    # Per-layer k/v caches of history steps (start with zero-length time dim);
    # overwritten in place each iteration to avoid recomputation.
    caches = [{
        "k": layers.fill_constant_batch_size_like(
            input=start_tokens,
            shape=[-1, 0, d_model],
            dtype=enc_output.dtype,
            value=0),
        "v": layers.fill_constant_batch_size_like(
            input=start_tokens,
            shape=[-1, 0, d_model],
            dtype=enc_output.dtype,
            value=0),
    } for _ in range(n_layer)]
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # sequence_expand gathers rows by the lod of pre_scores, sifting the
        # states that correspond to the beams selected last step.
        pre_src_attn_bias = layers.sequence_expand(
            x=trg_src_attn_bias, y=pre_scores)
        pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
        pre_caches = [{
            "k": layers.sequence_expand(x=c["k"], y=pre_scores),
            "v": layers.sequence_expand(x=c["v"], y=pre_scores),
        } for c in caches]
        # Position ids for the next step (step_idx + 1 broadcast per row).
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_enc_output,  # cann't use pre_ids here since it has lod
                value=1,
                shape=[-1, 1],
                dtype=pre_ids.dtype),
            y=layers.increment(x=step_idx, value=1.0, in_place=False),
            axis=0)
        logits = wrap_decoder(
            trg_vocab_size, max_in_len, n_layer, n_head, d_key, d_value,
            d_model, d_inner_hid, dropout_rate, weight_sharing,
            dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias,
                        trg_data_shape, slf_attn_pre_softmax_shape,
                        slf_attn_post_softmax_shape,
                        src_attn_pre_softmax_shape,
                        src_attn_post_softmax_shape),
            enc_output=pre_enc_output,
            caches=pre_caches)
        # Intra-beam top-k, then accumulate log probs onto parent scores.
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(logits), k=beam_size)
        accu_scores = layers.elementwise_add(
            x=layers.log(topk_scores),
            y=layers.reshape(pre_scores, shape=[-1]),
            axis=0)
        # beam_search op uses lod to distinguish branches.
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        selected_ids, selected_scores = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=eos_idx)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        # Record selections and write the gathered states back in place.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(pre_src_attn_bias, trg_src_attn_bias)
        layers.assign(pre_enc_output, enc_output)
        for layer_no in range(n_layer):
            layers.assign(pre_caches[layer_no]["k"], caches[layer_no]["k"])
            layers.assign(pre_caches[layer_no]["v"], caches[layer_no]["v"])
        layers.assign(
            layers.elementwise_add(
                x=slf_attn_pre_softmax_shape,
                y=attn_pre_softmax_shape_delta),
            slf_attn_pre_softmax_shape)
        layers.assign(
            layers.elementwise_add(
                x=slf_attn_post_softmax_shape,
                y=attn_post_softmax_shape_delta),
            slf_attn_post_softmax_shape)
        # Continue while under max length and some beam is still alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)
    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=beam_size, end_id=eos_idx)
    return finished_ids, finished_scores
def inference(self, model, inputs, outputs):
    """
    Run inference.

    Args:
        inputs(dict): Its key is input name(str) and its value is a Variable.
        model(object): A generate model. Need to implement
            `_generation_network` and `_calc_logits`.

    Returns:
        dict(str:Variable): Its key is output name(str) and its value is a
        Variable.
    """
    # --- while-loop bookkeeping -------------------------------------------
    max_len = layers.fill_constant(
        shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True)
    min_len = layers.fill_constant(
        shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True)
    step_idx = layers.fill_constant(
        shape=[1], dtype="int64", value=0, force_cpu=True)
    ids = layers.array_write(
        layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx)
    pos_biases = layers.array_write(
        layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
    scores = layers.array_write(inputs["init_score"], step_idx)
    tgt_generation_mask = layers.array_write(
        inputs["tgt_generation_mask"], step_idx)
    parent_idx = inputs["parent_idx"]
    # Sampling strategies run as beam search with a single beam.
    beam_size = self.beam_size if self.decoding_strategy == "beam_search" else 1
    # Penalty vectors: forbid EOS before min_dec_len, and UNK/MASK always.
    eos_penalty = np.zeros(self.vocab_size, dtype="float32")
    eos_penalty[self.eos_id] = -1e9
    eos_penalty = layers.assign(eos_penalty)
    token_penalty = np.zeros(self.vocab_size, dtype="float32")
    token_penalty[self.unk_id] = -1e9
    if self.mask_id >= 0:
        token_penalty[self.mask_id] = -1e9
    token_penalty = layers.assign(token_penalty)
    # --- decode loop ------------------------------------------------------
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        pos_bias = layers.array_read(array=pos_biases, i=step_idx)
        pos_bias = layers.gather(input=pos_bias, index=parent_idx)
        # Extend the generation mask by one column, then re-gather per parent.
        tmp_tgt_generation_mask = layers.array_read(
            tgt_generation_mask, i=step_idx)
        dtype = tmp_tgt_generation_mask.dtype
        append_mask = layers.fill_constant_batch_size_like(
            input=pre_ids, value=1.0, shape=[-1, 1, 1], dtype=dtype)
        tmp_tgt_generation_mask = layers.concat(
            [tmp_tgt_generation_mask, append_mask], axis=2)
        pre_mask = tmp_tgt_generation_mask = layers.gather(
            input=tmp_tgt_generation_mask, index=parent_idx)
        pre_sent = layers.fill_constant_batch_size_like(
            input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype)
        # Position ids: step index, optionally offset by the stored bias.
        step_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_mask, value=1, shape=[-1, 1, 1],
                dtype=pre_ids.dtype),
            y=step_idx,
            axis=0)
        pre_pos = step_pos + pos_bias if self.continuous_position else step_pos
        if self.use_role:
            pre_role = layers.fill_constant_batch_size_like(
                input=pre_mask, value=0, shape=[-1, 1, 1],
                dtype=pre_ids.dtype)
        else:
            pre_role = None
        dec_out, _ = model._generation_network(
            token_ids=pre_ids,
            type_ids=pre_sent,
            pos_ids=pre_pos,
            role_ids=pre_role,
            generation_mask=tmp_tgt_generation_mask,
            gather_idx=parent_idx)
        logits = model._calc_logits(dec_out)
        # ignore unk and mask token
        if self.ignore_unk:
            logits = layers.elementwise_add(logits, token_penalty, axis=1)
        # min dec length
        min_len_cond = layers.less_than(x=step_idx, y=min_len)

        def min_len_penalty():
            """Plus minimum length penalty."""
            return layers.elementwise_add(logits, eos_penalty, axis=1)

        def no_penalty():
            """No penalty."""
            return logits

        logits = layers.case(
            [(min_len_cond, min_len_penalty)], default=no_penalty)
        # get probs
        probs = layers.softmax(logits / self.temperature)
        if self.decoding_strategy == "beam_search":
            topk_scores, topk_indices = layers.topk(input=probs, k=beam_size)
        else:
            if self.decoding_strategy.startswith("sampling"):
                sampling_ids = layers.sampling_id(probs, dtype="int")
            elif self.decoding_strategy.startswith("topk_sampling"):
                # Zero out everything below the k-th prob, renormalize, sample.
                topk_probs, _ = layers.topk(input=probs, k=self.topk)
                ge_cond = layers.cast(
                    layers.greater_equal(
                        probs, layers.unsqueeze(topk_probs[:, -1], [1])),
                    "float32")
                old_probs = probs
                probs = probs * ge_cond / layers.reduce_sum(
                    topk_probs, dim=-1, keep_dim=True)
                sampling_ids = layers.sampling_id(probs, dtype="int")
                probs = old_probs
            else:
                raise ValueError(self.decoding_strategy)
            # Encode the sampled id as a top-1 result so the beam_search op
            # below can be reused unchanged.
            sampling_scores = layers.one_hot(
                layers.unsqueeze(sampling_ids, [1]), probs.shape[1])
            sampling_scores = sampling_scores * probs - (
                1 - sampling_scores) * 1e3
            topk_scores, topk_indices = layers.topk(
                input=sampling_scores, k=1)
        pre_len = layers.cast(step_idx, "float32")
        layers.increment(x=step_idx, value=1.0, in_place=True)
        cur_len = layers.cast(step_idx, "float32")
        # update scores
        if self.length_average:
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores * pre_len,
                axis=0) / cur_len
        elif self.length_penalty > 0:
            pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
            cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores * pre_lp,
                axis=0) / cur_lp
        else:
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores, axis=0)
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=self.eos_id,
            return_parent_idx=True)
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
        layers.array_write(pos_bias, i=step_idx, array=pos_biases)
        layers.assign(gather_idx, parent_idx)
        # Continue while under max length and some branch is still alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)
    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=beam_size, end_id=self.eos_id)
    return {
        "finished_ids": finished_ids,
        "finished_scores": finished_scores,
        "token_ids": inputs["token_ids"],
        "data_id": inputs["data_id"],
    }
def infilling_decode(self):
    """Build the beam-search infilling-decoding inference graph.

    Constructs the static graph that feeds masked/target inputs through an
    ERNIE model and decodes step by step with ``layers.beam_search`` inside a
    ``layers.While`` loop, applying a GNMT-style length penalty.

    Returns:
        tuple: (pyreader, graph_vars) where ``pyreader`` is the DataLoader
        bound to the input tensors and ``graph_vars`` maps
        "finished_ids"/"finished_scores"/"data_ids" to persistable Variables.
    """
    # Dialog tasks carry one extra embedding input (role/turn) vs. 3 for others.
    if self.task_type == "dialog":
        emb_num = 4
    else:
        emb_num = 3
    input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
        [[-1, self.max_seq_len, self.max_seq_len]]
    input_dtypes = ['int64'] * emb_num + ['float32']
    input_lod_levels = [0] * emb_num + [0]

    # Extra decoding inputs: tgt_ids, tgt_pos, init_scores, parent_idx,
    # tgt_input_mask, data_ids (the last six tensors below).
    shapes = input_shapes + [[-1, self.max_seq_len, 1],
                             [-1, self.max_seq_len, 1], [-1, 1], [-1],
                             [-1, 1, self.max_seq_len], [-1, 1]]
    dtypes = input_dtypes + [
        'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
    ]
    lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

    # NOTE: "to_ternsor" is a (misspelled) project helper creating the tensors.
    inputs = self.to_ternsor(shapes, dtypes, lod_levels)
    pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                  capacity=50,
                                                  iterable=False)
    emb_ids = {}
    for key, value in zip(self.emb_keys, inputs[:emb_num]):
        emb_ids[key] = value
    input_mask = inputs[emb_num]
    tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
        -6:]

    # decoding=True wires the model for incremental decoding; parent_idx is
    # used to gather beam-reordered caches each step.
    ernie = ErnieModel(emb_ids=emb_ids,
                       input_mask=input_mask,
                       config=self.ernie_config,
                       use_fp16=self.use_fp16,
                       task_type=self.task_type,
                       decoding=True,
                       gather_idx=parent_idx)

    # Loop counters kept on CPU (force_cpu) as required by the While op.
    max_len = layers.fill_constant(shape=[1],
                                   dtype=tgt_ids.dtype,
                                   value=self.max_dec_len,
                                   force_cpu=True)
    step_idx = layers.fill_constant(shape=[1],
                                    dtype=tgt_ids.dtype,
                                    value=0,
                                    force_cpu=True)
    pos_idx = layers.fill_constant(shape=[1],
                                   dtype=tgt_ids.dtype,
                                   value=1,
                                   force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)

    # Per-step state arrays: token ids, position biases, scores, masks.
    ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
    pos_biases = layers.array_write(layers.reshape(tgt_pos, (-1, 1)),
                                    step_idx)
    scores = layers.array_write(init_scores, step_idx)
    tgt_masks = layers.array_write(tgt_input_mask, step_idx)

    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        pos_bias = layers.array_read(array=pos_biases, i=step_idx)
        # Re-gather state for the beams selected last step.
        pos_bias = layers.gather(input=pos_bias, index=parent_idx)
        tmp_mask = layers.array_read(tgt_masks, i=step_idx)

        def gen_batch_like(value,
                           dtype="int64",
                           shape=[-1, 1, 1],
                           is_scalar=True):
            # Broadcast a scalar (or a 1-D tensor, when is_scalar=False) to
            # the current runtime batch size, inferred from parent_idx.
            if is_scalar:
                return layers.fill_constant_batch_size_like(
                    input=parent_idx, value=value, shape=shape, dtype=dtype)
            else:
                return layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(input=parent_idx,
                                                           value=1,
                                                           shape=shape,
                                                           dtype=dtype),
                    y=value,
                    axis=0)

        tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
        append_0_mask = gen_batch_like(0.0, dtype=tmp_mask.dtype)
        append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
        # Extend attention masks by one column per generated token:
        # pre_mask hides the query slot (0), cur_mask attends to it (1).
        tmp_mask = layers.concat([tmp_mask, append_1_mask], axis=2)
        pre_mask = layers.concat([tmp_mask, append_0_mask], axis=2)
        cur_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

        cur_ids = gen_batch_like(self.attn_id)
        pre_pos = gen_batch_like(step_idx, is_scalar=False)
        cur_pos = gen_batch_like(pos_idx, is_scalar=False)
        if self.continuous_position:
            pre_pos = pre_pos + pos_bias
            cur_pos = cur_pos + pos_bias

        # Feed the previous token plus the [ATTN] query token together.
        dec_emb_ids = {
            "word_embedding": layers.concat([pre_ids, cur_ids], axis=1),
            "pos_embedding": layers.concat([pre_pos, cur_pos], axis=1)
        }
        if self.task_type == "dialog":
            role_ids = gen_batch_like(0)
            turn_ids = gen_batch_like(0)
            dec_emb_ids["role_embedding"] = layers.concat(
                [role_ids, role_ids], axis=1)
            dec_emb_ids["turn_embedding"] = layers.concat(
                [turn_ids, turn_ids], axis=1)
        else:
            sent_ids = gen_batch_like(self.tgt_type_id)
            dec_emb_ids["sent_embedding"] = layers.concat(
                [sent_ids, sent_ids], axis=1)

        dec_mask = layers.concat([pre_mask, cur_mask], axis=1)
        dec_out = ernie.encode(dec_emb_ids,
                               dec_mask,
                               parent_idx,
                               remove_query=True)
        # Logits are computed from the query-token position only ([:, 1:, :]).
        fc_out = self.cal_logit(dec_out[:, 1:, :], None)
        topk_scores, topk_indices = layers.topk(input=layers.softmax(fc_out),
                                                k=self.beam_size)

        # GNMT length penalty: ((5 + len) / 6) ** length_penalty.
        pre_lenpen = layers.pow(
            (5.0 + layers.cast(step_idx, pre_scores.dtype)) / 6.0,
            self.length_penalty)
        cur_lenpen = layers.pow(
            (5.0 + layers.cast(pos_idx, pre_scores.dtype)) / 6.0,
            self.length_penalty)
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores * pre_lenpen,
                                             axis=0) / cur_lenpen

        # beam_search uses LoD to track beam branches.
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=self.beam_size,
            end_id=self.eos_idx,
            return_parent_idx=True)

        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.increment(x=pos_idx, value=1.0, in_place=True)
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.array_write(tmp_mask, i=step_idx, array=tgt_masks)
        layers.array_write(pos_bias, i=step_idx, array=pos_biases)
        layers.assign(gather_idx, parent_idx)

        # Continue while under max length and some beam is still alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)
    graph_vars = {
        "finished_ids": finished_ids,
        "finished_scores": finished_scores,
        "data_ids": data_ids
    }
    # Mark outputs persistable so they survive executor scope cleanup.
    for k, v in graph_vars.items():
        v.persistable = True
    return pyreader, graph_vars
def decode(context, is_sparse):
    """Build a beam-search decoding graph for a simple RNN translator.

    Args:
        context: initial decoder state Variable (encoder output).
        is_sparse: bool, passed to the embedding lookup.

    Returns:
        tuple: (translation_ids, translation_scores) produced by
        ``pd.beam_search_decode``.

    NOTE(review): relies on enclosing-scope names (``pd``, ``max_length``,
    ``dict_size``, ``word_dim``, ``decoder_size``, ``target_dict_dim``,
    ``topk_size``, ``beam_size``) defined elsewhere in the file.
    """
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # fill the first element with init_state
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)
    while_op = pd.While(cond=cond)
    with while_op.block():
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # expand the lod of pre_state to be the same with pre_score
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(
            input=pre_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=is_sparse)

        # use rnn unit to update rnn
        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
                              size=decoder_size,
                              act='tanh')
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
        # use score to do beam search
        current_score = pd.fc(input=current_state_with_lod,
                              size=target_dict_dim,
                              act='softmax')
        topk_scores, topk_indices = pd.topk(current_score, k=topk_size)

        # NOTE(review): end_id=10 is hard-coded here — presumably the EOS
        # token id of the target dictionary; verify against the vocab.
        selected_ids, selected_scores = pd.beam_search(
            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)

        pd.increment(x=counter, value=1, in_place=True)

        # update the memories
        pd.array_write(current_state, array=state_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)

        # refresh the loop condition in place
        pd.less_than(x=counter, y=array_len, cond=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array)
    # return init_ids, init_scores

    return translation_ids, translation_scores
return rev_dict[i] ernie = ErnieGenerate.from_pretrained(model_dir) for sentence, difficult_word in zip(sentences, difficult_words): print(sentence, difficult_word) # 词预测 ids, _ = tokenizer.encode(sentence, pre_process(sentence, difficult_word, 2)) # print(ids) src_ids = D.to_variable(np.expand_dims(ids, 0)) mask_id = tokenizer.mask_id mask_index = np.argwhere(ids == mask_id)[0] logits = ernie(src_ids) _, top_10_tokens = L.topk(logits, 10) # print(top_k_tokens[1].numpy()) substitution_words = [] for token in top_10_tokens[0].numpy(): first_char = str(rev_lookup(token)) ids[mask_index] = token # sep_index = np.argwhere(ids==tokenizer.sep_id)[0][0] # second_ids = ids[sep_index::] # second_ids[0:0] = tokenizer.cls_id second_ids = D.to_variable(np.expand_dims(ids, 0)) logits = ernie(second_ids).numpy() top_token = np.argmax(logits, -1) second_char = str(rev_lookup(top_token[0])) substitution_words.append(first_char + second_char) for token in top_10_tokens[1].numpy(): second_char = str(rev_lookup(token))
def beam_search():
    """Build the Transformer beam-search decoding loop.

    Closure over the enclosing scope's decoding inputs (``start_tokens``,
    ``init_scores``, ``enc_output``, ``trg_src_attn_bias``, ``parent_idx``,
    model hyper-parameters, etc.).

    Returns:
        tuple: (finished_ids, finished_scores) from
        ``layers.beam_search_decode``.
    """
    max_len = layers.fill_constant(shape=[1],
                                   dtype=start_tokens.dtype,
                                   value=max_out_len,
                                   force_cpu=True)
    step_idx = layers.fill_constant(shape=[1],
                                    dtype=start_tokens.dtype,
                                    value=0,
                                    force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)  # default force_cpu=True
    while_op = layers.While(cond)
    # array states will be stored for each step.
    ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)), step_idx)
    scores = layers.array_write(init_scores, step_idx)
    # cell states will be overwritten at each step.
    # caches contains states of history steps in decoder self-attention
    # and static encoder output projections in encoder-decoder attention
    # to reduce redundant computation.
    caches = [
        {
            "k":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, n_head, 0, d_key],
                dtype=enc_output.dtype,
                value=0),
            "v":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, n_head, 0, d_value],
                dtype=enc_output.dtype,
                value=0),
            "static_k":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_output.dtype),
            "static_v":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_output.dtype)
        } for i in range(n_layer)
    ]
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        # Since beam_search_op doesn't enforce pre_ids' shape, we can do
        # inplace reshape here which actually changes the shape of pre_ids.
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # gather cell states corresponding to selected parent
        pre_src_attn_bias = layers.gather(trg_src_attn_bias,
                                          index=parent_idx)
        # current position = step_idx broadcast over the (gathered) batch
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_src_attn_bias,  # can't use lod tensor here
                value=1,
                shape=[-1, 1, 1],
                dtype=pre_ids.dtype),
            y=step_idx,
            axis=0)
        logits = wrap_decoder(trg_vocab_size,
                              max_in_len,
                              n_layer,
                              n_head,
                              d_key,
                              d_value,
                              d_model,
                              d_inner_hid,
                              prepostprocess_dropout,
                              attention_dropout,
                              relu_dropout,
                              preprocess_cmd,
                              postprocess_cmd,
                              weight_sharing,
                              dec_inputs=(pre_ids, pre_pos, None,
                                          pre_src_attn_bias),
                              enc_output=enc_output,
                              caches=caches,
                              gather_idx=parent_idx,
                              bos_idx=bos_idx)
        # intra-beam topK
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(logits), k=beam_size)
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores,
                                             axis=0)
        # beam_search op uses lod to differentiate branches.
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        # topK reduction across beams, also contains special handling of
        # end beams and end sentences (batch reduction)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=eos_idx,
            return_parent_idx=True)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        # cell states(caches) have been updated in wrap_decoder,
        # only need to update beam search states here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(gather_idx, parent_idx)
        # write back the beam-gathered attention bias for the next step
        layers.assign(pre_src_attn_bias, trg_src_attn_bias)
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=beam_size, end_id=eos_idx)
    return finished_ids, finished_scores
def _grammar_step(self, logits, next_cell_states, decode_states, actions,
                  gmr_mask):
    """Perform one beam-search decoding step under grammar constraints.

    Args:
        logits (Variable): shape = [batch_size, beam_size, vocab_size]
        next_cell_states (Variable): decoder cell states to be re-gathered
            per selected beam.
        decode_states (StateWrapper): carries finished flags, lengths,
            log probs and the valid-table mask from the previous step.
        actions: grammar actions fed to the output layer.
        gmr_mask: grammar mask restricting legal tokens.

    Returns:
        tuple: (OutputWrapper, StateWrapper) for this step.

    Raises:
        NULL
    """
    # Decode token logits that conform to the grammar rules.
    logits, valid_table_mask = self._output_layer(
        logits, actions, gmr_mask, decode_states.valid_table_mask)

    # Initialize vocab size from the (grammar-constrained) logits.
    self._vocab_size = logits.shape[-1]
    self._vocab_size_tensor = layers.fill_constant(shape=[1],
                                                   dtype='int64',
                                                   value=logits.shape[-1])

    # Compute log probs, and mask out the finished beams.
    step_log_probs = layers.log(layers.softmax(logits))
    step_log_probs = self._mask_finished_probs(step_log_probs,
                                               decode_states.finished)

    # Flatten (beam, vocab) so topk selects across all beams at once.
    scores = layers.reshape(step_log_probs,
                            [-1, self._beam_size * self._vocab_size])
    topk_scores, topk_indices = layers.topk(input=scores, k=self._beam_size)
    topk_scores = layers.reshape(topk_scores, shape=[-1])
    topk_indices = layers.reshape(topk_indices, shape=[-1])

    # Beam each top-k entry came from.
    beam_indices = layers.elementwise_floordiv(topk_indices,
                                               self._vocab_size_tensor)
    # Token id of each top-k entry.
    token_indices = layers.elementwise_mod(topk_indices,
                                           self._vocab_size_tensor)

    # Re-gather step_log_probs according to the top-k origin.
    next_log_probs = nn_utils.batch_gather(
        layers.reshape(step_log_probs,
                       [-1, self._beam_size * self._vocab_size]),
        topk_indices)

    def _beam_gather(x, beam_indices):
        """Reshape x to beam dim, and gather each beam_indices.

        Args:
            x (TYPE): NULL

        Returns:
            Variable
        """
        x = self.split_batch_beams(x)
        return nn_utils.batch_gather(x, beam_indices)

    next_cell_states = layers.utils.map_structure(
        lambda x: _beam_gather(x, beam_indices), next_cell_states)
    next_finished = _beam_gather(decode_states.finished, beam_indices)
    next_lens = _beam_gather(decode_states.lengths, beam_indices)
    # Unfinished beams grow by one token this step.
    next_lens = layers.elementwise_add(
        next_lens,
        layers.cast(layers.logical_not(next_finished), next_lens.dtype))
    # A beam finishes once it emits the end token.
    next_finished = layers.logical_or(
        next_finished, layers.equal(token_indices, self._end_token_tensor))

    decode_output = OutputWrapper(topk_scores, token_indices, beam_indices)
    decode_states = StateWrapper(next_cell_states, next_log_probs,
                                 next_finished, next_lens, valid_table_mask)

    return decode_output, decode_states
def gru_attention_infer(self, decoder_boot, max_length, char_num,
                        word_vector_dim, encoded_vector, encoded_proj,
                        decoder_size):
    """Build a greedy (beam_size=1) GRU-with-attention decoding graph for OCR.

    Args:
        decoder_boot: initial decoder hidden state Variable.
        max_length: maximum number of decoding steps.
        char_num: size of the character vocabulary.
        word_vector_dim: character embedding dimension.
        encoded_vector / encoded_proj: encoder outputs for attention.
        decoder_size: GRU hidden size.

    Returns:
        Variable: full_ids, the concatenated argmax character ids per step.
    """
    init_state = decoder_boot
    beam_size = 1  # effectively greedy search
    array_len = layers.fill_constant(
        shape=[1], dtype='int64', value=max_length)
    counter = layers.zeros(shape=[1], dtype='int64', force_cpu=True)

    # fill the first element with init_state
    state_array = layers.create_array('float32')
    layers.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = layers.create_array('int64')
    scores_array = layers.create_array('float32')

    # Build a per-sample LoD [0, 1, 2, ..., batch] so each image is its own
    # beam-search branch.
    rois_shape = layers.shape(init_state)
    batch_size = layers.slice(
        rois_shape, axes=[0], starts=[0], ends=[1]) + 1
    lod_level = layers.range(
        start=0, end=batch_size, step=1, dtype=batch_size.dtype)

    init_ids = layers.fill_constant_batch_size_like(
        input=init_state, shape=[-1, 1], value=0, dtype='int64')
    init_ids = layers.lod_reset(init_ids, lod_level)
    init_ids = layers.lod_append(init_ids, lod_level)
    init_scores = layers.fill_constant_batch_size_like(
        input=init_state, shape=[-1, 1], value=1, dtype='float32')
    init_scores = layers.lod_reset(init_scores, init_ids)
    layers.array_write(init_ids, array=ids_array, i=counter)
    layers.array_write(init_scores, array=scores_array, i=counter)

    # Accumulates the selected ids, one column per step.
    full_ids = fluid.layers.fill_constant_batch_size_like(
        input=init_state, shape=[-1, 1], dtype='int64', value=1)

    cond = layers.less_than(x=counter, y=array_len)
    while_op = layers.While(cond=cond)
    with while_op.block():
        pre_ids = layers.array_read(array=ids_array, i=counter)
        pre_state = layers.array_read(array=state_array, i=counter)
        pre_score = layers.array_read(array=scores_array, i=counter)
        pre_ids_emb = layers.embedding(
            input=pre_ids,
            size=[char_num, word_vector_dim],
            dtype='float32')

        context = self.simple_attention(encoded_vector, encoded_proj,
                                        pre_state, decoder_size)

        # expand the recursive_sequence_lengths of pre_state
        # to be the same with pre_score
        pre_state_expanded = layers.sequence_expand(pre_state, pre_score)
        context_expanded = layers.sequence_expand(context, pre_score)
        fc_1 = layers.fc(input=context_expanded,
                         size=decoder_size * 3,
                         bias_attr=False,
                         name="rnn_fc1")
        fc_2 = layers.fc(input=pre_ids_emb,
                         size=decoder_size * 3,
                         bias_attr=False,
                         name="rnn_fc2")
        decoder_inputs = fc_1 + fc_2
        current_state, _, _ = layers.gru_unit(
            input=decoder_inputs,
            hidden=pre_state_expanded,
            size=decoder_size * 3)

        current_state_with_lod = layers.lod_reset(
            x=current_state, y=pre_score)
        # use score to do beam search
        current_score = layers.fc(input=current_state_with_lod,
                                  size=char_num,
                                  bias_attr=True,
                                  act='softmax',
                                  name="rnn_out_fc")
        topk_scores, topk_indices = layers.topk(current_score, k=beam_size)

        # Append this step's ids; assign writes back into full_ids in place.
        new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1)
        fluid.layers.assign(new_ids, full_ids)

        layers.increment(x=counter, value=1, in_place=True)

        # update the memories
        layers.array_write(current_state, array=state_array, i=counter)
        layers.array_write(topk_indices, array=ids_array, i=counter)
        layers.array_write(topk_scores, array=scores_array, i=counter)

        # update the break condition:
        # up to the max length or all candidates of
        # source sentences have ended.
        length_cond = layers.less_than(x=counter, y=array_len)
        finish_cond = layers.logical_not(layers.is_empty(x=topk_indices))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)
    return full_ids
def _get_bboxes_single(self,
                       cls_scores,
                       bbox_preds,
                       mlvl_points,
                       img_shape,
                       scale_factor,
                       rescale=False,
                       with_nms=True):
    """Decode per-level predictions for one image and run NMS.

    Each element of mlvl_points is [rows*cols, 3]:
    (cell top-left x, cell top-left y, cell side length).

    Args:
        cls_scores: per-FPN-level class scores, each [num_classes, h, w].
        bbox_preds: per-FPN-level box regressions, each [4, h, w].
        mlvl_points: per-level grid points (see above).
        img_shape: (h, w) used to clip boxes.
        scale_factor: rescaling factor back to the original image.
        rescale (bool): divide boxes by scale_factor if True.
        with_nms (bool): kept for interface compatibility (NMS always runs).

    Returns:
        NMS result tensor, or None if nms_type is unrecognized.
    """
    nms_cfg = self.nms_cfg
    assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
    mlvl_bboxes = []
    mlvl_scores = []
    # Iterate over FPN output levels.
    for i_lvl, (cls_score, bbox_pred, points) in enumerate(
            zip(cls_scores, bbox_preds, mlvl_points)):
        # cls_score: [num_classes, h, w] -> [h*w, num_classes]
        cls_score = L.transpose(cls_score, [1, 2, 0])
        cls_score = L.reshape(cls_score, (-1, self.num_classes))
        if self.use_sigmoid_cls:
            scores = L.sigmoid(cls_score)  # [h*w, num_classes]
        else:
            scores = L.softmax(cls_score)
        # bbox_pred: [4, h, w] -> [h*w, 4]
        bbox_pred = L.transpose(bbox_pred, [1, 2, 0])
        bbox_pred = L.reshape(bbox_pred, (-1, 4))
        nms_top_k = nms_cfg.get('nms_top_k', -1)
        if nms_top_k > 0 and scores.shape[0] > nms_top_k:
            if self.use_sigmoid_cls:
                max_scores = L.reduce_max(scores, dim=1)
            else:
                # remind that we set FG labels to [0, num_class-1]
                # since mmdet v2.0
                # BG cat_id: num_class
                # FIX: this branch previously left max_scores unbound
                # (NameError); exclude the trailing background column as in
                # the upstream implementation.
                max_scores = L.reduce_max(scores[:, :-1], dim=1)
            _, topk_inds = L.topk(max_scores, k=nms_top_k)
            scores = L.gather(scores, topk_inds)    # [M, num_classes]
            points = L.gather(points, topk_inds)    # [M, 3] cell xy + side
            bbox_pred = L.gather(bbox_pred, topk_inds)  # [M, 4]
        # [M, 4]: cell top-left xy repeated twice (center offset base).
        bbox_pos_center = L.concat([points[:, :2], points[:, :2]], axis=1)
        # Final boxes (x1y1x2y2) = regression * level stride + cell origin.
        bboxes = bbox_pred * self.fpn_stride[i_lvl] + bbox_pos_center
        x1 = L.clip(bboxes[:, 0], 0.0, img_shape[1])
        y1 = L.clip(bboxes[:, 1], 0.0, img_shape[0])
        x2 = L.clip(bboxes[:, 2], 0.0, img_shape[1])
        y2 = L.clip(bboxes[:, 3], 0.0, img_shape[0])
        bboxes = paddle.stack([x1, y1, x2, y2], axis=-1)  # [M, 4]
        mlvl_bboxes.append(bboxes)
        mlvl_scores.append(scores)
    # Merge predictions from all FPN levels.
    mlvl_scores = L.concat(mlvl_scores, axis=0)  # [M2, num_classes]
    mlvl_bboxes = L.concat(mlvl_bboxes, axis=0)  # [M2, 4] x1y1x2y2
    if rescale:
        scale_factor_ = paddle.to_tensor(scale_factor)
        mlvl_bboxes /= scale_factor_  # back to original image scale
    pred_scores = L.unsqueeze(mlvl_scores, axes=0)  # [1, M2, num_classes]
    pred_boxes = L.unsqueeze(mlvl_bboxes, axes=0)   # [1, M2, 4]
    pred_scores = L.transpose(pred_scores, perm=[0, 2, 1])  # [1, C, M2]

    # nms
    pred = None
    i = 0
    nms_cfg = copy.deepcopy(self.nms_cfg)
    nms_type = nms_cfg.pop('nms_type')
    if nms_type == 'matrix_nms':
        pred = fluid.layers.matrix_nms(pred_boxes[i:i + 1, :, :],
                                       pred_scores[i:i + 1, :, :],
                                       background_label=-1,
                                       **nms_cfg)
    elif nms_type == 'multiclass_nms':
        pred = fluid.layers.multiclass_nms(pred_boxes[i:i + 1, :, :],
                                           pred_scores[i:i + 1, :, :],
                                           background_label=-1,
                                           **nms_cfg)
    return pred
def knowledge_seq2seq(config):
    """Build the knowledge-grounded seq2seq graph (train or test mode).

    Encodes context/goal/knowledge with RNN encoders, selects knowledge via
    (prior or posterior) dot attention, and either computes BOW/NLL/KL losses
    (run_type == "train") or runs a hand-rolled beam search
    (run_type == "test").

    Args:
        config: namespace with model sizes, ids and run_type.

    Returns:
        train: [bow_loss, kl_loss, nll_loss, final_loss]
        test:  (final_score, final_ids, final_index)
    """
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    enc_input = layers.data(name="enc_input", shape=[1], dtype='int64',
                            lod_level=1)  #enc_input --> goal
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(name="goal_input", shape=[1], dtype='int64',
                             lod_level=1)  #goal_input --> x
    cue_input = layers.data(name="cue_input", shape=[1], dtype='int64',
                            lod_level=1)  #cue_input --> kg
    #cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32')
    memory_mask = layers.data(name='memory_mask', shape=[-1, 1],
                              dtype='float32')
    tar_input = layers.data(name='tar_input', shape=[1], dtype='int64',
                            lod_level=1)  #tar_input --> y
    # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32')

    rnn_hidden_size = hidden_size
    # Bidirectional encoders split the hidden size across directions.
    if bi_direc:
        rnn_hidden_size //= 2

    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc, dropout=0.0,
                    batch_first=True, name="rnn_enc")
    goal_out, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc, dropout=0.0,
                    batch_first=True, name="rnn_enc1")
    # Fuse context and goal final states, then project back to hidden size.
    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(context_goal_out,
                                      shape=[-1, 1, rnn_hidden_size * 4])
    # context_goal_out = layers.squeeze(context_goal_out, axes=[1])
    context_goal_out = fluid.layers.fc(context_goal_out,
                                       size=rnn_hidden_size * 2,
                                       bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)

    cue_last_mask = layers.data(name='cue_last_mask', shape=[-1],
                                dtype='float32')
    knowledge_out, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc, dropout=0.0,
                    batch_first=True, last_mask=cue_last_mask,
                    name="knowledge_enc")

    # Prior attention query comes from the bridged context/goal state.
    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(knowledge_last_hidden, axes=[0], starts=[0],
                              ends=[1])
    cue_memory = layers.reshape(cue_memory,
                                shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])

    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)
    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
    if config.use_posterior:
        print("config.use_posterior", config.use_posterior)
        # Posterior attention additionally conditions on the target response.
        target_out, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size,
                        batch_size, num_layers, bi_direc, dropout=0.0,
                        batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(target_goal_out,
                                         shape=[-1, 1, rnn_hidden_size * 4])
        # target_goal_out = layers.squeeze(target_goal_out, axes=[1])
        target_goal_out = fluid.layers.fc(target_goal_out,
                                          size=rnn_hidden_size * 2,
                                          bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])

        # get attention
        # target_query = layers.slice(target_last_hidden, axes=[0], starts=[0], ends=[1])
        target_query = layers.slice(target_goal_out, axes=[0], starts=[0],
                                    ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(target_query,
                                      shape=[batch_size, -1, hidden_size])

        weight_target, target_att = dot_attention(target_query, cue_memory,
                                                  mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # add to output
        knowledge = weight_target

    enc_memory_mask = layers.data(name="enc_memory_mask", shape=[-1, 1],
                                  dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))

    enc_memory, origl_len_1 = layers.sequence_pad(x=enc_out,
                                                  pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(input_size + hidden_size,
                        hidden_size,
                        num_layers=num_layers,
                        dropout=0.0,
                        name="decoder_gru_unit")
    cue_gru_unit = GRU_unit(hidden_size + hidden_size,
                            hidden_size,
                            num_layers=num_layers,
                            dropout=0.0,
                            name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        if config.use_bow:
            # Bag-of-words auxiliary loss on the selected knowledge.
            bow_logits = fc(knowledge, hidden_size, hidden_size,
                            name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits, hidden_size, tgt_vocab_size,
                            name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)

            bow_label = layers.data(name='bow_label',
                                    shape=[-1, config.max_len],
                                    dtype='int64')
            bow_mask = layers.data(name="bow_mask",
                                   shape=[-1, config.max_len],
                                   dtype='float32')

            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits,
                                        shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits, bow_label,
                                            soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])

            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input", shape=[-1, 1, 1],
                                dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1],
                               dtype='float32')

        # NOTE(review): training uses the posterior-attended knowledge
        # (weight_target/target_query), so it presumably requires
        # config.use_posterior — verify with the training config.
        dec_knowledge = weight_target
        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(knowledge_goal_out,
                                            shape=[-1, 1,
                                                   rnn_hidden_size * 4])
        # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1])
        knowledge_goal_out = fluid.layers.fc(knowledge_goal_out,
                                             size=rnn_hidden_size * 2,
                                             bias_attr=False)
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size,
                        hidden_size, num_layers, enc_memory,
                        enc_memory_mask, dec_knowledge, vocab_size,
                        init_hidden=dec_init_hidden, mask=dec_mask,
                        dropout=config.dropout)

        target_label = layers.data(name='target_label', shape=[-1, 1],
                                   dtype='int64')
        target_mask = layers.data(name='target_mask', shape=[-1, 1],
                                  dtype='float32')

        decoder_logits = layers.reshape(decoder_logits,
                                        shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])

        # Masked NLL loss averaged over the batch.
        nll_loss = layers.cross_entropy(decoder_logits, target_label,
                                        soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)

        # KL(posterior || prior) over the knowledge attention; posterior is
        # treated as a fixed target (stop_gradient).
        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True

        prior_attn = layers.log(prior_attn)

        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) -
                                   prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        # Annealing factor for KL and NLL, fed from the data pipeline.
        kl_and_nll_factor = layers.data(name='kl_and_nll_factor', shape=[1],
                                        dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        final_loss = bow_loss + kl_loss * kl_and_nll_factor + \
            nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]

    elif run_type == "test":
        beam_size = config.beam_size
        batch_size = config.batch_size
        token = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                     value=config.bos_id,
                                     dtype='int64')
        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len

        dec_knowledge = knowledge
        INF = 100000000.0

        # Only the first beam of each sample starts with score 0; the rest
        # start at -INF so step 1 expands a single hypothesis per sample.
        init_score_np = np.ones([beam_size * batch_size],
                                dtype='float32') * -INF
        for i in range(batch_size):
            init_score_np[i * beam_size] = 0.0
        pre_score = layers.assign(init_score_np)

        # Offsets mapping per-sample beam indices to flat batch positions.
        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size
        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []

        # Tile encoder memory/mask/knowledge beam_size times.
        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory,
            shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(init_enc_mask,
                                       shape=[batch_size * beam_size, 1, -1])

        dec_knowledge = layers.reshape(dec_knowledge,
                                       shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge,
            shape=[batch_size * beam_size, -1, hidden_size])

        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden,
                                         shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id
        # Unrolled (non-While) beam search, one iteration per decode step.
        for i in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit, dec_emb,
                             dec_init_hidden, input_size, hidden_size,
                             init_enc_memory, init_enc_mask,
                             init_dec_knowledge, mask=None)
            output_in_size = hidden_size + hidden_size

            rnnout = layers.dropout(dec_out,
                                    dropout_prob=config.dropout,
                                    is_test=True)
            rnnout = fc(rnnout, output_in_size, hidden_size,
                        name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])

            # Accumulate scores; with length_average the running score is a
            # running mean over steps instead of a sum.
            if i > 1:
                if length_average:
                    log_softmax_output = layers.elementwise_add(
                        (log_softmax_output / i),
                        (pre_score * (1.0 - 1.0 / i)),
                        axis=0)
                else:
                    log_softmax_output = layers.elementwise_add(
                        log_softmax_output, pre_score, axis=0)
            else:
                log_softmax_output = layers.elementwise_add(
                    log_softmax_output, pre_score, axis=0)

            # Flatten beams per sample, then take per-sample top-k.
            log_softmax_output = layers.reshape(log_softmax_output,
                                                shape=[batch_size, -1])

            topk_score, topk_index = layers.topk(log_softmax_output,
                                                 k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            vocab_var = layers.fill_constant([1], dtype='int64',
                                             value=vocab_size)
            new_token = topk_index % vocab_var   # token id within vocab
            index = topk_index // vocab_var      # source beam within sample
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index            # flat batch index

            score_array.append(topk_score)

            # Penalize EOS/UNK continuations so those beams are not expanded.
            eos_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=EOS)
            unk_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=UNK)
            eos_eq = layers.cast(layers.equal(new_token, eos_ids),
                                 dtype='float32')

            topk_score += eos_eq * -100000000.0

            unk_eq = layers.cast(layers.equal(new_token, unk_ids),
                                 dtype='float32')
            topk_score += unk_eq * -100000000.0

            # update
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            # Re-gather all recurrent state for the surviving beams.
            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        # [max_decode_len, beam_size * batch_size]; caller backtracks beams
        # using final_index.
        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
def _build_decoder(self, enc_last_hidden, enc_last_cell, mode='train', beam_size=10):
    """Build the decoder part of the seq2seq graph.

    Depending on ``mode`` this either:
      * ``'train'``        — runs a multi-layer basic LSTM over ``self.tar_emb`` and
                             returns the projected logits (pre-softmax scores);
      * ``'beam_search'``  — builds a ``layers.While`` beam-search loop and returns
                             the top ``beam_size`` finished/alive sequences;
      * ``'greedy_search'``— builds a ``layers.While`` greedy decode loop and returns
                             the generated id sequence.

    Args:
        enc_last_hidden: per-layer final hidden states of the encoder.
        enc_last_cell:   per-layer final cell states of the encoder.
        mode:            one of 'train', 'beam_search', 'greedy_search'.
        beam_size:       beam width used by the 'beam_search' branch.

    NOTE(review): any mode outside the three above only prints a message
    (with a typo, "supprt") and implicitly returns None — callers must not
    rely on a return value in that case.
    """
    # Output projection matrix shared by all decoding modes.
    softmax_weight = layers.create_parameter(
        [self.hidden_size, self.tar_vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-self.init_scale, high=self.init_scale))
    if mode == 'train':
        dec_output, dec_last_hidden, dec_last_cell = basic_lstm(
            self.tar_emb, enc_last_hidden, enc_last_cell,
            self.hidden_size, num_layers=self.num_layers,
            batch_first=self.batch_first,
            dropout_prob=self.dropout,
            param_attr=ParamAttr(
                initializer=fluid.initializer.UniformInitializer(
                    low=-self.init_scale, high=self.init_scale)),
            bias_attr=ParamAttr(initializer=fluid.initializer.Constant(0.0)))
        # Project hidden states to vocabulary-sized logits.
        dec_output = layers.matmul(dec_output, softmax_weight)
        return dec_output
    elif mode == 'beam_search' or mode == 'greedy_search':
        # Build one BasicLSTMUnit per layer; names must be stable so the
        # units pick up the trained parameters.
        dec_unit_list = []
        name = 'basic_lstm'
        for i in range(self.num_layers):
            new_name = name + "_layers_" + str(i)
            dec_unit_list.append(
                BasicLSTMUnit(new_name, self.hidden_size, dtype='float32'))

        def decoder_step(current_in, pre_hidden_array, pre_cell_array):
            # Run one decode step through the stacked LSTM units.
            # Returns (top-layer hidden, new hidden list, new cell list).
            new_hidden_array = []
            new_cell_array = []
            step_in = current_in
            for i in range(self.num_layers):
                pre_hidden = pre_hidden_array[i]
                pre_cell = pre_cell_array[i]
                new_hidden, new_cell = dec_unit_list[i](step_in, pre_hidden,
                                                        pre_cell)
                new_hidden_array.append(new_hidden)
                new_cell_array.append(new_cell)
                step_in = new_hidden
            return step_in, new_hidden_array, new_cell_array

        if mode == 'beam_search':
            # Decode at most twice the source length.
            max_src_seq_len = layers.shape(self.src)[1]
            max_length = max_src_seq_len * 2
            #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
            # NOTE(review): pre_ids / full_ids / score below are never read in
            # this branch — apparently leftovers from the greedy variant.
            pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
            full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
            score = layers.fill_constant([1], dtype='float32', value=0.0)
            #eos_ids = layers.fill_constant( [1, 1], dtype='int64', value=2)
            pre_hidden_array = []
            pre_cell_array = []
            # Attention-feed placeholder, overwritten by assign() each step.
            pre_feed = layers.fill_constant([beam_size, self.hidden_size],
                                            dtype='float32', value=0)
            # Tile encoder final states across the beam.
            for i in range(self.num_layers):
                pre_hidden_array.append(
                    layers.expand(enc_last_hidden[i], [beam_size, 1]))
                pre_cell_array.append(
                    layers.expand(enc_last_cell[i], [beam_size, 1]))
            # id 2 is <eos>; id 1 below is presumably <bos> — TODO confirm.
            eos_ids = layers.fill_constant([beam_size], dtype='int64', value=2)
            # Only beam 0 starts alive; the rest get -INF so the first topk
            # does not pick duplicated hypotheses.
            init_score = np.zeros((beam_size)).astype('float32')
            init_score[1:] = -INF
            pre_score = layers.assign(init_score)
            #pre_score = layers.fill_constant( [1,], dtype='float32', value= 0.0)
            tokens = layers.fill_constant([beam_size, 1], dtype='int64', value=1)
            enc_memory = layers.expand(self.enc_output, [beam_size, 1, 1])
            pre_tokens = layers.fill_constant([beam_size, 1], dtype='int64',
                                              value=1)
            # Finished-hypothesis state: sequences, scores, done flags.
            finished_seq = layers.fill_constant([beam_size, 1], dtype='int64',
                                                value=0)
            finished_scores = layers.fill_constant([beam_size], dtype='float32',
                                                   value=-INF)
            finished_flag = layers.fill_constant([beam_size], dtype='float32',
                                                 value=0.0)
            step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0)
            cond = layers.less_than(x=step_idx,
                                    y=max_length)  # default force_cpu=True
            parent_idx = layers.fill_constant([1], dtype='int32', value=0)
            while_op = layers.While(cond)

            def compute_topk_scores_and_seq(sequences, scores, scores_to_gather,
                                            flags, beam_size, select_beam=None,
                                            generate_id=None):
                # Keep the beam_size best entries of `scores` and gather the
                # matching rows from every companion tensor.
                scores = layers.reshape(scores, shape=[1, -1])
                _, topk_indexs = layers.topk(scores, k=beam_size)
                topk_indexs = layers.reshape(topk_indexs, shape=[-1])
                # gather result
                top_seq = layers.gather(sequences, topk_indexs)
                topk_flags = layers.gather(flags, topk_indexs)
                topk_gather_scores = layers.gather(scores_to_gather, topk_indexs)
                if select_beam:
                    topk_beam = layers.gather(select_beam, topk_indexs)
                else:
                    topk_beam = select_beam
                if generate_id:
                    topk_id = layers.gather(generate_id, topk_indexs)
                else:
                    topk_id = generate_id
                return top_seq, topk_gather_scores, topk_flags, topk_beam, topk_id

            def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished,
                           select_beam, generate_id):
                # Finished candidates must not stay alive: push their score
                # to -INF before reselecting the beam.
                curr_scores += curr_finished * -INF
                return compute_topk_scores_and_seq(curr_seq, curr_scores,
                                                   curr_log_probs, curr_finished,
                                                   beam_size, select_beam,
                                                   generate_id=generate_id)

            def grow_finished(finished_seq, finished_scores, finished_flag,
                              curr_seq, curr_scores, curr_finished):
                # Pad previously-finished sequences by one token so lengths
                # match the newly grown candidates before concatenation.
                finished_seq = layers.concat([
                    finished_seq,
                    layers.fill_constant([beam_size, 1], dtype='int64', value=1)
                ], axis=1)
                # Unfinished candidates cannot enter the finished pool.
                curr_scores += (1.0 - curr_finished) * -INF
                #layers.Print( curr_scores, message="curr scores")
                curr_finished_seq = layers.concat([finished_seq, curr_seq],
                                                  axis=0)
                curr_finished_scores = layers.concat(
                    [finished_scores, curr_scores], axis=0)
                curr_finished_flags = layers.concat(
                    [finished_flag, curr_finished], axis=0)
                return compute_topk_scores_and_seq(curr_finished_seq,
                                                   curr_finished_scores,
                                                   curr_finished_scores,
                                                   curr_finished_flags,
                                                   beam_size)

            def is_finished(alive_log_prob, finished_scores,
                            finished_in_finished):
                # Early-stop test (TF-style): compare the best achievable
                # alive score (under the max length penalty) with the worst
                # finished score.  Loop continues while finish_cond is True.
                max_out_len = 200
                max_length_penalty = layers.pow(
                    layers.fill_constant([1], dtype='float32',
                                         value=((5.0 + max_out_len) / 6.0)),
                    alpha)
                lower_bound_alive_score = layers.slice(
                    alive_log_prob, starts=[0], ends=[1],
                    axes=[0]) / max_length_penalty
                lowest_score_of_fininshed_in_finished = finished_scores * finished_in_finished
                lowest_score_of_fininshed_in_finished += (
                    1.0 - finished_in_finished) * -INF
                lowest_score_of_fininshed_in_finished = layers.reduce_min(
                    lowest_score_of_fininshed_in_finished)
                met = layers.less_than(lower_bound_alive_score,
                                       lowest_score_of_fininshed_in_finished)
                met = layers.cast(met, 'float32')
                # NOTE(review): bound_is_met is computed but not used in the
                # returned condition below — TODO confirm intended.
                bound_is_met = layers.reduce_sum(met)
                finished_eos_num = layers.reduce_sum(finished_in_finished)
                finish_cond = layers.less_than(
                    finished_eos_num,
                    layers.fill_constant([1], dtype='float32', value=beam_size))
                return finish_cond

            def grow_top_k(step_idx, alive_seq, alive_log_prob, parant_idx):
                # One decode step for every alive hypothesis; returns the
                # beam_size best continuations.  ("parant_idx" keeps the
                # original, typo'd parameter name; it is unused here.)
                pre_ids = alive_seq
                dec_step_emb = layers.embedding(
                    input=pre_ids,
                    size=[self.tar_vocab_size, self.hidden_size],
                    dtype='float32',
                    is_sparse=False,
                    param_attr=fluid.ParamAttr(
                        name='target_embedding',
                        initializer=fluid.initializer.UniformInitializer(
                            low=-self.init_scale, high=self.init_scale)))
                dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                    dec_step_emb, pre_hidden_array, pre_cell_array)
                projection = layers.matmul(dec_att_out, softmax_weight)
                logits = layers.softmax(projection)
                # Accumulate log-probabilities per beam (broadcast on axis 0).
                current_log = layers.elementwise_add(x=layers.log(logits),
                                                     y=alive_log_prob, axis=0)
                # GNMT-style length penalty ((step + 6) / 6) ** alpha.
                base_1 = layers.cast(step_idx, 'float32') + 6.0
                base_1 /= 6.0
                length_penalty = layers.pow(base_1, alpha)
                # NOTE(review): len_pen is computed but never used.
                len_pen = layers.pow(
                    ((5. + layers.cast(step_idx + 1, 'float32')) / 6.), alpha)
                # Flatten (beam, vocab) so topk runs over all continuations.
                current_log = layers.reshape(current_log, shape=[1, -1])
                current_log = current_log / length_penalty
                topk_scores, topk_indices = layers.topk(input=current_log,
                                                        k=beam_size)
                topk_scores = layers.reshape(topk_scores, shape=[-1])
                # Undo the penalty to keep raw log-probs for the next step.
                topk_log_probs = topk_scores * length_penalty
                # Flat index -> (beam row, vocab id).
                generate_id = layers.reshape(topk_indices,
                                             shape=[-1]) % self.tar_vocab_size
                selected_beam = layers.reshape(
                    topk_indices, shape=[-1]) // self.tar_vocab_size
                topk_finished = layers.equal(generate_id, eos_ids)
                topk_finished = layers.cast(topk_finished, 'float32')
                generate_id = layers.reshape(generate_id, shape=[-1, 1])
                # Extend the surviving prefixes with the new tokens.
                pre_tokens_list = layers.gather(tokens, selected_beam)
                full_tokens_list = layers.concat(
                    [pre_tokens_list, generate_id], axis=1)
                return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \
                    dec_att_out, new_hidden_array, new_cell_array

            with while_op.block():
                topk_seq, topk_log_probs, topk_scores, topk_finished, topk_beam, topk_generate_id, attention_out, new_hidden_array, new_cell_array = \
                    grow_top_k(step_idx, pre_tokens, pre_score, parent_idx)
                alive_seq, alive_log_prob, _, alive_beam, alive_id = grow_alive(
                    topk_seq, topk_scores, topk_log_probs, topk_finished,
                    topk_beam, topk_generate_id)
                finished_seq_2, finished_scores_2, finished_flags_2, _, _ = grow_finished(
                    finished_seq, finished_scores, finished_flag, topk_seq,
                    topk_scores, topk_finished)
                finished_cond = is_finished(alive_log_prob, finished_scores_2,
                                            finished_flags_2)
                layers.increment(x=step_idx, value=1.0, in_place=True)
                # Persist loop-carried state via in-place assigns — the
                # While op requires writing back into the same variables.
                layers.assign(alive_beam, parent_idx)
                layers.assign(alive_id, pre_tokens)
                layers.assign(alive_log_prob, pre_score)
                layers.assign(alive_seq, tokens)
                layers.assign(finished_seq_2, finished_seq)
                layers.assign(finished_scores_2, finished_scores)
                layers.assign(finished_flags_2, finished_flag)
                # update init_hidden, init_cell, input_feed
                new_feed = layers.gather(attention_out, parent_idx)
                layers.assign(new_feed, pre_feed)
                for i in range(self.num_layers):
                    new_hidden_var = layers.gather(new_hidden_array[i],
                                                   parent_idx)
                    layers.assign(new_hidden_var, pre_hidden_array[i])
                    new_cell_var = layers.gather(new_cell_array[i], parent_idx)
                    layers.assign(new_cell_var, pre_cell_array[i])
                # Continue while under max length AND not early-stopped.
                length_cond = layers.less_than(x=step_idx, y=max_length)
                layers.logical_and(x=length_cond, y=finished_cond, out=cond)

            # Merge alive and finished pools and keep the global top beam.
            tokens_with_eos = tokens
            all_seq = layers.concat([tokens_with_eos, finished_seq], axis=0)
            all_score = layers.concat([pre_score, finished_scores], axis=0)
            _, topk_index = layers.topk(all_score, k=beam_size)
            topk_index = layers.reshape(topk_index, shape=[-1])
            final_seq = layers.gather(all_seq, topk_index)
            final_score = layers.gather(all_score, topk_index)
            return final_seq
        elif mode == 'greedy_search':
            max_src_seq_len = layers.shape(self.src)[1]
            max_length = max_src_seq_len * 2
            #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
            # pre_ids holds the previous token; full_ids accumulates output.
            pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
            full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
            score = layers.fill_constant([1], dtype='float32', value=0.0)
            eos_ids = layers.fill_constant([1, 1], dtype='int64', value=2)
            pre_hidden_array = []
            pre_cell_array = []
            # NOTE(review): pre_feed is written each step but never read.
            pre_feed = layers.fill_constant([1, self.hidden_size],
                                            dtype='float32', value=0)
            for i in range(self.num_layers):
                pre_hidden_array.append(enc_last_hidden[i])
                pre_cell_array.append(enc_last_cell[i])
                #pre_hidden_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0) )
                #pre_cell_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0) )
            step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0)
            cond = layers.less_than(x=step_idx,
                                    y=max_length)  # default force_cpu=True
            while_op = layers.While(cond)
            with while_op.block():
                dec_step_emb = layers.embedding(
                    input=pre_ids,
                    size=[self.tar_vocab_size, self.hidden_size],
                    dtype='float32',
                    is_sparse=False,
                    param_attr=fluid.ParamAttr(
                        name='target_embedding',
                        initializer=fluid.initializer.UniformInitializer(
                            low=-self.init_scale, high=self.init_scale)))
                dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                    dec_step_emb, pre_hidden_array, pre_cell_array)
                projection = layers.matmul(dec_att_out, softmax_weight)
                logits = layers.softmax(projection)
                logits = layers.log(logits)
                # Accumulate the running log-probability.
                current_log = layers.elementwise_add(logits, score, axis=0)
                # k=1: greedy — always take the single best token.
                topk_score, topk_indices = layers.topk(input=current_log, k=1)
                new_ids = layers.concat([full_ids, topk_indices])
                layers.assign(new_ids, full_ids)
                #layers.Print( full_ids, message="ful ids")
                layers.assign(topk_score, score)
                layers.assign(topk_indices, pre_ids)
                layers.assign(dec_att_out, pre_feed)
                for i in range(self.num_layers):
                    layers.assign(new_hidden_array[i], pre_hidden_array[i])
                    layers.assign(new_cell_array[i], pre_cell_array[i])
                layers.increment(x=step_idx, value=1.0, in_place=True)
                # Continue while the emitted token is not <eos> and the
                # length budget is not exhausted.
                eos_met = layers.not_equal(topk_indices, eos_ids)
                length_cond = layers.less_than(x=step_idx, y=max_length)
                layers.logical_and(x=length_cond, y=eos_met, out=cond)
            return full_ids
        # Unreachable given the enclosing elif condition.
        raise Exception("error")
    else:
        print("mode not supprt", mode)
def decoder_decode(context, is_sparse):
    """Build a beam-search decoding graph (PaddlePaddle fluid, LoD-based).

    Args:
        context: initial decoder state (encoder context), written as the
            first element of the state tensor array.
        is_sparse: whether the id embedding lookup uses sparse update.

    Returns:
        (translation_ids, translation_scores): full beam-search results
        resolved by ``pd.beam_search_decode``.

    NOTE(review): relies on enclosing-scope names — ``pd`` (fluid.layers),
    ``max_length``, ``dict_size``, ``word_dim``, ``decoder_size``,
    ``target_dict_dim``, ``beam_size``.  ``end_id=10`` is hard-coded;
    presumably the dataset's <e> token id — TODO confirm.
    """
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # fill the first element with init_state
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)

    while_op = pd.While(cond=cond)
    with while_op.block():
        # Read the previous step's ids / state / scores.
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # expand the recursive_sequence_lengths of pre_state to be the same
        # with pre_score
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(
            input=pre_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=is_sparse)

        # use rnn unit to update rnn
        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
                              size=decoder_size,
                              act='tanh')
        # Re-attach pre_score's LoD so beam_search can track branches.
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
        # use score to do beam search
        current_score = pd.fc(input=current_state_with_lod,
                              size=target_dict_dim,
                              act='softmax')
        topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
        # calculate accumulated scores after topk to reduce computation cost
        accu_scores = pd.elementwise_add(
            x=pd.log(topk_scores),
            y=pd.reshape(pre_score, shape=[-1]),
            axis=0)
        selected_ids, selected_scores = pd.beam_search(
            pre_ids,
            pre_score,
            topk_indices,
            accu_scores,
            beam_size,
            end_id=10,
            level=0)

        pd.increment(x=counter, value=1, in_place=True)

        # update the memories
        pd.array_write(current_state, array=state_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)

        # update the break condition: up to the max length or all candidates of
        # source sentences have ended.
        length_cond = pd.less_than(x=counter, y=array_len)
        finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
        pd.logical_and(x=length_cond, y=finish_cond, out=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)

    # return init_ids, init_scores
    return translation_ids, translation_scores
def beam_search():
    """Beam search function.

    Builds a ``layers.While`` beam-search decoding loop for a transformer
    decoder with word- and sentence-level encoder attention.  Features:
      * minimum-length enforcement (EOS logit forced to -INF before min_len);
      * GNMT length penalty with roll-back of the previous step's penalty;
      * optional trigram blocking via a py_func callback;
      * per-layer self-attention K/V caches gathered by parent index.

    Returns:
        (finished_ids, finished_scores) resolved by
        ``layers.beam_search_decode``.

    NOTE(review): closes over many enclosing-scope names — ``start_tokens``,
    ``init_scores``, ``parent_idx``, ``enc_words_output``, ``enc_sents_output``,
    ``tgt_src_words_attn_bias``, ``tgt_src_sents_attn_bias``,
    ``graph_attn_bias`` — all assumed defined by the surrounding builder.
    """
    max_len = layers.fill_constant(
        shape=[1],
        dtype=start_tokens.dtype,
        value=self.max_out_len,
        force_cpu=True)
    min_len = layers.fill_constant(
        shape=[1], dtype=start_tokens.dtype, value=self.min_out_len)
    neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-INF)
    # step_idx: current step; step_next_idx: step_idx + 1, kept separately
    # so the current-timestep length penalty can be computed up front.
    step_idx = layers.fill_constant(
        shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True)
    step_next_idx = layers.fill_constant(
        shape=[1], dtype=start_tokens.dtype, value=1, force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)  # default force_cpu=True
    while_op = layers.While(cond)
    # array states will be stored for each step.
    ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)), step_idx)
    scores = layers.array_write(init_scores, step_idx)
    # cell states will be overwrited at each step.
    # caches contains states of history steps in decoder self-attention
    # and static encoder output projections in encoder-decoder attention
    # to reduce redundant computation.
    caches = [
        {
            "k":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                dtype=enc_words_output.dtype,
                value=0),
            "v":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                dtype=enc_words_output.dtype,
                value=0),
            "static_k_word":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_words_output.dtype),
            "static_v_word":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_words_output.dtype),
            "static_k_sent":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_sents_output.dtype),
            "static_v_sent":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_sents_output.dtype)
        } for i in range(self._dec_n_layer)
    ]
    trigram_blocking = TrigramBlocking(
        start_tokens,
        self.tokenizer,
        use_fp16=self._use_fp16,
        beam_size=self.beam_size)
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        # Since beam_search_op dosen't enforce pre_ids' shape, we can do
        # inplace reshape here which actually change the shape of pre_ids.
        # pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # gather cell states corresponding to selected parent
        pre_src_words_attn_bias = layers.gather(
            tgt_src_words_attn_bias, index=parent_idx)
        pre_src_sents_attn_bias = layers.gather(
            tgt_src_sents_attn_bias, index=parent_idx)
        pre_graph_attn_bias = layers.gather(graph_attn_bias, index=parent_idx)
        # Position of the token being generated = step_idx, broadcast
        # batch-wise from a constant-1 tensor.
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_src_sents_attn_bias,  # cann't use lod tensor here
                value=1,
                shape=[-1, 1, 1],
                dtype=pre_ids.dtype),
            y=step_idx,
            axis=0)
        logits = self.decode(
            dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias,
                       pre_src_sents_attn_bias, pre_graph_attn_bias),
            enc_words_output=enc_words_output,
            enc_sents_output=enc_sents_output,
            caches=caches,
            gather_idx=parent_idx)
        # prevent generating end token if length less than min_out_len
        eos_index = layers.fill_constant(
            shape=[layers.shape(logits)[0]],
            dtype='int64',
            value=self.eos_idx)
        eos_index = fluid.one_hot(eos_index, depth=self.voc_size)
        less_cond = layers.cast(
            layers.less_than(x=step_idx, y=min_len), dtype='float32')
        less_val = layers.elementwise_mul(less_cond, neg_inf)
        eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
        revised_logits = layers.elementwise_add(logits, eos_val, axis=0)
        # topK reduction across beams, also contain special handle of
        # end beams and end sentences(batch reduction)
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(revised_logits), k=self.beam_size)
        # Roll-Back previous-scores for length-penalty
        # previous-scores has been length-penaltied, before this timestep
        # length-penalty, need roll-back
        # because of doing this, we need store the length-penaltied score in
        # `scores`
        # while calculating use the un-penaltied score
        # -> safe for step_idx == 0 (initialization state), because
        # previous-score == 0
        pre_timestep_length_penalty = fluid.layers.pow(
            ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0),
            self.len_penalty)
        pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
            pre_scores, pre_timestep_length_penalty)
        # calc trigram-blocking delta scores for current alive sequence
        if self.block_trigram:
            trigram_blocking.update_seq(pre_ids, parent_idx)
            trigram_blocking.expand_cand_seq(topk_indices)
            fluid.layers.py_func(
                func=trigram_blocking.blocking_forward,
                x=[
                    trigram_blocking.cand_seq,
                    trigram_blocking.id2is_full_token
                ],
                out=trigram_blocking.delta_score_out,
                backward_func=None)
            # NOTE(review): debug Print left in — consider removing for
            # production graphs.
            layers.Print(trigram_blocking.delta_score_out,
                         summarize=100,
                         message="trigram_blocking.delta_score_out")
            pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                x=trigram_blocking.delta_score_out,
                y=pre_scores_wo_len_penalty,
                axis=0)
        # => [N, topk]
        accu_scores = layers.elementwise_add(
            x=layers.log(topk_scores), y=pre_scores_wo_len_penalty, axis=0)
        cur_timestep_length_penalty = layers.pow(
            ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0),
            self.len_penalty)
        curr_scores = layers.elementwise_div(accu_scores,
                                             cur_timestep_length_penalty)
        # beam_search op uses lod to differentiate branches.
        curr_scores = layers.lod_reset(curr_scores, pre_ids)
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=curr_scores,
            beam_size=self.beam_size,
            end_id=self.eos_idx,
            return_parent_idx=True)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.increment(x=step_next_idx, value=1.0, in_place=True)
        # cell states(caches) have been updated in wrap_decoder,
        # only need to update beam search states here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(gather_idx, parent_idx)
        layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias)
        layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias)
        layers.assign(pre_graph_attn_bias, graph_attn_bias)
        # Continue while under max length and any beam is still alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)
    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)
    return finished_ids, finished_scores
def __call__(self, step_fn, state):
    """
    Running beam search.

    @param : step_fn : decoding one step
    @type : function

    @param : state : initial state
    @type : dict

    Returns a dict with:
        "preds"  — the best sequence per batch item, shape
                   [batch_size, seq_len] (last beam after ascending sort);
        "scores" — the corresponding sequence scores, shape [batch_size].
    """
    batch_size = state["batch_size"]
    beam_size = self.beam_size

    # shape: [batch_size, 1]
    # Flat offsets (i * beam_size) used to index into the flattened
    # [batch * beam] dimension when gathering.
    pos_index = layers.range(0, batch_size, 1, dtype="int64")
    pos_index = layers.scale(pos_index, beam_size)
    pos_index = F.unsqueeze(pos_index, [1])

    # shape: [batch_size, beam_size, 1]
    predictions = layers.fill_constant(
        shape=[batch_size, beam_size, 1], dtype="int64", value=self.bos_id)

    # initial input
    state["pred_token"] = predictions[:, :1]
    # shape: [batch_size, vocab_size]
    scores, state = step_fn(state)

    # Additive penalty vectors: -1e10 on one vocab slot, 0 elsewhere.
    unk_penalty = np.zeros(self.vocab_size, dtype="float32")
    unk_penalty[self.unk_id] = -1e10
    unk_penalty = layers.assign(unk_penalty)

    eos_penalty = np.zeros(self.vocab_size, dtype="float32")
    eos_penalty[self.eos_id] = -1e10
    eos_penalty = layers.assign(eos_penalty)

    # After a sequence has ended, only PAD may follow (score 0); every
    # other token gets -1e10.
    scores_after_end = np.full(self.vocab_size, -1e10, dtype="float32")
    scores_after_end[self.pad_id] = 0
    scores_after_end = layers.assign(scores_after_end)

    if self.ignore_unk:
        scores = scores + unk_penalty
    # Step 1 can never emit EOS.
    scores = scores + eos_penalty

    # shape: [batch_size, beam_size]
    sequence_scores, preds = layers.topk(scores, self.beam_size)

    predictions = layers.concat(
        [predictions, F.unsqueeze(preds, [2])], axis=2)

    # Replicate decoder state across the beam.
    state = repeat(state, beam_size)

    # NOTE(review): both lists below are never used — dead code.
    parent_idx_list = []
    pred_list = []

    for step in range(2, self.max_gen_len + 1):
        pre_ids = predictions[:, :, -1:]
        state["pred_token"] = layers.reshape(
            pre_ids, shape=[batch_size * beam_size, 1, 1])
        state["pred_mask"] = 1 - F.equal(state["pred_token"], self.pad_id)
        state["pred_pos"] = state["pred_pos"] + 1
        scores, state = step_fn(state)

        # Generate next
        # scores shape: [batch_size, beam_size, vocab_size]
        if self.ignore_unk:
            scores = scores + unk_penalty

        if step <= self.min_gen_len:
            scores = scores + eos_penalty

        scores = layers.reshape(
            scores, shape=[batch_size, beam_size, self.vocab_size])

        # previous token is [PAD] or [EOS]
        pre_eos_mask = F.equal(pre_ids, self.eos_id) + F.equal(
            pre_ids, self.pad_id)

        # Ended beams can only continue with PAD (scores_after_end).
        scores = scores * (1 - pre_eos_mask) + \
            layers.expand(pre_eos_mask, [1, 1, self.vocab_size]) * scores_after_end
        if self.length_average:
            # Running-average scoring: old sum scaled by (1 - 1/step),
            # new token contribution scaled by 1/step; ended beams
            # (mask == 1) are left untouched.
            scaled_value = pre_eos_mask + (1 - pre_eos_mask) * (1 - 1 / step)
            sequence_scores = F.unsqueeze(sequence_scores, [2]) * scaled_value
            scaled_value = pre_eos_mask + (1 - pre_eos_mask) * (1 / step)
            scores = scores * scaled_value
        elif self.length_penalty >= 0.0:
            # GNMT-style penalty ratio between consecutive steps.
            scaled_value = pre_eos_mask + (1 - pre_eos_mask) * \
                (math.pow((4 + step) / (5 + step), self.length_penalty))
            sequence_scores = layers.elementwise_mul(scaled_value,
                                                     sequence_scores,
                                                     axis=0)
            scaled_value = pre_eos_mask + (1 - pre_eos_mask) * \
                (math.pow(1 / (5 + step), self.length_penalty))
            scores = scores * scaled_value
        scores = layers.elementwise_add(scores, sequence_scores, axis=0)
        scores = layers.reshape(
            scores, shape=[batch_size, beam_size * self.vocab_size])

        # Select the beam_size best continuations over (beam, vocab).
        topk_scores, topk_indices = layers.topk(scores, beam_size)
        vocab_size = layers.fill_constant(
            shape=[1], dtype="int64", value=self.vocab_size)
        # Flat index -> (parent beam, token id).
        parent_idx = layers.elementwise_floordiv(topk_indices, vocab_size)
        preds = layers.elementwise_mod(topk_indices, vocab_size)

        # Gather state / sequence_scores
        parent_idx = layers.elementwise_add(parent_idx, pos_index, axis=0)
        parent_idx = layers.reshape(parent_idx, [batch_size * beam_size])
        state = gather(state, parent_idx)
        sequence_scores = topk_scores

        # Reorder prefixes by parent beam, then append the new tokens.
        predictions = layers.reshape(predictions,
                                     shape=[batch_size * beam_size, step])
        predictions = gather(predictions, parent_idx)
        predictions = layers.reshape(predictions,
                                     shape=[batch_size, beam_size, step])
        predictions = layers.concat(
            [predictions, F.unsqueeze(preds, [2])], axis=2)

    # Keep only beams that actually terminated (last token EOS/PAD);
    # unterminated beams are pushed to -1e10.
    pre_ids = predictions[:, :, -1]
    pre_eos_mask = F.equal(pre_ids, self.eos_id) + F.equal(
        pre_ids, self.pad_id)
    sequence_scores = sequence_scores * pre_eos_mask + layers.scale(
        1 - pre_eos_mask, -1e10)

    # Sort beams ascending by score; the best ends up at index -1.
    _, indices = layers.argsort(sequence_scores, axis=1)
    indices = indices + pos_index
    indices = layers.reshape(indices, [-1])
    sequence_scores = layers.reshape(sequence_scores,
                                     [batch_size * beam_size])
    predictions = layers.reshape(predictions, [batch_size * beam_size, -1])
    sequence_scores = gather(sequence_scores, indices)
    predictions = layers.gather(predictions, indices)
    sequence_scores = layers.reshape(sequence_scores,
                                     [batch_size, beam_size])
    predictions = layers.reshape(predictions, [batch_size, beam_size, -1])

    results = {
        "preds": predictions[:, -1],
        "scores": sequence_scores[:, -1]
    }
    return results
def _get_fine_grained_loss(self, outputs, targets, gt_box, num_classes,
                           mask_anchors, ignore_thresh, eps=1.e-10):
    """
    Calculate fine grained YOLOv3 loss

    Args:
        outputs ([Variables]): List of Variables, output of backbone stages
        targets ([Variables]): List of Variables, The targets for yolo
                               loss calculatation.
        gt_box (Variable): The ground-truth boudding boxes.
        num_classes (int): class num of dataset
        mask_anchors ([[float]]): list of anchors in each output layer
        ignore_thresh (float): prediction bbox overlap any gt_box greater
                               than ignore_thresh, objectness loss will be
                               ignored.

    Returns:
        Type: dict
            xy_loss (Variable): YOLOv3 (x, y) coordinates loss
            wh_loss (Variable): YOLOv3 (w, h) coordinates loss
            obj_loss (Variable): YOLOv3 objectness score loss
            cls_loss (Variable): YOLOv3 classification loss

    NOTE(review): the iou-aware branch appends nothing to
    ``loss_iou_awares`` (that code is commented out below), so when
    ``self._iou_aware_loss is not None`` the final
    ``fluid.layers.sum(loss_iou_awares)`` runs on an empty list —
    verify before enabling iou-aware loss.
    """
    assert len(outputs) == len(targets), \
        "YOLOv3 output layer number not equal target number"

    batch_size = gt_box.shape[0]
    loss_xys, loss_whs, loss_objs, loss_clss = [], [], [], []
    loss_ious = []
    if self._iou_aware_loss is not None:
        loss_iou_awares = []
    for i, (output, target,
            anchors) in enumerate(zip(outputs, targets, mask_anchors)):
        downsample = self.downsample[i]
        an_num = len(anchors) // 2
        # scale_x_y may be a per-layer sequence or a single scalar.
        scale_x_y = self.scale_x_y if not isinstance(
            self.scale_x_y, Sequence) else self.scale_x_y[i]

        target = L.transpose(
            target,
            perm=[0, 3, 4, 1, 2])  # [N, 3, 86, 13, 13] -> [N, 13, 13, 3, 86]
        output = L.transpose(
            output, perm=[0, 2, 3, 1])  # [N, 255, 13, 13] -> [N, 13, 13, 255]
        anchors = np.array(anchors).astype(np.float32)
        anchors = np.reshape(anchors, (-1, 2))

        # split output
        conv_shape = output.shape
        n_grid = conv_shape[1]
        conv_output = L.reshape(
            output, (batch_size, n_grid, n_grid, an_num, 5 + num_classes))
        x = conv_output[:, :, :, :, 0]  # (8, 13, 13, 3)
        y = conv_output[:, :, :, :, 1]  # (8, 13, 13, 3)
        w = conv_output[:, :, :, :, 2]  # (8, 13, 13, 3)
        h = conv_output[:, :, :, :, 3]  # (8, 13, 13, 3)
        conv_raw_conf = conv_output[:, :, :, :, 4]  # (8, 13, 13, 3)
        conv_raw_prob = conv_output[:, :, :, :, 5:]  # (8, 13, 13, 3, 80)
        pred_conf = L.sigmoid(conv_raw_conf)  # (8, 13, 13, 3)
        pred_prob = L.sigmoid(conv_raw_prob)  # (8, 13, 13, 3, 80)

        # split target
        tx = target[:, :, :, :, 0]  # (8, 13, 13, 3)
        ty = target[:, :, :, :, 1]  # (8, 13, 13, 3)
        tw = target[:, :, :, :, 2]  # (8, 13, 13, 3)
        th = target[:, :, :, :, 3]  # (8, 13, 13, 3)
        tobj = target[:, :, :, :, 4]  # (8, 13, 13, 3)
        tscale = target[:, :, :, :, 5]  # (8, 13, 13, 3)
        label_prob = target[:, :, :, :, 6:]  # (8, 13, 13, 3, 80)

        tscale_tobj = tscale * tobj  # (8, 13, 13, 3)

        # loss
        if (abs(scale_x_y - 1.0) < eps):
            # Plain YOLOv3: BCE-with-logits on the raw x/y offsets.
            loss_x = fluid.layers.sigmoid_cross_entropy_with_logits(
                x, tx) * tscale_tobj
            loss_x = fluid.layers.reduce_sum(loss_x, dim=[1, 2, 3])
            loss_y = fluid.layers.sigmoid_cross_entropy_with_logits(
                y, ty) * tscale_tobj
            loss_y = fluid.layers.reduce_sum(loss_y, dim=[1, 2, 3])
        else:
            # Grid-sensitive decode (scale_x_y != 1): L1 on decoded offsets.
            dx = scale_x_y * fluid.layers.sigmoid(x) - 0.5 * (scale_x_y - 1.0)
            dy = scale_x_y * fluid.layers.sigmoid(y) - 0.5 * (scale_x_y - 1.0)
            loss_x = fluid.layers.abs(dx - tx) * tscale_tobj
            loss_x = fluid.layers.reduce_sum(loss_x, dim=[1, 2, 3])
            loss_y = fluid.layers.abs(dy - ty) * tscale_tobj
            loss_y = fluid.layers.reduce_sum(loss_y, dim=[1, 2, 3])

        # NOTE: we refined loss function of (w, h) as L1Loss
        loss_w = fluid.layers.abs(w - tw) * tscale_tobj
        loss_w = fluid.layers.reduce_sum(loss_w, dim=[1, 2, 3])
        loss_h = fluid.layers.abs(h - th) * tscale_tobj
        loss_h = fluid.layers.reduce_sum(loss_h, dim=[1, 2, 3])

        # iou_loss
        # loss_iou = self._iou_loss(x, y, w, h, tx, ty, tw, th, anchors,
        #                           downsample, batch_size,
        #                           scale_x_y)
        # loss_iou = loss_iou * tscale_tobj
        # loss_iou = fluid.layers.reduce_sum(loss_iou, dim=[1, 2, 3])
        # loss_ious.append(fluid.layers.reduce_mean(loss_iou))

        # if self._iou_aware_loss is not None:
        #     loss_iou_aware = self._iou_aware_loss(
        #         ioup, x, y, w, h, tx, ty, tw, th, anchors, downsample,
        #         batch_size, scale_x_y)
        #     loss_iou_aware = loss_iou_aware * tobj
        #     loss_iou_aware = fluid.layers.reduce_sum(
        #         loss_iou_aware, dim=[1, 2, 3])
        #     loss_iou_awares.append(fluid.layers.reduce_mean(loss_iou_aware))

        pred_xywh = self._decode(x, y, w, h, anchors, downsample, scale_x_y,
                                 eps)  # (8, 13, 13, 3, 4)
        label_xywh = self._decode(tx, ty, tw, th, anchors, downsample,
                                  scale_x_y, eps, True)  # (8, 13, 13, 3, 4)

        x_shape = x.shape  # (8, 13, 13, 3)
        output_size = x_shape[1]

        ciou = bbox_ciou(pred_xywh, label_xywh)  # (8, 13, 13, 3)
        # Per-prediction weight for the ciou loss:
        # tscale = 2 - (gt-box area / image area), so small objects get
        # a larger weight.
        ciou_loss = tscale_tobj * (1 - ciou
                                   )  # 1. tobj masks the loss so only cells
                                      #    that contain an object contribute.
        # 2. Same mask idea for the class loss below.
        prob_pos_loss = label_prob * (0 - L.log(pred_prob + 1e-9)
                                      )  # binary cross-entropy; a tiny
                                         # constant avoids log(0) -> NaN
                                         # (same trick as in TF impls).
        prob_neg_loss = (1 - label_prob) * (0 - L.log(1 - pred_prob + 1e-9)
                                            )  # binary cross-entropy with the
                                               # same epsilon guard.
        tobj = L.unsqueeze(tobj, 4)  # (8, 13, 13, 3, 1)
        prob_mask = L.expand(tobj, [1, 1, 1, 1, num_classes])
        prob_loss = prob_mask * (prob_pos_loss + prob_neg_loss)

        # 3. The iou and class losses are simple.  The important one is
        # conf_loss — a binary cross-entropy done in two stages:
        # first decide which of the grid_h * grid_w * 3 predicted boxes
        # count as negatives, then compute the BCE.
        expand_pred_xywh = L.reshape(
            pred_xywh, (batch_size, output_size, output_size, 3, 1,
                        4))  # expand to (?, grid_h, grid_w, 3, 1, 4)
        # gt_box is in cx_cy_w_h format.
        expand_bboxes = L.reshape(
            gt_box, (batch_size, 1, 1, 1, L.shape(gt_box)[1],
                     4))  # expand to (?, 1, 1, 1, 70, 4)
        iou = bbox_iou(
            expand_pred_xywh, expand_bboxes
        )  # IoU of every one of the 3 boxes per cell against each of the
           # 70 ground truths. (?, grid_h, grid_w, 3, 70)
        max_iou, max_iou_indices = L.topk(
            iou, k=1
        )  # keep only the largest of the 70 IoUs. (?, grid_h, grid_w, 3, 1)

        # respond_bgd marks which of the grid_h * grid_w * 3 predictions are
        # negatives (background):
        # - labeled cells (tobj == 1): respond_bgd = 0;
        # - unlabeled cells: respond_bgd = 0 if the best gt IoU exceeds
        #   ignore_thresh (treated as "ignore"), else 1 (true negative).
        # Interestingly, because the model keeps updating during training,
        # the same image yields different predicted boxes on different
        # passes, and it is these *predicted* boxes (not fixed priors) that
        # are matched against gt to pick the negatives.
        respond_bgd = (1.0 - tobj) * L.cast(max_iou < self._ignore_thresh,
                                            'float32')

        # binary cross-entropy for objectness
        pred_conf = L.unsqueeze(pred_conf, 4)  # (8, 13, 13, 3, 1)
        pos_loss = tobj * (0 - L.log(pred_conf + 1e-9))
        neg_loss = respond_bgd * (0 - L.log(1 - pred_conf + 1e-9))

        conf_loss = pos_loss + neg_loss
        # Recap: a prediction whose IoU with some gt exceeds ignore_thresh
        # is not a negative; if the label also does not mark it positive,
        # it simply does not participate in the objectness loss at all
        # (the "ignore" case from the paper).

        ciou_loss = L.reduce_sum(ciou_loss) / batch_size
        conf_loss = L.reduce_sum(conf_loss) / batch_size
        prob_loss = L.reduce_sum(prob_loss) / batch_size

        loss_ious.append(ciou_loss)
        loss_objs.append(conf_loss)
        loss_clss.append(prob_loss)
        loss_xys.append(fluid.layers.reduce_mean(loss_x + loss_y))
        loss_whs.append(fluid.layers.reduce_mean(loss_w + loss_h))

    losses_all = {
        "loss_xy": fluid.layers.sum(loss_xys),
        "loss_wh": fluid.layers.sum(loss_whs),
        "loss_obj": fluid.layers.sum(loss_objs),
        "loss_cls": fluid.layers.sum(loss_clss),
        "loss_iou": fluid.layers.sum(loss_ious),
    }
    if self._iou_aware_loss is not None:
        losses_all["loss_iou_aware"] = fluid.layers.sum(loss_iou_awares)
    return losses_all
def decoder_decode(context, is_sparse):
    """Build the beam-search decoding graph.

    Starting from the encoder `context`, repeatedly runs one decoder step
    inside a `pd.While` loop (up to `max_length` steps), keeping candidate
    ids/scores in LoD tensor arrays, and finally resolves the beams with
    `pd.beam_search_decode`.

    Args:
        context: encoder output tensor used as the initial decoder state.
        is_sparse: whether the embedding lookup uses a sparse gradient.

    Returns:
        (translation_ids, translation_scores): decoded token ids and their
        scores, as produced by `pd.beam_search_decode`.

    NOTE(review): relies on enclosing-scope names `max_length`, `dict_size`,
    `word_dim`, `decoder_size`, `target_dict_dim`, `beam_size` — presumably
    closed over from the surrounding model-definition function.
    """
    init_state = context
    # Loop bound and step counter (counter lives on CPU for the While cond).
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # State memory: slot 0 holds the initial decoder state.
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # Ids and scores memories; slot 0 holds the externally-fed seeds.
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    # Loop while counter < max_length.
    cond = pd.less_than(x=counter, y=array_len)
    while_op = pd.While(cond=cond)
    with while_op.block():
        # Read previous step's ids, state, and scores.
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # Expand the lod of pre_state to be the same as pre_score, so each
        # surviving beam candidate gets its own copy of the state.
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(
            input=pre_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=is_sparse)

        # One RNN step: fc over [expanded state, input embedding] with tanh.
        current_state = pd.fc(
            input=[pre_state_expanded, pre_ids_emb],
            size=decoder_size,
            act='tanh')
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
        # Project to vocabulary distribution and run one beam-search step.
        current_score = pd.fc(
            input=current_state_with_lod, size=target_dict_dim, act='softmax')
        topk_scores, topk_indices = pd.topk(current_score, k=50)
        selected_ids, selected_scores = pd.beam_search(
            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)

        pd.increment(x=counter, value=1, in_place=True)

        # Write this step's state/ids/scores into the next slot.
        pd.array_write(current_state, array=state_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)

        # Refresh the While condition in place.
        pd.less_than(x=counter, y=array_len, cond=cond)

    # Backtrack through the arrays to recover full beam hypotheses.
    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array)

    return translation_ids, translation_scores
def loss_layer(conv, pred, label, bboxes, stride, num_class, iou_loss_thresh):
    """Compute the per-branch YOLO losses: CIoU, confidence, and class.

    Args:
        conv: raw head output; only its shape is used (batch, grid size).
        pred: decoded predictions, last dim = [x, y, w, h, conf, classes...].
        label: targets, last dim = [x, y, w, h, objectness, classes...].
        bboxes: ground-truth boxes in cx-cy-w-h format, (batch, num_gt, 4).
        stride: downsample factor of this branch.
        num_class: number of object classes.
        iou_loss_thresh: IoU threshold below which a no-object prediction
            counts as a negative for the confidence loss.

    Returns:
        (ciou_loss, conf_loss, prob_loss), each reduced to a scalar
        normalized by batch size.
    """
    shape_of_conv = P.shape(conv)
    batch_size = shape_of_conv[0]
    output_size = shape_of_conv[1]
    input_size = stride * output_size

    # Split prediction / label tensors along the last axis.
    pred_xywh = pred[:, :, :, :, 0:4]
    pred_conf = pred[:, :, :, :, 4:5]
    pred_prob = pred[:, :, :, :, 5:]
    label_xywh = label[:, :, :, :, 0:4]
    respond_bbox = label[:, :, :, :, 4:5]   # objectness mask: 1 where a gt is assigned
    label_prob = label[:, :, :, :, 5:]

    # 1. CIoU loss, masked by objectness and weighted so that small boxes
    #    count more: weight = 2 - (gt area / image area).
    ciou = P.reshape(
        bbox_ciou(pred_xywh, label_xywh),
        (batch_size, output_size, output_size, 3, 1))
    input_size = P.cast(input_size, dtype='float32')
    gt_w = label_xywh[:, :, :, :, 2:3]
    gt_h = label_xywh[:, :, :, :, 3:4]
    bbox_loss_scale = 2.0 - 1.0 * gt_w * gt_h / (input_size**2)
    ciou_loss = respond_bbox * bbox_loss_scale * (1 - ciou)

    # 2. Class loss: binary cross-entropy per class, masked by objectness.
    #    The 1e-9 guards P.log against NaN, as in the TF reference.
    prob_pos_loss = label_prob * (0 - P.log(pred_prob + 1e-9))
    prob_neg_loss = (1 - label_prob) * (0 - P.log(1 - pred_prob + 1e-9))
    prob_mask = P.expand(respond_bbox, [1, 1, 1, 1, num_class])
    prob_loss = prob_mask * (prob_pos_loss + prob_neg_loss)

    # 3. Confidence loss. First decide which of the grid_h*grid_w*3
    #    predictions are negatives: compute each prediction's best IoU over
    #    all ground-truth boxes, and treat a prediction as background only
    #    if it has no assigned gt AND that best IoU is below the threshold.
    #    Predictions above the threshold but unassigned are ignored
    #    (neither positive nor negative), as in the YOLO papers.
    preds_expanded = P.reshape(
        pred_xywh,
        (batch_size, output_size, output_size, 3, 1, 4))  # (?, h, w, 3, 1, 4)
    gts_expanded = P.reshape(
        bboxes,
        (batch_size, 1, 1, 1, P.shape(bboxes)[1], 4))     # (?, 1, 1, 1, num_gt, 4)
    iou = bbox_iou(preds_expanded, gts_expanded)          # (?, h, w, 3, num_gt)
    max_iou, max_iou_indices = P.topk(iou, k=1)           # (?, h, w, 3, 1)
    respond_bgd = (1.0 - respond_bbox) * P.cast(
        max_iou < iou_loss_thresh, 'float32')

    # Binary cross-entropy over the confidence channel.
    pos_loss = respond_bbox * (0 - P.log(pred_conf + 1e-9))
    neg_loss = respond_bgd * (0 - P.log(1 - pred_conf + 1e-9))
    conf_loss = pos_loss + neg_loss

    # Reduce each loss to a scalar, normalized by batch size.
    ciou_loss = P.reduce_sum(ciou_loss) / batch_size
    conf_loss = P.reduce_sum(conf_loss) / batch_size
    prob_loss = P.reduce_sum(prob_loss) / batch_size
    return ciou_loss, conf_loss, prob_loss