def _process_type_leaf(condition, decoder, grammar_stack, next_inputs, finished):
    """Process the case where the output type is LEAF.

    Args:
        condition (Variable): bool tensor; True for instances whose current output type is LEAF
        decoder (TYPE): decoder object exposing grammar_action/grammar_mask
        grammar_stack (StackData): (gmr_stack_data, gmr_stack_pos)
        next_inputs (DecoderInputsWrapper): (input_var, action, grammar_mask)
        finished (Variable): bool tensor marking finished instances; updated in place

    Returns: None

    Raises: NULL
    """
    ## pop stack
    next_output, valid_pos, gmr_stack_tmp = data_structure.Stack.pop(
        grammar_stack, mask=True, in_place=False)
    valid_pos = fluider.squeeze(valid_pos, [1])

    ## update next grammar mask
    next_actions = layers.elementwise_mul(
        decoder.grammar_action(next_output),
        layers.cast(valid_pos, dtype=next_inputs.action.dtype),
        axis=0)
    next_gmr_mask = layers.elementwise_mul(
        decoder.grammar_mask(next_output),
        layers.cast(valid_pos, dtype=next_inputs.gmr_mask.dtype),
        axis=0)

    ## save result only where condition is True
    new_gmr_stack_data, new_gmr_stack_pos, new_actions, new_gmr_mask = nn_utils.ifelse(
        condition,
        [gmr_stack_tmp.data, gmr_stack_tmp.pos, next_actions, next_gmr_mask],
        [grammar_stack.data, grammar_stack.pos, next_inputs.action, next_inputs.gmr_mask])

    layers.utils.map_structure(
        layers.assign,
        [new_gmr_stack_data, new_gmr_stack_pos, new_actions, new_gmr_mask],
        [grammar_stack.data, grammar_stack.pos, next_inputs.action, next_inputs.gmr_mask])
    layers.logical_or(
        finished,
        layers.logical_and(condition, layers.logical_not(valid_pos)),
        out=finished)
def get_face_mask(densepose_map):
    """Obtain mask of faces.

    Args:
        densepose_map (3D or 4D tensor): DensePose map normalized to [-1, 1]
    """
    need_reshape = len(densepose_map.shape) == 4
    if need_reshape:
        bo, t, h, w = densepose_map.shape
        densepose_map = L.reshape(densepose_map, (-1, h, w))
    b, h, w = densepose_map.shape
    part_map = (densepose_map / 2 + 0.5) * 24
    assert L.reduce_all((part_map >= 0)) and L.reduce_all((part_map < 25))

    mask = dg.to_variable(np.zeros((b, h, w)).astype('bool'))
    for j in [23, 24]:  # DensePose part labels 23 and 24 correspond to the face
        mask = L.logical_or(
            mask, L.logical_and((part_map > j - 0.1), (part_map < j + 0.1)))

    if need_reshape:
        mask = L.reshape(mask, (bo, t, h, w))
    return P.cast(mask, "float32")
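# --- Usage sketch (not part of the original source) ---
# A minimal, hedged example of calling get_face_mask in dygraph mode, assuming
# the aliases used above (dg = paddle.fluid.dygraph, L = paddle.fluid.layers,
# P = paddle.fluid) and a densepose map normalized to [-1, 1]. The random
# input and its shape are illustrative only.
import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    # fake densepose map: batch of 2, 8x8 spatial, values in [-1, 1]
    fake_map = dg.to_variable(
        np.random.uniform(-1, 1, (2, 8, 8)).astype("float32"))
    face_mask = get_face_mask(fake_map)  # float32 0/1 mask, shape [2, 8, 8]
    print(face_mask.shape)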
def _grammar_step(self, logits, next_cell_states, decode_states, actions, gmr_mask):
    """One decoding step under grammar constraints.

    Args:
        logits (Variable): shape = [batch_size, beam_size, vocab_size]
        next_cell_states (Variable): NULL
        decode_states (StateWrapper): NULL

    Returns: TODO

    Raises: NULL
    """
    # constrain logits to tokens that are legal under the grammar
    logits, valid_table_mask = self._output_layer(logits, actions, gmr_mask,
                                                  decode_states.valid_table_mask)

    # initialize vocab size
    self._vocab_size = logits.shape[-1]
    self._vocab_size_tensor = layers.fill_constant(shape=[1],
                                                   dtype='int64',
                                                   value=logits.shape[-1])

    # compute log probs and mask out finished beams
    step_log_probs = layers.log(layers.softmax(logits))
    step_log_probs = self._mask_finished_probs(step_log_probs, decode_states.finished)

    scores = layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size])
    topk_scores, topk_indices = layers.topk(input=scores, k=self._beam_size)
    topk_scores = layers.reshape(topk_scores, shape=[-1])
    topk_indices = layers.reshape(topk_indices, shape=[-1])

    # beam each top-k candidate comes from
    beam_indices = layers.elementwise_floordiv(topk_indices, self._vocab_size_tensor)
    # token id of each top-k candidate
    token_indices = layers.elementwise_mod(topk_indices, self._vocab_size_tensor)

    # reorganize step_log_probs according to the top-k sources
    next_log_probs = nn_utils.batch_gather(
        layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size]),
        topk_indices)

    def _beam_gather(x, beam_indices):
        """reshape x to beam dim, and gather each beam_indices

        Args:
            x (TYPE): NULL

        Returns: Variable
        """
        x = self.split_batch_beams(x)
        return nn_utils.batch_gather(x, beam_indices)

    next_cell_states = layers.utils.map_structure(
        lambda x: _beam_gather(x, beam_indices), next_cell_states)
    next_finished = _beam_gather(decode_states.finished, beam_indices)
    next_lens = _beam_gather(decode_states.lengths, beam_indices)

    next_lens = layers.elementwise_add(
        next_lens,
        layers.cast(layers.logical_not(next_finished), next_lens.dtype))
    next_finished = layers.logical_or(
        next_finished,
        layers.equal(token_indices, self._end_token_tensor))

    decode_output = OutputWrapper(topk_scores, token_indices, beam_indices)
    decode_states = StateWrapper(next_cell_states, next_log_probs, next_finished,
                                 next_lens, valid_table_mask)

    return decode_output, decode_states
def simple_net(self):
    d0 = layers.data(
        "d0", shape=[10], append_batch_size=False, dtype='float32')
    d1 = layers.data(
        "d1", shape=[10], append_batch_size=False, dtype='float32')
    d2 = layers.data(
        "d2", shape=[10], append_batch_size=False, dtype='float32')

    # fill_constant npu op doesn't support int64
    i = layers.zeros(shape=[1], dtype='int32')
    i = layers.cast(i, 'int64')
    i.stop_gradient = True
    init = layers.zeros(shape=[10], dtype='float32')
    mem_array = layers.array_write(x=init, i=i)
    data_array = layers.array_write(x=d0, i=i)
    i = layers.increment(i)
    layers.array_write(d1, i, array=data_array)
    i = layers.increment(i)
    layers.array_write(d2, i, array=data_array)

    i = layers.zeros(shape=[1], dtype='int32')
    i = layers.cast(i, 'int64')
    i.stop_gradient = True
    array_len = layers.fill_constant(shape=[1], dtype='int32', value=5)
    array_len = layers.cast(array_len, 'int64')
    array_len.stop_gradient = True
    cond = layers.ones(shape=[1], dtype='int32')
    cond = layers.cast(cond, 'bool')

    j = layers.fill_constant(shape=[1], dtype='int32', value=1)
    j = layers.cast(j, 'int64')
    j.stop_gradient = True
    array_len2 = layers.fill_constant(shape=[1], dtype='int32', value=3)
    array_len2 = layers.cast(array_len2, 'int64')
    array_len2.stop_gradient = True
    cond2 = layers.logical_or(x=j, y=array_len2)
    cond2 = layers.ones(shape=[1], dtype='int32')
    cond2 = layers.cast(cond2, 'bool')

    while_op = layers.While(cond=cond)
    while_op2 = layers.While(cond=cond2)
    with while_op.block():
        d = layers.array_read(array=data_array, i=i)
        prev = layers.array_read(array=mem_array, i=i)
        result = layers.sums(input=[d, prev])

        i = layers.increment(x=i, in_place=True)
        layers.array_write(result, i=i, array=mem_array)
        layers.less_than(x=i, y=array_len, cond=cond)

        with while_op2.block():
            d2 = layers.array_read(array=data_array, i=j)
            prev2 = layers.array_read(array=mem_array, i=j)
            result2 = layers.sums(input=[d2, prev2])

            j = layers.increment(x=j, in_place=True)
            layers.array_write(result2, i=j, array=mem_array)
            layers.less_than(x=j, y=array_len2, cond=cond2)

    sum_result = layers.array_read(array=mem_array, i=j)
    loss = layers.mean(sum_result)
    return loss, sum_result
def logical_or(cls, x, y, *args, out=None, name=None):
    """Wrapper of paddle.fluid.layers.logical_or that accepts two or more inputs.

    Args:
        x (Variable): bool input
        y (Variable): bool input
        *args (Variable): optional extra bool inputs
        out (TYPE): Default is None
        name (TYPE): Default is None

    Returns:
        Variable: element-wise OR over all inputs

    Raises: NULL
    """
    tmp = layers.logical_or(x, y, out=out, name=name)
    for var in args:
        tmp = layers.logical_or(tmp, var, out=out, name=name)
    return tmp
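# --- Usage sketch (not part of the original source) ---
# The method above folds layers.logical_or over an arbitrary number of inputs.
# Below is a minimal, self-contained dygraph illustration of the same pattern,
# written as a free function because the enclosing class is not shown here;
# the function name and input values are assumptions for the sketch.
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers


def logical_or_many(first, second, *rest):
    """Element-wise OR of two or more bool Variables."""
    result = layers.logical_or(first, second)
    for var in rest:
        result = layers.logical_or(result, var)
    return result


with dg.guard():
    a = dg.to_variable(np.array([True, False, False]))
    b = dg.to_variable(np.array([False, False, False]))
    c = dg.to_variable(np.array([False, False, True]))
    print(logical_or_many(a, b, c).numpy())  # [ True False  True]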
def get_attention_mask(mask, nhead):
    # mask: [bs, L] -> attn_mask: [bs, nhead, L, L]
    bs, l = mask.shape
    row_mask = L.expand(L.unsqueeze(mask, [2]), (1, 1, l))  # [bs, L, L]
    col_mask = L.expand(L.unsqueeze(mask, [1]), (1, l, 1))  # [bs, L, L]
    mask = L.logical_or(row_mask, col_mask)

    attn_mask = L.zeros([bs, l, l], dtype="float32")
    attn_mask = attn_mask.numpy()
    mask = mask.numpy()
    attn_mask[mask] = -1e8
    attn_mask = dg.to_variable(attn_mask)
    attn_mask = L.expand(L.unsqueeze(attn_mask, [1]),
                         (1, nhead, 1, 1))  # [bs, nhead, L, L]
    return attn_mask
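# --- Usage sketch (not part of the original source) ---
# Hedged example of get_attention_mask in dygraph mode, assuming the aliases
# used above (dg = paddle.fluid.dygraph, L = paddle.fluid.layers) and that
# `mask` is a bool tensor marking padding positions with True; rows and columns
# touching those positions receive -1e8 in the additive attention mask. The
# shapes below are illustrative only.
import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    pad_mask = dg.to_variable(
        np.array([[False, False, True, True]]))       # [bs=1, L=4], last two tokens are padding
    attn_bias = get_attention_mask(pad_mask, nhead=2)  # [1, 2, 4, 4] float32
    print(attn_bias.shape)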
def _check_finished(decoder, next_inputs, finished, outputs_array):
    """check finished instance by next_inputs.action, and update finished tag and
    write END to outputs

    Args:
        decoder (TYPE): NULL
        next_inputs (TYPE): NULL
        finished (TYPE): NULL
        outputs_array (TYPE): NULL

    Returns: TODO

    Raises: NULL
    """
    act_stop = tensor.fill_constant_batch_size_like(
        next_inputs.action,
        shape=next_inputs.action.shape,
        value=decoder._grammar.ACTION_STOP,
        dtype='int64')
    new_finished = layers.logical_and(
        layers.equal(next_inputs.action, act_stop),
        layers.logical_not(finished))

    end_token_id = tensor.fill_constant_batch_size_like(
        outputs_array.data,
        shape=[-1],
        value=decoder._grammar.END,
        dtype=outputs_array.data.dtype)
    out_data_tmp, out_pos_tmp = data_structure.Array.push(
        outputs_array, end_token_id, in_place=False)
    new_data, new_pos = nn_utils.ifelse(
        new_finished,
        [out_data_tmp, out_pos_tmp],
        [outputs_array.data, outputs_array.pos])

    layers.assign(new_data, outputs_array.data)
    layers.assign(new_pos, outputs_array.pos)
    layers.logical_or(finished, new_finished, out=finished)
def beam_search(self,
                src_word,
                src_pos,
                src_slf_attn_bias,
                trg_word,
                trg_src_attn_bias,
                bos_id=0,
                eos_id=1,
                beam_size=4,
                max_len=256):
    def expand_to_beam_size(tensor, beam_size):
        tensor = layers.reshape(tensor,
                                [tensor.shape[0], 1] + tensor.shape[1:])
        tile_dims = [1] * len(tensor.shape)
        tile_dims[1] = beam_size
        return layers.expand(tensor, tile_dims)

    def merge_batch_beams(tensor):
        return layers.reshape(tensor, [tensor.shape[0] * tensor.shape[1]] +
                              tensor.shape[2:])

    def split_batch_beams(tensor):
        return fluid.layers.reshape(tensor,
                                    shape=[-1, beam_size] +
                                    list(tensor.shape[1:]))

    def mask_probs(probs, finished, noend_mask_tensor):
        # TODO: use where_op
        finished = layers.cast(finished, dtype=probs.dtype)
        probs = layers.elementwise_mul(layers.expand(
            layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]),
                                       noend_mask_tensor,
                                       axis=-1) - layers.elementwise_mul(
                                           probs, (finished - 1), axis=0)
        return probs

    def gather(x, indices, batch_pos):
        topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
        return layers.gather_nd(x, topk_coordinates)

    # run encoder
    enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)

    # constant number
    inf = float(1. * 1e7)
    batch_size = enc_output.shape[0]
    max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
    vocab_size_tensor = layers.fill_constant(shape=[1],
                                             dtype="int64",
                                             value=self.trg_vocab_size)
    end_token_tensor = to_variable(
        np.full([batch_size, beam_size], eos_id, dtype="int64"))
    noend_array = [-inf] * self.trg_vocab_size
    noend_array[eos_id] = 0
    noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
    batch_pos = layers.expand(
        layers.unsqueeze(
            to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]),
        [1, beam_size])

    predict_ids = []
    parent_ids = []
    ### initialize states of beam search ###
    log_probs = to_variable(
        np.array([[0.] + [-inf] * (beam_size - 1)] * batch_size,
                 dtype="float32"))
    finished = to_variable(
        np.full([batch_size, beam_size], 0, dtype="bool"))
    ### initialize inputs and states of transformer decoder ###
    ## init inputs for decoder, shaped `[batch_size*beam_size, ...]`
    trg_word = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                    dtype="int64",
                                    value=bos_id)
    trg_pos = layers.zeros_like(trg_word)
    trg_src_attn_bias = merge_batch_beams(
        expand_to_beam_size(trg_src_attn_bias, beam_size))
    enc_output = merge_batch_beams(
        expand_to_beam_size(enc_output, beam_size))
    ## init states (caches) for transformer, need to be updated according to selected beam
    caches = [{
        "k": layers.fill_constant(
            shape=[batch_size * beam_size, self.n_head, 0, self.d_key],
            dtype=enc_output.dtype,
            value=0),
        "v": layers.fill_constant(
            shape=[batch_size * beam_size, self.n_head, 0, self.d_value],
            dtype=enc_output.dtype,
            value=0),
    } for i in range(self.n_layer)]

    for i in range(max_len):
        trg_pos = layers.fill_constant(shape=trg_word.shape,
                                       dtype="int64",
                                       value=i)
        caches = map_structure(  # can not be reshaped since the 0 size
            lambda x: x if i == 0 else merge_batch_beams(x), caches)
        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                              enc_output, caches)
        caches = map_structure(split_batch_beams, caches)
        step_log_probs = split_batch_beams(
            fluid.layers.log(fluid.layers.softmax(logits)))
        step_log_probs = mask_probs(step_log_probs, finished,
                                    noend_mask_tensor)
        log_probs = layers.elementwise_add(x=step_log_probs,
                                           y=log_probs,
                                           axis=0)
        log_probs = layers.reshape(log_probs,
                                   [-1, beam_size * self.trg_vocab_size])
        scores = log_probs
        topk_scores, topk_indices = fluid.layers.topk(input=scores,
                                                      k=beam_size)
        beam_indices = fluid.layers.elementwise_floordiv(
            topk_indices, vocab_size_tensor)
        token_indices = fluid.layers.elementwise_mod(
            topk_indices, vocab_size_tensor)

        # update states
        caches = map_structure(
            lambda x: gather(x, beam_indices, batch_pos), caches)
        log_probs = gather(log_probs, topk_indices, batch_pos)
        finished = gather(finished, beam_indices, batch_pos)
        finished = layers.logical_or(
            finished, layers.equal(token_indices, end_token_tensor))
        trg_word = layers.reshape(token_indices, [-1, 1])

        predict_ids.append(token_indices)
        parent_ids.append(beam_indices)

        if layers.reduce_all(finished).numpy():
            break

    predict_ids = layers.stack(predict_ids, axis=0)
    parent_ids = layers.stack(parent_ids, axis=0)
    finished_seq = layers.transpose(
        layers.gather_tree(predict_ids, parent_ids), [1, 2, 0])
    finished_scores = topk_scores

    return finished_seq, finished_scores
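# --- Illustrative sketch (not part of the original source) ---
# The index bookkeeping in beam_search above: scores are reshaped to
# [batch, beam * vocab] before top-k, so each flat top-k index decomposes into
# the source beam (floordiv by vocab size) and the chosen token id (mod by
# vocab size). A tiny numpy example with assumed sizes:
import numpy as np

beam_size, vocab_size = 4, 10
flat_indices = np.array([[3, 17, 25, 38]])   # [batch=1, beam=4] indices into beam*vocab
beam_indices = flat_indices // vocab_size    # source beam of each candidate -> [[0 1 2 3]]
token_indices = flat_indices % vocab_size    # chosen token id               -> [[3 7 5 8]]
print(beam_indices, token_indices)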
def _run_paddle_logical_or(x, y):
    x = cast_bool_if_necessary(x)
    y = cast_bool_if_necessary(y)
    return logical_or(x, y)
def forward(self, x, y):
    # x and y are offset by one frame
    u1 = zeros_like(x)
    u2 = zeros_like(x)
    l_t = self.l * self.t
    taut = self.a / self.t

    grad2_x = self.conv_img_grad(y)
    # grad2_x[:, :, :, 0] = 0.5 * (x[:, :, :, 1] - x[:, :, :, 0])
    # grad2_x[:, :, :, -1] = 0.5 * (x[:, :, :, -1] - x[:, :, :, -2])
    grad2_y = self.conv_img_grad2(y)
    # grad2_y[:, :, 0, :] = 0.5 * (x[:, :, 1, :] - x[:, :, 0, :])
    # grad2_y[:, :, -1, :] = 0.5 * (x[:, :, -1, :] - x[:, :, -2, :])

    p11 = zeros_like(x)
    p12 = zeros_like(x)
    p21 = zeros_like(x)
    p22 = zeros_like(x)

    gsqx = grad2_x**2
    gsqy = grad2_y**2
    grad = gsqx + gsqy + 1e-12

    rho_c = y - grad2_x * u1 - grad2_y * u2 - x

    for i in range(self.n_iter):
        rho = rho_c + grad2_x * u1 + grad2_y * u2 + 1e-12

        v1 = zeros_like(x)
        v2 = zeros_like(x)

        mask1 = rho < -l_t * grad
        mask2 = rho > l_t * grad
        mask3 = logical_and(logical_not(logical_or(mask1, mask2)),
                            (grad > 1e-12))
        mask1 = cast(mask1, dtype='float32')
        mask2 = cast(mask2, dtype='float32')
        mask3 = cast(mask3, dtype='float32')
        mask1.stop_gradient = True
        mask2.stop_gradient = True
        mask3.stop_gradient = True

        # v1 = v1 + l_t * grad2_x * mask1 - l_t * grad2_x * mask2 - (rho / grad) * grad2_x * mask3
        # v2 = v2 + l_t * grad2_y * mask1 - l_t * grad2_y * mask2 - (rho / grad) * grad2_y * mask3
        v1 = elementwise_add(
            u1,
            elementwise_add(
                elementwise_mul(l_t * grad2_x, mask1),
                elementwise_add(
                    elementwise_mul(-l_t * grad2_x, mask2),
                    elementwise_mul(-elementwise_div(rho, grad),
                                    elementwise_mul(grad2_x, mask3)))))
        v2 = elementwise_add(
            u2,
            elementwise_add(
                elementwise_mul(l_t * grad2_y, mask1),
                elementwise_add(
                    elementwise_mul(-l_t * grad2_y, mask2),
                    elementwise_mul(-elementwise_div(rho, grad),
                                    elementwise_mul(grad2_y, mask3)))))

        del rho
        del mask1
        del mask2
        del mask3

        v1 += u1
        v2 += u2

        u1 = v1 + self.t * self.divergence(p11, p12)
        u2 = v2 + self.t * self.divergence(p21, p22)

        del v1
        del v2

        u1x, u1y = self.forward_grad(u1)
        u2x, u2y = self.forward_grad(u2)

        p11 = (p11 + taut * u1x) / (1. + taut * sqrt(u1x**2 + u1y**2 + 1e-12))
        p12 = (p12 + taut * u1y) / (1. + taut * sqrt(u1x**2 + u1y**2 + 1e-12))
        p21 = (p21 + taut * u2x) / (1. + taut * sqrt(u2x**2 + u2y**2 + 1e-12))
        p22 = (p22 + taut * u2y) / (1. + taut * sqrt(u2x**2 + u2y**2 + 1e-12))
        del u1x
        del u1y
        del u2x
        del u2y

    return u1, u2
def grammar_output(inputs, actions, gmr_mask, last_col2tbl_mask, decode_vocab, grammar, name=None, column2table=None):
    """output logits according to grammar

    Args:
        inputs (Variable): shape = [batch_size, max_len, hidden_size]. max_len is always 1 at infer time.
        actions (Variable): shape = [batch_size, max_len]. max_len is always 1 at infer time.
        gmr_mask (Variable): shape = [batch_size, max_len, grammar_size]. max_len is always 1 at infer time.
        last_col2tbl_mask (Variable): shape = [batch_size, max_len, max_table]. During decoding, when the
                                      previous step selected a column, this is the table mask associated with it.
        decode_vocab (DecoderDynamicVocab): (table, table_len, column, column_len, value, value_len,
                                      column2table_mask). column2table_mask is the table mask aligned
                                      one-to-one with column.
        grammar (Grammar): NULL
        name (str): name prefix of Variables, used to share parameters across calls.
                    Defaults to None, meaning parameters are not shared.

    Returns: (Variable, Variable)
        output: output probabilities over the vocabulary
        valid_table_mask: only valid at predict (infer) time

    Raises: NULL
    """
    batch_size = layers.shape(inputs)[0]
    max_len = inputs.shape[1]
    vocab_size = grammar.vocab_size

    action_shape = [batch_size, max_len]
    act_apply_rule = tensor.fill_constant(shape=action_shape,
                                          value=grammar.ACTION_APPLY, dtype='int64')
    act_stop = tensor.fill_constant(shape=action_shape,
                                    value=grammar.ACTION_STOP, dtype='int64')
    act_select_t = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_T, dtype='int64')
    act_select_c = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_C, dtype='int64')
    act_select_v = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_V, dtype='int64')
    cond_apply_rule = layers.logical_or(layers.equal(actions, act_apply_rule),
                                        layers.equal(actions, act_stop))
    cond_select_t = layers.equal(actions, act_select_t)
    cond_select_c = layers.equal(actions, act_select_c)
    cond_select_v = layers.equal(actions, act_select_v)

    # expand vocab to [-1, max_len, ...]
    if max_len == 1:
        expand_to_seq_len = lambda x: layers.unsqueeze(x, [1])
    else:
        expand_to_seq_len = lambda x: layers.expand(
            layers.unsqueeze(x, [1]), [1, max_len] + [1] * (len(x.shape) - 1))
    table_enc = expand_to_seq_len(decode_vocab.table)
    table_len = expand_to_seq_len(decode_vocab.table_len)
    column_enc = expand_to_seq_len(decode_vocab.column)
    column_len = expand_to_seq_len(decode_vocab.column_len)
    value_enc = expand_to_seq_len(decode_vocab.value)
    value_len = expand_to_seq_len(decode_vocab.value_len)
    column2table_mask = expand_to_seq_len(decode_vocab.column2table_mask)

    # merge batch & seq_len dim
    inputs = nn_utils.merge_first_ndim(inputs, n=2)
    actions = nn_utils.merge_first_ndim(actions, n=2)
    gmr_mask = nn_utils.merge_first_ndim(gmr_mask, n=2)
    last_col2tbl_mask = nn_utils.merge_first_ndim(last_col2tbl_mask, n=2)
    table_enc = nn_utils.merge_first_ndim(table_enc, n=2)
    table_len = nn_utils.merge_first_ndim(table_len, n=2)
    column_enc = nn_utils.merge_first_ndim(column_enc, n=2)
    column_len = nn_utils.merge_first_ndim(column_len, n=2)
    value_enc = nn_utils.merge_first_ndim(value_enc, n=2)
    value_len = nn_utils.merge_first_ndim(value_len, n=2)
    column2table_mask = nn_utils.merge_first_ndim(column2table_mask, n=2)
    cond_apply_rule = nn_utils.merge_first_ndim(cond_apply_rule, n=2)
    cond_select_t = nn_utils.merge_first_ndim(cond_select_t, n=2)
    cond_select_c = nn_utils.merge_first_ndim(cond_select_c, n=2)
    cond_select_v = nn_utils.merge_first_ndim(cond_select_v, n=2)

    t_ptr_net = models.PointerNetwork(score_type="affine", name='gmr_output_t_ptr')
    c_ptr_net = models.PointerNetwork(score_type="affine", name='gmr_output_c_ptr')
    v_ptr_net = models.PointerNetwork(score_type="affine", name='gmr_output_v_ptr')

    ## core processing logic ##
    apply_rule_output = _apply_rule(cond_apply_rule, inputs, gmr_mask, grammar, name=name)
    select_t_output = \
            _select_table(cond_select_t, inputs, table_enc, table_len,
                          last_col2tbl_mask, t_ptr_net, grammar)
    select_c_output, valid_table_mask = \
            _select_column(cond_select_c, inputs, column_enc, column_len,
                           c_ptr_net, grammar, column2table_mask)
    select_v_output = _select_value(cond_select_v, inputs, value_enc,
                                    value_len, v_ptr_net, grammar)

    output = fluider.elementwise_add(apply_rule_output, select_t_output,
                                     select_c_output, select_v_output, axis=0)
    output = layers.reshape(output, shape=[batch_size, max_len, vocab_size])

    return output, valid_table_mask
def _greedy_search(self,
                   src_word,
                   src_pos,
                   src_slf_attn_bias,
                   trg_word,
                   trg_src_attn_bias,
                   bos_id=0,
                   eos_id=1,
                   max_len=256):
    # run encoder
    enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)

    # constant number
    batch_size = enc_output.shape[0]
    max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
    end_token_tensor = layers.fill_constant(shape=[batch_size, 1],
                                            dtype="int64",
                                            value=eos_id)

    predict_ids = []
    log_probs = layers.fill_constant(shape=[batch_size, 1],
                                     dtype="float32",
                                     value=0)
    trg_word = layers.fill_constant(shape=[batch_size, 1],
                                    dtype="int64",
                                    value=bos_id)
    finished = layers.fill_constant(shape=[batch_size, 1],
                                    dtype="bool",
                                    value=0)

    ## init states (caches) for transformer
    caches = [{
        "k": layers.fill_constant(
            shape=[batch_size, self.n_head, 0, self.d_key],
            dtype=enc_output.dtype,
            value=0),
        "v": layers.fill_constant(
            shape=[batch_size, self.n_head, 0, self.d_value],
            dtype=enc_output.dtype,
            value=0),
    } for i in range(self.n_layer)]

    for i in range(max_len):
        trg_pos = layers.fill_constant(shape=trg_word.shape,
                                       dtype="int64",
                                       value=i)
        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                              enc_output, caches)
        step_log_probs = layers.log(layers.softmax(logits))
        log_probs = layers.elementwise_add(x=step_log_probs,
                                           y=log_probs,
                                           axis=0)
        scores = log_probs
        topk_scores, topk_indices = layers.topk(input=scores, k=1)

        finished = layers.logical_or(
            finished, layers.equal(topk_indices, end_token_tensor))
        trg_word = topk_indices
        log_probs = topk_scores

        predict_ids.append(topk_indices)

        if layers.reduce_all(finished).numpy():
            break

    predict_ids = layers.stack(predict_ids, axis=0)
    finished_seq = layers.transpose(predict_ids, [1, 2, 0])
    finished_scores = topk_scores

    return finished_seq, finished_scores