def func(self, place):
    """Double-gradient check for elementwise_div on a fixed 4-D input."""
    # The input shape must be fully specified for the checker (no -1 dims).
    input_shape = [2, 3, 7, 9]
    step = 0.0001
    np_dtype = np.float64

    dividend = layers.data('x', input_shape, False, np_dtype)
    divisor = layers.data('y', input_shape, False, np_dtype)
    dividend.persistable = True
    divisor.persistable = True
    result = layers.elementwise_div(dividend, divisor, axis=0)

    dividend_init = np.random.uniform(-1, 1, input_shape).astype(np_dtype)
    divisor_init = np.random.uniform(-1, 1, input_shape).astype(np_dtype)
    # Keep the divisor bounded away from zero so the quotient and its
    # gradients stay numerically stable.
    divisor_init[np.abs(divisor_init) < 0.005] = 0.02

    gradient_checker.double_grad_check([dividend, divisor],
                                       result,
                                       x_init=[dividend_init, divisor_init],
                                       place=place,
                                       eps=step,
                                       atol=1e-3)
def training_network(self, img, caption):
    """Build the image-captioning training graph and return its scalar loss.

    Args:
        img: input image batch, passed to self._img2feature.
        caption: int64 token-id tensor; the decoder input is caption[:, :-1]
            and the prediction target is caption[:, 1:].

    Returns:
        Scalar tensor named 'loss': sum of per-token losses over non-padding
        positions divided by the number of non-padding tokens.
    """
    # Build caption target/source and the padding mask.
    target = caption[:, 1:]
    source = caption[:, :-1]
    # Batch-shaped tensor filled with the padding index, used to find pads.
    padding_filled = layers.fill_constant_batch_size_like(
        target,
        shape=[-1, decoder_config['sentence_length'] - 1],
        dtype='int64',
        value=config.dc['padding_idx'])
    mask = layers.equal(target, padding_filled)
    # mask is 1.0 on real tokens and 0.0 on padding positions.
    mask = layers.cast(layers.logical_not(mask), 'float32')
    # Count of non-padding tokens; normalizes the summed loss below.
    scale_factor = layers.reduce_sum(mask)
    mask.stop_gradient = True
    scale_factor.stop_gradient = True

    # Model.
    decoder = Decoder(decoder_config['hidden_dim'], rnn_layer=1)
    image_embed, global_image_feat = self._img2feature(
        img)  # [batch, k+1, hidden], [batch, hidden]
    # TODO: rework this — do the word embedding either inside the RNN or
    # outside of it, not split as it is now.
    seq_out = decoder.call(global_image_feat,
                           image_embed,
                           embedding_function,
                           words=source)
    loss = layers.squeeze(ImageCaptionModel.loss(target, seq_out), axes=[2])
    # Zero out loss at padding positions, then average over real tokens.
    loss = layers.elementwise_mul(loss, mask)
    output_loss = layers.elementwise_div(layers.reduce_sum(loss),
                                         scale_factor,
                                         name='loss')
    return output_loss
def greater_equal_branch(i, a):
    """Branch body: square `a` while i < 8.0, otherwise divide `a` by itself."""
    def mul_branch():
        return layers.elementwise_mul(a, a)

    def div_branch():
        return layers.elementwise_div(a, a)

    return layers.cond(i < 8.0, mul_branch, div_branch)
def _dygraph_clip(self, params_grads):
    """Clip gradients in place by the global norm across all sharded ranks.

    Accumulates per-parameter squared-gradient sums, separated by dtype
    (fp16/fp32) and by whether the parameter is sliced in sharding stage3,
    all-reduces the combined norm over self._group, then scales every
    gradient in place by clip_norm / max(global_norm, clip_norm).

    Args:
        params_grads: iterable of (parameter, gradient) pairs.

    Returns:
        The same params_grads; gradients have been scaled in place.
    """
    sum_square_fp32, sum_square_fp16 = [], []
    unslice_params_fp32, unslice_params_fp16 = [], []

    for p, g in params_grads:
        p_slice = True  # using for slice parameter in sharding stage3
        if g is None or getattr(p, 'need_clip', True) is False:
            continue
        # Parameters tagged "unslice" are replicated rather than sliced.
        if hasattr(p, "unslice"):
            p_slice = False

        merge_grad = g
        if g.type == core.VarDesc.VarType.SELECTED_ROWS:
            # Densify sparse gradients before squaring.
            merge_grad = layers.get_tensor_from_selected_rows(
                layers.merge_selected_rows(g))
        square = layers.square(merge_grad)
        sum_square = layers.reduce_sum(square)
        # Bucket by dtype and sliced/unsliced status.
        if p.dtype == paddle.float16:
            if p_slice:
                sum_square_fp16.append(sum_square)
            else:
                unslice_params_fp16.append(sum_square)
        elif p.dtype == paddle.float32:
            if p_slice:
                sum_square_fp32.append(sum_square)
            else:
                unslice_params_fp32.append(sum_square)

    # global norm of non-distributed FP16 params_and_grads
    if len(sum_square_fp16) == 0:
        global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
    else:
        global_norm_fp16 = layers.concat(sum_square_fp16)
        global_norm_fp16 = layers.reduce_sum(global_norm_fp16)
        # Accumulate in fp32 to avoid fp16 overflow in the norm.
        global_norm_fp16 = paddle.cast(global_norm_fp16,
                                       dtype=paddle.float32)

    # global norm of non-distributed FP16 params_and_grads for unslice parameters
    if len(unslice_params_fp16) == 0:
        global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
    else:
        global_unslice_fp16 = layers.concat(unslice_params_fp16)
        global_unslice_fp16 = layers.reduce_sum(global_unslice_fp16)
        global_unslice_fp16 = paddle.cast(global_unslice_fp16,
                                          dtype=paddle.float32)

    # global norm of non-distributed FP32 params_and_grads
    global_norm_fp32 = layers.concat(sum_square_fp32) if len(
        sum_square_fp32) != 0 else paddle.to_tensor([0.],
                                                    dtype=paddle.float32)
    global_norm_fp32 = layers.reduce_sum(global_norm_fp32)

    # global norm of non-distributed FP32 params_and_grads for unslice parameters
    global_unslice_fp32 = layers.concat(unslice_params_fp32) if len(
        unslice_params_fp32) != 0 else paddle.to_tensor(
            [0.], dtype=paddle.float32)
    global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32)
    global_unslice_var = global_unslice_fp16 + global_unslice_fp32

    # Unsliced params are replicated on every rank, so their contribution is
    # divided by nranks before the all-reduce to avoid counting them n times.
    global_norm_var = global_norm_fp16 + global_norm_fp32 + 1.0 / self._group.nranks * global_unslice_var

    # add all reduce to get global norm of distributed params_and_grads
    dev_id = int(self._device.split(":")[1])
    # NOTE(review): this moves the tensor to GPU when get_device() reports
    # "cpu" — presumably so the all_reduce below can run under the GPU
    # device guard; confirm against the communication backend in use.
    if paddle.device.get_device() == "cpu":
        global_norm_var = global_norm_var.cuda(dev_id)

    with device_guard(dev_id, "gpu"):
        paddle.distributed.all_reduce(global_norm_var, group=self._group)

    global_norm_var = layers.sqrt(global_norm_var)
    max_global_norm = layers.fill_constant(shape=[1],
                                           dtype=global_norm_var.dtype,
                                           value=self.clip_norm)
    # Scale factor is clip_norm / max(global_norm, clip_norm) — i.e. 1.0
    # when the norm is already within the limit.
    clip_var = layers.elementwise_div(
        x=max_global_norm,
        y=layers.elementwise_max(x=global_norm_var, y=max_global_norm))
    clip_var_fp16 = paddle.cast(clip_var, paddle.float16)

    for p, g in params_grads:
        if getattr(p, 'need_clip', True) is False or g is None:
            continue
        origin_state = g.stop_gradient
        # Temporarily detach so the in-place scale is not recorded.
        g.stop_gradient = True
        if p.dtype == paddle.float16:
            g.scale_(clip_var_fp16.item())
        else:
            g.scale_(clip_var.item())
        g.stop_gradient = origin_state
        # p._reset_grad_inplace_version(True)

    return params_grads
def __softmax(x, eps=1e-9):
    """Softmax over the last axis: exp(x) / sum(exp(x), axis=-1).

    Args:
        x: input tensor.
        eps: small constant added to the denominator so the division cannot
            hit exactly zero when every exponential underflows.

    Returns:
        Tensor with the same shape as `x`, normalized along the last axis.
    """
    exp_out = layers.exp(x=x)
    sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
    # Fix: `eps` was previously accepted but never used, leaving the division
    # unguarded; fold it into the denominator (negligible for normal inputs).
    return layers.elementwise_div(x=exp_out, y=sum_out + eps, axis=0)
def norm(inputs, dim):
    """L2-normalize `inputs` along one of its two axes.

    Args:
        inputs: tensor — presumably 2-D, given the two-entry axis table
            below; TODO confirm against callers.
        dim: 0 or 1, selecting which axis the norm is reduced over.

    Returns:
        `inputs` divided elementwise by the broadcast L2 norm.
    """
    # Axis-swap table: the norm is broadcast back along the *other* axis
    # (tp[0] == 1, tp[1] == 0).
    tp = [1,0]
    # NOTE(review): `dim=-dim` relies on -0 == 0, so dim=0 reduces over
    # axis 0 and dim=1 reduces over axis -1 (the last axis).
    mm = L.sqrt(L.reduce_sum(L.elementwise_mul(inputs, inputs), dim=-dim))
    # Divide with broadcasting anchored on the surviving axis.
    h = L.elementwise_div(inputs, mm, axis=tp[dim])
    return h
def network(self, for_test=False):
    """Define the network structure of train_model.

    Builds the data layers, a DataLoader, the forward pass, and a
    softmax-style loss that contrasts the predicted context representations
    against the 'before' and 'after' sentence embeddings.

    Args:
        for_test: when True, use the *_test data layers and the test
            embedding; otherwise use the *_train layers and train embedding.

    Returns:
        (loss, reader): scalar loss tensor and the DataLoader feeding
        (before, target, after).
    """
    if not for_test:
        before = fluid.data(name='before_train',
                            shape=[-1, self.sent_len],
                            dtype='int64')
        target = fluid.data(name='target_train',
                            shape=[-1, self.sent_len],
                            dtype='int64')
        after = fluid.data(name='after_train',
                           shape=[-1, self.sent_len],
                           dtype='int64')
        # Define the data-reading utility.
        reader = fluid.io.DataLoader.from_generator(
            feed_list=[before, target, after], capacity=64, iterable=True)
        # Forward pass.
        rnn_out, encode_hidden = self.forward(target)
        pred_before = self.sent_pred(target,
                                     dir='before',
                                     encode_hidden=encode_hidden,
                                     for_test=False)
        pred_after = self.sent_pred(target,
                                    dir='after',
                                    encode_hidden=encode_hidden,
                                    for_test=False)
    else:
        before = fluid.data(name='before_test',
                            shape=[-1, self.sent_len],
                            dtype='int64')
        target = fluid.data(name='target_test',
                            shape=[-1, self.sent_len],
                            dtype='int64')
        after = fluid.data(name='after_test',
                           shape=[-1, self.sent_len],
                           dtype='int64')
        # Define the data-reading utility.
        reader = fluid.io.DataLoader.from_generator(
            feed_list=[before, target, after], capacity=64, iterable=True)
        # Forward pass.
        rnn_out, encode_hidden = self.forward(target)
        pred_before = self.sent_pred(target,
                                     dir='before',
                                     encode_hidden=encode_hidden,
                                     for_test=True)
        pred_after = self.sent_pred(target,
                                    dir='after',
                                    encode_hidden=encode_hidden,
                                    for_test=True)
    # Move batch_size to axis 1 — why not axis 0? Axis 0 is num_layers.
    pred_before = layers.transpose(pred_before, perm=[0, 2, 1, 3])
    pred_after = layers.transpose(pred_after, perm=[0, 2, 1, 3])
    if not for_test:
        before_emb = self.embedding(before)
        after_emb = self.embedding(after)
        # parameters()[0] is the embedding weight matrix.
        vocab_emb = self.embedding.parameters()[0]
    else:
        before_emb = self.test_embedding(before)
        after_emb = self.test_embedding(after)
        vocab_emb = self.test_embedding.parameters()[0]
    #loss_before = layers.cross_entropy(pred_before, before, soft_label=False)
    #loss_after = layers.cross_entropy(pred_after, after, soft_label=False)
    # Reshape the vocab embedding to rank 5 so it broadcasts against the
    # rank-5 predictions below.
    vocab_emb = layers.reshape(
        vocab_emb, shape=[1, 1, 1, vocab_emb.shape[0], vocab_emb.shape[1]])
    new_shape = pred_before.shape[:-1] + (1, ) + pred_before.shape[-1:]
    pred_before = layers.reshape(pred_before, shape=new_shape)
    pred_after = layers.reshape(pred_after, shape=new_shape)
    # Scores of every vocabulary word (softmax denominator terms).
    prob_w_before = layers.reduce_sum(layers.elementwise_mul(
        pred_before, vocab_emb),
                                      dim=[0, 4])
    prob_w_after = layers.reduce_sum(layers.elementwise_mul(
        pred_after, vocab_emb),
                                     dim=[0, 4])
    prob_w_before = layers.reduce_sum(layers.exp(prob_w_before), dim=-1)
    prob_w_after = layers.reduce_sum(layers.exp(prob_w_after), dim=-1)
    new_shape = before_emb.shape[:-1] + (1, ) + before_emb.shape[-1:]
    before_emb = layers.reshape(before_emb, shape=new_shape)
    after_emb = layers.reshape(after_emb, shape=new_shape)
    # Scores of the actual before/after tokens (softmax numerator terms).
    pred_before = layers.reduce_sum(layers.elementwise_mul(
        pred_before, before_emb),
                                    dim=[0, 3, 4])
    pred_after = layers.reduce_sum(layers.elementwise_mul(
        pred_after, after_emb),
                                   dim=[0, 3, 4])
    # Softmax probability of the true context; 1e-6 guards the division.
    prob_before = layers.elementwise_div(layers.exp(pred_before),
                                         prob_w_before + 1e-6)
    prob_after = layers.elementwise_div(layers.exp(pred_after),
                                        prob_w_after + 1e-6)
    # Negative mean log-likelihood averaged over both directions.
    loss = -layers.reduce_mean(
        (layers.log(prob_after) + layers.log(prob_before)) / 2.0)
    return loss, reader
def beam_search():
    """Beam search function.

    Builds a static-graph While loop that decodes step by step: reads the
    previous ids/scores, computes logits via self.decode, suppresses EOS
    before min_out_len, applies length penalty and optional trigram
    blocking, then advances the beams with layers.beam_search.

    Returns:
        (finished_ids, finished_scores) from layers.beam_search_decode.
    """
    max_len = layers.fill_constant(shape=[1],
                                   dtype=start_tokens.dtype,
                                   value=self.max_out_len,
                                   force_cpu=True)
    min_len = layers.fill_constant(shape=[1],
                                   dtype=start_tokens.dtype,
                                   value=self.min_out_len)
    neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-INF)
    step_idx = layers.fill_constant(shape=[1],
                                    dtype=start_tokens.dtype,
                                    value=0,
                                    force_cpu=True)
    step_next_idx = layers.fill_constant(shape=[1],
                                         dtype=start_tokens.dtype,
                                         value=1,
                                         force_cpu=True)
    cond = layers.less_than(x=step_idx,
                            y=max_len)  # default force_cpu=True
    while_op = layers.While(cond)
    # array states will be stored for each step.
    ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
                             step_idx)
    scores = layers.array_write(init_scores, step_idx)
    # cell states will be overwrited at each step.
    # caches contains states of history steps in decoder self-attention
    # and static encoder output projections in encoder-decoder attention
    # to reduce redundant computation.
    caches = [
        {
            "k":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                dtype=enc_words_output.dtype,
                value=0),
            "v":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                dtype=enc_words_output.dtype,
                value=0),
            "static_k_word":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_words_output.dtype),
            "static_v_word":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_words_output.dtype),
            "static_k_sent":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_sents_output.dtype),
            "static_v_sent":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_sents_output.dtype)
        } for i in range(self._dec_n_layer)
    ]
    trigram_blocking = TrigramBlocking(start_tokens,
                                       self.tokenizer,
                                       use_fp16=self._use_fp16,
                                       beam_size=self.beam_size)
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        # Since beam_search_op dosen't enforce pre_ids' shape, we can do
        # inplace reshape here which actually change the shape of pre_ids.
        # pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # gather cell states corresponding to selected parent
        pre_src_words_attn_bias = layers.gather(tgt_src_words_attn_bias,
                                                index=parent_idx)
        pre_src_sents_attn_bias = layers.gather(tgt_src_sents_attn_bias,
                                                index=parent_idx)
        pre_graph_attn_bias = layers.gather(graph_attn_bias,
                                            index=parent_idx)
        # Current position id, broadcast across the batch via a ones tensor.
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_src_sents_attn_bias,  # cann't use lod tensor here
                value=1,
                shape=[-1, 1, 1],
                dtype=pre_ids.dtype),
            y=step_idx,
            axis=0)
        logits = self.decode(
            dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias,
                       pre_src_sents_attn_bias, pre_graph_attn_bias),
            enc_words_output=enc_words_output,
            enc_sents_output=enc_sents_output,
            caches=caches,
            gather_idx=parent_idx)
        # prevent generating end token if length less than min_out_len
        eos_index = layers.fill_constant(shape=[layers.shape(logits)[0]],
                                         dtype='int64',
                                         value=self.eos_idx)
        eos_index = fluid.one_hot(eos_index, depth=self.voc_size)
        less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len),
                                dtype='float32')
        # Adds -INF to the EOS logit while step_idx < min_len.
        less_val = layers.elementwise_mul(less_cond, neg_inf)
        eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
        revised_logits = layers.elementwise_add(logits, eos_val, axis=0)
        # topK reduction across beams, also contain special handle of
        # end beams and end sentences(batch reduction)
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(revised_logits), k=self.beam_size)
        # Roll-Back previous-scores for length-penalty
        # previous-scores has been length-penaltied, before this timestep
        # length-penalty, need roll-back
        # because of doing this, we need store the length-penaltied score
        # in `scores`
        # while calculating use the un-penaltied score
        # -> safe for step_idx == 0 (initialization state), because
        # previous-score == 0
        pre_timestep_length_penalty = fluid.layers.pow(
            ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0),
            self.len_penalty)
        pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
            pre_scores, pre_timestep_length_penalty)
        # calc trigram-blocking delta scores for current alive sequence
        if self.block_trigram:
            trigram_blocking.update_seq(pre_ids, parent_idx)
            trigram_blocking.expand_cand_seq(topk_indices)
            fluid.layers.py_func(func=trigram_blocking.blocking_forward,
                                 x=[
                                     trigram_blocking.cand_seq,
                                     trigram_blocking.id2is_full_token
                                 ],
                                 out=trigram_blocking.delta_score_out,
                                 backward_func=None)
            layers.Print(trigram_blocking.delta_score_out,
                         summarize=100,
                         message="trigram_blocking.delta_score_out")
            pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                x=trigram_blocking.delta_score_out,
                y=pre_scores_wo_len_penalty,
                axis=0)
        # => [N, topk]
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores_wo_len_penalty,
                                             axis=0)
        # Re-apply length penalty for the (next) timestep.
        cur_timestep_length_penalty = layers.pow(
            ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0),
            self.len_penalty)
        curr_scores = layers.elementwise_div(accu_scores,
                                             cur_timestep_length_penalty)
        # beam_search op uses lod to differentiate branches.
        curr_scores = layers.lod_reset(curr_scores, pre_ids)
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=curr_scores,
            beam_size=self.beam_size,
            end_id=self.eos_idx,
            return_parent_idx=True)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.increment(x=step_next_idx, value=1.0, in_place=True)
        # cell states(caches) have been updated in wrap_decoder,
        # only need to update beam search states here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(gather_idx, parent_idx)
        layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias)
        layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias)
        layers.assign(pre_graph_attn_bias, graph_attn_bias)
        # Continue while under max_len and at least one beam is alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)
    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)
    return finished_ids, finished_scores
def forward(self, x, y):
    # x and y are one frame apart.
    """Iterative dual-variable flow estimation between frames x and y.

    Runs self.n_iter primal-dual style updates: a thresholding step on the
    residual rho, a divergence-based update of the flow (u1, u2), and a
    normalized update of the dual fields p11..p22.

    Returns:
        (u1, u2): the two flow components, same shape as x.
    """
    u1 = zeros_like(x)
    u2 = zeros_like(x)
    l_t = self.l * self.t
    taut = self.a / self.t

    # Image gradients of the second frame (x- and y-direction convolutions).
    grad2_x = self.conv_img_grad(y)
    # grad2_x[:, :, :, 0] = 0.5 * (x[:, :, :, 1] - x[:, :, :, 0])
    # grad2_x[:, :, :, -1] = 0.5 * (x[:, :, :, -1] - x[:, :, :, -2])
    grad2_y = self.conv_img_grad2(y)
    # grad2_y[:, :, 0, :] = 0.5 * (x[:, :, 1, :] - x[:, :, 0, :])
    # grad2_y[:, :, -1, :] = 0.5 * (x[:, :, -1, :] - x[:, :, -2, :])

    # Dual variables.
    p11 = zeros_like(x)
    p12 = zeros_like(x)
    p21 = zeros_like(x)
    p22 = zeros_like(x)

    gsqx = grad2_x**2
    gsqy = grad2_y**2
    # Squared gradient magnitude; 1e-12 keeps later divisions finite.
    grad = gsqx + gsqy + 1e-12

    rho_c = y - grad2_x * u1 - grad2_y * u2 - x

    for i in range(self.n_iter):
        rho = rho_c + grad2_x * u1 + grad2_y * u2 + 1e-12

        # NOTE(review): these two zero tensors are dead — v1/v2 are
        # reassigned below before being read.
        v1 = zeros_like(x)
        v2 = zeros_like(x)

        # Three-way thresholding of rho against ±l_t*grad.
        mask1 = rho < -l_t * grad
        mask2 = rho > l_t * grad
        mask3 = logical_and(logical_not(logical_or(mask1, mask2)),
                            (grad > 1e-12))
        mask1 = cast(mask1, dtype='float32')
        mask2 = cast(mask2, dtype='float32')
        mask3 = cast(mask3, dtype='float32')
        mask1.stop_gradient = True
        mask2.stop_gradient = True
        mask3.stop_gradient = True
        # v1 = v1 + l_t * grad2_x * mask1 - l_t * grad2_x * mask2 - (rho / grad) * grad2_x * mask3
        # v2 = v2 + l_t * grad2_y * mask1 - l_t * grad2_y * mask2 - (rho / grad) * grad2_y * mask3
        v1 = elementwise_add(
            u1,
            elementwise_add(
                elementwise_mul(l_t * grad2_x, mask1),
                elementwise_add(
                    elementwise_mul(-l_t * grad2_x, mask2),
                    elementwise_mul(-elementwise_div(rho, grad),
                                    elementwise_mul(grad2_x, mask3)))))
        v2 = elementwise_add(
            u2,
            elementwise_add(
                elementwise_mul(l_t * grad2_y, mask1),
                elementwise_add(
                    elementwise_mul(-l_t * grad2_y, mask2),
                    elementwise_mul(-elementwise_div(rho, grad),
                                    elementwise_mul(grad2_y, mask3)))))
        del rho
        del mask1
        del mask2
        del mask3

        # NOTE(review): u1/u2 are added here a second time — they were
        # already the first operand of the elementwise_add chains above,
        # while the commented-out formulas start from v1 = zeros. Confirm
        # whether the double addition is intended.
        v1 += u1
        v2 += u2

        u1 = v1 + self.t * self.divergence(p11, p12)
        u2 = v2 + self.t * self.divergence(p21, p22)
        del v1
        del v2

        # NOTE(review): self-assignments — no-ops.
        u1 = u1
        u2 = u2

        u1x, u1y = self.forward_grad(u1)
        u2x, u2y = self.forward_grad(u2)

        # Dual updates, normalized by 1 + taut * |grad u|.
        p11 = (p11 + taut * u1x) / (
            1. + taut * sqrt(u1x**2 + u1y**2 + 1e-12))
        p12 = (p12 + taut * u1y) / (
            1. + taut * sqrt(u1x**2 + u1y**2 + 1e-12))
        p21 = (p21 + taut * u2x) / (
            1. + taut * sqrt(u2x**2 + u2y**2 + 1e-12))
        p22 = (p22 + taut * u2y) / (
            1. + taut * sqrt(u2x**2 + u2y**2 + 1e-12))
        del u1x
        del u1y
        del u2x
        del u2y
    return u1, u2
def _dygraph_clip(self, params_grads):
    """Clip gradients by global norm across hybrid-parallel groups.

    Squared-gradient sums are bucketed by dtype (fp16/fp32) and by whether
    the parameter is distributed; the distributed part is all-reduced over
    the model-parallel check group, the non-distributed part over the
    pipeline and sharding groups, and every gradient is then scaled by
    clip_norm / max(global_norm, clip_norm).

    Args:
        params_grads: iterable of (parameter, gradient) pairs.

    Returns:
        New list of (parameter, clipped_gradient) pairs.
    """
    params_and_grads = []
    sum_square_dist_fp16 = []
    sum_square_dist_fp32 = []
    sum_square_not_dist_fp16 = []
    sum_square_not_dist_fp32 = []

    for p, g in params_grads:
        if g is None:
            continue
        if getattr(p, 'need_clip', True) is False:
            continue
        merge_grad = g
        if g.type == core.VarDesc.VarType.SELECTED_ROWS:
            # Densify sparse gradients before squaring.
            merge_grad = layers.merge_selected_rows(g)
            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
        square = layers.square(merge_grad)
        sum_square = layers.reduce_sum(square)

        # Only count a pipeline-shared parameter on the rank that holds it
        # first, so it is not double-counted in the global norm.
        not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or (
            hasattr(p, 'is_firstly_shared')
            and getattr(p, 'is_firstly_shared', True))

        if not_shared_enable:
            if p.is_distributed:
                if p.dtype == paddle.float16:
                    sum_square_dist_fp16.append(sum_square)
                elif p.dtype == paddle.float32:
                    sum_square_dist_fp32.append(sum_square)
            else:
                if p.dtype == paddle.float16:
                    sum_square_not_dist_fp16.append(sum_square)
                elif p.dtype == paddle.float32:
                    sum_square_not_dist_fp32.append(sum_square)

    # global norm of distributed FP16 params_and_grads
    if len(sum_square_dist_fp16) == 0:
        global_norm_dist_fp16 = paddle.to_tensor([0.],
                                                 dtype=paddle.float32)
    else:
        global_norm_dist_fp16 = layers.concat(sum_square_dist_fp16)
        global_norm_dist_fp16 = layers.reduce_sum(global_norm_dist_fp16)
        # Accumulate in fp32 to avoid fp16 overflow.
        global_norm_dist_fp16 = paddle.cast(global_norm_dist_fp16,
                                            dtype=paddle.float32)

    # global norm of non-distributed FP16 params_and_grads
    if len(sum_square_not_dist_fp16) == 0:
        global_norm_not_dist_fp16 = paddle.to_tensor([0.],
                                                     dtype=paddle.float32)
    else:
        global_norm_not_dist_fp16 = layers.concat(sum_square_not_dist_fp16)
        global_norm_not_dist_fp16 = layers.reduce_sum(
            global_norm_not_dist_fp16)
        global_norm_not_dist_fp16 = paddle.cast(global_norm_not_dist_fp16,
                                                dtype=paddle.float32)

    # global norm of distributed FP32 params_and_grads
    global_norm_dist_fp32 = layers.concat(sum_square_dist_fp32) if len(
        sum_square_dist_fp32) != 0 else paddle.to_tensor(
            [0.], dtype=paddle.float32)
    global_norm_dist_fp32 = layers.reduce_sum(global_norm_dist_fp32)

    # global norm of non-distributed FP32 params_and_grads
    global_norm_not_dist_fp32 = layers.concat(
        sum_square_not_dist_fp32
    ) if len(sum_square_not_dist_fp32) != 0 else paddle.to_tensor(
        [0.], dtype=paddle.float32)
    global_norm_not_dist_fp32 = layers.reduce_sum(
        global_norm_not_dist_fp32)

    global_norm_var_dist = global_norm_dist_fp16 + global_norm_dist_fp32
    global_norm_var_not_dist = global_norm_not_dist_fp16 + global_norm_not_dist_fp32

    # add all reduce to get global norm of distributed params_and_grads
    if self._hcg.get_model_parallel_world_size() > 1:
        paddle.distributed.all_reduce(
            global_norm_var_dist,
            group=self._hcg.get_check_parallel_group())

    # add all reduce to get global norm of non-distributed params_and_grads in groups of pp
    if self._hcg.get_pipe_parallel_world_size() > 1:
        paddle.distributed.all_reduce(
            global_norm_var_not_dist,
            group=self._hcg.get_pipe_parallel_group())

    # In Sharding mode, param and grad is mapping different rank in optimizer.
    # ClipGradByGlobalNorm need allreduce to get globol norm
    if self._hcg.get_sharding_parallel_world_size() > 1:
        paddle.distributed.all_reduce(
            global_norm_var_not_dist,
            group=self._hcg.get_sharding_parallel_group())

    global_norm_var_fp32 = layers.sqrt(global_norm_var_dist +
                                       global_norm_var_not_dist)

    max_global_norm = layers.fill_constant(
        shape=[1], dtype=global_norm_var_fp32.dtype, value=self.clip_norm)
    # Scale factor clip_norm / max(norm, clip_norm) — 1.0 when within limit.
    clip_var = layers.elementwise_div(x=max_global_norm,
                                      y=layers.elementwise_max(
                                          x=global_norm_var_fp32,
                                          y=max_global_norm))
    clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
    for p, g in params_grads:
        if g is None:
            continue
        if getattr(p, 'need_clip', True) is False:
            params_and_grads.append((p, g))
            continue
        if p.dtype == paddle.float16:
            new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16)
        else:
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
        params_and_grads.append((p, new_grad))

    return params_and_grads
def _dygraph_clip(self, params_grads):
    """Clip gradients by global norm, handling MoE expert params separately.

    Expert parameters' squared norm is all-reduced over self.moe_group and
    combined with the normal parameters' norm before computing the clip
    coefficient clip_norm / max(global_norm, clip_norm).

    Args:
        params_grads: iterable of (parameter, gradient) pairs.

    Returns:
        New list of (parameter, clipped_gradient) pairs, or params_grads
        unchanged when no norm could be computed.
    """
    normal_params_grads = []
    moe_params_grads = []

    # separate moe params from normal params
    if self.moe_group is not None and self.moe_group.nranks > 1:
        for p, g in params_grads:
            if self.is_expert_param_func(p):
                moe_params_grads.append((p, g))
            else:
                normal_params_grads.append((p, g))
    else:
        normal_params_grads = params_grads

    # why to return sum_dtype?
    # we will call `get_l2_norm_pow` twice and the precisions may be different.
    # For convenience and simplification, we use sum_dtype directly instead of global_norm_var_normal.dtype
    global_norm_var_normal, sum_dtype \
        = self.get_l2_norm_pow(normal_params_grads)
    global_norm_var_moe = None
    if len(moe_params_grads) > 0:
        global_norm_var_moe, _ \
            = self.get_l2_norm_pow(moe_params_grads, sum_dtype)
        if global_norm_var_moe is not None:
            # Sum the expert norm contribution across the MoE group.
            collective.all_reduce(global_norm_var_moe,
                                  op=collective.ReduceOp.SUM,
                                  group=self.moe_group)

    if global_norm_var_normal is None and global_norm_var_moe is None:
        return params_grads
    elif global_norm_var_normal is None:
        global_norm_var = global_norm_var_moe
    elif global_norm_var_moe is None:
        global_norm_var = global_norm_var_normal
    else:
        if global_norm_var_normal.dtype != global_norm_var_moe.dtype:
            # compared with normal norm, moe norm is the later one,
            # so its precision is no lower than normal norm
            global_norm_var_normal = \
                global_norm_var_normal.astype(global_norm_var_moe.dtype)
        global_norm_var = global_norm_var_normal + global_norm_var_moe

    params_and_grads = []
    global_norm_var = layers.sqrt(global_norm_var)
    max_global_norm = layers.fill_constant(shape=[1],
                                           dtype=global_norm_var.dtype,
                                           value=self.clip_norm)
    # Scale factor clip_norm / max(norm, clip_norm) — 1.0 when within limit.
    clip_var = layers.elementwise_div(x=max_global_norm,
                                      y=layers.elementwise_max(
                                          x=global_norm_var,
                                          y=max_global_norm))
    for p, g in params_grads:
        if g is None:
            continue
        if getattr(p, 'need_clip', True) is False:
            params_and_grads.append((p, g))
            continue
        # TODO(wangxi): use inplace elementwise_mul
        clip_input = (clip_var.astype('float16')
                      if g.dtype == core.VarDesc.VarType.FP16 else clip_var)
        new_grad = layers.elementwise_mul(x=g, y=clip_input)
        params_and_grads.append((p, new_grad))

    return params_and_grads
def log_fun(x, y):
    # Computes x * (x / y).
    # NOTE(review): despite the name, there is no F.log here — this is
    # x^2 / y, not x * log(x / y). Confirm with callers whether a log was
    # intended before changing anything.
    return F.elementwise_mul(x, (F.elementwise_div(x, y)))
def compute_weight(v, g, dim, power):
    """Weight-normalization recombination: w = g * v / ||v|| along `dim`."""
    assert len(g.shape) == 1, "magnitude should be a vector"
    # Norm of `v` over all axes except `dim`; the epsilon keeps the
    # division finite when the direction vector is (near) zero.
    denom = norm_except(v, dim, power) + 1e-12
    unit_direction = F.elementwise_div(v, denom, axis=dim)
    return F.elementwise_mul(unit_direction, g, axis=dim)
def fast_decode(self):
    """create model for inference.

    Builds the static-graph beam-search decoding loop for the UNIMO
    generator: input data layers + DataLoader, the encoder call, then a
    While loop that decodes one token per step with length penalty and
    optional trigram blocking.

    Returns:
        (pyreader, graph_vars): the DataLoader and a dict with
        'finished_ids', 'finished_scores' and 'data_ids' (all persistable).
    """
    if self.task_type == "dialog":
        emb_num = 4
    else:
        emb_num = 3
    input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
                   [[-1, self.max_seq_len, self.max_seq_len]]
    input_dtypes = ['int64'] * emb_num + ['float32']
    input_lod_levels = [0] * emb_num + [0]

    shapes = input_shapes + [[-1, 1, 1], [-1, 1, 1], [-1, 1], [-1],
                             [-1, 1, self.max_seq_len], [-1, 1]]
    dtypes = input_dtypes + [
        'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
    ]
    lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

    inputs = self.to_tensor(shapes, dtypes, lod_levels)
    pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                  capacity=70,
                                                  iterable=False)

    emb_ids = {}
    for key, value in zip(self.emb_keys, inputs[:emb_num]):
        emb_ids[key] = value
    input_mask = inputs[emb_num]
    # Decoding-state inputs are the last six tensors.
    tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
        -6:]

    unimo = UNIMOModel(emb_ids=emb_ids,
                       input_mask=input_mask,
                       config=self.gene_config,
                       task_type=self.task_type,
                       decoding=True,
                       gather_idx=parent_idx)

    max_len = layers.fill_constant(shape=[1],
                                   dtype=tgt_ids.dtype,
                                   value=self.max_out_len,
                                   force_cpu=True)
    min_len = layers.fill_constant(shape=[1],
                                   dtype=tgt_ids.dtype,
                                   value=self.min_out_len,
                                   force_cpu=True)
    neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-1e18)
    step_idx = layers.fill_constant(shape=[1],
                                    dtype=tgt_ids.dtype,
                                    value=0,
                                    force_cpu=True)
    step_next_idx = layers.fill_constant(shape=[1],
                                         dtype=tgt_ids.dtype,
                                         value=1,
                                         force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)

    # Per-step beam state stored in tensor arrays.
    ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
    pos_biases = layers.array_write(tgt_pos, step_idx)
    scores = layers.array_write(init_scores, step_idx)
    tgt_masks = layers.array_write(tgt_input_mask, step_idx)

    trigram_blocking = TrigramBlocking(tgt_ids,
                                       self.tokenizer,
                                       beam_size=self.beam_size)

    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        pos_bias = layers.array_read(array=pos_biases, i=step_idx)
        pos_bias = layers.gather(input=pos_bias, index=parent_idx)

        def gen_batch_like(value,
                           dtype="int64",
                           shape=[-1, 1, 1],
                           is_scalar=True):
            """generate batch"""
            if is_scalar:
                return layers.fill_constant_batch_size_like(
                    input=parent_idx, value=value, shape=shape, dtype=dtype)
            else:
                return layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=parent_idx, value=1, shape=shape,
                        dtype=dtype),
                    y=value,
                    axis=0)

        tmp_mask = layers.array_read(tgt_masks, i=step_idx)
        tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
        # Extend the attention mask by one column for the new token.
        append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
        pre_mask = layers.concat([tmp_mask, append_1_mask], axis=2)
        pre_pos = gen_batch_like(step_idx, is_scalar=False)
        pre_pos = pre_pos + pos_bias  ####################### pos start from 2
        pre_sent = gen_batch_like(self.tgt_type_id, dtype=pre_ids.dtype)

        dec_emb_ids = {"word_embedding": pre_ids, "pos_embedding": pre_pos}
        if self.task_type == "dialog":
            role_ids = gen_batch_like(0)
            turn_ids = gen_batch_like(0)
            dec_emb_ids["role_embedding"] = role_ids
            dec_emb_ids["turn_embedding"] = turn_ids
        else:
            dec_emb_ids["sent_embedding"] = pre_sent

        dec_out = unimo.encode(emb_ids=dec_emb_ids,
                               input_mask=pre_mask,
                               gather_idx=parent_idx)
        fc_out = self.cal_logit(dec_out, None)

        # prevent generating end token if length less than min_out_len
        eos_index = layers.fill_constant(shape=[layers.shape(fc_out)[0]],
                                         dtype='int64',
                                         value=self.eos_id)
        eos_index = fluid.one_hot(eos_index, depth=self.vocab_size)
        less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len),
                                dtype='float32')
        # Adds -1e18 to the EOS logit while step_idx < min_len.
        less_val = layers.elementwise_mul(less_cond, neg_inf)
        eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
        revised_logits = layers.elementwise_add(fc_out, eos_val, axis=0)

        # topK reduction across beams, also contain special handle of
        # end beams and end sentences(batch reduction)
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(revised_logits), k=self.beam_size)

        # Roll-Back previous-scores for length-penalty
        # previous-scores has been length-penaltied, before this timestep
        # length-penalty, need roll-back
        # because of doing this, we need store the length-penaltied score
        # in `scores`
        # while calculating use the un-penaltied score
        # -> safe for step_idx == 0 (initialization state), because
        # previous-score == 0
        pre_timestep_length_penalty = fluid.layers.pow(
            ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0),
            self.length_penalty)
        pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
            pre_scores, pre_timestep_length_penalty)

        # calc trigram-blocking delta scores for current alive sequence
        if self.block_trigram:
            trigram_blocking.update_seq(pre_ids, parent_idx)
            trigram_blocking.expand_cand_seq(topk_indices)
            fluid.layers.py_func(func=trigram_blocking.blocking_forward,
                                 x=[
                                     trigram_blocking.cand_seq,
                                     trigram_blocking.id2is_full_token
                                 ],
                                 out=trigram_blocking.delta_score_out,
                                 backward_func=None)
            pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                x=trigram_blocking.delta_score_out,
                y=pre_scores_wo_len_penalty,
                axis=0)
        # => [N, topk]
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores_wo_len_penalty,
                                             axis=0)
        # Re-apply length penalty for the next timestep.
        cur_timestep_length_penalty = layers.pow(
            ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0),
            self.length_penalty)
        curr_scores = layers.elementwise_div(accu_scores,
                                             cur_timestep_length_penalty)

        # beam_search op uses lod to differentiate branches.
        curr_scores = layers.lod_reset(curr_scores, pre_ids)
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=curr_scores,
            beam_size=self.beam_size,
            end_id=self.eos_id,
            return_parent_idx=True)

        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.increment(x=step_next_idx, value=1.0, in_place=True)
        # cell states(caches) have been updated in wrap_decoder,
        # only need to update beam search states here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.array_write(pre_mask, i=step_idx, array=tgt_masks)
        layers.array_write(pos_bias, i=step_idx, array=pos_biases)
        layers.assign(gather_idx, parent_idx)

        # Continue while under max_len and at least one beam is alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=self.beam_size, end_id=self.eos_id)

    graph_vars = {
        "finished_ids": finished_ids,
        "finished_scores": finished_scores,
        "data_ids": data_ids
    }
    for k, v in graph_vars.items():
        v.persistable = True
    return pyreader, graph_vars