def magnitude(self, x):
    """Compute the magnitude spectrogram.

    Args:
        x (tuple of Variable): (real, imag), where
            real (Variable): shape(B, C, 1, T), dtype float32, the real part of the spectrogram.
            imag (Variable): shape(B, C, 1, T), dtype float32, the imaginary part of the spectrogram.

    Returns:
        Variable: shape(B, C, 1, T), dtype float32, the magnitude spectrogram.
            It is the square root of the power spectrogram.
    """
    power = self.power(x)
    magnitude = F.sqrt(power)
    return magnitude
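# A minimal numpy sketch (illustrative, not part of the module above) of the
# relationship this method relies on: the magnitude spectrogram is the square
# root of the power spectrogram, i.e. sqrt(real**2 + imag**2) element-wise.
import numpy as np

real = np.random.randn(2, 1, 1, 8).astype("float32")
imag = np.random.randn(2, 1, 1, 8).astype("float32")
power = real**2 + imag**2
magnitude = np.sqrt(power)
assert np.allclose(magnitude, np.hypot(real, imag), atol=1e-6)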
def forward(self, pred, target):
    target = 1 - target[:, 0]
    batch_size, vector_size = pred.shape[0], pred.shape[1]
    pred = L.l2_normalize(pred, axis=1, epsilon=1e-10)
    square_norm = L.reduce_sum(L.square(pred), dim=1)
    dist = L.elementwise_add(-2.0 * L.matmul(pred, pred, transpose_y=True),
                             square_norm,
                             axis=0)
    dist = L.elementwise_add(dist, square_norm, axis=1)
    dist = L.elementwise_max(dist, L.zeros_like(dist))
    dist = L.sqrt(dist)

    ap_dist = L.reshape(dist, (0, 0, 1))
    an_dist = L.reshape(dist, (0, 1, -1))
    loss = L.expand(ap_dist, (1, 1, batch_size)) - L.expand(
        an_dist, (1, batch_size, 1)) + self.margin

    indice_equal = L.diag(
        L.fill_constant((batch_size, ), dtype='float32', value=1.0))
    indice_not_equal = 1.0 - indice_equal
    broad_matrix = L.expand(L.reshape(target, (-1, 1)),
                            (1, batch_size)) + L.expand(
                                L.reshape(target, (1, -1)), (batch_size, 1))
    pp = L.cast(L.equal(broad_matrix, L.zeros_like(broad_matrix)),
                dtype='float32')
    pp = L.reshape(indice_not_equal * pp, (0, 0, 1))
    pn = L.cast(L.equal(broad_matrix, L.zeros_like(broad_matrix) + 1),
                dtype='float32')
    pn = L.reshape(indice_not_equal * pn, (1, 0, -1))
    apn = L.expand(pp, (1, 1, batch_size)) * L.expand(pn, (batch_size, 1, 1))

    loss = loss * L.cast(apn, dtype='float32')
    loss = L.elementwise_max(loss, L.zeros_like(loss))
    num_tri = L.reduce_sum(
        L.cast(L.greater_than(loss, L.zeros_like(loss)), dtype='float32'))
    loss = L.reduce_sum(loss) * self.loss_weight / (num_tri + 1e-16)
    return loss
def func(self, place):
    shape = [2, 3, 7, 9]
    eps = 0.0001
    dtype = np.float64

    x = layers.data('x', shape, False, dtype)
    x.persistable = True
    y = layers.sqrt(x)
    x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)

    gradient_checker.double_grad_check([x],
                                       y,
                                       x_init=x_arr,
                                       place=place,
                                       eps=eps)
    gradient_checker.double_grad_check_for_dygraph(self.sqrt_wrapper, [x],
                                                   y,
                                                   x_init=x_arr,
                                                   place=place)
def norm_except_dim(p, dim):
    shape = p.shape
    ndims = len(shape)
    if dim is None:
        return F.sqrt(F.reduce_sum(F.square(p)))
    elif dim == 0:
        p_matrix = F.reshape(p, (shape[0], -1))
        return l2_norm(p_matrix, axis=1)
    elif dim == -1 or dim == ndims - 1:
        p_matrix = F.reshape(p, (-1, shape[-1]))
        return l2_norm(p_matrix, axis=0)
    else:
        perm = list(range(ndims))
        perm[0] = dim
        perm[dim] = 0
        p_transposed = F.transpose(p, perm)
        return norm_except_dim(p_transposed, 0)
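# A numpy sketch (illustrative) of what norm_except_dim computes in the
# dim == 0 case used by weight normalization: one L2 norm per dim-0 slice
# of the parameter tensor.
import numpy as np

p = np.random.randn(4, 3, 5).astype("float32")
flat = p.reshape(p.shape[0], -1)         # (4, 15): one row per dim-0 slice
norms = np.sqrt((flat**2).sum(axis=1))   # (4,): L2 norm of each slice
assert np.allclose(norms, np.linalg.norm(flat, axis=1), atol=1e-5)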
def communicate_avg_loss():
    communicate()
    self._generate_avg_loss(main_block, loss, avg_loss)
    next_local_steps = layers.cast(layers.ceil(
        layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) *
                    float(init_k_steps))),
                                   dtype='int64')
    max_local_steps = layers.fill_constant(shape=[1],
                                           dtype='int64',
                                           value=16)
    min_local_steps = layers.fill_constant(shape=[1],
                                           dtype='int64',
                                           value=1)
    next_local_steps = layers.elementwise_min(next_local_steps,
                                              max_local_steps)
    next_local_steps = layers.elementwise_max(next_local_steps,
                                              min_local_steps)
    layers.assign(next_local_steps, k_steps)
def build_program(self, dtype):
    with fluid.program_guard(self.main_program, self.startup_program):
        self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 3)
        self.feed_vars.append(
            fluid.data(name="data3", shape=[128, 128], dtype=dtype))

        # subgraph with 2 op nodes
        tmp_0 = layers.sum(
            [self.feed_vars[0], self.feed_vars[1], self.feed_vars[2]])
        tmp_1 = layers.sqrt(tmp_0)
        tmp_2 = layers.mul(tmp_0, self.feed_vars[3])
        # subgraph with 2 op nodes
        tmp_3 = layers.square(layers.sum([tmp_1, tmp_2]))

    self.append_gradients(tmp_3)

    self.num_fused_ops = 4
    self.fetch_list = [tmp_3, self.grad(tmp_0)]
def forward(self, output1, output2, label):
    """
    :param output1: [n, 128]
    :param output2: [n, 128]
    :param label: [n, 1]
    :return: [1]
    """
    distance = layers.elementwise_sub(output1, output2)
    distance = layers.square(distance)
    euclidean_distance = layers.reduce_sum(distance, dim=1, keep_dim=True)
    euclidean_distance = layers.sqrt(euclidean_distance)
    loss_contrastive = layers.elementwise_mul(
        1 - label, layers.square(euclidean_distance),
        axis=0) + layers.elementwise_mul(
            label,
            layers.square(
                layers.clamp(self.margin - euclidean_distance, min=0.0)),
            axis=0)
    return loss_contrastive, euclidean_distance.numpy(), label.numpy()
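# A numpy sketch (illustrative) of the contrastive loss computed above:
# (1 - y) * d**2 + y * max(margin - d, 0)**2, where d is the pairwise
# Euclidean distance. Shapes and the margin value here are assumptions.
import numpy as np

margin = 1.0
out1 = np.random.randn(4, 128).astype("float32")
out2 = np.random.randn(4, 128).astype("float32")
label = np.random.randint(0, 2, size=(4, 1)).astype("float32")

d = np.sqrt(((out1 - out2)**2).sum(axis=1, keepdims=True))  # [n, 1]
loss = (1 - label) * d**2 + label * np.clip(margin - d, 0.0, None)**2
print(loss.mean())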
def _dygraph_clip_by_global_norm(self, params_grads):
    params_and_grads = []
    sum_square_list = []
    for p, g in params_grads:
        if g is None:
            continue
        if self._need_clip_func is not None and not self._need_clip_func(p):
            continue
        merge_grad = g
        if g.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = layers.merge_selected_rows(g)
            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
        square = layers.square(merge_grad)
        sum_square = layers.reduce_sum(square)
        sum_square_list.append(sum_square)

    # all parameters have been filtered out
    if len(sum_square_list) == 0:
        return params_grads

    global_norm_var = layers.concat(sum_square_list)
    global_norm_var = layers.reduce_sum(global_norm_var)
    global_norm_var = layers.sqrt(global_norm_var)
    max_global_norm = layers.fill_constant(shape=[1],
                                           dtype='float32',
                                           value=self.clip_norm)
    clip_var = layers.elementwise_div(x=max_global_norm,
                                      y=layers.elementwise_max(
                                          x=global_norm_var,
                                          y=max_global_norm))
    for p, g in params_grads:
        if g is None:
            continue
        if self._need_clip_func is not None and not self._need_clip_func(p):
            params_and_grads.append((p, g))
            continue
        new_grad = layers.elementwise_mul(x=g, y=clip_var)
        params_and_grads.append((p, new_grad))
    return params_and_grads
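# A numpy sketch (illustrative) of global-norm gradient clipping as done
# above: every gradient is scaled by clip_norm / max(global_norm, clip_norm),
# so the combined norm never exceeds clip_norm.
import numpy as np

clip_norm = 1.0
grads = [np.random.randn(3, 3), np.random.randn(5)]
global_norm = np.sqrt(sum((g**2).sum() for g in grads))
scale = clip_norm / max(global_norm, clip_norm)
clipped = [g * scale for g in grads]
assert np.sqrt(sum((g**2).sum() for g in clipped)) <= clip_norm + 1e-6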
def graph_norm(gw, feature):
    """Implementation of graph normalization.

    Reference paper: BENCHMARKING GRAPH NEURAL NETWORKS

    Each node's features are divided by sqrt(num_nodes) of its graph.

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)
        feature: A tensor with shape (num_nodes, hidden_size)

    Return:
        A tensor with shape (num_nodes, hidden_size)
    """
    nodes = L.fill_constant([gw.num_nodes, 1], dtype="float32", value=1.0)
    norm = graph_pooling(gw, nodes, pool_type="sum")
    norm = L.sqrt(norm)
    feature_lod = op.nested_lod_reset(feature, gw.graph_lod)
    norm = L.sequence_expand_as(norm, feature_lod)
    norm.stop_gradient = True
    return feature_lod / norm
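# A numpy sketch (illustrative) of graph normalization for a batch of two
# graphs: each node's features are divided by sqrt(num_nodes) of the graph
# the node belongs to. graph_sizes is an assumption standing in for the
# graph wrapper's lod information.
import numpy as np

graph_sizes = [3, 2]                      # two graphs with 3 and 2 nodes
feature = np.random.randn(5, 4).astype("float32")
scale = np.concatenate(
    [np.full((n, 1), np.sqrt(n), dtype="float32") for n in graph_sizes])
normalized = feature / scale              # (num_nodes, hidden_size)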
def forward(self, tenFirst, tenSecond, tenFeaturesFirst, tenFeaturesSecond,
            tenFlow):
    b, _, h, w = tenFlow.shape
    tenDifference = tenFirst - backwarp(tenInput=tenSecond,
                                        tenFlow=tenFlow * self.fltBackward)
    tenDifference = L.pow(tenDifference, 2)
    tenDifference = L.reduce_sum(tenDifference, 1, True)  # [b, 1, h, w]
    tenDifference = L.sqrt(tenDifference).detach()

    tenFeaturesFirst = self.moduleFeat(tenFeaturesFirst)

    tenMean = L.reshape(tenFlow, (b, 2, -1))  # [b, 2, h * w]
    tenMean = L.reduce_mean(tenMean, 2, True)  # [b, 2, 1]
    tenMean = L.reshape(tenMean, (b, 2, 1, 1))  # [b, 2, 1, 1]
    tenMean = L.expand(tenMean, (1, 1, h, w))  # [b, 2, h, w]
    delta = tenFlow - tenMean

    diff = L.concat([tenDifference, delta, tenFeaturesFirst], 1)
    tenDist = self.moduleDist(self.moduleMain(diff))
    tenDist = L.pow(tenDist, 2.0) * -1.0
    tenDist = tenDist - L.reduce_max(tenDist, 1, True)
    tenDist = L.exp(tenDist)

    tenDivisor = L.reduce_sum(tenDist, 1, True)
    tenDivisor = L.reciprocal(tenDivisor)

    tenScaleX = L.unfold(x=tenFlow[:, 0:1, :, :],
                         kernel_sizes=self.intUnfold,
                         strides=1,
                         paddings=int((self.intUnfold - 1) / 2))  # [b, c, h * w]
    tenScaleX = L.reshape(tenScaleX, (b, -1, h, w))  # [b, c, h, w]
    tenScaleX = self.moduleScaleX(tenDist * tenScaleX) * tenDivisor

    tenScaleY = L.unfold(x=tenFlow[:, 1:2, :, :],
                         kernel_sizes=self.intUnfold,
                         strides=1,
                         paddings=int((self.intUnfold - 1) / 2))  # [b, c, h * w]
    tenScaleY = L.reshape(tenScaleY, (b, -1, h, w))  # [b, c, h, w]
    tenScaleY = self.moduleScaleY(tenDist * tenScaleY) * tenDivisor

    return L.concat([tenScaleX, tenScaleY], 1)
import paddle.fluid as fluid
import numpy as np
import paddle.fluid.layers as L


def gen_data():
    return {
        "x": np.random.randint(1, 5, size=[8, 10]).astype('float32'),
        "y": np.random.randint(1, 5, size=[10]).astype('float32'),
    }


x = fluid.layers.data(name="x", shape=[8, 10], dtype='float32')
y = fluid.layers.data(name="y", shape=[10], dtype='float32')
mm = L.sqrt(L.reduce_sum(L.elementwise_mul(x, x), dim=0))
kk = L.ones_like(y)
z = fluid.layers.elementwise_div(x, mm, axis=1)  # z = x / mm

place = fluid.CPUPlace()
exe = fluid.Executor(place)
z_value = exe.run(feed=gen_data(), fetch_list=[z.name])
print(z_value)
epsilon = 1e-5
N = 2
C = 2
H = 2
W = 2
HW = H * W

x = paddle.randn((N, C, H, W))
x.stop_gradient = False

U = fluid.layers.reduce_mean(x, dim=[2, 3], keep_dim=True)  # [N, C, 1, 1]
V = fluid.layers.reduce_mean(fluid.layers.square(x - U),
                             dim=[2, 3],
                             keep_dim=True)  # [N, C, 1, 1]
normX = (x - U) / L.sqrt(V + epsilon)

Var1 = (x - U)
Var2 = 1.0 / L.sqrt(V + epsilon)
Var3 = (x - U) * 1.0 / L.sqrt(V + epsilon)

dUdx = paddle.grad(outputs=[U], inputs=[x], create_graph=True,
                   retain_graph=True)[0]
dVdx = paddle.grad(outputs=[V], inputs=[x], create_graph=True,
                   retain_graph=True)[0]
dnormXdx = paddle.grad(outputs=[normX], inputs=[x], create_graph=True,
                       retain_graph=True)[0]
def norm(inputs, dim):
    tp = [1, 0]
    mm = L.sqrt(L.reduce_sum(L.elementwise_mul(inputs, inputs), dim=-dim))
    h = L.elementwise_div(inputs, mm, axis=tp[dim])
    return h
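# A numpy sketch (illustrative) of what norm does for a 2-D input with
# dim=0: reduce over dim 0 to get one L2 norm per column, then divide each
# column by its norm.
import numpy as np

inputs = np.random.randn(3, 4).astype("float32")
col_norms = np.sqrt((inputs**2).sum(axis=0))   # one norm per column
normalized = inputs / col_norms                # columns now have unit norm
assert np.allclose(np.linalg.norm(normalized, axis=0), 1.0, atol=1e-5)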
def forward(self, x):
    if self.training:
        N, C, H, W = x.shape
        NHW = N * H * W

        # Option 1: plain multiplication
        # U = fluid.layers.reduce_mean(x, dim=[0, 2, 3], keep_dim=True)  # [1, C, 1, 1]
        # V = fluid.layers.reduce_mean(fluid.layers.square(x - U), dim=[0, 2, 3], keep_dim=True)  # [1, C, 1, 1]
        # normX = (x - U) / L.sqrt(V + self.epsilon)  # [N, C, H, W]
        # scale = L.unsqueeze(self.weight, [0, 2, 3])
        # bias = L.unsqueeze(self.bias, [0, 2, 3])
        # out = normX * scale + bias
        # U = L.reshape(U, (-1, ))
        # V = L.reshape(V, (-1, ))

        # Option 2: replace the multiplication with a grouped convolution
        # out = W*(x - U)/s + B = (W/s) * x + B - (W/s)*U
        U = fluid.layers.reduce_mean(x, dim=[0, 2, 3], keep_dim=False)  # [C, ]
        if self.special_kernel is None:
            # used to compute (x - U) quickly
            special_kernel = np.ones((self.num_features, 1, 1, 1), np.float32)
            self.special_kernel = paddle.to_tensor(special_kernel)
            self.special_kernel.stop_gradient = True
        V = F.conv2d(x, self.special_kernel, -U,
                     groups=self.num_features)  # computes (x - U) quickly
        V = fluid.layers.reduce_mean(fluid.layers.square(V),
                                     dim=[0, 2, 3],
                                     keep_dim=False)  # [C, ]
        std = L.sqrt(V + self.epsilon)  # [C, ]
        A = self.weight / std  # [C, ]
        B = self.bias - U * A  # [C, ]
        A = L.unsqueeze(A, [1, 2, 3])  # [C, 1, 1, 1]
        out = F.conv2d(x, A, B, groups=self.num_features)

        curr_U = U.numpy()
        curr_V = V.numpy()
        state_dict = self.state_dict()
        momentum = self.momentum
        _mean = self._mean.numpy() * momentum + curr_U * (1. - momentum)
        _variance = self._variance.numpy() * momentum + curr_V * (1. - momentum)
        state_dict['_mean'] = _mean.astype(np.float32)
        state_dict['_variance'] = _variance.astype(np.float32)
        self.set_state_dict(state_dict)
        self.A = None
        self.B = None
    else:
        # Option 1: plain multiplication
        # U = L.unsqueeze(self._mean, [0, 2, 3])  # [1, C, 1, 1]
        # V = L.unsqueeze(self._variance, [0, 2, 3])  # [1, C, 1, 1]
        # normX = (x - U) / L.sqrt(V + self.epsilon)  # [N, C, H, W]
        # scale = L.unsqueeze(self.weight, [0, 2, 3])
        # bias = L.unsqueeze(self.bias, [0, 2, 3])
        # out = normX * scale + bias

        # Option 2: replace the multiplication with a grouped convolution
        # out = W*(x - U)/s + B = (W/s) * x + B - (W/s)*U
        if self.A is None:
            std = L.sqrt(self._variance + self.epsilon)  # [C, ]
            A = self.weight / std  # [C, ]
            B = self.bias - self._mean * A  # [C, ]
            A = L.unsqueeze(A, [1, 2, 3])  # [C, 1, 1, 1]
            self.A = A
            self.B = B
        out = F.conv2d(x, self.A, self.B, groups=self.num_features)
    return out
def forward(self, input, conv, conv_g):
    # deal with the weight and grad of self.pre_dxdw!
    self._check_input_dim(input)
    N, C, H, W = input.shape
    NHW = N * H * W
    y = input  # [N, C, H, W]
    weight = conv.weight

    # burnin
    if self.training and self.burnin > 0:
        self.iter_count += 1
        self._update_buffer_num()

    if self.buffer_num > 0 and self.training and (
            not input.stop_gradient):  # some layers are frozen!
        # compute the current batch mu and sigma
        cur_mu = L.reduce_mean(y, dim=[0, 2, 3], keep_dim=False)  # [C, ]
        if self.special_kernel is None:
            # used to compute (x - cur_mu) quickly
            special_kernel = np.ones((self.num_features, 1, 1, 1), np.float32)
            self.special_kernel = paddle.to_tensor(special_kernel)
            self.special_kernel.stop_gradient = True
        cur_sigma2 = F.conv2d(y,
                              self.special_kernel,
                              -cur_mu,
                              groups=self.num_features)  # computes (x - cur_mu) quickly
        # [C, ] the author's original implementation uses the sample
        # variance, hence the -1 in the denominator
        cur_sigma2 = L.reduce_sum(
            L.square(cur_sigma2), dim=[0, 2, 3], keep_dim=False) / (NHW - 1)
        y2 = L.square(y)
        cur_meanx2 = L.reduce_mean(y2, dim=[0, 2, 3], keep_dim=False)  # [C, ]

        # compute dmu/dw and dsigma2/dw
        # dmudw = paddle.grad(outputs=[cur_mu], inputs=[weight], create_graph=False, retain_graph=True)[0]
        # dmeanx2dw = paddle.grad(outputs=[cur_meanx2], inputs=[weight], create_graph=False, retain_graph=True)[0]
        # computed by hand instead:
        dmudinput = np.zeros(input.shape, np.float32) + 1.0 / NHW
        dmudinput = paddle.to_tensor(dmudinput)
        dmeanx2dinput = input.numpy()
        dmeanx2dinput = paddle.to_tensor(dmeanx2dinput)
        dmeanx2dinput *= 2.0 / NHW
        dmudw = conv_g.get_grad_w(conv.weight, conv.bias, dmudinput)
        dmeanx2dw = conv_g.get_grad_w(conv.weight, conv.bias, dmeanx2dinput)

        # update cur_mu and cur_sigma2 with the previous iterations
        weight_data = weight.numpy()
        weight_data = paddle.to_tensor(weight_data)
        weight_data.stop_gradient = True
        # L.stack() raises an error here, so L.concat() is used instead.
        mu_all = [
            cur_mu,
        ] + [
            tmp_mu + L.reduce_sum(self.rho * tmp_d * (weight_data - tmp_w),
                                  dim=[1, 2, 3])
            for tmp_mu, tmp_d, tmp_w in zip(self.pre_mu, self.pre_dmudw,
                                            self.pre_weight)
        ]
        meanx2_all = [
            cur_meanx2,
        ] + [
            tmp_meanx2 + L.reduce_sum(
                self.rho * tmp_d * (weight_data - tmp_w), dim=[1, 2, 3])
            for tmp_meanx2, tmp_d, tmp_w in zip(
                self.pre_meanx2, self.pre_dmeanx2dw, self.pre_weight)
        ]
        mu_all = [L.unsqueeze(mu_, 0) for mu_ in mu_all]
        meanx2_all = [L.unsqueeze(meanx2_, 0) for meanx2_ in meanx2_all]
        mu_all = L.concat(mu_all, 0)
        meanx2_all = L.concat(meanx2_all, 0)
        sigma2_all = meanx2_all - L.square(mu_all)

        # with considering count
        re_mu_all = mu_all.clone()
        re_meanx2_all = meanx2_all.clone()
        mask1 = L.cast(sigma2_all >= 0., dtype="float32")
        mask1.stop_gradient = True
        re_mu_all *= mask1
        re_meanx2_all *= mask1
        count = L.reduce_sum(L.cast(sigma2_all >= 0., dtype="float32"),
                             dim=[0])
        mu = L.reduce_sum(re_mu_all, dim=[0]) / count
        sigma2 = L.reduce_sum(re_meanx2_all, dim=[0]) / count - L.square(mu)

        cur_mu_ = cur_mu.numpy()
        cur_mu_ = paddle.to_tensor(cur_mu_)
        cur_mu_.stop_gradient = True
        self.pre_mu = [cur_mu_] + self.pre_mu[:(self.buffer_num - 1)]

        cur_meanx2_ = cur_meanx2.numpy()
        cur_meanx2_ = paddle.to_tensor(cur_meanx2_)
        cur_meanx2_.stop_gradient = True
        self.pre_meanx2 = [cur_meanx2_] + self.pre_meanx2[:(self.buffer_num - 1)]

        dmudw_ = dmudw.numpy()
        dmudw_ = paddle.to_tensor(dmudw_)
        dmudw_.stop_gradient = True
        self.pre_dmudw = [dmudw_] + self.pre_dmudw[:(self.buffer_num - 1)]

        dmeanx2dw_ = dmeanx2dw.numpy()
        dmeanx2dw_ = paddle.to_tensor(dmeanx2dw_)
        dmeanx2dw_.stop_gradient = True
        self.pre_dmeanx2dw = [dmeanx2dw_] + self.pre_dmeanx2dw[:(self.buffer_num - 1)]

        tmp_weight = weight.numpy()
        tmp_weight = paddle.to_tensor(tmp_weight)
        tmp_weight.stop_gradient = True
        self.pre_weight = [tmp_weight] + self.pre_weight[:(self.buffer_num - 1)]
    else:
        mu = L.reduce_mean(y, dim=[0, 2, 3], keep_dim=False)  # [C, ]
        if self.special_kernel is None:
            # used to compute (x - mu) quickly
            special_kernel = np.ones((self.num_features, 1, 1, 1), np.float32)
            self.special_kernel = paddle.to_tensor(special_kernel)
            self.special_kernel.stop_gradient = True
        sigma2 = F.conv2d(y,
                          self.special_kernel,
                          -mu,
                          groups=self.num_features)  # computes (x - mu) quickly
        sigma2 = L.reduce_sum(L.square(sigma2), dim=[0, 2, 3],
                              keep_dim=False) / (NHW - 1)  # [C, ]
        cur_mu = mu
        cur_sigma2 = sigma2

    if not self.training or self.FROZEN:  # eval() state
        U = self._mean
        # TODO: outside **0.5?
        if self.out_p:
            std = L.sqrt(self._variance + self.eps)
        else:
            std = L.sqrt(self._variance) + self.eps
    else:  # train() state
        if self.track_running_stats is True:
            state_dict = self.state_dict()
            momentum = self.momentum
            _mean = self._mean.numpy() * momentum + cur_mu.numpy() * (
                1. - momentum)
            _variance = self._variance.numpy() * momentum + \
                cur_sigma2.numpy() * (1. - momentum)
            state_dict['_mean'] = _mean.astype(np.float32)
            state_dict['_variance'] = _variance.astype(np.float32)
            self.set_state_dict(state_dict)
        U = mu
        # TODO: outside **0.5?
        if self.out_p:
            std = L.sqrt(sigma2 + self.eps)
        else:
            std = L.sqrt(sigma2) + self.eps

    A = self.weight / std  # [C, ]
    B = self.bias - U * A  # [C, ]
    A = L.unsqueeze(A, [1, 2, 3])  # [C, 1, 1, 1]
    y = F.conv2d(y, A, B, groups=self.num_features)
    return y
def _dygraph_clip(self, params_grads):
    normal_params_grads = []
    moe_params_grads = []

    # separate moe params from normal params
    if self.moe_group is not None and self.moe_group.nranks > 1:
        for p, g in params_grads:
            if self.is_expert_param_func(p):
                moe_params_grads.append((p, g))
            else:
                normal_params_grads.append((p, g))
    else:
        normal_params_grads = params_grads

    # Why return sum_dtype?
    # `get_l2_norm_pow` is called twice and the precisions may differ.
    # For convenience and simplicity, sum_dtype is used directly instead of
    # global_norm_var_normal.dtype.
    global_norm_var_normal, sum_dtype \
        = self.get_l2_norm_pow(normal_params_grads)
    global_norm_var_moe = None
    if len(moe_params_grads) > 0:
        global_norm_var_moe, _ \
            = self.get_l2_norm_pow(moe_params_grads, sum_dtype)
        if global_norm_var_moe is not None:
            collective.all_reduce(global_norm_var_moe,
                                  op=collective.ReduceOp.SUM,
                                  group=self.moe_group)

    if global_norm_var_normal is None and global_norm_var_moe is None:
        return params_grads
    elif global_norm_var_normal is None:
        global_norm_var = global_norm_var_moe
    elif global_norm_var_moe is None:
        global_norm_var = global_norm_var_normal
    else:
        if global_norm_var_normal.dtype != global_norm_var_moe.dtype:
            # compared with the normal norm, the moe norm is computed later,
            # so its precision is no lower than the normal norm's
            global_norm_var_normal = \
                global_norm_var_normal.astype(global_norm_var_moe.dtype)
        global_norm_var = global_norm_var_normal + global_norm_var_moe

    params_and_grads = []
    global_norm_var = layers.sqrt(global_norm_var)
    max_global_norm = layers.fill_constant(shape=[1],
                                           dtype=global_norm_var.dtype,
                                           value=self.clip_norm)
    clip_var = layers.elementwise_div(x=max_global_norm,
                                      y=layers.elementwise_max(
                                          x=global_norm_var,
                                          y=max_global_norm))
    for p, g in params_grads:
        if g is None:
            continue
        if getattr(p, 'need_clip', True) is False:
            params_and_grads.append((p, g))
            continue
        # TODO(wangxi): use inplace elementwise_mul
        clip_input = (clip_var.astype('float16')
                      if g.dtype == core.VarDesc.VarType.FP16 else clip_var)
        new_grad = layers.elementwise_mul(x=g, y=clip_input)
        params_and_grads.append((p, new_grad))
    return params_and_grads
def communicate():
    sub_block = default_main_program().current_block()
    ring_id = -1
    for param, snapshot in p2s:
        sub_block.append_op(type='elementwise_sub',
                            inputs={
                                'X': [snapshot],
                                'Y': [param]
                            },
                            outputs={'Out': [param]},
                            attrs={OP_ROLE_KEY: OpRole.Optimize})
        sub_block.append_op(type='c_sync_calc_stream',
                            inputs={'X': param},
                            outputs={'Out': param},
                            attrs={OP_ROLE_KEY: OpRole.Optimize})
        ring_id = (ring_id + 1) % self.nrings
        sub_block.append_op(type='c_allreduce_sum',
                            inputs={'X': [param]},
                            outputs={'Out': [param]},
                            attrs={
                                'ring_id': ring_id,
                                OP_ROLE_KEY: OpRole.Optimize
                            })

    for ring_id in range(self.nrings):
        sub_block.append_op(type='c_sync_comm_stream',
                            inputs={'X': param},
                            outputs={'Out': param},
                            attrs={
                                'ring_id': ring_id,
                                OP_ROLE_KEY: OpRole.Optimize
                            })

    for param, snapshot in p2s:
        sub_block.append_op(type='scale',
                            inputs={'X': [param]},
                            outputs={'Out': [param]},
                            attrs={
                                'scale': 1.0 / self.role_maker.worker_num(),
                                OP_ROLE_KEY: OpRole.Optimize
                            })
        sub_block.append_op(type='elementwise_sub',
                            inputs={
                                'X': [snapshot],
                                'Y': [param]
                            },
                            outputs={'Out': [param]},
                            attrs={OP_ROLE_KEY: OpRole.Optimize})
        sub_block.append_op(type='assign',
                            inputs={'X': [param]},
                            outputs={'Out': [snapshot]},
                            attrs={OP_ROLE_KEY: OpRole.Optimize})

    if auto_steps:
        next_local_steps = layers.cast(layers.ceil(
            layers.sqrt(lr_0 * loss / (global_lr * loss_0) *
                        float(init_k_steps))),
                                       dtype='int64')
        max_local_steps = layers.fill_constant(shape=[1],
                                               dtype='int64',
                                               value=16)
        next_local_steps = layers.elementwise_min(next_local_steps,
                                                  max_local_steps)
        layers.assign(next_local_steps, k_steps)
        layers.assign(step, last_step)
def forward(self, q, k, v, lengths, speaker_embed, start_index,
            force_monotonic=False, prev_coeffs=None, window=None):
    # add position encoding as an inductive bias
    if self.has_bias:  # multi-speaker model
        omega_q = 2 * F.sigmoid(
            F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
        omega_k = 2 * self.omega_initial * F.sigmoid(
            F.squeeze(self.k_pos_affine(speaker_embed), axes=[-1]))
    else:  # single-speaker case
        batch_size = q.shape[0]
        omega_q = F.ones((batch_size, ), dtype="float32")
        omega_k = F.ones(
            (batch_size, ), dtype="float32") * self.omega_default
    q += self.position_encoding_weight * positional_encoding(
        q, start_index, omega_q)
    k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

    q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
    activations = F.matmul(q, k, transpose_y=True)
    activations /= np.sqrt(self.attention_dim)

    if self.training:
        # mask the <pad> parts from the encoder
        mask = F.sequence_mask(lengths, dtype="float32")
        attn_bias = F.scale(1. - mask, -1000)
        activations += F.unsqueeze(attn_bias, [1])
    elif force_monotonic:
        assert window is not None
        backward_step, forward_step = window
        T_enc = k.shape[1]
        batch_size, T_dec, _ = q.shape  # actually T_dec = 1 here

        alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
            if prev_coeffs is None \
            else F.argmax(prev_coeffs, axis=-1)
        backward = F.sequence_mask(alpha - backward_step,
                                   maxlen=T_enc,
                                   dtype="bool")
        forward = F.sequence_mask(alpha + forward_step,
                                  maxlen=T_enc,
                                  dtype="bool")
        mask = F.cast(F.logical_xor(backward, forward), "float32")
        # print("mask's shape:", mask.shape)
        attn_bias = F.scale(1. - mask, -1000)
        activations += attn_bias

    # softmax
    coefficients = F.softmax(activations, axis=-1)
    # context vector
    coefficients = F.dropout(coefficients,
                             1. - self.keep_prob,
                             dropout_implementation='upscale_in_train')
    contexts = F.matmul(coefficients, v)
    # context normalization
    enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
    contexts *= F.sqrt(enc_lengths)
    # out affine
    contexts = self.out_affine(contexts)
    return contexts, coefficients
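# A numpy sketch (illustrative) of the scaled dot-product core used above:
# softmax(q @ k.T / sqrt(d)) @ v, with the context rescaled by sqrt(T_enc)
# as in the "context normalization" step. Sizes here are assumptions.
import numpy as np

d, T_dec, T_enc = 8, 2, 5
q = np.random.randn(T_dec, d)
k = np.random.randn(T_enc, d)
v = np.random.randn(T_enc, d)

scores = q @ k.T / np.sqrt(d)                   # [T_dec, T_enc]
weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
weights /= weights.sum(axis=-1, keepdims=True)  # row-wise softmax
context = (weights @ v) * np.sqrt(T_enc)        # [T_dec, d]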
def _sqrt(x):
    if isinstance(x, PTensor):
        return layers.sqrt(x)
    else:
        return np.sqrt(x)
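# A small usage sketch (illustrative, assuming PTensor is the Paddle tensor
# type): _sqrt picks layers.sqrt for Paddle tensors and np.sqrt otherwise,
# so mixed numpy/tensor code can share one helper.
# _sqrt(np.array([4.0]))        -> array([2.])   (numpy branch)
# _sqrt(paddle.to_tensor(4.0))  -> Tensor(2.)    (Paddle branch)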
def forward(self, x, y):
    # x and y are offset by one frame
    u1 = zeros_like(x)
    u2 = zeros_like(x)
    l_t = self.l * self.t
    taut = self.a / self.t

    grad2_x = self.conv_img_grad(y)
    # grad2_x[:, :, :, 0] = 0.5 * (x[:, :, :, 1] - x[:, :, :, 0])
    # grad2_x[:, :, :, -1] = 0.5 * (x[:, :, :, -1] - x[:, :, :, -2])
    grad2_y = self.conv_img_grad2(y)
    # grad2_y[:, :, 0, :] = 0.5 * (x[:, :, 1, :] - x[:, :, 0, :])
    # grad2_y[:, :, -1, :] = 0.5 * (x[:, :, -1, :] - x[:, :, -2, :])

    p11 = zeros_like(x)
    p12 = zeros_like(x)
    p21 = zeros_like(x)
    p22 = zeros_like(x)

    gsqx = grad2_x**2
    gsqy = grad2_y**2
    grad = gsqx + gsqy + 1e-12

    rho_c = y - grad2_x * u1 - grad2_y * u2 - x

    for i in range(self.n_iter):
        rho = rho_c + grad2_x * u1 + grad2_y * u2 + 1e-12

        v1 = zeros_like(x)
        v2 = zeros_like(x)
        mask1 = rho < -l_t * grad
        mask2 = rho > l_t * grad
        mask3 = logical_and(logical_not(logical_or(mask1, mask2)),
                            (grad > 1e-12))
        mask1 = cast(mask1, dtype='float32')
        mask2 = cast(mask2, dtype='float32')
        mask3 = cast(mask3, dtype='float32')
        mask1.stop_gradient = True
        mask2.stop_gradient = True
        mask3.stop_gradient = True
        # v1 = v1 + l_t * grad2_x * mask1 - l_t * grad2_x * mask2 - (rho / grad) * grad2_x * mask3
        # v2 = v2 + l_t * grad2_y * mask1 - l_t * grad2_y * mask2 - (rho / grad) * grad2_y * mask3
        # u1/u2 are folded into the elementwise_add below, so no extra
        # `v1 += u1` / `v2 += u2` step is needed afterwards.
        v1 = elementwise_add(
            u1,
            elementwise_add(
                elementwise_mul(l_t * grad2_x, mask1),
                elementwise_add(
                    elementwise_mul(-l_t * grad2_x, mask2),
                    elementwise_mul(-elementwise_div(rho, grad),
                                    elementwise_mul(grad2_x, mask3)))))
        v2 = elementwise_add(
            u2,
            elementwise_add(
                elementwise_mul(l_t * grad2_y, mask1),
                elementwise_add(
                    elementwise_mul(-l_t * grad2_y, mask2),
                    elementwise_mul(-elementwise_div(rho, grad),
                                    elementwise_mul(grad2_y, mask3)))))
        del rho
        del mask1
        del mask2
        del mask3

        u1 = v1 + self.t * self.divergence(p11, p12)
        u2 = v2 + self.t * self.divergence(p21, p22)
        del v1
        del v2

        u1x, u1y = self.forward_grad(u1)
        u2x, u2y = self.forward_grad(u2)

        p11 = (p11 + taut * u1x) / (1. + taut * sqrt(u1x**2 + u1y**2 + 1e-12))
        p12 = (p12 + taut * u1y) / (1. + taut * sqrt(u1x**2 + u1y**2 + 1e-12))
        p21 = (p21 + taut * u2x) / (1. + taut * sqrt(u2x**2 + u2y**2 + 1e-12))
        p22 = (p22 + taut * u2y) / (1. + taut * sqrt(u2x**2 + u2y**2 + 1e-12))
        del u1x
        del u1y
        del u2x
        del u2y

    return u1, u2
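# A numpy sketch (illustrative) of the TV-L1 dual-variable update used at the
# end of each iteration above: p <- (p + taut * grad_u) / (1 + taut * |grad_u|),
# which keeps the dual field p inside the unit ball.
import numpy as np

taut = 0.25 / 0.3
p = np.zeros((4, 4))
ux = np.random.randn(4, 4)
uy = np.random.randn(4, 4)
mag = np.sqrt(ux**2 + uy**2 + 1e-12)
p = (p + taut * ux) / (1. + taut * mag)
assert np.all(np.abs(p) <= 1.0 + 1e-6)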
def forward(self, x):
    '''
    bt, c, w, h = x.shape
    tmp = layers.reshape(x, shape=[48, -1, c, w, h])
    res = layers.reshape(tmp[:, :-1], shape=[-1, c, w, h])
    '''
    x = self.bottleneck(x)
    inp = self.norm_img(x)
    bt, c, w, h = inp.shape
    inp = layers.reshape(inp, shape=[self.batch_size, -1, c, w, h])
    x = inp[:, :-1]
    y = inp[:, 1:]
    x = layers.reshape(layers.transpose(x, perm=[0, 2, 1, 3, 4]),
                       shape=[-1, c, h, w])
    y = layers.reshape(layers.transpose(y, perm=[0, 2, 1, 3, 4]),
                       shape=[-1, c, h, w])

    u1 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
    u2 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
    l_t = self.lamda * self.theta
    taut = self.tau / (self.theta + 1e-12)

    grad2_x = self.conv4Ix(layers.pad(y, (0, 0, 0, 0, 0, 0, 1, 1)))
    tmp = layers.unstack(grad2_x, axis=3)
    tmp[-1] = 0.5 * (x[:, :, :, -1] - x[:, :, :, -2])
    tmp[0] = 0.5 * (x[:, :, :, 1] - x[:, :, :, 0])
    grad2_x = layers.stack(tmp, axis=3)

    grad2_y = self.conv4Iy(layers.pad(y, (0, 0, 0, 0, 1, 1, 0, 0)))
    tmp = layers.unstack(grad2_y, axis=2)
    tmp[-1] = 0.5 * (x[:, :, -1, :] - x[:, :, -2, :])
    tmp[0] = 0.5 * (x[:, :, 1, :] - x[:, :, 0, :])
    grad2_y = layers.stack(tmp, axis=2)

    p11 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
    p12 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
    p21 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
    p22 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')

    gsqx = grad2_x**2
    gsqy = grad2_y**2
    grad = gsqx + gsqy + 1e-12

    rho_c = y - grad2_x * u1 - grad2_y * u2 - x

    for i in range(self.n_iter):
        rho = rho_c + grad2_x * u1 + grad2_y * u2 + 1e-12

        mask1 = (rho < -l_t * grad).detach().astype('float32')
        mask1.stop_gradient = True
        tmp1 = l_t * grad2_x
        tmp2 = l_t * grad2_y
        v1 = tmp1 * mask1
        v2 = tmp2 * mask1

        mask2 = (rho > l_t * grad).detach().astype('float32')
        mask2.stop_gradient = True
        v1 = -tmp1 * mask2 + v1
        v2 = -tmp2 * mask2 + v2

        mask3 = fluid.layers.ones(
            x.shape, dtype='float32') - (mask1 + mask2 - mask1 * mask2)
        mask3.stop_gradient = True
        tmp1 = (-rho / grad) * grad2_x
        tmp2 = (-rho / grad) * grad2_y
        v1 = tmp1 * mask3 + v1
        v2 = tmp2 * mask3 + v2
        del rho
        del mask1
        del mask2
        del mask3

        v1 += u1
        v2 += u2
        u1 = v1 + self.theta * self.divergence(p11, p12)
        u2 = v2 + self.theta * self.divergence(p21, p22)
        del v1
        del v2

        u1x, u1y = self.forward_grad(u1)
        u2x, u2y = self.forward_grad(u2)

        p11 = (p11 + taut * u1x) / (
            1. + taut * layers.sqrt(u1x**2 + u1y**2 + 1e-12))
        p12 = (p12 + taut * u1y) / (
            1. + taut * layers.sqrt(u1x**2 + u1y**2 + 1e-12))
        p21 = (p21 + taut * u2x) / (
            1. + taut * layers.sqrt(u2x**2 + u2y**2 + 1e-12))
        p22 = (p22 + taut * u2y) / (
            1. + taut * layers.sqrt(u2x**2 + u2y**2 + 1e-12))
        del u1x
        del u1y
        del u2x
        del u2y

    flow = layers.concat([u1, u2], axis=1)
    # flow = layers.transpose(layers.reshape(flow, shape=[b, t, c * 2, h, w]), perm=[0, 2, 1, 3, 4])
    flow = self.unbottleneck(flow)
    flow = self.bn(flow) if self.bn else flow
    return flow
def _dygraph_clip(self, params_grads):
    sum_square_fp32, sum_square_fp16 = [], []
    unslice_params_fp32, unslice_params_fp16 = [], []

    for p, g in params_grads:
        p_slice = True  # used for sliced parameters in sharding stage3
        if g is None or getattr(p, 'need_clip', True) is False:
            continue
        if hasattr(p, "unslice"):
            p_slice = False

        merge_grad = g
        if g.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = layers.get_tensor_from_selected_rows(
                layers.merge_selected_rows(g))
        square = layers.square(merge_grad)
        sum_square = layers.reduce_sum(square)

        if p.dtype == paddle.float16:
            if p_slice:
                sum_square_fp16.append(sum_square)
            else:
                unslice_params_fp16.append(sum_square)
        elif p.dtype == paddle.float32:
            if p_slice:
                sum_square_fp32.append(sum_square)
            else:
                unslice_params_fp32.append(sum_square)

    # global norm of non-distributed FP16 params_and_grads
    if len(sum_square_fp16) == 0:
        global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
    else:
        global_norm_fp16 = layers.concat(sum_square_fp16)
        global_norm_fp16 = layers.reduce_sum(global_norm_fp16)
        global_norm_fp16 = paddle.cast(global_norm_fp16,
                                       dtype=paddle.float32)

    # global norm of non-distributed FP16 params_and_grads for unslice parameters
    if len(unslice_params_fp16) == 0:
        global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
    else:
        global_unslice_fp16 = layers.concat(unslice_params_fp16)
        global_unslice_fp16 = layers.reduce_sum(global_unslice_fp16)
        global_unslice_fp16 = paddle.cast(global_unslice_fp16,
                                          dtype=paddle.float32)

    # global norm of non-distributed FP32 params_and_grads
    global_norm_fp32 = layers.concat(sum_square_fp32) if len(
        sum_square_fp32) != 0 else paddle.to_tensor(
            [0.], dtype=paddle.float32)
    global_norm_fp32 = layers.reduce_sum(global_norm_fp32)

    # global norm of non-distributed FP32 params_and_grads for unslice parameters
    global_unslice_fp32 = layers.concat(unslice_params_fp32) if len(
        unslice_params_fp32) != 0 else paddle.to_tensor(
            [0.], dtype=paddle.float32)
    global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32)
    global_unslice_var = global_unslice_fp16 + global_unslice_fp32

    global_norm_var = global_norm_fp16 + global_norm_fp32 + \
        1.0 / self._group.nranks * global_unslice_var

    # all-reduce to get the global norm of distributed params_and_grads
    dev_id = int(self._device.split(":")[1])
    if paddle.device.get_device() == "cpu":
        global_norm_var = global_norm_var.cuda(dev_id)

    with device_guard(dev_id, "gpu"):
        paddle.distributed.all_reduce(global_norm_var, group=self._group)

    global_norm_var = layers.sqrt(global_norm_var)
    max_global_norm = layers.fill_constant(shape=[1],
                                           dtype=global_norm_var.dtype,
                                           value=self.clip_norm)
    clip_var = layers.elementwise_div(x=max_global_norm,
                                      y=layers.elementwise_max(
                                          x=global_norm_var,
                                          y=max_global_norm))
    clip_var_fp16 = paddle.cast(clip_var, paddle.float16)

    for p, g in params_grads:
        if getattr(p, 'need_clip', True) is False or g is None:
            continue
        origin_state = g.stop_gradient
        g.stop_gradient = True
        if p.dtype == paddle.float16:
            g.scale_(clip_var_fp16.item())
        else:
            g.scale_(clip_var.item())
        g.stop_gradient = origin_state
        # p._reset_grad_inplace_version(True)

    return params_grads
def forward(self, x):
    tmp = layers.elementwise_mul(x, x)  # or x ** 2
    tmp1 = layers.sqrt(
        layers.reduce_mean(tmp, dim=1, keep_dim=True) + self.epsilon)
    return x * tmp1
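# A numpy sketch (illustrative) of the computation above: scale x by
# sqrt(mean(x**2, channel) + epsilon). Note that classic pixel normalization
# divides by this factor rather than multiplying.
import numpy as np

epsilon = 1e-8
x = np.random.randn(2, 8, 4, 4).astype("float32")
scale = np.sqrt((x**2).mean(axis=1, keepdims=True) + epsilon)
out = x * scale  # what the forward above returns
# pixel-norm variant: out = x / scale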
def _dygraph_clip(self, params_grads):
    params_and_grads = []

    sum_square_dist_fp16 = []
    sum_square_dist_fp32 = []
    sum_square_not_dist_fp16 = []
    sum_square_not_dist_fp32 = []

    for p, g in params_grads:
        if g is None:
            continue
        if getattr(p, 'need_clip', True) is False:
            continue
        merge_grad = g
        if g.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = layers.merge_selected_rows(g)
            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
        square = layers.square(merge_grad)
        sum_square = layers.reduce_sum(square)

        not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or (
            hasattr(p, 'is_firstly_shared')
            and getattr(p, 'is_firstly_shared', True))

        if not_shared_enable:
            if p.is_distributed:
                if p.dtype == paddle.float16:
                    sum_square_dist_fp16.append(sum_square)
                elif p.dtype == paddle.float32:
                    sum_square_dist_fp32.append(sum_square)
            else:
                if p.dtype == paddle.float16:
                    sum_square_not_dist_fp16.append(sum_square)
                elif p.dtype == paddle.float32:
                    sum_square_not_dist_fp32.append(sum_square)

    # global norm of distributed FP16 params_and_grads
    if len(sum_square_dist_fp16) == 0:
        global_norm_dist_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
    else:
        global_norm_dist_fp16 = layers.concat(sum_square_dist_fp16)
        global_norm_dist_fp16 = layers.reduce_sum(global_norm_dist_fp16)
        global_norm_dist_fp16 = paddle.cast(global_norm_dist_fp16,
                                            dtype=paddle.float32)

    # global norm of non-distributed FP16 params_and_grads
    if len(sum_square_not_dist_fp16) == 0:
        global_norm_not_dist_fp16 = paddle.to_tensor([0.],
                                                     dtype=paddle.float32)
    else:
        global_norm_not_dist_fp16 = layers.concat(sum_square_not_dist_fp16)
        global_norm_not_dist_fp16 = layers.reduce_sum(
            global_norm_not_dist_fp16)
        global_norm_not_dist_fp16 = paddle.cast(global_norm_not_dist_fp16,
                                                dtype=paddle.float32)

    # global norm of distributed FP32 params_and_grads
    global_norm_dist_fp32 = layers.concat(sum_square_dist_fp32) if len(
        sum_square_dist_fp32) != 0 else paddle.to_tensor(
            [0.], dtype=paddle.float32)
    global_norm_dist_fp32 = layers.reduce_sum(global_norm_dist_fp32)

    # global norm of non-distributed FP32 params_and_grads
    global_norm_not_dist_fp32 = layers.concat(
        sum_square_not_dist_fp32
    ) if len(sum_square_not_dist_fp32) != 0 else paddle.to_tensor(
        [0.], dtype=paddle.float32)
    global_norm_not_dist_fp32 = layers.reduce_sum(global_norm_not_dist_fp32)

    global_norm_var_dist = global_norm_dist_fp16 + global_norm_dist_fp32
    global_norm_var_not_dist = global_norm_not_dist_fp16 + \
        global_norm_not_dist_fp32

    # all-reduce to get the global norm of distributed params_and_grads
    if self._hcg.get_model_parallel_world_size() > 1:
        paddle.distributed.all_reduce(
            global_norm_var_dist,
            group=self._hcg.get_check_parallel_group())

    # all-reduce to get the global norm of non-distributed params_and_grads
    # in the pipeline-parallel groups
    if self._hcg.get_pipe_parallel_world_size() > 1:
        paddle.distributed.all_reduce(
            global_norm_var_not_dist,
            group=self._hcg.get_pipe_parallel_group())

    # In sharding mode, params and grads are mapped to different ranks in
    # the optimizer.
    # ClipGradByGlobalNorm needs an all-reduce to get the global norm.
    if self._hcg.get_sharding_parallel_world_size() > 1:
        paddle.distributed.all_reduce(
            global_norm_var_not_dist,
            group=self._hcg.get_sharding_parallel_group())

    global_norm_var_fp32 = layers.sqrt(global_norm_var_dist +
                                       global_norm_var_not_dist)
    max_global_norm = layers.fill_constant(shape=[1],
                                           dtype=global_norm_var_fp32.dtype,
                                           value=self.clip_norm)
    clip_var = layers.elementwise_div(x=max_global_norm,
                                      y=layers.elementwise_max(
                                          x=global_norm_var_fp32,
                                          y=max_global_norm))
    clip_var_fp16 = paddle.cast(clip_var, paddle.float16)

    for p, g in params_grads:
        if g is None:
            continue
        if getattr(p, 'need_clip', True) is False:
            params_and_grads.append((p, g))
            continue
        if p.dtype == paddle.float16:
            new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16)
        else:
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
        params_and_grads.append((p, new_grad))

    return params_and_grads