from mindspore.common import dtype as mstype
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P

# GRADIENT_CLIP_TYPE, _grad_scale and _grad_overflow are module-level helpers
# defined elsewhere in this file.


def arithmetic_simplify_02(x, y):
    """Pattern ones_like(x) * y, which simplifies to y when shapes match."""
    return C.ones_like(x) * y


def arithmetic_simplify_03(x, y):
    """Pattern x * ones_like(y), which simplifies to x when shapes match."""
    return x * C.ones_like(y)
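
# A quick self-check of the identity the two cases above rely on: multiplying
# by a tensor of ones is the identity map (shapes permitting), so a graph
# optimizer may rewrite ones_like(x) * y to y and x * ones_like(y) to x.
# This NumPy sketch is illustrative only; the shapes are made up.
def _check_ones_like_identity():
    import numpy as np
    x = np.full((2, 3), 5.0, dtype=np.float32)
    y = np.arange(6, dtype=np.float32).reshape(2, 3)
    assert np.array_equal(np.ones_like(x) * y, y)
    assert np.array_equal(x * np.ones_like(y), x)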
def construct(self, data, label, sens=None):
    """
    Construct the compute flow of one differentially private training step
    with loss scaling and overflow detection.
    """
    init = False
    if not self.gpu_target:
        # init overflow buffer
        init = self.alloc_status()
        # clear overflow buffer
        self.clear_status(init)
    if sens is None:
        scaling_sens = self.loss_scale
    else:
        scaling_sens = sens

    # DP clip
    weights = self.weights
    record_datas = self._split(data)
    record_labels = self._split(label)
    # first index
    loss = self.network(record_datas[0], record_labels[0])
    scaling_sens_filled = C.ones_like(loss) * F.cast(scaling_sens,
                                                     F.dtype(loss))
    record_grad = self.grad(self.network, weights)(record_datas[0],
                                                   record_labels[0],
                                                   scaling_sens_filled)

    # beta counts the micro-batches whose gradient norm falls below the bound
    beta = self._zero
    square_sum = self._zero
    for grad in record_grad:
        square_sum = self._add(square_sum,
                               self._reduce_sum(self._square_all(grad)))
    norm_grad = self._sqrt(square_sum)
    beta = self._add(beta,
                     self._cast(self._less(norm_grad, self._norm_bound),
                                mstype.float32))
    record_grad = self._clip_by_global_norm(record_grad, GRADIENT_CLIP_TYPE,
                                            self._norm_bound)
    grads = record_grad
    total_loss = loss
    # remaining micro-batches: same per-record clip-and-accumulate step
    for i in range(1, self._micro_batches):
        loss = self.network(record_datas[i], record_labels[i])
        scaling_sens_filled = C.ones_like(loss) * F.cast(scaling_sens,
                                                         F.dtype(loss))
        record_grad = self.grad(self.network, weights)(record_datas[i],
                                                       record_labels[i],
                                                       scaling_sens_filled)

        square_sum = self._zero
        for grad in record_grad:
            square_sum = self._add(square_sum,
                                   self._reduce_sum(self._square_all(grad)))
        norm_grad = self._sqrt(square_sum)
        beta = self._add(beta,
                         self._cast(self._less(norm_grad, self._norm_bound),
                                    mstype.float32))
        record_grad = self._clip_by_global_norm(record_grad,
                                                GRADIENT_CLIP_TYPE,
                                                self._norm_bound)
        grads = self._tuple_add(grads, record_grad)
        total_loss = P.TensorAdd()(total_loss, loss)
    loss = P.Div()(total_loss, self._micro_float)
    beta = self._div(beta, self._micro_batches)

    if self._noise_mech is not None:
        # add noise from the noise mechanism to each gradient item
        grad_noise_tuple = ()
        for grad_item in grads:
            grad_noise = self._mech(grad_item)
            grad_noise_tuple = grad_noise_tuple + (grad_noise,)
        grads = self._tuple_add(grads, grad_noise_tuple)
        grads = self._hyper_map(F.partial(_grad_scale, self._micro_float),
                                grads)
        # update mech parameters
        if self._noise_mech_param_updater is not None:
            multiplier = self._noise_mech_param_updater()
            loss = F.depend(loss, multiplier)

    grads = self.hyper_map(F.partial(_grad_scale, scaling_sens), grads)
    # apply grad reducer on grads
    grads = self.grad_reducer(grads)
    # get the overflow buffer
    if not self.gpu_target:
        self.get_status(init)
        # sum overflow buffer elements, 0: not overflow, >0: overflow
        flag_sum = self.reduce_sum(init, (0,))
    else:
        flag_sum = self.hyper_map(F.partial(_grad_overflow), grads)
        flag_sum = self.addn(flag_sum)
        # convert flag_sum to scalar
        flag_sum = self.reshape(flag_sum, (()))
    if self.is_distributed:
        # sum overflow flag over devices
        flag_reduce = self.allreduce(flag_sum)
        cond = self.less_equal(self.base, flag_reduce)
    else:
        cond = self.less_equal(self.base, flag_sum)
    overflow = cond
    if sens is None:
        overflow = self.loss_scaling_manager(self.loss_scale, cond)
    # if there is no overflow, do optimize
    if overflow:
        opt = False
    else:
        opt = self.optimizer(grads)
    ret = (loss, cond, scaling_sens)
    if self._clip_mech is not None:
        # adaptively update the clipping norm bound from beta
        next_norm_bound = self._clip_mech(beta, self._norm_bound)
        P.Assign()(self._norm_bound, next_norm_bound)
    return F.depend(ret, opt)
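
# For reference, the differentially private aggregation that construct()
# expresses with MindSpore graph ops reduces to the standard DP-SGD recipe:
# clip each micro-batch gradient to a norm bound, sum, add Gaussian noise,
# and average over micro-batches, while tracking beta, the fraction of
# micro-batches whose gradient norm already fell below the bound (consumed
# by the adaptive clip mechanism above). The sketch below is a plain-NumPy
# illustration under assumed names (micro_grads, norm_bound,
# noise_multiplier); it is not part of the mindarmour API.
def _dp_aggregate_sketch(micro_grads, norm_bound=1.0, noise_multiplier=1.0,
                         seed=0):
    import numpy as np
    rng = np.random.default_rng(seed)
    total = np.zeros_like(micro_grads[0])
    below_bound = 0.0
    for grad in micro_grads:
        norm = np.sqrt(np.sum(grad * grad))
        below_bound += float(norm < norm_bound)
        # clip by global norm: scale down only when the norm exceeds the bound
        total += grad * min(1.0, norm_bound / (norm + 1e-12))
    # Gaussian mechanism: noise scale proportional to the sensitivity bound
    total += rng.normal(0.0, noise_multiplier * norm_bound, size=total.shape)
    avg_grad = total / len(micro_grads)
    beta = below_bound / len(micro_grads)
    return avg_grad, beta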