def bprop(x, z, out, dout):
    """Backward pass for the mirror operator under gradient accumulation."""
    if mean_flag:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            if do_mirror:
                z = F.depend(z, F.assign_add(z, dout))
                real_grad = all_reduce(z)
                dx = real_grad
            else:
                dx = dout
            # Rescale by 1 / dev_num so the AllReduce sum becomes a mean.
            float_one = F.scalar_cast(1.0, F.dtype(dx))
            num = F.scalar_cast(dev_num, F.dtype(dx))
            dx = mul(dx, cast(F.scalar_to_array(float_one / num), F.dtype(dx)))
        else:
            # Gradient accumulation does not support row tensors yet.
            dx = zeros_like(x)
    else:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            if do_mirror:
                z = F.depend(z, F.assign_add(z, dout))
                real_grad = all_reduce(z)
                dx = real_grad
            else:
                dx = dout
        else:
            # Gradient accumulation does not support row tensors yet.
            dx = zeros_like(x)
    return (dx, zeros_like(z))
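
# Hedged numeric sketch (plain NumPy, not MindSpore) of what the mean_flag
# branch above computes: all_reduce sums the per-device gradients, and the
# final mul by float_one / num turns that sum into a mean across devices.
import numpy as np

_dev_num = 4
_per_device_grads = [np.array([1.0, 2.0]) * (i + 1) for i in range(_dev_num)]
_summed = np.sum(_per_device_grads, axis=0)      # what all_reduce produces
_mean_grad = _summed * (1.0 / _dev_num)          # what mul(dx, 1/num) produces
# _mean_grad == [2.5, 5.0], i.e. the average gradient over the 4 devices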
def bprop(x, z, out, dout):
    """Backward pass for the mini-step AllGather: reduce, then shard."""
    if do_mirror:
        # Fold the incoming gradient into the accumulator before reducing.
        z = F.depend(z, F.assign_add(z, dout))
        grad = all_reduce(z)
        # Each rank keeps only its own shard of the reduced gradient.
        dx = split(grad)[rank]
        if mean_flag:
            # Rescale so the AllReduce sum becomes a mean across devices.
            dx = F.tensor_mul(dx, scale)
    else:
        dx = dout
    return (dx, zeros_like(z))
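
# Hedged sketch of where the free names above (all_reduce, split, rank,
# scale, mean_flag, do_mirror) come from: in MindSpore they are closed over
# by a getter that builds one bprop per forward-op instance. The factory
# name and its parameters below are illustrative assumptions, not the exact
# upstream registration.
from mindspore.ops import functional as F, operations as P

def make_mini_step_all_gather_bprop(group, rank, rank_size, mean_flag, do_mirror):
    """Bind the communication ops and flags the bprop closure needs."""
    all_reduce = P.AllReduce(group=group)          # sums grads across devices
    split = P.Split(axis=0, output_num=rank_size)  # shards the reduced grad
    scale = 1.0 / rank_size                        # sum -> mean rescaling

    def bprop(x, z, out, dout):
        if do_mirror:
            z = F.depend(z, F.assign_add(z, dout))
            dx = split(all_reduce(z))[rank]
            if mean_flag:
                dx = F.tensor_mul(dx, scale)
        else:
            dx = dout
        return (dx, F.zeros_like(z))

    return bprop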
def _run_off_load_opt(opt, beta1_power, beta2_power, beta1, beta2, eps, lr,
                      gradient, param, moment1, moment2):
    """Apply the AdamOffload optimizer to the weight parameter using Tensor."""
    success = True
    # The kernel updates the moments in place and returns only the delta,
    # so the parameter update itself can run as a host-side assign_add.
    delta_param = opt(moment1, moment2, beta1_power, beta2_power, lr,
                      beta1, beta2, eps, gradient)
    success = F.depend(success, F.assign_add(param, delta_param))
    return success
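
# Hedged sketch of how _run_off_load_opt is typically wired up. The opt
# handle is assumed to be P.AdamNoUpdateParam(), which updates the moments
# in place and returns only the parameter delta, so the assign_add above
# can run on host. The MultitypeFuncGraph name and the type signature below
# are illustrative assumptions.
from mindspore.ops import composite as C, functional as F, operations as P

_adam_opt = C.MultitypeFuncGraph("adam_opt")
_adam_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
                   "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")(_run_off_load_opt)

# Inside the optimizer's construct(), the helper is fanned out per parameter:
# success = self.hyper_map(
#     F.partial(_adam_opt, P.AdamNoUpdateParam(), beta1_power, beta2_power,
#               self.beta1, self.beta2, self.eps, lr),
#     gradients, self.params, self.moment1, self.moment2)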
def _accumulate_accu_grads(accu_grad, grad):
    """Add one micro-batch gradient into its float32 accumulation buffer."""
    succ = True
    return F.depend(succ, F.assign_add(accu_grad, cast(grad, mstype.float32)))
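
# Hedged usage sketch: register the accumulator with a MultitypeFuncGraph
# and fan it out over the accumulation buffers and gradients with HyperMap.
# The cast op is assumed to be P.Cast(), matching its use above; the graph
# and variable names are illustrative.
from mindspore.ops import composite as C, operations as P
from mindspore.common import dtype as mstype

cast = P.Cast()
_accu_grads = C.MultitypeFuncGraph("accu_grads")
_accu_grads.register("Tensor", "Tensor")(_accumulate_accu_grads)

hyper_map = C.HyperMap()
# succ = hyper_map(_accu_grads, accu_grads, grads)  # same-structure tuples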