def _update_run_op_for_map(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, gradient, decay_flag):
    """Update parameters (map variant)."""
    op_mul = P.Mul()
    op_square = P.Square()
    op_sqrt = P.Sqrt()
    op_cast = P.Cast()
    op_reshape = P.Reshape()
    op_shape = P.Shape()

    param_fp32 = op_cast(param, mstype.float32)
    m_fp32 = op_cast(m, mstype.float32)
    v_fp32 = op_cast(v, mstype.float32)
    gradient_fp32 = op_cast(gradient, mstype.float32)

    next_m = op_mul(beta1, m_fp32) + op_mul(
        op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1, gradient_fp32)

    next_v = op_mul(beta2, v_fp32) + op_mul(
        op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2, op_square(gradient_fp32))

    update = next_m / (op_sqrt(next_v) + eps)
    if decay_flag:
        update = update + op_mul(weight_decay_tensor, param_fp32)

    update_with_lr = op_mul(lr, update)
    next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

    next_v = F.depend(next_v, F.assign(param, next_param))
    next_v = F.depend(next_v, F.assign(m, next_m))
    next_v = F.depend(next_v, F.assign(v, next_v))
    return next_v

def broadcast_params(self, optim_result):
    """
    Apply Broadcast operations in the sequential order of parameter groups.

    Returns:
        bool, the status flag.
    """
    param_group = []
    key_group = []
    for _ in range(self.dev_num):
        param_group.append(F.make_tuple())
        key_group.append(F.make_tuple())
    for i in range(self.param_length):
        param_group[self.param_rank[i]] = param_group[self.param_rank[i]] + (self.parameters[i],)
        key = P.MakeRefKey(self.param_names[i])()
        key_group[self.param_rank[i]] = key_group[self.param_rank[i]] + (key,)
    new_param_group = []
    for root in range(self.dev_num):
        ops = P.Broadcast(root)
        next_params = ops(param_group[root])
        new_param_group.append(next_params)
        for i in range(F.tuple_len(next_params)):
            F.assign(key_group[root][i], next_params[i])
    status = F.control_depend(optim_result, new_param_group[0][0])
    for i in range(self.dev_num - 1):
        status = F.depend(
            F.control_depend(new_param_group[i], new_param_group[i + 1][0]), status)
    return status

def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient, decay_flag, optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations.
            Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations.
            Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability.
            Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Applies weight decay or not.
        optim_filter (bool): Applies parameter update or not.

    Returns:
        Tensor, the updated parameter when `optim_filter` is True, otherwise the
        original gradient.
    """
    if optim_filter:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        op_cast = P.Cast()
        op_reshape = P.Reshape()
        op_shape = P.Shape()

        param_fp32 = op_cast(param, mstype.float32)
        m_fp32 = op_cast(m, mstype.float32)
        v_fp32 = op_cast(v, mstype.float32)
        gradient_fp32 = op_cast(gradient, mstype.float32)

        next_m = op_mul(beta1, m_fp32) + op_mul(
            op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1, gradient_fp32)

        next_v = op_mul(beta2, v_fp32) + op_mul(
            op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2, op_square(gradient_fp32))

        update = next_m / (eps + op_sqrt(next_v))
        if decay_flag:
            update = op_mul(weight_decay, param_fp32) + update

        update_with_lr = op_mul(lr, update)
        next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

        next_param = F.depend(next_param, F.assign(param, op_cast(next_param, F.dtype(param))))
        next_param = F.depend(next_param, F.assign(m, op_cast(next_m, F.dtype(m))))
        next_param = F.depend(next_param, F.assign(v, op_cast(next_v, F.dtype(v))))

        return op_cast(next_param, F.dtype(param))
    return gradient

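# A minimal sketch, not the library's actual optimizer wiring, of how a per-parameter
# update function like _update_run_op above is usually fanned out over all parameters:
# register it on a C.MultitypeFuncGraph and map it with a HyperMap, binding the shared
# hyper-parameters via F.partial. The names _adam_opt, _adam_opt_impl, the type
# signature, and the attribute names in the commented construct() snippet are
# illustrative assumptions, not the exact upstream code.
from mindspore.ops import composite as C
from mindspore.ops import functional as F

_adam_opt = C.MultitypeFuncGraph("adam_opt")


@_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Number",
                    "Tensor", "Tensor", "Tensor", "Tensor", "Bool", "Bool")
def _adam_opt_impl(beta1, beta2, eps, lr, weight_decay, param, m, v,
                   gradient, decay_flag, optim_filter):
    # Delegate to the element-wise update defined above.
    return _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v,
                          gradient, decay_flag, optim_filter)

# Inside the optimizer's construct(), something like (attribute names assumed):
#   updates = self.hyper_map(
#       F.partial(_adam_opt, self.beta1, self.beta2, self.eps, lr, self.weight_decay),
#       self.parameters, self.moments1, self.moments2,
#       gradients, self.decay_flags, self.optim_filter)
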
def construct(self, x, y):
    out = self.zero
    for i in range(self.max_cycles):
        if out <= 20:
            self.weight = out
            F.assign(self.weight, i)
        out = x * y + out
    return out, self.weight

def step_end(self, run_context):
    cb_params = run_context.original_args()
    arr_lr = cb_params.optimizer.learning_rate.asnumpy()
    lr = float(np.array2string(arr_lr))
    new_lr = self.learning_rate_function(lr, cb_params.cur_step_num)
    if not math.isclose(lr, new_lr, rel_tol=1e-10):
        F.assign(cb_params.optimizer.learning_rate, Tensor(new_lr, mstype.float32))
        print(f'At step {cb_params.cur_step_num}, learning_rate change to {new_lr}')

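# A hedged usage sketch for the callback above: step_end rewrites the optimizer's
# learning rate in place via F.assign, so the caller only supplies a Python function
# mapping (current_lr, cur_step_num) -> new_lr. The schedule and the model/dataset
# names in the commented call are placeholders, not taken from the original source.
def learning_rate_function(lr, cur_step_num):
    # Hypothetical schedule: shrink the learning rate by 10x every 1000 steps.
    if cur_step_num % 1000 == 0:
        lr = lr * 0.1
    return lr

# model.train(epoch=2, train_dataset=dataset,
#             callbacks=[LearningRateScheduler(learning_rate_function)])
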
def construct(self, x, y):
    out = self.zero
    i = self.i
    if x > y:
        while i < self.max_cycles:
            self.weight = i
            F.assign(self.weight, i)
            out = x * y + out
            i = i + 1
    return out, self.weight

def construct(self, x, y):
    i = self.i
    out = self.zero
    while i < self.max_cycles:
        if out <= 20:
            out = x * y + out
            # Using F.Assign here would throw a NameSpace error; use F.assign instead.
            F.assign(self.weight, i)
            self.weight = i
        i = i + 1
    return out, self.weight

def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target,
                         beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, param,
                         m, v, ps_parameter, cache_enable):
    """Apply sparse adam optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices
    values = gradient.values
    if ps_parameter and not cache_enable:
        op_shape = P.Shape()
        shapes = (op_shape(param), op_shape(m), op_shape(v),
                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
                  op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2,
                                               eps, values, indices), shapes), param))
        return success

    if not target:
        success = F.depend(success, sparse_opt(param, m, v, beta1_power, beta2_power, lr, beta1,
                                               beta2, eps, values, indices))
    else:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        scatter_add = P.ScatterAdd(use_locking)

        success = F.depend(success, F.assign(m, op_mul(beta1, m)))
        success = F.depend(success, F.assign(v, op_mul(beta2, v)))

        grad_indices = gradient.indices
        grad_value = gradient.values

        next_m = scatter_add(m,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))

        next_v = scatter_add(v,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta2, op_square(grad_value)))

        if use_nesterov:
            m_temp = next_m * _scaler_ten
            F.assign(m, op_mul(beta1, next_m))
            div_value = scatter_add(m,
                                    op_mul(grad_indices, _scaler_one),
                                    op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
            param_update = div_value / (op_sqrt(next_v) + eps)
            F.assign(m, m_temp / _scaler_ten)
        else:
            param_update = next_m / (op_sqrt(next_v) + eps)

        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)
        next_param = param - lr_t * param_update

        success = F.depend(success, F.assign(param, next_param))
        success = F.depend(success, F.assign(m, next_m))
        success = F.depend(success, F.assign(v, next_v))

    return success

def construct(self, x, y):
    i = self.i
    out = self.zero
    while i < self.max_cycles:
        F.assign(self.weight, i)
        self.weight = i
        out = x * y + out
        i = i + 1
        if out >= 20:
            F.assign(self.weight, out)
            self.weight = out
            out = out - 20
    return out, self.weight

def construct(self, input_ids, input_mask, token_type_id, label_ids, sens=None):
    """Defines the computation performed."""
    weights = self.weights
    for i in range(self.length):
        F.assign(self.saved_params[i], weights[i])

    for i in range(self.quant_embedding_list_length):
        quant_embedding = self.quantize_embedding(weights[self.quant_embedding_list[i]])
        F.assign(weights[self.quant_embedding_list[i]], quant_embedding)

    for i in range(self.quant_weight_list_length):
        quant_weight = self.quantize_weight(weights[self.quant_weight_list[i]])
        F.assign(weights[self.quant_weight_list[i]], quant_weight)

    if sens is None:
        scaling_sens = self.loss_scale
    else:
        scaling_sens = sens
    # alloc status and clear should be right before grad operation
    init = self.alloc_status()
    self.clear_before_grad(init)
    grads = self.grad(self.network, weights)(input_ids,
                                             input_mask,
                                             token_type_id,
                                             label_ids,
                                             self.cast(scaling_sens, mstype.float32))
    # apply grad reducer on grads
    grads = self.grad_reducer(grads)
    grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
    grads = self.hyper_map(F.partial(clip_grad, self.clip_type, self.clip_value), grads)

    for i in range(self.length):
        param = F.depend(self.saved_params[i], grads)
        F.assign(weights[i], param)

    self.get_status(init)
    flag_sum = self.reduce_sum(init, (0,))
    if self.is_distributed:
        # sum overflow flag over devices
        flag_reduce = self.allreduce(flag_sum)
        cond = self.less_equal(self.base, flag_reduce)
    else:
        cond = self.less_equal(self.base, flag_sum)
    overflow = cond
    if sens is None:
        overflow = self.loss_scaling_manager(self.loss_scale, cond)
    if overflow:
        succ = False
    else:
        succ = self.optimizer(grads)
    return succ

def construct(self, input_ids, input_mask, token_type_id, label_ids):
    """Defines the computation performed."""
    weights = self.weights
    for i in range(self.length):
        F.assign(self.saved_params[i], weights[i])

    for i in range(self.quant_embedding_list_length):
        quant_embedding = self.quantize_embedding(weights[self.quant_embedding_list[i]])
        F.assign(weights[self.quant_embedding_list[i]], quant_embedding)

    for i in range(self.quant_weight_list_length):
        quant_weight = self.quantize_weight(weights[self.quant_weight_list[i]])
        F.assign(weights[self.quant_weight_list[i]], quant_weight)

    grads = self.grad(self.network, weights)(input_ids,
                                             input_mask,
                                             token_type_id,
                                             label_ids,
                                             self.cast(F.tuple_to_array((self.sens,)),
                                                       mstype.float32))
    # apply grad reducer on grads
    grads = self.grad_reducer(grads)
    grads = self.hyper_map(F.partial(clip_grad, self.clip_type, self.clip_value), grads)

    for i in range(self.length):
        param = F.depend(self.saved_params[i], grads)
        F.assign(weights[i], param)

    succ = self.optimizer(grads)
    return succ

def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, gradient, decay_flag):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimates.
            Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimates.
            Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability.
            Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay_tensor (Tensor): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Applies weight decay or not.

    Returns:
        Tensor, the new value of v after updating.
    """
    op_mul = P.Mul()
    op_square = P.Square()
    op_sqrt = P.Sqrt()
    op_cast = P.Cast()
    op_reshape = P.Reshape()
    op_shape = P.Shape()

    param = op_cast(param, mstype.float32)
    m = op_cast(m, mstype.float32)
    v = op_cast(v, mstype.float32)
    gradient = op_cast(gradient, mstype.float32)

    next_m = op_mul(beta1, m) + op_mul(
        op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1, gradient)

    next_v = op_mul(beta2, v) + op_mul(
        op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2, op_square(gradient))

    update = next_m / (op_sqrt(next_v) + eps)
    if decay_flag:
        update = update + op_mul(weight_decay_tensor, param)

    update_with_lr = op_mul(lr, update)
    next_param = param - op_reshape(update_with_lr, op_shape(param))

    next_v = F.depend(next_v, F.assign(param, next_param))
    next_v = F.depend(next_v, F.assign(m, next_m))
    next_v = F.depend(next_v, F.assign(v, next_v))
    return next_v

def update_opt_step(learning_rate, batch_size, parameter, gradient):
    """
    Update opt step.

    Args:
        learning_rate (Tensor): Learning rate.
        batch_size (Tensor): Batch size.
        parameter (Tensor): Parameter.
        gradient (Tensor): Gradients.

    Returns:
        Tensor, the updated parameter.
    """
    next_param = parameter - learning_rate * gradient / batch_size
    F.assign(parameter, next_param)
    return next_param

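# A quick numerical sanity check of the rule above, done in plain NumPy so it is
# independent of MindSpore: next_param = parameter - learning_rate * gradient / batch_size.
# The concrete values are illustrative only.
import numpy as np

parameter = np.array([1.0, 2.0], dtype=np.float32)
gradient = np.array([0.4, -0.8], dtype=np.float32)
learning_rate, batch_size = 0.1, 2.0

expected = parameter - learning_rate * gradient / batch_size
# expected == [0.98, 2.04]
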
def tensor_run_opt(opt, iters, learning_rate, momentum, gradient, variable, moment):
    """tensor_run_opt"""
    success = True
    new_weight = opt(variable, moment, learning_rate, gradient, momentum)
    success = F.depend(success, F.assign(variable, new_weight))
    return success

def tensor_grad_scale(scale, grad, accu_grad):
    """Scale the accumulated gradient by 1/scale, then clear the accumulator."""
    new_grad = accu_grad * reciprocal(scale)
    zeros = F.tensor_mul(accu_grad, 0.0)
    clear = F.assign(accu_grad, zeros)
    F.control_depend(new_grad, clear)
    F.control_depend(grad, new_grad)
    return new_grad

def construct(self, beta1, beta2, one_sub_beta_1, one_sub_beta_2, gradient, eps, weight_decay_tensor, lr):
    F.assign(self.param, self.x)
    param_fp32 = self.op_cast(self.param, mstype.float32)
    m_fp32 = self.op_cast(self.m, mstype.float32)
    v_fp32 = self.op_cast(self.v, mstype.float32)
    gradient_fp32 = self.op_cast(gradient, mstype.float32)

    next_m = self.op_mul(beta1, m_fp32) + \
             self.op_mul(self.op_cast(one_sub_beta_1, mstype.float32), gradient_fp32)
    next_v = self.op_mul(beta2, v_fp32) + \
             self.op_mul(self.op_cast(one_sub_beta_2, mstype.float32), self.op_square(gradient_fp32))
    update = next_m / (eps + self.op_sqrt(next_v))
    if self.decay_flag:
        update = self.op_mul(weight_decay_tensor, param_fp32) + update
    update_with_lr = self.op_mul(lr, update)
    next_param = param_fp32 - self.op_reshape(update_with_lr, self.op_shape(param_fp32))

    depend_v = F.depend(next_param, F.assign(self.param, next_param))
    depend_v = F.depend(depend_v, F.assign(self.m, next_m))
    depend_v = F.depend(depend_v, F.assign(self.v, next_v))
    F.assign(self.x, self.m)
    return depend_v

def construct(self, input_ids, input_mask, token_type_id, label_ids):
    """Defines the computation performed."""
    weights = self.weights
    saved = ()
    for i in range(self.length):
        saved = saved + (F.assign(self.saved_params[i], weights[i]),)

    assign_embedding = ()
    for i in range(self.quant_embedding_list_length):
        quant_embedding = self.quantize_embedding(weights[self.quant_embedding_list[i]])
        assign_embedding = assign_embedding + (F.assign(weights[self.quant_embedding_list[i]],
                                                        quant_embedding),)
        F.control_depend(saved, assign_embedding[i])

    assign_weight = ()
    for i in range(self.quant_weight_list_length):
        quant_weight = self.quantize_weight(weights[self.quant_weight_list[i]])
        assign_weight = assign_weight + (F.assign(weights[self.quant_weight_list[i]],
                                                  quant_weight),)
        F.control_depend(saved, assign_weight[i])

    for i in range(self.quant_embedding_list_length):
        F.control_depend(assign_embedding[i], input_ids)
    for i in range(self.quant_weight_list_length):
        F.control_depend(assign_weight[i], input_ids)

    grads = self.grad(self.network, weights)(input_ids,
                                             input_mask,
                                             token_type_id,
                                             label_ids,
                                             self.cast(F.tuple_to_array((self.sens,)),
                                                       mstype.float32))
    F.control_depend(input_ids, grads)
    # apply grad reducer on grads
    grads = self.grad_reducer(grads)
    grads = self.hyper_map(F.partial(clip_grad, gradient_cfg.clip_type, gradient_cfg.clip_value), grads)

    restore = ()
    for i in range(self.length):
        restore = restore + (F.assign(weights[i], self.saved_params[i]),)
        F.control_depend(grads, restore[i])

    succ = self.optimizer(grads)
    for i in range(self.length):
        F.control_depend(restore[i], succ)
    return succ

def construct(self, beta1, beta2, gradient, eps, weight_decay_tensor, lr):
    param_fp32 = self.op_cast(self.param, mstype.float32)
    m_fp32 = self.op_cast(self.m, mstype.float32)
    v_fp32 = self.op_cast(self.v, mstype.float32)
    gradient_fp32 = self.op_cast(gradient, mstype.float32)

    next_m = self.op_mul(beta1, m_fp32) + \
             self.op_mul(self.op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1, gradient_fp32)
    next_v = self.op_mul(beta2, v_fp32) + \
             self.op_mul(self.op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2,
                         self.op_square(gradient_fp32))
    update = next_m / (eps + self.op_sqrt(next_v))
    if self.decay_flag:
        update = self.op_mul(weight_decay_tensor, param_fp32) + update
    update_with_lr = self.op_mul(lr, update)
    next_param = param_fp32 - self.op_reshape(update_with_lr, self.op_shape(param_fp32))

    next_v = F.depend(next_v, F.assign(self.param, next_param))
    next_v = F.depend(next_v, F.assign(self.m, next_m))
    next_v = F.depend(next_v, F.assign(self.v, next_v))
    return next_v

def _reset_accu_grads(accu_grad):
    succ = True
    return F.depend(succ, F.assign(accu_grad, zeroslike(accu_grad)))

def _update_accu_grads(accu_grad, grad):
    succ = True
    return F.depend(succ, F.assign(accu_grad, cast(grad, mstype.float32)))

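# A minimal sketch, under assumed names, of how per-parameter helpers like the two above
# are usually wired up: register them on MultitypeFuncGraphs and map them over the tuples
# of accumulators and gradients with a HyperMap inside the accumulation cell's construct().
# This is illustrative wiring, not the original file's code.
from mindspore.ops import composite as C

update_accu_grads = C.MultitypeFuncGraph("update_accu_grads")
update_accu_grads.register("Tensor", "Tensor")(_update_accu_grads)

reset_accu_grads = C.MultitypeFuncGraph("reset_accu_grads")
reset_accu_grads.register("Tensor")(_reset_accu_grads)

# In construct() (attribute names assumed):
#   accu_succ = self.hyper_map(update_accu_grads, self.accu_grads, grads)
#   ...
#   reset_succ = self.hyper_map(reset_accu_grads, self.accu_grads)
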
def construct(self, x, y):
    add_res = self.add(x, y)
    F.depend(add_res, F.assign(self.param, add_res))
    return add_res

def _update_run_op(beta1, beta2, eps, global_step, lr, weight_decay, param, m, v, gradient,
                   decay_flag, optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations.
            Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations.
            Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability.
            Should be greater than 0.
        global_step (Tensor): Global step.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Specifies whether param update with weight decay.
        optim_filter (bool): Applies parameter update or not.

    Returns:
        Tensor, the updated parameter when `optim_filter` is True, otherwise the
        original gradient.
    """
    if optim_filter:
        op_mul = P.Mul()
        op_sqrt = P.Sqrt()
        op_rsqrt = P.Rsqrt()
        op_square = P.Square()
        op_cast = P.Cast()
        op_reshape = P.Reshape()
        op_shape = P.Shape()
        op_pow = P.Pow()
        op_norm = layer.Norm()
        op_select = P.Select()
        op_greater = P.Greater()
        op_fill = P.Fill()
        op_dtype = P.DType()

        param_fp32 = op_cast(param, mstype.float32)
        m_fp32 = op_cast(m, mstype.float32)
        v_fp32 = op_cast(v, mstype.float32)
        gradient_fp32 = op_cast(gradient, mstype.float32)

        next_m = op_mul(beta1, m_fp32) + op_mul(
            op_cast(num_one, mstype.float32) - beta1, gradient_fp32)

        next_v = op_mul(beta2, v_fp32) + op_mul(
            op_cast(num_one, mstype.float32) - beta2, op_square(gradient_fp32))

        next_mm = next_m / (op_cast(num_one, mstype.float32)
                            - op_pow(beta1, op_cast(global_step + num_one, mstype.float32)))
        next_vv = next_v / (op_cast(num_one, mstype.float32)
                            - op_pow(beta2, op_cast(global_step + num_one, mstype.float32)))
        w_norm = op_norm(param_fp32)
        g_norm = op_norm(gradient_fp32)

        g_norm_hat = op_norm(op_mul(next_mm, op_rsqrt(next_vv + eps)) + weight_decay * param_fp32)
        zeros = F.zeros_like(w_norm)
        ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0)
        trust_ratio = op_select(
            op_greater(w_norm, zeros),
            op_select(op_greater(g_norm, zeros), w_norm / g_norm_hat, ones),
            ones)
        tens = op_fill(op_dtype(trust_ratio), op_shape(trust_ratio), 10.0)
        trust_ratio = C.clip_by_value(trust_ratio, zeros, tens)
        update = next_mm / (op_sqrt(next_vv) + eps)

        if decay_flag:
            update = update + op_mul(weight_decay, param_fp32)

        update_with_lr = op_mul(op_mul(trust_ratio, lr), update)

        next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

        next_param = F.depend(next_param, F.assign(param, op_cast(next_param, F.dtype(param))))
        next_param = F.depend(next_param, F.assign(m, op_cast(next_m, F.dtype(m))))
        next_param = F.depend(next_param, F.assign(v, op_cast(next_v, F.dtype(v))))

        return op_cast(next_param, F.dtype(param))
    return gradient

def _clear_grad_sum(grad_sum, zero):
    """Apply zero to clear grad_sum."""
    success = True
    success = F.depend(success, F.assign(grad_sum, zero))
    return success

def construct(self, x, y):
    F.assign(self.cov_step, y)
    F.assign(x, y)
    return x