Example #1
0
def _update_run_op_for_map(beta1, beta2, eps, lr, weight_decay_tensor, param,
                           m, v, gradient, decay_flag):
    """Update one parameter in place (param, m and v) and return the new value of v."""
    op_mul = P.Mul()
    op_square = P.Square()
    op_sqrt = P.Sqrt()
    op_cast = P.Cast()
    op_reshape = P.Reshape()
    op_shape = P.Shape()

    param_fp32 = op_cast(param, mstype.float32)
    m_fp32 = op_cast(m, mstype.float32)
    v_fp32 = op_cast(v, mstype.float32)
    gradient_fp32 = op_cast(gradient, mstype.float32)

    next_m = op_mul(beta1, m_fp32) + op_mul(
        op_cast(F.tuple_to_array(
            (1.0, )), mstype.float32) - beta1, gradient_fp32)

    next_v = op_mul(beta2, v_fp32) + op_mul(
        op_cast(F.tuple_to_array(
            (1.0, )), mstype.float32) - beta2, op_square(gradient_fp32))

    update = next_m / (op_sqrt(next_v) + eps)
    if decay_flag:
        update = update + op_mul(weight_decay_tensor, param_fp32)

    update_with_lr = op_mul(lr, update)
    next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

    next_v = F.depend(next_v, F.assign(param, next_param))
    next_v = F.depend(next_v, F.assign(m, next_m))
    next_v = F.depend(next_v, F.assign(v, next_v))
    return next_v
Example #2
0
    def broadcast_params(self, optim_result):
        """
        Apply Broadcast operations in the sequential order of parameter groups.

        Returns:
             bool, the status flag.
        """
        param_group = []
        key_group = []
        for _ in range(self.dev_num):
            param_group.append(F.make_tuple())
            key_group.append(F.make_tuple())
        for i in range(self.param_length):
            param_group[self.param_rank[i]] = param_group[
                self.param_rank[i]] + (self.parameters[i], )
            key = P.MakeRefKey(self.param_names[i])()
            key_group[
                self.param_rank[i]] = key_group[self.param_rank[i]] + (key, )
        new_param_group = []
        for root in range(self.dev_num):
            ops = P.Broadcast(root)
            next_params = ops(param_group[root])
            new_param_group.append(next_params)
            for i in range(F.tuple_len(next_params)):
                F.assign(key_group[root][i], next_params[i])
        status = F.control_depend(optim_result, new_param_group[0][0])
        for i in range(self.dev_num - 1):
            status = F.depend(
                F.control_depend(new_param_group[i],
                                 new_param_group[i + 1][0]), status)

        return status
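A minimal usage sketch, not taken from the example above: a training wrapper that chains broadcast_params onto the optimizer result so the broadcast operations are ordered after the parameter update. The wrapper class, its attributes, and the GradOperation setup are illustrative assumptions.

import mindspore.nn as nn
from mindspore.ops import composite as C, functional as F

class TrainStepWithBroadcast(nn.Cell):
    """Hypothetical train step: update the parameters, then broadcast them from each root rank."""

    def __init__(self, network, optimizer):
        super().__init__()
        self.network = network
        self.optimizer = optimizer
        self.weights = optimizer.parameters
        self.grad = C.GradOperation(get_by_list=True)

    def construct(self, *inputs):
        loss = self.network(*inputs)
        grads = self.grad(self.network, self.weights)(*inputs)
        optim_result = self.optimizer(grads)
        # broadcast_params consumes the optimizer result, so the broadcast runs after the update
        status = self.optimizer.broadcast_params(optim_result)
        return F.depend(loss, status)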
Example #3
0
def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient,
                   decay_flag, optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Whether to apply weight decay.
        optim_filter (bool): Whether to apply the parameter update.

    Returns:
        Tensor, the updated parameter value, or the original gradient if `optim_filter` is False.
    """
    if optim_filter:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        op_cast = P.Cast()
        op_reshape = P.Reshape()
        op_shape = P.Shape()

        param_fp32 = op_cast(param, mstype.float32)
        m_fp32 = op_cast(m, mstype.float32)
        v_fp32 = op_cast(v, mstype.float32)
        gradient_fp32 = op_cast(gradient, mstype.float32)

        next_m = op_mul(beta1, m_fp32) + op_mul(
            op_cast(F.tuple_to_array(
                (1.0, )), mstype.float32) - beta1, gradient_fp32)

        next_v = op_mul(beta2, v_fp32) + op_mul(
            op_cast(F.tuple_to_array(
                (1.0, )), mstype.float32) - beta2, op_square(gradient_fp32))

        update = next_m / (eps + op_sqrt(next_v))
        if decay_flag:
            update = op_mul(weight_decay, param_fp32) + update

        update_with_lr = op_mul(lr, update)
        next_param = param_fp32 - op_reshape(update_with_lr,
                                             op_shape(param_fp32))

        next_param = F.depend(
            next_param, F.assign(param, op_cast(next_param, F.dtype(param))))
        next_param = F.depend(next_param,
                              F.assign(m, op_cast(next_m, F.dtype(m))))
        next_param = F.depend(next_param,
                              F.assign(v, op_cast(next_v, F.dtype(v))))

        return op_cast(next_param, F.dtype(param))
    return gradient
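A sketch of how a per-parameter helper like _update_run_op is typically wired into a MindSpore optimizer: registered on a MultitypeFuncGraph and applied to every parameter with hyper_map. The graph name, the registration type strings, and the attribute names in the trailing comment are illustrative assumptions, not taken from the example above.

from mindspore.ops import composite as C, functional as F

_adam_opt = C.MultitypeFuncGraph("adam_opt")

@_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor",
                    "Tensor", "Tensor", "Tensor", "Bool", "Bool")
def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient,
                   decay_flag, optim_filter):
    ...  # body as in the example above

# In the optimizer's construct, the shared hyperparameters are bound with F.partial and
# the helper is mapped over the per-parameter tuples, e.g.:
#   optim_result = self.hyper_map(
#       F.partial(_adam_opt, self.beta1, self.beta2, self.eps, lr, self.weight_decay),
#       self.parameters, self.moments1, self.moments2, gradients,
#       self.decay_flags, self.optim_filter)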
Example #4
0
 def construct(self, x, y):
     out = self.zero
     for i in range(self.max_cycles):
         if out <= 20:
             self.weight = out
             F.assign(self.weight, i)
             out = x * y + out
     return out, self.weight
Example #5
0
 def step_end(self, run_context):
     cb_params = run_context.original_args()
     arr_lr = cb_params.optimizer.learning_rate.asnumpy()
     lr = float(np.array2string(arr_lr))
     new_lr = self.learning_rate_function(lr, cb_params.cur_step_num)
     if not math.isclose(lr, new_lr, rel_tol=1e-10):
         F.assign(cb_params.optimizer.learning_rate, Tensor(new_lr, mstype.float32))
         print(f'At step {cb_params.cur_step_num}, learning_rate change to {new_lr}')
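A short sketch of how a step_end hook like this is usually packaged as a Callback and attached to training; the class name, the lambda, and the model.train call are illustrative assumptions.

from mindspore.train.callback import Callback

class LrSchedulerCallback(Callback):
    """Hypothetical callback that adjusts the optimizer's learning rate at the end of every step."""

    def __init__(self, learning_rate_function):
        super().__init__()
        self.learning_rate_function = learning_rate_function

    def step_end(self, run_context):
        ...  # body as in the example above

# Attached when launching training, e.g.:
#   model.train(epoch_num, train_dataset,
#               callbacks=[LrSchedulerCallback(lambda lr, step: lr * 0.99)])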
Example #6
0
 def construct(self, x, y):
     out = self.zero
     i = self.i
     if x > y:
         while i < self.max_cycles:
             self.weight = i
             F.assign(self.weight, i)
             out = x * y + out
             i = i + 1
     return out, self.weight
Example #7
0
 def construct(self, x, y):
     i = self.i
     out = self.zero
     while i < self.max_cycles:
         if out <= 20:
             out = x * y + out
             # Using F.Assign here would throw a NameSpace error, so F.assign is used instead.
             F.assign(self.weight, i)
             self.weight = i
         i = i + 1
     return out, self.weight
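The construct methods in these control-flow examples reference attributes (self.i, self.zero, self.max_cycles, self.weight) defined on the enclosing Cell. A hedged sketch of what that wrapper typically looks like; every name, dtype, and value here is an assumption for illustration.

import mindspore.nn as nn
from mindspore import Tensor, Parameter
from mindspore.common import dtype as mstype

class AssignInWhileNet(nn.Cell):
    """Hypothetical enclosing Cell for the construct above."""

    def __init__(self):
        super().__init__()
        self.i = Tensor(0, mstype.int32)
        self.zero = Tensor(0, mstype.int32)
        self.max_cycles = Tensor(10, mstype.int32)
        self.weight = Parameter(Tensor(0, mstype.int32), name="weight")

    def construct(self, x, y):
        ...  # body as in Example #7 above

# net = AssignInWhileNet()
# out, weight = net(Tensor(1, mstype.int32), Tensor(2, mstype.int32))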
Example #8
0
def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
                         beta2_power, beta1, beta2, eps, lr, gradient, param, m, v, ps_parameter, cache_enable):
    """Apply sparse adam optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices
    values = gradient.values
    if ps_parameter and not cache_enable:
        op_shape = P.Shape()
        shapes = (op_shape(param), op_shape(m), op_shape(v),
                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
                  op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2,
                                               eps, values, indices), shapes), param))
        return success

    if not target:
        success = F.depend(success, sparse_opt(param, m, v, beta1_power, beta2_power, lr, beta1, beta2,
                                               eps, values, indices))
    else:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        scatter_add = P.ScatterAdd(use_locking)

        success = F.depend(success, F.assign(m, op_mul(beta1, m)))
        success = F.depend(success, F.assign(v, op_mul(beta2, v)))

        grad_indices = gradient.indices
        grad_value = gradient.values

        next_m = scatter_add(m,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))

        next_v = scatter_add(v,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta2, op_square(grad_value)))

        if use_nesterov:
            m_temp = next_m * _scaler_ten
            F.assign(m, op_mul(beta1, next_m))
            div_value = scatter_add(m,
                                    op_mul(grad_indices, _scaler_one),
                                    op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
            param_update = div_value / (op_sqrt(next_v) + eps)
            F.assign(m, m_temp / _scaler_ten)
        else:
            param_update = next_m / (op_sqrt(next_v) + eps)

        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)
        next_param = param - lr_t * param_update

        success = F.depend(success, F.assign(param, next_param))
        success = F.depend(success, F.assign(m, next_m))
        success = F.depend(success, F.assign(v, next_v))

    return success
Example #9
0
 def construct(self, x, y):
     i = self.i
     out = self.zero
     while i < self.max_cycles:
         F.assign(self.weight, i)
         self.weight = i
         out = x * y + out
         i = i + 1
     if out >= 20:
         F.assign(self.weight, out)
         self.weight = out
         out = out - 20
     return out, self.weight
Example #10
0
    def construct(self,
                  input_ids,
                  input_mask,
                  token_type_id,
                  label_ids,
                  sens=None):
        """Defines the computation performed."""
        weights = self.weights
        for i in range(self.length):
            F.assign(self.saved_params[i], weights[i])

        for i in range(self.quant_embedding_list_length):
            quant_embedding = self.quantize_embedding(
                weights[self.quant_embedding_list[i]])
            F.assign(weights[self.quant_embedding_list[i]], quant_embedding)

        for i in range(self.quant_weight_list_length):
            quant_weight = self.quantize_weight(
                weights[self.quant_weight_list[i]])
            F.assign(weights[self.quant_weight_list[i]], quant_weight)

        if sens is None:
            scaling_sens = self.loss_scale
        else:
            scaling_sens = sens

        # alloc status and clear should be right before grad operation
        init = self.alloc_status()
        self.clear_before_grad(init)
        grads = self.grad(self.network,
                          weights)(input_ids, input_mask, token_type_id,
                                   label_ids,
                                   self.cast(scaling_sens, mstype.float32))
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
        grads = self.hyper_map(
            F.partial(grad_scale, scaling_sens * self.degree), grads)
        grads = self.hyper_map(
            F.partial(clip_grad, self.clip_type, self.clip_value), grads)

        for i in range(self.length):
            param = F.depend(self.saved_params[i], grads)
            F.assign(weights[i], param)

        self.get_status(init)
        flag_sum = self.reduce_sum(init, (0, ))
        if self.is_distributed:
            # sum overflow flag over devices
            flag_reduce = self.allreduce(flag_sum)
            cond = self.less_equal(self.base, flag_reduce)
        else:
            cond = self.less_equal(self.base, flag_sum)
        overflow = cond
        if sens is None:
            overflow = self.loss_scaling_manager(self.loss_scale, cond)
        if overflow:
            succ = False
        else:
            succ = self.optimizer(grads)
        return succ
Example #11
0
    def construct(self, input_ids, input_mask, token_type_id, label_ids):
        """Defines the computation performed."""
        weights = self.weights
        for i in range(self.length):
            F.assign(self.saved_params[i], weights[i])

        for i in range(self.quant_embedding_list_length):
            quant_embedding = self.quantize_embedding(
                weights[self.quant_embedding_list[i]])
            F.assign(weights[self.quant_embedding_list[i]], quant_embedding)

        for i in range(self.quant_weight_list_length):
            quant_weight = self.quantize_weight(
                weights[self.quant_weight_list[i]])
            F.assign(weights[self.quant_weight_list[i]], quant_weight)

        grads = self.grad(self.network,
                          weights)(input_ids, input_mask, token_type_id,
                                   label_ids,
                                   self.cast(F.tuple_to_array((self.sens, )),
                                             mstype.float32))
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
        grads = self.hyper_map(
            F.partial(clip_grad, self.clip_type, self.clip_value), grads)

        for i in range(self.length):
            param = F.depend(self.saved_params[i], grads)
            F.assign(weights[i], param)

        succ = self.optimizer(grads)
        return succ
Example #12
0
def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v,
                   gradient, decay_flag):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay_tensor (Tensor): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.

    Returns:
        Tensor, the new value of v after updating.
    """
    op_mul = P.Mul()
    op_square = P.Square()
    op_sqrt = P.Sqrt()
    op_cast = P.Cast()
    op_reshape = P.Reshape()
    op_shape = P.Shape()

    param_fp32 = op_cast(param, mstype.float32)
    m_fp32 = op_cast(m, mstype.float32)
    v_fp32 = op_cast(v, mstype.float32)
    gradient_fp32 = op_cast(gradient, mstype.float32)

    next_m = op_mul(beta1, m_fp32) + op_mul(
        op_cast(F.tuple_to_array(
            (1.0, )), mstype.float32) - beta1, gradient_fp32)

    next_v = op_mul(beta2, v_fp32) + op_mul(
        op_cast(F.tuple_to_array(
            (1.0, )), mstype.float32) - beta2, op_square(gradient_fp32))

    update = next_m / (op_sqrt(next_v) + eps)
    if decay_flag:
        update = update + op_mul(weight_decay_tensor, param_fp32)

    update_with_lr = op_mul(lr, update)
    next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

    # Assign back to the original Parameters (not the fp32 casts) so the update
    # actually takes effect on param, m and v.
    next_v = F.depend(next_v, F.assign(param, next_param))
    next_v = F.depend(next_v, F.assign(m, next_m))
    next_v = F.depend(next_v, F.assign(v, next_v))
    return next_v
Example #13
0
def update_opt_step(learning_rate, batch_size, parameter, gradient):
    """
    Update opt step.

    Args:
        learning_rate (Tensor): Learning rate.
        batch_size (Tensor): Batch Size.
        parameter (Tensor): Parameter.
        gradient (Tensor): Gradients.

    Returns:
        Tensor, the updated parameter value.
    """
    next_param = parameter - learning_rate * gradient / batch_size
    F.assign(parameter, next_param)
    return next_param
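A standalone sanity check for update_opt_step, run in PyNative mode; the shapes and values below are arbitrary assumptions for illustration.

import numpy as np
from mindspore import Tensor, Parameter, context
from mindspore.common import dtype as mstype

context.set_context(mode=context.PYNATIVE_MODE)

weight = Parameter(Tensor(np.ones([2, 2]), mstype.float32), name="weight")
grad = Tensor(np.full((2, 2), 0.5), mstype.float32)
lr = Tensor(0.1, mstype.float32)
batch_size = Tensor(4.0, mstype.float32)

# update_opt_step writes the new value back into `weight` via F.assign and also returns it;
# the expected result is 1.0 - 0.1 * 0.5 / 4.0 = 0.9875 for every element.
new_weight = update_opt_step(lr, batch_size, weight, grad)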
Example #14
0
def tensor_run_opt(opt, iters, learning_rate, momentum, gradient, variable,
                   moment):
    """ tensor_run_opt """
    success = True
    new_weight = opt(variable, moment, learning_rate, gradient, momentum)
    success = F.depend(success, F.assign(variable, new_weight))
    return success
Example #15
0
def tensor_grad_scale(scale, grad, accu_grad):
    """Return the accumulated gradient scaled by 1/scale and clear the accumulation buffer."""
    new_grad = accu_grad * reciprocal(scale)
    zeros = F.tensor_mul(accu_grad, 0.0)
    clear = F.assign(accu_grad, zeros)
    F.control_depend(new_grad, clear)
    F.control_depend(grad, new_grad)
    return new_grad
Example #16
0
    def construct(self, beta1, beta2, one_sub_beta_1, one_sub_beta_2, gradient, eps, weight_decay_tensor, lr):
        F.assign(self.param, self.x)

        param_fp32 = self.op_cast(self.param, mstype.float32)
        m_fp32 = self.op_cast(self.m, mstype.float32)
        v_fp32 = self.op_cast(self.v, mstype.float32)
        gradient_fp32 = self.op_cast(gradient, mstype.float32)

        next_m = self.op_mul(beta1, m_fp32) + \
            self.op_mul(self.op_cast(one_sub_beta_1,
                                     mstype.float32), gradient_fp32)
        next_v = self.op_mul(beta2, v_fp32) + self.op_mul(self.op_cast(one_sub_beta_2,
                                                                       mstype.float32), self.op_square(gradient_fp32))
        update = next_m / (eps + self.op_sqrt(next_v))
        if self.decay_flag:
            update = self.op_mul(weight_decay_tensor, param_fp32) + update
        update_with_lr = self.op_mul(lr, update)
        next_param = param_fp32 - \
            self.op_reshape(update_with_lr, self.op_shape(param_fp32))

        depend_v = F.depend(next_param, F.assign(self.param, next_param))
        depend_v = F.depend(depend_v, F.assign(self.m, next_m))
        depend_v = F.depend(depend_v, F.assign(self.v, next_v))

        F.assign(self.x, self.m)
        return depend_v
Example #17
0
 def construct(self, input_ids, input_mask, token_type_id, label_ids):
     """Defines the computation performed."""
     weights = self.weights
     saved = ()
     for i in range(self.length):
         saved = saved + (F.assign(self.saved_params[i], weights[i]), )
     assign_embedding = ()
     for i in range(self.quant_embedding_list_length):
         quant_embedding = self.quantize_embedding(
             weights[self.quant_embedding_list[i]])
         assign_embedding = assign_embedding + (F.assign(
             weights[self.quant_embedding_list[i]], quant_embedding), )
         F.control_depend(saved, assign_embedding[i])
     assign_weight = ()
     for i in range(self.quant_weight_list_length):
         quant_weight = self.quantize_weight(
             weights[self.quant_weight_list[i]])
         assign_weight = assign_weight + (F.assign(
             weights[self.quant_weight_list[i]], quant_weight), )
         F.control_depend(saved, assign_weight[i])
     for i in range(self.quant_embedding_list_length):
         F.control_depend(assign_embedding[i], input_ids)
     for i in range(self.quant_weight_list_length):
         F.control_depend(assign_weight[i], input_ids)
     grads = self.grad(self.network,
                       weights)(input_ids, input_mask, token_type_id,
                                label_ids,
                                self.cast(F.tuple_to_array((self.sens, )),
                                          mstype.float32))
     F.control_depend(input_ids, grads)
     # apply grad reducer on grads
     grads = self.grad_reducer(grads)
     grads = self.hyper_map(
         F.partial(clip_grad, gradient_cfg.clip_type,
                   gradient_cfg.clip_value), grads)
     restore = ()
     for i in range(self.length):
         restore = restore + (F.assign(weights[i], self.saved_params[i]), )
         F.control_depend(grads, restore[i])
     succ = self.optimizer(grads)
     for i in range(self.length):
         F.control_depend(restore[i], succ)
     return succ
Example #18
0
    def construct(self, beta1, beta2, gradient, eps, weight_decay_tensor, lr):
        param_fp32 = self.op_cast(self.param, mstype.float32)
        m_fp32 = self.op_cast(self.m, mstype.float32)
        v_fp32 = self.op_cast(self.v, mstype.float32)
        gradient_fp32 = self.op_cast(gradient, mstype.float32)

        next_m = self.op_mul(beta1, m_fp32) + \
                 self.op_mul(self.op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1, gradient_fp32)
        next_v = self.op_mul(beta2, v_fp32) + self.op_mul(self.op_cast(F.tuple_to_array((1.0,)), mstype.float32) - \
                                                          beta2, self.op_square(gradient_fp32))
        update = next_m / (eps + self.op_sqrt(next_v))
        if self.decay_flag:
            update = self.op_mul(weight_decay_tensor, param_fp32) + update
        update_with_lr = self.op_mul(lr, update)
        next_param = param_fp32 - self.op_reshape(update_with_lr,
                                                  self.op_shape(param_fp32))

        next_v = F.depend(next_v, F.assign(self.param, next_param))
        next_v = F.depend(next_v, F.assign(self.m, next_m))
        next_v = F.depend(next_v, F.assign(self.v, next_v))
        return next_v
Example #19
0
def _reset_accu_grads(accu_grad):
    """Clear the accumulation buffer by assigning zeros to it."""
    succ = True
    return F.depend(succ, F.assign(accu_grad, zeroslike(accu_grad)))
Example #20
0
def _update_accu_grads(accu_grad, grad):
    """Overwrite the accumulation buffer with the current gradient cast to float32."""
    succ = True
    return F.depend(succ, F.assign(accu_grad, cast(grad, mstype.float32)))
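For context, a sketch of how small assign helpers like these are typically registered and mapped over the accumulation buffers; the MultitypeFuncGraph name and the hyper_map call shown in the comment are assumptions for illustration.

from mindspore.ops import composite as C, functional as F, operations as P

zeroslike = P.ZerosLike()
reset_accu_grads = C.MultitypeFuncGraph("reset_accu_grads")

@reset_accu_grads.register("Tensor")
def _reset_accu_grads(accu_grad):
    """Clear one accumulation buffer and tie the success flag to the assign."""
    succ = True
    return F.depend(succ, F.assign(accu_grad, zeroslike(accu_grad)))

# In the accumulation cell's construct the helper is mapped over every buffer, e.g.:
#   succ = self.hyper_map(reset_accu_grads, self.accu_grads)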
Example #21
0
    def construct(self, x, y):
        add_res = self.add(x, y)
        F.depend(add_res, F.assign(self.param, add_res))

        return add_res
Example #22
0
def _update_run_op(beta1, beta2, eps, global_step, lr, weight_decay, param, m,
                   v, gradient, decay_flag, optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        global_step (Tensor): Current global step.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Whether to apply weight decay during the parameter update.
        optim_filter (bool): Whether to apply the parameter update.

    Returns:
        Tensor, the updated parameter value, or the original gradient if `optim_filter` is False.
    """
    if optim_filter:
        op_mul = P.Mul()
        op_sqrt = P.Sqrt()
        op_rsqrt = P.Rsqrt()
        op_square = P.Square()
        op_cast = P.Cast()
        op_reshape = P.Reshape()
        op_shape = P.Shape()
        op_pow = P.Pow()
        op_norm = layer.Norm()
        op_select = P.Select()
        op_greater = P.Greater()
        op_fill = P.Fill()
        op_dtype = P.DType()

        param_fp32 = op_cast(param, mstype.float32)
        m_fp32 = op_cast(m, mstype.float32)
        v_fp32 = op_cast(v, mstype.float32)
        gradient_fp32 = op_cast(gradient, mstype.float32)

        next_m = op_mul(beta1, m_fp32) + op_mul(
            op_cast(num_one, mstype.float32) - beta1, gradient_fp32)

        next_v = op_mul(beta2, v_fp32) + op_mul(
            op_cast(num_one, mstype.float32) - beta2, op_square(gradient_fp32))

        next_mm = next_m / (op_cast(num_one, mstype.float32) - op_pow(
            beta1, op_cast(global_step + num_one, mstype.float32)))
        next_vv = next_v / (op_cast(num_one, mstype.float32) - op_pow(
            beta2, op_cast(global_step + num_one, mstype.float32)))
        w_norm = op_norm(param_fp32)
        g_norm = op_norm(gradient_fp32)

        g_norm_hat = op_norm(
            op_mul(next_mm, op_rsqrt(next_vv + eps)) +
            weight_decay * param_fp32)
        zeros = F.zeros_like(w_norm)
        ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0)
        trust_ratio = op_select(
            op_greater(w_norm, zeros),
            op_select(op_greater(g_norm, zeros), w_norm / g_norm_hat, ones),
            ones)
        tens = op_fill(op_dtype(trust_ratio), op_shape(trust_ratio), 10.0)
        trust_ratio = C.clip_by_value(trust_ratio, zeros, tens)
        update = next_mm / (op_sqrt(next_vv) + eps)

        if decay_flag:
            update = update + op_mul(weight_decay, param_fp32)

        update_with_lr = op_mul(op_mul(trust_ratio, lr), update)

        next_param = param_fp32 - op_reshape(update_with_lr,
                                             op_shape(param_fp32))

        next_param = F.depend(
            next_param, F.assign(param, op_cast(next_param, F.dtype(param))))
        next_param = F.depend(next_param,
                              F.assign(m, op_cast(next_m, F.dtype(m))))
        next_param = F.depend(next_param,
                              F.assign(v, op_cast(next_v, F.dtype(v))))

        return op_cast(next_param, F.dtype(param))
    return gradient
Example #23
0
def _clear_grad_sum(grad_sum, zero):
    """Apply zero to clear grad_sum."""
    success = True
    success = F.depend(success, F.assign(grad_sum, zero))
    return success
Example #24
0
 def construct(self, x, y):
     F.assign(self.cov_step, y)
     F.assign(x, y)
     return x