Example 1
def _compute_process(input_list):
    # unpack inputs: var, m, lr, logbase, sign_decay, beta, grad
    var, m, lr, logbase, sign_decay, beta, grad = input_list

    # updated momentum
    m_t = _compute_m_t(m, beta, grad)

    # sign agreement between momentum and gradient: sign(m_t) * sign(grad)
    sign_gm = te.lang.cce.vmul(sign(m_t), sign(grad))

    # scaling term built from logbase, sign_decay and the sign agreement
    update = _compute_update(logbase, sign_decay, sign_gm, grad)

    # apply the lr-scaled update to var
    var_t = _compute_var(var, lr, update)

    return var_t, m_t
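
The helper bodies (_compute_m_t, _compute_update, _compute_var) are not shown above. As a point of reference, here is a minimal NumPy sketch of the PowerSign-style update this pipeline appears to implement; the helper formulas below are assumptions based on the standard PowerSign rule, not taken from this code:

import numpy as np

def power_sign_update(var, m, lr, logbase, sign_decay, beta, grad):
    m_t = beta * m + (1.0 - beta) * grad                    # assumed _compute_m_t
    sign_gm = np.sign(m_t) * np.sign(grad)                  # mirrors the vmul of signs
    update = np.exp(logbase * sign_decay * sign_gm) * grad  # assumed _compute_update
    var_t = var - lr * update                               # assumed _compute_var
    return var_t, m_t

var = np.array([0.5, -0.3], dtype=np.float32)
m = np.zeros_like(var)
grad = np.array([0.1, 0.2], dtype=np.float32)
print(power_sign_update(var, m, 0.01, np.log(2.0), 1.0, 0.9, grad))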
Example 2
def _compute_positive(prox_v, alpha_broad, l1_broad, l2_broad):
    """
    the operator's compute
    var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}

    Parameters:
    ----------
    prox_v: the value of prox_v
    alpha_broad: the value of alpha_broad
    l1_broad: the value of l1_broad
    l2_broad: the value of l2_broad

    Returns
    the value of var_res
    """
    prox_v_abs = te.lang.cce.vabs(prox_v)
    prox_v_sign = sign(prox_v)
    # 1+alpha*l2
    alpha_l2 = te.lang.cce.vmul(alpha_broad, l2_broad)
    alpha_l2_1 = te.lang.cce.vadds(alpha_l2, tvm.const(CONST_ONE, "float32"))
    # max{|prox_v|-alpha*l1,0}
    alpha_l1 = te.lang.cce.vmul(alpha_broad, l1_broad)
    alpha_l1_neg = te.lang.cce.vmuls(alpha_l1,
                                     tvm.const(CONST_ONE_NEG, "float32"))
    prox_v_l1 = te.lang.cce.vadd(prox_v_abs, alpha_l1_neg)
    max_value = te.lang.cce.vmax(
        prox_v_l1,
        te.lang.cce.broadcast(tvm.const(CONST_ZERO, "float32"), prox_v.shape))
    # sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
    res = te.lang.cce.vdiv(prox_v_sign, alpha_l2_1)
    var_res = te.lang.cce.vmul(res, max_value)

    return var_res
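
A minimal NumPy reference for the same soft-thresholding formula, useful for sanity-checking the te.lang.cce pipeline above (numpy and the toy inputs are assumptions):

import numpy as np

def compute_positive_ref(prox_v, alpha, l1, l2):
    # var = sign(prox_v) / (1 + alpha*l2) * max{|prox_v| - alpha*l1, 0}
    return (np.sign(prox_v) / (1.0 + alpha * l2)
            * np.maximum(np.abs(prox_v) - alpha * l1, 0.0))

print(compute_positive_ref(np.array([0.7, -0.2, 1.5], dtype=np.float32),
                           alpha=0.1, l1=2.0, l2=0.5))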
Example 3
def bessel_i1e_compute(x, y, kernel_name="bessel_i1e"):
    """
    Algorithm:
    I0 = 1 + ((z/2)/1!)^2 + ((z/2)^2/2!)^2 + ... + ((z/2)^n/n!)^2
    I0e = I0 / exp(z)
    I1e = I0e * z / (2*(k+1))
    u = 4 * v^2
    Ive = (1 - (u-1)/(8*z) + (u-1)*(u-9)/(2! * (8*z)^2)
           - (u-1)*(u-9)*(u-25)/(3! * (8*z)^3)) / sqrt(2*pi*z)

    Parameters
    ----------
    x: the placeholder of data input

    y: the dict of output

    kernel_name: cce kernel name, default value is "bessel_i1e"

    Returns
    -------
    A tensor. Has the same type as x.
    """

    shape_input = x.shape
    dtype_input = x.dtype

    # cast float16 input to float32 at the start for higher precision when supported
    if dtype_input == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    abs_data = te.lang.cce.vabs(x)

    broad_const_limit = te.lang.cce.broadcast(tvm.const(CONST_LIMIT, x.dtype), shape_input)
    before_res = _before_res_compute(abs_data, broad_const_limit)
    after_res = _after_res_compute(abs_data, broad_const_limit)

    if abs_data.dtype == before_res.dtype and \
            api_check_support("te.lang.cce.vcmpsel", abs_data.dtype):
        res = te.lang.cce.vcmpsel(abs_data,
                                  broad_const_limit,
                                  'lt',
                                  before_res,
                                  after_res)
    else:
        select_index = te.lang.cce.vcmp(abs_data, broad_const_limit, 'lt')
        res = te.lang.cce.vsel(select_index, before_res, after_res)

    data_sign = util_compute.sign(x)
    res = te.lang.cce.vmul(res, data_sign)

    if dtype_input == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
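
The structure of the compute, a piecewise select plus sign restoration, in NumPy, checked against scipy.special.i1e (assuming SciPy is available; the branch functions and the 3.75 limit below are stand-ins, since _before_res_compute, _after_res_compute and CONST_LIMIT are not shown):

import numpy as np
from scipy import special

def i1e_piecewise(x, limit=3.75):
    # Both branches are stand-ins (scipy's i1e); the real kernel uses a
    # small-|x| series and a large-|x| asymptotic expansion instead.
    ax = np.abs(x)
    before = special.i1e(ax)                   # placeholder for _before_res_compute
    after = special.i1e(ax)                    # placeholder for _after_res_compute
    res = np.where(ax < limit, before, after)  # vcmpsel / vcmp + vsel
    return res * np.sign(x)                    # I1e is an odd function of x

x = np.array([-5.0, -0.5, 0.5, 5.0])
print(np.allclose(i1e_piecewise(x), special.i1e(x)))  # True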
Example 4
def atan_compute(x, y, kernel_name="atan"):
    """
    Algorithm: atan

    ----------------------------------
    Parameters:

    x: Input data

    y : the dict of output

    kernel_name: cce kernel name, default value is "atan"

    ----------------------------------
    Returns:

        A Tensor of atan(x).

    """

    dtype = x.dtype
    shape = x.shape

    if dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")
    abs_data = te.lang.cce.vabs(x)

    tensor_one = te.lang.cce.broadcast(tvm.const(CONST_POS_ONE, x.dtype),
                                       shape)

    abs_data_sub_one = te.lang.cce.vsub(abs_data, tensor_one)
    abs_data_add_one = te.lang.cce.vadd(abs_data, tensor_one)
    abs_data2 = te.lang.cce.vdiv(abs_data_sub_one, abs_data_add_one)
    abs_data2 = te.lang.cce.vabs(abs_data2)

    # branch for |x| <= 1: direct Taylor expansion
    res = _do_taylor(abs_data)
    # branch for |x| > 1: atan(x) = pi/4 + atan((x-1)/(x+1))
    res_mt_one = _do_taylor(abs_data2)
    res_mt_one = te.lang.cce.vadds(res_mt_one, CONST_PI_BY_FOUR)

    res = te.lang.cce.vmin(res, res_mt_one)

    sign_mask = util_compute.sign(x)
    res = te.lang.cce.vmul(res, sign_mask)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")
    return res
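
The range reduction above relies on the identity atan(a) = pi/4 + atan((a-1)/(a+1)) for a > -1; a quick NumPy check (np.arctan as reference, toy values assumed):

import numpy as np

a = np.array([0.3, 1.0, 2.5, 10.0])
lhs = np.arctan(a)
rhs = np.pi / 4 + np.arctan((a - 1.0) / (a + 1.0))
print(np.allclose(lhs, rhs))  # True for a > -1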
Example 5
def _atan_compute(input_x):
    """
    Algorithm: atan

    ----------------------------------
    Parameters:

        input_x: Input data.

    ----------------------------------
    Returns:

        A Tensor of atan(x).

    """

    shape = input_x.shape
    dtype = input_x.dtype
    if dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        input_x = te.lang.cce.cast_to(input_x, "float32")
    abs_data = te.lang.cce.vabs(input_x)

    tensor_one = te.lang.cce.broadcast(tvm.const(CONST_POS_ONE, input_x.dtype),
                                       shape)

    abs_data_sub_one = te.lang.cce.vsub(abs_data, tensor_one)
    abs_data_add_one = te.lang.cce.vadd(abs_data, tensor_one)
    abs_data2 = te.lang.cce.vdiv(abs_data_sub_one, abs_data_add_one)
    abs_data2 = te.lang.cce.vabs(abs_data2)

    # branch for |x| <= 1: direct Taylor expansion
    res = _do_taylor(abs_data)
    # branch for |x| > 1: atan(x) = pi/4 + atan((x-1)/(x+1))
    res_mt_one = _do_taylor(abs_data2)
    res_mt_one = te.lang.cce.vadds(res_mt_one, CONST_PI_BY_FOUR)

    res = te.lang.cce.vmin(res, res_mt_one)

    sign_mask = util_compute.sign(input_x)
    res = te.lang.cce.vmul(res, sign_mask)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")
    return res
Example 6
def asinh_compute_cloud(input_x, output_y, kernel_name="asinh"):
    """
    algorithm: asinh(x) = log(x + sqrt(x^2 + 1))

    Parameters
    ----------
    input_x: the placeholder of data input

    output_y : the dict of output

    kernel_name : cce kernel name, default value is "asinh"

    Returns
    -------
    res : result of asinh

    """

    inp_dtype = input_x.dtype.lower()
    has_improve_precision = False
    if inp_dtype == "float16" and \
            tbe_platform.cce_conf.api_check_support("te.lang.cce.vlog",
                                                    "float32"):
        input_x = te.lang.cce.cast_to(input_x, "float32")
        has_improve_precision = True
        inp_dtype = "float32"

    data_abs = te.lang.cce.vabs(input_x)
    data_x_square = te.lang.cce.vmul(data_abs, data_abs)
    data_add = te.lang.cce.vadds(data_x_square,
                                 tvm.const(CONST_ONE, inp_dtype))
    data_s_1_sqrt = te.lang.cce.vsqrt(data_add)
    data_res = te.lang.cce.vadd(data_s_1_sqrt, data_abs)
    result = te.lang.cce.vlog(data_res)
    res = te.lang.cce.vmul(result, sign(input_x))

    if has_improve_precision:
        res = te.lang.cce.cast_to(res, "float16")

    return res
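
A NumPy sketch of the symmetric formulation used above: evaluating log(|x| + sqrt(x^2+1)) and restoring the sign avoids cancellation in x + sqrt(x^2+1) for large negative x (np.arcsinh assumed as the reference):

import numpy as np

def asinh_ref(x):
    ax = np.abs(x)
    return np.sign(x) * np.log(ax + np.sqrt(ax * ax + 1.0))

x = np.array([-1.0e4, -2.0, 0.5, 3.0])
print(np.allclose(asinh_ref(x), np.arcsinh(x)))  # True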
Example 7
def apply_ftrl_d_compute(var,
                         accum,
                         linear,
                         grad,
                         lr,
                         l1,
                         l2,
                         lr_power,
                         var_out,
                         accum_out,
                         linear_out,
                         kernel_name='apply_ftrl_d'):
    """
    Update '*var' according to the Ftrl-proximal algorithm.

    accum_new = accum + grad * grad
    linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
    x = l1 * linear.sign - linear
    y = accum_new^(-lr_power) / lr + 2 * l2
    var = x / y if |linear| > l1 else 0.0
    accum = accum_new

    Parameters:
    ----------
    var : mutable tensor var.

    accum: mutable tensor accum.

    linear : mutable tensor linear.

    grad : tensor grad.

    lr : scalar lr.

    l1 : scalar l1.

    l2 : scalar l2.

    lr_power : scalar lr_power.

    var_out : the dict of var output.

    accum_out : the dict of accum output.

    linear_out : the dict of linear output.

    kernel_name : cce kernel name, default value is "apply_ftrl_d" (optional).

    Returns:
    -------
    the value of var_t, accum_new, linear_t and the output data
    """

    # cast to float32 for higher accuracy
    dtype = var.dtype
    has_improve_precision = False
    if dtype == "float16" and \
        tbe_platform.cce_conf.api_check_support("te.lang.cce.vexp",
                                                "float32"):
        var_tmp = te.lang.cce.cast_to(var, "float32")
        accum_tmp = te.lang.cce.cast_to(accum, "float32")
        linear_tmp = te.lang.cce.cast_to(linear, "float32")
        grad = te.lang.cce.cast_to(grad, "float32")
        lr = te.lang.cce.cast_to(lr, "float32")
        l1 = te.lang.cce.cast_to(l1, "float32")
        l2 = te.lang.cce.cast_to(l2, "float32")
        lr_power = te.lang.cce.cast_to(lr_power, "float32")
        has_improve_precision = True
    else:
        var_tmp = te.lang.cce.vadds(var, tvm.const(NUM_ZERO, dtype))
        accum_tmp = te.lang.cce.vadds(accum, tvm.const(NUM_ZERO, dtype))
        linear_tmp = te.lang.cce.vadds(linear, tvm.const(NUM_ZERO, dtype))

    # broadcast scalar to appropriate shape
    zero_tensor = te.lang.cce.broadcast(tvm.const(NUM_ZERO, var_tmp.dtype),
                                        var.shape)
    lr = te.lang.cce.broadcast(lr, var.shape)
    l1 = te.lang.cce.broadcast(l1, var.shape)
    l2 = te.lang.cce.broadcast(l2, var.shape)
    lr_power = te.lang.cce.broadcast(lr_power, var.shape)

    # 1.accum_new = accum + grad^2
    gs = te.lang.cce.vmul(grad, grad)
    accum_new = te.lang.cce.vadd(accum_tmp, gs)

    # 2.linear += grad - (accum_new^(-lr_power)-accum^(-lr_power))/lr*var
    lr_power = te.lang.cce.vmuls(lr_power, tvm.const(NUM_M_ONE, var_tmp.dtype))
    accum_new_p = _pow(accum_new, lr_power, zero_tensor)
    accum_p = _pow(accum_tmp, lr_power, zero_tensor)
    accum_p = te.lang.cce.vsub(accum_new_p, accum_p)

    accum_p = te.lang.cce.vdiv(accum_p, lr)
    accum_p = te.lang.cce.vmul(accum_p, var_tmp)
    accum_p = te.lang.cce.vsub(grad, accum_p)
    linear_t = te.lang.cce.vadd(linear_tmp, accum_p)

    # 3.x_res = l1*linear.sign()-linear
    x_res = sign(linear_t)
    x_res = te.lang.cce.vmul(x_res, l1)
    x_res = te.lang.cce.vsub(x_res, linear_t)

    # 4.y_res = accum_new^(-lr_power)/lr + 2*l2
    l2 = te.lang.cce.vmuls(l2, tvm.const(NUM_TWO, var_tmp.dtype))
    y_res = te.lang.cce.vdiv(accum_new_p, lr)
    y_res = te.lang.cce.vadd(y_res, l2)

    # 5.var = x_res / y_res if linear.abs > l1, else var = 0
    x_res = te.lang.cce.vdiv(x_res, y_res)
    linear_abs = te.lang.cce.vabs(linear_t)
    var_sel = te.lang.cce.vcmp(linear_abs, l1, 'gt')
    var_t = te.lang.cce.vsel(var_sel, x_res, zero_tensor)

    # result of vsel is fp16, should cast to fp32
    var_t = te.lang.cce.cast_to(var_t, "float32")

    if has_improve_precision:
        var_t = te.lang.cce.cast_to(var_t, "float16")
        accum_new = te.lang.cce.cast_to(accum_new, "float16")
        linear_t = te.lang.cce.cast_to(linear_t, "float16")

    # 6.var_output_data = var_t
    var_output_data = te.lang.cce.vadds(var_t,
                                        tvm.const(NUM_ZERO, var_t.dtype))
    accum_output_data = te.lang.cce.vadds(accum_new,
                                          tvm.const(NUM_ZERO, accum_new.dtype))
    linear_output_data = te.lang.cce.vadds(linear_t,
                                           tvm.const(NUM_ZERO, linear_t.dtype))

    def _compute(*index):
        return (var_t(*index), accum_new(*index), linear_t(*index),
                var_output_data(*index), accum_output_data(*index),
                linear_output_data(*index))

    return tvm.compute(var.shape, _compute, name="outputs")
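
For reference, the docstring's FTRL update as a minimal NumPy sketch (scalar hyper-parameters and toy tensors are assumptions):

import numpy as np

def ftrl_update(var, accum, linear, grad, lr, l1, l2, lr_power):
    accum_new = accum + grad * grad
    linear = linear + grad \
        - (accum_new ** (-lr_power) - accum ** (-lr_power)) / lr * var
    x = l1 * np.sign(linear) - linear
    y = accum_new ** (-lr_power) / lr + 2.0 * l2
    var = np.where(np.abs(linear) > l1, x / y, 0.0)
    return var, accum_new, linear

var = np.array([0.2, -0.4]); accum = np.ones(2); linear = np.zeros(2)
print(ftrl_update(var, accum, linear, np.array([0.1, 0.3]),
                  lr=0.1, l1=0.01, l2=0.1, lr_power=0.5))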
Example 8
def apply_ftrl_v2_d_compute(var,
                            accum,
                            linear,
                            grad,
                            lr,
                            l1,
                            l2,
                            l2_shrinkage,
                            lr_power,
                            var_out,
                            accum_out,
                            linear_out,
                            kernel_name='apply_ftrl_v2_d'):
    """
    Update '*var' according to the Ftrl-proximal algorithm.

    grad_with_shrinkage = grad + 2 * l2_shrinkage * var
    accum_new = accum + grad * grad
    linear += grad_with_shrinkage -
        (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
    x = l1 * linear.sign - linear
    y = accum_new^(-lr_power) / lr + 2 * l2
    var = x / y if |linear| > l1 else 0.0
    accum = accum_new

    Parameters:
    ----------
    var : mutable tensor var.

    accum: mutable tensor accum.

    linear : mutable tensor linear.

    grad : tensor grad.

    lr : scalar lr.

    l1 : scalar l1.

    l2 : scalar l2.

    l2_shrinkage: scalar l2_shrinkage.

    lr_power : scalar lr_power.

    var_out : the dict of output var.

    accum_out : the dict of output accum.

    linear_out : the dict of output linear.

    kernel_name : cce kernel name, default value is "apply_ftrl_v2_d".

    Returns:
    -------
    the value of var_new, accum_new, linear_new, output_data
    """
    dtype = var.dtype
    # cast to float32 for higher accuracy
    has_improve_precision = False
    if dtype == "float16" and \
        tbe_platform.cce_conf.api_check_support("te.lang.cce.vexp",
                                                "float32"):
        var_tmp = te.lang.cce.cast_to(var, "float32")
        accum_tmp = te.lang.cce.cast_to(accum, "float32")
        linear_tmp = te.lang.cce.cast_to(linear, "float32")
        grad = te.lang.cce.cast_to(grad, "float32")
        lr = te.lang.cce.cast_to(lr, "float32")
        l1 = te.lang.cce.cast_to(l1, "float32")
        l2 = te.lang.cce.cast_to(l2, "float32")
        l2_shrinkage = te.lang.cce.cast_to(l2_shrinkage, "float32")
        lr_power = te.lang.cce.cast_to(lr_power, "float32")
        has_improve_precision = True
    else:
        var_tmp = te.lang.cce.vadds(var, tvm.const(NUM_ZERO, "float32"))
        accum_tmp = te.lang.cce.vadds(accum, tvm.const(NUM_ZERO, "float32"))
        linear_tmp = te.lang.cce.vadds(linear, tvm.const(NUM_ZERO, "float32"))

    # 1.grad_with_shrinkage = grad + 2 * l2_shrinkage * var
    mul_value = te.lang.cce.vmuls(l2_shrinkage, tvm.const(NUM_TWO, "float32"))
    mul_value = te.lang.cce.broadcast(mul_value, var_tmp.shape)
    mul_value2 = te.lang.cce.vmul(mul_value, var_tmp)
    grad_with_shrinkage = te.lang.cce.vadd(grad, mul_value2)

    # 2.accum_new = accum + grad^2
    gs = te.lang.cce.vmul(grad, grad)
    accum_new = te.lang.cce.vadd(accum_tmp, gs)

    # 3.accum_pow_sub = accum_new^(-lr_power)-accum^(-lr_power)
    lr_power = te.lang.cce.vmuls(lr_power, tvm.const(NUM_M_ONE, "float32"))
    lr_power = te.lang.cce.broadcast(lr_power, var_tmp.shape)
    accum_new_pow = _pow(accum_new, lr_power)
    accum_pow = _pow(accum_tmp, lr_power)
    accum_pow_sub = te.lang.cce.vsub(accum_new_pow, accum_pow)

    # 4.linear += grad_with_shrinkage - accum_pow_sub / lr * var
    lr = te.lang.cce.broadcast(lr, var_tmp.shape)
    accum_pow_div = te.lang.cce.vdiv(accum_pow_sub, lr)
    accum_pow_mul = te.lang.cce.vmul(accum_pow_div, var_tmp)
    accum_pow = te.lang.cce.vsub(grad_with_shrinkage, accum_pow_mul)
    linear_new = te.lang.cce.vadd(linear_tmp, accum_pow)

    # 5.x_res = l1*linear.sign()-linear
    l1 = te.lang.cce.broadcast(l1, var_tmp.shape)
    x_res = sign(linear_new)
    x_res = te.lang.cce.vmul(x_res, l1)
    x_res = te.lang.cce.vsub(x_res, linear_new)

    # 6.y_res = accum_new^(-lr_power)/lr + 2*l2
    l2 = te.lang.cce.vmuls(l2, tvm.const(NUM_TWO, "float32"))
    l2 = te.lang.cce.broadcast(l2, var_tmp.shape)
    y_res = te.lang.cce.vdiv(accum_new_pow, lr)
    y_res = te.lang.cce.vadd(y_res, l2)

    # 7.var = x_res / y_res if linear.abs > l1, else var = 0
    x_res = te.lang.cce.vdiv(x_res, y_res)
    linear_abs = te.lang.cce.vabs(linear_new)
    zero_tensor = te.lang.cce.broadcast(tvm.const(NUM_ZERO, "float32"),
                                        var_tmp.shape)
    var_sel = te.lang.cce.vcmp(linear_abs, l1, 'gt')
    var_new = te.lang.cce.vsel(var_sel, x_res, zero_tensor)

    # on mini, the result of vsel is float16; cast back to float32
    var_new = te.lang.cce.cast_to(var_new, "float32")

    if has_improve_precision:
        var_new = te.lang.cce.cast_to(var_new, "float16")
        accum_new = te.lang.cce.cast_to(accum_new, "float16")
        linear_new = te.lang.cce.cast_to(linear_new, "float16")

    # 8.output_var = var_new
    output_data = te.lang.cce.vadds(var_new, tvm.const(NUM_ZERO,
                                                       var_new.dtype))
    accum_out_data = te.lang.cce.vadds(accum_new,
                                       tvm.const(NUM_ZERO, accum_new.dtype))
    linear_out_data = te.lang.cce.vadds(linear_new,
                                        tvm.const(NUM_ZERO, linear_new.dtype))

    def _compute(*index):
        return var_new(*index), accum_new(*index), \
               linear_new(*index), output_data(*index), \
               accum_out_data(*index), linear_out_data(*index)

    return tvm.compute(var.shape, _compute, name="outputs")
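
In NumPy terms, this variant differs from apply_ftrl_d_compute only in the shrinkage-corrected gradient fed into linear; a minimal sketch (toy values assumed):

import numpy as np

def ftrl_v2_update(var, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power):
    grad_shrink = grad + 2.0 * l2_shrinkage * var    # the only new step
    accum_new = accum + grad * grad                  # accum still uses the raw grad
    linear = linear + grad_shrink \
        - (accum_new ** (-lr_power) - accum ** (-lr_power)) / lr * var
    x = l1 * np.sign(linear) - linear
    y = accum_new ** (-lr_power) / lr + 2.0 * l2
    var = np.where(np.abs(linear) > l1, x / y, 0.0)
    return var, accum_new, linear

print(ftrl_v2_update(np.array([0.2]), np.ones(1), np.zeros(1), np.array([0.1]),
                     lr=0.1, l1=0.01, l2=0.1, l2_shrinkage=0.05, lr_power=0.5))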
Example 9
def apply_proximal_adagrad_d_compute(var,
                                     accum,
                                     lr,
                                     l1,
                                     l2,
                                     grad,
                                     var_out,
                                     accum_out,
                                     use_locking=False,
                                     kernel_name="apply_proximal_adagrad"):
    """
    the operator's compute
    accum += grad * grad
    learning_rate = lr_broad * rsqrt(accum)
    prox_v = var - grad * learning_rate
    if l1 > 0 :
        var = sign(prox_v)/(1+learning_rate*l2)*max{|prox_v|-learning_rate*l1,0}
    else:
        var = prox_v / (1+l2*learning_rate)

    Parameters
    ----------
    var: tensor
        the placeholder of input var.
        only support float16, float32.
    accum: tensor
        the placeholder of input accum.
        Must have the same type as 'var'.
    lr: tensor
        the placeholder of input lr.
        Must have the same type as 'var'.
    l1: tensor
        the placeholder of input l1.
        Must have the same type as 'var'.
    l2: tensor
        the placeholder of input l2.
        Must have the same type as 'var'.
    grad: tensor
        the placeholder of input grad.
        Must have the same type as 'var'.
    var_out: dict
        output tensor contains shape and dtype attributes.
        Must have the same type as 'var'.
    accum_out: dict
        output tensor contains shape and dtype attributes.
        Must have the same type as 'accum'.
    use_locking: bool
        default value is "False"
    kernel_name: str
        kernel name, default value is "apply_proximal_adagrad"

    Returns:
        the value of var_new, accum_out, output_data, output_accum_data
    """
    dtype = var.dtype
    has_improve_precision = False
    if dtype == "float16" and \
        tbe_platform.cce_conf.api_check_support("te.lang.cce.vsqrt",
                                                "float32"):
        var = te.lang.cce.cast_to(var, "float32")
        accum = te.lang.cce.cast_to(accum, "float32")
        lr = te.lang.cce.cast_to(lr, "float32")
        l1 = te.lang.cce.cast_to(l1, "float32")
        l2 = te.lang.cce.cast_to(l2, "float32")
        grad = te.lang.cce.cast_to(grad, "float32")
        has_improve_precision = True

    lr_broad = te.lang.cce.broadcast(lr, var.shape)
    l1_broad = te.lang.cce.broadcast(l1, var.shape)
    l2_broad = te.lang.cce.broadcast(l2, var.shape)

    grad_2 = te.lang.cce.vmul(grad, grad)
    accum_out = te.lang.cce.vadd(accum, grad_2)
    accum_sqrt = te.lang.cce.vsqrt(accum_out)
    learning_rate = te.lang.cce.vdiv(lr_broad, accum_sqrt)
    learning_rate_grad = te.lang.cce.vmul(grad, learning_rate)
    prox_v = te.lang.cce.vsub(var, learning_rate_grad)
    l2_lr = te.lang.cce.vmul(l2_broad, learning_rate)
    l2_lr_1 = te.lang.cce.vadds(l2_lr, tvm.const(CONST_ONE, "float32"))
    prox_v_abs = te.lang.cce.vabs(prox_v)
    prox_v_sign = sign(prox_v)
    learning_rate_l1 = te.lang.cce.vmul(learning_rate, l1_broad)
    prox_v_l1 = te.lang.cce.vsub(prox_v_abs, learning_rate_l1)
    max_value = te.lang.cce.vmax(
        prox_v_l1,
        te.lang.cce.broadcast(tvm.const(CONST_ZERO, "float32"), prox_v.shape))
    var_res = te.lang.cce.vmul(prox_v_sign, max_value)
    var_new = te.lang.cce.vdiv(var_res, l2_lr_1)
    output_data = te.lang.cce.vadds(var_new, tvm.const(CONST_ZERO, "float32"))
    output_accum_data = te.lang.cce.vadds(accum_out,
                                          tvm.const(CONST_ZERO, "float32"))

    if has_improve_precision:
        var_new = te.lang.cce.cast_to(var_new, "float16")
        accum_out = te.lang.cce.cast_to(accum_out, "float16")
        output_data = te.lang.cce.cast_to(output_data, "float16")
        output_accum_data = te.lang.cce.cast_to(output_accum_data, "float16")

    # this compute is for multi output
    def _compute(*index):
        return (var_new(*index), accum_out(*index),
                output_data(*index), output_accum_data(*index))

    return tvm.compute(var.shape, _compute, name="outputs")
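
A minimal NumPy sketch of the docstring's rule (note the TBE code above always evaluates the l1 branch via the max with zero, while this sketch follows the docstring's explicit if; toy values assumed):

import numpy as np

def proximal_adagrad_update(var, accum, grad, lr, l1, l2):
    accum = accum + grad * grad
    learning_rate = lr / np.sqrt(accum)              # lr * rsqrt(accum)
    prox_v = var - grad * learning_rate
    if l1 > 0:
        var = (np.sign(prox_v) / (1.0 + learning_rate * l2)
               * np.maximum(np.abs(prox_v) - learning_rate * l1, 0.0))
    else:
        var = prox_v / (1.0 + learning_rate * l2)
    return var, accum

print(proximal_adagrad_update(np.array([0.5, -0.5]), np.ones(2),
                              np.array([0.2, 0.1]), lr=0.1, l1=0.01, l2=0.1))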
Example 10
def apply_adagrad_da_d_compute(var,
                               gradient_accumulator,
                               gradient_squared_accumulator,
                               grad,
                               lr,
                               l1,
                               l2,
                               global_step,
                               var_out,
                               gradient_accumulator_out,
                               gradient_squared_accumulator_out,
                               kernel_name='apply_adagrad_da_d'):
    """
    Update '*var' according to the proximal adagrad (Adagrad-DA) scheme.

    grad_accum += grad
    grad_squared_accum += grad * grad
    tmp_val = sign(grad_accum) * max{|grad_accum| - l1*global_step, 0}
        if l1 > 0 else grad_accum
    x_value = -1 * lr * tmp_val
    y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    var = x_value / y_value

    Parameters:
    ----------
    var : mutable tensor var.

    gradient_accumulator: mutable tensor gradient_accumulator.

    gradient_squared_accumulator : mutable tensor gradient_squared_accumulator.

    grad : tensor grad.

    lr : scalar lr.

    l1 : scalar l1.

    l2 : scalar l2.

    global_step : scalar global_step.

    var_out : the dict of output.

    gradient_accumulator_out : the dict of output.

    gradient_squared_accumulator_out : the dict of output.

    kernel_name : cce kernel name, default value is "apply_adagrad_da".

    Returns:
    -------
    the value of var_new, gradient_accum_new, gradient_squared_accum_new
    and the output data
    """
    # cast to float32 for higher accuracy
    dtype = var.dtype
    has_improve_precision = False
    cast_type = var.dtype
    if dtype == "float16" and \
            tbe_platform.cce_conf.api_check_support("te.lang.cce.vsqrt",
                                                    "float32"):
        cast_type = "float32"
        has_improve_precision = True

    if dtype == "float16":
        if has_improve_precision:
            var_tmp = te.lang.cce.cast_to(var, "float32")
            var_tmp = te.lang.cce.vmuls(var_tmp,
                                        tvm.const(NUM_ZERO, "float32"))
            grad_accum_tmp = te.lang.cce.cast_to(gradient_accumulator,
                                                 "float32")
            grad_sq_accum_tmp = te.lang.cce.cast_to(
                gradient_squared_accumulator, "float32")
            grad = te.lang.cce.cast_to(grad, "float32")
            lr = te.lang.cce.cast_to(lr, "float32")
            l1 = te.lang.cce.cast_to(l1, "float32")
            l2 = te.lang.cce.cast_to(l2, "float32")
        else:
            var_tmp = te.lang.cce.vmuls(var, tvm.const(NUM_ZERO, "float16"))
            grad_accum_tmp = te.lang.cce.vadds(gradient_accumulator,
                                               tvm.const(NUM_ZERO, "float16"))
            grad_sq_accum_tmp = te.lang.cce.vadds(
                gradient_squared_accumulator, tvm.const(NUM_ZERO, "float16"))
    else:
        var_tmp = te.lang.cce.vmuls(var, tvm.const(NUM_ZERO, "float32"))
        grad_accum_tmp = te.lang.cce.vadds(gradient_accumulator,
                                           tvm.const(NUM_ZERO, "float32"))
        grad_sq_accum_tmp = te.lang.cce.vadds(gradient_squared_accumulator,
                                              tvm.const(NUM_ZERO, "float32"))

    global_step = te.lang.cce.cast_to(global_step, cast_type)

    # 1.grad_accum += grad
    gradient_accum_new = te.lang.cce.vadd(grad_accum_tmp, grad)

    # 2.grad_squared_accum += grad * grad
    gs = te.lang.cce.vmul(grad, grad)
    gradient_squared_accum_new = te.lang.cce.vadd(grad_sq_accum_tmp, gs)

    # 3.tmp_val = sign(grad_accum) * max{|grad_accum| - l1*global_step, 0}
    #     if l1 > 0 else grad_accum
    sign_val = sign(gradient_accum_new)
    abs_val = te.lang.cce.vabs(gradient_accum_new)

    mul_val = te.lang.cce.vmul(global_step, l1)
    mul_val = te.lang.cce.broadcast(mul_val, var_tmp.shape)
    sub_val = te.lang.cce.vsub(abs_val, mul_val)
    zero_tensor = te.lang.cce.broadcast(tvm.const(NUM_ZERO, cast_type),
                                        var_tmp.shape)
    max_val = te.lang.cce.vmax(sub_val, zero_tensor)
    tmp_val = te.lang.cce.vmul(sign_val, max_val)

    l1 = te.lang.cce.broadcast(l1, var_tmp.shape)
    l1_cmp = te.lang.cce.vcmp(l1, zero_tensor, "gt")
    tmp_val = te.lang.cce.vsel(l1_cmp, tmp_val, gradient_accum_new)

    # 4.x_value = -1 * lr * tmp_val
    x_value = te.lang.cce.vmuls(lr, tvm.const(NUM_M_ONE, cast_type))
    x_value = te.lang.cce.broadcast(x_value, var_tmp.shape)
    x_value = te.lang.cce.vmul(x_value, tmp_val)

    # 5.y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    pro_val = te.lang.cce.vmul(l2, global_step)
    pro_val = te.lang.cce.vmul(pro_val, lr)
    pro_val = te.lang.cce.broadcast(pro_val, var_tmp.shape)
    sqrt_val = te.lang.cce.vsqrt(gradient_squared_accum_new, priority_flag=1.0)
    y_value = te.lang.cce.vadd(pro_val, sqrt_val)

    # 6.var = x_value / y_value
    var_t = te.lang.cce.vdiv(x_value, y_value)
    var_new = te.lang.cce.vadd(var_t, var_tmp)

    if dtype == "float16" and has_improve_precision:
        var_new = te.lang.cce.cast_to(var_new, "float16")
        gradient_accum_new = te.lang.cce.cast_to(gradient_accum_new, "float16")
        gradient_squared_accum_new = te.lang.cce.cast_to(
            gradient_squared_accum_new, "float16")

    # 7. output_data = var_new
    output_data = te.lang.cce.vadds(var_new, tvm.const(NUM_ZERO,
                                                       var_new.dtype))
    res1_data = te.lang.cce.vadds(gradient_accum_new,
                                  tvm.const(NUM_ZERO, var_new.dtype))
    res2_data = te.lang.cce.vadds(gradient_squared_accum_new,
                                  tvm.const(NUM_ZERO, var_new.dtype))

    def _compute(*index):
        return var_new(*index), gradient_accum_new(*index), \
               gradient_squared_accum_new(*index), output_data(*index),\
               res1_data(*index), res2_data(*index)

    return tvm.compute(var.shape, _compute, name="outputs")
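
The docstring's update rule as a minimal NumPy sketch (toy values assumed):

import numpy as np

def adagrad_da_update(var, grad_accum, grad_sq_accum, grad,
                      lr, l1, l2, global_step):
    grad_accum = grad_accum + grad
    grad_sq_accum = grad_sq_accum + grad * grad
    if l1 > 0:
        tmp_val = np.sign(grad_accum) * np.maximum(
            np.abs(grad_accum) - l1 * global_step, 0.0)
    else:
        tmp_val = grad_accum
    x_value = -1.0 * lr * tmp_val
    y_value = l2 * global_step * lr + np.sqrt(grad_sq_accum)
    var = x_value / y_value
    return var, grad_accum, grad_sq_accum

print(adagrad_da_update(np.zeros(2), np.zeros(2), np.zeros(2),
                        np.array([0.3, -0.1]), lr=0.1, l1=0.01, l2=0.1,
                        global_step=1))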
Example 11
def asin_compute(x, y, kernel_name="asin"):
    """
    do element-wise asin compute
    asin(x) = | arcsin(sqrt(1-x^2)) - HALF_PI, x belongs to (-1, -2^(-0.5))
              | the 15th order taylor expansion, x belongs to (-2^(-0.5), 2^(-0.5))
              | HALF_PI - arcsin(sqrt(1-x^2)), x belongs to (2^(-0.5), 1)

    Parameters:
    ----------
    x: the placeholder of data input

    y : the dict of output

    kernel_name : cce kernel name, default value is "asin"

    Returns
    -------
    A Tensor. Has the same type as data_input.
    """

    shape = x.shape
    dtype = x.dtype

    # Change dtype to float32
    if dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    # Sign mask
    sign = util_compute.sign(x)

    # All positive
    x = te.lang.cce.vmul(x, sign)

    # x belongs to (0, 2^(-0.5))
    if api_check_support("te.lang.cce.vmins", x.dtype):
        choice_1 = te.lang.cce.vmins(x, tvm.const(BOUNDARY_1, x.dtype))
    else:
        boundary_mask1 = te.lang.cce.broadcast(tvm.const(BOUNDARY_1, x.dtype),
                                               shape)
        choice_1 = te.lang.cce.vmin(x, boundary_mask1)

    if api_check_support("te.lang.cce.vsubs", choice_1.dtype):
        choice_1 = te.lang.cce.vsubs(choice_1,
                                     tvm.const(BOUNDARY_1, choice_1.dtype))
    else:
        boundary_mask1 = te.lang.cce.broadcast(
            tvm.const(BOUNDARY_1, choice_1.dtype), shape)
        choice_1 = te.lang.cce.vsub(choice_1, boundary_mask1)

    choice_1 = te.lang.cce.vmuls(te.lang.cce.floor(choice_1), NEG_NUM_ONE)

    res_1 = _taylor_compute(x)
    res_1 = te.lang.cce.vmul(res_1, choice_1)

    # x belongs to (2^(-0.5), 1)
    choice_2 = te.lang.cce.vmuls(choice_1, tvm.const(NEG_NUM_ONE, x.dtype))
    choice_2 = te.lang.cce.vadds(choice_2, tvm.const(NUM_ONE, x.dtype))

    res_2 = te.lang.cce.vmul(x, x)
    res_2 = te.lang.cce.vmuls(res_2, tvm.const(NEG_NUM_ONE, x.dtype))
    res_2 = te.lang.cce.vadds(res_2, tvm.const(NUM_ONE, x.dtype))
    res_2_sqrt = te.lang.cce.vsqrt(res_2)

    res_2 = _taylor_compute(res_2_sqrt, res_2)

    res_2 = te.lang.cce.vmuls(res_2, tvm.const(NEG_NUM_ONE, x.dtype))
    res_2 = te.lang.cce.vadds(res_2, tvm.const(HALF_PI, x.dtype))
    res_2 = te.lang.cce.vmul(res_2, choice_2)

    # Restore sign
    res_1 = te.lang.cce.vadd(res_1, res_2)
    res_1 = te.lang.cce.vmul(res_1, sign)

    # Restore dtype
    if dtype == "float16":
        res_1 = te.lang.cce.cast_to(res_1, "float16")

    return res_1
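
A quick NumPy check of the reflection identity behind the second branch, asin(x) = HALF_PI - asin(sqrt(1 - x^2)) for x in (2^(-0.5), 1) (np.arcsin as reference, toy values assumed):

import numpy as np

x = np.array([0.75, 0.9, 0.99])
lhs = np.arcsin(x)
rhs = np.pi / 2 - np.arcsin(np.sqrt(1.0 - x * x))
print(np.allclose(lhs, rhs))  # True; the sign mask extends this to negative x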