Example #1
def fused_bn_reduce_grad(data0,
                         data1,
                         data2,
                         data3,
                         data4,
                         data5,
                         data6,
                         data7,
                         layout='NHWC',
                         out_dtype='float16',
                         target=utils.CUDA):

    if layout == 'NCHW':
        data3 = topi.transpose(data3, (0, 2, 3, 1))
        data7 = topi.transpose(data7, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    n, h, w, c = data3.shape
    const = n * h * w
    inter_dtype = 'float32'
    out1 = topi.multiply(data4, data5)
    out1 = topi.divide(out1, const)
    out1 = topi.expand_dims(out1, axis=0, num_newaxis=3)
    out1 = topi.broadcast_to(out1, (n, h, w, c))

    data3 = topi.cast(data3, inter_dtype)
    data2 = topi.expand_dims(data2, axis=0, num_newaxis=3)
    data2 = topi.broadcast_to(data2, (n, h, w, c))
    out2 = topi.multiply(data3, const)
    out2 = topi.subtract(out2, data2)

    data1 = topi.expand_dims(data1, axis=0, num_newaxis=3)
    data1 = topi.broadcast_to(data1, (n, h, w, c))
    data7 = topi.cast(data7, inter_dtype)
    out3 = topi.divide(data6, const)
    out3 = topi.subtract(data7, out3)
    out3 = topi.multiply(data1, out3)
    out3 = topi.divide(out3, data0)

    output = topi.subtract(out2, out3)
    output = topi.multiply(output, out1)

    output = topi.cast(output, out_dtype)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output
Example #2
File: asin_grad.py Project: zhuyawen/akg
def _asin_grad_compute(x, dy):
    """Compute asin_grad."""

    dtype = x.dtype
    if dtype == "float16":
        x = topi.cast(x, "float32")
        dy = topi.cast(dy, "float32")

    # step 1: calculate num_to_vrsqrt = 1 - x^2
    data = topi.multiply(x, x)
    data = topi.multiply(data, tvm.const(-1, "float32"))
    num_to_vrsqrt = topi.add(data, tvm.const(1, "float32"))

    # step 2: calculate dy * (1 / sqrt(1 - x^2))
    if utils.product_is_mini():
        # mini: use newton's method for high accuracy result
        res = _vrsqrt_newton(num_to_vrsqrt)
        res = topi.multiply(res, dy)
    else:
        # cloud: use vdiv for high efficiency computation
        vsqrt_res = topi.sqrt(num_to_vrsqrt)
        res = topi.divide(dy, vsqrt_res)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res
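
Since d/dx asin(x) = 1 / sqrt(1 - x^2), the two steps above can be mirrored by a small NumPy reference; the helper name below is illustrative and not part of the akg API:

import numpy as np

def asin_grad_reference(x, dy):
    """Mirror of _asin_grad_compute: dy * 1 / sqrt(1 - x^2)."""
    num_to_vrsqrt = 1.0 - x * x            # step 1: 1 - x^2
    return dy / np.sqrt(num_to_vrsqrt)     # step 2: dy * rsqrt(1 - x^2)

x = np.linspace(-0.9, 0.9, 7).astype(np.float32)
dy = np.ones_like(x)
assert np.allclose(asin_grad_reference(x, dy), 1.0 / np.sqrt(1.0 - x * x))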
Example #3
def fused_bn_follow(data0, data1, data2, data3, data4, target=utils.CUDA):
    """
    input:
    data: length is 5
    data0: param0 beta
    data1: param1 gamma
    data2: param2 BNupdate: xi_variance
    data3: param6 BNreduce: xi_mean
    data4: param7 xi_conv2d

    layout: (N, C, H, W)

    output:
    beta + gamma * xi_variance * ( xi -  xi_mean/(N*H*W) )
    """

    n, h, w, c = data4.shape
    const = n * h * w
    inter_dtype = 'float32'
    data4 = topi.cast(data4, inter_dtype)

    multiply0 = topi.divide(data3, const)
    multiply0 = topi.expand_dims(multiply0, axis=0, num_newaxis=3)
    multiply0 = topi.broadcast_to(multiply0, (n, h, w, c))

    subtract0 = topi.subtract(data4, multiply0)

    multiply1 = topi.multiply(subtract0, data2)
    multiply2 = topi.multiply(multiply1, data1)

    add0 = topi.add(multiply2, data0)

    return add0
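
A minimal NumPy sketch of the docstring's formula, assuming NHWC input and per-channel parameters (the helper name is illustrative):

import numpy as np

def bn_follow_reference(beta, gamma, variance, mean_sum, x):
    """beta + gamma * variance * (x - mean_sum / (N*H*W)), per the docstring."""
    n, h, w, c = x.shape
    mean = mean_sum / (n * h * w)                          # data3 / const
    return gamma * variance * (x.astype(np.float32) - mean) + beta

x = np.random.rand(2, 4, 4, 3).astype(np.float16)
beta, gamma, variance, mean_sum = (np.random.rand(3).astype(np.float32) for _ in range(4))
print(bn_follow_reference(beta, gamma, variance, mean_sum, x).shape)   # (2, 4, 4, 3)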
Example #4
def fused_bn_follow_relu_avgpool(data0, data1, data2, data3, data4, data5, layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    input:
    data: length is 6
    data0: tensor1 after bn_double_relu
    data1-data5: bn parameters for conv2d tensor2
    layout: only (N, H, W, C), (N, C, H, W) supported
    out_dtype: float16

    output:
    avg-pooling( max(batch-normalized tensor1 + batch-normalized tensor2,  0) )
    """
    if layout == 'NCHW':
        data0 = topi.transpose(data0, (0, 2, 3, 1))
        data5 = topi.transpose(data5, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))

    n, h, w, c = data0.shape
    inter_dtype = 'float32'
    add0 = fused_bn_follow(data1, data2, data3, data4, data5)
    add0 = topi.cast(add0, data0.dtype)
    add1 = topi.add(data0, add0)
    output = topi.maximum(add1, 0)
    output = topi.cast(output, inter_dtype)
    output = topi.sum(output, axis=(1, 2))
    output = topi.divide(output, h * w)
    output = topi.cast(output, out_dtype)

    return output
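
The tail of this kernel (ReLU followed by global average pooling over H and W) can be written in NumPy as follows (illustrative only):

import numpy as np

def relu_avgpool_reference(t1, t2_bn, out_dtype=np.float16):
    """avg-pool( max(t1 + t2_bn, 0) ) over the H and W axes of NHWC tensors."""
    relu = np.maximum(t1 + t2_bn, 0).astype(np.float32)
    n, h, w, c = t1.shape
    return (relu.sum(axis=(1, 2)) / (h * w)).astype(out_dtype)

t1 = np.random.rand(2, 4, 4, 8).astype(np.float32)
t2_bn = np.random.rand(2, 4, 4, 8).astype(np.float32)
print(relu_avgpool_reference(t1, t2_bn).shape)   # (2, 8)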
Example #5
def _atan_compute(data):
    """compute for atan"""
    dtype = data.dtype

    if dtype == "float16":
        data = topi.cast(data, "float32")

    abs_data = topi.abs(data)
    tensor_one = dc.one_const(abs_data.dtype)

    abs_data_sub_one = topi.subtract(abs_data, tensor_one)
    abs_data_add_one = topi.add(abs_data, tensor_one)
    abs_data2 = topi.abs(topi.divide(abs_data_sub_one, abs_data_add_one))

    # calculate result for data less than one
    res = _do_atan_taylor(abs_data)
    # calculate result for data greater than one
    res_mt_one = topi.add(_do_atan_taylor(abs_data2),
                          tvm.const(CONST_PI_BY_FOUR, abs_data2.dtype))
    res = topi.minimum(res, res_mt_one)

    if utils.product_is_mini() and data.dtype == "float32":
        sign_mask = topi.cast(topi.sign(topi.cast(data, "float16")), "float32")
    else:
        sign_mask = topi.sign(data)

    res = topi.multiply(res, sign_mask)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res
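
The minimum() trick relies on the identity atan(x) = pi/4 + atan((x - 1) / (x + 1)) for x > 0, which maps arguments greater than one back into the Taylor-friendly range; a quick NumPy check (illustrative):

import numpy as np

x = np.linspace(0.1, 10.0, 50)
assert np.allclose(np.arctan(x), np.pi / 4 + np.arctan((x - 1.0) / (x + 1.0)))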
Example #6
def fused_mul_div_rsqrt_mul_isfinite_red(input1, input2, out_dtype):
    """
    fused operator.

    Args:
        input1: tvm.tensor.Tensor.
        input2: tvm.tensor.Tensor.
        out_dtype: dtype of the returned Tensors.

    Returns:
        list of tvm.tensor.Tensor.
    """
    mul_param1 = topi.multiply(input2, input2)
    divide_val = topi.divide(1, mul_param1)
    rsqrt_val = topi.rsqrt(divide_val)
    mul_param0 = topi.multiply(input1, rsqrt_val)
    isfinite = topi.isfinite(mul_param0)
    reduce_and = topi.all(isfinite)

    if mul_param0.dtype != out_dtype:
        mul_param0 = topi.cast(mul_param0, out_dtype)
        rsqrt_val = topi.cast(rsqrt_val, out_dtype)
        divide_val = topi.cast(divide_val, out_dtype)

    return [reduce_and, mul_param0, rsqrt_val, divide_val]
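
Algebraically, rsqrt(1 / input2^2) equals |input2|, so mul_param0 is input1 * |input2|; a NumPy sketch of the four returned values (illustrative only):

import numpy as np

def fused_mul_div_rsqrt_reference(input1, input2, out_dtype=np.float16):
    divide_val = 1.0 / (input2 * input2)
    rsqrt_val = 1.0 / np.sqrt(divide_val)            # equals |input2|
    mul_param0 = input1 * rsqrt_val
    reduce_and = np.all(np.isfinite(mul_param0))
    return [reduce_and,
            mul_param0.astype(out_dtype),
            rsqrt_val.astype(out_dtype),
            divide_val.astype(out_dtype)]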
Example #7
def sigmoid_cross_entropy_with_logits_grad_compute(predict, target, dout):
    """sigmoid_cross_entropy_with_logits_grad compute implementation"""
    dtype = predict.dtype
    if dtype == "float16":
        predict = topi.cast(predict, "float32")
        target = topi.cast(target, "float32")
        dout = topi.cast(dout, "float32")

    # e^x
    val1 = exp(predict)
    # 1 + e^x
    val2 = topi.add(val1, tvm.const(SCALAR_ONE, dtype="float32"))
    # e^x / (1 + e^x)
    val3 = topi.divide(val1, val2)
    # -target
    val4 = topi.multiply(target, tvm.const(SCALAR_NEGTIVE_ONE,
                                           dtype="float32"))
    # e^x / (1 + e^x) -y
    val5 = topi.add(val3, val4)

    result = topi.multiply(val5, dout)

    if dtype == "float16":
        result = topi.cast(result, dtype)
    return result
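
Since e^x / (1 + e^x) is sigmoid(x), the whole gradient reduces to (sigmoid(predict) - target) * dout; a NumPy reference (illustrative):

import numpy as np

def sigmoid_ce_grad_reference(predict, target, dout):
    sig = np.exp(predict) / (1.0 + np.exp(predict))   # e^x / (1 + e^x)
    return (sig - target) * dout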
Example #8
File: asinh.py Project: mindspore-ai/akg
def asinh(x, target=utils.CCE):
    r"""
    Compute asinh function.

    .. math:: asinh(x) = log(x+\sqrt{x*x+1})

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32. 

    Returns:
       tvm.tensor.Tensor, has the same type and shape as x.
    
    Supported Platforms:
        'Ascend'
    """
    # check shape
    utils.check_shape(x)

    # check input tensor data_type
    utils.ops_dtype_check(x.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    dtype = x.dtype

    # Known that, asinh(x) = log(x + sqrt(x*x+1)), and, asinh(-x) = -asinh(x)
    # If x is a large negative number, (x + sqrt(x*x+1)) will be close to zero.
    # So, asinh(x) = sign(x) * log(|x| + sqrt(|x|*|x| + 1))
    compute_dtype = dtype
    if dtype == "float16":
        # To avoid overflow and higher accuracy, x is casted to float32
        compute_dtype = "float32"
        x = topi.cast(x, compute_dtype)

    x_abs = topi.abs(x)

    if product_is_mini():
        # sqrt(|x|*|x| + 1) = |x| * sqrt(1 + 1/(|x|*|x|))
        vsquare_add_one = topi.add(1,
                                   topi.divide(1, topi.multiply(x_abs, x_abs)))
        sqrt_compute_value = sqrt_mini_newton_iter_impl(vsquare_add_one)
        sqrt_value = topi.multiply(x_abs, sqrt_compute_value)
    else:
        x_abs_square_add_one = topi.add(topi.multiply(x_abs, x_abs), 1)
        sqrt_value = topi.sqrt(x_abs_square_add_one)

    x_add_sqrt = topi.add(x_abs, sqrt_value)

    if product_is_mini():
        log_value = log_compute_mini_impl(x_add_sqrt, target)
    else:
        log_value = topi.log(x_add_sqrt)

    res = topi.multiply(Sign(x, target), log_value)

    if res.dtype != dtype:
        res = topi.cast(res, dtype)

    if product_is_mini():
        attrs = {"enable_auto_inline": False}
        return res, attrs
    return res
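
The sign-based reformulation can be checked against NumPy's arcsinh (illustrative):

import numpy as np

x = np.linspace(-50.0, 50.0, 101).astype(np.float32)
x_abs = np.abs(x)
res = np.sign(x) * np.log(x_abs + np.sqrt(x_abs * x_abs + 1.0))
assert np.allclose(res, np.arcsinh(x), rtol=1e-4)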
Example #9
def truncate_div_compute(input_x1, input_x2):
    """compute for truncate_div"""
    int_list = ("int32", "int8", "uint8")

    if input_x1.dtype in int_list:
        data_zero = dc.zero_const("float32")
        data_x_broad = cast(input_x1, "float32")
        data_y_broad = cast(input_x2, "float32")
        res_div = topi.divide(data_x_broad, data_y_broad)
        res_min_int = ceil(topi.minimum(res_div, data_zero))
        res_max_int = floor(topi.maximum(res_div, data_zero))
        res_trunc = topi.add(res_min_int, res_max_int)
        res_trunc = cast(res_trunc, "float32")
    else:
        res_trunc = topi.divide(input_x1, input_x2)

    return cast(res_trunc, input_x1.dtype)
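
The ceil-of-the-negative-part plus floor-of-the-positive-part pattern is rounding toward zero; NumPy's trunc reproduces it (illustrative):

import numpy as np

x1 = np.array([7, -7, 9, -9], dtype=np.int32)
x2 = np.array([2, 2, -4, -4], dtype=np.int32)
res_div = x1.astype(np.float32) / x2.astype(np.float32)
res_trunc = np.ceil(np.minimum(res_div, 0.0)) + np.floor(np.maximum(res_div, 0.0))
assert np.array_equal(res_trunc, np.trunc(res_div))
print(res_trunc.astype(np.int32))    # [ 3 -3 -2  2]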
Example #10
def fused_relu_grad_bn_reduce_grad(data_1,
                                   data_2,
                                   data_3,
                                   data_4,
                                   data_5,
                                   data_6,
                                   data_7,
                                   data_8,
                                   data_9,
                                   layout='NHWC',
                                   target=utils.CUDA):
    """
    fused_relu_grad_bn_reduce_grad.

    Args:
        data_1~data_9: tvm.tensor.Tensor.
        layout: input layout, only 'NCHW', 'NHWC' supported

    Returns:
        tvm.tensor.Tensor.
    """
    # transpose NCHW inputs to NHWC (the tensors must be rebound: assigning to a
    # loop variable would not update data_7/data_8/data_9)
    if layout == "NCHW":
        data_7 = topi.transpose(data_7, axes=(0, 2, 3, 1))
        data_8 = topi.transpose(data_8, axes=(0, 2, 3, 1))
        data_9 = topi.transpose(data_9, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))

    data_tmp1 = topi.multiply(data_4, data_5)
    n, h, w, c = data_9.shape
    data_tmp2 = topi.full_like(data_tmp1, 1.0 / (n * h * w))
    data_tmp3 = topi.multiply(data_tmp1, data_tmp2)

    data_tmp5 = topi.full_like(data_9, 0.0)
    data_tmp6 = topi.greater(data_9, data_tmp5)

    data_tmp7 = topi.where(data_tmp6, data_8, data_tmp5)

    data_tmp8 = topi.cast(data_tmp7, 'float32')
    data_tmp9 = topi.full_like(data_tmp8, n * h * w)
    data_tmp10 = topi.multiply(data_tmp8, data_tmp9)
    data_tmp12 = topi.subtract(data_tmp10, data_3)
    data_tmp14 = topi.cast(data_7, 'float32')
    data_tmp15 = topi.multiply(data_6, data_tmp2)

    data_tmp17 = topi.subtract(data_tmp14, data_tmp15)
    data_tmp18 = topi.multiply(data_2, data_tmp17)
    data_tmp20 = topi.divide(data_tmp18, data_1)
    data_tmp21 = topi.subtract(data_tmp12, data_tmp20)
    data_tmp22 = topi.multiply(data_tmp3, data_tmp21)
    data_out = topi.cast(data_tmp22, 'float16')

    return data_out
Example #11
def my_dsl(dtype, kernel_name, attrs):
    # NOTE: `insn` and `insnType` are assumed to be module-level settings that
    # select which instruction to build and whether it takes one or two inputs.
    m = tvm.var("M")
    n = tvm.var("N")
    A = tvm.placeholder((m, ), name="A", dtype=dtype)
    B = tvm.placeholder((m, ), name="B", dtype=dtype)

    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    if insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)

    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        C = topi.sqrt(A)
        C = topi.log(A)
    elif insn == "sqrt":
        C = topi.sqrt(A)

    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)

    # C = tvm.compute((m, ), lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule([C.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        if insnType == "binary":
            mod = akg.build(s, [A, B, C],
                            "cce",
                            name=kernel_name,
                            attrs=attrs,
                            polyhedral=True)
        else:
            mod = akg.build(s, [A, C],
                            "cce",
                            name=kernel_name,
                            attrs=attrs,
                            polyhedral=True)
    return mod
Example #12
def bn_gamma_grad(head, in_data, data_sum, layout="NHWC"):
    if layout == "NCHW":
        head = topi.transpose(head, (0, 2, 3, 1))

    n, h, w, c = head.shape
    n = n.value
    h = h.value
    w = w.value
    c = c.value
    scale = tvm.const(n * h * w, head.dtype)
    mean = topi.divide(data_sum, scale)
    x_hat = topi.subtract(in_data, mean)
    x_hat_mul = topi.multiply(x_hat, head)
    bn_gamma_grad = topi.sum(x_hat_mul, axis=(0, 1, 2))
    return bn_gamma_grad
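
In NumPy terms, the gamma gradient is the per-channel sum of (x - mean) * head over N, H and W (illustrative, NHWC assumed):

import numpy as np

def bn_gamma_grad_reference(head, in_data, data_sum):
    n, h, w, c = head.shape
    mean = data_sum / (n * h * w)              # per-channel mean from the precomputed sum
    x_hat = in_data - mean
    return (x_hat * head).sum(axis=(0, 1, 2))  # reduce N, H, W -> shape (c,)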
Example #13
def fused_relu_grad_bn_double_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7, data8,
                           data9, data10, data11, data12, data13, data14, data15, layout="NHWC",
                           out_dtype="float16", target=utils.CUDA):
    
    if layout == 'NCHW':
        data5 = topi.transpose(data5, (0, 2, 3, 1))
        data9 = topi.transpose(data9, (0, 2, 3, 1))
        data13 = topi.transpose(data13, (0, 2, 3, 1))
        data14 = topi.transpose(data14, (0, 2, 3, 1))
        data15 = topi.transpose(data15, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))
    
    inter_dtype = "float32"
    n, h, w, c = data5.shape
    scale = n * h * w

    mul = topi.multiply(data2, data3)
    mul1221 = topi.divide(mul, scale)

    # ReluGrad
    zero = tvm.const(0, data15.dtype)
    add = topi.add(data13, data14)
    addgrad = tvm.compute(add.shape, lambda *i: tvm.if_then_else(data15(*i) >= zero, add(*i), zero), tag=tag.INJECTIVE)
    addgrad = topi.cast(addgrad, inter_dtype)
    mul3283 = topi.multiply(scale, addgrad)
    sub1159 = topi.subtract(mul3283, data6)

    data5_cast = topi.cast(data5, inter_dtype)
    mul2372 = topi.divide(data4, scale)
    sub631 = topi.subtract(data5_cast, mul2372)
    mul1220 = topi.multiply(sub631, data1)
    div = topi.divide(mul1220, data0)
    sub271 = topi.subtract(sub1159, div)
    mul1218 = topi.multiply(mul1221, sub271)
    mul1218_cast = topi.cast(mul1218, out_dtype)

    mul1231 = topi.multiply(data11, data12)
    mul1230 = topi.divide(mul1231, scale)
    data9_cast = topi.cast(data9, inter_dtype)
    mul2364 = topi.divide(data8, scale)
    sub625 = topi.subtract(data9_cast, mul2364)
    mul1229 = topi.multiply(data10, sub625)

    div272 = topi.divide(mul1229, data7)
    sub272 = topi.subtract(sub1159, div272)
    mul1228 = topi.multiply(mul1230, sub272)
    mul1228_cast = topi.cast(mul1228, out_dtype)

    if layout == "NCHW":
        mul1218_cast = topi.transpose(mul1218_cast, (0, 3, 1, 2))
        mul1228_cast = topi.transpose(mul1228_cast, (0, 3, 1, 2))
    
    return [mul1218_cast, mul1228_cast]
Example #14
File: asinh.py Project: mindspore-ai/akg
def sqrt_mini_newton_iter_impl(x):
    """sqrt compute on mini with the Newton's Iteration"""

    # mini supports the rsqrt instruction, but not the sqrt instruction
    x_rsqrt = topi.rsqrt(x)
    x_sqrt = topi.divide(1, x_rsqrt)

    # newton_iter: x(n+1) = 1/2 *(x(n) + a/x(n))
    steps = 3
    half = tvm.const(0.5, x.dtype)
    shape = x.shape
    for i in range(steps):
        x_sqrt = tvm.compute(shape,
                             lambda *indice: half *
                             (x_sqrt(*indice) + x(*indice) / x_sqrt(*indice)),
                             name="x_sqrt_%s" % i)
    return x_sqrt
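
The same three Newton steps in NumPy, seeded from the reciprocal of rsqrt as on mini (illustrative):

import numpy as np

def sqrt_newton_reference(x, steps=3):
    x_sqrt = 1.0 / (1.0 / np.sqrt(x))          # seed from rsqrt, as the mini instruction does
    for _ in range(steps):
        x_sqrt = 0.5 * (x_sqrt + x / x_sqrt)   # x(n+1) = 1/2 * (x(n) + a / x(n))
    return x_sqrt

x = np.array([0.25, 2.0, 100.0], dtype=np.float32)
assert np.allclose(sqrt_newton_reference(x), np.sqrt(x))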
Example #15
def acosh_grad(y, dy):
    """
    Gradient for acosh.

    Note:
        dx = dy * 1/sinh(y)

    Args:
        y (tvm.tensor.Tensor): tensor of type float16, float32.
        dy (tvm.tensor.Tensor): same type and shape as y.

    Returns:
        tvm.tensor.Tensor, same type and shape as y.
    
    Supported Platforms:
        'Ascend'
    """

    # the mini product is only used for inference and does not support this operator
    if product_is_mini():
        raise RuntimeError(
            "The mini product does not support the acosh_grad operator")

    dtype = y.dtype
    utils.ops_dtype_check(y.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    utils.elemwise_dtype_check(dtype, dy.dtype)
    utils.check_shape(y.shape)
    utils.elemwise_shape_check(y.shape, dy.shape)

    if dtype == "float16":
        y = topi.cast(y, "float32")
        dy = topi.cast(dy, "float32")

    # If we use sinh(y) = (exp(y) - exp(-y))/2 directly, there will be some precision problems
    # For example, as dx = dy/sinh(y), if we use exp directly, when exp(y) and exp(-y) are close,
    # the small precision error of exp calculation will be greatly enlarged in the final result
    sinh_y = _sinh_taylor(y)
    dx = topi.divide(dy, sinh_y)

    if dx.dtype != dtype:
        dx = topi.cast(dx, dtype)
    attrs = {"enable_auto_inline": False}
    return dx, attrs
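
A NumPy sanity check of the note dx = dy / sinh(y), with np.sinh standing in for the Taylor-based _sinh_taylor (illustrative):

import numpy as np

def acosh_grad_reference(y, dy):
    return dy / np.sinh(y)                     # dx = dy * 1/sinh(y)

# acosh'(x) = 1/sqrt(x^2 - 1) and sinh(acosh(x)) = sqrt(x^2 - 1), so the chain agrees:
x = np.linspace(1.5, 5.0, 10)
y = np.arccosh(x)
assert np.allclose(acosh_grad_reference(y, np.ones_like(y)), 1.0 / np.sqrt(x * x - 1.0))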
Example #16
def _atan2_compute(y, x):
    """compute for atan2"""
    const_pi_by_two = 1.5707963267948966192313216916398
    dtype = y.dtype
    if dtype == "float16":
        y = topi.cast(y, "float32")
        x = topi.cast(x, "float32")

    x_lt_zero_y_mask, y_ge_zero_mask = _init_atan2_mask(y, x)
    y_cmp_zero = topi.multiply(y_ge_zero_mask,
                               tvm.const(const_pi_by_two, "float32"))
    res_x_lt_zero = topi.multiply(x_lt_zero_y_mask, dc.pi_const("float32"))

    # calculate atan(y/x) when x > 0
    if product_is_mini():
        x_rec = reciprocal(x, target=utils.CCE)
        res = topi.multiply(y, x_rec)
    else:
        res = topi.divide(y, x)
    res, _ = atan(res)

    if product_is_mini():
        tensor_zero = dc.zero_const("float16")
        x = topi.cast(x, "float16")
        y_cmp_zero = topi.cast(y_cmp_zero, "float16")
        res = topi.cast(res, "float16")
    else:
        tensor_zero = dc.zero_const("float32")

    res = tvm.compute(res.shape,
                      lambda *i: tvm.expr.Select(
                          x(*i) == tensor_zero, y_cmp_zero(*i), res(*i)),
                      name="res")

    if product_is_mini():
        res = topi.cast(res, "float32")

    res = topi.add(res, res_x_lt_zero)
    return topi.cast(res, dtype)
Example #17
def asinh_grad(y, dy):
    """
    Gradient for asinh.

    Note:
        dx = dy * 1/cosh(y)

    Args:
        y (tvm.tensor.Tensor): tensor of type float16, float32.
        dy (tvm.tensor.Tensor): same type and shape as y.

    Returns:
        tvm.tensor.Tensor, same type and shape as y.
    
    Supported Platforms:
        'Ascend'
    """

    # the mini product is only used for inference and does not support this operator
    if product_is_mini():
        raise RuntimeError(
            "The mini product does not support the asinh_grad operator")

    dtype = y.dtype
    utils.ops_dtype_check(y.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    utils.elemwise_dtype_check(dtype, dy.dtype)
    utils.check_shape(y.shape)
    utils.elemwise_shape_check(y.shape, dy.shape)

    if dtype == "float16":
        y = topi.cast(y, "float32")
        dy = topi.cast(dy, "float32")

    dx = topi.divide(dy, cosh(y))

    if dx.dtype != dtype:
        dx = topi.cast(dx, dtype)

    return dx
Example #18
def _do_atan_taylor(data):
    """
    Taylor algorithm for atan.

        if x > 0 and x < tan(pi/8):
            atan(x) = x - x^3/3 + x^5/5 - x^7/7 ...
        elif x > tan(pi/8) and x < tan(pi/4):
            atan(x) = atan(y) + atan((x-y)/(1+xy))

    Args:
        data (tvm.tensor.Tensor): Input data.

    Returns:
        A tvm.tensor.Tensor of atan(x).
    """
    dtype = data.dtype

    tensor_offset = tvm.const(TAN_PI_BY_EIGHT, dtype)
    deno = topi.multiply(data, tvm.const(TAN_PI_BY_EIGHT, dtype))
    deno = topi.add(deno, dc.one_const(dtype))
    molecule = topi.subtract(data, tensor_offset)
    ddata = topi.divide(molecule, deno)
    ddata = topi.abs(ddata)

    square_ddata = topi.multiply(ddata, ddata)
    res = tvm.const(ATAN_TAYLOR_COEF[CONST_ITERTOR], dtype)
    for i in reversed(range(CONST_ITERTOR)):
        res = topi.multiply(res, square_ddata)
        res = topi.add(res, tvm.const(ATAN_TAYLOR_COEF[i], dtype))
    res = topi.multiply(res, ddata)
    res = topi.add(res, tvm.const(CONST_PI_BY_EIGHT, dtype))

    square_data = topi.multiply(data, data)
    res2 = tvm.const(ATAN_TAYLOR_COEF[CONST_ITERTOR2], dtype)
    for i in reversed(range(CONST_ITERTOR2)):
        res2 = topi.multiply(res2, square_data)
        res2 = topi.add(res2, tvm.const(ATAN_TAYLOR_COEF[i], dtype))
    return topi.minimum(res, topi.multiply(res2, data))
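
Both reversed loops are Horner evaluations of the odd Maclaurin series atan(x) = x - x^3/3 + x^5/5 - ...; the sketch below uses explicit coefficients, since the module's actual ATAN_TAYLOR_COEF values are not shown here (so the list is an assumption):

import numpy as np

# assumed coefficients of the odd series atan(x) = sum_k coef[k] * x^(2k+1)
ATAN_COEF = [(-1.0) ** k / (2 * k + 1) for k in range(8)]

def atan_taylor_horner(x):
    """Horner evaluation mirroring the reversed-range loops above."""
    res = ATAN_COEF[-1]
    for c in reversed(ATAN_COEF[:-1]):
        res = res * (x * x) + c
    return res * x

x = np.linspace(0.0, 0.4, 5)          # the series is only used for |x| < tan(pi/8) ~ 0.414
assert np.allclose(atan_taylor_horner(x), np.arctan(x), atol=1e-6)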
Example #19
def _apply_adagrad_da_compute(var, gradient_accum, gradient_squared_accum,
                              grad, lr, l1, l2, global_step):
    """Compute adagrad_da."""
    dtype = var.dtype
    # cast to float32 for higher precision
    if dtype == "float16":
        gradient_accum = topi.cast(gradient_accum, "float32")
        gradient_squared_accum = topi.cast(gradient_squared_accum, "float32")
        grad = topi.cast(grad, "float32")
        lr = topi.cast(lr, "float32")
        l1 = topi.cast(l1, "float32")
        l2 = topi.cast(l2, "float32")
    if product_is_mini():
        global_step = topi.cast(global_step, "float16")
        global_step = topi.cast(global_step, "float32")
    else:
        global_step = topi.cast(global_step, "float32")

    # 1.grad_accum += grad
    gradient_accum = topi.add(gradient_accum, grad)

    # 2.grad_squared_accum += grad * grad
    gs = topi.multiply(grad, grad)
    gradient_squared_accum = topi.add(gradient_squared_accum, gs)

    # 3.if l1 > 0: tmp_val = Sign(grad_accum) * max(|grad_accum|-l1*global_step, 0)
    #   else:      tmp_val = grad_accum
    sign_val = Sign(gradient_accum)
    abs_val = topi.abs(gradient_accum)
    mul_val = topi.multiply(global_step, l1)
    sub_val = topi.subtract(abs_val, mul_val)
    max_val = topi.maximum(sub_val, tvm.const(0, sub_val.dtype))
    tmp_val = topi.multiply(sign_val, max_val)

    def select(l1, tmp_val, gradient_accum):
        """Returns tmp_val if l1 > 0 else gradient_accum."""
        if product_is_mini():
            l1 = topi.cast(l1, "float16")
            tmp_val = topi.cast(tmp_val, "float16")
            gradient_accum = topi.cast(gradient_accum, "float16")
        tmp_val = akg.tvm.compute(
            tmp_val.shape, lambda *i: tvm.expr.Select(l1[0] > 0, tmp_val(*i),
                                                      gradient_accum(*i)))
        return topi.cast(tmp_val, "float32") if product_is_mini() else tmp_val

    tmp_val = select(l1, tmp_val, gradient_accum)

    # 4.x_value = -1 * lr * tmp_val
    x_value = topi.multiply(lr, tvm.const(-1, "float32"))
    x_value = topi.multiply(x_value, tmp_val)

    # 5.y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    pro_val = topi.multiply(l2, global_step)
    pro_val = topi.multiply(pro_val, lr)
    sqrt_val = sqrt(gradient_squared_accum, target=utils.CCE)
    y_value = topi.add(pro_val, sqrt_val)

    # 6.var = x_value / y_value
    if product_is_mini():
        y_rec = reciprocal(y_value, target=utils.CCE)
        var_out = topi.multiply(x_value, y_rec)
    else:
        var_out = topi.divide(x_value, y_value)

    if dtype == "float16":
        var_out = akg.lang.ascend.cast_to(var_out, "float16")
        gradient_accum = akg.lang.ascend.cast_to(gradient_accum, "float16")
        gradient_squared_accum = akg.lang.ascend.cast_to(
            gradient_squared_accum, "float16")

    return var_out, gradient_accum, gradient_squared_accum
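
The six numbered steps correspond to the following compact NumPy reference of the AdagradDA update (illustrative only; lr, l1, l2 and global_step are treated as Python scalars):

import numpy as np

def adagrad_da_reference(grad_accum, grad_sq_accum, grad, lr, l1, l2, global_step):
    grad_accum = grad_accum + grad                               # 1. grad_accum += grad
    grad_sq_accum = grad_sq_accum + grad * grad                  # 2. grad_squared_accum += grad^2
    if l1 > 0:                                                   # 3. soft-threshold when l1 > 0
        tmp_val = np.sign(grad_accum) * np.maximum(np.abs(grad_accum) - l1 * global_step, 0.0)
    else:
        tmp_val = grad_accum
    x_value = -lr * tmp_val                                      # 4. x_value = -1 * lr * tmp_val
    y_value = l2 * global_step * lr + np.sqrt(grad_sq_accum)     # 5. y_value
    return x_value / y_value, grad_accum, grad_sq_accum          # 6. var = x_value / y_value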