Example #1
def xdivy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Do element-wise xdivy_grad compute.

    Args:
        placeholders (Union[list, tuple]): the placeholders of the input data
        shape_max (Union[list, tuple]): the broadcast shape
        dtype (string): the type of the input data
        rx (list): the reduction indices for the first input after broadcast
        ry (list): the reduction indices for the second input after broadcast

    Returns:
        output_y1 (tvm.tensor.Tensor): gradient of xdivy with respect to x1
        output_y2 (tvm.tensor.Tensor): gradient of xdivy with respect to x2
    """
    x1_ori = placeholders[0]
    x2_ori = placeholders[1]
    grad_ori = placeholders[2]

    if dtype == "float16":
        x1 = akg.lang.cce.cast_to(x1_ori, "float32")
        x2 = akg.lang.cce.cast_to(x2_ori, "float32")
        grad = akg.lang.cce.cast_to(grad_ori, "float32")
        x1 = akg.lang.cce.broadcast(x1, shape_max)
        x2 = akg.lang.cce.broadcast(x2, shape_max)
        grad = akg.lang.cce.broadcast(grad, shape_max)
    else:
        x1 = akg.lang.cce.broadcast(x1_ori, shape_max)
        x2 = akg.lang.cce.broadcast(x2_ori, shape_max)
        grad = akg.lang.cce.broadcast(grad_ori, shape_max)

    esp_min = tvm.const(1.18e-38, dtype="float32")
    x1_addepsmin = akg.lang.cce.vadds(x1, esp_min)

    if utils.product_is_mini():
        x1_addepsmin_rec = reciprocal(x1_addepsmin)
        not_zero_x1 = akg.lang.cce.vmul(x1, x1_addepsmin_rec)
        x2_rec = reciprocal(x2)
        partial_x1 = akg.lang.cce.vmul(not_zero_x1, x2_rec)
    else:
        not_zero_x1 = div(x1, x1_addepsmin)
        partial_x1 = div(not_zero_x1, x2)

    partial_x1g = akg.lang.cce.vmul(partial_x1, grad)

    neg_one = tvm.const(-1, dtype="float32")
    neg_x1 = akg.lang.cce.vmuls(x1, neg_one)
    partial_x1pow = akg.lang.cce.vmul(partial_x1, partial_x1)
    partial_x2 = akg.lang.cce.vmul(neg_x1, partial_x1pow)
    partial_x2g = akg.lang.cce.vmul(partial_x2, grad)

    output_y1 = akg.lang.cce.sum(partial_x1g, rx, keepdims=True)
    output_y2 = akg.lang.cce.sum(partial_x2g, ry, keepdims=True)

    if dtype == "float16":
        output_y1 = akg.lang.cce.cast_to(output_y1, "float16")
        output_y2 = akg.lang.cce.cast_to(output_y2, "float16")

    return output_y1, output_y2
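For reference, the gradient this kernel computes can be restated in a few lines of NumPy (the function name and NumPy usage below are illustrative assumptions, not part of akg): xdivy(x1, x2) is 0 where x1 == 0 and x1 / x2 elsewhere, so the partials are grad / x2 and -grad * x1 / x2^2, masked where x1 == 0 and reduce-summed over the broadcast axes.

import numpy as np

def xdivy_grad_reference(x1, x2, grad, rx, ry):
    """Plain NumPy sketch of xdivy_grad (illustrative only)."""
    not_zero = (x1 != 0).astype(x1.dtype)          # mirrors x1 / (x1 + eps_min)
    dx1 = grad * not_zero / x2                     # d xdivy / d x1
    dx2 = -grad * not_zero * x1 / (x2 * x2)        # d xdivy / d x2
    return (dx1.sum(axis=tuple(rx), keepdims=True),
            dx2.sum(axis=tuple(ry), keepdims=True))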
Example #2
def xlogy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Do element-wise xlogy_grad compute.

    Args:
        placeholders (Union[list, tuple]): the placeholders of the input data
        shape_max (Union[list, tuple]): the broadcast shape
        dtype (string): the type of the input data
        rx (list): the reduction indices for the first input after broadcast
        ry (list): the reduction indices for the second input after broadcast

    Returns:
        output_y1 (tvm.tensor.Tensor): gradient of xlogy with respect to x1
        output_y2 (tvm.tensor.Tensor): gradient of xlogy with respect to x2
    """
    x1_ori = placeholders[0]
    x2_ori = placeholders[1]
    grad_ori = placeholders[2]

    if dtype == "float16":
        x1 = akg.lang.cce.cast_to(x1_ori, "float32")
        x2 = akg.lang.cce.cast_to(x2_ori, "float32")
        grad = akg.lang.cce.cast_to(grad_ori, "float32")
        x1 = akg.lang.cce.broadcast(x1, shape_max)
        x2 = akg.lang.cce.broadcast(x2, shape_max)
        grad = akg.lang.cce.broadcast(grad, shape_max)
    else:
        x1 = akg.lang.cce.broadcast(x1_ori, shape_max)
        x2 = akg.lang.cce.broadcast(x2_ori, shape_max)
        grad = akg.lang.cce.broadcast(grad_ori, shape_max)

    esp_min = tvm.const(1.18e-38, dtype="float32")
    x1_addespmin = akg.lang.cce.vadds(x1, esp_min)

    if utils.product_is_mini():
        not_zero_x1 = akg.lang.cce.vmul(x1, reciprocal(x1_addespmin))
        log_x2 = tvm.compute(
            x2.shape,
            lambda *i: (tvm.log(x2(*i).astype("float16"))).astype("float32"),
            name="log_x2")
    else:
        not_zero_x1 = div(x1, x1_addespmin)
        log_x2 = akg.lang.cce.vlog(x2)

    partial_x1 = akg.lang.cce.vmul(not_zero_x1, log_x2)
    partial_x1g = akg.lang.cce.vmul(partial_x1, grad)

    partial_x2 = div(x1, x2) if not utils.product_is_mini() else \
        akg.lang.cce.vmul(x1, reciprocal(x2))
    partial_x2g = akg.lang.cce.vmul(partial_x2, grad)

    output_y1 = akg.lang.cce.sum(partial_x1g, rx, keepdims=True)
    output_y2 = akg.lang.cce.sum(partial_x2g, ry, keepdims=True)

    if dtype == "float16":
        output_y1 = akg.lang.cce.cast_to(output_y1, "float16")
        output_y2 = akg.lang.cce.cast_to(output_y2, "float16")
    return output_y1, output_y2
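The same structure applies here: xlogy(x1, x2) is 0 where x1 == 0 and x1 * log(x2) elsewhere, so the partials are grad * log(x2) (masked where x1 == 0) and grad * x1 / x2. A minimal NumPy sketch (illustrative assumption, not part of akg):

import numpy as np

def xlogy_grad_reference(x1, x2, grad, rx, ry):
    """Plain NumPy sketch of xlogy_grad (illustrative only)."""
    not_zero = (x1 != 0).astype(x1.dtype)
    dx1 = grad * not_zero * np.log(x2)
    dx2 = grad * x1 / x2
    return (dx1.sum(axis=tuple(rx), keepdims=True),
            dx2.sum(axis=tuple(ry), keepdims=True))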
Example #3
def _tan_2x_multi(input_x, times):
    """calculating tan x by calculating tan (x/2^times) and using double angle formula multiple times"""
    # Calculate tan (x/2^times)
    if input_x.dtype == FLOAT_16 and utils.product_is_mini():
        input_x_divide = topi.multiply(input_x, tvm.const(1.0/(2.0**times), FLOAT_16))
        res = _tan_expand(input_x_divide)
    else:
        input_x_divide = topi.multiply(input_x, 1.0/(2.0**times))
        res = _tan_expand(input_x_divide)
    while times != 0:
        # using double angle formula: tan 2x = 2*tan x/(1-tan x*tan x)
        if input_x.dtype == FLOAT_16 and utils.product_is_mini():
            res_numerator = topi.multiply(res, tvm.const(2.0, FLOAT_16))
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, tvm.const(-1.0, FLOAT_16)), tvm.const(1.0, FLOAT_16))
        else:
            res_numerator = topi.multiply(res, 2.0)
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, -1.0), 1.0)

        if utils.product_is_mini():
            res = mul(res_numerator, reciprocal(res_denominator))
        else:
            res = div(res_numerator, res_denominator)
        times = times - 1
    return res
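The halving-then-doubling scheme reads more directly in NumPy (np.tan stands in for the _tan_expand polynomial; this is a sketch, not akg code):

import numpy as np

def tan_2x_multi_reference(x, times):
    """Evaluate tan on a reduced argument, then apply the double-angle formula `times` times."""
    res = np.tan(x / (2.0 ** times))
    for _ in range(times):
        res = 2.0 * res / (1.0 - res * res)   # tan 2x = 2*tan x / (1 - tan^2 x)
    return res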
Example #4
def floordiv(data1, data2):
    """
    Calculate data1 / data2, rounding the result down to an integer (floor division).

    Args:
        data1 (tvm.tensor.Tensor): Tensor of type float16, float32.
        data2 (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor, of type int32.
    """
    vc_util.ops_dtype_check([data1.dtype, data2.dtype],
                            vc_util.DtypeForDavinci.ALL_FLOAT)
    shape1 = [x.value for x in data1.shape]
    vc_util.check_shape(shape1)
    shape2 = [x.value for x in data2.shape]
    vc_util.check_shape(shape2)

    if utils.product_is_mini():
        rec = reciprocal(data2, high_precision=True)
        res = data1 * rec
    else:
        res = akg.topi.divide(data1, data2)
    res = akg.lang.cce.floor(res)
    return res
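The intended semantics (divide, floor, return int32) can be checked with a short NumPy snippet, independent of akg:

import numpy as np

a = np.array([7.0, -7.0, 3.5], dtype=np.float32)
b = np.array([2.0,  2.0, 2.0], dtype=np.float32)
print(np.floor(a / b).astype(np.int32))   # -> [ 3 -4  1]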
Example #5
def _div_ascend(data1, data2):
    """
    Calculates x/y, and returns an integer when the inputs are all integers.

    When both arguments are integers, integer division (also known as "floor division") is used.
    When the arguments are floating-point numbers, normal floating-point division is used.

    Note:
        div supports broadcasting.

    Args:
        data1 (tvm.tensor.Tensor): Tensor of type float16, float32, int32, int8 and uint8.
        data2 (tvm.tensor.Tensor): Tensor of type float16, float32, int32, int8 and uint8.

    Returns:
        tvm.tensor.Tensor, has the same type as data1 and data2.
    """

    utils.ops_dtype_check([data1.dtype, data2.dtype],
                          utils.DtypeForDavinci.ALL_TYPES)
    utils.elemwise_dtype_check(data1.dtype, data2.dtype)
    dtype = data1.dtype

    shape1 = [x.value for x in data1.shape]
    shape2 = [x.value for x in data2.shape]
    utils.check_shape(shape1)
    utils.check_shape(shape2)

    utils.auto_broadcast_check(shape1, shape2)
    n_shape1, n_shape2, out_shape = produce_shapes(shape1, shape2)
    if n_shape1 != out_shape:
        input1_cast = akg.topi.broadcast_to(data1, out_shape)
    else:
        input1_cast = data1
    if n_shape2 != out_shape:
        input2_cast = akg.topi.broadcast_to(data2, out_shape)
    else:
        input2_cast = data2

    if dtype in ("int32", "int8", "uint8"):
        input1p = Cast(input1_cast, "float16", utils.CCE)
        input2p = Cast(input2_cast, "float16", utils.CCE)
    else:
        input1p = input1_cast
        input2p = input2_cast

    if product_is_mini():
        input2p_rec = reciprocal(input2p, target=utils.CCE)
        res = akg.topi.multiply(input1p, input2p_rec)
    else:
        res = akg.topi.divide(input1p, input2p)

    if dtype in ("int8", "uint8"):
        res = floor(res, utils.CCE)
        res = Cast(res, "float16", utils.CCE)
    if dtype in ("int32", "int8", "uint8"):
        res = Cast(res, dtype, utils.CCE)

    return res
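A plain NumPy sketch of the documented behaviour (broadcasting division, floored for integer inputs); the helper name is illustrative, and this ignores the float16 round trip used on the device:

import numpy as np

def div_reference(a, b):
    """Divide with broadcasting; integer inputs use floor division, floats use true division."""
    if np.issubdtype(a.dtype, np.integer):
        return np.floor(a / b).astype(a.dtype)
    return (a / b).astype(a.dtype)

print(div_reference(np.array([7, -7], dtype=np.int32), np.array([2, 2], dtype=np.int32)))  # [ 3 -4]
print(div_reference(np.array([7.0, -7.0], dtype=np.float32), np.float32(2.0)))             # [ 3.5 -3.5]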
Example #6
def _apply_ada_max_compute(var, m, v, grad, lr, beta1, beta1_power, beta2, epsilon):
    """Compute ada_max."""
    # cast to float32 for improved accuracy
    inp_dtype = var.dtype
    if inp_dtype == 'float16':
        var = topi.cast(var, 'float32')
        m = topi.cast(m, 'float32')
        v = topi.cast(v, 'float32')
        lr = topi.cast(lr, 'float32')
        beta1_power = topi.cast(beta1_power, 'float32')
        beta1 = topi.cast(beta1, 'float32')
        beta2 = topi.cast(beta2, 'float32')
        grad = topi.cast(grad, 'float32')
    epsilon = tvm.const(epsilon, 'float32')

    # m += (grad - m) * (1 - beta1)
    rhs = tvm.compute(beta1.shape, lambda *i: beta1(*i) * neg_one_const("float32"))
    rhs = tvm.compute(rhs.shape, lambda *i: rhs(*i) + one_const("float32"))
    lhs = topi.subtract(grad, m)
    rhs = tvm.compute(lhs.shape, lambda *i: lhs(*i) * rhs[0])
    m = topi.add(m, rhs)

    # v = max(beta2*v, abs(grad))
    lhs = tvm.compute(v.shape, lambda *i: v(*i) * beta2[0])
    rhs = topi.abs(grad)
    v = topi.maximum(lhs, rhs)

    # var -= lr / (1 - beta1_power) * (m / (v + epsilon))
    # i.e. var -= lr * m / ((1 - beta1_power) * (v + epsilon)), built up step by step below
    # v + epsilon
    rhs = tvm.compute(v.shape, lambda *i: v(*i) + epsilon)
    # 1 - beta1_power
    lhs = tvm.compute(beta1_power.shape, lambda *i: beta1_power(*i) * neg_one_const("float32"))
    lhs = tvm.compute(lhs.shape, lambda *i: lhs(*i) + one_const("float32"))
    # (1 - beta1_power) * (v + epsilon)
    rhs = tvm.compute(rhs.shape, lambda *i: rhs(*i) * lhs[0])
    # lr * m
    lhs = tvm.compute(m.shape, lambda *i: m(*i) * lr[0])
    # lr * m / ((1 - beta1_power) * (v + epsilon))
    rhs = reciprocal(rhs)
    rhs = topi.multiply(lhs, rhs)
    var = topi.subtract(var, rhs)

    if inp_dtype == 'float16':
        var = topi.cast(var, inp_dtype)
        m = topi.cast(m, inp_dtype)
        v = topi.cast(v, inp_dtype)

    return var, m, v
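Written out in plain NumPy, the update the tvm.compute chain above implements is the following (a reference sketch assuming scalar hyper-parameters; not part of akg):

import numpy as np

def ada_max_reference(var, m, v, grad, lr, beta1, beta1_power, beta2, epsilon):
    """AdaMax update as computed above, in NumPy (illustrative only)."""
    m = m + (grad - m) * (1.0 - beta1)
    v = np.maximum(beta2 * v, np.abs(grad))
    var = var - lr * m / ((1.0 - beta1_power) * (v + epsilon))
    return var, m, v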
Example #7
def softsign_compute(input_features):
    """ompute for softsign"""
    dtype = input_features.dtype
    if dtype == "float16":
        input_features = akg.lang.cce.cast_to(input_features, "float32")

    data_abs = akg.lang.cce.vabs(input_features)
    data_add = akg.lang.cce.vadds(data_abs, SCALAR_ONE)
    data_rec = reciprocal(data_add)
    res = akg.lang.cce.vmul(input_features, data_rec)

    if dtype == "float16":
        res = akg.lang.cce.cast_to(res, "float16")

    return res
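For reference, softsign is simply x / (1 + |x|); a two-line NumPy check (illustrative, not akg code):

import numpy as np

def softsign_reference(x):
    return x / (1.0 + np.abs(x))

print(softsign_reference(np.array([-2.0, 0.0, 2.0], dtype=np.float32)))  # [-0.6667  0.  0.6667]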
def fake_quant_with_min_max_vars_per_channel_compute(input_data,
                                                     input_min,
                                                     input_max,
                                                     num_bits=8,
                                                     narrow_range=False):
    """fake_quant_with_min_max_vars_per_channel compute implemention"""
    shape = get_shape(input_data.shape)
    dtype = input_data.dtype
    min_broadcast = akg.lang.cce.broadcast(input_min, shape, dtype)
    max_broadcast = akg.lang.cce.broadcast(input_max, shape, dtype)
    # get nudged_min and nudged_max by nudged_min_max_compute function
    nudged_min_nudged_max = nudged_min_max_compute(min_broadcast,
                                                   max_broadcast, num_bits,
                                                   narrow_range)
    # clamp the input to the range [nudged_min, nudged_max]
    clamped_tmp = topi.minimum(input_data, nudged_min_nudged_max[1])
    clamped = topi.maximum(clamped_tmp, nudged_min_nudged_max[0])

    # calculate the quantized and dequantized results
    clamped_shifted = topi.subtract(clamped, nudged_min_nudged_max[0])
    if utils.product_is_mini():
        clamped_shifted_div_scale = mul(clamped_shifted,
                                        reciprocal(nudged_min_nudged_max[2]))
    else:
        clamped_shifted_div_scale = div(clamped_shifted,
                                        nudged_min_nudged_max[2])
    result_tmp = topi.add(clamped_shifted_div_scale, dc.half_const(dtype))
    floor_result_tmp = akg.lang.cce.floor(result_tmp)
    if utils.product_is_mini():
        floor_result_tmp = topi.cast(floor_result_tmp, "float16")

    floor_result_tmp = topi.cast(floor_result_tmp, "float32")
    scale_product = topi.multiply(floor_result_tmp, nudged_min_nudged_max[2])
    tmp_res = topi.add(scale_product, nudged_min_nudged_max[0])
    # get bool_both_zero_value by bool_both_zero_compute function
    bool_both_zero_value = bool_both_zero_compute(min_broadcast, max_broadcast)
    res = topi.multiply(tmp_res, bool_both_zero_value)

    return res
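Setting aside the per-channel nudging and the bool_both_zero mask, the quantize/dequantize round trip above amounts to the following NumPy sketch (illustrative assumption, not akg code):

import numpy as np

def fake_quant_reference(x, nudged_min, nudged_max, scale):
    """Clamp, quantize to the nearest step, then dequantize."""
    clamped = np.clip(x, nudged_min, nudged_max)
    quantized = np.floor((clamped - nudged_min) / scale + 0.5)
    return quantized * scale + nudged_min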
Example #9
def _atan2_compute(y, x):
    """compute for atan2"""
    dtype = y.dtype
    if dtype == "float16":
        y = topi.cast(y, "float32")
        x = topi.cast(x, "float32")

    x_lt_zero_y_mask, y_ge_zero_mask = _init_atan2_mask(y, x)
    y_cmp_zero = topi.multiply(y_ge_zero_mask,
                               tvm.const(CONST_PI_BY_TWO, "float32"))
    res_x_lt_zero = topi.multiply(x_lt_zero_y_mask, dc.pi_const("float32"))

    # calculate atan(y/x) for the case x > 0
    if utils.product_is_mini():
        x_rec = reciprocal(x)
        res = topi.multiply(y, x_rec)
    else:
        res = topi.divide(y, x)
    res, _ = atan(res)

    if utils.product_is_mini():
        tensor_zero = dc.zero_const("float16")
        x = topi.cast(x, "float16")
        y_cmp_zero = topi.cast(y_cmp_zero, "float16")
        res = topi.cast(res, "float16")
    else:
        tensor_zero = dc.zero_const("float32")

    res = tvm.compute(res.shape,
                      lambda *i: tvm.expr.Select(
                          x(*i) == tensor_zero, y_cmp_zero(*i), res(*i)),
                      name="res")

    if utils.product_is_mini():
        res = topi.cast(res, "float32")

    res = topi.add(res, res_x_lt_zero)
    return topi.cast(res, dtype)
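The decomposition used here is atan(y/x) for x > 0, atan(y/x) + sign(y)*pi for x < 0, and sign(y)*pi/2 for x == 0. A rough NumPy sketch follows (the exact handling of y == 0 depends on _init_atan2_mask, which is not shown, so treat this as an approximation):

import numpy as np

def atan2_reference(y, x):
    y_sign = np.where(y >= 0, 1.0, -1.0)              # mirrors y_ge_zero_mask
    safe_x = np.where(x == 0, 1.0, x)                 # avoid dividing by zero
    res = np.arctan(y / safe_x)
    res = np.where(x == 0, y_sign * np.pi / 2, res)   # mirrors the y_cmp_zero selection
    return res + np.where(x < 0, y_sign * np.pi, 0.0) # mirrors res_x_lt_zero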
Example #10
def Reciprocal(x):
    """reciprocal"""
    return reciprocal.reciprocal(x, high_precision=True)
def nudged_min_max_compute(min_broadcast, max_broadcast, num_bits,
                           narrow_range):
    """
    Calculate the maximum and minimum values of the quantization.

    Notes:
        For each channel, scale[i] = (max_broadcast[i] - min_broadcast[i]) / (quant_max - quant_min).
        The nudged zero point is then computed as
                nudged_zero_point = floor(between_min_max_float + 0.5) + less_quant_min_float + more_quant_max_float,
        where zero_point_from_min = quant_min_float - min_broadcast / scale,
        between_min_max_float equals zero_point_from_min where quant_min_float <= zero_point_from_min <= quant_max_float
        and 0 elsewhere, less_quant_min_float equals quant_min where zero_point_from_min < quant_min_float and 0 elsewhere,
        and more_quant_max_float is defined analogously with quant_max.
        Finally, nudged_min and nudged_max are computed from scale and nudged_zero_point:
                 nudged_min = (quant_min - nudged_zero_point) * scale
                 nudged_max = (quant_max - nudged_zero_point) * scale

    Args:
        min_broadcast (tvm.tensor.Tensor): minimum value to be quantified for each channel.
        max_broadcast (tvm.tensor.Tensor): maximum value to be quantified for each channel.
        num_bits (int): num_bits is the bitwidth of the quantization, range [2,16].
        narrow_range (bool): if True, quantize each channel into the range [1, 2^num_bits - 1];
                      otherwise into the range [0, 2^num_bits - 1].

    Returns:
        nudged_min (tvm.tensor.Tensor): The same type and shape as min_broadcast.
        nudged_max (tvm.tensor.Tensor): The same type and shape as max_broadcast.
        scale (tvm.tensor.Tensor): The same type and shape as max_broadcast.
    """

    dtype = min_broadcast.dtype
    quant_min = 1 if narrow_range else 0
    quant_max = (2**num_bits) - 1

    # quant_min and quant_max are broadcast so they can be combined with the per-channel tensors.
    quant_min_float = topi.full(min_broadcast.shape, dtype,
                                tvm.const(quant_min, dtype))
    quant_max_float = topi.full(min_broadcast.shape, dtype,
                                tvm.const(quant_max, dtype))

    # calculate each channel's max-min difference.
    max_sub_min = topi.subtract(max_broadcast, min_broadcast)
    quant_max_sub_quant_min = topi.subtract(quant_max_float, quant_min_float)
    # compute scale = (max_broadcast - min_broadcast) / (quant_max - quant_min)
    # and min_div_scale = min_broadcast / scale
    if utils.product_is_mini():
        scale = mul(max_sub_min, reciprocal(quant_max_sub_quant_min))
        min_div_scale = mul(min_broadcast, reciprocal(scale))
    else:
        scale = div(max_sub_min, quant_max_sub_quant_min)
        min_div_scale = div(min_broadcast, scale)

    # zero_point_from_min = quant_min_float - min_broadcast / scale
    zero_point_from_min = topi.subtract(quant_min_float, min_div_scale)
    # if zero_point_from_min < quant_min_float, bool_less_quant_min_float = 1 else 0
    bool_less_quant_min_float = less_compare_float32(zero_point_from_min,
                                                     quant_min_float)
    # if quant_max_float < zero_point_from_min, bool_more_quant_max_float = 1 else 0
    bool_more_quant_max_float = less_compare_float32(quant_max_float,
                                                     zero_point_from_min)

    # use the boolean masks above to select the effective values
    less_quant_min_float = topi.multiply(quant_min_float,
                                         bool_less_quant_min_float)
    more_quant_max_float = topi.multiply(quant_max_float,
                                         bool_more_quant_max_float)

    # compute which values are not less than quant_min_float and not greater than quant_max_float
    tensor_one = topi.full(min_broadcast.shape, dtype, dc.one_const(dtype))
    bool_not_less_quant_min_float = topi.subtract(tensor_one,
                                                  bool_less_quant_min_float)
    bool_not_more_quant_max_float = topi.subtract(tensor_one,
                                                  bool_more_quant_max_float)
    bool_between_min_max = topi.multiply(bool_not_less_quant_min_float,
                                         bool_not_more_quant_max_float)
    between_min_max_float = topi.multiply(zero_point_from_min,
                                          bool_between_min_max)
    # add 0.5 to the values with quant_min <= value <= quant_max and then floor them.
    between_min_max_add_half_one = topi.add(between_min_max_float,
                                            dc.half_const(dtype))
    between_min_max_round = akg.lang.cce.floor(between_min_max_add_half_one)
    if utils.product_is_mini():
        between_min_max_round = topi.cast(between_min_max_round, "float16")

    between_min_max_round = topi.cast(between_min_max_round, "float32")

    # calculate the maximum and minimum values of the quantization
    nudged_zero_point_tmp = topi.add(less_quant_min_float,
                                     more_quant_max_float)
    nudged_zero_point = topi.add(nudged_zero_point_tmp, between_min_max_round)

    nudged_min_tmp = topi.subtract(quant_min_float, nudged_zero_point)
    nudged_max_tmp = topi.subtract(quant_max_float, nudged_zero_point)
    nudged_min = topi.multiply(nudged_min_tmp, scale)
    nudged_max = topi.multiply(nudged_max_tmp, scale)
    res = [nudged_min, nudged_max, scale]

    return res
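The Notes above can be restated compactly in NumPy; the clip is equivalent to the three mask terms used in the kernel (illustrative sketch, not akg code):

import numpy as np

def nudged_min_max_reference(min_b, max_b, num_bits, narrow_range):
    quant_min = 1.0 if narrow_range else 0.0
    quant_max = 2.0 ** num_bits - 1.0
    scale = (max_b - min_b) / (quant_max - quant_min)
    zero_point_from_min = quant_min - min_b / scale
    nudged_zero_point = np.clip(np.floor(zero_point_from_min + 0.5), quant_min, quant_max)
    nudged_min = (quant_min - nudged_zero_point) * scale
    nudged_max = (quant_max - nudged_zero_point) * scale
    return nudged_min, nudged_max, scale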
Example #12
def reciprocal_auto(tensor):
    """Reciprocal with auto schedule."""
    return reciprocal.reciprocal(tensor)
Example #13
def reciprocal_manual(tensor):
    """Reciprocal with manual schedule."""
    return reciprocal.reciprocal(tensor)
Example #14
def _apply_adagrad_da_compute(var, gradient_accum, gradient_squared_accum,
                              grad, lr, l1, l2, global_step):
    """Compute adagrad_da."""
    dtype = var.dtype
    # cast to float32 for higher precision
    if dtype == "float16":
        gradient_accum = topi.cast(gradient_accum, "float32")
        gradient_squared_accum = topi.cast(gradient_squared_accum, "float32")
        grad = topi.cast(grad, "float32")
        lr = topi.cast(lr, "float32")
        l1 = topi.cast(l1, "float32")
        l2 = topi.cast(l2, "float32")
    if utils.product_is_mini():
        global_step = topi.cast(global_step, "float16")
        global_step = topi.cast(global_step, "float32")
    else:
        global_step = topi.cast(global_step, "float32")

    # 1.grad_accum += grad
    gradient_accum = topi.add(gradient_accum, grad)

    # 2.grad_squared_accum += grad * grad
    gs = topi.multiply(grad, grad)
    gradient_squared_accum = topi.add(gradient_squared_accum, gs)

    # 3.if l1 > 0: tmp_val = sign(grad_accum) * max(|grad_accum|-l1*global_step, 0)
    #   else:      tmp_val = grad_accum
    sign_val = sign(gradient_accum)
    abs_val = topi.abs(gradient_accum)
    mul_val = topi.multiply(global_step, l1)
    sub_val = topi.subtract(abs_val, mul_val)
    max_val = topi.maximum(sub_val, tvm.const(0, sub_val.dtype))
    tmp_val = topi.multiply(sign_val, max_val)

    def select(l1, tmp_val, gradient_accum):
        """Returns tmp_val if l1 > 0 else gradient_accum."""
        if utils.product_is_mini():
            l1 = topi.cast(l1, "float16")
            tmp_val = topi.cast(tmp_val, "float16")
            gradient_accum = topi.cast(gradient_accum, "float16")
        tmp_val = akg.tvm.compute(
            tmp_val.shape, lambda *i: tvm.expr.Select(l1[0] > 0, tmp_val(*i),
                                                      gradient_accum(*i)))
        return topi.cast(tmp_val,
                         "float32") if utils.product_is_mini() else tmp_val

    tmp_val = select(l1, tmp_val, gradient_accum)

    # 4.x_value = -1 * lr * tmp_val
    x_value = topi.multiply(lr, tvm.const(-1, "float32"))
    x_value = topi.multiply(x_value, tmp_val)

    # 5.y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    pro_val = topi.multiply(l2, global_step)
    pro_val = topi.multiply(pro_val, lr)
    sqrt_val = sqrt(gradient_squared_accum)
    y_value = topi.add(pro_val, sqrt_val)

    # 6.var = x_value / y_value
    if utils.product_is_mini():
        y_rec = reciprocal(y_value)
        var_out = topi.multiply(x_value, y_rec)
    else:
        var_out = topi.divide(x_value, y_value)

    if dtype == "float16":
        var_out = akg.lang.cce.cast_to(var_out, "float16")
        gradient_accum = akg.lang.cce.cast_to(gradient_accum, "float16")
        gradient_squared_accum = akg.lang.cce.cast_to(gradient_squared_accum,
                                                      "float16")

    return var_out, gradient_accum, gradient_squared_accum
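The six commented steps above correspond to this NumPy reference (a sketch assuming scalar lr, l1, l2 and global_step; not part of akg):

import numpy as np

def adagrad_da_reference(grad_accum, grad_squared_accum, grad, lr, l1, l2, global_step):
    """AdaGrad-DA update as computed above, in NumPy (illustrative only)."""
    grad_accum = grad_accum + grad
    grad_squared_accum = grad_squared_accum + grad * grad
    if l1 > 0:
        tmp = np.sign(grad_accum) * np.maximum(np.abs(grad_accum) - l1 * global_step, 0.0)
    else:
        tmp = grad_accum
    x_value = -lr * tmp
    y_value = l2 * global_step * lr + np.sqrt(grad_squared_accum)
    var = x_value / y_value
    return var, grad_accum, grad_squared_accum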