Esempio n. 1
0
def xdivy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Build the element-wise gradient computation for xdivy.

    The three inputs are broadcast to ``shape_max``. float16 inputs are cast
    to float32 before the broadcast and both results are cast back to float16
    at the end.

    Args:
        placeholders (Union[list, tuple]): input tensors, read by index as
            (x1, x2, grad).
        shape_max (Union[list, tuple]): shape every input is broadcast to.
        dtype (str): data type of the inputs.
        rx (list): reduction indices applied to the first result.
        ry (list): reduction indices applied to the second result.

    Returns:
        output_y1 (tvm.tensor.Tensor): sum over ``rx`` (keepdims) of
            ``(x1 / (x1 + eps)) / x2 * grad``.
        output_y2 (tvm.tensor.Tensor): sum over ``ry`` (keepdims) of
            ``-x1 * ((x1 / (x1 + eps)) / x2)^2 * grad``.
    """
    ascend = akg.lang.ascend
    is_fp16 = dtype == "float16"

    operands = (placeholders[0], placeholders[1], placeholders[2])
    if is_fp16:
        # Compute in float32; all casts are issued before any broadcast.
        operands = [ascend.cast_to(tensor, "float32") for tensor in operands]
    x1, x2, grad = [ascend.broadcast(tensor, shape_max) for tensor in operands]

    # x1 / (x1 + eps) is exactly 0 where x1 == 0; eps is a tiny float32
    # constant so the ratio stays close to 1 elsewhere.
    tiny = tvm.const(1.18e-38, dtype="float32")
    x1_shifted = ascend.vadds(x1, tiny)

    if product_is_mini():
        # On mini products the divisions are expressed as multiplications
        # by a reciprocal.
        x1_mask = ascend.vmul(x1, reciprocal(x1_shifted))
        dx1 = ascend.vmul(x1_mask, reciprocal(x2))
    else:
        x1_mask = Divide(x1, x1_shifted, target="cce")
        dx1 = Divide(x1_mask, x2, target="cce")

    dx1_times_grad = ascend.vmul(dx1, grad)

    # dx2 = -x1 * dx1^2
    minus_one = tvm.const(-1, dtype="float32")
    x1_negated = ascend.vmuls(x1, minus_one)
    dx1_squared = ascend.vmul(dx1, dx1)
    dx2 = ascend.vmul(x1_negated, dx1_squared)
    dx2_times_grad = ascend.vmul(dx2, grad)

    output_y1 = ascend.sum(dx1_times_grad, rx, keepdims=True)
    output_y2 = ascend.sum(dx2_times_grad, ry, keepdims=True)

    if is_fp16:
        output_y1 = ascend.cast_to(output_y1, "float16")
        output_y2 = ascend.cast_to(output_y2, "float16")

    return output_y1, output_y2
Esempio n. 2
0
def xlogy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Build the element-wise gradient computation for xlogy.

    The three inputs are broadcast to ``shape_max``. float16 inputs are cast
    to float32 before the broadcast and both results are cast back to float16
    at the end.

    Args:
        placeholders (Union[list, tuple]): input tensors, read by index as
            (x1, x2, grad).
        shape_max (Union[list, tuple]): shape every input is broadcast to.
        dtype (str): data type of the inputs.
        rx (list): reduction indices applied to the first result.
        ry (list): reduction indices applied to the second result.

    Returns:
        output_y1 (tvm.tensor.Tensor): sum over ``rx`` (keepdims) of
            ``(x1 / (x1 + eps)) * log(x2) * grad``.
        output_y2 (tvm.tensor.Tensor): sum over ``ry`` (keepdims) of
            ``x1 / x2 * grad``.
    """
    ascend = akg.lang.ascend
    is_fp16 = dtype == "float16"
    on_mini = product_is_mini()

    operands = (placeholders[0], placeholders[1], placeholders[2])
    if is_fp16:
        # Compute in float32; all casts are issued before any broadcast.
        operands = [ascend.cast_to(tensor, "float32") for tensor in operands]
    x1, x2, grad = [ascend.broadcast(tensor, shape_max) for tensor in operands]

    # x1 / (x1 + eps) is exactly 0 where x1 == 0; eps is a tiny float32
    # constant so the ratio stays close to 1 elsewhere.
    tiny = tvm.const(1.18e-38, dtype="float32")
    x1_shifted = ascend.vadds(x1, tiny)

    if on_mini:
        x1_mask = ascend.vmul(x1, reciprocal(x1_shifted))

        def _log_via_fp16(*idx):
            # The log is evaluated on a float16 view of x2 and widened back.
            return (tvm.log(x2(*idx).astype("float16"))).astype("float32")

        log_x2 = tvm.compute(x2.shape, _log_via_fp16, name="log_x2")
    else:
        x1_mask = Divide(x1, x1_shifted, target="cce")
        log_x2 = ascend.vlog(x2)

    dx1 = ascend.vmul(x1_mask, log_x2)
    dx1_times_grad = ascend.vmul(dx1, grad)

    if on_mini:
        dx2 = ascend.vmul(x1, reciprocal(x2))
    else:
        dx2 = Divide(x1, x2, target="cce")
    dx2_times_grad = ascend.vmul(dx2, grad)

    output_y1 = ascend.sum(dx1_times_grad, rx, keepdims=True)
    output_y2 = ascend.sum(dx2_times_grad, ry, keepdims=True)

    if is_fp16:
        output_y1 = ascend.cast_to(output_y1, "float16")
        output_y2 = ascend.cast_to(output_y2, "float16")
    return output_y1, output_y2
Esempio n. 3
0
def _apply_adam_compute(var,
                        m,
                        v,
                        beta1_power,
                        beta2_power,
                        lr,
                        beta1,
                        beta2,
                        epsilon,
                        grad,
                        use_nesterov=False):
    """
    Build the Adam update expressions.

    ``beta1``, ``beta2`` and the derived learning rate are read through
    element ``[0]``; ``var``, ``m``, ``v`` and ``grad`` are indexed with the
    full index tuple of ``var.shape``.

    Update rules:
        m_new   = m + (1 - beta1) * (grad - m)
        v_new   = v + (1 - beta2) * (grad * grad - v)
        lr_t    = lr * sqrt(1 - beta2_power) / (1 - beta1_power)
        var_new = var - lr_t * m_new / (epsilon + sqrt(v_new))
    With ``use_nesterov`` the numerator ``m_new`` is replaced by
    ``m_new * beta1 + (1 - beta1) * grad``.

    Returns:
        tuple: (var_new, m_new, v_new).
    """
    out_shape = var.shape

    def _first_moment(*idx):
        return m(*idx) + (1 - beta1[0]) * (grad(*idx) - m(*idx))

    def _second_moment(*idx):
        return v(*idx) + (1 - beta2[0]) * (grad(*idx) * grad(*idx) - v(*idx))

    m_new = akg.tvm.compute(out_shape, _first_moment, name="m_new")
    v_new = akg.tvm.compute(out_shape, _second_moment, name="v_new")

    # Bias-corrected learning rate.
    unit = akg.tvm.const(1, var.dtype)
    one_minus_beta2_power = akg.topi.subtract(unit, beta2_power)
    correction_sqrt = sqrt(one_minus_beta2_power, target=utils.CCE)
    lr_scaled = akg.topi.multiply(lr, correction_sqrt)
    one_minus_beta1_power = akg.topi.subtract(unit, beta1_power)
    lr_t = Divide(lr_scaled, one_minus_beta1_power, target=utils.CCE)

    if use_nesterov:
        def _numerator(*idx):
            return lr_t[0] * (m_new(*idx) * beta1[0] +
                              (1 - beta1[0]) * grad(*idx))
    else:
        def _numerator(*idx):
            return lr_t[0] * m_new(*idx)

    lr_t_mul_m_new = akg.tvm.compute(out_shape, _numerator,
                                     name="lr_t_mul_m_new")

    v_new_sqrt = sqrt(v_new, target=utils.CCE)
    denominator = akg.topi.add(epsilon, v_new_sqrt)
    step = Divide(lr_t_mul_m_new, denominator, target=utils.CCE)
    var_new = akg.topi.subtract(var, step)

    return var_new, m_new, v_new
Esempio n. 4
0
def apply_ftrl_impl(var, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power, with_l2_shrinkage=False):
    """
    Build the FTRL-proximal update expressions.

    float16 inputs are computed in float32 and the three results are cast
    back to float16. ``l2_shrinkage`` is only read when ``with_l2_shrinkage``
    is true.

    Update rules:
        grad_shrinkage = grad + 2 * l2_shrinkage * var   (or grad without shrinkage)
        accum_new  = accum + grad^2
        linear_new = linear + grad_shrinkage
                     - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
        var_new    = (clip(linear_new, -l1, l1) - linear_new)
                     / (accum_new^(-lr_power) / lr + 2 * l2)

    Returns:
        tuple: (var_new, accum_new, linear_new).
    """
    dtype = var.dtype
    needs_upcast = dtype == "float16"

    compute_dtype = dtype
    if needs_upcast:
        # float32 is used internally for higher accuracy.
        compute_dtype = "float32"
        var, accum, linear, grad, lr, l1, l2, lr_power = (
            akg.topi.cast(tensor, compute_dtype)
            for tensor in (var, accum, linear, grad, lr, l1, l2, lr_power))
        if with_l2_shrinkage:
            l2_shrinkage = akg.topi.cast(l2_shrinkage, compute_dtype)

    shape = var.shape

    if with_l2_shrinkage:
        l2_shrinkage = akg.topi.broadcast_to(l2_shrinkage, shape)

        def _shrunk_grad(*idx):
            return grad(*idx) + akg.tvm.const(2.0, compute_dtype) * l2_shrinkage(*idx) * var(*idx)

        grad_shrinkage = akg.tvm.compute(shape, _shrunk_grad, name="grad_shrinkage")
    else:
        grad_shrinkage = grad

    def _accumulate(*idx):
        return accum(*idx) + grad(*idx)*grad(*idx)

    accum_new = akg.tvm.compute(shape, _accumulate, name="accum_new")

    # Terms of linear_new.
    neg_lr_power = akg.topi.negative(lr_power)
    accum_new_pow = akg.topi.power(accum_new, neg_lr_power)
    accum_old_pow = akg.topi.power(accum, neg_lr_power)
    pow_delta = akg.topi.subtract(accum_new_pow, accum_old_pow)
    pow_delta_over_lr = Divide(pow_delta, lr, target=utils.CCE)
    linear_plus_grad = akg.topi.add(linear, grad_shrinkage)

    def _linear_update(*idx):
        return linear_plus_grad(*idx) - pow_delta_over_lr(*idx)*var(*idx)

    linear_new = akg.tvm.compute(shape, _linear_update, name="linear_new")

    # Numerator: clip(linear_new, -l1, l1) - linear_new
    neg_l1 = akg.topi.negative(l1)
    clipped = akg.topi.minimum(akg.topi.maximum(linear_new, neg_l1), l1)
    numerator = akg.topi.subtract(clipped, linear_new)
    # Denominator: accum_new^(-lr_power) / lr + 2 * l2
    accum_new_pow_over_lr = Divide(accum_new_pow, lr, target="cce")
    twice_l2 = akg.topi.multiply(l2, 2)
    denominator = akg.topi.add(accum_new_pow_over_lr, twice_l2)
    var_new = Divide(numerator, denominator, target="cce")

    if needs_upcast:
        # Restore the original data type.
        var_new = akg.topi.cast(var_new, dtype)
        accum_new = akg.topi.cast(accum_new, dtype)
        linear_new = akg.topi.cast(linear_new, dtype)

    return var_new, accum_new, linear_new
def apply_proximal_gradient_descent_impl(var, alpha, l1, l2, delta):
    """
    Build the FOBOS (proximal gradient descent) update expression.

    float16 inputs are computed in float32. ``alpha``, ``l1`` and ``l2`` are
    broadcast to ``var.shape``.

    Update rules:
        prox_var = var - alpha * delta
        l1 > 0 : var_new = sign(prox_var) / (1 + alpha * l2)
                           * max(|prox_var| - alpha * l1, 0)
        l1 <= 0: var_new = prox_var / (1 + alpha * l2)

    Returns:
        tvm.tensor.Tensor: updated ``var`` in the original data type of
        ``var``.
    """
    dtype = var.dtype
    compute_type = dtype
    if dtype == "float16":
        # float32 is used internally for higher accuracy.
        compute_type = "float32"
        var, alpha, l1, l2, delta = (
            akg.topi.cast(tensor, compute_type)
            for tensor in (var, alpha, l1, l2, delta))

    shape = var.shape
    alpha = akg.topi.broadcast_to(alpha, shape)
    l1 = akg.topi.broadcast_to(l1, shape)
    l2 = akg.topi.broadcast_to(l2, shape)

    def _prox(*idx):
        return var(*idx) - alpha(*idx)*delta(*idx)

    prox_var = akg.tvm.compute(shape, _prox, name="prox_var")

    # Branch for l1 > 0.
    prox_sign = Sign(prox_var)
    alpha_times_l2 = akg.topi.multiply(alpha, l2)
    shrink_denominator = akg.topi.add(akg.tvm.const(1, compute_type), alpha_times_l2)

    def _soft_threshold(*idx):
        return akg.tvm.max(
            akg.tvm.abs(prox_var(*idx)) - alpha(*idx)*l1(*idx),
            akg.tvm.const(0, compute_type))

    max_value = akg.tvm.compute(shape, _soft_threshold, name="max_value")
    scaled_sign = Divide(prox_sign, shrink_denominator, target="cce")
    var_new_l1_gt_0 = akg.topi.multiply(scaled_sign, max_value)

    # Branch for l1 <= 0.
    var_new_l1_le_0 = Divide(prox_var, shrink_denominator, target="cce")

    if product_is_mini():
        # On mini products the comparison and both candidates are taken as
        # float16.
        def _pick(*idx):
            return akg.tvm.expr.Select(
                l1(*idx).astype("float16") > akg.tvm.const(0, "float16"),
                var_new_l1_gt_0(*idx).astype("float16"),
                var_new_l1_le_0(*idx).astype("float16"))
    else:
        def _pick(*idx):
            return akg.tvm.expr.Select(
                l1(*idx) > akg.tvm.const(0, l1.dtype),
                var_new_l1_gt_0(*idx), var_new_l1_le_0(*idx))

    var_new = akg.tvm.compute(shape, _pick, name="var_new")

    # Restore the original data type when the computation changed it.
    if var_new.dtype != dtype:
        var_new = akg.topi.cast(var_new, dtype)
    return var_new
Esempio n. 6
0
 def truncatemod_func(a, b):
     """
     Build ``a - b * floor(a / b)``.

     The floored quotient is cast to ``b.dtype`` before the multiplication.
     """
     # floor stands in for trunc: for positive numbers the two are equivalent.
     # NOTE(review): presumably the caller handles signs separately - confirm.
     quotient = Divide(a, b, utils.CCE)
     whole_quotient = Cast(floor(quotient), b.dtype, target=utils.CCE)
     product = akg.topi.multiply(b, whole_quotient)
     return akg.topi.subtract(a, product)
Esempio n. 7
0
def xdivy_compute(input_x, input_y):
    """
    Build the xdivy computation ``input_x / input_y`` with a revised divisor.

    Both inputs are broadcast to their common shape. A correction term
    ``|min(|x| + |y|, data_min) * scale - 1|`` is added to the divisor before
    dividing. Where both x and y are zero the term equals 1, so the division
    becomes 0 / 1.
    NOTE(review): presumably the scale constants are chosen so that
    ``data_min * scale == 1`` and the term vanishes elsewhere - confirm
    against the module-level constant definitions.

    For float16 inputs the division itself is done in float32 and the result
    is cast back to float16.

    Args:
        input_x (tvm.tensor.Tensor): dividend, float16 or float32.
        input_y (tvm.tensor.Tensor): divisor; broadcast together with
            input_x.

    Returns:
        tvm.tensor.Tensor: result with the data type of ``input_x``.

    Raises:
        TypeError: if the data type of ``input_x`` is neither float16 nor
            float32. Without this check those types ran into an unbound
            local variable further down.
    """
    _, _, shape_res = produce_shapes(get_shape(input_x), get_shape(input_y))
    utils.check_shape(shape_res)

    dtype = input_x.dtype
    if dtype not in ("float16", "float32"):
        raise TypeError(
            "xdivy_compute only supports float16 and float32, got %s" % dtype)
    is_fp16 = dtype == "float16"

    broadcast_x = akg.lang.ascend.broadcast(input_x, shape_res)
    broadcast_y = akg.lang.ascend.broadcast(input_y, shape_res)
    broadcast_one = akg.lang.ascend.broadcast(tvm.const(SCALAR_ONE, dtype),
                                              shape_res, dtype)

    abs_x = akg.lang.ascend.vabs(broadcast_x)
    abs_y = akg.lang.ascend.vabs(broadcast_y)
    add_x_y = akg.lang.ascend.vadd(abs_x, abs_y)

    if is_fp16:
        data_min = akg.lang.ascend.broadcast(
            tvm.const(MININUM_NUM_HALF, dtype=dtype), shape_res, dtype)
    else:
        data_min = akg.lang.ascend.broadcast(
            tvm.const(MININUM_NUM_FLOAT, dtype=dtype), shape_res, dtype)

    # 0 where x and y are both zero, data_min wherever |x| + |y| >= data_min.
    zero_x_y = akg.lang.ascend.vmin(add_x_y, data_min)

    # The scaling is split over several multiplications per data type.
    if is_fp16:
        data_mul1 = akg.lang.ascend.vmuls(
            zero_x_y, tvm.const(MAX_CONST_HALF, dtype=dtype))
        mul_data = akg.lang.ascend.vmuls(
            data_mul1, tvm.const(MAX_CONST_HALF, dtype=dtype))
    else:
        data_mul1 = akg.lang.ascend.vmuls(
            zero_x_y, tvm.const(MAX_ONE_CONST_FLOAT, dtype=dtype))
        data_mul2 = akg.lang.ascend.vmuls(
            data_mul1, tvm.const(MAX_ONE_CONST_FLOAT, dtype=dtype))
        mul_data = akg.lang.ascend.vmuls(
            data_mul2, tvm.const(MAX_TWO_CONST_FLOAT, dtype=dtype))

    # Correction term |mul_data - 1| added to the divisor.
    sub_x_y_zero = akg.lang.ascend.vsub(mul_data, broadcast_one)
    abs_x_y_zero = akg.lang.ascend.vabs(sub_x_y_zero)
    input_y_revised = akg.lang.ascend.vadd(broadcast_y, abs_x_y_zero)

    if is_fp16:
        broadcast_x = akg.lang.ascend.cast_to(broadcast_x, "float32")
        input_y_revised = akg.lang.ascend.cast_to(input_y_revised, "float32")

    res = Divide(broadcast_x, input_y_revised, target="cce")

    if is_fp16:
        res = akg.lang.ascend.cast_to(res, dtype)

    return res
Esempio n. 8
0
def lin_space_compute(input_assist, input_start, input_stop, input_num):
    """
    Build the lin_space computation.

    Computes ``input_assist * step + input_start`` where
    ``step = (input_stop - input_start) / (input_num - 1)``. ``input_num`` is
    cast to float32 first; ``step`` and ``input_start`` are broadcast to the
    shape of ``input_assist``.

    Returns:
        tvm.tensor.Tensor: the resulting tensor.
    """
    ascend = akg.lang.ascend
    out_shape = input_assist.shape

    count_f32 = ascend.cast_to(input_num, "float32")
    interval_count = ascend.vadds(count_f32, -1.0)

    span = ascend.vsub(input_stop, input_start)
    step = Divide(span, interval_count, target="cce")

    step_full = ascend.broadcast(step, out_shape)
    scaled = ascend.vmul(input_assist, step_full)
    start_full = ascend.broadcast(input_start, out_shape)
    return ascend.vadd(scaled, start_full)
Esempio n. 9
0
def _bessel_i0e_compute(input_data):
    """
    Build the bessel_i0e computation.

    Two polynomial approximations are evaluated on ``|x|`` and combined with
    an element-wise minimum. float16 input is computed in float32 and the
    result is cast back to float16.

    Returns:
        tvm.tensor.Tensor: the combined approximation.
    """
    in_shape = input_data.shape
    in_dtype = input_data.dtype
    is_fp16 = in_dtype == "float16"

    if is_fp16:
        input_data = Cast(input_data, "float32", target=utils.CCE)
    abs_x = Abs(input_data, target=utils.CCE)

    # Approximation for |x| <= 3.75, with t = |x| / 3.75:
    # I0e = e^-|x| * (1 + 3.5156229t^2 + 3.0899424t^4 + 1.2067492t^6
    #       + 0.2659732t^8 + 0.0360768t^10 + 0.0045813t^12)
    limit_full = akg.lang.ascend.broadcast(
        akg.tvm.const(CONST_LIMIT, "float32"), in_shape)
    # |x| is clamped to the limit for this branch.
    abs_x_clamped = minimum(abs_x, limit_full)
    t_small = topi.multiply(abs_x_clamped, 1.0 / CONST_LIMIT)
    t_small_sq = mul(t_small, t_small, target=utils.CCE)
    # Horner evaluation in t^2, starting from the last coefficient.
    small_res = topi.multiply(t_small_sq, ITR_BEFORE[LEN_BEFORE - 1])
    small_res = topi.add(small_res, ITR_BEFORE[LEN_BEFORE - 2])
    for coeff in ITR_BEFORE[LEN_BEFORE - 3::-1]:
        small_res = mul(small_res, t_small_sq, target=utils.CCE)
        small_res = topi.add(small_res, coeff)
    decay = Exp(neg(abs_x_clamped, target=utils.CCE), target=utils.CCE)
    small_res = mul(small_res, decay, target=utils.CCE)

    # Approximation for |x| >= 3.75, with t = |x| / 3.75:
    # I0e(x) = (1 / sqrt(|x|)) * (0.39894228 + 0.01328592t^-1
    #          + 0.00225319t^-2 + -0.00157565t^-3 + 0.00916281t^-4
    #          + -0.02057706t^-5 + 0.02635537t^-6 + -0.01647633t^-7
    #          + 0.00392377t^-8)
    t_inverse = Divide(limit_full, abs_x, target=utils.CCE)
    large_res = topi.multiply(t_inverse, ITR_AFTER[LEN_AFTER - 1])
    large_res = topi.add(large_res, ITR_AFTER[LEN_AFTER - 2])
    for coeff in ITR_AFTER[LEN_AFTER - 3::-1]:
        large_res = mul(large_res, t_inverse, target=utils.CCE)
        large_res = topi.add(large_res, coeff)
    inv_sqrt_abs_x = rsqrt(abs_x, target=utils.CCE)
    large_res = mul(large_res, inv_sqrt_abs_x, target=utils.CCE)

    # The final value is the element-wise smaller of the two branches.
    result = minimum(small_res, large_res, target=utils.CCE)

    if is_fp16:
        result = Cast(result, "float16", target=utils.CCE)

    return result
Esempio n. 10
0
def softplus_grad_compute(input_gradients, input_features):
    """
    Build the softplus gradient:
    ``input_gradients * exp(input_features) / (exp(input_features) + 1)``.

    Inputs with different shapes are broadcast to their common shape. For
    data types other than float32 the gradients are cast to float32 and the
    features to float16 on mini products, float32 otherwise.

    Result handling by the data type of ``input_gradients``:
        float16          : cast back to float16.
        int32/int8/uint8 : non-negative part floored plus non-positive part
                           ceiled; int8/uint8 are then cast to their type.
        anything else    : returned as computed.
    """
    ascend = akg.lang.ascend
    grad_shape = get_shape(input_gradients)
    feat_shape = get_shape(input_features)
    dtype = input_gradients.dtype

    if list(grad_shape) == list(feat_shape):
        shape_max = grad_shape
    else:
        grad_shape, feat_shape, shape_max = produce_shapes(grad_shape,
                                                           feat_shape)
        input_gradients = ascend.broadcast(input_gradients, shape_max, dtype)
        input_features = ascend.broadcast(input_features, shape_max, dtype)

    if dtype != "float32":
        input_gradients = ascend.cast_to(input_gradients, "float32")
        feature_dtype = "float16" if product_is_mini() else "float32"
        input_features = ascend.cast_to(input_features, feature_dtype)

    exp_features = ascend.vexp(input_features)
    exp_plus_one = ascend.vadds(exp_features, SCALAR_ONE)
    sigmoid_like = Divide(exp_features, exp_plus_one, target="cce")
    raw_res = ascend.vmul(input_gradients, sigmoid_like)

    if dtype == "float16":
        res = ascend.cast_to(raw_res, "float16")
    elif dtype in ("int32", "int8", "uint8"):
        # Split around zero so each part can be rounded separately.
        zeros = ascend.broadcast(tvm.const(0, "float16"), shape_max,
                                 "float16")
        non_positive = ascend.vmin(raw_res, zeros)
        non_negative = ascend.vmax(raw_res, zeros)
        non_negative_int = ascend.floor(non_negative)
        non_positive_int = ascend.ceil(non_positive)
        res = ascend.vadd(non_negative_int, non_positive_int)
    else:
        res = raw_res

    if dtype == "int8":
        res = ascend.cast_to(res, "int8")
    elif dtype == "uint8":
        res = ascend.cast_to(res, "uint8")

    return res
Esempio n. 11
0
def fake_quant_with_min_max_vars_per_channel_compute(input_data,
                                                     input_min,
                                                     input_max,
                                                     num_bits=8,
                                                     narrow_range=False):
    """
    Build the fake_quant_with_min_max_vars_per_channel computation.

    Steps:
        1. Broadcast ``input_min`` / ``input_max`` to the shape of
           ``input_data`` and obtain (nudged_min, nudged_max, scale) from
           ``nudged_min_max_compute``.
        2. Clamp ``input_data`` into [nudged_min, nudged_max].
        3. Quantize: floor((clamped - nudged_min) / scale + 0.5).
        4. Dequantize: quantized * scale + nudged_min.
        5. Multiply by the value returned by ``bool_both_zero_compute``.

    Returns:
        tvm.tensor.Tensor: the fake-quantized tensor.
    """
    on_mini = product_is_mini()
    shape = get_shape(input_data.shape)
    dtype = input_data.dtype

    min_broadcast = akg.lang.ascend.broadcast(input_min, shape, dtype)
    max_broadcast = akg.lang.ascend.broadcast(input_max, shape, dtype)

    nudged = nudged_min_max_compute(min_broadcast, max_broadcast, num_bits,
                                    narrow_range)
    nudged_min = nudged[0]
    nudged_max = nudged[1]
    scale = nudged[2]

    # Clamp the input into [nudged_min, nudged_max].
    upper_bounded = topi.minimum(input_data, nudged_max)
    clamped = topi.maximum(upper_bounded, nudged_min)

    # Quantize.
    shifted = topi.subtract(clamped, nudged_min)
    if on_mini:
        # On mini products the division is a multiplication by a reciprocal.
        shifted_over_scale = mul(shifted, reciprocal(scale),
                                 target=utils.CCE)
    else:
        shifted_over_scale = Divide(shifted, scale, target=utils.CCE)
    plus_half = topi.add(shifted_over_scale, dc.half_const(dtype))
    quantized = akg.lang.ascend.floor(plus_half)
    if on_mini:
        # On mini products the floored value goes through float16 first.
        quantized = topi.cast(quantized, "float16")
    quantized = topi.cast(quantized, "float32")

    # Dequantize.
    rescaled = topi.multiply(quantized, scale)
    dequantized = topi.add(rescaled, nudged_min)

    both_zero_factor = bool_both_zero_compute(min_broadcast, max_broadcast)
    return topi.multiply(dequantized, both_zero_factor)
Esempio n. 12
0
def _after_res_compute(abs_data):
    """
    Compute bessel_i1e for abs value of data greater than or equal to 3.75.

    Algorithm:
    t = 3.75 / x
    I1(x) = (1 / sqrt(x))*(0.39894228 - 0.03988024t - 0.00362018t^2
                           + 0.00163801t^3 - 0.01031555t^4 + 0.02282967t^5
                           - 0.02895312t^6 + 0.01787654t^7 - 0.00420059t^8)

    The polynomial is evaluated with Horner's scheme over ``ITR_AFTER``,
    starting from its entry at index ``LEN_AFTER - 1``.
    """
    limit_full = akg.lang.ascend.broadcast(
        akg.tvm.const(CONST_LIMIT, abs_data.dtype), abs_data.shape)
    t_value = Divide(limit_full, abs_data, target=utils.CCE)

    poly = topi.multiply(t_value, ITR_AFTER[LEN_AFTER - 1])
    poly = topi.add(poly, ITR_AFTER[LEN_AFTER - 2])
    for coeff in ITR_AFTER[LEN_AFTER - 3::-1]:
        poly = mul(poly, t_value, target=utils.CCE)
        poly = topi.add(poly, coeff)

    inv_sqrt = rsqrt(abs_data, target=utils.CCE)
    return mul(poly, inv_sqrt, target=utils.CCE)
Esempio n. 13
0
def div_no_nan(data_x, data_y, target=utils.CCE):
    """
    Returns 0 if the denominator is zero, else, like Div.

    A denominator counts as zero when its magnitude is below a threshold
    that depends on the product and the data type. For those elements the
    numerator is replaced by 0 and the denominator by 1 before dividing.

    Args:
        data_x (tvm.tensor.Tensor): tensor with type int32/int8/uint8, float16/float32.
        data_y (tvm.tensor.Tensor): tensor with type int32/int8/uint8, float16/float32.
        target (str): target passed on to the Abs, mul and Divide helpers.

    Returns:
        tvm.tensor.Tensor.

    Raises:
        TypeError: if the two inputs have different data types.
    """
    dtype = data_x.dtype
    if dtype != data_y.dtype:
        raise TypeError("input dtype should be the same")
    utils.ops_dtype_check(dtype, [utils.DtypeForDavinci.ALL_FLOAT,
                                  utils.DtypeForDavinci.INT8,
                                  utils.DtypeForDavinci.UINT8,
                                  utils.DtypeForDavinci.INT32])

    utils.check_shape(data_x.shape)
    utils.check_shape(data_y.shape)
    utils.auto_broadcast_check(data_x, data_y)

    on_mini = product_is_mini()

    # dtype for vsel and vcmp, and the magnitude below which y counts as zero:
    # div fp16 y returns 0 if y < 2^-12
    # div fp32 y returns 0 if y < 2^-64
    if on_mini:
        compute_dtype = "float16"
        threshold = 2**(-12)
    else:
        compute_dtype = "float32"
        threshold = 2**(-12) if dtype == "float16" else 2**(-64)
    min_val = tvm.const(threshold, dtype=compute_dtype)

    tvm_one = tvm.const(1, dtype=compute_dtype)
    tvm_zero = tvm.const(0, dtype=compute_dtype)

    y_fp32 = akg.lang.ascend.cast_to(data_y, "float32")
    # avoid when y > 2^15 cast from fp32 to fp16 in mini
    y_clipped = akg.topi.clip(y_fp32, -1.0, 1.0)
    y_clipped_abs = Abs(y_clipped, target)
    y_cmp = akg.lang.ascend.cast_to(y_clipped_abs, compute_dtype)

    def _zero_flag(*idx):
        return tvm.expr.Select(y_cmp(*idx) < min_val, tvm_one, tvm_zero)

    is_zero = tvm.compute(data_y.shape, _zero_flag, name="is_zero")

    # if fp32 y < 2^-24, cast(y,fp16)==0. to find y in (2^-64, 2^-24):
    if on_mini and dtype == "float32":
        is_zero = _refine_is_zero(is_zero, y_clipped_abs)

    is_zero = akg.lang.ascend.cast_to(is_zero, "float32")

    def _nonzero_flag(*idx):
        return (1 - is_zero(*idx)).astype("float32")

    not_zero = tvm.compute(data_y.shape, _nonzero_flag, name="not_zero")

    # replace [x1 x2]/[y1 0] by [x1 0]/[y1 1]
    x_fp32 = akg.lang.ascend.cast_to(data_x, "float32")
    data_x = mul(x_fp32, not_zero, target=target)
    data_y = akg.lang.ascend.cast_to(data_y, "float32") + is_zero
    res = Divide(data_x, data_y, target=target)

    # Integer results are floored before the cast back.
    if dtype in ("int8", "uint8", "int32"):
        res = akg.lang.ascend.floor(res)
    return akg.lang.ascend.cast_to(res, dtype)
Esempio n. 14
0
def nudged_min_max_compute(min_broadcast, max_broadcast, num_bits,
                           narrow_range):
    """
    Calculate the maximum and minimum values of the quantization.

    Notes:
        With quant_min = 1 if narrow_range else 0 and
        quant_max = 2^num_bits - 1, each element gets
                scale = (max_broadcast - min_broadcast) / (quant_max - quant_min)
                zero_point_from_min = quant_min - min_broadcast / scale
        The nudged zero point is the sum of three mutually exclusive terms:
                quant_min                        if zero_point_from_min < quant_min
                quant_max                        if quant_max < zero_point_from_min
                floor(zero_point_from_min + 0.5) otherwise
        (in the first two cases the third term reduces to floor(0.5) = 0).
        Finally:
                nudged_min = (quant_min - nudged_zero_point) * scale
                nudged_max = (quant_max - nudged_zero_point) * scale

    Args:
        min_broadcast (tvm.tensor.Tensor): minimum value to be quantized for each channel.
        max_broadcast (tvm.tensor.Tensor): maximum value to be quantized for each channel.
        num_bits (int): bit width of the quantization.
        narrow_range (bool): if True the quantization range is
                      [1, 2^num_bits - 1], otherwise [0, 2^num_bits - 1].

    Returns:
        list: [nudged_min, nudged_max, scale], three tensors with the shape
        of min_broadcast.
    """
    on_mini = product_is_mini()
    dtype = min_broadcast.dtype
    shape = min_broadcast.shape
    quant_min = 1 if narrow_range else 0
    quant_max = (2**num_bits) - 1

    # The quantization bounds are needed per element, so fill full tensors.
    qmin_full = topi.full(shape, dtype, tvm.const(quant_min, dtype))
    qmax_full = topi.full(shape, dtype, tvm.const(quant_max, dtype))

    # scale = (max - min) / (quant_max - quant_min); min_over_scale = min / scale
    value_range = topi.subtract(max_broadcast, min_broadcast)
    quant_range = topi.subtract(qmax_full, qmin_full)
    if on_mini:
        # On mini products divisions are multiplications by a reciprocal.
        scale = mul(value_range, reciprocal(quant_range), target=utils.CCE)
        # NOTE(review): `Mul` here versus `mul` on the line above - confirm
        # both names are imported and equivalent.
        min_over_scale = Mul(min_broadcast, reciprocal(scale),
                             target=utils.CCE)
    else:
        scale = Divide(value_range, quant_range, target=utils.CCE)
        min_over_scale = Divide(min_broadcast, scale, target=utils.CCE)

    zero_point_from_min = topi.subtract(qmin_full, min_over_scale)
    # Indicators: 1 where the zero point falls below quant_min / above
    # quant_max, else 0.
    is_below = less_compare_float32(zero_point_from_min, qmin_full)
    is_above = less_compare_float32(qmax_full, zero_point_from_min)

    # Contribution of the out-of-range cases.
    below_term = topi.multiply(qmin_full, is_below)
    above_term = topi.multiply(qmax_full, is_above)

    # Indicator of quant_min <= zero point <= quant_max.
    ones = topi.full(shape, dtype, dc.one_const(dtype))
    not_below = topi.subtract(ones, is_below)
    not_above = topi.subtract(ones, is_above)
    is_inside = topi.multiply(not_below, not_above)
    inside_value = topi.multiply(zero_point_from_min, is_inside)
    # Round the in-range zero point by adding 0.5 and flooring.
    inside_plus_half = topi.add(inside_value, dc.half_const(dtype))
    inside_rounded = akg.lang.ascend.floor(inside_plus_half)
    if on_mini:
        # On mini products the floored value goes through float16 first.
        inside_rounded = topi.cast(inside_rounded, "float16")
    inside_rounded = topi.cast(inside_rounded, "float32")

    out_of_range_term = topi.add(below_term, above_term)
    nudged_zero_point = topi.add(out_of_range_term, inside_rounded)

    min_offset = topi.subtract(qmin_full, nudged_zero_point)
    max_offset = topi.subtract(qmax_full, nudged_zero_point)
    nudged_min = topi.multiply(min_offset, scale)
    nudged_max = topi.multiply(max_offset, scale)

    return [nudged_min, nudged_max, scale]