Ejemplo n.º 1
0
def _atan_compute(data):
    """Build the element-wise arctangent of ``data``.

    float16 input is promoted to float32 for the computation and the result
    is cast back to float16 before returning.

    The magnitude ``|x|`` is evaluated two ways and the smaller value is kept:

    * directly: ``_do_atan_taylor(|x|)``;
    * shifted: ``_do_atan_taylor(|(|x| - 1) / (|x| + 1)|) + CONST_PI_BY_FOUR``.

    The sign of the input is multiplied back in afterwards.
    """
    origin_dtype = data.dtype
    if origin_dtype == "float16":
        data = topi.cast(data, "float32")

    magnitude = topi.abs(data)
    one = dc.one_const(magnitude.dtype)

    # |(|x| - 1) / (|x| + 1)|: argument of the branch meant for |x| > 1.
    numerator = topi.subtract(magnitude, one)
    denominator = topi.add(magnitude, one)
    shifted = topi.abs(topi.divide(numerator, denominator))

    # Branch meant for |x| < 1.
    direct_res = _do_atan_taylor(magnitude)
    # Branch meant for |x| > 1.
    shifted_res = topi.add(_do_atan_taylor(shifted),
                           tvm.const(CONST_PI_BY_FOUR, shifted.dtype))
    res = topi.minimum(direct_res, shifted_res)

    # On mini with float32 data the sign is taken on a float16 copy and
    # converted back to float32.
    sign_via_fp16 = utils.product_is_mini() and data.dtype == "float32"
    if sign_via_fp16:
        sign_mask = topi.cast(topi.sign(topi.cast(data, "float16")), "float32")
    else:
        sign_mask = topi.sign(data)

    res = topi.multiply(res, sign_mask)
    return topi.cast(res, "float16") if origin_dtype == "float16" else res
Ejemplo n.º 2
0
def _tan_2x_multi(input_x, times):
    """calculating tan x by calculating tan (x/2^times) and using double angle formula multiple times"""
    # Calculate tan (x/2^times)
    if input_x.dtype == FLOAT_16 and utils.product_is_mini():
        input_x_divide = topi.multiply(input_x, tvm.const(1.0/(2.0**times), FLOAT_16))
        res = _tan_expand(input_x_divide)
    else:
        input_x_divide = topi.multiply(input_x, 1.0/(2.0**times))
        res = _tan_expand(input_x_divide)
    while times != 0:
        # using double angle formula: tan 2x = 2*tan x/(1-tan x*tan x)
        if input_x.dtype == FLOAT_16 and utils.product_is_mini():
            res_numerator = topi.multiply(res, tvm.const(2.0, FLOAT_16))
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, tvm.const(-1.0, FLOAT_16)), tvm.const(1.0, FLOAT_16))
        else:
            res_numerator = topi.multiply(res, 2.0)
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, -1.0), 1.0)

        if utils.product_is_mini():
            res = mul(res_numerator, reciprocal(res_denominator))
        else:
            res = div(res_numerator, res_denominator)
        times = times - 1
    return res
def sigmoid_cross_entropy_with_logits_grad_compute(predict, target, dout):
    """Build ``(e^x / (1 + e^x) - target) * dout`` with ``x = predict``.

    float16 inputs are promoted to float32 for the computation and the result
    is cast back to float16.
    """
    origin_dtype = predict.dtype
    is_fp16 = origin_dtype == "float16"
    if is_fp16:
        predict = topi.cast(predict, "float32")
        target = topi.cast(target, "float32")
        dout = topi.cast(dout, "float32")

    exp_predict = exp(predict)
    # e^x / (1 + e^x)
    one_plus_exp = topi.add(exp_predict, tvm.const(SCALAR_ONE, dtype="float32"))
    sigmoid = topi.divide(exp_predict, one_plus_exp)
    # Subtraction is expressed as adding the negated target.
    neg_target = topi.multiply(target, tvm.const(SCALAR_NEGTIVE_ONE,
                                                 dtype="float32"))
    result = topi.multiply(topi.add(sigmoid, neg_target), dout)

    if is_fp16:
        result = topi.cast(result, origin_dtype)
    return result
Ejemplo n.º 4
0
def selu_compute(input_data):
    """Build the SELU activation of ``input_data``.

    float16/float32 inputs are computed in float32; every other dtype is
    computed in float16. The result is
    ``SCALE_ALPHA_PRODUCT * (exp(min(x, 0)) + SCALAR_NEGATIVE_ONE)
    + SCALE * max(x, 0)``; for int8 input the negative part is passed through
    ``ceil`` first. float16, int8 and int32 results are cast back to the
    input dtype.
    """
    origin_dtype = input_data.dtype
    compute_dtype = "float32" if origin_dtype in ("float16", "float32") else "float16"
    input_data = topi.cast(input_data, compute_dtype)

    def _const(value):
        return tvm.const(value, dtype=compute_dtype)

    # Zero tensor with the input's shape, used to split the two halves.
    zeros = topi.multiply(input_data, _const(0))
    negative_part = topi.minimum(input_data, zeros)
    positive_part = topi.maximum(input_data, zeros)

    exp_minus_one = topi.add(exp(negative_part), _const(SCALAR_NEGATIVE_ONE))
    negative_res = topi.multiply(exp_minus_one, _const(SCALE_ALPHA_PRODUCT))
    if origin_dtype == "int8":
        negative_res = akg.lang.cce.ceil(negative_res)

    positive_res = topi.multiply(positive_part, _const(SCALE))
    res = topi.add(negative_res, positive_res)

    # Restore the caller's dtype where it differs from the compute dtype.
    if origin_dtype in ("float16", "int8", "int32"):
        res = topi.cast(res, origin_dtype)
    return res
Ejemplo n.º 5
0
def matrix_set_diag_compute(input_matrix, input_diagonal, input_help):
    """Build ``input_matrix * |input_help - 1| + diagonal * input_help``.

    ``input_diagonal`` is broadcast to the matrix shape first.
    ``input_help`` is presumably a 0/1 mask marking the diagonal positions
    (TODO confirm against the caller).

    Integer inputs are computed in floating point: int8/uint8 in float16,
    int32 in float32 (going through float16 on mini, both on the way in and
    on the way out). The result is cast back to the input dtype.
    """
    shape_input = get_shape(input_matrix)
    input_dtype = input_matrix.dtype
    int32_on_mini = input_dtype == "int32" and product_is_mini()

    # Sequence of dtypes all three inputs are cast through, in order.
    if input_dtype in ("int8", "uint8"):
        cast_chain = ("float16",)
    elif int32_on_mini:
        cast_chain = ("float16", "float32")
    elif input_dtype == "int32":
        cast_chain = ("float32",)
    else:
        cast_chain = ()

    for step_dtype in cast_chain:
        input_matrix = topi.cast(input_matrix, step_dtype)
        input_diagonal = topi.cast(input_diagonal, step_dtype)
        input_help = topi.cast(input_help, step_dtype)

    diag_full = topi.broadcast_to(input_diagonal, shape_input)
    # |help - 1| is the complement of help when help holds 0/1 values.
    help_complement = topi.abs(topi.add(input_help, -1))

    kept_matrix = topi.multiply(input_matrix, help_complement)
    placed_diag = topi.multiply(diag_full, input_help)
    res = topi.add(kept_matrix, placed_diag)

    if int32_on_mini:
        res = topi.cast(res, "float16")

    return topi.cast(res, input_dtype)
Ejemplo n.º 6
0
def asinh(x, target=utils.CCE):
    r"""
    Compute the inverse hyperbolic sine.

    .. math:: asinh(x) = sign(x) * log(|x|+\sqrt{|x|*|x|+1})

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32.
        target: forwarded to ``Sign`` and, on mini, to ``log_compute_mini_impl``.

    Returns:
       tvm.tensor.Tensor, has the same type and shape as x. On mini a tuple
       ``(tensor, attrs)`` is returned, where attrs disables auto inline.

    Supported Platforms:
        'Ascend'
    """
    utils.check_shape(x)
    utils.ops_dtype_check(x.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    origin_dtype = x.dtype

    # asinh(-x) = -asinh(x). For a large negative x, x + sqrt(x*x+1) gets close
    # to zero, so the logarithm is taken on |x| and the sign restored later.
    if origin_dtype == "float16":
        # float32 avoids overflow and improves accuracy.
        x = topi.cast(x, "float32")

    magnitude = topi.abs(x)
    on_mini = product_is_mini()

    if on_mini:
        # sqrt(|x|*|x| + 1) = |x| * sqrt(1 + 1/(|x|*|x|))
        inv_square = topi.divide(1, topi.multiply(magnitude, magnitude))
        root_factor = sqrt_mini_newton_iter_impl(topi.add(1, inv_square))
        root = topi.multiply(magnitude, root_factor)
    else:
        root = topi.sqrt(topi.add(topi.multiply(magnitude, magnitude), 1))

    log_arg = topi.add(magnitude, root)
    if on_mini:
        log_value = log_compute_mini_impl(log_arg, target)
    else:
        log_value = topi.log(log_arg)

    res = topi.multiply(Sign(x, target), log_value)
    if res.dtype != origin_dtype:
        res = topi.cast(res, origin_dtype)

    if on_mini:
        return res, {"enable_auto_inline": False}
    return res
Ejemplo n.º 7
0
def _update_m(m, beta, grad):
    """Return ``m * beta + grad * (1 - beta)``."""
    # 1 - beta is built as (-1 * beta) + 1.
    neg_beta = topi.multiply(beta, tvm.const(-1, beta.dtype))
    one_minus_beta = topi.add(neg_beta, tvm.const(1, neg_beta.dtype))

    weighted_m = topi.multiply(m, beta)
    weighted_grad = topi.multiply(grad, one_minus_beta)
    return topi.add(weighted_m, weighted_grad)
Ejemplo n.º 8
0
def _update_var(decay_gm, alpha, lr, grad, var):
    """Return ``var - lr * (alpha + decay_gm) * grad``."""
    step = topi.multiply(topi.add(decay_gm, alpha), lr)
    step = topi.multiply(step, grad)
    # Subtraction is expressed as adding the negated step.
    neg_step = topi.multiply(step, tvm.const(-1, step.dtype))
    return topi.add(var, neg_step)
Ejemplo n.º 9
0
def fake_quant_with_min_max_args(input_data,
                                 min_=-6,
                                 max_=6,
                                 num_bits=8,
                                 narrow_range=False):
    """
    Fake-quantize the float32 'input_data' tensor into a tensor of the same type.

    output_data = floor(clamped_shifted * inv_nudged_scale + 0.5f) * scale
                  + nudged_min
    where clamped_shifted = clamp(input_data, nudged_min, nudged_max) - nudged_min
    and nudged_min, nudged_max, scale are returned by nudge_min_max.

    Args:
        input_data (tvm.tensor.Tensor): Tensor of dtype "float32"
        min_ ([float, int]): scalar, defaults to -6
        max_ ([float, int]): scalar, defaults to 6. [min_; max_] define the
                             clamping range for the input_data data
        num_bits ([float, int]): Defaults to 8. num_bits is the bitwidth
                                 of the quantization,between 2 and 16
        narrow_range ([bool]):
            True, quantized into the quantization range [1; 2^num_bits - 1]
            False,quantized into the quantization range [0; 2^num_bits - 1]

    Returns:
        tvm.tensor.Tensor
    """
    utils.check_shape(get_shape(input_data))

    dtype = input_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, scale = nudge_min_max(min_, max_, num_bits,
                                                  narrow_range)

    # Scalar bounds are turned into tensors by adding them to a zero tensor.
    zero_tensor = tvm.compute(input_data.shape,
                              lambda *i: tvm.const(0, dtype="float32"),
                              name="zero_tensor")
    upper_bound = topi.add(zero_tensor, nudged_max)
    lower_bound = topi.add(zero_tensor, nudged_min)
    inv_nudged_scale = 1.00 / scale

    # Clamp the input into [nudged_min, nudged_max].
    clamped = topi.maximum(topi.minimum(input_data, upper_bound), lower_bound)

    # Quantize: shift to zero, scale, round via floor(v + 0.5).
    shifted = topi.subtract(clamped, lower_bound)
    scaled = topi.multiply(shifted, inv_nudged_scale)
    quant_level = floor(topi.add(scaled, 0.5))
    quant_level = akg.lang.ascend.cast_to(quant_level, dtype)

    # Dequantize back into the original value range.
    return topi.add(topi.multiply(quant_level, scale), lower_bound)
Ejemplo n.º 10
0
def _compute_log(data_input):
    """Return ``0.5 * log((1 + x) / (1 - x))``, i.e. atanh(x)."""
    dtype = data_input.dtype

    one_plus_x = topi.add(data_input, dc.one_const(dtype))
    # 1 - x is built as (-1 * x) + 1.
    neg_x = topi.multiply(data_input, dc.neg_one_const(dtype))
    one_minus_x = topi.add(neg_x, dc.one_const(dtype))

    ratio = one_plus_x / one_minus_x
    return topi.multiply(log.log(ratio), dc.half_const(dtype))
Ejemplo n.º 11
0
def _compute_log(data_input, target=utils.CCE):
    """Return ``0.5 * log((1 + x) / (1 - x))``, i.e. atanh(x).

    ``target`` is forwarded to ``log``.
    """
    dtype = data_input.dtype

    one_plus_x = topi.add(data_input, dc.one_const(dtype))
    # 1 - x is built as (-1 * x) + 1.
    neg_x = topi.multiply(data_input, dc.neg_one_const(dtype))
    one_minus_x = topi.add(neg_x, dc.one_const(dtype))

    ratio = one_plus_x / one_minus_x
    return topi.multiply(log(ratio, target), dc.half_const(dtype))
Ejemplo n.º 12
0
def sgd_compute(parameters, gradient, learning_rate, accum, momentum, stat,
                dampening=0.0, weight_decay=0.0, nesterov=False):
    """Build one SGD-with-momentum update step.

    float16 inputs are promoted to float32 and the three outputs are cast
    back to float16. ``learning_rate`` and ``momentum`` are read at index 0.

    * if ``weight_decay != 0``: ``gradient += parameters * weight_decay``
    * ``accum_t = accum * momentum + gradient``, minus
      ``gradient * (1 - stat) * dampening`` when ``dampening != 0``
    * nesterov: ``parameters_t = parameters
      - (gradient * lr + accum_t * momentum * lr)``;
      otherwise ``parameters_t = parameters - accum_t * lr``
    * ``stat_t = (1 - stat) * NUM_ZERO``

    Returns:
        Tuple ``(parameters_t, accum_t, stat_t)``.
    """
    is_fp16 = parameters.dtype == "float16"
    if is_fp16:
        parameters = topi.cast(parameters, "float32")
        accum = topi.cast(accum, "float32")
        learning_rate = topi.cast(learning_rate, "float32")
        gradient = topi.cast(gradient, "float32")
        momentum = topi.cast(momentum, "float32")
        stat = topi.cast(stat, "float32")

    def _scale_by(tensor, scalar):
        # Multiply every element of `tensor` by element 0 of `scalar`.
        return tvm.compute(tensor.shape,
                           lambda *indice: tensor(*indice) * scalar[0])

    # A non-zero weight decay folds parameters * weight_decay into the gradient.
    if weight_decay != 0.0:
        parameters = topi.multiply(parameters, tvm.const(1.0, 'float32'))
        gradient = topi.add(gradient, topi.multiply(parameters, weight_decay))

    # 1 - stat, built as (-1 * stat) + 1.
    neg_stat = topi.multiply(stat, tvm.const(-1, "float32"))
    one_minus_stat = topi.add(neg_stat, tvm.const(1, "float32"))
    damp_scale = topi.multiply(one_minus_stat, dampening)

    # Update accum.
    momentum_accum = _scale_by(accum, momentum)
    damped_gradient = topi.multiply(gradient, damp_scale)
    accum_t = topi.add(momentum_accum, gradient)
    if dampening != 0.0:
        accum_t = topi.subtract(accum_t, damped_gradient)

    # Update parameters.
    if nesterov:
        grad_step = _scale_by(gradient, learning_rate)
        accum_step = _scale_by(_scale_by(accum_t, momentum), learning_rate)
        parameters_delta = topi.add(grad_step, accum_step)
    else:
        parameters_delta = _scale_by(accum_t, learning_rate)
    parameters_t = topi.subtract(parameters, parameters_delta)

    # Update stat.
    stat_t = topi.multiply(one_minus_stat, tvm.const(NUM_ZERO, 'float32'))

    if is_fp16:
        parameters_t = topi.cast(parameters_t, "float16")
        accum_t = topi.cast(accum_t, "float16")
        stat_t = topi.cast(stat_t, "float16")

    return parameters_t, accum_t, stat_t
Ejemplo n.º 13
0
Archivo: asin.py Proyecto: zhuyawen/akg
def _asin_compute(data_input):
    """Build the element-wise arcsine of ``data_input``.

    float16 input is promoted to float32 and the result cast back. The
    computation runs on ``|x|`` and blends two formulas with a mask derived
    from BOUNDARY (2^(-0.5) according to the original comments):

    * below the boundary: ``_taylor_compute(|x|)``;
    * otherwise: ``HALF_PI - _taylor_compute(sqrt(1 - x^2), 1 - x^2)``.

    The sign of the input is multiplied back in at the end.
    """
    origin_dtype = data_input.dtype
    boundary = tvm.const(BOUNDARY, "float32")

    if origin_dtype == "float16":
        data_input = topi.cast(data_input, "float32")

    data_sign = sign(data_input)
    # x * sign(x) gives the magnitude.
    magnitude = topi.multiply(data_input, data_sign)

    # Mask for the lower range: floor(min(|x|, boundary) - boundary), negated.
    below = topi.subtract(topi.minimum(magnitude, boundary), boundary)
    below_floor = akg.lang.cce.floor(below)
    # The floor result is int32 and must become float32; mini goes through
    # float16 on the way.
    if utils.product_is_mini():
        below_floor = topi.cast(below_floor, "float16")
    below_floor = topi.cast(below_floor, "float32")
    mask_low = topi.multiply(below_floor, neg_one_const("float32"))

    res_low = topi.multiply(_taylor_compute(magnitude), mask_low)

    # Upper range: complementary mask, evaluated on sqrt(1 - x^2).
    mask_high = topi.subtract(one_const("float32"), mask_low)
    one_minus_square = topi.subtract(one_const("float32"),
                                     topi.multiply(magnitude, magnitude))
    root = _sqrt(one_minus_square)
    taylor_high = _taylor_compute(root, one_minus_square)

    res_high = topi.multiply(taylor_high, neg_one_const("float32"))
    res_high = topi.add(res_high, tvm.const(HALF_PI, "float32"))
    res_high = topi.multiply(res_high, mask_high)

    # Combine both ranges and restore the sign.
    res = topi.multiply(topi.add(res_low, res_high), data_sign)

    if origin_dtype == "float16":
        res = topi.cast(res, "float16")
    return res
Ejemplo n.º 14
0
 def _log_taylor(data):
     """Approximate log(data) with a five-term polynomial in ``x = data - 1``.

     log(1+x) = ((((0.2x - 0.25)x + 0.33333)x - 0.5)x + 1)x, evaluated with
     Horner's scheme.
     """
     shifted = topi.subtract(data, 1)
     coefficients = [0.2, -0.25, 1 / 3, -0.5, 1]
     acc = topi.multiply(shifted, coefficients[0])
     for coefficient in coefficients[1:]:
         # One Horner step: (acc + c) * x.
         acc = topi.multiply(topi.add(acc, coefficient), shifted)
     return acc
Ejemplo n.º 15
0
def fake_quant_with_min_max_args_gradient(input_gradients,
                                          input_data,
                                          min=-6,
                                          max=6,
                                          num_bits=8,
                                          narrow_range=False):
    """
    Compute gradients of fake-quantize with respect to 'input_data'.

    output_backprops = input_gradients * mask, where the mask comes from
    _cmpare_value(input_data, nudged_min, nudged_max) and is intended to be
    1 where nudged_min <= input_data <= nudged_max and 0 elsewhere.

    Args:
        input_gradients (tvm.tensor.Tensor): gradients from the previous operation,
                                             only supports "float32"
        input_data (tvm.tensor.Tensor): input of fake-quantize, only supports "float32"
        min ([float, int]): scalar, defaults to -6
        max ([float, int]): scalar, defaults to 6. [min; max] define the
                            clamping range for the input_data data
        num_bits ([float, int]): Defaults to 8. num_bits is the bitwidth
                                 of the quantization,between 2 and 16
        narrow_range ([bool]):
            True, quantized into the quantization range [1; 2^num_bits - 1]
            False,quantized into the quantization range [0; 2^num_bits - 1]

    Returns:
        tvm.tensor.Tensor
    """
    utils.check_shape(get_shape(input_data))
    utils.elemwise_shape_check(input_gradients.shape, input_data.shape)

    for checked in (input_data, input_gradients):
        utils.ops_dtype_check(checked.dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, _ = nudge_min_max(min, max, num_bits,
                                              narrow_range)

    # Scalar bounds are turned into tensors by adding them to a zero tensor.
    zero_tensor = tvm.compute(input_data.shape,
                              lambda *i: tvm.const(0, dtype="float32"),
                              name="zero_tensor")
    upper_bound = topi.add(zero_tensor, nudged_max)
    lower_bound = topi.add(zero_tensor, nudged_min)

    # Mask of the elements lying inside [nudged_min, nudged_max].
    in_range_mask = _cmpare_value(input_data, lower_bound, upper_bound)

    return topi.multiply(input_gradients, in_range_mask)
Ejemplo n.º 16
0
def _tan_expand(input_x):
    """Evaluate the Taylor expansion of tan x.

    tan x = x + x^3/3 + 2*x^5/15 + 17*x^7/315 + 62*x^9/2835
            + 1382*x^11/155925... (|x| < pi/2)

    TAN_EXPANSION_ORDER terms beyond ``x`` are added. float16 input on mini
    uses explicit float16 constants for the coefficients.
    """
    # Taylor expansion coefficients of x^3, x^5, ...
    factors = [1/3, 2/15, 17/315, 62/2835, 1382/155925]
    x_square = topi.multiply(input_x, input_x)
    use_fp16_const = input_x.dtype == FLOAT_16 and utils.product_is_mini()

    odd_power = input_x
    res = input_x
    for order in range(TAN_EXPANSION_ORDER):
        # Raise the odd power by two: x^(2k+1) -> x^(2k+3).
        odd_power = topi.multiply(x_square, odd_power)
        if use_fp16_const:
            coefficient = tvm.const(factors[order], FLOAT_16)
        else:
            coefficient = factors[order]
        res = topi.add(res, topi.multiply(odd_power, coefficient))
    return res
Ejemplo n.º 17
0
def _apply_adadelta_compute(var, accum, accum_update, grad, lr, rho, epsilon):
    """Build one Adadelta update step.

    float16 inputs are promoted to float32 and the outputs cast back.

    * ``accum_res = accum * rho + grad^2 * (1 - rho)``
    * ``update = sqrt(accum_update + epsilon) * rsqrt(accum_res + epsilon) * grad``
    * ``var_res = var - update * lr``
    * ``accum_update_res = accum_update * rho + update^2 * (1 - rho)``

    Returns:
        Tuple ``(var_res, accum_res, accum_update_res)``.
    """
    is_fp16 = var.dtype == "float16"
    if is_fp16:
        var = topi.cast(var, "float32")
        accum = topi.cast(accum, "float32")
        accum_update = topi.cast(accum_update, "float32")
        lr = topi.cast(lr, "float32")
        rho = topi.cast(rho, "float32")
        grad = topi.cast(grad, "float32")

    # Broadcast the scalars to the shape of var.
    epsilon = tvm.const(epsilon, "float32")
    ones = akg.lang.ascend.broadcast(tvm.const(1, "float32"), var.shape)
    rho_full = topi.broadcast_to(rho, var.shape)
    one_minus_rho = topi.subtract(ones, rho_full)
    epsilon_full = akg.lang.ascend.broadcast(epsilon, var.shape)

    # accum = accum * rho + grad ** 2 * (1 - rho)
    decayed_accum = topi.multiply(accum, rho_full)
    grad_term = topi.multiply(topi.multiply(grad, grad), one_minus_rho)
    accum_res = akg.lang.ascend.vadd(grad_term, decayed_accum)

    # update = (accum_update + epsilon).sqrt * (accum + epsilon).rsqrt * grad
    sqrt_update = sqrt(topi.add(accum_update, epsilon_full), target=utils.CCE)
    rsqrt_accum = rsqrt(topi.add(accum_res, epsilon_full), target=utils.CCE)
    update = topi.multiply(topi.multiply(grad, rsqrt_accum), sqrt_update)

    # var -= update * lr
    lr_full = topi.broadcast_to(lr, var.shape)
    var_res = topi.subtract(var, topi.multiply(update, lr_full))

    # accum_update = rho * accum_update + (1 - rho) * update.square
    decayed_update = topi.multiply(accum_update, rho_full)
    update_term = topi.multiply(topi.multiply(update, update), one_minus_rho)
    accum_update_res = akg.lang.ascend.vadd(update_term, decayed_update)

    if is_fp16:
        var_res = topi.cast(var_res, "float16")
        accum_res = topi.cast(accum_res, "float16")
        accum_update_res = topi.cast(accum_update_res, "float16")

    return var_res, accum_res, accum_update_res
Ejemplo n.º 18
0
def _bessel_i0e_compute(input_data):
    """Build the exponentially scaled Bessel function I0e of ``input_data``.

    float16 input is promoted to float32 and the result cast back. Two
    polynomial approximations in t = |x| / CONST_LIMIT are evaluated (the
    original comments give CONST_LIMIT as 3.75) and the element-wise minimum
    of the two is returned.
    """
    shape_input = input_data.shape
    dtype_input = input_data.dtype

    def _polynomial(variable, coefficients, count):
        # Horner evaluation, highest coefficient first; the first step uses
        # topi directly and the remaining steps go through `mul`.
        acc = topi.multiply(variable, coefficients[count - 1])
        acc = topi.add(acc, coefficients[count - 2])
        for coefficient in coefficients[count - 3::-1]:
            acc = mul(acc, variable, target=utils.CCE)
            acc = topi.add(acc, coefficient)
        return acc

    # Choose the compute dtype.
    if dtype_input == "float16":
        input_data = Cast(input_data, "float32", target=utils.CCE)
    abs_data = Abs(input_data, target=utils.CCE)

    # |x| <= 3.75:
    # I0e = e^-|x|(1 + 3.5156229t^2 + 3.0899424t^4 + 1.2067492t^6 + 0.2659732t^8
    #       + 0.0360768t^10 + 0.0045813t^12)
    limit_full = akg.lang.ascend.broadcast(
        akg.tvm.const(CONST_LIMIT, "float32"), shape_input)
    clipped_abs = minimum(abs_data, limit_full)
    ratio = topi.multiply(clipped_abs, 1.0 / CONST_LIMIT)
    ratio_square = mul(ratio, ratio, target=utils.CCE)
    small_res = _polynomial(ratio_square, ITR_BEFORE, LEN_BEFORE)
    exp_neg_abs = Exp(neg(clipped_abs, target=utils.CCE), target=utils.CCE)
    small_res = mul(small_res, exp_neg_abs, target=utils.CCE)

    # |x| >= 3.75:
    # I0e(x) = (1 / sqrt(|x|))*(0.39894228 + 0.01328592t^-1 + 0.00225319t^-2 + -0.00157565t^-3
    #           + 0.00916281t^-4 + -0.02057706t^-5 + 0.02635537t^-6 + -0.01647633t^-7
    #           + 0.00392377t^-8)
    inverse_ratio = Divide(limit_full, abs_data, target=utils.CCE)
    large_res = _polynomial(inverse_ratio, ITR_AFTER, LEN_AFTER)
    large_res = mul(large_res, rsqrt(abs_data, target=utils.CCE), target=utils.CCE)

    res = minimum(small_res, large_res, target=utils.CCE)

    # Restore the caller's dtype.
    if dtype_input == "float16":
        res = Cast(res, "float16", target=utils.CCE)

    return res
Ejemplo n.º 19
0
def bool_both_zero_compute(juduged_min, juduged_max):
    """Build a tensor telling whether min and max are both zero.

    If min and max are both zero the fake-quant output is all zero, so callers
    need this tensor to handle that case. It is built as
    ``less(s, 0) + less(0, s)`` with ``s = |min| + |max|``; assuming
    less_compare_float32 yields a 0/1 mask (TODO confirm), the result is 0
    where both inputs are zero and 1 elsewhere.
    """
    dtype = juduged_min.dtype
    zeros = topi.full(juduged_min.shape, dtype, dc.zero_const(dtype))

    # |min| + |max| is zero only when both inputs are zero.
    abs_sum = topi.add(topi.abs(juduged_min), topi.abs(juduged_max))

    sum_below_zero = less_compare_float32(abs_sum, zeros)
    sum_above_zero = less_compare_float32(zeros, abs_sum)
    return topi.add(sum_below_zero, sum_above_zero)
Ejemplo n.º 20
0
def fused_l2loss_grad(data_f16,
                      data_f32,
                      layout='NHWC',
                      fill_data=4e-05,
                      target=utils.CUDA):
    """
    Fused operator computing ``cast(data_f16, float32) + fill_data * data_f32``.

    Args:
        data_f16: tvm.tensor.Tensor, cast to float32 before the addition;
                  transposed with axes (0, 2, 3, 1) when layout is "NCHW".
        data_f32: tvm.tensor.Tensor, scaled by fill_data.
        layout: "NHWC" or "NCHW".
        fill_data: scalar factor applied to data_f32.
        target: not used inside this function.

    Returns:
        tvm.tensor.Tensor.

    Raises:
        NotImplementedError: if layout is neither "NHWC" nor "NCHW".
    """
    if layout not in ("NCHW", "NHWC"):
        raise NotImplementedError('Layout not supported {} '.format(layout))
    if layout == "NCHW":
        data_f16 = topi.transpose(data_f16, axes=(0, 2, 3, 1))

    data_f16 = topi.cast(data_f16, 'float32')
    # Tensor shaped like data_f32 and filled with the float32 scalar.
    fill_value = topi.cast(fill_data, 'float32')
    fill_tensor = topi.full_like(data_f32, fill_value)
    scaled = topi.multiply(fill_tensor, data_f32)

    return topi.add(data_f16, scaled)
Ejemplo n.º 21
0
def fused_bn_update(input1, input2, input3, input4, dtype, c1, c2, c3, c4):
    """
    Fused operator.

    With ``sigma2 = c1 * input2 - (c1 * input1)^2`` the outputs are:

    * ``rsqrt(sigma2 + c2)``
    * ``c4 * (input3 - c3 * sigma2)``
    * ``c4 * (input4 - c1 * input1)``

    Args:
        input1 ~ input4: tvm.tensor.Tensor.
        dtype: dtype used for the four constants.
        c1 ~ c4: const.

    Returns:
        Three output (tuple of tvm.tensor.Tensor).
    """
    scale = tvm.const(c1, dtype)
    scaled_input2 = topi.multiply(input2, scale)
    scaled_input1 = topi.multiply(input1, scale)
    sigma2 = topi.subtract(scaled_input2,
                           topi.multiply(scaled_input1, scaled_input1))

    rsqrt_val = topi.rsqrt(topi.add(sigma2, tvm.const(c2, dtype)))

    scaled_sigma2 = topi.multiply(sigma2, tvm.const(c3, dtype))
    factor = tvm.const(c4, dtype)
    data1 = topi.multiply(factor, topi.subtract(input3, scaled_sigma2))
    data2 = topi.multiply(factor, topi.subtract(input4, scaled_input1))

    return (rsqrt_val, data1, data2)
Ejemplo n.º 22
0
def batch_matmul_4D(data1, data2, bias=None, out_dtype="float32", layout1="NHDT", layout2="NHDT", layout_out="NHDT"):
    """Build a 4-D batched matrix multiplication.

    The layout strings name the axes of each operand: N and H are batch axes,
    T is the reduction axis, and D is the row axis of data1 (``m``) or the
    column axis of data2 (``n``). The position of D/T in the last two axes
    decides how each operand is indexed. The output has shape
    ``(N, H, m, n)`` with the batch sizes taken from data1.

    Args:
        data1, data2: 4-D tensors.
        bias: optional tensor added to the product with topi.add.
        out_dtype: when "float32" both operands go through ``astype("float")``
                   before being multiplied.
        layout1, layout2: axis layouts of data1 and data2.
        layout_out: when not "NHDT" the result goes through auto_out_transpose.

    Returns:
        The result tensor.
    """
    lhs_axes = layout1.replace('N', 'B').replace('H', 'b').replace('D', 'm').replace('T', 'k')
    rhs_axes = layout2.replace('N', 'B').replace('H', 'b').replace('D', 'n').replace('T', 'k')

    # Map every axis letter to its extent in the corresponding operand.
    lhs_extent = {}
    rhs_extent = {}
    for position, lhs_axis in enumerate(lhs_axes):
        lhs_extent[lhs_axis] = data1.shape[position]
        rhs_extent[rhs_axes[position]] = data2.shape[position]

    reduce_axis = tvm.reduce_axis((0, lhs_extent['k']), name='reduce_axis')

    def _lhs_at(B, b, i):
        row = i if lhs_axes[2] == 'm' else reduce_axis
        col = reduce_axis if lhs_axes[3] == 'k' else i
        return data1[B, b, row, col]

    def _rhs_at(B, b, j):
        row = j if rhs_axes[2] == 'n' else reduce_axis
        col = reduce_axis if rhs_axes[3] == 'k' else j
        return data2[B, b, row, col]

    out_shape = (lhs_extent['B'], lhs_extent['b'], lhs_extent['m'], rhs_extent['n'])
    if out_dtype == "float32":
        res = tvm.compute(out_shape, lambda B, b, i, j: tvm.sum(
            _lhs_at(B, b, i).astype("float") * _rhs_at(B, b, j).astype("float"),
            axis=reduce_axis))
    else:
        res = tvm.compute(out_shape, lambda B, b, i, j: tvm.sum(
            _lhs_at(B, b, i) * _rhs_at(B, b, j), axis=reduce_axis))

    if bias is not None:
        res = topi.add(res, bias)

    if layout_out != "NHDT":
        res = auto_out_transpose(res, layout_out)
    return res
Ejemplo n.º 23
0
def _asin_grad_compute(x, dy):
    """Build the asin gradient ``dy / sqrt(1 - x^2)``.

    float16 inputs are promoted to float32 and the result cast back.
    """
    is_fp16 = x.dtype == "float16"
    if is_fp16:
        x = topi.cast(x, "float32")
        dy = topi.cast(dy, "float32")

    # Step 1: 1 - x^2, built as (-1 * x^2) + 1.
    neg_x_square = topi.multiply(topi.multiply(x, x), tvm.const(-1, "float32"))
    one_minus_x_square = topi.add(neg_x_square, tvm.const(1, "float32"))

    # Step 2: dy * (1 / sqrt(1 - x^2)).
    if utils.product_is_mini():
        # mini: Newton's method for a high-accuracy reciprocal square root.
        res = topi.multiply(_vrsqrt_newton(one_minus_x_square), dy)
    else:
        # cloud: a plain division is the efficient choice.
        res = topi.divide(dy, topi.sqrt(one_minus_x_square))

    if is_fp16:
        res = topi.cast(res, "float16")
    return res
def fused_relu_grad_bn_double_update_grad(data_1, data_2, data_3, data_4, data_5, data_6, data_7, layout='NHWC'):
    """Fused ReLU-grad plus two batch-norm update-grad reductions.

    With ``g = where(data_7 > 0, data_5 + data_6, 0)`` cast to float32 and
    ``n, h, w`` the first three extents of data_7 (NHWC), the outputs are:

    * ``sum(g)`` over axes (0, 1, 2)
    * ``sum(g * (data_2 - data_1 / (n*h*w)))`` over axes (0, 1, 2)
    * ``sum(g * (data_4 - data_3 / (n*h*w)))`` over axes (0, 1, 2)

    Args:
        data_1, data_3: tensors scaled by 1/(n*h*w) and broadcast to the
            shapes of data_2 and data_4 respectively.
        data_2, data_4, data_5, data_6, data_7: 4-D tensors in ``layout``.
        layout: "NHWC" or "NCHW". NCHW tensors are transposed to NHWC first.

    Returns:
        List of the three reduced tensors.

    Raises:
        NotImplementedError: if layout is neither "NHWC" nor "NCHW".
    """
    if layout not in ("NCHW", "NHWC"):
        raise NotImplementedError( 'Layout not supported {} '.format(layout))
    if layout == "NCHW":
        # Rebind the tensors themselves. Assigning to a loop variable would
        # discard the transposed results and leave the NHWC reductions below
        # running on NCHW data.
        data_2, data_4, data_5, data_6, data_7 = [
            topi.transpose(tensor, axes=(0, 2, 3, 1))
            for tensor in (data_2, data_4, data_5, data_6, data_7)]

    # ReLU gradient: pass data_5 + data_6 through where data_7 is positive.
    data_tmp1 = topi.full_like(data_7, 0.0)
    data_tmp2 = topi.greater(data_7, data_tmp1)
    data_tmp3 = topi.add(data_5, data_6)
    data_tmp4 = topi.where(data_tmp2, data_tmp3, data_tmp1)
    data_tmp5 = topi.cast(data_tmp4, 'float32')
    data_tmp7 = topi.sum(data_tmp5, axis=(0, 1, 2))

    # First reduction: centre data_2 with data_1 / (n*h*w).
    n, h, w, _ = data_7.shape
    data_tmp8 = topi.cast(data_2, 'float32')
    data_tmp9 = topi.full_like(data_tmp7, 1.0/(n*h*w))
    data_tmp10 = topi.multiply(data_1, data_tmp9)
    data_tmp11 = topi.broadcast_to(data_tmp10, data_tmp8.shape)
    data_tmp12 = topi.subtract(data_tmp8, data_tmp11)
    data_tmp13 = topi.multiply(data_tmp5, data_tmp12)
    data_tmp15 = topi.sum(data_tmp13, axis=(0, 1, 2))

    # Second reduction: centre data_4 with data_3 / (n*h*w).
    data_tmp16 = topi.cast(data_4, 'float32')
    data_tmp17 = topi.multiply(data_3, data_tmp9)
    data_tmp18 = topi.broadcast_to(data_tmp17, data_tmp16.shape)
    data_tmp19 = topi.subtract(data_tmp16, data_tmp18)
    data_tmp20 = topi.multiply(data_tmp5, data_tmp19)
    data_tmp22 = topi.sum(data_tmp20, axis=(0, 1, 2))

    return [data_tmp7, data_tmp15, data_tmp22]
Ejemplo n.º 25
0
def fake_quant_with_min_max_vars_per_channel_gradient_compute(
        input_gradients,
        inputs_data,
        min_broadcast,
        max_broadcast,
        num_bits=8,
        narrow_range=False):
    """Build the gradient tensors of a FakeQuantWithMinMaxVarsPerChannel op.

    Args:
        input_gradients: incoming gradient tensor.
        inputs_data: forward input of the fake-quant op.
        min_broadcast: per-channel minimum (presumably already broadcast to
            the shape of ``inputs_data`` -- confirm with caller).
        max_broadcast: per-channel maximum, same remark as ``min_broadcast``.
        num_bits: forwarded to ``nudged_min_max_compute``.
        narrow_range: forwarded to ``nudged_min_max_compute``.

    Returns:
        Tuple ``(backprops_input, output_backprop_min, output_backprop_max)``.
        The min/max gradients are summed over every axis except the last one;
        for inputs with at most one axis they are returned unreduced.
    """
    # every axis but the last is reduced for the min/max gradients
    reduce_axes = list(range(len(get_shape(inputs_data)) - 1))

    nudged_min, nudged_max, _ = nudged_min_max_compute(
        min_broadcast, max_broadcast, num_bits, narrow_range)

    # NOTE(review): per the formulas below this mask is expected to be 0 where
    # min and max are both zero and 1 elsewhere -- confirm in the helper.
    not_both_zero = bool_both_zero_compute(min_broadcast, max_broadcast)
    both_zero = _bool_negate(not_both_zero)

    below_or_at_max = _less_equal_compare_float32(inputs_data, nudged_max)
    above_or_at_min = _less_equal_compare_float32(nudged_min, inputs_data)
    inside_range = topi.multiply(below_or_at_max, above_or_at_min)

    # input gradient: pass-through inside [nudged_min, nudged_max], else 0 ...
    grad_inside = topi.multiply(inside_range, input_gradients)
    grad_inside = topi.multiply(grad_inside, not_both_zero)
    # ... except that the full gradient passes where min and max are both zero
    grad_passthrough = topi.multiply(input_gradients, both_zero)
    backprops_input = topi.add(grad_inside, grad_passthrough)

    # min gradient: input_gradients where inputs_data < nudged_min, else 0,
    # and 0 where min and max are both zero
    below_min = _bool_negate(above_or_at_min)
    grad_min = topi.multiply(below_min, input_gradients)
    grad_min = topi.multiply(grad_min, not_both_zero)
    output_backprop_min = topi.sum(grad_min, reduce_axes) if reduce_axes else grad_min

    # max gradient: input_gradients where inputs_data > nudged_max, else 0,
    # and 0 where min and max are both zero
    above_max = _bool_negate(below_or_at_max)
    grad_max = topi.multiply(above_max, input_gradients)
    grad_max = topi.multiply(grad_max, not_both_zero)
    output_backprop_max = topi.sum(grad_max, reduce_axes) if reduce_axes else grad_max

    return backprops_input, output_backprop_min, output_backprop_max
Ejemplo n.º 26
0
def fused_bn_follow(data0, data1, data2, data3, data4, target=utils.CUDA):
    """
    Normalise a conv2d output with precomputed batch-norm terms.

    input:
    data0: param0 beta
    data1: param1 gamma
    data2: param2 BNupdate: xi_variance
    data3: param6 BNreduce: xi_mean
    data4: param7 xi_conv2d
    target: accepted but not read by this function

    layout: the shape of data4 is unpacked as (N, H, W, C)

    output:
    beta + gamma * xi_variance * ( xi -  xi_mean/(N*H*W) ),
    where xi is data4 cast to float32
    """
    n, h, w, c = data4.shape
    element_count = n * h * w
    xi = topi.cast(data4, 'float32')

    # xi_mean / (N*H*W), lifted to 4-D and broadcast to the shape of xi
    mean = topi.divide(data3, element_count)
    mean = topi.expand_dims(mean, axis=0, num_newaxis=3)
    mean = topi.broadcast_to(mean, (n, h, w, c))

    centered = topi.subtract(xi, mean)
    scaled = topi.multiply(topi.multiply(centered, data2), data1)
    return topi.add(scaled, data0)
Ejemplo n.º 27
0
def fused_bn_follow_relu_avgpool(data0, data1, data2, data3, data4, data5, layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    Add a batch-normalised tensor to data0, apply ReLU, and average over H and W.

    input:
    data0: tensor1 after bn_double_relu
    data1..data5: forwarded, in this order, to fused_bn_follow
                  (data5 is the 4-D tensor that gets normalised)
    layout: only (N, H, W, C), (N, C, H, W) supported; for NCHW, data0 and
            data5 are transposed to NHWC first
    out_dtype: dtype of the returned tensor, float16 by default
    target: accepted but not read by this function

    output:
    avg-pooling( max(tensor1 + batch-normalized tensor2,  0) ) over axes (1, 2)

    raises:
    NotImplementedError if layout is neither 'NHWC' nor 'NCHW'
    """
    if layout not in ('NHWC', 'NCHW'):
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))
    if layout == 'NCHW':
        to_nhwc = (0, 2, 3, 1)
        data0 = topi.transpose(data0, to_nhwc)
        data5 = topi.transpose(data5, to_nhwc)

    _, h, w, _ = data0.shape

    # batch-normalise data5 and bring it to data0's dtype before the add
    normalized = fused_bn_follow(data1, data2, data3, data4, data5)
    normalized = topi.cast(normalized, data0.dtype)
    activated = topi.maximum(topi.add(data0, normalized), 0)

    # mean over H and W, accumulated in float32
    pooled = topi.sum(topi.cast(activated, 'float32'), axis=(1, 2))
    pooled = topi.divide(pooled, h * w)
    return topi.cast(pooled, out_dtype)
Ejemplo n.º 28
0
 def _sinh_2x(sinh_x):
     """Double-angle step: from sinh(x) build sinh(2x) = 2*sinh(x)*sqrt(sinh(x)^2+1)."""
     # sqrt(sinh(x)^2 + 1), which equals cosh(x)
     cosh_x = topi.sqrt(topi.add(topi.multiply(sinh_x, sinh_x), 1))
     return topi.multiply(2, topi.multiply(sinh_x, cosh_x))
Ejemplo n.º 29
0
def _compute_taylor(data_input):
    """Evaluate the truncated series atanh(x) = x + x^3/3 + x^5/5 + x^7/7.

    The polynomial is evaluated in Horner form on x^2:
    x * (1 + x^2 * (1/3 + x^2 * (1/5 + x^2/7))), with float32 constants.
    """
    x_squared = topi.multiply(data_input, data_input)

    # innermost term: 1/5 + x^2/7
    acc = topi.multiply(x_squared, tvm.const(1.0 / 7, "float32"))
    acc = topi.add(acc, tvm.const(1.0 / 5, "float32"))

    # remaining Horner steps: acc = coeff + x^2 * acc for coeff = 1/3, then 1
    for coeff in (1 / 3.0, 1.0):
        acc = topi.multiply(x_squared, acc)
        acc = topi.add(acc, tvm.const(coeff, "float32"))

    # final multiplication by x restores the odd powers
    return topi.multiply(data_input, acc)
Ejemplo n.º 30
0
Archivo: asin.py Proyecto: zhuyawen/akg
def _newton_iter(data, init_x):
    """Run one element-wise Newton step: x(n+1) = x(n) * (3 - a * x(n)^2) / 2.

    Here ``a`` is ``data`` and ``x(n)`` is ``init_x``; constants are float32.
    """
    # a * x^2
    scaled_square = topi.multiply(topi.multiply(init_x, init_x), data)
    # 3 - a * x^2, built as (a * x^2) * neg_one + 3
    # (neg_one_const presumably yields -1 -- confirm in its definition)
    correction = topi.multiply(scaled_square, neg_one_const("float32"))
    correction = topi.add(correction, tvm.const(3, "float32"))
    # x * (3 - a * x^2) * 0.5
    stepped = topi.multiply(correction, init_x)
    return topi.multiply(stepped, tvm.const(0.5, "float32"))