Example #1
def _atan_compute(data):
    """compute for atan"""
    dtype = data.dtype

    if dtype == "float16":
        data = topi.cast(data, "float32")

    abs_data = topi.abs(data)
    tensor_one = dc.one_const(abs_data.dtype)

    abs_data_sub_one = topi.subtract(abs_data, tensor_one)
    abs_data_add_one = topi.add(abs_data, tensor_one)
    abs_data2 = topi.abs(topi.divide(abs_data_sub_one, abs_data_add_one))

    # calculate the result for |x| <= 1
    res = _do_atan_taylor(abs_data)
    # calculate the result for |x| > 1
    res_mt_one = topi.add(_do_atan_taylor(abs_data2),
                          tvm.const(CONST_PI_BY_FOUR, abs_data2.dtype))
    res = topi.minimum(res, res_mt_one)

    if utils.product_is_mini() and data.dtype == "float32":
        sign_mask = topi.cast(topi.sign(topi.cast(data, "float16")), "float32")
    else:
        sign_mask = topi.sign(data)

    res = topi.multiply(res, sign_mask)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res
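A short note on the identity the |x| > 1 branch appears to rely on: for positive arguments, the arctangent addition formula maps values greater than one back into the range where the Taylor expansion is accurate,

.. math:: \arctan(x) = \frac{\pi}{4} + \arctan\!\left(\frac{x - 1}{x + 1}\right), \qquad x > 0

which is presumably why the code takes the minimum of the two branch results before restoring the sign of the input.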
Example #2
File: tan.py  Project: mindspore-ai/akg
def tan_compute(input_x):
    """tan compute implemention"""
    dtype = input_x.dtype

    # cast to float32 when dtype is float16 or float32, or int32 on cloud
    if dtype == FLOAT_16 or dtype == FLOAT_32 or (dtype == INT_32
                                                  and not product_is_mini()):
        input_x = topi.cast(input_x, FLOAT_32)
        # adjust x to [-pi/2,pi/2] using x = x-round(x/pi)*pi
        round_pi_div = akg.lang.ascend.round(
            topi.multiply(input_x, tvm.const(1.0 / PI, FLOAT_32)))
        round_pi_div = akg.lang.ascend.cast_to(round_pi_div, FLOAT_32)
        input_x = topi.subtract(
            input_x, topi.multiply(round_pi_div, tvm.const(PI, FLOAT_32)))
    # cast to type float16 when type is int32 in mini
    elif dtype == INT_32 and product_is_mini():
        input_x = topi.cast(input_x, FLOAT_16)
        # adjust x to [-pi/2,pi/2] using x = x-round(x/pi)*pi
        round_pi_div = akg.lang.ascend.round(
            topi.multiply(input_x, tvm.const(1.0 / PI, FLOAT_16)))
        round_pi_div = akg.lang.ascend.cast_to(round_pi_div, FLOAT_16)
        input_x = topi.subtract(
            input_x, topi.multiply(round_pi_div, tvm.const(PI, FLOAT_16)))

    res = _tan_2x_multi(input_x, TAN_2X_TIMES)
    # cast the dtype to original dtype
    res = topi.cast(res, dtype)
    return res
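As an illustration of the range reduction only (a NumPy sketch, not the akg kernel): tan is pi-periodic, so subtracting round(x/pi)*pi folds the argument into [-pi/2, pi/2] without changing the result.

import numpy as np

def tan_reference(x):
    x = np.asarray(x, dtype=np.float32)
    # fold the argument into [-pi/2, pi/2]; the value of tan is unchanged
    x = x - np.round(x / np.pi) * np.pi
    return np.tan(x)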
Example #3
def matrix_diag_part_compute(input_diagonal, input_help):
    """matrix_diag_part compute implemention"""
    shape_input_diagonal = get_shape(input_diagonal)
    dtype_input_diagonal = input_diagonal.dtype
    if dtype_input_diagonal == "int8" or dtype_input_diagonal == "uint8":
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
    if dtype_input_diagonal == "int32" and product_is_mini():
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")
    if dtype_input_diagonal == "int32" and not product_is_mini():
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")
    res_vmul = topi.multiply(input_help, input_diagonal)

    if shape_input_diagonal[-2] < shape_input_diagonal[-1]:
        res = topi.sum(res_vmul, -1)
    else:
        res = topi.sum(res_vmul, -2)

    if dtype_input_diagonal == "int32" and product_is_mini():
        res = topi.cast(res, "float16")

    res = topi.cast(res, dtype_input_diagonal)
    return res
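For intuition, input_help appears to be a 0/1 mask marking the main diagonal of the last two axes; under that assumption, a NumPy sketch of the same multiply-and-reduce is:

import numpy as np

def matrix_diag_part_reference(batch_matrix):
    rows, cols = batch_matrix.shape[-2:]
    # hypothetical stand-in for input_help: ones on the main diagonal, zeros elsewhere
    help_mask = np.eye(rows, cols, dtype=batch_matrix.dtype)
    masked = batch_matrix * help_mask
    # summing over the longer trailing axis leaves one value per diagonal entry
    return masked.sum(axis=-1 if rows < cols else -2)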
Example #4
def fused_relu_grad_bn_double_update_grad(data_1, data_2, data_3, data_4, data_5, data_6, data_7, layout='NHWC'):
    if layout == "NCHW":
        # transpose NCHW inputs to NHWC; reassigning only a loop variable would discard the result
        data_2, data_4, data_5, data_6, data_7 = [
            topi.transpose(t, axes=(0, 2, 3, 1))
            for t in (data_2, data_4, data_5, data_6, data_7)]
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {}'.format(layout))

    data_tmp1 = topi.full_like(data_7, 0.0)
    data_tmp2 = topi.greater(data_7, data_tmp1)
    data_tmp3 = topi.add(data_5, data_6)
    data_tmp4 = topi.where(data_tmp2, data_tmp3, data_tmp1)
    data_tmp5 = topi.cast(data_tmp4, 'float32')
    data_tmp7 = topi.sum(data_tmp5, axis=(0, 1, 2))

    n, h, w, c = data_7.shape
    data_tmp8 = topi.cast(data_2, 'float32')
    data_tmp9 = topi.full_like(data_tmp7, 1.0/(n*h*w))
    data_tmp10 = topi.multiply(data_1, data_tmp9)
    data_tmp11 = topi.broadcast_to(data_tmp10, data_tmp8.shape)
    data_tmp12 = topi.subtract(data_tmp8, data_tmp11)
    data_tmp13 = topi.multiply(data_tmp5, data_tmp12)
    data_tmp15 = topi.sum(data_tmp13, axis=(0, 1, 2))

    data_tmp16 = topi.cast(data_4, 'float32')
    data_tmp17 = topi.multiply(data_3, data_tmp9)
    data_tmp18 = topi.broadcast_to(data_tmp17, data_tmp16.shape)
    data_tmp19 = topi.subtract(data_tmp16, data_tmp18)
    data_tmp20 = topi.multiply(data_tmp5, data_tmp19)
    data_tmp22 = topi.sum(data_tmp20, axis=(0, 1, 2))

    return [data_tmp7, data_tmp15, data_tmp22]
Example #5
def fused_l2loss_grad(data_f16,
                      data_f32,
                      layout='NHWC',
                      fill_data=4e-05,
                      target=utils.CUDA):
    """
    fused_l2loss_grad.

    Args:
        data_f16 (tvm.tensor.Tensor): float16 input tensor.
        data_f32 (tvm.tensor.Tensor): float32 input tensor, scaled by fill_data and added to data_f16.

    Returns:
        tvm.tensor.Tensor.
    """
    if layout == "NCHW":
        data_f16 = topi.transpose(data_f16, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    data_f16 = topi.cast(data_f16, 'float32')
    constant_tmp = topi.cast(fill_data, 'float32')
    data_constant = topi.full_like(data_f32, constant_tmp)
    data_out = topi.multiply(data_constant, data_f32)
    data_out = topi.add(data_f16, data_out)

    return data_out
Example #6
def selu_compute(input_data):
    """selu compute implemention"""
    # cast float16/float32 input to float32; other dtypes (int8, int32) are cast to float16
    dtype = input_data.dtype
    if dtype == "float16" or dtype == "float32":
        input_data = topi.cast(input_data, "float32")
        type_tmp = "float32"
    else:
        input_data = topi.cast(input_data, "float16")
        type_tmp = "float16"

    # generate tensor_zero to be compared
    tensor_zero = topi.multiply(input_data, tvm.const(0, dtype=type_tmp))
    # split the input into its negative part (min with 0) and positive part (max with 0)
    negative_res = topi.minimum(input_data, tensor_zero)
    positive_res = topi.maximum(input_data, tensor_zero)
    exp_res = exp(negative_res)
    sub_res = topi.add(exp_res, tvm.const(SCALAR_NEGATIVE_ONE, dtype=type_tmp))
    negative_muls_res = topi.multiply(sub_res, tvm.const(SCALE_ALPHA_PRODUCT, dtype=type_tmp))
    if dtype == "int8":
        negative_muls_res = akg.lang.cce.ceil(negative_muls_res)

    positive_muls_res = topi.multiply(positive_res, tvm.const(SCALE, dtype=type_tmp))
    res = topi.add(negative_muls_res, positive_muls_res)
    # cast to ori_dtype
    if dtype == "float16" or dtype == "int8" or dtype == "int32":
        res = topi.cast(res, dtype)

    return res
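For context, this follows the standard SELU definition, where SCALE and SCALE_ALPHA_PRODUCT presumably hold the constants lambda and lambda*alpha:

.. math:: \operatorname{selu}(x) = \lambda x \quad (x > 0), \qquad \operatorname{selu}(x) = \lambda\alpha\,(e^{x} - 1) \quad (x \le 0)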
Example #7
def _compute_mini(data_input, shape):
    """
    Use log and taylor to compute
    arctanh has the feature: arctanh(-abs(x)) = -arctanh(abs(x))
    """

    data_abs = topi.abs(data_input)
    result_ln = _compute_log(data_abs)
    result_taylor = _compute_taylor(data_abs)

    data_abs = topi.cast(data_abs, "float16")
    data_input = topi.cast(data_input, "float16")
    result_taylor = topi.cast(result_taylor, "float16")
    result_ln = topi.cast(result_ln, "float16")
    # use the Taylor expansion when |x| < 0.5 and the log formula when 0.5 <= |x| < 1
    data_res = tvm.compute(shape,
                           lambda *i : akg.tvm.expr.Select(data_abs(*i) < dc.half_const("float16"),
                                                           result_taylor(*i),
                                                           result_ln(*i)),
                           name="le")

    # arctanh has the feature: arctanh(-abs(x)) = -arctanh(abs(x))
    data_res_neg = topi.multiply(data_res, dc.neg_one_const("float16"))
    data_res = tvm.compute(shape,
                           lambda *i : akg.tvm.expr.Select(data_input(*i) < dc.zero_const("float16"),
                                                           data_res_neg(*i),
                                                           data_res(*i)),
                           name="neg")
    return data_res
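The two branches correspond to the Maclaurin series of arctanh, which is accurate for small |x|, and its closed form:

.. math:: \operatorname{arctanh}(x) = x + \frac{x^{3}}{3} + \frac{x^{5}}{5} + \cdots = \frac{1}{2}\ln\frac{1 + x}{1 - x}, \qquad |x| < 1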
Example #8
File: asin_grad.py  Project: zhuyawen/akg
def _asin_grad_compute(x, dy):
    """Compute asin_grad."""

    dtype = x.dtype
    if dtype == "float16":
        x = topi.cast(x, "float32")
        dy = topi.cast(dy, "float32")

    # step 1: calculate num_to_vrsqrt = 1 - x^2
    data = topi.multiply(x, x)
    data = topi.multiply(data, tvm.const(-1, "float32"))
    num_to_vrsqrt = topi.add(data, tvm.const(1, "float32"))

    # step 2: calculate dy * (1 / sqrt(1 - x^2))
    if utils.product_is_mini():
        # mini: use newton's method for high accuracy result
        res = _vrsqrt_newton(num_to_vrsqrt)
        res = topi.multiply(res, dy)
    else:
        # cloud: use vdiv for high efficiency computation
        vsqrt_res = topi.sqrt(num_to_vrsqrt)
        res = topi.divide(dy, vsqrt_res)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res
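The computation follows directly from the derivative of arcsin:

.. math:: \frac{d}{dx}\arcsin(x) = \frac{1}{\sqrt{1 - x^{2}}}, \qquad \text{so} \quad dx = \frac{dy}{\sqrt{1 - x^{2}}}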
Example #9
def fused_bn_follow_relu_avgpool(data0, data1, data2, data3, data4, data5, layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    input:
    data: length is 6
    data0: tensor1 after bn_double_relu
    data1-5: bn parameters for conv2d tensor2
    layout: only (N, H, W, C), (N, C, H, W) supported
    out_dtype: float16

    output:
    avg-pooling( max(batch-normalized tensor1 + batch-normalized tensor2,  0) )
    """
    if layout == 'NCHW':
        data0 = topi.transpose(data0, (0, 2, 3, 1))
        data5 = topi.transpose(data5, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))

    n, h, w, c = data0.shape
    inter_dtype = 'float32'
    add0 = fused_bn_follow(data1, data2, data3, data4, data5)
    add0 = topi.cast(add0, data0.dtype)
    add1 = topi.add(data0, add0)
    output = topi.maximum(add1, 0)
    output = topi.cast(output, inter_dtype)
    output = topi.sum(output, axis=(1, 2))
    output = topi.divide(output, h * w)
    output = topi.cast(output, out_dtype)

    return output
Example #10
def atanh(input_data):
    """
    Return atanh(x)=0.5*ln((1+x)/(1-x)) if abs(x)<1.

    Args:
        input_data (tvm.tensor.Tensor): Input tensor, only support float16, float32.

    Returns:
        A tvm.tensor.Tensor as result of atanh.

    Supported Platforms:
        'Ascend'
    """
    shape = get_shape(input_data)
    utils.check_shape(shape)

    inp_dtype = input_data.dtype
    utils.ops_dtype_check(inp_dtype, utils.DtypeForDavinci.ALL_FLOAT)

    if inp_dtype == "float16":
        input_data = topi.cast(input_data, "float32")

    if product_is_mini():
        res = _compute_mini(input_data, shape)
    else:
        res = _compute_cloud(input_data)

    res = topi.cast(res, inp_dtype)

    return res
Example #11
def fused_bn_reduce(data, layout, out_dtype):
    """
    input:
    data:  4-D Tensor
    layout: input layout, only 'NCHW', 'NHWC' supported
    out_dtype: "float16" or "float32"
    
    output:
    out1_sum: 1-D tensor (C), per-channel sum of the input over the N, H, W axes
    out2_squared_sum: 1-D tensor (C), per-channel sum of the squared input over the N, H, W axes
    """

    if layout == "NCHW":
        data = topi.transpose(data, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    inter_dtype = 'float32'
    data_cast = topi.cast(data, inter_dtype)

    out1_sum = topi.sum(data_cast, axis=(0, 1, 2))
    out1_sum = topi.cast(out1_sum, out_dtype)

    squared = topi.multiply(data_cast, data_cast)
    out2_squared_sum = topi.sum(squared, axis=(0, 1, 2))
    out2_squared_sum = topi.cast(out2_squared_sum, out_dtype)

    return [out1_sum, out2_squared_sum]
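A minimal NumPy sketch of the same per-channel reduction, assuming an NHWC input (an illustrative reference, not the fused kernel); the two outputs are what a batch-norm pass needs to derive per-channel mean and variance:

import numpy as np

def bn_reduce_reference(data_nhwc, out_dtype=np.float32):
    data = data_nhwc.astype(np.float32)
    out1_sum = data.sum(axis=(0, 1, 2))                    # per-channel sum
    out2_squared_sum = (data * data).sum(axis=(0, 1, 2))   # per-channel sum of squares
    return out1_sum.astype(out_dtype), out2_squared_sum.astype(out_dtype)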
Example #12
def fused_mul_div_rsqrt_mul_isfinite_red(input1, input2, out_dtype):
    """
    fused operator.

    Args:
        input1: tvm.tensor.Tensor.
        input2: tvm.tensor.Tensor.
        out_dtype: output dtype for the cast results.

    Returns:
        list of tvm.tensor.Tensor.
    """
    mul_param1 = topi.multiply(input2, input2)
    divide_val = topi.divide(1, mul_param1)
    rsqrt_val = topi.rsqrt(divide_val)
    mul_param0 = topi.multiply(input1, rsqrt_val)
    isfinite = topi.isfinite(mul_param0)
    reduce_and = topi.all(isfinite)

    if mul_param0.dtype != out_dtype:
        mul_param0 = topi.cast(mul_param0, out_dtype)
        rsqrt_val = topi.cast(rsqrt_val, out_dtype)
        divide_val = topi.cast(divide_val, out_dtype)

    return [reduce_and, mul_param0, rsqrt_val, divide_val]
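Read end to end, the arithmetic chain reduces to input1 * |input2|, plus a single boolean reporting whether every element of that product is finite. A NumPy sketch under that reading (illustrative only):

import numpy as np

def mul_div_rsqrt_isfinite_reference(input1, input2):
    divide_val = 1.0 / (input2 * input2)
    rsqrt_val = 1.0 / np.sqrt(divide_val)          # mathematically equals |input2|
    mul_param0 = input1 * rsqrt_val
    reduce_and = np.all(np.isfinite(mul_param0))   # True only if no inf/nan appeared
    return reduce_and, mul_param0, rsqrt_val, divide_val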
Example #13
def sigmoid_cross_entropy_with_logits_grad_compute(predict, target, dout):
    """sigmoid_cross_entropy_with_logits_grad compute implementation"""
    dtype = predict.dtype
    if dtype == "float16":
        predict = topi.cast(predict, "float32")
        target = topi.cast(target, "float32")
        dout = topi.cast(dout, "float32")

    # e^x
    val1 = exp(predict)
    # 1 + e^x
    val2 = topi.add(val1, tvm.const(SCALAR_ONE, dtype="float32"))
    # e^x / (1 + e^x)
    val3 = topi.divide(val1, val2)
    # -target
    val4 = topi.multiply(target, tvm.const(SCALAR_NEGTIVE_ONE,
                                           dtype="float32"))
    # e^x / (1 + e^x) - y
    val5 = topi.add(val3, val4)

    result = topi.multiply(val5, dout)

    if dtype == "float16":
        result = topi.cast(result, dtype)
    return result
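The chain above is the standard gradient of sigmoid cross-entropy with logits, scaled by the incoming gradient:

.. math:: \frac{\partial L}{\partial x} = \left(\frac{e^{x}}{1 + e^{x}} - y\right) dout = \left(\sigma(x) - y\right) dout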
Example #14
File: asinh.py  Project: mindspore-ai/akg
def asinh(x, target=utils.CCE):
    r"""
    Compute asinh function.

    .. math:: asinh(x) = log(x+\sqrt{x*x+1})

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32. 

    Returns:
       tvm.tensor.Tensor, has the same type and shape as x.
    
    Supported Platforms:
        'Ascend'
    """
    # check shape
    utils.check_shape(x)

    # check input tensor data_type
    utils.ops_dtype_check(x.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    dtype = x.dtype

    # It is known that asinh(x) = log(x + sqrt(x*x+1)) and asinh(-x) = -asinh(x).
    # If x is a large negative number, (x + sqrt(x*x+1)) is close to zero and loses precision,
    # so compute asinh(x) = sign(x) * log(|x| + sqrt(|x|*|x| + 1)) instead.
    compute_dtype = dtype
    if dtype == "float16":
        # To avoid overflow and improve accuracy, x is cast to float32
        compute_dtype = "float32"
        x = topi.cast(x, compute_dtype)

    x_abs = topi.abs(x)

    if product_is_mini():
        # sqrt(|x|*|x| + 1) = |x| * sqrt(1 + 1/(|x|*|x|))
        vsquare_add_one = topi.add(1,
                                   topi.divide(1, topi.multiply(x_abs, x_abs)))
        sqrt_compute_value = sqrt_mini_newton_iter_impl(vsquare_add_one)
        sqrt_value = topi.multiply(x_abs, sqrt_compute_value)
    else:
        x_abs_square_add_one = topi.add(topi.multiply(x_abs, x_abs), 1)
        sqrt_value = topi.sqrt(x_abs_square_add_one)

    x_add_sqrt = topi.add(x_abs, sqrt_value)

    if product_is_mini():
        log_value = log_compute_mini_impl(x_add_sqrt, target)
    else:
        log_value = topi.log(x_add_sqrt)

    res = topi.multiply(Sign(x, target), log_value)

    if res.dtype != dtype:
        res = topi.cast(res, dtype)

    if product_is_mini():
        attrs = {"enable_auto_inline": False}
        return res, attrs
    return res
Example #15
def select(l1, tmp_val, gradient_accum):
    """Returns tmp_val if l1 > 0 else gradient_accum."""
    if product_is_mini():
        l1 = topi.cast(l1, "float16")
        tmp_val = topi.cast(tmp_val, "float16")
        gradient_accum = topi.cast(gradient_accum, "float16")
    tmp_val = akg.tvm.compute(
        tmp_val.shape, lambda *i: tvm.expr.Select(l1[0] > 0, tmp_val(*i),
                                                  gradient_accum(*i)))
    return topi.cast(tmp_val, "float32") if product_is_mini() else tmp_val
Example #16
def fused_relu_grad_bn_double_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7, data8,
                           data9, data10, data11, data12, data13, data14, data15, layout="NHWC",
                           out_dtype="float16", target=utils.CUDA):
    
    if layout == 'NCHW':
        data5 = topi.transpose(data5, (0, 2, 3, 1))
        data9 = topi.transpose(data9, (0, 2, 3, 1))
        data13 = topi.transpose(data13, (0, 2, 3, 1))
        data14 = topi.transpose(data14, (0, 2, 3, 1))
        data15 = topi.transpose(data15, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))
    
    inter_dtype = "float32"
    n, h, w, c = data5.shape
    scale = n * h * w

    mul = topi.multiply(data2, data3)
    mul1221 = topi.divide(mul, scale)

    # ReluGrad
    zero = tvm.const(0, data15.dtype)
    add = topi.add(data13, data14)
    addgrad = tvm.compute(add.shape, lambda *i: tvm.if_then_else(data15(*i) >= zero, add(*i), zero), tag=tag.INJECTIVE)
    addgrad = topi.cast(addgrad, inter_dtype)
    mul3283 = topi.multiply(scale, addgrad)
    sub1159 = topi.subtract(mul3283, data6)

    data5_cast = topi.cast(data5, inter_dtype)
    mul2372 = topi.divide(data4, scale)
    sub631 = topi.subtract(data5_cast, mul2372)
    mul1220 = topi.multiply(sub631, data1)
    div = topi.divide(mul1220, data0)
    sub271 = topi.subtract(sub1159, div)
    mul1218 = topi.multiply(mul1221, sub271)
    mul1218_cast = topi.cast(mul1218, out_dtype)

    mul1231 = topi.multiply(data11, data12)
    mul1230 = topi.divide(mul1231, scale)
    data9_cast = topi.cast(data9, inter_dtype)
    mul2364 = topi.divide(data8, scale)
    sub625 = topi.subtract(data9_cast, mul2364)
    mul1229 = topi.multiply(data10, sub625)

    div272 = topi.divide(mul1229, data7)
    sub272 = topi.subtract(sub1159, div272)
    mul1228 = topi.multiply(mul1230, sub272)
    mul1228_cast = topi.cast(mul1228, out_dtype)

    if layout == "NCHW":
        mul1218_cast = topi.transpose(mul1218_cast, (0, 3, 1, 2))
        mul1228_cast = topi.transpose(mul1228_cast, (0, 3, 1, 2))
    
    return [mul1218_cast, mul1228_cast]
Example #17
def _apply_rms_prop_compute(var, ms, mom, grad, lr, momentum, rho, epsilon):
    """Compute apply_rms_prop"""
    compute_dtype = "float32"
    dtype = var.dtype
    if dtype != compute_dtype:
        var, ms, mom, grad, lr, momentum, rho = [
            topi.cast(t, compute_dtype)
            for t in [var, ms, mom, grad, lr, momentum, rho]
        ]
    shape = get_shape(var)
    cons_eps = akg.tvm.const(epsilon, dtype=compute_dtype)
    one_minus_rho = akg.tvm.compute(
        (1, ),
        lambda *indice: akg.tvm.const(1.0, compute_dtype) - rho[0],
        name="one_minus_rho")

    # var_update = var - (momentum * mom + lr * grad / sqrt(rho * ms + (1 - rho) * grad * grad + epsilon))
    mom_1 = akg.tvm.compute(shape,
                            lambda *indice: momentum[0] * mom(*indice),
                            name="mom_1")
    lr_grad = akg.tvm.compute(shape,
                              lambda *indice: grad(*indice) * lr[0],
                              name="lr_grad")
    rho_ms = akg.tvm.compute(shape,
                             lambda *indice: ms(*indice) * rho[0],
                             name="rho_ms")
    rho_grad2 = akg.tvm.compute(
        shape,
        lambda *indice: grad(*indice) * grad(*indice) * one_minus_rho[0],
        name="rho_grad2")
    ms_update = akg.tvm.compute(
        shape,
        lambda *indice: rho_ms(*indice) + rho_grad2(*indice),
        name="ms_update")
    ms_eps = akg.tvm.compute(shape,
                             lambda *indice: ms_update(*indice) + cons_eps,
                             name="ms_eps")
    rsq = rsqrt(ms_eps, target="cce")
    mom_2 = akg.tvm.compute(shape,
                            lambda *indice: lr_grad(*indice) * rsq(*indice),
                            name="mom_2")
    mom_update = akg.tvm.compute(
        shape,
        lambda *indice: mom_1(*indice) + mom_2(*indice),
        name="mom_update")
    var_update = akg.tvm.compute(
        shape,
        lambda *indice: var(*indice) - mom_update(*indice),
        name="var_update")
    if var_update.dtype != dtype:
        var_update, ms_update, mom_update = [
            topi.cast(t, dtype) for t in [var_update, ms_update, mom_update]
        ]

    return var_update, ms_update, mom_update
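Written out, the three returned tensors correspond to the usual RMSProp update, matching the comment at the top of the function:

.. math::
    ms \leftarrow \rho \cdot ms + (1 - \rho)\,grad^{2}

    mom \leftarrow momentum \cdot mom + \frac{lr \cdot grad}{\sqrt{ms + \epsilon}}

    var \leftarrow var - mom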
Example #18
def fused_relu_grad_bn_reduce_grad(data_1,
                                   data_2,
                                   data_3,
                                   data_4,
                                   data_5,
                                   data_6,
                                   data_7,
                                   data_8,
                                   data_9,
                                   layout='NHWC',
                                   target=utils.CUDA):
    """
    fused_relu_grad_bn_reduce_grad.

    Args:
        data_1~data_9: tvm.tensor.Tensor.
        layout: input layout, only 'NCHW', 'NHWC' supported

    Returns:
        tvm.tensor.Tensor.
    """
    if layout == "NCHW":
        # transpose NCHW inputs to NHWC; reassigning only a loop variable would discard the result
        data_7, data_8, data_9 = [
            topi.transpose(t, axes=(0, 2, 3, 1)) for t in (data_7, data_8, data_9)]
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {}'.format(layout))

    data_tmp1 = topi.multiply(data_4, data_5)
    n, h, w, c = data_9.shape
    data_tmp2 = topi.full_like(data_tmp1, 1.0 / (n * h * w))
    data_tmp3 = topi.multiply(data_tmp1, data_tmp2)

    data_tmp5 = topi.full_like(data_9, 0.0)
    data_tmp6 = topi.greater(data_9, data_tmp5)

    data_tmp7 = topi.where(data_tmp6, data_8, data_tmp5)

    data_tmp8 = topi.cast(data_tmp7, 'float32')
    data_tmp9 = topi.full_like(data_tmp8, n * h * w)
    data_tmp10 = topi.multiply(data_tmp8, data_tmp9)
    data_tmp12 = topi.subtract(data_tmp10, data_3)
    data_tmp14 = topi.cast(data_7, 'float32')
    data_tmp15 = topi.multiply(data_6, data_tmp2)

    data_tmp17 = topi.subtract(data_tmp14, data_tmp15)
    data_tmp18 = topi.multiply(data_2, data_tmp17)
    data_tmp20 = topi.divide(data_tmp18, data_1)
    data_tmp21 = topi.subtract(data_tmp12, data_tmp20)
    data_tmp22 = topi.multiply(data_tmp3, data_tmp21)
    data_out = topi.cast(data_tmp22, 'float16')

    return data_out
Example #19
File: asin.py  Project: zhuyawen/akg
def _asin_compute(data_input):
    """Compute asin"""

    dtype = data_input.dtype
    boundary = tvm.const(BOUNDARY, "float32")

    # Change dtype to float32
    if dtype == "float16":
        data_input = topi.cast(data_input, "float32")

    # Sign mask
    data_sign = sign(data_input)

    # All positive
    data1 = topi.multiply(data_input, data_sign)

    # x belongs to (0, 2^(-0.5))
    choice_1 = topi.minimum(data1, boundary)
    choice_1 = topi.subtract(choice_1, boundary)
    choice_1_floor = akg.lang.cce.floor(choice_1)
    # the dtype of choice_1_floor is int32, need to be cast to fp32.
    if utils.product_is_mini():
        choice_1_floor = topi.cast(choice_1_floor, "float16")
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    else:
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    choice_1 = topi.multiply(choice_1_floor, neg_one_const("float32"))

    taylor1 = _taylor_compute(data1)
    res_1 = topi.multiply(taylor1, choice_1)

    # x belongs to (2^(-0.5), 1)
    choice_2 = topi.subtract(one_const("float32"), choice_1)
    data2 = topi.subtract(one_const("float32"), topi.multiply(data1, data1))
    data2_sqrt = _sqrt(data2)

    taylor2 = _taylor_compute(data2_sqrt, data2)

    res_2 = topi.multiply(taylor2, neg_one_const("float32"))
    res_2 = topi.add(res_2, tvm.const(HALF_PI, "float32"))
    res_2 = topi.multiply(res_2, choice_2)

    # Restore sign
    res_1 = topi.add(res_1, res_2)
    res_1 = topi.multiply(res_1, data_sign)

    # Restore dtype
    if dtype == "float16":
        res_1 = topi.cast(res_1, "float16")

    return res_1
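The second branch relies on the complementary-angle identity (BOUNDARY is presumably 2^{-1/2}), which keeps the argument of the Taylor expansion small:

.. math:: \arcsin(x) = \frac{\pi}{2} - \arcsin\!\left(\sqrt{1 - x^{2}}\right), \qquad 0 \le x \le 1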
Example #20
def _reduce_any_d_compute(x, axis=None, keepdims=None):
    """reduce_any_d compute implemention"""
    dtype = x.dtype
    data_fp16 = topi.cast(x, "float16")
    data_abs = topi.abs(data_fp16)

    res_tmp = akg.lang.ascend.reduce_max(data_abs, axis=axis, keepdims=keepdims)
    shape_len = len(x.shape)
    if axis[-1] == shape_len - 1 and not keepdims:
        res_shape = [x.value for x in res_tmp.shape]
        res_shape.pop()
        res_tmp = tvm.compute(res_shape, lambda *indice: res_tmp(*indice, 0), name="reduce_res")
    res_s8 = topi.cast(res_tmp, dtype)
    return res_s8
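The reduce_max-of-absolute-values trick mirrors a logical "any"; a NumPy sketch of the same idea (axis as an int or tuple of ints):

import numpy as np

def reduce_any_reference(x, axis=None, keepdims=False):
    # a maximum over |x| is nonzero exactly when some element along the axis is nonzero
    return np.max(np.abs(x), axis=axis, keepdims=keepdims).astype(x.dtype)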
Example #21
def fused_bn_reduce_grad(data0,
                         data1,
                         data2,
                         data3,
                         data4,
                         data5,
                         data6,
                         data7,
                         layout='NHWC',
                         out_dtype='float16',
                         target=utils.CUDA):

    if layout == 'NCHW':
        data3 = topi.transpose(data3, (0, 2, 3, 1))
        data7 = topi.transpose(data7, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    n, h, w, c = data3.shape
    const = n * h * w
    inter_dtype = 'float32'
    out1 = topi.multiply(data4, data5)
    out1 = topi.divide(out1, const)
    out1 = topi.expand_dims(out1, axis=0, num_newaxis=3)
    out1 = topi.broadcast_to(out1, (n, h, w, c))

    data3 = topi.cast(data3, inter_dtype)
    data2 = topi.expand_dims(data2, axis=0, num_newaxis=3)
    data2 = topi.broadcast_to(data2, (n, h, w, c))
    out2 = topi.multiply(data3, const)
    out2 = topi.subtract(out2, data2)

    data1 = topi.expand_dims(data1, axis=0, num_newaxis=3)
    data1 = topi.broadcast_to(data1, (n, h, w, c))
    data7 = topi.cast(data7, inter_dtype)
    out3 = topi.divide(data6, const)
    out3 = topi.subtract(data7, out3)
    out3 = topi.multiply(data1, out3)
    out3 = topi.divide(out3, data0)

    output = topi.subtract(out2, out3)
    output = topi.multiply(output, out1)

    output = topi.cast(output, out_dtype)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output
Example #22
def sgd_compute(parameters, gradient, learning_rate, accum, momentum, stat,
                dampening=0.0, weight_decay=0.0, nesterov=False):
    """sgd compute implementation"""
    dtype = parameters.dtype
    if dtype == "float16":
        parameters = topi.cast(parameters, "float32")
        accum = topi.cast(accum, "float32")
        learning_rate = topi.cast(learning_rate, "float32")
        gradient = topi.cast(gradient, "float32")
        momentum = topi.cast(momentum, "float32")
        stat = topi.cast(stat, "float32")

    # if weight_decay != 0.0, need compute grad_delta to update gradient
    if weight_decay != 0.0:
        parameters = topi.multiply(parameters, tvm.const(1.0, 'float32'))
        grad_delta = topi.multiply(parameters, weight_decay)
        gradient = topi.add(gradient, grad_delta)

    stat_mid = topi.multiply(stat, tvm.const(-1, "float32"))
    stat_act = topi.add(stat_mid, tvm.const(1, "float32"))

    dampening_t = topi.multiply(stat_act, dampening)

    # update accum
    accum_delta = tvm.compute(accum.shape, lambda *indice: accum(*indice) * momentum[0])

    gradient_damp = topi.multiply(gradient, dampening_t)
    accum_t = topi.add(accum_delta, gradient)
    if dampening != 0.0:
        accum_t = topi.subtract(accum_t, gradient_damp)

    # update parameters
    if nesterov:
        parameters_delta = tvm.compute(gradient.shape, lambda *indice: gradient(*indice) * learning_rate[0])
        parameters_delta_2 = tvm.compute(accum_t.shape, lambda *indice: accum_t(*indice) * momentum[0])
        parameters_delta_2 = tvm.compute(parameters_delta_2.shape,
                                         lambda *indice: parameters_delta_2(*indice) * learning_rate[0])
        parameters_delta = topi.add(parameters_delta, parameters_delta_2)
        parameters_t = topi.subtract(parameters, parameters_delta)
    else:
        parameters_delta = tvm.compute(accum_t.shape, lambda *indice: accum_t(*indice) * learning_rate[0])
        parameters_t = topi.subtract(parameters, parameters_delta)

    # update stat
    stat_t = topi.multiply(stat_act, tvm.const(NUM_ZERO, 'float32'))


    if dtype == "float16":
        parameters_t = topi.cast(parameters_t, "float16")
        accum_t = topi.cast(accum_t, "float16")
        stat_t = topi.cast(stat_t, "float16")

    return parameters_t, accum_t, stat_t
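Setting aside the stat flag (which appears to disable dampening on the first application and is then cleared), the update matches the familiar momentum-SGD scheme:

.. math::
    grad \leftarrow grad + weight\_decay \cdot param

    accum \leftarrow momentum \cdot accum + (1 - dampening)\,grad

    param \leftarrow param - lr\,(grad + momentum \cdot accum) \quad \text{(nesterov)}

    param \leftarrow param - lr \cdot accum \quad \text{(otherwise)}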
Example #23
def fused_pad(input,
              pad_before,
              pad_after,
              layout='NHWC',
              pad_value=0.0,
              target=utils.CUDA):
    """
    fused_pad.
 
    Args:
        input : tvm.Tensor or Expr
        pad_before : list / tuple of n ints. (Pad width on each dimension, applied before the start of the axis.)
        pad_after : list / tuple of n ints. (Pad width on each dimension, applied after the end of the axis.)
        pad_value : float. (The value to be padded.)

    Returns:
        tvm.Tensor
    """
    if layout == "NCHW":
        input = topi.transpose(input, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    cast_after = topi.cast(input, 'float16')
    output = topi.nn.pad(cast_after, pad_before, pad_after, pad_value)
    return output
Example #24
def _apply_rms_prop_mixed_precision_compute(var, ms, mom, grad, lr, momentum,
                                            rho, epsilon):
    """Compute apply_rms_prop_mixed_precision"""
    out_var, out_ms, out_mom = _apply_rms_prop_compute(var, ms, mom, grad, lr,
                                                       momentum, rho, epsilon)
    out_var_fp16 = topi.cast(out_var, "float16")
    return out_var, out_var_fp16, out_ms, out_mom
Example #25
def fused_bn_follow(data0, data1, data2, data3, data4, target=utils.CUDA):
    """
    input:
    data: length is 5
    data0: param0 beta
    data1: param1 gamma
    data2: param2 BNupdate: xi_variance
    data3: param6 BNreduce: xi_mean
    data4: param7 xi_conv2d

    layout: (N, H, W, C)

    output:
    beta + gamma * xi_variance * ( xi -  xi_mean/(N*H*W) )
    """

    n, h, w, c = data4.shape
    const = n * h * w
    inter_dtype = 'float32'
    data4 = topi.cast(data4, inter_dtype)

    multiply0 = topi.divide(data3, const)
    multiply0 = topi.expand_dims(multiply0, axis=0, num_newaxis=3)
    multiply0 = topi.broadcast_to(multiply0, (n, h, w, c))

    subtract0 = topi.subtract(data4, multiply0)

    multiply1 = topi.multiply(subtract0, data2)
    multiply2 = topi.multiply(multiply1, data1)

    add0 = topi.add(multiply2, data0)

    return add0
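A NumPy sketch of the docstring formula, assuming data2 already holds the per-channel inverse standard deviation and data3 a per-channel sum that still needs dividing by N*H*W (both inferred from the code, not stated by it):

import numpy as np

def bn_follow_reference(beta, gamma, inv_std, mean_sum, x_nhwc):
    n, h, w, c = x_nhwc.shape
    x = x_nhwc.astype(np.float32)
    mean = mean_sum / (n * h * w)      # per-channel mean recovered from the reduced sum
    return gamma * inv_std * (x - mean) + beta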
Example #26
def fused_bn_follow_relu(data0, data1, data2, data3, data4, layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    input:
    data0-4: bn parameters for conv2d tensor, length is 5
    data0: param0 beta
    data1: param1 gamma
    data2: param2 BNupdate: xi_variance
    data3: param6 BNreduce: xi_mean
    data4: param7 xi_conv2d, float16
    layout: only (N, H, W, C), (N, C, H, W) supported
    out_dtype: float16

    output:
    ReLU: max(batch-normalized tensor,  0)
    """
    if layout == 'NCHW':
        data4 = topi.transpose(data4, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))

    add0 = fused_bn_follow(data0, data1, data2, data3, data4)
    add0 = topi.cast(add0, out_dtype)
    output = topi.maximum(add0, 0)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output
Example #27
File: sinh.py  Project: mindspore-ai/akg
def sinh_compute(x):
    """Compute sinh."""
    dtype = x.dtype
    # cast to float32 to get a more precise result
    if dtype == "float16":
        x = topi.cast(x, "float32")

    data_exp = Exp(x, utils.CCE)
    negative_data = topi.multiply(x, -1)
    negative_data_exp = Exp(negative_data, utils.CCE)
    data_exp_sub = topi.subtract(data_exp, negative_data_exp)

    res = topi.multiply(data_exp_sub, tvm.const(0.5, "float32"))
    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res
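This is a direct evaluation of the defining formula:

.. math:: \sinh(x) = \frac{e^{x} - e^{-x}}{2}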
Example #28
def reduction_layer(data, axis, op, coeff):
    """
    Reduce data on axis and scale by coeff.

    Args:
        data (tvm.tensor.Tensor): tensor with type float16 or float32, int8, uint8.
        axis (int): the beginning axis to reduce, -1 means the last axis. if 0, reduction to scalar.
        op (str): one of "SUM", "ASUM"(abs and sum), "SUMSQ"(sqr and sum), "MEAN".
        coeff (Union[int, float]): scale factor applied to the reduction result.
    Returns:
        tvm.tensor.Tensor.
    """
    dtype = data.dtype
    vc_util.ops_dtype_check(data.dtype, [vc_util.DtypeForDavinci.ALL_FLOAT, 
                                         vc_util.DtypeForDavinci.INT8,
                                         vc_util.DtypeForDavinci.UINT8])

    vc_util.check_shape(data.shape)

    if op not in ["SUM", "ASUM", "SUMSQ", "MEAN"]:
        raise RuntimeError("op can only be one of SUM, ASUM, SUMSQ, MEAN")
    
    shape = get_shape(data)
    
    vc_util.reduce_axis_check(shape, axis)
    axis = _get_axis_list(axis, shape)
    
    if dtype in ["int8", "uint8"]:
        data = topi.cast(data, "float16")
    data = topi.cast(data, "float32")
    cof = tvm.const(coeff, "float32")
   
    if op == "ASUM":
        tmp = _asum(data, axis, cof)
    elif op == "SUMSQ":
        tmp = _sumsq(data, axis, cof)
    elif op == "MEAN":
        tmp = _mean(data, axis, cof, shape)
    elif op == "SUM":
        tmp = _sum(data, axis, cof)
    
    if dtype in ["int8", "uint8"]:
        tmp = topi.cast(tmp, "float16")    
    res = topi.cast(tmp, dtype)
    
    return res
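A NumPy sketch of the intended semantics, assuming _get_axis_list expands axis into "that axis through the last" and that MEAN divides by the number of reduced elements (both assumptions drawn from the docstring):

import numpy as np

def reduction_reference(data, axis, op, coeff):
    axes = tuple(range(axis % data.ndim, data.ndim))   # reduce from `axis` to the last axis
    x = data.astype(np.float32)
    if op == "ASUM":
        out = np.abs(x).sum(axis=axes)
    elif op == "SUMSQ":
        out = (x * x).sum(axis=axes)
    elif op == "MEAN":
        out = x.mean(axis=axes)
    elif op == "SUM":
        out = x.sum(axis=axes)
    else:
        raise ValueError("op must be one of SUM, ASUM, SUMSQ, MEAN")
    return (coeff * out).astype(data.dtype)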
Example #29
File: atan2.py  Project: zhuyawen/akg
def _init_atan2_mask(data_y_, data_x_):
    """
    Compute mask for atan2.

    Args:
        data_y_ (tvm.tensor.Tensor): The y of atan2(y, x).
        data_x_ (tvm.tensor.Tensor): The x of atan2(y, x).

    Returns:
        mask (tvm.tensor.Tensor): The mask of x's and y's value.
    """
    is_cast_for_mini = utils.product_is_mini() and data_y_.dtype == "float32"

    # in mini, select only support float16
    if is_cast_for_mini:
        data_x = topi.cast(data_x_, "float16")
        data_y = topi.cast(data_y_, "float16")
    else:
        data_x = data_x_
        data_y = data_y_

    dtype_input = data_y.dtype

    tensor_one = dc.one_const(dtype_input)
    tensor_zero = dc.zero_const(dtype_input)
    tensor_neg_one = dc.neg_one_const(dtype_input)

    y_ge_zero = tvm.compute(
        data_y.shape,
        lambda *i: tvm.expr.Select(
            data_y(*i) >= tensor_zero, tensor_one, tensor_neg_one),
        name="y_ge_zero")

    x_lt_zero_y_mask = tvm.compute(
        data_y.shape,
        lambda *i: tvm.expr.Select(
            data_x(*i) < tensor_zero, y_ge_zero(*i), tensor_zero),
        name="xlt0_y_mask")

    if is_cast_for_mini:
        x_lt_zero_y_mask = topi.cast(x_lt_zero_y_mask, "float32")
        y_ge_zero = topi.cast(y_ge_zero, "float32")

    return (x_lt_zero_y_mask, y_ge_zero)
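For orientation, the two masks presumably feed the usual quadrant correction applied to a plain arctan of y/x, with sign(0) taken as +1 as in y_ge_zero:

.. math:: \operatorname{atan2}(y, x) = \arctan\!\left(\frac{y}{x}\right) + \pi \cdot \operatorname{sign}(y)\,[x < 0]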
Example #30
def _apply_adadelta_compute(var, accum, accum_update, grad, lr, rho, epsilon):
    """Compute apply_adadelta"""
    dtype = var.dtype
    if dtype == "float16":
        var = topi.cast(var, "float32")
        accum = topi.cast(accum, "float32")
        accum_update = topi.cast(accum_update, "float32")
        lr = topi.cast(lr, "float32")
        rho = topi.cast(rho, "float32")
        grad = topi.cast(grad, "float32")

    epsilon = tvm.const(epsilon, "float32")
    tensor_one = akg.lang.ascend.broadcast(tvm.const(1, "float32"), var.shape)
    tensor_rho = topi.broadcast_to(rho, var.shape)
    tensor_rho_gs = topi.subtract(tensor_one, tensor_rho)
    tensor_epsilon = akg.lang.ascend.broadcast(epsilon, var.shape)

    # accum = accum * rho + grad ** 2 * (1 - rho)
    rhs = topi.multiply(accum, tensor_rho)
    lhs = topi.multiply(grad, grad)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_res = akg.lang.ascend.vadd(lhs, rhs)

    # update = (accum_update + epsilon).sqrt * (accum + epsilon).rsqrt * grad
    rhs = topi.add(accum_update, tensor_epsilon)
    rhs = sqrt(rhs, target=utils.CCE)
    lhs = topi.add(accum_res, tensor_epsilon)
    lhs = rsqrt(lhs, target=utils.CCE)
    lhs = topi.multiply(grad, lhs)
    update = topi.multiply(lhs, rhs)

    # var -= update * lr
    var_res = topi.broadcast_to(lr, var.shape)
    var_res = topi.multiply(update, var_res)
    var_res = topi.subtract(var, var_res)

    # accum_update = rho * accum_update + (1 - rho) * update.square
    rhs = topi.multiply(accum_update, tensor_rho)
    lhs = topi.multiply(update, update)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_update_res = akg.lang.ascend.vadd(lhs, rhs)

    if dtype == "float16":
        var_res = topi.cast(var_res, "float16")
        accum_res = topi.cast(accum_res, "float16")
        accum_update_res = topi.cast(accum_update_res, "float16")

    return var_res, accum_res, accum_update_res
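Collecting the intermediate steps, the three returned tensors implement the Adadelta update described in the comments:

.. math::
    accum \leftarrow \rho \cdot accum + (1 - \rho)\,grad^{2}

    update = \sqrt{\frac{accum\_update + \epsilon}{accum + \epsilon}}\,grad

    var \leftarrow var - lr \cdot update

    accum\_update \leftarrow \rho \cdot accum\_update + (1 - \rho)\,update^{2}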