Example #1
File: sin.py Project: zhuyawen/akg
def sin_compute(x):
    """compute for sine"""
    dtype = x.dtype
    shape = get_shape(x)

    # cast to type float32 when type is float16
    if dtype == FLOAT_16:
        x = akg.lang.cce.cast_to(x, FLOAT_32)

    pi_multiple = akg.lang.cce.vmuls(x, 1 / PI)
    round_float = akg.lang.cce.cast_to(akg.lang.cce.round(pi_multiple),
                                       FLOAT_32)
    # adjust x to [-pi/2, pi/2]
    x = akg.lang.cce.vsub(x, akg.lang.cce.vmuls(round_float, PI))

    res = _sin(x)

    # if round is odd, the final result needs to be multiplied by -1.
    # Multiply by 1/2 to get the ceil value
    ceil_value = akg.lang.cce.ceil(akg.lang.cce.vmuls(round_float, 1 / 2))
    # if round is odd, ceil*2 - round is 1; if even, it is 0
    sub_value = akg.lang.cce.vsub(
        akg.lang.cce.vmuls(ceil_value, tvm.const(2, dtype)), round_float)
    tensor_one = akg.lang.cce.broadcast(tvm.const(1, FLOAT_32), shape)
    odd_tensor = akg.lang.cce.vsub(tensor_one, sub_value)
    even_tensor = akg.lang.cce.vsub(odd_tensor, tensor_one)
    odd_even_tensor = akg.lang.cce.vadd(odd_tensor, even_tensor)
    res = akg.lang.cce.vmul(res, odd_even_tensor)

    # cast the dtype to float16
    if dtype == FLOAT_16:
        res = akg.lang.cce.cast_to(res, FLOAT_16)

    return res
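
A minimal NumPy sketch of the same range-reduction idea (a hypothetical helper, not part of akg): reduce x by the nearest multiple of pi, take sin of the reduced value, and flip the sign when that multiple is odd.

import numpy as np

def sin_range_reduce_check(x):
    # k = round(x / pi); the reduced argument lies in [-pi/2, pi/2]
    k = np.round(x / np.pi)
    x_reduced = x - k * np.pi
    # same parity trick as above: ceil(k/2)*2 - k is 1 for odd k, 0 for even k
    odd = np.ceil(k / 2) * 2 - k
    sign = 1.0 - 2.0 * odd  # +1 for even k, -1 for odd k
    return sign * np.sin(x_reduced)  # matches np.sin(x) up to rounding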
Example #2
def _tan_2x_multi(input_x, times):
    """calculating tan x by calculating tan (x/2^times) and using double angle formula multiple times"""
    # Calculate tan (x/2^times)
    if input_x.dtype == FLOAT_16 and utils.product_is_mini():
        input_x_divide = topi.multiply(input_x, tvm.const(1.0/(2.0**times), FLOAT_16))
        res = _tan_expand(input_x_divide)
    else:
        input_x_divide = topi.multiply(input_x, 1.0/(2.0**times))
        res = _tan_expand(input_x_divide)
    while times != 0:
        # using double angle formula: tan 2x = 2*tan x/(1-tan x*tan x)
        if input_x.dtype == FLOAT_16 and utils.product_is_mini():
            res_numerator = topi.multiply(res, tvm.const(2.0, FLOAT_16))
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, tvm.const(-1.0, FLOAT_16)), tvm.const(1.0, FLOAT_16))
        else:
            res_numerator = topi.multiply(res, 2.0)
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, -1.0), 1.0)

        if utils.product_is_mini():
            res = mul(res_numerator, reciprocal(res_denominator))
        else:
            res = div(res_numerator, res_denominator)
        times = times - 1
    return res
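
A scalar Python sketch of the same doubling scheme, with math.tan standing in for the _tan_expand polynomial (an assumption for illustration only):

import math

def tan_by_doubling(x, times=4):
    # start from tan(x / 2**times), then apply tan(2a) = 2*tan(a) / (1 - tan(a)^2) `times` times
    t = math.tan(x / 2.0 ** times)
    for _ in range(times):
        t = 2.0 * t / (1.0 - t * t)
    return t  # close to math.tan(x) away from the poles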
Example #3
def csr_div(inputs, attrs):
    row_idx, col_idx, sparse_data, dense = inputs
    shape = tuple(attrs["dense_shape"])
    feature_shape = get_shape(sparse_data.shape)[1:]
    assert dense.dtype == sparse_data.dtype, "data and weight must have the same dtype"

    num_rows = row_idx.shape[0] - 1
    dense_shape = get_shape(dense.shape)
    sparse_shape = get_shape(shape)
    broadcast_shape = get_broadcast_shape(dense_shape, sparse_shape)
    need_expand = tvm.const(len(dense_shape) < len(broadcast_shape))
    need_broadcast_first_dim = tvm.const(
        len(dense_shape) == len(broadcast_shape)
        and dense_shape[0] < broadcast_shape[0])
    need_broadcast_last_dim = tvm.const(
        len(dense_shape) == len(broadcast_shape)
        and dense_shape[1] < broadcast_shape[1])

    def gen_ir(dense, sparse_data, col_idx, row_idx, output):
        ib = tvm.ir_builder.create()
        ib.scope_attr("INFO", "csr_avg_row",
                      int(sparse_data.shape[0]) // max(int(num_rows), 1))
        with ib.for_range(0, num_rows, name='i') as i:
            start = ib.load(row_idx, i)
            end = ib.load(row_idx, i + 1)
            with ib.for_range(0, end - start, name='j') as j:
                pos = start + j
                with ib.for_range_n(feature_shape, 'k') as k:
                    with ib.if_scope(pos < end):
                        col = ib.load(col_idx, pos)
                        store_loc = [pos] + k
                        val = ib.load(sparse_data, store_loc)
                        with ib.if_scope(need_expand):
                            ib.store(output, store_loc,
                                     val / ib.load(dense, [col] + k))
                        with ib.else_scope():
                            with ib.if_scope(need_broadcast_first_dim):
                                ib.store(output, store_loc,
                                         val / ib.load(dense, [0, col] + k))
                            with ib.else_scope():
                                with ib.if_scope(need_broadcast_last_dim):
                                    ib.store(output, store_loc,
                                             val / ib.load(dense, [i, 0] + k))
                                with ib.else_scope():
                                    ib.store(
                                        output, store_loc,
                                        val / ib.load(dense, [i, col] + k))
        return ib.get()

    output_name = "T_csr_div_" + dense.op.name + "_" + sparse_data.op.name
    out_buf = tvm.decl_buffer(sparse_data.shape, sparse_data.dtype,
                              output_name)
    attrs = {"remove_self_dependence": True, "csr_op": True}
    return tvm.extern(
        [sparse_data.shape], [dense, sparse_data, col_idx, row_idx],
        lambda ins, outs: gen_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
        dtype=sparse_data.dtype,
        out_buffers=[out_buf],
        name=output_name,
        attrs=attrs)
Example #4
def tan_compute(input_x):
    """tan compute implemention"""
    dtype = input_x.dtype

    # cast to float32 when dtype is float16 or float32, or int32 (not on mini)
    if dtype == FLOAT_16 or dtype == FLOAT_32 or (dtype == INT_32
                                                  and not product_is_mini()):
        input_x = topi.cast(input_x, FLOAT_32)
        # adjust x to [-pi/2,pi/2] using x = x-round(x/pi)*pi
        round_pi_div = akg.lang.ascend.round(
            topi.multiply(input_x, tvm.const(1.0 / PI, FLOAT_32)))
        round_pi_div = akg.lang.ascend.cast_to(round_pi_div, FLOAT_32)
        input_x = topi.subtract(
            input_x, topi.multiply(round_pi_div, tvm.const(PI, FLOAT_32)))
    # cast to type float16 when type is int32 in mini
    elif dtype == INT_32 and product_is_mini():
        input_x = topi.cast(input_x, FLOAT_16)
        # adjust x to [-pi/2,pi/2] using x = x-round(x/pi)*pi
        round_pi_div = akg.lang.ascend.round(
            topi.multiply(input_x, tvm.const(1.0 / PI, FLOAT_16)))
        round_pi_div = akg.lang.ascend.cast_to(round_pi_div, FLOAT_16)
        input_x = topi.subtract(
            input_x, topi.multiply(round_pi_div, tvm.const(PI, FLOAT_16)))

    res = _tan_2x_multi(input_x, TAN_2X_TIMES)
    # cast the dtype to original dtype
    res = topi.cast(res, dtype)
    return res
Example #5
def _elu_mini_compute(exp_res, data, shape):
    """
    do element-wise e^x - 1 compute in mini scene

    f(x) = e^x - 1,                       x <= TAYLOR_THRESHOLD or x >= 0
    f(x) = fifth-order Taylor expansion,  TAYLOR_THRESHOLD < x < 0

    Args:
        exp_res (tvm.tensor.Tensor): the tensor of e^x -1, float16
        data (tvm.tensor.Tensor): input, float16
        shape (list): the shape of input

    Returns: 
        tvm.tensor.Tensor
    """
    TAYLOR_THRESHOLD = -0.7
    input_right_border = tvm.const(0.0, "float16")
    right_border = tvm.compute(shape, lambda *i: input_right_border)

    taylor_res = _elu_taylor_compute(data)

    input_left_border = tvm.const(TAYLOR_THRESHOLD, "float16")
    left_border = tvm.compute(shape, lambda *i: input_left_border)
    exp_taylor_neg = tvm.compute(
        shape,
        lambda *i: tvm.expr.Select(data(*i) > left_border(*i),
                                   taylor_res(*i), exp_res(*i)),
        name="gt")
    exp_res = tvm.compute(
        shape,
        lambda *i: tvm.expr.Select(data(*i) < right_border(*i),
                                   exp_taylor_neg(*i), exp_res(*i)),
        name="lt")
    return exp_res
Example #6
def fused_bn_update(input1, input2, input3, input4, dtype, c1, c2, c3, c4):
    """
    fused operator.

    Args:
        input1 ~ input4: tvm.tensor.Tensor.
        dtype: dtype of Tensor.
        c1 ~ c4: const.

    Returns:
        Three outputs (tuple of tvm.tensor.Tensor).
    """
    const1 = tvm.const(c1, dtype)
    mul0 = topi.multiply(input2, const1)
    mul1 = topi.multiply(input1, const1)
    mul2 = topi.multiply(mul1, mul1)
    sigma2 = topi.subtract(mul0, mul2)
    const2 = tvm.const(c2, dtype)
    rsqrt_val = topi.rsqrt(topi.add(sigma2, const2))

    const3 = tvm.const(c3, dtype)
    mul3 = topi.multiply(sigma2, const3)
    sub1 = topi.subtract(input3, mul3)
    const4 = tvm.const(c4, dtype)
    data1 = topi.multiply(const4, sub1)

    sub2 = topi.subtract(input4, mul1)
    data2 = topi.multiply(const4, sub2)

    return (rsqrt_val, data1, data2)
Example #7
def sigmoid_cross_entropy_with_logits_grad_compute(predict, target, dout):
    """sigmoid_cross_entropy_with_logits_grad compute implemention"""
    dtype = predict.dtype
    if dtype == "float16":
        predict = topi.cast(predict, "float32")
        target = topi.cast(target, "float32")
        dout = topi.cast(dout, "float32")

    # e^x
    val1 = exp(predict)
    # 1 + e^x
    val2 = topi.add(val1, tvm.const(SCALAR_ONE, dtype="float32"))
    # e^x / (1 + e^x)
    val3 = topi.divide(val1, val2)
    # -target
    val4 = topi.multiply(target, tvm.const(SCALAR_NEGTIVE_ONE,
                                           dtype="float32"))
    # e^x / (1 + e^x) - y
    val5 = topi.add(val3, val4)

    result = topi.multiply(val5, dout)

    if dtype == "float16":
        result = topi.cast(result, dtype)
    return result
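
For reference, the expression assembled above is the closed-form gradient of sigmoid cross-entropy with logits, (sigmoid(x) - target) * dout; a scalar sketch:

import math

def sigmoid_ce_grad_reference(predict, target, dout):
    sigmoid = math.exp(predict) / (1.0 + math.exp(predict))  # e^x / (1 + e^x)
    return (sigmoid - target) * dout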
Example #8
def selu_compute(input_data):
    """selu compute implemention"""
    # if input_dtype is float16,convert it to float32
    dtype = input_data.dtype
    if dtype == "float16" or dtype == "float32":
        input_data = topi.cast(input_data, "float32")
        type_tmp = "float32"
    else:
        input_data = topi.cast(input_data, "float16")
        type_tmp = "float16"

    # generate tensor_zero to be compared
    tensor_zero = topi.multiply(input_data, tvm.const(0, dtype=type_tmp))
    # split the input into its negative part (min with 0) and
    # its positive part (max with 0), then handle each separately
    negative_res = topi.minimum(input_data, tensor_zero)
    positive_res = topi.maximum(input_data, tensor_zero)
    exp_res = exp(negative_res)
    sub_res = topi.add(exp_res, tvm.const(SCALAR_NEGATIVE_ONE, dtype=type_tmp))
    negative_muls_res = topi.multiply(sub_res, tvm.const(SCALE_ALPHA_PRODUCT, dtype=type_tmp))
    if dtype == "int8":
        negative_muls_res = akg.lang.cce.ceil(negative_muls_res)

    positive_muls_res = topi.multiply(positive_res, tvm.const(SCALE, dtype=type_tmp))
    res = topi.add(negative_muls_res, positive_muls_res)
    # cast to ori_dtype
    if dtype == "float16" or dtype == "int8" or dtype == "int32":
        res = topi.cast(res, dtype)

    return res
Example #9
def _asin_grad_compute(x, dy):
    """Compute asin_grad."""

    dtype = x.dtype
    if dtype == "float16":
        x = topi.cast(x, "float32")
        dy = topi.cast(dy, "float32")

    # step 1: calculate num_to_vrsqrt = 1 - x^2
    data = topi.multiply(x, x)
    data = topi.multiply(data, tvm.const(-1, "float32"))
    num_to_vrsqrt = topi.add(data, tvm.const(1, "float32"))

    # step 2: calculate dy * (1 / sqrt(1 - x^2))
    if utils.product_is_mini():
        # mini: use newton's method for high accuracy result
        res = _vrsqrt_newton(num_to_vrsqrt)
        res = topi.multiply(res, dy)
    else:
        # cloud: use vdiv for high efficiency computation
        vsqrt_res = topi.sqrt(num_to_vrsqrt)
        res = topi.divide(dy, vsqrt_res)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res
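
The quantity being computed is d/dx asin(x) = 1 / sqrt(1 - x^2), scaled by the incoming gradient dy; a scalar sketch:

import math

def asin_grad_reference(x, dy):
    # valid for |x| < 1
    return dy / math.sqrt(1.0 - x * x)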
Example #10
def ReLU6Grad(y_grad, x, target=utils.CUDA):
    """
    Computes Gradients of Rectified Linear 6.

    Args:
        y_grad (tvm.tensor.Tensor): Tensor of type float16, float32, gradients backpropagated to the ReLU6 op.
        x (tvm.tensor.Tensor): Tensor of type float16/float32, inputs that were passed to the ReLU6 op, or its outputs.

    Returns:
        tvm.tensor.Tensor, has same type and shape as x.
    
    Supported Platforms:
        'GPU'
    """
    if target != utils.CUDA:
        raise RuntimeError("the target %s is not supported!" % target)
    shape = x.shape
    dtype = x.dtype

    zero = tvm.const(0, dtype)
    six = tvm.const(6, dtype)

    res0 = tvm.compute(shape,
                       lambda *i: tvm.if_then_else(x(*i) >= zero, x(*i), zero))
    res6 = tvm.compute(
        shape, lambda *i: tvm.if_then_else(x(*i) >= six, zero, res0(*i)))
    res = tvm.compute(
        shape, lambda *i: tvm.if_then_else(res6(*i) == zero, zero, y_grad(*i)))
    return res
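
The three if_then_else stages above amount to passing the gradient through only where 0 < x < 6; a NumPy sketch of that selection (not the akg API):

import numpy as np

def relu6_grad_reference(y_grad, x):
    return np.where((x > 0) & (x < 6), y_grad, 0.0)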
Example #11
def xdivy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Do element-wise xdivy_grad compute.

    Args:
        placeholders (Union[list, tuple]): the placeholder of data input
        shape_max (Union[list, tuple]): the shape of broadcast
        dtype (string): the type of data input
        rx (list): the reduction indices of data input with broadcast
        ry (list): the reduction indices for data input with broadcast

    Returns:
        output_y1 (tvm.tensor.Tensor): result of xdivy_grad
        output_y2 (tvm.tensor.Tensor): result of xdivy_grad
    """
    x1_ori = placeholders[0]
    x2_ori = placeholders[1]
    grad_ori = placeholders[2]

    if dtype == "float16":
        x1 = akg.lang.cce.cast_to(x1_ori, "float32")
        x2 = akg.lang.cce.cast_to(x2_ori, "float32")
        grad = akg.lang.cce.cast_to(grad_ori, "float32")
        x1 = akg.lang.cce.broadcast(x1, shape_max)
        x2 = akg.lang.cce.broadcast(x2, shape_max)
        grad = akg.lang.cce.broadcast(grad, shape_max)
    else:
        x1 = akg.lang.cce.broadcast(x1_ori, shape_max)
        x2 = akg.lang.cce.broadcast(x2_ori, shape_max)
        grad = akg.lang.cce.broadcast(grad_ori, shape_max)

    esp_min = tvm.const(1.18e-38, dtype="float32")
    x1_addepsmin = akg.lang.cce.vadds(x1, esp_min)

    if utils.product_is_mini():
        x1_addepsmin_rec = reciprocal(x1_addepsmin)
        not_zero_x1 = akg.lang.cce.vmul(x1, x1_addepsmin_rec)
        x2_rec = reciprocal(x2)
        partial_x1 = akg.lang.cce.vmul(not_zero_x1, x2_rec)
    else:
        not_zero_x1 = div(x1, x1_addepsmin)
        partial_x1 = div(not_zero_x1, x2)

    partial_x1g = akg.lang.cce.vmul(partial_x1, grad)

    neg_one = tvm.const(-1, dtype="float32")
    neg_x1 = akg.lang.cce.vmuls(x1, neg_one)
    partial_x1pow = akg.lang.cce.vmul(partial_x1, partial_x1)
    partial_x2 = akg.lang.cce.vmul(neg_x1, partial_x1pow)
    partial_x2g = akg.lang.cce.vmul(partial_x2, grad)

    output_y1 = akg.lang.cce.sum(partial_x1g, rx, keepdims=True)
    output_y2 = akg.lang.cce.sum(partial_x2g, ry, keepdims=True)

    if dtype == "float16":
        output_y1 = akg.lang.cce.cast_to(output_y1, "float16")
        output_y2 = akg.lang.cce.cast_to(output_y2, "float16")

    return output_y1, output_y2
Example #12
def _update_m(m, beta, grad):
    """Update m_out = m * beta + grad * (1 - beta)"""
    m_beta = topi.multiply(m, beta)
    beta_neg = topi.multiply(beta, tvm.const(-1, beta.dtype))
    beta_1 = topi.add(beta_neg, tvm.const(1, beta_neg.dtype))
    grad_beta_gs = topi.multiply(grad, beta_1)
    m_out = topi.add(m_beta, grad_beta_gs)
    return m_out
Example #13
 def kernel_ir(dst, data):
     ib = tvm.ir_builder.create()
     with ib.for_range_n(data.shape, "ax") as i:
         zero = tvm.const(0, data.dtype)
         one = tvm.const(1, out_dtype)
         with ib.if_scope(ib.load(data, i) > zero):
             ib.store(dst, 0, one)
     return ib.get()
Example #14
File: asin.py Project: zhuyawen/akg
def _newton_iter(data, init_x):
    """Do element-wise Newton compute."""
    # Newton's method: x(n+1) = x(n)*(3 - a*x(n)^2)/2
    init_square = topi.multiply(init_x, init_x)
    newton_res = topi.multiply(init_square, data)
    newton_res = topi.multiply(newton_res, neg_one_const("float32"))
    newton_res = topi.add(newton_res, tvm.const(3, "float32"))
    newton_res = topi.multiply(newton_res, init_x)
    newton_res = topi.multiply(newton_res, tvm.const(0.5, "float32"))
    return newton_res
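
This is Newton's iteration for 1/sqrt(a): x(n+1) = x(n) * (3 - a*x(n)^2) / 2. A scalar sketch showing the convergence:

def rsqrt_newton(a, x0, steps=4):
    x = x0
    for _ in range(steps):
        x = x * (3.0 - a * x * x) / 2.0  # each step roughly doubles the correct digits
    return x

# rsqrt_newton(4.0, 0.4) converges to 0.5 == 1/sqrt(4)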
Example #15
def _elu_taylor_compute(data):
    """
    Calculate e^x - 1 using a fifth-order Taylor expansion

    e^x = 1 + x + (x^2 / 2!) + (x^3 / 3!) +  (x^4 / 4!) + (x^5 / 5!)
    e^x - 1 = x + (x^2 / 2!) + (x^3 / 3!) +  (x^4 / 4!) + (x^5 / 5!)

    Args:
        data (tvm.tensor.Tensor):  input

    Returns:
        tvm.tensor.Tensor
    """
    TAYLOR_SECOND_ORDER_PARAM = 1 / 2.0
    TAYLOR_THIRD_ORDER_PARAM = 1 / 6.0
    TAYLOR_FOURTH_ORDER_PARAM = 1 / 24.0
    TAYLOR_FIFTH_ORDER_PARAM = 1 / 120.0

    dtype = data.dtype
    if dtype == "float16":
        data = akg.lang.ascend.cast_to(data, "float32")

    # x^2 / 2!
    taylor_second_order_param = tvm.const(TAYLOR_SECOND_ORDER_PARAM, "float32")
    data_power_2 = akg.lang.ascend.vmul(data, data)
    data_power_2_div_2 = akg.lang.ascend.vmuls(data_power_2,
                                               taylor_second_order_param)

    # x^3 / 3!
    taylor_third_order_param = tvm.const(TAYLOR_THIRD_ORDER_PARAM, "float32")
    data_power_3 = akg.lang.ascend.vmul(data_power_2, data)
    data_power_3_div_6 = akg.lang.ascend.vmuls(data_power_3,
                                               taylor_third_order_param)

    # x^4 / 4!
    taylor_fourth_order_param = tvm.const(TAYLOR_FOURTH_ORDER_PARAM, "float32")
    data_power_4 = akg.lang.ascend.vmul(data_power_3, data)
    data_power_4_div_24 = akg.lang.ascend.vmuls(data_power_4,
                                                taylor_fourth_order_param)

    # x^5 / 5!
    taylor_fifth_order_param = tvm.const(TAYLOR_FIFTH_ORDER_PARAM, "float32")
    data_power_5 = akg.lang.ascend.vmul(data_power_4, data)
    data_power_5_div_120 = akg.lang.ascend.vmuls(data_power_5,
                                                 taylor_fifth_order_param)

    res = akg.lang.ascend.vadd(data, data_power_2_div_2)
    res = akg.lang.ascend.vadd(res, data_power_3_div_6)
    res = akg.lang.ascend.vadd(res, data_power_4_div_24)
    res = akg.lang.ascend.vadd(res, data_power_5_div_120)

    if dtype == "float16":
        res = akg.lang.ascend.cast_to(res, "float16")
    return res
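
A scalar sketch of the same fifth-order expansion, compared against math.expm1; the truncation error is why the mini path only applies it on (TAYLOR_THRESHOLD, 0):

import math

def expm1_taylor5(x):
    # x + x^2/2! + x^3/3! + x^4/4! + x^5/5! in Horner form
    return x * (1 + x * (1/2.0 + x * (1/6.0 + x * (1/24.0 + x / 120.0))))

# abs(expm1_taylor5(-0.5) - math.expm1(-0.5)) is on the order of 1e-5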
Example #16
def _newton(start_value, num_to_vrsqrt):
    """Do newton's method to calculate vrsqrt."""

    x0_square = topi.multiply(start_value, start_value)
    mul_res = topi.multiply(x0_square, num_to_vrsqrt)
    mul_res = topi.multiply(mul_res, tvm.const(-1, "float32"))
    head0_tmp = topi.add(mul_res, tvm.const(3, "float32"))
    head0 = topi.multiply(head0_tmp, start_value)
    newton_res = topi.multiply(head0, tvm.const(0.5, "float32"))

    return newton_res
Example #17
def sgd_compute(parameters, gradient, learning_rate, accum, momentum, stat,
                dampening=0.0, weight_decay=0.0, nesterov=False):
    """sgd compute implementation"""
    dtype = parameters.dtype
    if dtype == "float16":
        parameters = topi.cast(parameters, "float32")
        accum = topi.cast(accum, "float32")
        learning_rate = topi.cast(learning_rate, "float32")
        gradient = topi.cast(gradient, "float32")
        momentum = topi.cast(momentum, "float32")
        stat = topi.cast(stat, "float32")

    # if weight_decay != 0.0, compute grad_delta to update the gradient
    if weight_decay != 0.0:
        parameters = topi.multiply(parameters, tvm.const(1.0, 'float32'))
        grad_delta = topi.multiply(parameters, weight_decay)
        gradient = topi.add(gradient, grad_delta)

    stat_mid = topi.multiply(stat, tvm.const(-1, "float32"))
    stat_act = topi.add(stat_mid, tvm.const(1, "float32"))

    dampening_t = topi.multiply(stat_act, dampening)

    # update accum
    accum_delta = tvm.compute(accum.shape, lambda *indice: accum(*indice) * momentum[0])

    gradient_damp = topi.multiply(gradient, dampening_t)
    accum_t = topi.add(accum_delta, gradient)
    if dampening != 0.0:
        accum_t = topi.subtract(accum_t, gradient_damp)

    # update parameters
    if nesterov:
        parameters_delta = tvm.compute(gradient.shape, lambda *indice: gradient(*indice) * learning_rate[0])
        parameters_delta_2 = tvm.compute(accum_t.shape, lambda *indice: accum_t(*indice) * momentum[0])
        parameters_delta_2 = tvm.compute(parameters_delta_2.shape,
                                         lambda *indice: parameters_delta_2(*indice) * learning_rate[0])
        parameters_delta = topi.add(parameters_delta, parameters_delta_2)
        parameters_t = topi.subtract(parameters, parameters_delta)
    else:
        parameters_delta = tvm.compute(accum_t.shape, lambda *indice: accum_t(*indice) * learning_rate[0])
        parameters_t = topi.subtract(parameters, parameters_delta)

    # update stat
    stat_t = topi.multiply(stat_act, tvm.const(NUM_ZERO, 'float32'))


    if dtype == "float16":
        parameters_t = topi.cast(parameters_t, "float16")
        accum_t = topi.cast(accum_t, "float16")
        stat_t = topi.cast(stat_t, "float16")

    return parameters_t, accum_t, stat_t
Example #18
def my_dsl(dtype, kernel_name, attrs):
    m = tvm.var("M")
    n = tvm.var("N")
    A = tvm.placeholder((m, ), name="A", dtype=dtype)
    B = tvm.placeholder((m, ), name="B", dtype=dtype)

    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    if insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)

    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        C = topi.sqrt(A)
        C = topi.log(A)
    elif insn == "sqrt":
        C = topi.sqrt(A)

    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)

    # C = tvm.compute((m, ), lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule([C.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        if insnType == "binary":
            mod = akg.build(s, [A, B, C],
                            "cce",
                            name=kernel_name,
                            attrs=attrs,
                            polyhedral=True)
        else:
            mod = akg.build(s, [A, C],
                            "cce",
                            name=kernel_name,
                            attrs=attrs,
                            polyhedral=True)
    return mod
Example #19
def select_compute(condition, x1, x2):
    """select compute implementation"""
    shape = get_shape(x1)
    con_shape = get_shape(condition)
    num_dtype = x1.dtype
    bool_dtype = condition.dtype

    if num_dtype in ("int8", "uint8"):
        x1_dtype = "float32"
        ones = akg.lang.cce.broadcast(tvm.const(VALUE_ONE, dtype="float32"),
                                      shape,
                                      output_dtype="float32")
        x1 = akg.lang.cce.cast_to(x1, "float32")
        x2 = akg.lang.cce.cast_to(x2, "float32")
    else:
        x1_dtype = num_dtype
        ones = akg.lang.cce.broadcast(tvm.const(VALUE_ONE, dtype=num_dtype),
                                      shape,
                                      output_dtype=num_dtype)

    if bool_dtype == "int8":
        if x1_dtype == "int32":
            condition_dtype = akg.lang.cce.ceil(condition)
        else:
            condition_dtype = akg.lang.cce.cast_to(condition, x1_dtype)
    else:
        if x1_dtype == "int32":
            condition_dtype = condition
        else:
            condition_dtype = akg.lang.cce.cast_to(condition, x1_dtype)

    if list(con_shape) != list(shape):
        condition_dtype = akg.lang.cce.broadcast(condition_dtype, shape)

    vinsn_support_dtype = ("float16", "float32")
    if utils.product_is_mini():
        vinsn_support_dtype = ("float16", )
    if num_dtype in vinsn_support_dtype:
        res = topi.where(condition_dtype, x1, x2)
    else:
        # For data types that are not supported by the vector instruction (vcmp and vsel),
        # if the `topi.where` is directly used, the related instructions generated in the .cce file
        # are scalar instructions such as `cond ? x1 : x2`, which is very inefficient.
        # Therefore, other equivalent calculation methods are adopted.
        condition_opp = akg.lang.cce.vsub(ones, condition_dtype)
        temp_x = akg.lang.cce.vmul(x1, condition_dtype)
        temp_y = akg.lang.cce.vmul(x2, condition_opp)
        res = akg.lang.cce.vadd(temp_x, temp_y)
    if num_dtype in ("int8", "uint8"):
        res = akg.lang.cce.cast_to(res, num_dtype)
    return res
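
The fallback branch realizes select arithmetically; in NumPy terms the equivalence it relies on is (a sketch):

import numpy as np

def select_by_arithmetic(cond, x1, x2):
    # cond holds 0/1 values; x1*cond + x2*(1 - cond) equals np.where(cond != 0, x1, x2)
    cond = cond.astype(x1.dtype)
    return x1 * cond + x2 * (1 - cond)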
Example #20
File: asin.py Project: zhuyawen/akg
def _asin_compute(data_input):
    """Compute asin"""

    dtype = data_input.dtype
    boundary = tvm.const(BOUNDARY, "float32")

    # Change dtype to float32
    if dtype == "float16":
        data_input = topi.cast(data_input, "float32")

    # Sign mask
    data_sign = sign(data_input)

    # All positive
    data1 = topi.multiply(data_input, data_sign)

    # x belongs to (0, 2^(-0.5))
    choice_1 = topi.minimum(data1, boundary)
    choice_1 = topi.subtract(choice_1, boundary)
    choice_1_floor = akg.lang.cce.floor(choice_1)
    # the dtype of choice_1_floor is int32 and needs to be cast to fp32.
    if utils.product_is_mini():
        choice_1_floor = topi.cast(choice_1_floor, "float16")
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    else:
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    choice_1 = topi.multiply(choice_1_floor, neg_one_const("float32"))

    taylor1 = _taylor_compute(data1)
    res_1 = topi.multiply(taylor1, choice_1)

    # x belongs to (2^(-0.5), 1)
    choice_2 = topi.subtract(one_const("float32"), choice_1)
    data2 = topi.subtract(one_const("float32"), topi.multiply(data1, data1))
    data2_sqrt = _sqrt(data2)

    taylor2 = _taylor_compute(data2_sqrt, data2)

    res_2 = topi.multiply(taylor2, neg_one_const("float32"))
    res_2 = topi.add(res_2, tvm.const(HALF_PI, "float32"))
    res_2 = topi.multiply(res_2, choice_2)

    # Restore sign
    res_1 = topi.add(res_1, res_2)
    res_1 = topi.multiply(res_1, data_sign)

    # Restore dtype
    if dtype == "float16":
        res_1 = topi.cast(res_1, "float16")

    return res_1
Example #21
def _apply_adadelta_compute(var, accum, accum_update, grad, lr, rho, epsilon):
    """Compute apply_adadelta"""
    dtype = var.dtype
    if dtype == "float16":
        var = topi.cast(var, "float32")
        accum = topi.cast(accum, "float32")
        accum_update = topi.cast(accum_update, "float32")
        lr = topi.cast(lr, "float32")
        rho = topi.cast(rho, "float32")
        grad = topi.cast(grad, "float32")

    epsilon = tvm.const(epsilon, "float32")
    tensor_one = akg.lang.ascend.broadcast(tvm.const(1, "float32"), var.shape)
    tensor_rho = topi.broadcast_to(rho, var.shape)
    tensor_rho_gs = topi.subtract(tensor_one, tensor_rho)
    tensor_epsilon = akg.lang.ascend.broadcast(epsilon, var.shape)

    # accum = accum * rho + grad ** 2 * (1 - rho)
    rhs = topi.multiply(accum, tensor_rho)
    lhs = topi.multiply(grad, grad)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_res = akg.lang.ascend.vadd(lhs, rhs)

    # update = (accum_update + epsilon).sqrt * (accum + epsilon).rsqrt * grad
    rhs = topi.add(accum_update, tensor_epsilon)
    rhs = sqrt(rhs, target=utils.CCE)
    lhs = topi.add(accum_res, tensor_epsilon)
    lhs = rsqrt(lhs, target=utils.CCE)
    lhs = topi.multiply(grad, lhs)
    update = topi.multiply(lhs, rhs)

    # var -= update * lr
    var_res = topi.broadcast_to(lr, var.shape)
    var_res = topi.multiply(update, var_res)
    var_res = topi.subtract(var, var_res)

    # accum_update = rho * accum_update + (1 - rho) * update.square
    rhs = topi.multiply(accum_update, tensor_rho)
    lhs = topi.multiply(update, update)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_update_res = akg.lang.ascend.vadd(lhs, rhs)

    if dtype == "float16":
        var_res = topi.cast(var_res, "float16")
        accum_res = topi.cast(accum_res, "float16")
        accum_update_res = topi.cast(accum_update_res, "float16")

    return var_res, accum_res, accum_update_res
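
The same update written as plain NumPy, i.e. the textbook Adadelta rule that the tensor expressions implement (a sketch):

import numpy as np

def adadelta_step(var, accum, accum_update, grad, lr, rho, epsilon):
    accum = rho * accum + (1.0 - rho) * grad * grad
    update = np.sqrt(accum_update + epsilon) / np.sqrt(accum + epsilon) * grad
    var = var - lr * update
    accum_update = rho * accum_update + (1.0 - rho) * update * update
    return var, accum, accum_update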
Example #22
 def _sinh_taylor_compute(x):
     """sinh(x) value is x * (1 + x^2( 1/3! + x^2(1/5! + x^2/7!)))"""
     taylor_params = [
         tvm.const(0.1666666666666666666666666666666666, dtype),
         tvm.const(0.0083333333333333333333333333333333, dtype),
         tvm.const(0.0001984126984126984126984126984126, dtype)
     ]
     x_square = topi.multiply(x, x)
     sinh_taylor = tvm.compute(
         x.shape,
         lambda *indice: x(*indice) *
         (1 + x_square(*indice) *
          (taylor_params[0] + x_square(*indice) *
           (taylor_params[1] + x_square(*indice) * taylor_params[2]))),
         name="sinh_taylor")
     return sinh_taylor
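
The nested constants are 1/3!, 1/5! and 1/7!; a scalar sketch of the same polynomial:

import math

def sinh_taylor_reference(x):
    x2 = x * x
    return x * (1 + x2 * (1/6.0 + x2 * (1/120.0 + x2 / 5040.0)))

# close to math.sinh(x) for small |x|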
Example #23
 def kernel_ir(data, dst):
     ib = tvm.ir_builder.create()
     # axes before cumm-axis
     with ib.for_range_n(shape[:axis], "i0") as i0:
         # axes after cumm-axis
         with ib.for_range_n(shape[axis + 1:], "i1") as i1:
             idx_0 = i0 + [0] + i1 if not reverse else i0 + [
                 shape[axis] - 1
             ] + i1
             ib.store(
                 dst, idx_0,
                 tvm.const(1, data.dtype) if exclusive else ib.load(
                     data, idx_0))
             # iterate the cumm-axis to compute the cumulative product (start from 1)
             with ib.for_range(1, shape[axis], name="cum_idx") as m:
                 idx_pre = i0 + [m - 1] + i1 if not reverse else i0 + [
                     shape[axis] - m
                 ] + i1
                 idx_cur = i0 + [m] + i1 if not reverse else i0 + [
                     shape[axis] - 1 - m
                 ] + i1
                 ib.store(
                     dst, idx_cur,
                     ib.load(dst, idx_pre) *
                     ib.load(data, idx_pre if exclusive else idx_cur))
     return ib.get()
Example #24
def _atan_compute(data):
    """compute for atan"""
    dtype = data.dtype

    if dtype == "float16":
        data = topi.cast(data, "float32")

    abs_data = topi.abs(data)
    tensor_one = dc.one_const(abs_data.dtype)

    abs_data_sub_one = topi.subtract(abs_data, tensor_one)
    abs_data_add_one = topi.add(abs_data, tensor_one)
    abs_data2 = topi.abs(topi.divide(abs_data_sub_one, abs_data_add_one))

    # calculate result for data less than one
    res = _do_atan_taylor(abs_data)
    # calculate result for data greater than one
    res_mt_one = topi.add(_do_atan_taylor(abs_data2),
                          tvm.const(CONST_PI_BY_FOUR, abs_data2.dtype))
    res = topi.minimum(res, res_mt_one)

    if utils.product_is_mini() and data.dtype == "float32":
        sign_mask = topi.cast(topi.sign(topi.cast(data, "float16")), "float32")
    else:
        sign_mask = topi.sign(data)

    res = topi.multiply(res, sign_mask)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res
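
The second branch relies on the identity atan(x) = pi/4 + atan((x - 1)/(x + 1)) for x > 0; a scalar check with math.atan standing in for _do_atan_taylor:

import math

def atan_via_identity(x):
    # valid for x > 0
    return math.pi / 4.0 + math.atan((x - 1.0) / (x + 1.0))

# atan_via_identity(3.0) agrees with math.atan(3.0)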
Example #25
def _taylor_compute(data_x, x_square=None):
    """Do arcsinx compute use the 15th order taylor expansion when 0 <= x <= BOUNDARY."""

    if x_square is None:
        x_square = topi.multiply(data_x, data_x)
    # asin(x) = x + 1/6*x^3 + 3/40*x^5 + 5/112*x^7 + ... + 13!!/(14!!*15)*x^15
    res = topi.multiply(x_square, tvm.const(COEF[TAYLOR_COUNT], "float32"))
    for temp in reversed(range(TAYLOR_COUNT)):
        res = topi.add(res, tvm.const(COEF[temp], "float32"))
        if temp == 0:
            res = topi.multiply(res, data_x)
        else:
            res = topi.multiply(x_square, res)

    return res
Example #26
def _update_var(decay_gm, alpha, lr, grad, var):
    """Update var_out = var - lr * (alpha + decay_gm) * grad"""
    decay_gm_alpha = topi.add(decay_gm, alpha)
    res = topi.multiply(decay_gm_alpha, lr)
    res = topi.multiply(res, grad)
    res_neg = topi.multiply(res, tvm.const(-1, res.dtype))
    var_out = topi.add(var, res_neg)
    return var_out
Example #27
def _mean(data, axis, cof, shape):
    size = 1
    for i, _ in enumerate(axis):
        size = size * shape[axis[i]]
    cof = cof / tvm.const(size, "float32")
    tmp = topi.multiply(data, cof)
    res = topi.sum(tmp, axis)
    return res
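
Equivalent NumPy sketch: fold the 1/size factor into a scale applied before the reduction, so the sum needs no separate division.

import numpy as np

def mean_by_scaled_sum(data, axis, cof=1.0):
    size = 1
    for a in axis:
        size *= data.shape[a]
    return np.sum(data * (cof / size), axis=tuple(axis))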
Example #28
def xlogy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    do element-wise xlogy_grad compute

    Args:
        placeholders (Union[list, tuple]): the placeholder of data input
        shape_max (Union[list, tuple]): the shape of broadcast
        dtype (string): the type of data input
        rx (list): the reduction indices of data input with broadcast
        ry (list): the reduction indices for data input with broadcast

    Returns:
        output_y1 (tvm.tensor.Tensor): result of xlogy_grad
        output_y2 (tvm.tensor.Tensor): result of xlogy_grad
    """
    x1_ori = placeholders[0]
    x2_ori = placeholders[1]
    grad_ori = placeholders[2]

    if dtype == "float16":
        x1 = akg.lang.cce.cast_to(x1_ori, "float32")
        x2 = akg.lang.cce.cast_to(x2_ori, "float32")
        grad = akg.lang.cce.cast_to(grad_ori, "float32")
        x1 = akg.lang.cce.broadcast(x1, shape_max)
        x2 = akg.lang.cce.broadcast(x2, shape_max)
        grad = akg.lang.cce.broadcast(grad, shape_max)
    else:
        x1 = akg.lang.cce.broadcast(x1_ori, shape_max)
        x2 = akg.lang.cce.broadcast(x2_ori, shape_max)
        grad = akg.lang.cce.broadcast(grad_ori, shape_max)

    esp_min = tvm.const(1.18e-38, dtype="float32")
    x1_addespmin = akg.lang.cce.vadds(x1, esp_min)

    if utils.product_is_mini():
        not_zero_x1 = akg.lang.cce.vmul(x1, reciprocal(x1_addespmin))
        log_x2 = tvm.compute(
            x2.shape,
            lambda *i: (tvm.log(x2(*i).astype("float16"))).astype("float32"),
            name="log_x2")
    else:
        not_zero_x1 = div(x1, x1_addespmin)
        log_x2 = akg.lang.cce.vlog(x2)

    partial_x1 = akg.lang.cce.vmul(not_zero_x1, log_x2)
    partial_x1g = akg.lang.cce.vmul(partial_x1, grad)

    partial_x2 = div(x1, x2) if not utils.product_is_mini() else \
        akg.lang.cce.vmul(x1, reciprocal(x2))
    partial_x2g = akg.lang.cce.vmul(partial_x2, grad)

    output_y1 = akg.lang.cce.sum(partial_x1g, rx, keepdims=True)
    output_y2 = akg.lang.cce.sum(partial_x2g, ry, keepdims=True)

    if dtype == "float16":
        output_y1 = akg.lang.cce.cast_to(output_y1, "float16")
        output_y2 = akg.lang.cce.cast_to(output_y2, "float16")
    return output_y1, output_y2
Example #29
def fake_quant_with_min_max_args(input_data,
                                 min_=-6,
                                 max_=6,
                                 num_bits=8,
                                 narrow_range=False):
    """
    Fake-quantize the 'input_data' tensor of type float32 into an
    'output_data' tensor of the same type.

    output_data = floor(clamped_shifted * inv_nudged_scale + 0.5) * scale
                  + nudged_min
    scale = (max-min) / (quant_max-quant_min)

    Args:
        input_data (tvm.tensor.Tensor): Tensor of dtype "float32"
        min_ ([float, int]): scalar, defaults to -6
        max_ ([float, int]): scalar, defaults to 6. [min_; max_] define the
                             clamping range for the input_data data
        num_bits ([float, int]): Defaults to 8. num_bits is the bitwidth
                                 of the quantization, between 2 and 16
        narrow_range ([bool]):
            True, quantized into the quantization range [1; 2^num_bits - 1]
            False,quantized into the quantization range [0; 2^num_bits - 1]

    Returns:
        tvm.tensor.Tensor
    """
    shape = get_shape(input_data)
    utils.check_shape(shape)

    dtype = input_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, scale = nudge_min_max(min_, max_, num_bits,
                                                  narrow_range)

    zero_tensor = tvm.compute(input_data.shape,
                              lambda *i: tvm.const(0, dtype="float32"),
                              name="zero_tensor")
    nudged_max_tensor = topi.add(zero_tensor, nudged_max)
    nudged_min_tensor = topi.add(zero_tensor, nudged_min)
    inv_nudged_scale = 1.00 / scale

    # Transform the input between nudged_max and nudged_min
    clamped_vmin = topi.minimum(input_data, nudged_max_tensor)
    clamped = topi.maximum(clamped_vmin, nudged_min_tensor)

    # Calculate the quantized and dequantized results
    clamped_shifted = topi.subtract(clamped, nudged_min_tensor)
    vmul_shifted = topi.multiply(clamped_shifted, inv_nudged_scale)
    vadds_shifted = topi.add(vmul_shifted, 0.5)
    floor_vadds_shifted = floor(vadds_shifted)
    floor_cast = akg.lang.ascend.cast_to(floor_vadds_shifted, dtype)
    res_scale = topi.multiply(floor_cast, scale)
    res = topi.add(res_scale, nudged_min_tensor)

    return res
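
A NumPy sketch of the clamp / quantize / dequantize sequence above, assuming nudged_min, nudged_max and scale have already been produced by nudge_min_max (not shown here):

import numpy as np

def fake_quant_reference(x, nudged_min, nudged_max, scale):
    clamped = np.clip(x, nudged_min, nudged_max)
    quantized = np.floor((clamped - nudged_min) / scale + 0.5)  # round half up
    return quantized * scale + nudged_min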
Example #30
def TensorcoreConv(data,
                   weight,
                   stride=[1, 1],
                   pad=[0, 0, 0, 0],
                   dilation=[1, 1],
                   out_dtype="float32",
                   name="out",
                   target=utils.CUDA):
    batch, in_h, in_w, in_c = data.shape
    out_c, k_h, k_w, _ = weight.shape
    pad_top, pad_bottom, pad_left, pad_right = pad
    s_h, s_w = stride
    d_h, d_w = dilation
    k_h_d = (k_h - 1) * d_h + 1
    k_w_d = (k_w - 1) * d_w + 1
    o_h = (in_h + pad_top + pad_bottom - k_h_d) // s_h + 1
    o_w = (in_w + pad_left + pad_right - k_w_d) // s_w + 1

    has_pad = not (pad_left == 0 and pad_right == 0 and pad_top == 0
                   and pad_bottom == 0)

    if has_pad:
        data_pad = tvm.compute(
            (batch, in_h + pad_top + pad_bottom, in_w + pad_left + pad_right,
             in_c),
            lambda n, h, w, i: tvm.if_then_else(
                tvm.all(h >= pad_top, h - pad_top < in_h, w >= pad_left,
                        w - pad_left < in_w),
                data[n, h - pad_top, w - pad_left, i],
                tvm.const(0.0, "float16"),
            ),
            name="Pad",
        )
    else:
        data_pad = data

    rc = tvm.reduce_axis((0, in_c), name="rc")
    rh = tvm.reduce_axis((0, k_h), name="rh")
    rw = tvm.reduce_axis((0, k_w), name="rw")

    if out_dtype == "float32":
        out = tvm.compute(
            (batch, o_h, o_w, out_c),
            lambda n, h, w, o: tvm.sum(data_pad[n, (h * s_h + rh * d_h), (
                w * s_w + rw * d_w), rc].astype("float32") * weight[
                    o, rh, rw, rc].astype("float32"),
                                       axis=[rc, rh, rw]),
            name=name)
    else:
        out = tvm.compute(
            (batch, o_h, o_w, out_c),
            lambda n, h, w, o: tvm.sum(data_pad[n, (h * s_h + rh * d_h), (
                w * s_w + rw * d_w), rc] * weight[o, rh, rw, rc],
                                       axis=[rc, rh, rw]),
            name=name)

    return out