Example #1
def selu_compute(input_data):
    """selu compute implemention"""
    # if input_dtype is float16,convert it to float32
    dtype = input_data.dtype
    if dtype == "float16" or dtype == "float32":
        input_data = topi.cast(input_data, "float32")
        type_tmp = "float32"
    else:
        input_data = topi.cast(input_data, "float16")
        type_tmp = "float16"

    # generate tensor_zero to be compared
    tensor_zero = topi.multiply(input_data, tvm.const(0, dtype=type_tmp))
    # split the input into its negative part min(x, 0) and its positive part max(x, 0)
    negative_res = topi.minimum(input_data, tensor_zero)
    positive_res = topi.maximum(input_data, tensor_zero)
    exp_res = exp(negative_res)
    sub_res = topi.add(exp_res, tvm.const(SCALAR_NEGATIVE_ONE, dtype=type_tmp))
    negative_muls_res = topi.multiply(sub_res, tvm.const(SCALE_ALPHA_PRODUCT, dtype=type_tmp))
    if dtype == "int8":
        negative_muls_res = akg.lang.cce.ceil(negative_muls_res)

    positive_muls_res = topi.multiply(positive_res, tvm.const(SCALE, dtype=type_tmp))
    res = topi.add(negative_muls_res, positive_muls_res)
    # cast to ori_dtype
    if dtype == "float16" or dtype == "int8" or dtype == "int32":
        res = topi.cast(res, dtype)

    return res
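For reference, the kernel above implements the standard SELU formula, selu(x) = SCALE * max(x, 0) + SCALE_ALPHA_PRODUCT * (exp(min(x, 0)) - 1). A minimal NumPy sketch follows; the constant values are the usual SELU constants and are an assumption, since the module-level SCALE, SCALE_ALPHA_PRODUCT and SCALAR_NEGATIVE_ONE are not shown in the snippet:

import numpy as np

# assumed values for the module-level constants used by selu_compute
SCALE = 1.0507009873554805                # lambda from the SELU paper
SCALE_ALPHA_PRODUCT = 1.7580993408473766  # lambda * alpha
SCALAR_NEGATIVE_ONE = -1

def selu_reference(x):
    """NumPy reference: SCALE * x for x > 0, SCALE * alpha * (exp(x) - 1) otherwise."""
    x = x.astype("float32")
    neg = np.minimum(x, 0.0)
    pos = np.maximum(x, 0.0)
    return SCALE_ALPHA_PRODUCT * (np.exp(neg) + SCALAR_NEGATIVE_ONE) + SCALE * pos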
Example #2
def fused_bn_follow_relu_avgpool(data0, data1, data2, data3, data4, data5, layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    input:
    data: 6 tensors in total
    data0: tensor1 after bn_double_relu
    data1-5: bn parameters for conv2d tensor2
    layout: only (N, H, W, C), (N, C, H, W) supported
    out_dtype: float16

    output:
    avg-pooling( max(batch-normalized tensor1 + batch-normalized tensor2,  0) )
    """
    if layout == 'NCHW':
        data0 = topi.transpose(data0, (0, 2, 3, 1))
        data5 = topi.transpose(data5, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))

    n, h, w, c = data0.shape
    inter_dtype = 'float32'
    add0 = fused_bn_follow(data1, data2, data3, data4, data5)
    add0 = topi.cast(add0, data0.dtype)
    add1 = topi.add(data0, add0)
    output = topi.maximum(add1, 0)
    output = topi.cast(output, inter_dtype)
    output = topi.sum(output, axis=(1, 2))
    output = topi.divide(output, h * w)
    output = topi.cast(output, out_dtype)

    return output
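A minimal NumPy sketch of the epilogue this kernel fuses (add, ReLU, then global average pooling over H and W), assuming NHWC inputs and using bn2 as a stand-in for the result of fused_bn_follow(data1, ..., data5), which is not shown here:

import numpy as np

def relu_avgpool_reference(x, bn2):
    """x: NHWC float16 tensor1; bn2: stand-in for the batch-normalized tensor2."""
    out = np.maximum(x + bn2.astype(x.dtype), 0)    # add + ReLU
    out = out.astype("float32").mean(axis=(1, 2))   # average pool over H and W
    return out.astype("float16")                    # shape (N, C)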
Example #3
def fused_bn_follow_relu(data0, data1, data2, data3, data4, layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    input:
    data0-4: bn parameters for conv2d tensor, length is 5
    data0: param0 beta
    data1: param1 gamma
    data2: param2 BNupdate: xi_variance
    data3: param6 BNreduce: xi_mean
    data4: param7 xi_conv2d, float16
    layout: only (N, H, W, C), (N, C, H, W) supported
    out_dtype: float16

    output:
    ReLU: max(batch-normalized tensor,  0)
    """
    if layout == 'NCHW':
        data4 = topi.transpose(data4, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))

    add0 = fused_bn_follow(data0, data1, data2, data3, data4)
    add0 = topi.cast(add0, out_dtype)
    output = topi.maximum(add0, 0)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output
Example #4
def maximum(data1, data2, target=utils.CCE):
    """
    Take element-wise maximum of two tensors with auto-broadcasting.

    Args:
        data1: tvm.tensor.Tensor
        data2: tvm.tensor.Tensor

    Returns:
        tvm.tensor.Tensor of maximum of two tensors.

    Supported Platforms:
        'Ascend', 'GPU', 'CPU'
    """
    utils.check_supported_target(target)
    shape1 = [x.value for x in data1.shape]
    shape2 = [x.value for x in data2.shape]
    utils.check_shape(shape1)
    utils.check_shape(shape2)
    utils.auto_broadcast_check(shape1, shape2)
    utils.elemwise_dtype_check(data1.dtype, data2.dtype)

    dtype = data1.dtype
    need_cast = target == utils.CCE and dtype in ["int8", "uint8"]
    if need_cast:
        data1 = Cast(data1, "float16")
        data2 = Cast(data2, "float16")
    res = topi.maximum(data1, data2)
    if need_cast:
        res = Cast(res, dtype)
    return res
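The kernel follows NumPy-style broadcasting; on CCE it routes int8/uint8 inputs through float16 and casts back afterwards, whereas a plain NumPy reference needs no such detour. A small usage sketch of the equivalent semantics (array values chosen arbitrarily):

import numpy as np

a = np.arange(6, dtype="int8").reshape(2, 3)
b = np.array([2], dtype="int8")     # broadcast against the last axis
print(np.maximum(a, b))             # [[2 2 2]
                                    #  [3 4 5]]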
Example #5
def fake_quant_with_min_max_args(input_data,
                                 min_=-6,
                                 max_=6,
                                 num_bits=8,
                                 narrow_range=False):
    """
    Fake-quantize the float32 'input_data' tensor into an
    'output_data' tensor of the same type.

    output_data = floor(clamped_shifted * inv_nudged_scale + 0.5f) * scale
                  + nudged_min
    scale = (max-min) / (quant_max-quant_min)

    Args:
        input_data (tvm.tensor.Tensor): Tensor of dtype "float32"
        min_ ([float, int]): scalar, defaults to -6
        max_ ([float, int]): scalar, defaults to 6. [min_; max_] define the
                             clamping range for the input data
        num_bits ([float, int]): Defaults to 8. num_bits is the bitwidth
                                 of the quantization, between 2 and 16
        narrow_range ([bool]):
            True, quantize into the range [1; 2^num_bits - 1]
            False, quantize into the range [0; 2^num_bits - 1]

    Returns:
        tvm.tensor.Tensor
    """
    shape = get_shape(input_data)
    utils.check_shape(shape)

    dtype = input_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, scale = nudge_min_max(min_, max_, num_bits,
                                                  narrow_range)

    zero_tensor = tvm.compute(input_data.shape,
                              lambda *i: tvm.const(0, dtype="float32"),
                              name="zero_tensor")
    nudged_max_tensor = topi.add(zero_tensor, nudged_max)
    nudged_min_tensor = topi.add(zero_tensor, nudged_min)
    inv_nudged_scale = 1.00 / scale

    # Transform the input between nudged_max and nudged_min
    clamped_vmin = topi.minimum(input_data, nudged_max_tensor)
    clamped = topi.maximum(clamped_vmin, nudged_min_tensor)

    # Calculate the quantized and dequantized results
    clamped_shifted = topi.subtract(clamped, nudged_min_tensor)
    vmul_shifted = topi.multiply(clamped_shifted, inv_nudged_scale)
    vadds_shifted = topi.add(vmul_shifted, 0.5)
    floor_vadds_shifted = floor(vadds_shifted)
    floor_cast = akg.lang.ascend.cast_to(floor_vadds_shifted, dtype)
    res_scale = topi.multiply(floor_cast, scale)
    res = topi.add(res_scale, nudged_min_tensor)

    return res
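The helper nudge_min_max is not shown above. A NumPy sketch of the whole computation, using the usual TensorFlow-style nudging as an assumption about what that helper does, and otherwise following the formula in the docstring:

import numpy as np

def fake_quant_reference(x, min_=-6.0, max_=6.0, num_bits=8, narrow_range=False):
    """NumPy sketch; the nudging below is an assumption about nudge_min_max."""
    quant_min = 1 if narrow_range else 0
    quant_max = 2 ** num_bits - 1
    scale = (max_ - min_) / (quant_max - quant_min)
    # nudge the zero point onto the integer grid, then derive the nudged range
    zero_point = np.clip(np.round(quant_min - min_ / scale), quant_min, quant_max)
    nudged_min = (quant_min - zero_point) * scale
    nudged_max = (quant_max - zero_point) * scale
    clamped = np.clip(x.astype("float32"), nudged_min, nudged_max)
    return np.floor((clamped - nudged_min) / scale + 0.5) * scale + nudged_min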
Example #6
def my_dsl(dtype, kernel_name, attrs):
    m = tvm.var("M")
    n = tvm.var("N")
    A = tvm.placeholder((m, ), name="A", dtype=dtype)
    B = tvm.placeholder((m, ), name="B", dtype=dtype)

    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    if insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)

    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        C = topi.sqrt(A)
        C = topi.log(A)
    elif insn == "sqrt":
        C = topi.sqrt(A)

    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)

    # C = tvm.compute((m, ), lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule([C.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        if insnType == "binary":
            mod = akg.build(s, [A, B, C],
                            "cce",
                            name=kernel_name,
                            attrs=attrs,
                            polyhedral=True)
        else:
            mod = akg.build(s, [A, C],
                            "cce",
                            name=kernel_name,
                            attrs=attrs,
                            polyhedral=True)
    return mod
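Note that insn, insnType and cce are not defined inside this snippet; they appear to be module-level names in the original file. A hypothetical driver, under that assumption, might look like:

insn = "add"         # hypothetical module-level flag: which operator my_dsl emits
insnType = "binary"  # binary ops consume A and B; unary ops consume only A
mod = my_dsl("float16", "my_add_kernel", attrs={})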
Example #7
def _apply_ada_max_compute(var, m, v, grad, lr, beta1, beta1_power, beta2,
                           epsilon):
    """Compute ada_max."""
    # cast to float32 for improved accuracy
    inp_dtype = var.dtype
    if inp_dtype == 'float16':
        var = topi.cast(var, 'float32')
        m = topi.cast(m, 'float32')
        v = topi.cast(v, 'float32')
        lr = topi.cast(lr, 'float32')
        beta1_power = topi.cast(beta1_power, 'float32')
        beta1 = topi.cast(beta1, 'float32')
        beta2 = topi.cast(beta2, 'float32')
        grad = topi.cast(grad, 'float32')
    epsilon = tvm.const(epsilon, 'float32')

    # m += (grad - m) * (1 - beta1)
    rhs = tvm.compute(beta1.shape,
                      lambda *i: beta1(*i) * neg_one_const("float32"))
    rhs = tvm.compute(rhs.shape, lambda *i: rhs(*i) + one_const("float32"))
    lhs = topi.subtract(grad, m)
    rhs = tvm.compute(lhs.shape, lambda *i: lhs(*i) * rhs[0])
    m = topi.add(m, rhs)

    # v = max(beta2*v, abs(grad))
    lhs = tvm.compute(v.shape, lambda *i: v(*i) * beta2[0])
    rhs = topi.abs(grad)
    v = topi.maximum(lhs, rhs)

    # var -= lr * m / ((1 - beta1_power) * (v + epsilon))
    # computed as (lr * m) * reciprocal((1 - beta1_power) * (v + epsilon))
    # v + epsilon
    rhs = tvm.compute(v.shape, lambda *i: v(*i) + epsilon)
    # 1 - beta1_power
    lhs = tvm.compute(beta1_power.shape,
                      lambda *i: beta1_power(*i) * neg_one_const("float32"))
    lhs = tvm.compute(lhs.shape, lambda *i: lhs(*i) + one_const("float32"))
    # (1 - beta1_power) * (v + epsilon)
    rhs = tvm.compute(rhs.shape, lambda *i: rhs(*i) * lhs[0])
    # lr * m
    lhs = tvm.compute(m.shape, lambda *i: m(*i) * lr[0])
    # lr * m / ((1 - beta1_power) * (v + epsilon))
    rhs = reciprocal(rhs)
    rhs = topi.multiply(lhs, rhs)
    var = topi.subtract(var, rhs)

    if inp_dtype == 'float16':
        var = topi.cast(var, inp_dtype)
        m = topi.cast(m, inp_dtype)
        v = topi.cast(v, inp_dtype)

    return var, m, v
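Stripped of the tvm.compute plumbing, the update above is the AdaMax rule. A NumPy sketch with plain float32 scalars in place of the one-element lr, beta1, beta2 and beta1_power tensors:

import numpy as np

def ada_max_reference(var, m, v, grad, lr, beta1, beta1_power, beta2, epsilon):
    """NumPy sketch of the update computed by _apply_ada_max_compute."""
    m = m + (grad - m) * (1.0 - beta1)        # m += (grad - m) * (1 - beta1)
    v = np.maximum(beta2 * v, np.abs(grad))   # v = max(beta2 * v, |grad|)
    var = var - lr * m / ((1.0 - beta1_power) * (v + epsilon))
    return var, m, v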
Example #8
def _cmpare_value(input_data, nudged_min, nudged_max):
    """
    where((input_data <= nudged_max) & (input_data >= nudged_min), 1, 0)

    Args:  
        input_data (tvm.tensor.Tensor): Input data
        nudged_min (tvm.tensor.Tensor): Minimum value of comparison
        nudged_max (tvm.tensor.Tensor): Maximum value of comparison

    Returns:
        tvm.tensor.Tensor
    """
    min_value = tvm.const(2**(-126), dtype="float32")
    # (2**(-126))*(2**(62))*(2**(62))*(2**(2)) = 1
    # so min_value*max_value*max_value*max_value_one = 1
    max_value = tvm.const(2**(62), dtype="float32")
    max_value_one = tvm.const(2**(2), dtype="float32")
    data_zero = topi.multiply(input_data, 0)
    max_value_tensor = topi.add(data_zero, max_value)
    min_value_tensor = topi.add(data_zero, min_value)
    max_value_one_tensor = topi.add(data_zero, max_value_one)

    sub_tmp = topi.subtract(input_data, nudged_min)
    sub_min = topi.add(sub_tmp, min_value)
    vmax_tmp = topi.maximum(sub_min, data_zero)

    sub_tmp_max = topi.subtract(nudged_max, input_data)
    sub_max = topi.add(sub_tmp_max, min_value)
    vmin_tmp = topi.maximum(sub_max, data_zero)

    one_tmp = topi.multiply(vmax_tmp, vmin_tmp)
    one_min = topi.minimum(one_tmp, min_value_tensor)

    vmul_max_value = topi.multiply(one_min, max_value_tensor)
    vmul_max_value_one = topi.multiply(vmul_max_value, max_value_tensor)
    between_nudged_min_max = topi.multiply(vmul_max_value_one,
                                           max_value_one_tensor)

    return between_nudged_min_max
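The min/max arithmetic above builds a 0/1 mask without any comparison instruction: the clamped products land on either 0 or 2**(-126), and the three multiplications by 2**62, 2**62 and 2**2 scale the latter back up to exactly 1. The NumPy equivalent is simply:

import numpy as np

def between_reference(x, nudged_min, nudged_max):
    """1.0 where nudged_min <= x <= nudged_max, else 0.0."""
    return ((x >= nudged_min) & (x <= nudged_max)).astype("float32")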
Example #9
def truncate_div_compute(input_x1, input_x2):
    """compute for truncate_div"""
    int_list = ("int32", "int8", "uint8")

    if input_x1.dtype in int_list:
        data_zero = dc.zero_const("float32")
        data_x_broad = cast(input_x1, "float32")
        data_y_broad = cast(input_x2, "float32")
        res_div = topi.divide(data_x_broad, data_y_broad)
        res_min_int = ceil(topi.minimum(res_div, data_zero))
        res_max_int = floor(topi.maximum(res_div, data_zero))
        res_trunc = topi.add(res_min_int, res_max_int)
        res_trunc = cast(res_trunc, "float32")
    else:
        res_trunc = topi.divide(input_x1, input_x2)

    return cast(res_trunc, input_x1.dtype)
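In other words, integer inputs are divided in float32 and rounded toward zero, while floating-point inputs are divided as-is; the result is cast back to the dtype of the first operand. A NumPy equivalent:

import numpy as np

def truncate_div_reference(x1, x2):
    """NumPy equivalent of truncate_div_compute."""
    if x1.dtype in ("int32", "int8", "uint8"):
        res = np.trunc(x1.astype("float32") / x2.astype("float32"))  # round toward zero
    else:
        res = x1 / x2
    return res.astype(x1.dtype)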
Example #10
def less_compare_float32(data_x, data_y):
    """if x is less than y, then return 1, else return 0"""
    shape_inputs = get_shape(data_x)
    # smallest positive normal float32 value: 2**(-126)
    data_min = akg.lang.ascend.broadcast(tvm.const(2**(-126), dtype="float32"),
                                         shape_inputs, "float32")
    data_zero = akg.lang.ascend.broadcast(dc.zero_const("float32"),
                                          shape_inputs, "float32")
    res_sub = topi.subtract(data_y, data_x)
    res_min = topi.minimum(res_sub, data_min)
    res_max = topi.maximum(res_min, data_zero)
    # scaling the result back up to 1 requires a factor of 2**126,
    # but cce constants only go up to 2**62, so 2**62 * 2**62 * 2**2
    # is used instead (62 + 62 + 2 = 126)
    res_mul_first = topi.multiply(res_max, tvm.const(2**62, dtype="float32"))
    res_mul_second = topi.multiply(res_mul_first,
                                   tvm.const(2**62, dtype="float32"))
    res = topi.multiply(res_mul_second, tvm.const(2**2, dtype="float32"))

    return res
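The clamp-and-rescale trick above (clamping y - x into [0, 2**(-126)] and multiplying by 2**62 * 2**62 * 2**2 = 2**126) emulates an element-wise less-than without a comparison instruction. The NumPy equivalent:

import numpy as np

def less_reference(x, y):
    """1.0 where x < y, else 0.0, as float32."""
    return (x < y).astype("float32")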
Example #11
def fused_bn_double_follow_relu(data0,
                                data1,
                                data2,
                                data3,
                                data4,
                                data5,
                                data6,
                                data7,
                                data8,
                                data9,
                                layout='NHWC',
                                out_dtype='float16',
                                target=utils.CUDA):
    """
    input:
    data: 10 tensors in total, two groups of 5
    data0-4: bn parameters for conv2d tensor 1
    data5-9: bn parameters for conv2d tensor 2
    layout: only (N, H, W, C), (N, C, H, W) supported
    out_dtype: float16

    output:
    ReLU: max(batch-normalized tensor1 + batch-normalized tensor2,  0)
    """

    if layout == 'NCHW':
        data4 = topi.transpose(data4, (0, 2, 3, 1))
        data9 = topi.transpose(data9, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    add0 = fused_bn_follow(data0, data1, data2, data3, data4)
    add1 = fused_bn_follow(data5, data6, data7, data8, data9)
    add0 = topi.cast(add0, out_dtype)
    add1 = topi.cast(add1, out_dtype)
    add2 = topi.add(add0, add1)
    output = topi.maximum(add2, 0)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output
Example #12
def maximum(data1, data2):
    """
    Take element-wise maximum of two tensors with auto-broadcasting.

    Args:
        data1: tvm.tensor.Tensor
        data2: tvm.tensor.Tensor

    Returns:
        tvm.tensor.Tensor of maximum of two tensors.
    """
    shape1 = [x.value for x in data1.shape]
    shape2 = [x.value for x in data2.shape]
    vc_util.check_shape(shape1)
    vc_util.check_shape(shape2)
    vc_util.auto_broadcast_check(shape1, shape2)
    vc_util.elemwise_dtype_check(data1.dtype, data2.dtype)

    res = topi.maximum(data1, data2)
    return res
Example #13
def fake_quant_with_min_max_vars_per_channel_compute(input_data,
                                                     input_min,
                                                     input_max,
                                                     num_bits=8,
                                                     narrow_range=False):
    """fake_quant_with_min_max_vars_per_channel compute implemention"""
    shape = get_shape(input_data.shape)
    dtype = input_data.dtype
    min_broadcast = akg.lang.ascend.broadcast(input_min, shape, dtype)
    max_broadcast = akg.lang.ascend.broadcast(input_max, shape, dtype)
    # get nudged_min and nudged_max by nudged_min_max_compute function
    nudged_min_nudged_max = nudged_min_max_compute(min_broadcast,
                                                   max_broadcast, num_bits,
                                                   narrow_range)
    # transform the input between nudged_max and nudged_min
    clamped_tmp = topi.minimum(input_data, nudged_min_nudged_max[1])
    clamped = topi.maximum(clamped_tmp, nudged_min_nudged_max[0])

    # calculate the quantized and dequantized results
    clamped_shifted = topi.subtract(clamped, nudged_min_nudged_max[0])
    if product_is_mini():
        clamped_shifted_div_scale = mul(clamped_shifted,
                                        reciprocal(nudged_min_nudged_max[2]),
                                        target=utils.CCE)
    else:
        clamped_shifted_div_scale = Divide(clamped_shifted,
                                           nudged_min_nudged_max[2],
                                           target=utils.CCE)
    result_tmp = topi.add(clamped_shifted_div_scale, dc.half_const(dtype))
    floor_result_tmp = akg.lang.ascend.floor(result_tmp)
    if product_is_mini():
        floor_result_tmp = topi.cast(floor_result_tmp, "float16")

    floor_result_tmp = topi.cast(floor_result_tmp, "float32")
    scale_product = topi.multiply(floor_result_tmp, nudged_min_nudged_max[2])
    tmp_res = topi.add(scale_product, nudged_min_nudged_max[0])
    # get bool_both_zero_value by bool_both_zero_compute function
    bool_both_zero_value = bool_both_zero_compute(min_broadcast, max_broadcast)
    res = topi.multiply(tmp_res, bool_both_zero_value)

    return res
Example #14
def _apply_adagrad_da_compute(var, gradient_accum, gradient_squared_accum,
                              grad, lr, l1, l2, global_step):
    """Compute adagrad_da."""
    dtype = var.dtype
    # cast to float32 for higher precision
    if dtype == "float16":
        gradient_accum = topi.cast(gradient_accum, "float32")
        gradient_squared_accum = topi.cast(gradient_squared_accum, "float32")
        grad = topi.cast(grad, "float32")
        lr = topi.cast(lr, "float32")
        l1 = topi.cast(l1, "float32")
        l2 = topi.cast(l2, "float32")
    if product_is_mini():
        global_step = topi.cast(global_step, "float16")
        global_step = topi.cast(global_step, "float32")
    else:
        global_step = topi.cast(global_step, "float32")

    # 1.grad_accum += grad
    gradient_accum = topi.add(gradient_accum, grad)

    # 2.grad_squared_accum += grad * grad
    gs = topi.multiply(grad, grad)
    gradient_squared_accum = topi.add(gradient_squared_accum, gs)

    # 3.if l1 > 0: tmp_val = Sign(grad_accum) * max(|grad_accum|-l1*global_step, 0)
    #   else:      tmp_val = grad_accum
    sign_val = Sign(gradient_accum)
    abs_val = topi.abs(gradient_accum)
    mul_val = topi.multiply(global_step, l1)
    sub_val = topi.subtract(abs_val, mul_val)
    max_val = topi.maximum(sub_val, tvm.const(0, sub_val.dtype))
    tmp_val = topi.multiply(sign_val, max_val)

    def select(l1, tmp_val, gradient_accum):
        """Returns tmp_val if l1 > 0 else gradient_accum."""
        if product_is_mini():
            l1 = topi.cast(l1, "float16")
            tmp_val = topi.cast(tmp_val, "float16")
            gradient_accum = topi.cast(gradient_accum, "float16")
        tmp_val = akg.tvm.compute(
            tmp_val.shape, lambda *i: tvm.expr.Select(l1[0] > 0, tmp_val(*i),
                                                      gradient_accum(*i)))
        return topi.cast(tmp_val, "float32") if product_is_mini() else tmp_val

    tmp_val = select(l1, tmp_val, gradient_accum)

    # 4.x_value = -1 * lr * tmp_val
    x_value = topi.multiply(lr, tvm.const(-1, "float32"))
    x_value = topi.multiply(x_value, tmp_val)

    # 5.y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    pro_val = topi.multiply(l2, global_step)
    pro_val = topi.multiply(pro_val, lr)
    sqrt_val = sqrt(gradient_squared_accum, target=utils.CCE)
    y_value = topi.add(pro_val, sqrt_val)

    # 6.var = x_value / y_value
    if product_is_mini():
        y_rec = reciprocal(y_value, target=utils.CCE)
        var_out = topi.multiply(x_value, y_rec)
    else:
        var_out = topi.divide(x_value, y_value)

    if dtype == "float16":
        var_out = akg.lang.ascend.cast_to(var_out, "float16")
        gradient_accum = akg.lang.ascend.cast_to(gradient_accum, "float16")
        gradient_squared_accum = akg.lang.ascend.cast_to(
            gradient_squared_accum, "float16")

    return var_out, gradient_accum, gradient_squared_accum
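Ignoring the mini-product casting details, the kernel implements the Adagrad-DA update. A NumPy sketch with plain scalars in place of the one-element lr, l1, l2 and global_step tensors:

import numpy as np

def adagrad_da_reference(var, grad_accum, grad_squared_accum, grad,
                         lr, l1, l2, global_step):
    """NumPy sketch of the update computed by _apply_adagrad_da_compute."""
    grad_accum = grad_accum + grad
    grad_squared_accum = grad_squared_accum + grad * grad
    if l1 > 0:
        tmp = np.sign(grad_accum) * np.maximum(np.abs(grad_accum) - l1 * global_step, 0.0)
    else:
        tmp = grad_accum
    x_value = -lr * tmp
    y_value = l2 * global_step * lr + np.sqrt(grad_squared_accum)
    var = x_value / y_value
    return var, grad_accum, grad_squared_accum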