Example n. 1
def _tiling_axis(shape, dtype):
    """
    Calculate the tile parameters.

    Parameters
    ----------
    shape: list or tuple
        the shape of tensor.
    dtype: str
        the dtype of tensor.

    Returns
    -------
    split_axis: int
        the target axis used to tile the tensor.
    split_factor: int
        the factor used when tiling the target axis.
    """
    total_ele, ele_each_block, _ = _get_public_param(dtype)

    tiling_shape = list(shape)
    if shape[-1] % ele_each_block != 0:
        last_ele = ((shape[-1] + ele_each_block - 1) //
                    ele_each_block) * ele_each_block
        tiling_shape[-1] = int(last_ele)

    split_axis = 0
    split_factor = 1
    for index, _ in enumerate(tiling_shape):
        ele_cnt = function_reduce(lambda x, y: x * y, tiling_shape[index:])
        if ele_cnt <= total_ele:
            split_axis = index - 1
            split_factor = total_ele // ele_cnt
            break
        elif index == len(tiling_shape) - 1:
            split_axis = index
            split_factor = total_ele
            break

    if split_axis < 0:
        split_axis = 0
        split_factor = tiling_shape[0]

    return split_axis, split_factor
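
A minimal standalone sketch of the core search above, assuming function_reduce is functools.reduce and substituting a hypothetical UB budget for _get_public_param:

from functools import reduce as function_reduce

# Hypothetical stand-ins for _get_public_param("float16"): a budget of
# 64K elements in UB, 16 fp16 elements per 32-byte block.
total_ele, ele_each_block = 64 * 1024, 16

shape = [32, 1024, 100]
tiling_shape = list(shape)
# Round the last dim up to a whole block, as _tiling_axis does.
tiling_shape[-1] = ((shape[-1] + ele_each_block - 1) //
                    ele_each_block * ele_each_block)

# Walk the axes left to right; the first suffix that fits in UB fixes
# the split axis (one to its left) and the split factor.
for index in range(len(tiling_shape)):
    ele_cnt = function_reduce(lambda x, y: x * y, tiling_shape[index:])
    if ele_cnt <= total_ele:
        print(index - 1, total_ele // ele_cnt)  # prints: 1 585
        break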
Example n. 2
def _tilling_axis(shape, dtype):
    """
    Calculate the split parameters according to different shapes.

    Parameters
    ----------
    shape : list or tuple
        shape of tensor
    dtype : string
        buffer data type

    Returns
    -------
    split_axis : the target axis used for splitting the tensor, chosen so
        that the maximum amount of data can be stored and processed on UB
        each time.
    split_factor : the factor used when splitting the target axis.
        For example, for float16 data, [1024, 1024, 256] will be split to
        [1024, 7, 164, 256], and UB processes 164*256 elements each time.
        In this case, the split_axis is 1 and the split_factor is 164.
    """

    # Number of tensors in assign_sub
    tensor_num = 2

    # ub_size_bytes is the size of the UB, expressed in bytes.
    ub_size_bytes = cce.CceProductParams().getParams("Unified_Buffer")

    # dtype_bytes_size for float16 is 2, for float32 is 4
    dtype_bytes_size = cce.cce_intrin.get_bit_len(dtype) // BYTES_TO_BITS
    # total_ele is the maximum amount of data that can be stored in UB.
    if dtype in ("int8", "uint8"):
        dtype_bytes_size_fp16 = cce.cce_intrin.get_bit_len(
            "float16") // BYTES_TO_BITS
        total_ele = ub_size_bytes // (dtype_bytes_size +
                                      dtype_bytes_size_fp16) // tensor_num
    else:
        total_ele = ub_size_bytes // dtype_bytes_size // tensor_num

    shape_value = shape[-1]
    if dtype in ("int8", "uint8"):
        bytes_size = dtype_bytes_size + dtype_bytes_size_fp16
    else:
        bytes_size = dtype_bytes_size

    ele_num = total_ele // 16 * (shape_value * bytes_size // SHAPE_THREHOLD +
                                 1)

    if ele_num > total_ele // 2:
        total_ele = total_ele // 2
    else:
        total_ele = total_ele // 16 * \
                    (shape_value * bytes_size // SHAPE_THREHOLD // 2 + 1)

    # To initialize the split_axis and the split_factor.
    split_axis = 0
    split_factor = 1

    # To find the appropriate axis from the first one to the last
    # by comparing the amount of the elements of the split tensor with
    # the maximum amount of data that can be stored in UB.
    for index, _ in enumerate(shape):
        ele_cnt = function_reduce(lambda x, y: x * y, shape[index:])
        if ele_cnt <= total_ele:
            split_axis = index - 1
            split_factor = total_ele // ele_cnt
            break

    # when the last axis is still over the size of UB, we choose to split the
    # last axis, and the split_factor is set as the maximum amount of data
    # that can be stored in UB.
    if shape[-1] > total_ele:
        split_axis = len(shape) - 1
        split_factor = (total_ele // TILING_SIZE) * TILING_SIZE

    # When the whole tensor fits in UB, the loop above has set split_axis
    # to -1; reset split_axis to 0 and split_factor to shape[0] so the
    # tensor is processed in one pass.
    if split_axis < 0:
        split_axis = 0
        split_factor = shape[0]

    return split_axis, split_factor
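
A worked sketch of the int8 UB-budget arithmetic above, under a hypothetical 248 KB Unified Buffer (the real value comes from CceProductParams):

BYTES_TO_BITS = 8
ub_size_bytes = 248 * 1024                   # hypothetical UB size
tensor_num = 2                               # two tensors live in UB

dtype_bytes_size = 8 // BYTES_TO_BITS        # int8 -> 1 byte
dtype_bytes_size_fp16 = 16 // BYTES_TO_BITS  # fp16 -> 2 bytes

# int8 data needs room for itself plus its float16 working copy, so
# each element effectively costs 3 bytes per tensor.
total_ele = ub_size_bytes // (dtype_bytes_size +
                              dtype_bytes_size_fp16) // tensor_num
print(total_ele)  # prints: 42325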
Example n. 3
def ascend_requant_compute(x,
                           req_scale,
                           y,
                           relu_flag=False,
                           kernel_name='ascend_requant'):
    """
    int32 -> int8

    Parameters
    ----------
    x : the placeholder of input

    req_scale: the placeholder of requant num

    y : the dict of output.

    relu_flag : when True, apply ReLU to the result

    kernel_name : cce kernel name, default value is "ascend_requant"

    Returns
    -------
    res : the result of ascend_requant
    """

    x_shape = x.shape
    x_shape_list = te.lang.cce.util.shape_to_list(x_shape)
    align_shape = x_shape_list.copy()

    # decide whether req_scale is a scalar or a per-channel vector based
    # on its original shape
    ori_shape_req = req_scale.op.attrs['ori_shape']
    ori_shape_req_list = te.lang.cce.util.shape_to_list(ori_shape_req)
    req_dim = function_reduce(lambda x, y: x * y, ori_shape_req_list[:])
    tensor_flag = False
    if req_dim > 1:
        tensor_flag = True

    c1_index = 1
    if _is_nz_format(x):
        c1_index = len(x_shape) - 4

    if x.op.tag == "depthwise_conv2d":
        align_shape[4] = 16
        align_shape[3] = (x_shape_list[3] + 15) // 16 * 16
        align_shape[2] = 1
        if tensor_flag:
            align_shape[1] = (x_shape_list[1] * x_shape_list[2] * 16 + 31) \
                             // 32 * 32 // 16
        else:
            align_shape[1] = x_shape_list[1] * x_shape_list[2]
        align_shape[0] = x_shape_list[0]

        if tensor_flag:
            res_ub = tvm.compute(
                align_shape,
                lambda i, j, a, k, l: tvm.vdeq_cast(x(i, j // 2, j % 2, k, l),
                                                    req_scale(0, j, 0, 0, l),
                                                    "int8",
                                                    do_relu=relu_flag),
                name='s32_to_s8',
                tag="requant_vector")
        else:
            res_ub = tvm.compute(
                align_shape,
                lambda i, j, a, k, l: tvm.deq_cast(x(
                    i, j // 2, j % 2, k, l), req_scale(0, 0, 0, 0, 0), "int8"),
                name='s32_to_s8',
                tag="requant_scale")
    else:
        align_shape[c1_index] = (align_shape[c1_index] + 1) // 2 * 2
        align_shape[-2] = (align_shape[-2] + 15) // 16 * 16
        res_ub = _s32_to_s8_normal_compute(x, req_scale, align_shape, c1_index,
                                           tensor_flag, relu_flag)

    if _is_nz_format(x):
        res = _format_transfer_nz(align_shape, res_ub, c1_index)
        return res

    res_ub_reform = _format_transfer(align_shape, res_ub, c1_index)
    res_shape = te.lang.cce.util.shape_to_list(res_ub_reform.shape)

    res_shape[-2] = x.shape[-2]

    res = tvm.compute(res_shape,
                      lambda *indice: res_ub_reform(*indice),
                      name='requant_remove_pad',
                      tag="requant_remove_pad")
    return res
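
The expressions (x + 15) // 16 * 16 and (c1 + 1) // 2 * 2 above are both the ceiling-to-multiple idiom; a small sketch with a hypothetical helper name:

def ceil_align(n, block=16):
    # Round n up to the next multiple of block; the code above inlines
    # this as (n + 15) // 16 * 16 and (n + 1) // 2 * 2.
    return (n + block - 1) // block * block

assert ceil_align(1) == 16
assert ceil_align(16) == 16
assert ceil_align(17) == 32
# block=2 is the C1 case: C1 is paired, likely so two 16-wide C0 blocks
# form one int8 fractal (note x is indexed as x(i, j // 2, j % 2, k, l)).
assert ceil_align(3, block=2) == 4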
Example n. 4
def ascend_dequant_s16_compute(x0,
                               deq_scale,
                               x1,
                               y,
                               relu_flag=False,
                               kernel_name='ascend_dequant_s16'):
    """
    int32 -> int16

    Parameters
    ----------
    x0 : the placeholder of input

    deq_scale : the placeholder of dequant num

    x1 : the placeholder of the add input tensor

    y : the dict of output

    relu_flag : when True, apply ReLU to the result,
                default value is False

    kernel_name : cce kernel name, default value is "ascend_dequant_s16"

    Returns
    -------
    res : the result of ascend_dequant_s16
    """

    x0_shape = x0.shape
    x0_shape_list = te.lang.cce.util.shape_to_list(x0_shape)
    align_shape = x0_shape_list.copy()

    ori_shape_deq = deq_scale.op.attrs['ori_shape']
    ori_shape_deq_list = te.lang.cce.util.shape_to_list(ori_shape_deq)
    deq_dim = function_reduce(lambda x, y: x * y, ori_shape_deq_list[:])
    tensor_flag = False
    if deq_dim > 1:
        tensor_flag = True

    c1_index = 1
    if _is_nz_format(x0):
        c1_index = len(x0_shape) - 4

    align_shape[-2] = (align_shape[-2] + 15) // 16 * 16
    res_ub = _s32_to_s16_normal_compute(x0, deq_scale, x1, align_shape,
                                        c1_index, tensor_flag, relu_flag)

    if _is_nz_format(x0):
        res = tvm.compute(align_shape,
                          lambda *i: res_ub[i],
                          name='res',
                          tag='dequant_s16_NZ')
        return res

    res_shape = te.lang.cce.util.shape_to_list(res_ub.shape)
    res_shape[-2] = x0.shape[-2]
    res = tvm.compute(res_shape,
                      lambda *indice: res_ub(*indice),
                      name='dequant_s16_remove_pad',
                      tag="dequant_s16_remove_pad")

    return res
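
The final dequant_s16_remove_pad compute narrows the row-aligned result back to the true row count; a plain-Python sketch of the same pad/remove-pad idea:

rows, cols = 23, 4
aligned_rows = (rows + 15) // 16 * 16              # 32

padded = [[0.0] * cols for _ in range(aligned_rows)]
# ... the aligned compute would fill `padded` here ...
result = padded[:rows]                             # the remove-pad step
print(len(padded), len(result))                    # prints: 32 23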
Example n. 5
def reduce_2_tuple(shape):
    return (function_reduce(operator.mul, shape), )
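
With its imports made explicit, a quick usage check of reduce_2_tuple, which collapses a shape into a one-element tuple holding the total element count:

import operator
from functools import reduce as function_reduce

def reduce_2_tuple(shape):
    return (function_reduce(operator.mul, shape), )

print(reduce_2_tuple((8, 3, 224, 224)))  # prints: (1204224,)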
Example n. 6
def ascend_requant_s16_compute(x,
                               req_scale,
                               x1,
                               y,
                               y1,
                               dual_output,
                               relu_flag,
                               kernel_name='ascend_requant_s16'):
    """
    int16 -> int8

    Parameters
    ----------
    x : the placeholder of input

    req_scale : the placeholder of req_scale

    x1 : the placeholder of x1

    y : the dict of output

    y1 : the dict of output1

    dual_output : when True, return two results,
                  default value is False

    relu_flag : when True, apply ReLU to the result,
                default value is False

    kernel_name : cce kernel name, default value is "ascend_requant_s16"

    Returns
    -------
    res : the result of ascend_requant_s16, as a list
    """
    x_shape = x.shape
    x_shape_list = te.lang.cce.util.shape_to_list(x_shape)
    align_shape = x_shape_list.copy()

    ori_shape_req = req_scale.op.attrs['ori_shape']
    ori_shape_req_list = te.lang.cce.util.shape_to_list(ori_shape_req)
    req_dim = function_reduce(lambda x, y: x * y, ori_shape_req_list[:])
    tensor_flag = False
    if req_dim > 1:
        tensor_flag = True

    c1_index = 1
    if _is_nz_format(x):
        c1_index = len(x_shape) - 4

    align_shape[c1_index] = (align_shape[c1_index] + 1) // 2 * 2
    res_s16, res_ub = _s16_to_s8_normal_compute(x, x1, req_scale, x_shape,
                                                align_shape, c1_index,
                                                tensor_flag, relu_flag)

    res = _format_transfer(align_shape, res_ub, c1_index)
    if _is_nz_format(x):
        res = tvm.compute(align_shape,
                          lambda *i: res[i],
                          name='res',
                          tag='requant_s16_NZ')

    if dual_output:
        return [res, res_s16]

    return [res]
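
The tensor_flag test above distinguishes a scalar scale from a per-channel vector by the element count of its original shape; a standalone sketch (the helper name is hypothetical):

from functools import reduce as function_reduce

def is_vector_scale(ori_shape):
    # More than one element means a per-channel requant scale.
    return function_reduce(lambda x, y: x * y, ori_shape) > 1

print(is_vector_scale([1, 1, 1, 1]))   # False -> scalar scale
print(is_vector_scale([1, 64, 1, 1]))  # True  -> per-channel vector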
Example n. 7
def ascend_quant(x,
                 y,
                 scale,
                 offset,
                 sqrt_mode=False,
                 round_mode="Round",
                 kernel_name="ascend_quant"):
    """
    float16/float32 -> int8

    Parameters
    ----------
    x : the dict of input

    y : the dict of output

    scale : the data of scale

    offset : the data of offset

    sqrt_mode : when True, apply sqrt to the result

    round_mode : the data conversion mode

    kernel_name : cce kernel name, default value is "ascend_quant"

    Returns
    -------
    None
    """
    _check_params(x, y, scale, offset, sqrt_mode, round_mode, kernel_name)
    shape = x.get("shape")
    input_dtype = x.get("dtype").lower()
    input_format = x.get("format")

    x_l1_fusion_type, y_l1_fusion_type, attr = _check_l1_fusion(x, y)

    if input_format == "NC1HWC0":
        if x_l1_fusion_type != -1:
            input_shape = shape
            attr["l1_fusion_flag"] = x_l1_fusion_type
        else:
            # change to N,C1,H*W,C0
            input_shape = (shape[0], shape[1], shape[2] * shape[3], shape[4])
    else:
        # NZ changes to (1, C1, N1*N0, C0), equivalent to (N, C1, H*W, C0)
        batch = 1
        if len(shape) > 4:
            batch = function_reduce(lambda x, y: x * y, shape[:-4])
        input_shape = (batch, shape[-4], shape[-3] * shape[-2], shape[-1])
    input_x = tvm.placeholder(input_shape,
                              name="input_x",
                              dtype=input_dtype,
                              attrs=attr)

    res = ascend_quant_compute(input_x, y, scale, offset, sqrt_mode,
                               round_mode, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [input_x, res]}

    te.lang.cce.cce_build_code(sch, config)
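
A standalone sketch of the non-NC1HWC0 branch above: all leading batch dims of an NZ shape collapse into one, producing the (N, C1, H*W, C0)-equivalent view (the shape value is hypothetical):

from functools import reduce as function_reduce

shape = (2, 3, 8, 4, 16, 16)  # hypothetical NZ shape with extra batch dims
batch = 1
if len(shape) > 4:
    batch = function_reduce(lambda x, y: x * y, shape[:-4])
input_shape = (batch, shape[-4], shape[-3] * shape[-2], shape[-1])
print(input_shape)  # prints: (6, 8, 64, 16)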
Example n. 8
def ascend_dequant(x,
                   deq_scale,
                   y,
                   sqrt_mode=False,
                   relu_mode=False,
                   kernel_name='ascend_dequant'):
    """
    int32 -> fp16

    Parameters
    ----------
    x : the dict of input

    deq_scale : the dict of dequant num

    y : the dict of output

    sqrt_mode : when True, apply sqrt to the result

    relu_mode : when True, apply ReLU to the result

    kernel_name : cce kernel name, default value is "ascend_dequant"

    Returns
    -------
    None
    """

    _check_params(x, deq_scale, kernel_name)

    shape_x = x.get("shape")
    shape_deq = deq_scale.get("shape")

    dtype_x = x.get("dtype")
    dtype_deq = deq_scale.get("dtype")
    x_format = x.get("format")
    ori_shape_deq = deq_scale.get("ori_shape")
    attr = {"ori_shape": ori_shape_deq}

    if dtype_deq == "uint64" and sqrt_mode:
        raise RuntimeError("ascend_dequant: when deq_scale dtype is uint64, "
                           "sqrt_mode only supports False")

    if x_format == "NC1HWC0":
        # n, C1, H*W, C0
        shape_x = [shape_x[0], shape_x[1], shape_x[2] * shape_x[3], shape_x[4]]
        shape_deq = [
            shape_deq[0], shape_deq[1], shape_deq[2] * shape_deq[3],
            shape_deq[4]
        ]
    else:
        # (C1, N1, N0, C0) changes to (1, C1, N1*N0, C0), equivalent to
        # (N, C1, H*W, C0)
        x_batch = 1
        if len(shape_x) > 4:
            x_batch = function_reduce(lambda x, y: x * y, shape_x[:-4])
        shape_x = [
            x_batch, shape_x[-4], shape_x[-3] * shape_x[-2], shape_x[-1]
        ]
        shape_deq = [
            shape_deq[0], shape_deq[1], shape_deq[2] * shape_deq[3],
            shape_deq[4]
        ]

    input_x = tvm.placeholder(shape_x, dtype_x, "x")
    input_deq = tvm.placeholder(shape_deq,
                                name="deq_scale",
                                dtype=dtype_deq,
                                attrs=attr)

    with tvm.target.cce():
        res = ascend_dequant_compute_v2(input_x, input_deq, y, sqrt_mode,
                                        relu_mode, kernel_name)
        sch = generic.auto_schedule(res)
        config = {
            "name": kernel_name,
            "tensor_list": [input_x, input_deq, res]
        }
        te.lang.cce.cce_build_code(sch, config)
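
In the NC1HWC0 branch above, deq_scale flattens the same way as x; with hypothetical values, a per-channel scale of shape (1, C1, 1, 1, C0) becomes a (1, C1, 1, C0) view that broadcasts against (N, C1, H*W, C0):

shape_deq = (1, 4, 1, 1, 16)  # hypothetical per-channel deq_scale shape
flat_deq = [shape_deq[0], shape_deq[1],
            shape_deq[2] * shape_deq[3], shape_deq[4]]
print(flat_deq)  # prints: [1, 4, 1, 16]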
Example n. 9
def ascend_dequant_compute_v2(x,
                              deq_scale,
                              y,
                              sqrt_mode=False,
                              relu_flag=False,
                              kernel_name='ascend_dequant'):
    """
    int32 -> fp16

    Parameters
    ----------
    x : the placeholder of input

    deq_scale : the placeholder of dequant num

    y : the dict of output

    sqrt_mode : when True, apply sqrt to the result

    relu_flag : when True, apply ReLU to the result

    kernel_name : cce kernel name, default value is "ascend_dequant"

    Returns
    -------
    res : the result of ascend_dequant
    """
    ori_shape_deq = deq_scale.op.attrs['ori_shape']
    ori_shape_deq_list = te.lang.cce.util.shape_to_list(ori_shape_deq)
    deq_dim = function_reduce(lambda x, y: x * y, ori_shape_deq_list[:])
    tensor_flag = False
    if deq_dim > 1:
        tensor_flag = True

    align_shape = te.lang.cce.util.shape_to_list(x.shape)
    align_shape[-2] = (align_shape[-2] + 15) // 16 * 16

    x_ub = tvm.compute(x.shape,
                       lambda *i: x(*i),
                       name='x_ub',
                       tag="dequant_x_ub")
    deq_ub = tvm.compute(deq_scale.shape,
                         lambda *i: deq_scale(*i),
                         name='deq_ub',
                         tag="dequant_deq_ub")
    x_l0c = tvm.compute(align_shape,
                        lambda *i: x_ub(*i),
                        name='x_l0c',
                        tag="dequant_x_l0c")

    if _is_support_v200_instruction():
        res = _dequant_v200_v2(x_l0c, deq_ub, align_shape, x.shape,
                               relu_flag, tensor_flag)
    elif tensor_flag:
        res = _vector_dequant_v100_v2(x_l0c, deq_ub, align_shape, x.shape,
                                      relu_flag, sqrt_mode)
    else:
        res = _scalar_dequant_v100_v2(x_l0c, deq_ub, align_shape, x.shape,
                                      relu_flag, sqrt_mode)
    return res
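
The path selection above is a pure function of two flags; a toy sketch of just that choice (names copied from the code, selection logic only):

def pick_dequant_path(is_v200, tensor_flag):
    # v200 hardware handles scalar and vector scales with one path;
    # v100 needs separate scalar/vector implementations.
    if is_v200:
        return "_dequant_v200_v2"
    return ("_vector_dequant_v100_v2" if tensor_flag
            else "_scalar_dequant_v100_v2")

print(pick_dequant_path(False, True))  # prints: _vector_dequant_v100_v2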
Example n. 10
def ascend_dequant_compute(x,
                           deq_scale,
                           y,
                           sqrt_mode=False,
                           relu_flag=False,
                           kernel_name='ascend_dequant'):
    """
    int32 -> fp16

    Parameters
    ----------
    x : the placeholder of input

    deq_scale : the placeholder of dequant num

    y : the dict of output

    sqrt_mode : when True, apply sqrt to the result

    relu_flag : when True, apply ReLU to the result

    kernel_name : cce kernel name, default value is "ascend_dequant"

    Returns
    -------
    res : the result of ascend_dequant
    """
    def shape_to_list(shape):
        """
        convert a TVM shape (a sequence of IntImm nodes) to a list of ints
        """
        tmp = []
        for i in shape:
            tmp.append(i.value)
        return tmp

    x_shape = x.shape
    deq_shape = deq_scale.shape
    x_shape_list = shape_to_list(x_shape)
    deq_shape_list = shape_to_list(deq_shape)
    ori_shape_deq = deq_scale.op.attrs['ori_shape']
    ori_shape_deq_list = te.lang.cce.util.shape_to_list(ori_shape_deq)
    deq_dim = function_reduce(lambda x, y: x * y, ori_shape_deq_list[:])
    tensor_flag = False
    if deq_dim > 1:
        tensor_flag = True

    align_shape = x_shape_list.copy()
    if x.op.tag != "depthwise_conv2d":
        align_shape[2] = (align_shape[2] + 15) // 16 * 16

    if x.op.tag == "matmul" or x.op.tag == "matmul_gemv":
        shape_matmul_origin = x.op.attrs['shape']
        c1_index = len(x_shape) - 4
        res = _matmul_compute(x, x_shape, deq_scale, sqrt_mode, relu_flag,
                              shape_matmul_origin, c1_index, tensor_flag)
        return res
    if x.op.tag == "depthwise_conv2d":
        align_shape[4] = 16
        align_shape[3] = (x_shape_list[3] + 15) // 16 * 16
        align_shape[2] = 1
        if deq_shape_list[1] == 1:
            tensor_dict = {}
            tensor_dict["mad_ubuf"] = x.op.input_tensors[0]
            if x.op.attrs['bias_flag'].value == 1:
                tensor_dict["flag_is_dequant_bias"] = True
                tensor_dict["mad_after_bias"] = tensor_dict[
                    "mad_ubuf"].op.input_tensors[0]
                tensor_dict["mad_bias"] = tensor_dict[
                    "mad_after_bias"].op.input_tensors[0]
                tensor_dict["mad"] = \
                tensor_dict["mad_after_bias"].op.input_tensors[1]
                tensor_dict["mad_bias_ub_brc"] = tensor_dict[
                    "mad_bias"].op.input_tensors[0]
                tensor_dict["bias_gm"] = tensor_dict[
                    "mad_bias_ub_brc"].op.input_tensors[0]
            else:
                tensor_dict["mad"] = \
                    tensor_dict["mad_ubuf"].op.input_tensors[0]
            tensor_dict["im2col_fractal"] = \
                tensor_dict["mad"].op.input_tensors[0]
            tensor_dict["filter_reshape"] = \
                tensor_dict["mad"].op.input_tensors[1]
            tensor_dict["filter_buf"] = \
            tensor_dict["filter_reshape"].op.input_tensors[
                0]
            tensor_dict["im2col_row_major"] = tensor_dict[
                "im2col_fractal"].op.input_tensors[0]
            tensor_dict["fmap"] = \
            tensor_dict["im2col_row_major"].op.input_tensors[0]
            x_ori_shape = tensor_dict["fmap"].op.attrs["ori_shape"]
            x_ori_shape_list = te.lang.cce.util.shape_to_list(x_ori_shape)
            align_shape[1] = (x_ori_shape_list[3] + 15) // 16
        else:
            align_shape[1] = (deq_shape_list[1] * deq_shape_list[4]) // 16
        align_shape[0] = x_shape_list[0]

        if tensor_flag:
            if _is_support_v200_instruction():
                res = _vector_depthwise_fused_v200(x, x_shape, align_shape,
                                                   deq_scale, relu_flag)
            else:
                res = _vector_depthwise_fused_v100(x, x_shape, align_shape,
                                                   deq_scale, relu_flag,
                                                   sqrt_mode)
        else:
            if _is_support_v200_instruction():
                res = _scalar_depthwise_fused_v200(x, x_shape, align_shape,
                                                   deq_scale, relu_flag)
            else:
                res = _scalar_depthwise_fused_v100(x, x_shape, align_shape,
                                                   deq_scale, relu_flag,
                                                   sqrt_mode)

        return res

    if tensor_flag:
        if _is_support_v200_instruction():
            res = _vector_dequant_v200(x, x_shape, align_shape, deq_scale,
                                       relu_flag)
        else:
            res = _vector_dequant_v100(x, x_shape, align_shape, deq_scale,
                                       relu_flag, sqrt_mode)
    else:
        if _is_support_v200_instruction():
            res = _scalar_dequant_v200(x, x_shape, align_shape, deq_scale)
        else:
            res = _scalar_dequant_v100(x, x_shape, align_shape, deq_scale,
                                       relu_flag, sqrt_mode)

    return res
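
The nested shape_to_list exists because TVM shape dims are IntImm expression nodes rather than plain ints; a toy stand-in for illustration (this IntImm is a mock class, not the real TVM node):

class IntImm:
    def __init__(self, value):
        self.value = value

def shape_to_list(shape):
    return [i.value for i in shape]

print(shape_to_list([IntImm(2), IntImm(4), IntImm(56), IntImm(56), IntImm(16)]))
# prints: [2, 4, 56, 56, 16]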