Example 1
def zn_2_hwcn(src, dst, src_format, dst_format, kernel_name='zn_2_hwcn'):
    """
    algorithm: zn_2_hwcn
    calculating: change data format from Zn to HWCN

    Parameters
    ----------
    src: dict
        contains shape and dtype information of input tensor
    dst: dict
        contains shape and dtype information of output tensor
    src_format: str
        the format of the input tensor; only "Zn" is supported
    dst_format: str
        the format of the output tensor; only "HWCN" is supported
    kernel_name: str
        cce kernel name, default value is "zn_2_hwcn"

    Returns
    -------
    None
    """
    _check_parameters(src, dst, src_format, dst_format, kernel_name)
    dst_shape = dst.get("shape")
    dtype = src.get("dtype")

    h_i, w_i, c_i, n_i = dst_shape
    c_0 = 16
    if dtype == "int8":
        c_0 = 32
    c_1 = _ceil_div(c_i, c_0)
    n_ni = 16
    n_no = _ceil_div(n_i, n_ni)
    shape_zn = [c_1*h_i*w_i, n_no, n_ni, c_0]

    branch = _get_ir_branch(shape_zn, dtype)
    data = tvm.placeholder(shape_zn, dtype=dtype, name="data")
    if branch == "more_row":
        res = tvm.extern(dst_shape, [data],
                         lambda ins, outs: _more_row_ir(outs[0], ins[0], c_0),
                         name="res", dtype=dtype)
    else:
        res = tvm.extern(dst_shape, [data],
                         lambda ins, outs: _split_row_ir(outs[0], ins[0]),
                         name="res", dtype=dtype)

    tensor_list = [data, res]
    sch = tvm.create_schedule(res.op)
    with build_config:
        tvm.build(sch, tensor_list, "cce", name=kernel_name)
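
A quick, dependency-free sketch of the shape arithmetic above: given a hypothetical HWCN destination shape (values assumed for illustration), it reproduces the Zn shape the kernel expects.

from math import ceil

h_i, w_i, c_i, n_i = 2, 2, 20, 17        # assumed HWCN dst_shape
dtype = "float16"
c_0 = 32 if dtype == "int8" else 16
c_1 = ceil(c_i / c_0)                    # 2
n_no = ceil(n_i / 16)                    # 2
shape_zn = [c_1 * h_i * w_i, n_no, 16, c_0]
print(shape_zn)                          # [8, 2, 16, 16]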
Example 2
def depthwise_weight_6d_2_4d(x,
                             y,
                             src_format,
                             dst_format,
                             kernel_name="depthwise_weight_6d_2_4d"):
    """Operation and Schedule for depthwise_weight_6d_2_4d.

    Parameters
    ----------
    x: shape and dtype of input; the supported dtypes are float16, float32,
    int32 and uint16.

    y: shape and dtype of the output; the dtype is the same as the input.

    src_format: the source data_format

    dst_format: the target data_format

    kernel_name : cce kernel name, default value is "depthwise_weight_6d_2_4d"

    Returns
    -------
        convert C1HWNCoC0 to HWCN
    """
    _check_parameters(x, y, src_format, dst_format)
    output_shape = y.get("shape")
    channel_size = output_shape[2]
    input_shape = x.get("shape")
    dtype = x.get("dtype")
    channel_4d = channel_size
    op_utils.check_shape(input_shape, param_name="x")

    check_list = ("float16", "float32", "int32", "uint16")
    dtype = dtype.lower()
    op_utils.check_dtype(dtype, check_list, param_name="x")

    input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype)

    six2four = _Six2FourParam(input_shape, channel_4d)

    res = tvm.extern(
        [six2four.get_out_shape()], [input_data],
        lambda ins, outs: _intrin_factor(six2four, dtype, ins, outs),
        name="res",
        dtype=dtype)

    sch = tvm.create_schedule(res.op)
    build_list = [input_data, res]

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)
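
For orientation, a minimal sketch (assumed values) of how the HWCN output shape relates to a C1HWNCoC0 input; the real channel count is read from output_shape[2] and presumably must fit inside C1 * C0.

c1, h, w, n, co, c0 = 2, 3, 3, 8, 16, 16   # hypothetical C1HWNCoC0 weight shape
channel_4d = 20                            # real channel count C, assumed <= c1 * c0
assert channel_4d <= c1 * c0
print((h, w, channel_4d, n))               # (3, 3, 20, 8) -> HWCN output shape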
Example 3
def depthwise_weight_4d_2_6d(x,
                             y,
                             src_format,
                             dst_format,
                             kernel_name="depthwise_weight_4d_2_6d"):
    """Operation and Schedule for depthwise_weight_4d_2_6d.

    Parameters
    ----------
    x: shape and dtype of input; the supported dtypes are float16,
    float32, int32 and uint16.

    y: shape and dtype of the output; the dtype is the same as the input.

    src_format: the source data_format

    dst_format: the target data_format

    kernel_name : cce kernel name, default value is "depthwise_weight_4d_2_6d"

    Returns
    -------
        convert HWCN to C1HWNCoC0
    """
    if src_format.lower() != "hwcn":
        raise RuntimeError("dst_format must be HWCN!")

    if dst_format.lower() != "c1hwncoc0":
        raise RuntimeError("src_format must be C1HWNCoC0 !")

    input_shape = x.get("shape")
    dtype = x.get("dtype")
    op_utils.check_shape(input_shape, param_name="x")
    check_list = ("float16", "float32", "int32", "uint16")
    dtype = dtype.lower()
    op_utils.check_dtype(dtype, check_list, param_name="x")

    input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype)
    four2six = _Four2SixParam(input_shape)

    res = tvm.extern(
        [four2six.get_out_shape()], [input_data],
        lambda ins, outs: _intrin_factor(four2six, dtype, ins, outs),
        name="res",
        dtype=dtype)

    sch = tvm.create_schedule(res.op)
    build_list = [input_data, res]

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)
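
The inverse direction: a sketch (assuming the usual Ascend cube size C0 = 16 and Co == C0 for depthwise weights) of the 6D shape typically produced from an HWCN shape.

from math import ceil

h, w, c, n = 3, 3, 20, 8                   # hypothetical HWCN weight shape
c0 = 16
c1 = ceil(c / c0)                          # 2
print((c1, h, w, n, c0, c0))               # (2, 3, 3, 8, 16, 16) -> C1HWNCoC0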
Example 4
def histogram_fixed_width_d_compute(x,
                                    range,
                                    y,
                                    nbins,
                                    kernel_name="histogram_fixed_width_d"):
    """TVM calculation process, used for fusion operation
    compute for histogram_fixed_width

    Parameters
    ----------
    x: TVM tensor
        the placeholders of x
    range: TVM tensor
        the placeholders of range
    y: dict
        dict info of output, not used
    nbins: int
        number of histogram bins.
    kernel_name: str
        cce kernel name, not used

    Returns
    -------
    res: TVM tensor
        the result histogram_fixed_width
    """
    dtype = "int32"
    input_values_shape = te.lang.cce.util.shape_to_list(x.shape)
    value_range_shape = te.lang.cce.util.shape_to_list(range.shape)
    res = tvm.extern(
        [nbins], [x, range],
        lambda ins, outs: _histogram_fixed_width_ir(
            outs, ins, nbins, [input_values_shape, value_range_shape]),
        name="res",
        dtype=dtype)
    return res
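
The IR builder _histogram_fixed_width_ir is defined elsewhere, so as a reference only, a NumPy sketch of the fixed-width histogram semantics the docstring describes (assuming tf.histogram_fixed_width-style clamping of out-of-range values into the first/last bin; sample inputs assumed).

import numpy as np

values = np.array([-1.0, 0.0, 1.5, 2.0, 5.0, 15.0], dtype=np.float32)
value_range = np.array([0.0, 10.0], dtype=np.float32)   # [min, max]
nbins = 5
clipped = np.clip(values, value_range[0], value_range[1])
bins = (clipped - value_range[0]) / (value_range[1] - value_range[0]) * nbins
bins = np.minimum(bins.astype(np.int32), nbins - 1)
print(np.bincount(bins, minlength=nbins).astype(np.int32))   # [3 1 1 0 1]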
Example 5
def custom_Exp(shape,
               dtype,
               gamma,
               alpha,
               beta,
               kernel_name="cce_exp",
               need_build=False,
               need_print=False):
    """
    calculate gamma ** (alpha * data + beta),
    implemented as exp(log(gamma) * alpha * data) * (gamma ** beta)

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support \
    float16, float32

    gamma : the data type must be the same as the dtype parameter
        base in gamma ** (alpha * data + beta)

    alpha : the data type must be the same as the dtype parameter
        scale in gamma ** (alpha * data + beta)

    beta : the data type must be the same as the dtype parameter
        shift in gamma ** (alpha * data + beta)

    kernel_name : cce kernel name, default value is "cce_exp"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not dtype.lower() in supported_dtypes:
        raise RuntimeError(
            "caffe_exp_layer_cce only support %s while dtype is %s" %
            (",".join(supported_dtypes), dtype))

    if gamma != -1 and gamma <= 0:
        # api  cc_device_exp_c handle gamma == -1 as e
        raise ValueError(
            "please ensure gamma is greater than 0, where gamma = %s" %
            str(gamma))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([gamma], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    # scale --> alpha, shift --> beta, base --> gamma
    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input, output], "cce", name=kernel_name)
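
A small NumPy check of the identity used in the docstring (sample values are assumed).

import numpy as np

x = np.array([-0.5, 0.0, 1.25], dtype=np.float32)
gamma, alpha, beta = 2.0, 0.7, 0.3
direct = gamma ** (alpha * x + beta)
rewritten = np.exp(np.log(gamma) * alpha * x) * gamma ** beta
print(np.allclose(direct, rewritten))   # True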
Example 6
def custom_truncatemod(shape1, shape2, dtype, kernel_name="cce_tf_truncatemod",
                       need_build=False, need_print=False):
    """
    do element-wise truncatemod operation between two input tensors

    Parameters:
    ----------
    shape1 : shape of input data1

    shape2 : shape of input data2

    dtype : source data type, support float16,float32,int32

    kernel_name : cce kernel name, default value is "cce_tf_truncatemod"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    max_dim = 8
    shape1_len = len(shape1)
    shape2_len = len(shape2)
    if shape1_len > max_dim or shape2_len > max_dim:
        raise RuntimeError(
            "mod_cce only supports up to %d dimensions, but the shapes have "
            "%d and %d dimensions" % (max_dim, shape1_len, shape2_len))
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape1)
    util.check_shape_rule(shape2)

    util.check_shape_size(shape1, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape2, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    device_api_map = {"float16": "cc_device_truncatemod_float16",
                      "float32": "cc_device_truncatemod_float",
                      "int32": "cc_device_truncatemod_int32"}

    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_truncatemod_cce only support %s while dtype is %s" % (
                ",".join(check_list), dtype))

    shape1, shape2, shape_out = util.produce_shapes(shape1, shape2)
    util.check_shape_size(shape_out, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    device_api = device_api_map[inp_dtype]

    # block
    block_num = "block_num"
    block_idx = "block_idx"
    # x param
    v_xndim_cnt = tvm.const(len(shape1), "int32")
    p_xshape = util.create_param_ptr(shape1, "int32", "p_xshape")
    xpad_c0 = tvm.const(0, "int32")
    data_input_x = tvm.placeholder(shape1, name="data_input_x",
                                   dtype=inp_dtype)
    # y param
    v_yndim_cnt = tvm.const(len(shape2), "int32")
    p_yshape = util.create_param_ptr(shape2, "int32", "p_yshape")
    ypad_c0 = tvm.const(0, "int32")
    data_input_y = tvm.placeholder(shape2, name="data_input_y",
                                   dtype=inp_dtype)
    # output
    v_out_ndim_cnt = tvm.const(len(shape_out), "int32")
    p_out_shape = util.create_param_ptr(shape_out, "int32", "p_yshape")
    out_padc0 = tvm.const(0, "int32")

    output = tvm.extern(shape_out,
                        [p_xshape, data_input_x, p_yshape, data_input_y,
                         p_out_shape], lambda ins, outs:
                        tvm.call_extern("int32_t", device_api,
                                        block_num,
                                        block_idx,
                                        v_xndim_cnt,
                                        ins[0].access_ptr("r"),  # shape x
                                        xpad_c0,
                                        ins[1].access_ptr("r"),  # input x
                                        v_yndim_cnt,
                                        ins[2].access_ptr("r"),  # shape y
                                        ypad_c0,
                                        ins[3].access_ptr("r"),  # input y
                                        v_out_ndim_cnt,
                                        ins[4].access_ptr("r"),  # shape out
                                        out_padc0,
                                        outs[0].access_ptr("w")),
                        name="output", dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    # print IR
    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input_x, data_input_y, output],
                            simple_mode=True))
    # compile to generate the cce file
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input_x, data_input_y, output], "cce",
                      name=kernel_name)
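
For reference, truncated modulo keeps the sign of the dividend; it matches NumPy's fmod rather than Python's floored %. A quick sketch with assumed values:

import numpy as np

x = np.array([7.0, -7.0, 7.0, -7.0], dtype=np.float32)
y = np.array([3.0, 3.0, -3.0, -3.0], dtype=np.float32)
print(np.fmod(x, y))   # [ 1. -1.  1. -1.]  truncated modulo
print(x % y)           # [ 1.  2. -2. -1.]  floored modulo, for contrast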
Example 7
def cast(input_x, output_y, dst_type, kernel_name="cast"):
    """
    cast a tensor/scalar with the input shape from the src data type to the dst
    data type. Restrictions of the input algorithms are as follows;
    only the type pairs below are supported for tensor processing:
        float16->float32
        float16->int32
        float32->float16
        float32->int32
        int8->float32
        uint8->float32
        int8->float16
        uint8->float16
        int8->int32
        uint8->int32
        int32->uint8 // numbers outside [0,255] can give unexpected results
        int32->int8 // numbers outside [-128,127] can give unexpected results
        int32->float32 // for the fp16-based path, only numbers in
                        [-1023,1023] are guaranteed to be correct
        int32->float16 // only numbers in
                        [-1023,1023] are guaranteed to be correct
    scalar conversion support (only shape [1,] is supported):
        int64->int32
        int64->float32

    Parameters
    ----------
    input_x : dict
        shape and dtype of the input
    output_y: dict
        shape and dtype of the output; should have the same shape as the input,
        and the dtype is the dst dtype to cast to
    dst_type:
        the destination data type to cast to
    kernel_name : str
        cce kernel name, default value is "cast"

    Returns
    -------
    None
    """
    shape = util.scalar2tensor_one(input_x.get("shape"))
    src_type = input_x.get("dtype").lower()
    check_shape(shape, param_name="input_x")

    if src_type == "bool":
        src_type = "int8"

    dst_type = _cast_dsttype_conversion(dst_type)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=src_type)
    if src_type == "int64":
        check_dtype(dst_type, ("float32", "int32"), param_name="dst_type")
        res = tvm.extern(
            [fuseshape], [data],
            lambda ins, outs: _kernel_ir(outs, ins, dst_type, "int64"),
            name="res",
            dtype=dst_type)
        tensor_list = [data, res]
        schedule = tvm.create_schedule(res.op)
        with build_config:
            tvm.build(schedule, tensor_list, "cce", name=kernel_name)
    else:
        with tvm.target.cce():
            res = cast_compute(data, output_y, dst_type, kernel_name)
            sch = generic.auto_schedule(res)
        config = {
            "print_ir": False,
            "name": kernel_name,
            "tensor_list": [data, res]
        }
        te.lang.cce.cce_build_code(sch, config)
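
The flattening done via fuseshape is ordinary product-of-dimensions folding; a plain-Python sketch (assuming reduceIns behaves like functools.reduce, as the usage suggests).

from functools import reduce

shape = (2, 3, 4)                        # assumed input shape
fuseshape = [reduce(lambda x, y: x * y, shape)]
print(fuseshape)                         # [24] -> the 1-D shape the kernel works on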
Example 8
def custom_round(shape,
                 dtype,
                 kernel_name="cce_round",
                 need_build=False,
                 need_print=False):
    """
    doing round operations; the calculating data type is float16, float32 or int32
    
    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype

    kernel_name : cce kernel name, default value is "cce_round"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
        
    """
    check_list = ["float16", "float32", "int32"]
    device_api_map = {
        "float16": "cc_device_round_float16",
        "float32": "cc_device_round_float",
        "int32": "cc_device_round_int32"
    }

    max_dim = 8
    shape_len = len(shape)
    if shape_len > max_dim:
        raise RuntimeError(
            "round_cce only support up to %d dimensions while the shape's dimension is %d"
            % (max_dim, shape_len))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in check_list):
        raise RuntimeError("round_cce only support %s while dtype is %s" %
                           (",".join(check_list), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    device_api = device_api_map[inp_dtype]

    block_num = "block_num"
    block_idx = "block_idx"
    v_ndim = tvm.const(len(shape), "int32")
    padC0 = tvm.const(0, "int32")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape,
        [data_input, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_ndim,
            ins[1].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
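
A minimal call sketch (assumed shape and dtype; it requires the TE/CCE toolchain plus the util helpers and SHAPE_SIZE_LIMIT imported by this module).

custom_round((16, 16), "float16",
             kernel_name="cce_round_demo",   # hypothetical kernel name
             need_print=False,
             need_build=True)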
Example 9
def drop_out_do_mask(input_tensor,
                     input_mask,
                     input_keep_prob,
                     output,
                     kernel_name="dropout_do_mask"):
    """
    algorithm: tf_dropout_do_mask
    scale_x = x*(1 / keep_prob)
    res = select(mask == 1, scale_x, 0)

    Parameters
    ----------
    input_tensor : dict, shape and dtype of input_tensor, only float16 and float32 are supported
    input_mask : dict,shape and dtype of input_mask
        shape of mask,1D, dtype == uint8
        length=(size(shape_tensor)+ELEMS_BATCH_PROCESS_FP16
        -1)/ELEMS_BATCH_PROCESS_FP16*ELEMS_BATCH_PROCESS_FP16/8
        eg. shape_tensor=[2,5,8] shape_mask=[16] shape_res=[2,5,8]
        shape_tensor=[15,17,19] shape_mask=[608] shape_res=[15,17,19]
    input_keep_prob : dict, shape and dtype of input_keep_prob
        shape of keep_prob, a single element and equal to (1, )
        prob scale in (0.0, 1.0]; NOTICE: dtype is the same as input_tensor
    output : dict,shape and dtype of output
    kernel_name : str
        cce kernel name, default value is "dropout_do_mask"

    Returns
    -------
    None
    """
    shape_tensor = input_tensor.get("shape")
    shape_mask = input_mask.get("shape")
    shape_keep_prob = input_keep_prob.get("shape")
    dtype = input_tensor.get("dtype")
    if shape_keep_prob == 1:
        shape_keep_prob = (shape_keep_prob, )
    check_shape(shape_tensor, param_name="input_tensor")
    check_dtype(dtype.lower(), ["float16", "float32"],
                param_name="input_tensor")
    if len(shape_mask) != 1:
        raise RuntimeError("The length of mask shape must be 1")
    if shape_keep_prob not in [(1, ), [1, ]]:
        raise RuntimeError("Only support shape (1, ) or [1, ]")
    # functools_reduce: product of all dimension
    # Align to ELEMS_BATCH_PROCESS_FP16
    product_mask = (functools_reduce(lambda x, y: x*y, shape_tensor[:]) +
                    ELEMS_BATCH_PROCESS_FP16 - 1) // \
                   ELEMS_BATCH_PROCESS_FP16 * ELEMS_BATCH_PROCESS_FP16 // 8
    if product_mask != shape_mask[0]:
        raise RuntimeError("shape_mask[0] should be %d, but is %d" %
                           (product_mask, shape_mask[0]))
    data_tensor = tvm.placeholder(
        (functools_reduce(lambda x, y: x * y, shape_tensor), ),
        dtype=dtype,
        name="data_tensor")
    data_mask = tvm.placeholder(
        (functools_reduce(lambda x, y: x * y, shape_mask), ),
        dtype='uint8',
        name="data_mask")
    keep_prob_tensor = tvm.placeholder(shape_keep_prob,
                                       dtype=dtype,
                                       name="keep_prob_tensor")
    const_1 = tvm.const(1.0, dtype=dtype)

    res = tvm.extern([shape_tensor, shape_mask, shape_keep_prob],
                     [data_tensor, data_mask, keep_prob_tensor],
                     lambda ins, outs: _kernel_ir(outs, ins, const_1),
                     name="res",
                     dtype=dtype)

    tensor_list = [data_tensor, data_mask, keep_prob_tensor, res]
    schedule = tvm.create_schedule(res.op)

    with build_config:
        tvm.build(schedule, tensor_list, "cce", name=kernel_name)
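
The mask length rule in the docstring can be checked in plain Python; ELEMS_BATCH_PROCESS_FP16 = 128 is an assumption here (128 fp16 elements per batch, 8 mask bits per byte), but it reproduces both docstring examples.

from functools import reduce

ELEMS_BATCH_PROCESS_FP16 = 128   # assumed value

def mask_len(shape_tensor):
    n = reduce(lambda x, y: x * y, shape_tensor)
    n_aligned = (n + ELEMS_BATCH_PROCESS_FP16 - 1) // ELEMS_BATCH_PROCESS_FP16 \
                * ELEMS_BATCH_PROCESS_FP16
    return n_aligned // 8

print(mask_len([2, 5, 8]))       # 16, matches the docstring example
print(mask_len([15, 17, 19]))    # 608, matches the docstring example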
Example 10
def custom_pow(shape,
               shape_y,
               dtype,
               kernel_name="cce_tf_pow",
               need_build=False,
               need_print=False):
    """
    calculate x^y; the calculating data type is float16, float32 or int32.
    When x < 0, the output is a meaningless value.
    Parameters
    ----------
    shape : shape of the first input data

    shape_y : shape of the second input data

    dtype : the data type, assume src_dtype equals dst_dtype, only support
    float16, float32, int32

    kernel_name : cce kernel name, default value is "tf_pow_cce"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    supported_dtypes = ["float16", "float32", "int32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not dtype.lower() in supported_dtypes:
        raise RuntimeError("tf_pow_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_lhs = tvm.placeholder(shape, name="data_lhs", dtype=inp_dtype)
    data_rhs = tvm.placeholder(shape, name="data_rhs", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([0], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([0], inp_dtype, "p_power")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape,
        [data_lhs, data_rhs, p_scale, p_shift, p_power, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[2].access_ptr("r"),  # scale
            ins[3].access_ptr("r"),  # shift
            ins[4].access_ptr("r"),  # power
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            v_ndim,
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[1].access_ptr("r"),  # input y
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(
                tvm.lower(schedule, [data_lhs, data_rhs, output],
                          simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_lhs, data_rhs, output],
                      "cce",
                      name=kernel_name)
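
The "meaningless value when x < 0" note suggests the device API evaluates x ** y as exp(y * log(x)); a NumPy sketch of that identity (an assumption here, with sample values).

import numpy as np

x = np.array([0.5, 2.0, 3.0], dtype=np.float32)
y = np.array([2.0, 0.5, -1.0], dtype=np.float32)
print(np.allclose(x ** y, np.exp(y * np.log(x))))   # True for x > 0
print(np.exp(2.0 * np.log(np.float32(-1.0))))       # nan: a negative base breaks the rewrite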
Example 11
def custom_expm1(shape,
                 dtype,
                 kernel_name="cce_tf_expm1",
                 need_build=False,
                 need_print=False):
    """
    algorithm: expm1

    calculating data's expm1, y = (e ** x) - 1, dtype is float16 or float32.

    Parameters
    ----------
    shape : shape of data.

    dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32.

    kernel_name : cce kernel name, default value is "cce_tf_expm1".

    need_build : if need to build CCEC kernel, default value is False.

    need_print : if need to print the ir, default value is False.

    Returns
    -------
    None

    """

    # [aicpu] int32_t cc_device_exp(uint32_t blockNum, uint32_t blockIdx, int32_t dataType, const void *scale, const void *shift,
    # const void *base, int32_t dimCnt, int32_t *shape, uint32_t padC0, const void *x, void *y);

    supported_dtypes = ["float16", "float32"]

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError("tf_expm1_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    # step 1. calculate y = e ** x by the aicpu api
    device_api = "DeviceExp"
    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0
    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output_exp = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output_exp",
        dtype=inp_dtype)

    offset = tvm.const((-1), dtype=inp_dtype)

    # step 2. calculate y = (e ** x) - 1 by tvm
    output = tvm.compute(
        shape,
        lambda *indice: output_exp(*indice) + offset.astype(inp_dtype),
        name="output")

    # step 3. schedule the computation by tvm
    s = tvm.create_schedule(output.op)

    # step 4. build by tvm
    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
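
A plain-NumPy mirror of the two steps above (assumed sample values); note that a dedicated expm1 is more accurate for tiny x than exp(x) - 1 in fp32.

import numpy as np

x = np.array([-1.0, 1e-4, 2.0], dtype=np.float32)
step1 = np.exp(x)               # what the DeviceExp extern call produces
step2 = step1 - np.float32(1)   # what the tvm.compute stage adds
print(step2)
print(np.expm1(x))              # reference; differs slightly for tiny x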
Example 12
def custom_Power(shape,
                 dtype,
                 gamma,
                 alpha,
                 beta,
                 kernel_name="cce_caffe_power",
                 need_build=False,
                 need_print=False):
    """
    calculate (alpha * data + beta) ** gamma, computed as
    exp(gamma * log(alpha * data + beta)).
    When alpha * data + beta < 0, the output is a meaningless value.
    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32

    gamma : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    alpha : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    beta : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    kernel_name : string
        kernel name of the generated CCE kernel, default value is "cce_caffe_power"


    need_build : bool
        if need to build CCEC kernel

    need_print : bool
        if need to print Halide IR

    Returns
    -------
    None
        
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError("power_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim_x = len(shape)
    v_ndim_y = 0
    p_shape_y = 0
    p_input_y = "nullptr"
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0

    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([gamma], inp_dtype, "p_power")
    p_shape_x = util.create_param_ptr(shape, "int32", "p_shape_x")

    # scale --> alpha, shift --> beta, power --> gamma
    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_power, p_shape_x],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # power
            v_ndim_x,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            v_ndim_y,
            v_ndim_y,
            p_shape_y,
            padC0,
            p_input_y,
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
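
A quick NumPy check of the rewrite stated in the docstring, valid when alpha * data + beta > 0 (sample values assumed).

import numpy as np

x = np.array([0.2, 1.0, 3.0], dtype=np.float32)
alpha, beta, gamma = 0.5, 1.0, 2.5
direct = (alpha * x + beta) ** gamma
rewritten = np.exp(gamma * np.log(alpha * x + beta))
print(np.allclose(direct, rewritten))   # True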
Example 13
def custom_exp(shape,
               dtype,
               kernel_name="cce_tf_exp",
               need_build=False,
               need_print=False):
    """
    algorithm: exp

    calculating data's exp, y = e ** x, dtype is float16 or float32.
    
    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32

    kernel_name : cce kernel name, default value is "cce_tf_exp"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError("tf_exp_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0
    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
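
A minimal call sketch (assumed shape and dtype; requires the TE/CCE toolchain and the util helpers imported by this module).

custom_exp((1024,), "float16",
           kernel_name="cce_tf_exp_demo",   # hypothetical kernel name
           need_print=True,                 # dump the lowered IR
           need_build=False)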
Example 14
def _tranpose_notchange_last_two(data, shape_5hd, dst_shape_full, dst_shape,
                                 perm, dtype, max_dim, shape_all):
    """
    permutes the dimensions and the last axis is not transposed

    Parameters
    ----------
    data: tvm.tensor
        tensor of input data
    shape_5hd: list or tuple
        shape of the intermediate 5HD (NC1HWC0) tensor
    dst_shape_full, dst_shape: list or tuple
        shapes of the output tensor
    perm: list or tuple
        permutation of the dimensions of the tensor
    dtype: str
        the data type
    max_dim, shape_all:
        sizes forwarded to the IR builders

    Returns
    -------
    sch: tvm.schedule
        the compute schedule
    tensor_list: list
        list of tensor
    """
    def _permute(*index):
        """
        function of permute the dimensions of data

        """
        for i, item in enumerate(_perm_to_flag(perm)):
            if i == 0:
                res_axis = (index[item], )
            else:
                res_axis = res_axis + (index[item], )

        return res_axis

    # c1hwnc0 to nc1hwc0
    data_ub = tvm.compute(shape_5hd,
                          lambda *index: data(*_permute(*index)),
                          name="data_ub")
    res_5hd = tvm.compute(shape_5hd,
                          lambda *index: data_ub(*index),
                          name="res_5hd")

    # nc1hwc0 to nchw
    if dtype == "float32":
        branch = _get_ir_branch(shape_5hd, dtype, shape_all)
        if branch == "more_dim":
            res = tvm.extern(dst_shape_full, [res_5hd],
                             lambda ins, outs: _more_dim_ir(
                                 outs[0], ins[0], max_dim, shape_all),
                             name="res",
                             dtype=dtype)
        elif branch == "one_dim":
            res = tvm.extern(dst_shape_full, [res_5hd],
                             lambda ins, outs: _one_dim_ir(
                                 outs[0], ins[0], max_dim, shape_all),
                             name="res",
                             dtype=dtype)
        else:
            res = tvm.extern(dst_shape_full, [res_5hd],
                             lambda ins, outs: _split_dim_ir(
                                 outs[0], ins[0], max_dim, shape_all),
                             name="res",
                             dtype=dtype)
    else:
        branch_fp16 = _get_ir_branch_fp16(dst_shape_full, dtype, shape_all)
        if branch_fp16 == "more_dim_fp16":
            res = tvm.extern(dst_shape_full, [res_5hd],
                             lambda ins, outs: _more_dim_ir_fp16(
                                 outs[0], ins[0], max_dim, shape_all),
                             name="res",
                             dtype=dtype)
        else:
            res = tvm.extern(dst_shape_full, [res_5hd],
                             lambda ins, outs: _split_dim_ir_fp16(
                                 outs[0], ins[0], max_dim, shape_all),
                             name="res",
                             dtype=dtype)

    res_end = tvm.extern(dst_shape, [res],
                         lambda ins, outs: _temp_ir(outs[0], ins[0]),
                         name="res_end",
                         dtype=dtype)

    sch = tvm.create_schedule(res_end.op)
    args = [sch, data, res_5hd, data_ub, shape_5hd, dtype]
    sch, _ = _schedule_for_not_change_last(args)
    tensor_list = [data, res_end, res_5hd, res]

    return sch, tensor_list
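
The _permute closure just reorders an index tuple according to the flag returned by _perm_to_flag (not shown); a plain-Python sketch with a hypothetical flag.

def permute_index(index, perm_flag):
    # reorder an index tuple the same way the _permute closure does
    return tuple(index[item] for item in perm_flag)

print(permute_index((7, 3, 5, 2, 0), (1, 2, 3, 0, 4)))   # (3, 5, 2, 7, 0)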