Example #1
def custom_Exp(shape,
               dtype,
               gamma,
               alpha,
               beta,
               kernel_name="cce_exp",
               need_build=False,
               need_print=False):
    """
    calculate gamma ** (alpha * data + beta), computed as
    exp(log(gamma) * alpha * data) * (gamma ** beta)

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support
    float16, float32

    gamma : the data type must be the same as the dtype parameter
        base in gamma ** (alpha * data + beta)

    alpha : the data type must be the same as the dtype parameter
        scale in gamma ** (alpha * data + beta)

    beta : the data type must be the same as the dtype parameter
        shift in gamma ** (alpha * data + beta)

    kernel_name : cce kernel name, default value is "cce_exp"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if dtype.lower() not in supported_dtypes:
        raise RuntimeError(
            "caffe_exp_layer_cce only support %s while dtype is %s" %
            (",".join(supported_dtypes), dtype))

    if gamma != -1 and gamma <= 0:
        # the cc_device_exp_c API treats gamma == -1 as e
        raise ValueError(
            "please ensure gamma is greater than 0, where gamma = %s" %
            str(gamma))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([gamma], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    # scale --> alpha, shift --> beta, base --> gamma
    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input, output], "cce", name=kernel_name)
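
The base-change identity in the docstring can be sanity-checked outside the kernel. A minimal NumPy sketch, purely for illustration (NumPy is not used by the kernel itself):

import numpy as np

# check: gamma ** (alpha * x + beta) == exp(log(gamma) * alpha * x) * gamma ** beta
x = np.linspace(-2.0, 2.0, 5, dtype=np.float32)
gamma, alpha, beta = 2.0, 3.0, 0.5
lhs = gamma ** (alpha * x + beta)
rhs = np.exp(np.log(gamma) * alpha * x) * gamma ** beta
assert np.allclose(lhs, rhs)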
Example #2
def custom_pow(shape,
               shape_y,
               dtype,
               kernel_name="cce_tf_pow",
               need_build=False,
               need_print=False):
    """
    calculate x ** y; the calculation data type is float16, float32, or int32.
    when x < 0, the output is a meaningless value.
    Parameters
    ----------
    shape : shape of the base input data

    shape_y : shape of the exponent input data; this implementation builds
    both placeholders from shape, so the two shapes are expected to match

    dtype : the data type, assume src_dtype equals dst_dtype, only support
    float16, float32, int32

    kernel_name : cce kernel name, default value is "cce_tf_pow"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    supported_dtypes = ["float16", "float32", "int32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if dtype.lower() not in supported_dtypes:
        raise RuntimeError("tf_pow_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_lhs = tvm.placeholder(shape, name="data_lhs", dtype=inp_dtype)
    data_rhs = tvm.placeholder(shape, name="data_rhs", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([0], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([0], inp_dtype, "p_power")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape,
        [data_lhs, data_rhs, p_scale, p_shift, p_power, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[2].access_ptr("r"),  # scale
            ins[3].access_ptr("r"),  # shift
            ins[4].access_ptr("r"),  # power
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            v_ndim,
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[1].access_ptr("r"),  # input y
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(
                tvm.lower(schedule, [data_lhs, data_rhs, output],
                          simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_lhs, data_rhs, output],
                      "cce",
                      name=kernel_name)
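
For reference, the element-wise semantics, including the meaningless-value case noted in the docstring, can be illustrated with a standalone NumPy sketch (this is not part of the kernel):

import numpy as np

x = np.array([2.0, 4.0, -1.0], dtype=np.float32)
y = np.array([3.0, 0.5, 0.5], dtype=np.float32)
# a negative base with a fractional exponent has no real result, hence "meaningless value"
print(np.power(x, y))  # -> [ 8.  2. nan]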
Example #3
def custom_Power(shape,
                 dtype,
                 gamma,
                 alpha,
                 beta,
                 kernel_name="cce_caffe_power",
                 need_build=False,
                 need_print=False):
    """
    calculate (alpha * data + beta) ** gamma, computed as exp(gamma * log(alpha * data + beta)).
    when alpha * data + beta < 0, the output is a meaningless value.
    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32

    gamma : the data type must be the same as the dtype parameter
        power in (alpha * data + beta) ** gamma

    alpha : the data type must be the same as the dtype parameter
        scale in (alpha * data + beta) ** gamma

    beta : the data type must be the same as the dtype parameter
        shift in (alpha * data + beta) ** gamma

    kernel_name : string
        kernel name of the generated CCE kernel; default value is "cce_caffe_power"

    need_build : bool
        if need to build CCEC kernel

    need_print : bool
        if need to print Halide IR

    Returns
    -------
    None
        
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if dtype.lower() not in supported_dtypes:
        raise RuntimeError("power_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim_x = len(shape)
    v_ndim_y = 0
    p_shape_y = 0
    p_input_y = "nullptr"
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0

    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([gamma], inp_dtype, "p_power")
    p_shape_x = util.create_param_ptr(shape, "int32", "p_shape_x")

    # scale --> alpha, shift --> beta, power --> gamma
    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_power, p_shape_x],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # power
            v_ndim_x,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            v_ndim_y,
            v_ndim_y,
            p_shape_y,
            pad_c0,
            p_input_y,
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
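
The log/exp rewrite named in the docstring can be verified numerically. A minimal NumPy sketch, assuming a strictly positive base so the logarithm is defined:

import numpy as np

x = np.array([0.5, 1.0, 2.0], dtype=np.float32)
alpha, beta, gamma = 2.0, 1.0, 0.75
base = alpha * x + beta            # all positive here, so log(base) is defined
lhs = base ** gamma
rhs = np.exp(gamma * np.log(base))
assert np.allclose(lhs, rhs)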
Example #4
def custom_expm1(shape,
                 dtype,
                 kernel_name="cce_tf_expm1",
                 need_build=False,
                 need_print=False):
    """
    algorithm: expm1

    calculating data's expm1, y = (e ** x) - 1; dtype is float16 or float32.

    Parameters
    ----------
    shape : shape of data.

    dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32.

    kernel_name : cce kernel name, default value is "cce_tf_expm1".

    need_build : if need to build CCEC kernel, default value is False.

    need_print : if need to print the ir, default value is False.

    Returns
    -------
    None

    """

    # [aicpu] int32_t cc_device_exp(uint32_t blockNum, uint32_t blockIdx, int32_t dataType, const void *scale, const void *shift,
    # const void *base, int32_t dimCnt, int32_t *shape, uint32_t padC0, const void *x, void *y);

    supported_dtypes = ["float16", "float32"]

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if dtype.lower() not in supported_dtypes:
        raise RuntimeError("tf_expm1_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    # step 1. calculate y = e ** x via the AICPU device API
    device_api = "DeviceExp"
    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")  # base -1 --> e
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output_exp = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output_exp",
        dtype=inp_dtype)

    offset = tvm.const((-1), dtype=inp_dtype)

    # step 2. calculate y = e ** x - 1 via tvm
    output = tvm.compute(
        shape,
        lambda *indice: output_exp(*indice) + offset.astype(inp_dtype),
        name="output")

    # step 3. schedule the computation by tvm
    s = tvm.create_schedule(output.op)

    # step 4. build by tvm
    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
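
A standalone NumPy sketch of the same two-step computation, for comparison only (np.expm1 is just a numerical reference; the kernel computes exp on device and subtracts 1 via tvm.compute):

import numpy as np

x = np.float32(1e-4)
two_step = np.exp(x) - 1.0    # mirrors steps 1 and 2 above
print(two_step, np.expm1(x))  # np.expm1 is the numerically safer reference for small x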
Example #5
def custom_exp(shape,
               dtype,
               kernel_name="cce_tf_exp",
               need_build=False,
               need_print=False):
    """
    algorithm: exp

    calculating data's exp, y = e ** x; dtype is float16 or float32.
    
    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32

    kernel_name : cce kernel name, default value is "cce_tf_exp"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if dtype.lower() not in supported_dtypes:
        raise RuntimeError("tf_exp_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")  # base -1 --> e, so y = e ** x
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
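
A possible invocation, sketched under the assumption that it runs inside this module where tvm, util, build_config, and SHAPE_SIZE_LIMIT are already in scope; the shape, kernel name, and flags are illustrative only:

custom_exp((16, 128), "float16", kernel_name="cce_tf_exp_demo",
           need_print=True, need_build=False)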