Code Example #1
File: assign.py Project: gekowa/ascend-opp
def _check_params(ref_shape, value_shape, dtype, kernel_name):
    """
    check the parameters including ref_shape, value_shape, dtype and kernel_name

    Parameters
    ----------
    ref_shape: list or tuple
        shape of ref_tensor
    value_shape: list or tuple
        shape of value_tensor
    dtype: str
        the data type
    kernel_name: str
        cce kernel name, default value is "cce_assign"

    Returns
    -------
    None
    """

    check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                  "uint32", "uint64", "float16", "float32")
    check_dtype(dtype, check_list, param_name="ref")

    _check_shape(ref_shape, value_shape)
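Every example on this page calls the op_utils validators (check_dtype, check_shape) before building placeholders. As a rough, self-contained illustration of the pattern (an assumption-based sketch, not the actual op_utils implementation), the validators behave roughly like this:

def check_dtype_sketch(dtype, check_list, param_name="input"):
    # Reject dtypes outside the supported tuple, in the RuntimeError style
    # used by the operator code on this page.
    if dtype not in check_list:
        raise RuntimeError("The dtype of %s must be one of %s, but got %s"
                           % (param_name, check_list, dtype))

def check_shape_sketch(shape, param_name="input"):
    # A shape must be a non-empty sequence of positive integers.
    if not shape or any(int(dim) <= 0 for dim in shape):
        raise RuntimeError("The shape of %s must contain positive dims, got %s"
                           % (param_name, shape))

# Mirrors the calls inside _check_params above (hypothetical values).
check_dtype_sketch("float16", ("float16", "float32"), param_name="ref")
check_shape_sketch((16, 32), param_name="ref")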
Code Example #2
def _check_para_and_getplaceholder(scalar_input, tensor_input, input_dict):
    check_list = ("float32", )
    var_shape = input_dict["var"].get("shape")
    var_dtype = input_dict["var"].get("dtype")
    list_placeholder = []
    for key, value in input_dict.items():
        shape = util.scalar2tensor_one(value.get("shape"))
        op_utils.check_shape(shape)
        if value in scalar_input:
            if not util.is_scalar(shape):
                raise RuntimeError("The shape of ", key, " must be scalar")
        if value in tensor_input:
            if shape != var_shape:
                raise RuntimeError("The shape of", key,
                                   "must be the same as the var")

        dtype = value.get("dtype").lower()
        op_utils.check_dtype(dtype, check_list, param_name="var")
        if dtype != var_dtype:
            raise RuntimeError("The dtype of", key,
                               "must be the same as the var")

        shape_refine = (functools_reduce(operator.mul, shape), )
        list_placeholder.append(
            tvm.placeholder(shape=shape_refine, name=key, dtype=dtype))
    return list_placeholder
Code Example #3
def _check_dtype(dtype_x, dtype_sum, dtype_square_sum,
                 dtype_scale, dtype_offset):
    check_dtype(dtype_x, ("float16", "float32"))
    check_dtype(dtype_sum, ("float32",))
    check_dtype(dtype_square_sum, ("float32",))
    check_dtype(dtype_scale, ("float32",))
    check_dtype(dtype_offset, ("float32",))
Code Example #4
def optional_weight(tensor_list, predict_shape, dtype_list, weight,
                    pos_weight):
    weight_data = None
    pos_weight_data = None
    if weight is not None:
        weight_shape = weight.get("shape")
        weight_dtype = weight.get("dtype").lower()
        op_utils.check_dtype(weight_dtype, dtype_list)
        _broadcast_shape_check(weight_shape, predict_shape)

        weight_shape = tuple(
            [1] *
            (len(predict_shape) - len(weight_shape))) + tuple(weight_shape)
        weight_data = tvm.placeholder(weight_shape,
                                      weight_dtype,
                                      name="weight_data")
        tensor_list.append(weight_data)

    if pos_weight is not None:
        pos_weight_shape = pos_weight.get("shape")
        pos_weight_dtype = pos_weight.get("dtype").lower()

        op_utils.check_dtype(pos_weight_dtype, dtype_list)
        _broadcast_shape_check(pos_weight_shape, predict_shape)

        pos_weight_shape = tuple([1] *
                                 (len(predict_shape) - len(pos_weight_shape))
                                 ) + tuple(pos_weight_shape)
        pos_weight_data = tvm.placeholder(pos_weight_shape,
                                          pos_weight_dtype,
                                          name="pos_weight_data")
        tensor_list.append(pos_weight_data)

    return weight_data, pos_weight_data
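The weight handling in optional_weight above relies on rank alignment: leading 1s are prepended to the weight shape so it can broadcast against predict_shape. A standalone run of that same expression (with made-up example shapes) shows the effect:

# Rank-align a lower-rank weight shape with predict_shape for broadcasting,
# using the same expression as optional_weight above.
predict_shape = (8, 16, 32)   # hypothetical example values
weight_shape = (32,)

padded = tuple([1] * (len(predict_shape) - len(weight_shape))) + tuple(weight_shape)
print(padded)  # (1, 1, 32) -- now the same rank as predict_shape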
Code Example #5
File: square_sum_all.py Project: gekowa/ascend-opp
    def check_param(self):
        """
        Check parameter

        Parameters
        ----------
        None

        Returns
        -------
        None
        """
        op_utils.check_shape(self.input_x_shape, param_name="input_x")
        op_utils.check_shape(self.input_y_shape, param_name="input_y")
        op_utils.check_dtype(self.input_x_dtype, ("float32", ),
                             param_name="input_x")
        op_utils.check_dtype(self.input_y_dtype, ("float32", ),
                             param_name="input_y")

        add_support = tbe_platform.cce_conf.api_check_support(
            "tik.vadd", "float32")

        if self.input_x_dtype != self.input_y_dtype:
            raise RuntimeError(
                "input_x and input_y do not have the same dtype")

        if self.input_x_dtype == "float32" and not add_support:
            raise RuntimeError(
                "Input dtype is float32, but do not support on the platform")
Code Example #6
def check_supported(x,
                    segment_ids,
                    y,
                    num_segments,
                    kernel_name="unsorted_segment_max_d"):
    """
    fusion pass test if num_segments is int32
    """
    shape = x.get("shape")
    dtype = x.get("dtype").lower()
    segment_ids_shape = segment_ids.get("shape")
    segment_ids_dtype = segment_ids.get("dtype").lower()
    check_list = ("float16", "float32", "int32", "int16")
    op_utils.check_dtype(dtype, check_list, param_name="x")
    op_utils.check_shape(shape, param_name="x")
    check_list_ids = ("int32")
    op_utils.check_dtype(segment_ids_dtype,
                         check_list_ids,
                         param_name="segment_ids")
    if num_segments <= 0:
        return False
    first_shape = int(shape[0])
    ids_length = int(segment_ids_shape[0])
    if first_shape != ids_length:
        return False
    total_ub_size = (num_segments + first_shape) * BLOCK_LENGTH + (
        (BLOCK_LENGTH // 2 - first_shape %
         (BLOCK_LENGTH // 4)) + first_shape) * (BLOCK_LENGTH // 8)
    if total_ub_size > UB_SIZE_MAX // 2:
        return False
    return True
Code Example #7
File: kl_div.py Project: gekowa/ascend-opp
def _check_parameter(input_x, input_target):
    """
    Parameters
    ----------
    input_x : dict
        shape and dtype of input_x
    input_target : dict
        shape and dtype of input_target. Shape and dtype must be the same as input_x
    Returns
    ------
    None
    """
    shape_x = input_x.get("shape")
    shape_target = input_target.get("shape")
    op_utils.check_shape(shape_x, param_name="input_x")
    if list(shape_x) != list(shape_target):
        raise RuntimeError("input_x and input_target must "
                           "have the same shape.")

    # check input tensor data_type
    dtype_x = input_x.get("dtype").lower()
    dtype_target = input_target.get("dtype").lower()
    check_list = ("float16", "float32")
    op_utils.check_dtype(dtype_x, check_list, param_name="input_x")
    if dtype_x != dtype_target:
        raise RuntimeError("input_x and input_target must "
                           "have the same dtype.")

    if dtype_x == "float32" and not tbe_platform.cce_conf.api_check_support(
            "te.lang.cce.vmul", "float32"):
        raise RuntimeError(
            "Instric only support float16 while input dtype is float32")
Code Example #8
File: reduce_max_d.py Project: gekowa/ascend-opp
def reduce_max_d(x, y, axes=None, keepdims=None, kernel_name="reduce_max_d"):
    """
    reduce a tensor on a certain axes based on max.

    Parameters
    ----------
    x : dict
        shape and dtype of input
    y : dict
        shape and dtype of output, should be same shape and type as input
    axes: list
        the axes to reduce, may be negative to index from the end
        (e.g., -1 for the last axis).
        axes may be int or list (e.g. [1, 2])
    keepdims: bool
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name : str
        kernel name, default value is "reduce_max_d"

    Returns
    -------
    None
    """

    dtype = x["dtype"]
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_lower, check_list)

    with te.op.compute():
        shape = x["shape"]
        shape_range = x["range"]

        shape_len = len(shape)
        if not axes:
            axes = range(shape_len)
        if hasattr(axes, 'index'):
            axes = list(axes)
        axes = cce_util.axis_check(shape_len, axes)

        shape_new, shape_range_new, axes_new, fused_rel_dic = \
            fused_reduce_axis(shape, shape_range, axes)
        add_compile_info("fused_rel_dic", fused_rel_dic)

        x["shape"] = shape_new
        x["range"] = shape_range_new
        shape_var_new = variable_shape([x])[0]

        data_input = tvm.placeholder(shape_var_new, name="data_input",
                                     dtype=dtype_lower)
        res = reduce_max_d_compute(data_input, y, axes_new, keepdims)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # build
    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.dynamic.build(sch, config)
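reduce_max_d normalizes axes with cce_util.axis_check before fusing them. As a hedged sketch of what such a normalization step typically does (negative axes wrap around, out-of-range axes are rejected; this is an assumption, not the cce_util source):

def axis_check_sketch(shape_len, axes):
    # Wrap negative axes (e.g. -1 -> last axis) and validate the range,
    # matching the docstring above ("may be negative to index from the end").
    normalized = []
    for axis in axes:
        if axis < -shape_len or axis >= shape_len:
            raise RuntimeError("axis %d is out of range for rank %d"
                               % (axis, shape_len))
        normalized.append(axis % shape_len)
    return normalized

print(axis_check_sketch(4, [-1, 2]))  # [3, 2]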
Code Example #9
def log(input_x, output_y, base=-1.0, scale=1.0, shift=0.0, kernel_name="log"):
    """
    calculating data

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "log"

    Returns
    -------
    None
    """

    shape = input_x.get("shape")
    dtype = input_x.get("dtype")
    input_dtype = dtype.lower()

    # input_x's shape check
    op_utils.check_shape(shape, param_name="input_x")

    # input_x's dtype check, only supports fp16 and fp32
    check_list = ("float16", "float32")
    op_utils.check_dtype(input_dtype, check_list, param_name="input_x")

    if base <= 0 and (not isclose(base, -1.0)):
        error_info = {}
        error_info['errCode'] = 'E80000'
        error_info['param_name'] = 'base'
        error_info['op_name'] = 'log'
        error_info['expect_value'] = "strictly positive or -1"
        error_info['real_value'] = base
        raise RuntimeError("In op[%s], the parameter[%s] should be [%s], but actually is [%s]."
                           % (error_info['op_name'], error_info['param_name'], \
                              error_info['expect_value'], error_info['real_value']))

    fused_shape = [reduceIns(lambda x, y: x * y, shape[:])]
    data_input = tvm.placeholder(fused_shape,
                                 name="data_input",
                                 dtype=input_dtype)

    res = log_compute(data_input, output_y, base, scale, shift, kernel_name)

    # auto schedule
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # operator build
    config = {
        "name": kernel_name,
        "need_build": True,
        "tensor_list": (data_input, res)
    }

    te.lang.cce.cce_build_code(sch, config)
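The base check in log() accepts either a strictly positive base or the sentinel -1.0 (commonly used to mean the natural logarithm); everything else triggers error code E80000. The condition can be exercised on its own:

from math import isclose

def base_is_valid(base):
    # Equivalent to "not (base <= 0 and not isclose(base, -1.0))" from log() above.
    return base > 0 or isclose(base, -1.0)

print(base_is_valid(2.0))   # True  -> positive base accepted
print(base_is_valid(-1.0))  # True  -> sentinel accepted
print(base_is_valid(0.0))   # False -> rejected with E80000 above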
Code Example #10
File: mul.py Project: gekowa/ascend-opp
def mul(x, y, output, kernel_name="mul"):
    """
    do element-wise mul operation between two input tensors

    Parameters:
    ----------
    x : dict.
        shape, dtype of input x
    y : dict.
        shape, dtype of input y
    output : dict.
        shape, dtype of output
    kernel_name : str.
        cce kernel name, default value is "mul"

    Returns
    -------
    None
    """
    # format_pattern = 1  Nz and vector
    # format_pattern = 2  vector and Nz
    # format_pattern = 0  Nz scalar  Nz Nz  ND ND
    format_pattern = _mul_check_format(x, y)
    shape_x, shape_y = _infer_shape(format_pattern, x, y)

    shape_x = util.scalar2tensor_one(shape_x)
    dtype_x = x.get("dtype").lower()
    shape_y = util.scalar2tensor_one(shape_y)
    dtype_y = y.get("dtype").lower()

    op_utils.check_shape(shape_x, param_name="x")
    op_utils.check_shape(shape_y, param_name="y")

    if dtype_x != dtype_y:
        raise RuntimeError("dtype of inputs should be consistent")
    dtype = dtype_x
    check_list = ("int32", "float16", "float32", "int16")
    op_utils.check_dtype(dtype, check_list, param_name="x")

    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if dtype_x == "float32" and not vmul_support:
        raise RuntimeError(
            "Input dtype is float32, but do not support on the platform")

    shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
        shape_x, shape_y, param_name_input1="x", param_name_input2="y")

    shape_x, shape_y = op_utils.refine_shapes_for_broadcast(shape_x, shape_y)
    input_x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    input_y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    res = _mul_compute(input_x, input_y, output, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (input_x, input_y, res)}
    te.lang.cce.cce_build_code(sch, config)
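mul() leans on op_utils.broadcast_shapes to reconcile the two input shapes before creating placeholders. A minimal NumPy-style approximation of that resolution (an assumption about its behavior, not the op_utils code) looks like this:

def broadcast_shapes_sketch(shape_x, shape_y):
    # Align shapes from the right; each pair of dims must match or contain a 1,
    # and the broadcast dim is the larger of the two.
    rank = max(len(shape_x), len(shape_y))
    x = [1] * (rank - len(shape_x)) + list(shape_x)
    y = [1] * (rank - len(shape_y)) + list(shape_y)
    out = []
    for dim_x, dim_y in zip(x, y):
        if dim_x != dim_y and dim_x != 1 and dim_y != 1:
            raise RuntimeError("shapes %s and %s cannot be broadcast together"
                               % (shape_x, shape_y))
        out.append(max(dim_x, dim_y))
    return x, y, out

print(broadcast_shapes_sketch((16, 1, 32), (8, 32)))
# ([16, 1, 32], [1, 8, 32], [16, 8, 32])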
Code Example #11
File: logical_or.py Project: gekowa/ascend-opp
def logical_or(x1, x2, y, kernel_name="logical_or"):
    """
    algorithm : logical_or
    calculating the value of x1 OR x2 element-wise

    Parameters
    ----------
    x1 : the dict of x1,
         include shape and dtype,
         dtype support int8, the value only support 0, 1

    x2 : the dict of x2,
         include shape and dtype,
         dtype support int8, the value only support 0, 1

    y : the dict of y, include shape and dtype

    kernel_name : string, cce kernel name, default value is "logical_or"

    Returns
    -------
    None
    """

    shape_x1 = x1.get("shape")
    shape_x2 = x2.get("shape")
    dtype_x1 = x1.get("dtype")
    dtype_x2 = x2.get("dtype")
    if dtype_x1 == "bool" or dtype_x2 == "bool":
        dtype_x1 = "int8"
        dtype_x2 = "int8"

    check_shape(shape_x1, param_name="x1")
    check_shape(shape_x2, param_name="x2")

    check_tuple = ("int8", )
    check_dtype(dtype_x1, check_tuple, param_name="x1")
    check_dtype(dtype_x2, check_tuple, param_name="x2")

    shape_x1, shape_x2, shape_max = broadcast_shapes(shape_x1,
                                                     shape_x2,
                                                     param_name_input1="x1",
                                                     param_name_input2="x2")
    dtype = dtype_x1.lower()
    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype)

    res = logical_or_compute(data_x1, data_x2, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "need_build": False,
        "name": kernel_name,
        "tensor_list": (data_x1, data_x2, res)
    }
    te.lang.cce.cce_build_code(schedule, config)
Code Example #12
File: atan_grad.py Project: gekowa/ascend-opp
def atan_grad(y, dy, z, kernel_name="atan_grad"):
    """
    Gradient calculation for atan(x)

    Parameters:
    ----------
    y : dict of y, include shape and dtype, dtype support float16, float32
    dy : dict of dy, include shape and dtype, dtype support float16, float32
    z : dict of output, include shape and dtype
    kernel_name : cce kernel name, default value is atan_grad

    Algorithm :
    ----------
    forward :
        y = atan(x)
    backward gradient :
        de/dx = dy/dx*de/dy = 1/(1+x^2)*grad

    Returns
    ----------
    None
    """

    # get the shape and dtype
    shape = y.get("shape")
    shape_grad = dy.get("shape")
    dtype = y.get("dtype")
    dtype_grad = dy.get("dtype")

    # check whether kernel name is unique

    # check whether the shape is right
    check_shape(shape, param_name="y")
    check_shape(shape_grad, param_name="dy")
    if not operator.eq(shape, shape_grad):
        raise RuntimeError("all input shape must be the same")
    shape, _ = refine_shape_axes(shape, [])

    # check whether dtypes are fp16,fp32 and whether they are the same
    check_list = ("float16", "float32")
    check_dtype(dtype, check_list, param_name="y")
    check_dtype(dtype_grad, check_list, param_name="dy")
    dtype = dtype.lower()
    if dtype != dtype_grad.lower():
        raise RuntimeError("all input dtype must be same")

    # get 2 input placeholders: data_input, grad
    data_input = tvm.placeholder(shape, name="input_data", dtype=dtype)
    grad = tvm.placeholder(shape, name="input_grad", dtype=dtype)

    # compute the backward gradient
    res = atan_grad_compute(data_input, grad, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_input, grad, res]}
    te.lang.cce.cce_build_code(sch, config)
Code Example #13
File: floor_div.py Project: gekowa/ascend-opp
def floor_div(input_x, input_y, output_z, kernel_name="floor_div"):
    """
      algorithm: floordiv
      calculating data's floordiv, res =floor(x / y)

      Parameters
      ----------
      input_x: dict
      input_y: dict
      output_z: dict
      kernel_name: str, default value is "floor_div"

      Returns
      -------
      None
    """
    # check dtype of input_x/input_y
    input_dtype_x = input_x.get("dtype").lower()
    input_dtype_y = input_y.get("dtype").lower()
    check_list = ('int8', 'uint8', 'int32', 'float16', 'float32')
    check_dtype(input_dtype_x, check_list, param_name="input_x")
    check_dtype(input_dtype_y, check_list, param_name="input_y")
    check_elewise_shape_range([input_x, input_y], support_broadcast=True)
    if input_dtype_x != input_dtype_y:
        error_info = {}
        error_info['errCode'] = OP_ERROR_CODE_018
        error_info['op_name'] = 'floor_div'
        error_info['param_name1'] = 'input_dtype_x'
        error_info['param_name2'] = 'input_dtype_y'
        error_info['param1_dtype'] = str(input_dtype_x)
        error_info['param2_dtype'] = str(input_dtype_y)
        raise RuntimeError(error_info,
                           "In op[%s], the parameter[%s][%s] are not equal in "
                           "dtype with dtype[%s][%s]." % (
                               error_info['op_name'],
                               error_info['param_name1'],
                               error_info['param_name2'],
                               error_info['param1_dtype'],
                               error_info['param2_dtype']))

    ins = classify([input_x, input_y], Mode.ELEWISE_WITH_BROADCAST)
    schedules, tensors = [], []
    for (input_x, input_y) in ins:
        with te.op.compute():
            x_shape, y_shape = variable_shape([input_x, input_y],
                                              support_broadcast=True)
            x_shape, y_shape = refine_shapes_for_broadcast(x_shape, y_shape)
            tensor_x = tvm.placeholder(x_shape, input_dtype_x, "tensor_x")
            tensor_y = tvm.placeholder(y_shape, input_dtype_y, "tensor_y")
            res = floor_div_compute(tensor_x, tensor_y, output_z, kernel_name)

            tensors.append([tensor_x, tensor_y, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)
Code Example #14
def data_format_dim_map(x,
                        y,
                        src_format="NHWC",
                        dst_format="NCHW",
                        kernel_name="data_format_dim_map"):
    """
    Returns the dimension index in the destination data format given the one in the source data format.

    Parameters
    ----------
    x : A Tensor with each element as a dimension index in source data format.
        Must be the following types: `int32`. Must be in the range [-4, 4).
    y : Shape and dtype of y, reserved parameter, not used now.
    src_format : An optional `string`. Defaults to `"NHWC"`. source data format.
    dst_format : An optional `string`. Defaults to `"NCHW"`. destination data format.
    kernel_name : CCE kernel name, default value is "data_format_dim_map" (optional).

    Returns
    -------
    None
    """

    shape_input = x.get("shape")
    dtype_input = x.get("dtype")

    # check kernel name, shape, size, dtype
    check_shape(shape_input, param_name="x")
    shape_input, _ = refine_shape_axes(shape_input, [])
    check_list = ("int32", )
    dtype_input = dtype_input.lower()
    check_dtype(dtype_input, check_list, param_name="x")

    # check length of format
    if len(src_format) != 4:
        raise ValueError(
            "source format must of length 4, received src_format = %s" %
            src_format)

    if len(dst_format) != 4:
        raise ValueError(
            "destination format must of length 4, received dst_format = %s" %
            dst_format)
    # get data and compute
    data_input = tvm.placeholder(shape_input,
                                 dtype=dtype_input,
                                 name="data_input")
    res = _data_format_dim_map_compute(data_input, y, src_format, dst_format,
                                       kernel_name)

    with tvm.target.cce():
        sch = topi.generic.auto_schedule(res)
    config = {
        "name": kernel_name,
        "print_ir": False,
        "tensor_list": (data_input, res),
        "bool_storage_as_1bit": False
    }
    te.lang.cce.cce_build_code(sch, config)
Code Example #15
def reduce_sum_d(x, y, axis=None, keepdims=None, kernel_name="reduce_sum_d"):
    """reduce a tensor on a certain axis based on sum.

    Parameters:
    ----------
    x: dict
        the dict of input tensor.
    y: dict
        the dict of output tensor.
    axis: int, list, tuple or NONETYPE
        the axis for reduce.
    keepdims: bool or NONETYPE
        if true, retains reduced dimensions with length 1.
    kernel_name: str
        cce kernel name, default value is "reduce_sum_d".

    Returns
    -------
    None
    """

    dtype = x["dtype"]
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32")
    check_dtype(dtype_lower, check_list, param_name="x")

    with te.op.compute():
        shape = x["shape"]
        shape_range = x["range"]

        axes = []
        shape_len = len(shape)
        if not axis:
            for i, _ in enumerate(shape):
                axes.append(i)
        else:
            axes = list(axis)
        axes = cce_util.axis_check(shape_len, axes)

        shape_new, shape_range_new, axes_new, fused_rel_dic = \
            fused_reduce_axis(shape, shape_range, axes)

        add_compile_info("fused_rel_dic", fused_rel_dic)
        x["shape"] = shape_new
        x["range"] = shape_range_new
        shape_var_new = variable_shape([x])[0]

        data_input = tvm.placeholder(shape_var_new,
                                     name="data_input",
                                     dtype=dtype_lower)
        res = reduce_sum_d_compute(data_input, y, axes_new, keepdims)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # build
    config = {"name": kernel_name, "tensor_list": [data_input, res]}
    te.lang.dynamic.build(sch, config)
Code Example #16
File: assign_sub.py Project: gekowa/ascend-opp
def assign_sub(var, value, out, kernel_name='assign_sub'):
    """
    Update var by subtracting value from it.

    Parameters:
    ----------
    var : dict
        dict of input_var, include shape and dtype,
        dtype support int8, uint8, int32, float16, float32

    value : dict
        dict of input_value, include shape and dtype,
        dtype support int8, uint8, int32, float16, float32.
        Must have the same shape and dtype as input_var

    out : dict
        dict of out

    kernel_name : str
        cce kernel name, default value is "assign_sub"

    Returns
    -------
    None
    """

    # get the shape and dtype
    shape_var = var.get("shape")
    shape_value = value.get("shape")
    dtype_var = var.get("dtype")
    dtype_value = value.get("dtype")

    # kernel name check: should be unique

    # check whether the shape is right
    check_shape(shape_var, param_name="var")
    check_shape(shape_value, param_name="value")
    if not operator.eq(shape_var, shape_value):
        raise RuntimeError("all input shape must be the equal")

    # check whether dtypes are fp16, fp32, int8, uint8, int32
    # and whether they are the same
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_var, check_list, param_name="var")
    check_dtype(dtype_value, check_list, param_name="value")
    dtype_var = dtype_var.lower()
    dtype_value = dtype_value.lower()
    if dtype_var != dtype_value:
        raise RuntimeError("all input dtype must be same")

    shape, _ = refine_shape_axes(shape_var, [])
    data_var = tvm.placeholder(shape, dtype=dtype_var, name='data_var')
    data_value = tvm.placeholder(shape, dtype=dtype_value, name='data_value')
    sch, res = _assign_sub_compute(data_var, data_value, out, kernel_name)

    with set_bool_storage_config():
        tvm.build(sch, [data_var, data_value, res], "cce", name=kernel_name)
Code Example #17
    def __init__(self, var, indices, updates, var_out, use_locking,
                 kernel_name):
        self.tik_instance = tik.Tik(tik.Dprofile())
        self.var_dtype = var.get("dtype").lower()
        self.indices_dtype = indices.get("dtype").lower()
        self.updates_dtype = updates.get("dtype").lower()
        self.out_dtype = var_out.get("dtype").lower()
        indices_support_dtype_list = ("int32", )
        var_support_dtype_list = ("float32", )
        check_dtype(self.indices_dtype,
                    indices_support_dtype_list,
                    param_name="indices")
        check_dtype(self.var_dtype, var_support_dtype_list, param_name="var")
        if self.var_dtype != self.updates_dtype:
            error_manager_vector.raise_err_inputs_dtype_not_equal(
                kernel_name, "updates", "var", self.updates_dtype,
                self.var_dtype)
        if self.var_dtype != self.out_dtype:
            error_manager_vector.raise_err_inputs_dtype_not_equal(
                kernel_name, "out", "var", self.out_dtype, self.var_dtype)
        self.kernel_name = kernel_name

        self.ai_core_num = tbe_platform.cce_conf.get_soc_spec(
            tbe_platform.cce_conf.CORE_NUM)
        self.ub_size_bytes = (
            tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.UB_SIZE) -
            RESERVED_UB_SIZE)
        self.var_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(
            self.var_dtype) // 8
        self.indices_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(
            self.indices_dtype) // 8
        self.var_data_each_block = 32 // self.var_dtype_bytes_size
        self.indices_data_each_block = 32 // self.indices_dtype_bytes_size

        self.tiling_gm = self.tik_instance.Tensor("int32", (TILING_ARG_NUM, ),
                                                  name="tiling_gm",
                                                  scope=tik.scope_gm)
        self.var_gm = self.tik_instance.Tensor(self.var_dtype, (MAX_INT32, ),
                                               name="var_gm",
                                               scope=tik.scope_gm)
        self.indices_gm = self.tik_instance.Tensor(self.indices_dtype,
                                                   (MAX_INT32, ),
                                                   name="indices_gm",
                                                   scope=tik.scope_gm)
        self.updates_gm = self.tik_instance.Tensor(self.updates_dtype,
                                                   (MAX_INT32, ),
                                                   name="updates_gm",
                                                   scope=tik.scope_gm)
        self.out_gm = self.tik_instance.Tensor(self.var_dtype, (MAX_INT32, ),
                                               name="out_gm",
                                               scope=tik.scope_gm)

        self.updates_ub = None
        self.indices_ub = None
        self.var_read_index = None
        self.updates_read_index = None
        self.indices_loop_index = None
Code Example #18
def bn_training_reduce(x, sum, square_sum, kernel_name="bn_training_reduce"):
    """
    algorithm: part of fused_batch_norm_v2
    The first step of batch_norm
    which to calculate the sum and square sum of x.
    The major component of this operator is reduce operation.

    Parameters
    ----------
    x: dict
        dict of input, A 5HD Tensor for input data.
    sum: dict
        dict of sum, A `Tensor`. Sum of x.
    square_sum: dict
        dict of square_sum, A `Tensor`. Square sum of x.
    kernel_name: str
        kernel name, default value is "bn_training_reduce"

    Returns
    -------
    None
    """
    data_format = x.get("format").upper()
    origin_format = x.get("ori_format").upper()
    dtype = x.get("dtype").lower()

    # check and format
    check_list = ("NC1HWC0", "NCHW")
    check_format(data_format, check_list, param_name="x")
    if data_format == "NCHW" and origin_format not in ("NCHW", ):
        raise RuntimeError("The origin format only supports "
                           "NCHW when format is NCHW")

    # check dtype
    check_list = ("float16", "float32")
    check_dtype(dtype, check_list, param_name="x")

    # get dynamic shape, x.get("shape"), x.get("range")
    shape_x = variable_shape([x])[0]

    # compute
    with te.op.compute():
        data_input = tvm.placeholder(shape_x, name="data_input", dtype=dtype)
        res = bn_training_reduce_compute(data_input,
                                         sum,
                                         square_sum,
                                         kernel_name=kernel_name)

    # schedule
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # build
    tensor_list = [data_input] + list(res)
    config = {"name": kernel_name, "tensor_list": tensor_list}
    te.lang.dynamic.build(sch, config)
Code Example #19
def _float32_process(data, dst_type):
    """
    deal with src dtype=float32 case
    """
    check_list_value = ("int32", "float16")
    check_dtype(dst_type, check_list_value, param_name="from_fp32_to_dsttype")
    if dst_type == "int32":
        return te.lang.dynamic.cast_to(data, "int32")
    if dst_type == "float16":
        return te.lang.dynamic.cast_to(data, "float16")
Code Example #20
def lp_loss(predict, label, y, p, reduction="mean", kernel_name="lp_loss"):
    """
    :param predict: dict
        shape and dtype of input
    :param label: dict
        shape and dtype of label, should be same shape and type as predict
    :param y: dict
        shape and dtype of y, should be same shape and type as predict
    :param p: int
        decides which loss to compute; currently p can only be 1, which computes l1_loss
    :param reduction: str
        reduce mode, can be 'mean', 'sum' or 'none'
    :param kernel_name: kernel name, default value is "lp_loss"
    :return:
        None
    """
    predict_shape = predict.get("shape")
    predict_dtype = predict.get("dtype").lower()
    label_shape = label.get("shape")
    label_dtype = label.get("dtype").lower()

    dtype_list = ["float16", "float32"]
    reduction_list = ["none", "mean", "sum"]

    op_utils.check_dtype(predict_dtype, dtype_list)
    op_utils.check_dtype(label_dtype, dtype_list)
    op_utils.check_shape(predict_shape)
    op_utils.check_shape(label_shape)

    util.compare_tensor_dict_key(predict, label, "shape")
    util.compare_tensor_dict_key(predict, label, "dtype")

    if p != 1:
        raise RuntimeError("lp_loss only supports l1_loss")

    if reduction not in reduction_list:
        raise RuntimeError("reduction should be one of ['none','mean','sum']")

    predict_data = tvm.placeholder(predict_shape,
                                   dtype=predict_dtype,
                                   name="predict_data")
    label_data = tvm.placeholder(label_shape,
                                 dtype=label_dtype,
                                 name="label_data")

    res = lp_loss_compute(predict_data, label_data, p, reduction, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [predict_data, label_data, res]
    }
    te.lang.cce.cce_build_code(schedule, config)
Code Example #21
def elu_grad(grads, activations, y, kernel_name="elu_grad"):
    """
    do element-wise elu_grad operation

    Parameters:
    ----------
    grads: the dict of gradient input, only support float16, float32

    activations: the dict of activation input, only support float16, float32

    y : the dict of output

    kernel_name : cce kernel name, default value is "cce_elu_grad"

    Returns
    -------
    None
    """

    shape_gradient = grads.get("shape")
    shape_activation = activations.get("shape")
    dtype_gradient = grads.get("dtype")
    dtype_activation = activations.get("dtype")

    check_shape(shape_gradient, param_name="grads")
    check_shape(shape_activation, param_name="activations")
    if not operator.eq(shape_gradient, shape_activation):
        raise RuntimeError("all input shape must be equal")
    shape_gradient, _ = refine_shape_axes(shape_gradient, [])
    shape_activation, _ = refine_shape_axes(shape_activation, [])

    check_list = ("float16", "float32")
    check_dtype(dtype_gradient, check_list, param_name="grads")
    check_dtype(dtype_activation, check_list, param_name="activations")
    if dtype_gradient.lower() != dtype_activation.lower():
        raise RuntimeError("all input dtype must be same")

    dtype = dtype_gradient.lower()
    data_gradient = tvm.placeholder(shape_gradient,
                                    dtype=dtype,
                                    name="data_gradient")
    data_activation = tvm.placeholder(shape_activation,
                                      dtype=dtype,
                                      name="data_activation")
    res = elu_grad_compute(data_gradient, data_activation, y, kernel_name)

    with tvm.target.cce():
        auto_sch = topi.generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "print_ir": False,
        "tensor_list": [data_gradient, data_activation, res]
    }
    te.lang.cce.cce_build_code(auto_sch, config)
Code Example #22
File: scatter_add.py Project: gekowa/ascend-opp
    def __init__(self, var, indices, updates, var_out, use_locking,
                 kernel_name):
        self.tik_instance = tik.Tik(tik.Dprofile())
        self.indicesdtype = indices.get("dtype").lower()
        self.updatesdtype = updates.get("dtype").lower()
        self.vardtype = var.get("dtype").lower()
        self.var_out_dtype = var_out.get("dtype").lower()
        indices_support_dtype_list = ("int32", )
        check_dtype(self.indicesdtype,
                    indices_support_dtype_list,
                    param_name="indices")
        updates_support_dtype_list = ("float32", )
        check_dtype(self.updatesdtype,
                    updates_support_dtype_list,
                    param_name="updates")
        self.tiling_dtype = "int32"
        if self.updatesdtype != self.vardtype:
            error_manager_vector.raise_err_inputs_dtype_not_equal(
                kernel_name, "updates", "var", self.updatesdtype,
                self.vardtype)
        if self.vardtype != self.var_out_dtype:
            error_manager_vector.raise_err_inputs_dtype_not_equal(
                kernel_name, "var_out", "var", self.var_out_dtype,
                self.vardtype)
        self.kernel_name = kernel_name
        self.var_read_index = self.tik_instance.Scalar("int32")
        self.updates_read_index = self.tik_instance.Scalar("int32")
        self.indices_loop_index = self.tik_instance.Scalar("int32")
        self.zero_var = self.tik_instance.Scalar(dtype=self.updatesdtype,
                                                 name="zero_var")
        self.zero_var.set_as(0)
        self.indices_ub = None
        self.updates_ub = None
        self.core_num = self._tik_get_core_num()
        self.ub_size = self._tik_get_ub_size()

        self.tiling_gm = self.tik_instance.Tensor(self.tiling_dtype, (32, ),
                                                  name="tiling_gm",
                                                  scope=tik.scope_gm)
        self.input_var = self.tik_instance.Tensor(self.updatesdtype,
                                                  (MAX_ZERO_DIM_VAR, ),
                                                  name="input_var",
                                                  scope=tik.scope_gm)
        self.input_indices = self.tik_instance.Tensor(self.indicesdtype,
                                                      (MAX_ZERO_DIM_INDICE, ),
                                                      name="input_indices",
                                                      scope=tik.scope_gm)
        self.input_updates = self.tik_instance.Tensor(self.updatesdtype,
                                                      (MAX_ZERO_DIM_INDICE, ),
                                                      name="input_updates",
                                                      scope=tik.scope_gm)
        self.output_var = self.tik_instance.Tensor(self.updatesdtype,
                                                   (MAX_ZERO_DIM_VAR, ),
                                                   name="output_var",
                                                   scope=tik.scope_gm)
Code Example #23
def relu6_grad(input_grad, input_x, output_y, kernel_name="relu6_grad"):
    """
    Parameters
    ----------
    input_grad : dict
        shape and dtype of input_grad
    input_x : dict
        shape and dtype of input_x
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is "relu6_grad"

    Returns
    ------
    None
    """
    # check input shape
    shape_x = input_x.get("shape")
    shape_grad = input_grad.get("shape")
    op_utils.check_shape(shape_x, param_name="input_x")
    op_utils.check_shape(shape_grad, param_name="input_grad")
    if list(shape_x) != list(shape_grad):
        raise RuntimeError("input_grad and input_x must have the same shape.")

    # check input tensor data_type and kernel_name
    check_list = ("float16", "float32")
    input_dtype = input_x.get("dtype").lower()
    grad_dtype = input_grad.get("dtype").lower()
    op_utils.check_dtype(input_dtype, check_list, param_name="input_x")
    op_utils.check_dtype(grad_dtype, check_list, param_name="input_grad")
    if input_dtype == "float32" and not tbe_platform.cce_conf.api_check_support(
            "te.lang.cce.vmuls", "float32"):
        raise RuntimeError(
            "Input dtype only support float16 while input dtype is float32")

    shape_x = [reduce_ins(lambda x, y: x * y, shape_x[:])]
    input_data_orginal = tvm.placeholder(shape_x,
                                         name="input_data",
                                         dtype=input_dtype)
    input_grad = tvm.placeholder(shape_x, name="input_grad", dtype=grad_dtype)

    final_res = relu6_grad_compute(input_grad,
                                   input_data_orginal,
                                   output_y,
                                   kernel_name="relu6_grad")
    with tvm.target.cce():
        auto_sch = generic.auto_schedule(final_res)

    config = {
        "name": kernel_name,
        "tensor_list": (input_grad, input_data_orginal, final_res)
    }

    te.lang.cce.cce_build_code(auto_sch, config)
Code Example #24
def acos_grad(y, dy, z, kernel_name="acos_grad"):
    """
    do element-wise acos_grad operation between two input tensors

    Parameters:
    ----------
    y : dict of y, include shape and dtype, dtype support float16, float32

    dy : dict of dy, include shape and dtype, dtype support float16, float32

    z : dict of z, include shape and dtype, dtype support float16, float32

    kernel_name : cce kernel name, default value is "acos_grad"
    -------
    """

    # get the shape and dtype for input_1,input_2
    shape_y = y.get("shape")
    shape_dy = dy.get("shape")
    dtype = y.get("dtype")
    dtype1 = dy.get("dtype")

    check_shape(shape_y, param_name="y")
    check_shape(shape_dy, param_name="dy")
    shape_y, _ = refine_shape_axes(shape_y, [])
    shape_dy, _ = refine_shape_axes(shape_dy, [])

    # raise runtimeerror if the input paras are invalid
    check_list = ("float16", "float32")
    check_dtype(dtype, check_list, param_name="y")
    check_dtype(dtype1, check_list, param_name="dy")
    dtype = dtype.lower()
    dtype1 = dtype1.lower()
    if not operator.eq(shape_y, shape_dy):
        raise RuntimeError(
            "acos_grad only supports inputs whose shapes are equal")
    if dtype != dtype1:
        raise RuntimeError(
            "acos_grad only supports inputs whose dtypes are equal")
    shape_y, _ = refine_shape_axes(shape_y, [])
    shape_dy, _ = refine_shape_axes(shape_dy, [])

    data_y = tvm.placeholder(shape_y, dtype=dtype, name="data1")
    data_dy = tvm.placeholder(shape_dy, dtype=dtype, name="data2")

    res = acos_grad_compute(data_y, data_dy, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (data_y, data_dy, res)}
    te.lang.cce.cce_build_code(sch, config)
Code Example #25
def apply_power_sign_d(var,
                       m,
                       lr,
                       logbase,
                       sign_decay,
                       beta,
                       grad,
                       var_out,
                       m_out,
                       kernel_name="apply_power_sign_d"):
    """
    Update '*var' according to the PowerSign update

    Parameters:
    ----------
    var: dict of Variable, only support float16, float32
    m : dict of input_grad, only support float16, float32
    lr : dict of lr, only support float16, float32
    logbase : dict of logbase, only support float16, float32
    sign_decay : dict of sign_decay, only support float16, float32
    grad : dict of grad, only support float16, float32
    beta : dict of beta, only support float16, float32
    var_out : dict of output, only support float16, float32
    m_out : dict of output, only support float16, float32
    kernel_name : cce kernel name, default value is apply_power_sign

    Algorithm :
    ----------
    m_t <- beta * m_{t-1} + (1 - beta) * grad
    update <- exp(logbase * sign_decay * sign(grad) * sign(m_t)) * grad
    variable <- variable - lr_t * update


    Returns
    ----------
    None
    """
    input_dict = (var, m, lr, logbase, sign_decay, beta, grad)

    check_list = ('float16', 'float32')
    dtype = var.get('dtype')
    check_dtype(dtype, check_list, param_name="var")
    dtype = dtype.lower()

    args = ApplyOpConfig.TensorArgs(input_dict, apply_power_sign_d_compute,
                                    [var_out, m_out],
                                    6 if dtype == 'float32' else 10)
    name = ApplyOpConfig.TensorName(all=('var', 'm', 'lr', 'logbase',
                                         'sign_decay', 'beta', 'grad'),
                                    scalar=('lr', 'logbase', 'sign_decay',
                                            'beta'),
                                    reuse=('m', 'var'))

    common_apply_op_process(ApplyOpConfig(args, name), kernel_name)
Code Example #26
def asin_grad(y, dy, z, kernel_name="asin_grad"):
    """
    do element-wise asin_grad operation between two input tensors

    Parameters:
    ----------
    y : dict of y, include shape and dtype, dtype support float16, float32

    dy : dict of dy, include shape and dtype, dtype support float16, float32

    z : dict of output

    kernel_name : cce kernel name, default value is "asin_grad"

    Returns
    -------
    None
    """

    # get the shape and dtype
    shape_y = y.get("shape")
    shape_dy = dy.get("shape")
    dtype_y = y.get("dtype")
    dtype_dy = dy.get("dtype")

    # kernel name check: should be unique

    # check whether the shape is right
    check_shape(shape_y, param_name="y")
    check_shape(shape_dy, param_name="dy")
    if not operator.eq(shape_y, shape_dy):
        raise RuntimeError("all input shape must be the same")
    shape_y, _ = refine_shape_axes(shape_y, [])
    shape_dy, _ = refine_shape_axes(shape_dy, [])

    # check whether dtypes are fp16,fp32 and whether they are the same
    check_list = ("float16", "float32")
    check_dtype(dtype_y, check_list, param_name="y")
    check_dtype(dtype_dy, check_list, param_name="dy")
    dtype_y = dtype_y.lower()
    if dtype_y != dtype_dy.lower():
        raise RuntimeError("all input dtype must be same")

    # get 2 input tensors: data_y, data_dy
    data_y = tvm.placeholder(shape_y, name="data_y", dtype=dtype_y)
    data_dy = tvm.placeholder(shape_y, name="data_dy", dtype=dtype_y)
    res = asin_grad_compute(data_y, data_dy, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_y, data_dy, res]}
    te.lang.cce.cce_build_code(sch, config)
Code Example #27
File: atan2.py Project: gekowa/ascend-opp
def atan2(x1, x2, y, kernel_name="atan2"):
    """
    Algorithm: arctan2
        arctan2(y, x) = arctan(y/x)
    ----------------------------------
    Parameters:

        x1: the dict of input data x1, only support float16, float32.

        x2: the dict of input data x2, only support float16, float32.

        y: the dict of output

        kernel_name: default value is "atan2".
    ----------------------------------
    Returns:
        None
    """

    y_shape = x1.get("shape")
    x_shape = x2.get("shape")

    y_dtype = x1.get("dtype")
    x_dtype = x2.get("dtype")

    check_shape(y_shape, param_name="x1")
    check_shape(x_shape, param_name="x2")

    shape_y, shape_x, shape_max = broadcast_shapes(
        y_shape, x_shape, param_name_input1="x1", param_name_input2="x2")

    check_list = ("float16", "float32")
    check_dtype(y_dtype, check_list, param_name="x1")
    check_dtype(x_dtype, check_list, param_name="x2")
    if y_dtype.lower() != x_dtype.lower():
        raise RuntimeError("The input tensor must have identical dtype!")
    shape_y, shape_x = refine_shapes_for_broadcast(shape_y, shape_x)
    input_y = tvm.placeholder(shape_y, dtype=y_dtype.lower(), name="input_y")
    input_x = tvm.placeholder(shape_x, dtype=x_dtype.lower(), name="input_x")

    res = atan2_compute(input_y, input_x, y, kernel_name)
    res = te.lang.cce.cast_to(res, x_dtype.lower())
    with tvm.target.cce():
        auto_sch = topi.generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": (input_y, input_x, res),
        "print_ir": False,
        "bool_storage_as_1bit": False
    }

    te.lang.cce.cce_build_code(auto_sch, config)
Code Example #28
    def check_input_params(self):
        """
        to the check whether the input parameters is valid or not
        """
        if self.input_dtype != self.output_dtype:
            error_manager_vector.raise_err_inputs_dtype_not_equal(
                "split_d", "self.input_dtype", "self.output_dtype",
                self.input_dtype, self.output_dtype)

        dtype_list = ("float16", "float32", "int32", "int8", "int16",
                      "int64", "uint8", "uint16", "uint32", "uint64")
        check_dtype(self.input_dtype, dtype_list, param_name="x")
Code Example #29
def depthwise_weight_6d_2_4d(x,
                             y,
                             src_format,
                             dst_format,
                             kernel_name="depthwise_weight_6d_2_4d"):
    """Operation and Schedule for depthwise_weight_6d_2_4d.

    Parameters
    ----------
    x: shape and dtype of input, the dtype support float16, float32,
    int32, uint16.

    y: the shape and dtype of outputs, the dtype same as input.

    src_format: the source data_format

    dst_format: the target data_format

    kernel_name : cce kernel name, default value is "depthwise_weight_6d_2_4d"

    Returns
    -------
        convert C1HWNCoC0 to HWCN
    """
    _check_parameters(x, y, src_format, dst_format)
    output_shape = y.get("shape")
    channel_size = output_shape[2]
    input_shape = x.get("shape")
    dtype = x.get("dtype")
    channel_4d = channel_size
    op_utils.check_shape(input_shape, param_name="x")

    check_list = ("float16", "float32", "int32", "uint16")
    dtype = dtype.lower()
    op_utils.check_dtype(dtype, check_list, param_name="x")

    input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype)

    six2four = _Six2FourParam(input_shape, channel_4d)

    res = tvm.extern(
        [six2four.get_out_shape()], [input_data],
        lambda ins, outs: _intrin_factor(six2four, dtype, ins, outs),
        name="res",
        dtype=dtype)

    sch = tvm.create_schedule(res.op)
    build_list = [input_data, res]

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)
Code Example #30
def depthwise_weight_4d_2_6d(x,
                             y,
                             src_format,
                             dst_format,
                             kernel_name="depthwise_weight_4d_2_6d"):
    """Operation and Schedule for depthwise_weight_4d_2_6d.

    Parameters
    ----------
    x: shape and dtype of input, the dtype support float16,
    float32, int32, uint16.

    y: the shape and dtype of outputs, the dtype same as input.

    src_format: the source data_format

    dst_format: the target data_format

    kernel_name : cce kernel name, default value is "depthwise_weight_4d_2_6d"

    Returns
    -------
        convert HWCN to C1HWNCoC0
    """
    if src_format.lower() != "hwcn":
        raise RuntimeError("dst_format must be HWCN!")

    if dst_format.lower() != "c1hwncoc0":
        raise RuntimeError("src_format must be C1HWNCoC0 !")

    input_shape = x.get("shape")
    dtype = x.get("dtype")
    op_utils.check_shape(input_shape, param_name="x")
    check_list = ("float16", "float32", "int32", "uint16")
    dtype = dtype.lower()
    op_utils.check_dtype(dtype, check_list, param_name="x")

    input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype)
    four2six = _Four2SixParam(input_shape)

    res = tvm.extern(
        [four2six.get_out_shape()], [input_data],
        lambda ins, outs: _intrin_factor(four2six, dtype, ins, outs),
        name="res",
        dtype=dtype)

    sch = tvm.create_schedule(res.op)
    build_list = [input_data, res]

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)