Code example #1
def load_to_l1(input_x, output_x, kernel_name="load_to_l1"):
    """
    copy data from ddr to l1

    Parameters
    ----------
    input_x : TVM tensor
        the input tensor
    output_x : dict
        dict of output_x, include keys(shape and dtype)
    kernel_name : str
        kernel name, default value is "load_to_l1"

    Returns
    -------
    None
    """
    input_shape = input_x.get("shape")
    input_dtype = input_x.get("dtype")

    input_tensor = tvm.placeholder(input_shape,
                                   name="input_tensor",
                                   dtype=input_dtype)

    res = load_to_l1_compute(input_tensor, output_x, kernel_name=kernel_name)
    sch = tvm.create_schedule([res.op])

    sch[res].set_scope(cce.scope_cbuf_fusion)
    sch[res].emit_insn(res.op.axis[0], 'dma_copy')

    tensor_list = [input_tensor, res]

    with build_config:
        tvm.build(sch, tensor_list, "cce", name=kernel_name)
Code example #2
def custom_equal(shape_x, shape_y, dtype, kernel_name="cce_tf_equal", need_build=False,
                 need_print=False):
    """
    do element-wise equal operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input x

    shape_y : shape of input y

    dtype : source data type, support float16, float32, int32, int8, uint8, bool

    kernel_name : cce kernel name, default value is "cce_tf_equal"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"]

    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_equal_cce only supports %s while dtype is %s" % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)

    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    x_tmp = te.lang.cce.broadcast(x, shape_max)
    y_tmp = te.lang.cce.broadcast(y, shape_max)

    res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i), name='res')

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [x, y, res], simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(sch, [x, y, res], "cce", name=kernel_name)
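
For reference, a minimal NumPy sketch of what the kernel above computes (broadcast both inputs to a common shape, then compare element-wise); the shapes here are made up for illustration:

import numpy as np

# hypothetical inputs; NumPy broadcasting plays the role of
# util.produce_shapes + te.lang.cce.broadcast to shape_max
x = np.zeros((2, 1, 3), dtype=np.float16)
y = np.ones((1, 4, 3), dtype=np.float16)

res = np.equal(x, y)          # broadcast to (2, 4, 3), then element-wise ==
print(res.shape, res.dtype)   # (2, 4, 3) bool
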
Code example #3
File: assign_sub.py, Project: gekowa/ascend-opp
def assign_sub(var, value, out, kernel_name='assign_sub'):
    """
    Update var by subtracting value from it.

    Parameters:
    ----------
    var : dict
        dict of input_var, include shape and dtype,
        dtype support int8, uint8, int32, float16, float32

    value : dict
        dict of input_value, include shape and dtype,
        dtype support int8, uint8, int32, float16, float32.
        Must have the same shape and dtype as input_var

    out : dict
        dict of out

    kernel_name : str
        cce kernel name, default value is "assign_sub"

    Returns
    -------
    None
    """

    # get the shape and dtype
    shape_var = var.get("shape")
    shape_value = value.get("shape")
    dtype_var = var.get("dtype")
    dtype_value = value.get("dtype")

    # kernel name check: should be unique

    # check whether the shape is right
    check_shape(shape_var, param_name="var")
    check_shape(shape_value, param_name="value")
    if not operator.eq(shape_var, shape_value):
        raise RuntimeError("all input shapes must be equal")

    # check whether dtypes are fp16, fp32, int8, uint8, int32
    # and whether they are the same
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_var, check_list, param_name="var")
    check_dtype(dtype_value, check_list, param_name="value")
    dtype_var = dtype_var.lower()
    dtype_value = dtype_value.lower()
    if dtype_var != dtype_value:
        raise RuntimeError("all input dtypes must be the same")

    shape, _ = refine_shape_axes(shape_var, [])
    data_var = tvm.placeholder(shape, dtype=dtype_var, name='data_var')
    data_value = tvm.placeholder(shape, dtype=dtype_value, name='data_value')
    sch, res = _assign_sub_compute(data_var, data_value, out, kernel_name)

    with set_bool_storage_config():
        tvm.build(sch, [data_var, data_value, res], "cce", name=kernel_name)
Code example #4
def custom_logical_not(shape,
                       dtype,
                       kernel_name="cce_tf_logical_not",
                       need_build=False,
                       need_print=False):
    """
    logical not for the input tensor

    Parameters
    ----------
    shape : input shape of data

    dtype : the data type, support bool

    kernel_name : cce kernel name, default value is "cce_tf_logical_not"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["bool"]
    if dtype.lower() not in check_list:
        raise RuntimeError(
            "logical_not_cce only supports %s while dtype is %s" %
            (",".join(check_list), dtype))

    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)

    with tvm.target.cce():

        result = tvm.compute(
            shape,
            # "data[i] is True" is a Python identity test and is always False
            # for a TVM expression, so compare explicitly instead.
            lambda *i: tvm.select(data(*i) == True, False, True),
            name="result")

        schedule = tvm.create_schedule(result.op)

        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, result], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, result], "cce", name=kernel_name)
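
As a plain reference for the intended semantics (element-wise logical NOT over a bool tensor), independent of the CCE build flow:

import numpy as np

data = np.array([[True, False], [False, True]])
print(np.logical_not(data))   # [[False  True]
                              #  [ True False]]
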
Code example #5
def bn_training_reduce(x, sum, square_sum, kernel_name="bn_training_reduce"):
    """
    algorithm: part of fused_batch_norm_v2
    The first step of batch_norm, which calculates the sum and
    square sum of x. The major component of this operator is the
    reduce operation.

    Parameters
    ----------
    x: dict
        dict of input, A 5HD Tensor for input data.
    sum: dict
        dict of sum, A `Tensor`. Sum of x.
    square_sum: dict
        dict of square_sum, A `Tensor`. Square sum of x.
    kernel_name: str
        kernel name, default value is "bn_training_reduce"

    Returns
    -------
    None
    """

    shape_x = x.get("shape")
    dtype_x = x.get("dtype")

    check_shape(shape_x, param_name="x")
    check_dtype(dtype_x.lower(), ("float16", "float32"), param_name="x")

    data_format = x.get("format")
    origin_format = x.get("ori_format")
    _check_format(data_format, origin_format)

    x_input = tvm.placeholder(shape_x, name="x_input", dtype=dtype_x.lower())

    res = bn_training_reduce_compute(x_input,
                                     sum,
                                     square_sum,
                                     kernel_name=kernel_name)
    if data_format == "NC1HWC0":
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
    else:
        sch, tensor_list = bn_training_reduce_schedule_nd(res)

        with build_config:
            tvm.build(sch, tensor_list, "cce", name=kernel_name)
        return
    tensor_list = [x_input] + list(res)

    config = {"name": kernel_name, "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
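
The reduction itself happens in bn_training_reduce_compute, which is not shown here; for an NC1HWC0 input it is assumed below to reduce over the N, H and W axes. A NumPy sketch under that assumption:

import numpy as np

x = np.random.rand(2, 1, 4, 4, 16).astype(np.float32)   # N, C1, H, W, C0

# assumed reduction axes for the 5HD layout: N, H, W
sum_x = x.sum(axis=(0, 2, 3))
square_sum_x = (x * x).sum(axis=(0, 2, 3))
print(sum_x.shape, square_sum_x.shape)   # (1, 16) (1, 16)
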
Code example #6
def depthwise_weight_6d_2_4d(x,
                             y,
                             src_format,
                             dst_format,
                             kernel_name="depthwise_weight_6d_2_4d"):
    """Operation and Schedule for depthwise_weight_6d_2_4d.

    Parameters
    ----------
    x: shape and dtype of input, the dtype support float16, float32,
    int32, uint16.

    y: the shape and dtype of outputs, the dtype same as input.

    src_format: the source data_format

    dst_format: the target data_format

    kernel_name : cce kernel name, default value is "depthwise_weight_6d_2_4d"

    Returns
    -------
        convert C1HWNCoC0 to HWCN
    """
    _check_parameters(x, y, src_format, dst_format)
    output_shape = y.get("shape")
    channel_size = output_shape[2]
    input_shape = x.get("shape")
    dtype = x.get("dtype")
    channel_4d = channel_size
    op_utils.check_shape(input_shape, param_name="x")

    check_list = ("float16", "float32", "int32", "uint16")
    dtype = dtype.lower()
    op_utils.check_dtype(dtype, check_list, param_name="x")

    input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype)

    six2four = _Six2FourParam(input_shape, channel_4d)

    res = tvm.extern(
        [six2four.get_out_shape()], [input_data],
        lambda ins, outs: _intrin_factor(six2four, dtype, ins, outs),
        name="res",
        dtype=dtype)

    sch = tvm.create_schedule(res.op)
    build_list = [input_data, res]

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)
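
The relationship between the two filter layouts can be sketched in plain Python; C0 = Co = 16 is assumed here (the real tiling constant depends on dtype and platform), and the shapes are illustrative only:

h, w, c, n = 3, 3, 37, 8                 # hypothetical HWCN filter shape
c0 = 16                                  # assumed cube tile size
c1 = (c + c0 - 1) // c0                  # ceil(C / C0)

shape_c1hwncoc0 = (c1, h, w, n, c0, c0)  # C1, H, W, N, Co, C0
shape_hwcn = (h, w, c, n)
print(shape_c1hwncoc0, shape_hwcn)       # (3, 3, 3, 8, 16, 16) (3, 3, 37, 8)
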
Code example #7
def depthwise_weight_4d_2_6d(x,
                             y,
                             src_format,
                             dst_format,
                             kernel_name="depthwise_weight_4d_2_6d"):
    """Operation and Schedule for depthwise_weight_4d_2_6d.

    Parameters
    ----------
    x: shape and dtype of input, the dtype support float16,
    float32, int32, uint16.

    y: the shape and dtype of outputs, the dtype same as input.

    src_format: the source data_format

    dst_format: the target data_format

    kernel_name : cce kernel name, default value is "depthwise_weight_4d_2_6d"

    Returns
    -------
        convert HWCN to C1HWNCoC0
    """
    if src_format.lower() != "hwcn":
        raise RuntimeError("src_format must be HWCN!")

    if dst_format.lower() != "c1hwncoc0":
        raise RuntimeError("dst_format must be C1HWNCoC0!")

    input_shape = x.get("shape")
    dtype = x.get("dtype")
    op_utils.check_shape(input_shape, param_name="x")
    check_list = ("float16", "float32", "int32", "uint16")
    dtype = dtype.lower()
    op_utils.check_dtype(dtype, check_list, param_name="x")

    input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype)
    four2six = _Four2SixParam(input_shape)

    res = tvm.extern(
        [four2six.get_out_shape()], [input_data],
        lambda ins, outs: _intrin_factor(four2six, dtype, ins, outs),
        name="res",
        dtype=dtype)

    sch = tvm.create_schedule(res.op)
    build_list = [input_data, res]

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)
Code example #8
File: zn_2_hwcn.py, Project: gekowa/ascend-opp
def zn_2_hwcn(src, dst, src_format, dst_format, kernel_name='zn_2_hwcn'):
    """
    algorithm: zn_2_hwcn
    calculating: change data format from Zn to HWCN

    Parameters
    ----------
    src: dict
        contains shape and dtype information of input tensor
    dst: dict
        contains shape and dtype information of output tensor
    src_format: str
        represents the format of input tensor, only support "Zn"
    dst_format: str
        represents the format of output tensor, only support "HWCN"
    kernel_name: str
        cce kernel name, default value is "zn_2_hwcn"

    Returns
    -------
    None
    """
    _check_parameters(src, dst, src_format, dst_format, kernel_name)
    dst_shape = dst.get("shape")
    dtype = src.get("dtype")

    h_i, w_i, c_i, n_i = dst_shape
    c_0 = 16
    if dtype == "int8":
        c_0 = 32
    c_1 = _ceil_div(c_i, c_0)
    n_ni = 16
    n_no = _ceil_div(n_i, n_ni)
    shape_zn = [c_1*h_i*w_i, n_no, n_ni, c_0]

    branch = _get_ir_branch(shape_zn, dtype)
    data = tvm.placeholder(shape_zn, dtype=dtype, name="data")
    if branch == "more_row":
        res = tvm.extern(dst_shape, [data],
                         lambda ins, outs: _more_row_ir(outs[0], ins[0], c_0),
                         name="res", dtype=dtype)
    else:
        res = tvm.extern(dst_shape, [data],
                         lambda ins, outs: _split_row_ir(outs[0], ins[0]),
                         name="res", dtype=dtype)

    tensor_list = [data, res]
    sch = tvm.create_schedule(res.op)
    with build_config:
        tvm.build(sch, tensor_list, "cce", name=kernel_name)
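
A standalone recap of the Zn-shape arithmetic used above (this mirrors the body of zn_2_hwcn, with _ceil_div written out inline; the sample shape is made up):

def hwcn_to_zn_shape(dst_shape, dtype):
    h_i, w_i, c_i, n_i = dst_shape
    c_0 = 32 if dtype == "int8" else 16
    c_1 = (c_i + c_0 - 1) // c_0          # _ceil_div(c_i, c_0)
    n_no = (n_i + 15) // 16               # _ceil_div(n_i, n_ni) with n_ni = 16
    return [c_1 * h_i * w_i, n_no, 16, c_0]

print(hwcn_to_zn_shape([3, 3, 37, 8], "float16"))   # [27, 1, 16, 16]
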
Code example #9
File: upsample.py, Project: gekowa/ascend-opp
def upsample(x, y, scale=1.0, stride_h=2, stride_w=2, kernel_name="upsample"):
    """
    calculating data

    Parameters
    ----------
    x : dict
        include shape, dtype and format
    y : dict
        dict of output
    scale : float
        the value of tensor change axis, default value is 1.0
    stride_h : int
        the shape change axis h
    stride_w : int
        the shape change axis w
    kernel_name : str
        kernel name, default value is "upsample"

    Returns
    -------
    None
    """
    upsample_check(x, stride_h, stride_w, kernel_name)
    dtype = x.get("dtype")
    op_list, ins_list, tensor_dic, feature, y \
        = gen_upsample(x, dtype, scale, stride_h, stride_w)
    schedule = tvm.create_schedule(y.op)
    # skip the res buffer
    buffer_mapping(schedule, op_list[:-1])
    tilling_spilt_axis_dic \
        = tilling_spilt_axis(schedule, tensor_dic, stride_h, stride_w)
    cal_axis_dic, axis \
        = cal_axis_spilt(x, stride_h, stride_w,
                         tilling_spilt_axis_dic, tensor_dic, schedule)

    axis_list = upsample_compute(schedule, cal_axis_dic, tensor_dic)
    res_op = tensor_dic.get("res")
    ins_emit(schedule, op_list, axis_list, ins_list)
    if axis == 0:
        schedule[y].bind(cal_axis_dic.get("axis_xo"),
                         tvm.thread_axis("blockIdx.x"))
    else:
        res_out, _ = bind_multcore(axis, x, schedule, res_op)
        schedule[y].bind(res_out, tvm.thread_axis("blockIdx.x"))

    with build_config:
        tvm.build(schedule, [feature, y], "cce", name=kernel_name)
Code example #10
File: store_to_gm.py, Project: gekowa/ascend-opp
def store_to_gm(input_x, output_x, kernel_name="store_to_gm"):
    """
    copy data from l1 to ddr (l1 --> ub --> ddr)

    Parameters
    ----------
    input_x : TVM tensor
        the input tensor
    output_x : dict
        dict of output_x, include keys(shape and dtype)
    kernel_name : str
        kernel name, default value is "store_to_gm"

    Returns
    -------
    None
    """
    input_shape = input_x.get("shape")
    input_dtype = input_x.get("dtype")

    input_tensor = tvm.placeholder(input_shape,
                                   name="input_tensor",
                                   dtype=input_dtype)

    res, res_ub = store_to_gm_compute(input_tensor,
                                      output_x,
                                      kernel_name=kernel_name)
    sch = tvm.create_schedule([res.op])

    split_axis, split_factor = _tilling_axis(input_shape, input_dtype)
    axis_outer, axis_inner = sch[res].split(res.op.axis[split_axis],
                                            factor=split_factor)

    sch[res_ub].compute_at(sch[res], axis_outer)
    sch[input_tensor].set_scope(cce.scope_cbuf_fusion)
    sch[res_ub].set_scope(cce.scope_ubuf)

    sch[res_ub].emit_insn(res_ub.op.axis[split_axis], 'dma_copy')
    sch[res].emit_insn(axis_inner, 'dma_copy')

    tensor_list = [input_tensor, res]

    with build_config:
        tvm.build(sch, tensor_list, "cce", name=kernel_name)
Code example #11
File: flatten.py, Project: gekowa/ascend-opp
def flatten(x, y, kernel_name="flatten"):
    """return a copy of the tensor collapsed into one dimension.

    Parameters
    ----------
    x : dict
        shape and dtype of input.
    y : dict
        shape and dtype of output.
    kernel_name : str
        kernel name, default value is "flatten"

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")
    dtype_lower = dtype.lower()
    check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                  "uint32", "uint64", "float16", "float32")

    check_shape(shape, param_name="x")
    check_dtype(dtype_lower, check_list, param_name="x")

    size = 1
    for i, _ in enumerate(shape):
        size = size * shape[i]

    shape_new = [size]
    data = tvm.placeholder(shape_new, name="data", dtype=dtype_lower)
    data_ub = tvm.compute(shape_new, lambda *i: data(*i), name='data_ub')
    res = tvm.compute(shape_new, lambda *i: data_ub(*i), name='res')

    sch = tvm.create_schedule(res.op)
    sch[data_ub].set_scope(tbe_platform.scope_ubuf)

    sch_new = _tile_axis([sch, data_ub, res], shape_new, dtype_lower)

    with build_config:
        tvm.build(sch_new, [data, res], "cce", name=kernel_name)
Code example #12
    def build_cce(self):
        """
        Build cce
        """
        self._compute()
        tiling_cases = self._calc_tiling_case()
        for case in tiling_cases:
            tvm_vars = self.dim_info_vars.copy()
            right_dim_in = CompileVar("right_dim_in", self.right_range)
            tvm_vars.append(right_dim_in)
            split_factor = CompileVar("split_factor",
                                      case.get("ub_factor_bound"))
            tvm_vars.append(split_factor)

            var_list = [var.get_tvm_var() for var in tvm_vars]
            sch, tensor_list = self._unpack_schedule(
                case.get("block_tiling_axis"), right_dim_in.get_tvm_var(),
                case.get("ub_tiling_axis"), split_factor.get_tvm_var())

            # set var bound
            for var in tvm_vars:
                sch.set_var_range(var.get_tvm_var(), *(var.get_bound()))

            self.sch_list.append(sch)
            self.arg_list.append(var_list + tensor_list)

            self.rules.append(case.get("key"))
            self.compile_vars[case.get("key")] = \
                [var.get_name() for var in tvm_vars]

        build_config_items = {"parse_ddr_args": True,
                              "build_fatbin": True}
        build_config = cce_build.build_config_update_list(
            cce_build.dynamic_build_config,
            build_config_items)

        with build_config:
            tvm.build(self.sch_list, self.arg_list, rules=self.rules,
                      target="cce", name=self.kernel_name)
Code example #13
def batchnorm_fold2_grad_reduce(dout,
                                x,
                                dout_reduce,
                                dout_x_reduce,
                                kernel_name="batchnorm_fold2_grad_reduce"):
    """_BatchNormFold2GradReduce op"""
    shape = x.get("shape")
    x_format = x.get("format")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    check_list = ["float16", "float32"]
    inp_dtype = x.get("dtype").lower()
    if inp_dtype not in check_list:
        raise RuntimeError("Dtype of input only supports float16, float32")
    dout_t = tvm.placeholder(shape, name="dout", dtype=inp_dtype)
    x_t = tvm.placeholder(shape, name="x", dtype=inp_dtype)

    res_list = batchnorm_fold2_grad_reduce_compute(dout_t, x_t, dout,
                                                   kernel_name)

    if x_format == "NC1HWC0":
        with tvm.target.cce():
            sch = generic.auto_schedule(res_list)
        tensor_list = [dout_t, x_t] + list(res_list)
        config = {
            "print_ir": False,
            "name": kernel_name,
            "tensor_list": tensor_list
        }

        te.lang.cce.cce_build_code(sch, config)
        return
    from impl.bn_training_reduce import bn_training_reduce_schedule_nd
    sch, tensor_list = bn_training_reduce_schedule_nd(res_list)
    with build_config:
        tvm.build(sch, tensor_list, "cce", name=kernel_name)
Code example #14
def space_to_batch_d(x,
                     y,
                     block_size,
                     paddings,
                     kernel_name="space_to_batch_d"):
    """
    the main function of space_to_batch_d

    Parameters
    ----------
    x: dict, shape and datatype, datatype supports float16, float32
    y: dict, shape and datatype, datatype supports float16, float32
    block_size: must be greater than one. It indicates the block size
    paddings: (tuple, list), the padding of the input with zeros across the
              spatial dimensions as follows:
              paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
    kernel_name: cce kernel name, default value is "space_to_batch_d"

    Returns
    -------
    None
    """
    if len(paddings) == 4:
        paddings = [[paddings[0], paddings[1]], [paddings[2], paddings[3]]]

    _check_param(x, y, paddings, block_size, kernel_name)

    input_shape = x.get("shape")
    input_dtype = x.get("dtype").lower()
    block_shape = [block_size, block_size]

    data = tvm.placeholder(input_shape, name="data", dtype=input_dtype)
    res = space_to_batch_nd_d_compute(data, y, block_shape, paddings,
                                      kernel_name)
    sch = tvm.create_schedule(res.op)
    with build_config:
        tvm.build(sch, [data, res], "cce", name=kernel_name)
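
The flat 4-element paddings list is regrouped into [[pad_top, pad_bottom], [pad_left, pad_right]] before the checks. For orientation only, the usual space-to-batch output shape in NHWC terms (ignoring the 5HD packing used on device) would be:

block_size = 2
paddings = [1, 0, 0, 0]                   # pad_top, pad_bottom, pad_left, pad_right
paddings = [[paddings[0], paddings[1]], [paddings[2], paddings[3]]]

n, h, w, c = 2, 3, 4, 16                  # hypothetical NHWC input shape
h_pad = h + paddings[0][0] + paddings[0][1]
w_pad = w + paddings[1][0] + paddings[1][1]
print((n * block_size * block_size,
       h_pad // block_size, w_pad // block_size, c))   # (8, 2, 2, 16)
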
Code example #15
def custom_round(shape,
                 dtype,
                 kernel_name="cce_round",
                 need_build=False,
                 need_print=False):
    """
    doing round operations, calculating data type is float16 or float32 or int32
    
    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype

    kernel_name : cce kernel name, default value is "cce_round"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
        
    """
    check_list = ["float16", "float32", "int32"]
    device_api_map = {
        "float16": "cc_device_round_float16",
        "float32": "cc_device_round_float",
        "int32": "cc_device_round_int32"
    }

    max_dim = 8
    shape_len = len(shape)
    if shape_len > max_dim:
        raise RuntimeError(
            "round_cce only support up to %d dimensions while the shape's dimension is %d"
            % (max_dim, shape_len))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if dtype.lower() not in check_list:
        raise RuntimeError("round_cce only supports %s while dtype is %s" %
                           (",".join(check_list), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    device_api = device_api_map[inp_dtype]

    block_num = "block_num"
    block_idx = "block_idx"
    v_ndim = tvm.const(len(shape), "int32")
    padC0 = tvm.const(0, "int32")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape,
        [data_input, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_ndim,
            ins[1].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
Code example #16
File: drop_out_do_mask.py, Project: gekowa/ascend-opp
def drop_out_do_mask(input_tensor,
                     input_mask,
                     input_keep_prob,
                     output,
                     kernel_name="dropout_do_mask"):
    """
    algorithm: tf_dropout_do_mask
    scale_x = x*(1 / keep_prob)
    res = select(mask == 1, scale_x, 0)

    Parameters
    ----------
    input_tensor : dict, shape and dtype of input_tensor, only support float16 and float32
    input_mask : dict, shape and dtype of input_mask
        shape of mask, 1D, dtype == uint8
        length=(size(shape_tensor)+ELEMS_BATCH_PROCESS_FP16
        -1)/ELEMS_BATCH_PROCESS_FP16*ELEMS_BATCH_PROCESS_FP16/8
        eg. shape_tensor=[2,5,8] shape_mask=[16] shape_res=[2,5,8]
        shape_tensor=[15,17,19] shape_mask=[608] shape_res=[15,17,19]
    input_keep_prob : dict, shape and dtype of input_keep_prob
        shape of keep_prob, only 1 parameter and equals to (1)
        prob scale (0.0,1.0] NOTICE: type same as dtype
    output : dict, shape and dtype of output
    kernel_name : str
        cce kernel name, default value is "dropout_do_mask"

    Returns
    -------
    None
    """
    shape_tensor = input_tensor.get("shape")
    shape_mask = input_mask.get("shape")
    shape_keep_prob = input_keep_prob.get("shape")
    dtype = input_tensor.get("dtype")
    if shape_keep_prob == 1:
        shape_keep_prob = (shape_keep_prob, )
    check_shape(shape_tensor, param_name="input_tensor")
    check_dtype(dtype.lower(), ["float16", "float32"],
                param_name="input_tensor")
    if len(shape_mask) != 1:
        raise RuntimeError("The length of mask shape must be 1")
    if shape_keep_prob not in [(1,), [1]]:
        raise RuntimeError("Only support shape (1, ) or [1, ]")
    # functools_reduce: product of all dimension
    # Align to ELEMS_BATCH_PROCESS_FP16
    product_mask = (functools_reduce(lambda x, y: x*y, shape_tensor[:]) +
                    ELEMS_BATCH_PROCESS_FP16 - 1) // \
                   ELEMS_BATCH_PROCESS_FP16 * ELEMS_BATCH_PROCESS_FP16 // 8
    if product_mask != shape_mask[0]:
        raise RuntimeError("The mask shape[0] should be %d, but is %d" %
                           (product_mask, shape_mask[0]))
    data_tensor = tvm.placeholder(
        (functools_reduce(lambda x, y: x * y, shape_tensor), ),
        dtype=dtype,
        name="data_tensor")
    data_mask = tvm.placeholder(
        (functools_reduce(lambda x, y: x * y, shape_mask), ),
        dtype='uint8',
        name="data_mask")
    keep_prob_tensor = tvm.placeholder(shape_keep_prob,
                                       dtype=dtype,
                                       name="keep_prob_tensor")
    const_1 = tvm.const(1.0, dtype=dtype)

    res = tvm.extern([shape_tensor, shape_mask, shape_keep_prob],
                     [data_tensor, data_mask, keep_prob_tensor],
                     lambda ins, outs: _kernel_ir(outs, ins, const_1),
                     name="res",
                     dtype=dtype)

    tensor_list = [data_tensor, data_mask, keep_prob_tensor, res]
    schedule = tvm.create_schedule(res.op)

    with build_config:
        tvm.build(schedule, tensor_list, "cce", name=kernel_name)
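
The mask-length rule from the docstring rounds the tensor element count up to a multiple of ELEMS_BATCH_PROCESS_FP16 and divides by 8; the value 128 used below is inferred from the docstring examples, not taken from the module itself:

ELEMS_BATCH_PROCESS_FP16 = 128            # assumed, consistent with the examples

def mask_length(shape_tensor):
    size = 1
    for dim in shape_tensor:
        size *= dim
    rounded = (size + ELEMS_BATCH_PROCESS_FP16 - 1) // ELEMS_BATCH_PROCESS_FP16 \
        * ELEMS_BATCH_PROCESS_FP16
    return rounded // 8

print(mask_length([2, 5, 8]))     # 16
print(mask_length([15, 17, 19]))  # 608
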
Code example #17
def custom_batch_matmul(shape_x,
                        shape_y,
                        dtype,
                        trans_a=False,
                        trans_b=False,
                        kernel_name="cce_tf_batch_matmul",
                        need_build=False,
                        need_print=False):
    """
    Multiplies slices of two tensors in batches(each slice can be viewed
    as an element of a batch), the output is of the same batch size.

    Each of the individual slices can optionally be transposed before
    multiplication by setting the trans_a or trans_b flag to True, which
    are by default False. The input tensors are 2-D or higher with the
    shape [..., r_x, c_x] and [..., r_y, c_y]. The output tensor is 2-D
    or higher with the shape [..., r_o, c_o], where
    r_o = c_x if trans_a else r_x
    c_o = r_y if trans_b else c_y

    Parameters
    ----------
    shape_x : shape of the first tensor x with rank > 1

    shape_y : shape of the second tensor y, with the same type as x

    dtype : the data type, support int8, uint8, float16, float32, int32

    kernel_name : cce kernel name, default value is "cce_tf_batch_matmul"

    trans_a : if True, shape_x is transposed before multiplication

    trans_b : if True, shape_y is transposed before multiplication

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    data_dtype = dtype.lower()
    check_list = ["int8", "uint8", "float16", "float32", "int32"]
    if data_dtype not in check_list:
        raise RuntimeError(
            "batch_matmul_cce only supports %s while dtype is %s" %
            (",".join(check_list), dtype))

    def transpose_tensor(shape, size):
        """Transpose the shape, e.g., the shape [..., r_x, c_x] is transposed
        to [..., c_x, r_x].

        Parameters
        ----------
        shape : shape of a tensor

        size : length of the shape

        Returns
        -------
        shape_ori : the transposed shape
        """
        shape_ori = ()
        if size == 1:
            shape_ori = shape_ori + shape
        elif size == 2:
            shape_ori = shape_ori + (shape[1], ) + (shape[0], )
        else:
            shape_ori = shape_ori + (shape[:(size - 2)]) + (
                shape[size - 1], ) + (shape[size - 2], )
        return shape_ori

    def check_matmul(shape_x, shape_y):
        """Check whether batch_matmul is supported or not.

        Parameters
        ----------
        shape_x : shape of the first tensor x

        shape_y : shape of the second tensor y with the same type and shape
        with x

        Returns
        -------
        None
        """
        len_x = len(shape_x)
        len_y = len(shape_y)
        if (len_x < 2) or (len_y < 2):
            raise RuntimeError("Only tensors of rank>=2 are supported!")
        if shape_x[len_x - 1] != shape_y[len_y - 2]:
            raise RuntimeError(
                "Invalid matrix multiplication for the inner 2 dimensions!")
        if (len_x == len_y) and (len_x > 2):
            for i in range(len_x - 2):
                if shape_x[i] != shape_y[i]:
                    raise RuntimeError("Outer dimensions do not match!")
            return
        elif (len_x == len_y) and (len_x == 2):
            return
        else:
            raise RuntimeError("The input tensors are not with the same rank!")

    def _compute(output_shape, x, y, K, trans_a, trans_b, *indices):
        """matmul compuation in terms of the output shape and the transposes

        Parameters
        ----------
        output_shape : the final output shape, e.g., shape_x = (2, 6),
            shape_y = (8, 2), trans_a = True, trans_b = True, then,
            output_shape = (6, 8).

        x : the first input tensor according to shape_x.

        y : the second input tensor according to shape_y.

        K : the number of the axis for sum, in the above example, K = 2.

        trans_a : if True, x needs to be transposed.

        trans_b : if True, y needs to be transposed.

        *indices : the output shape space for tvm.compute.

        Returns
        -------
        tvm.Tensor
        """
        n_len = len(output_shape)
        k = tvm.reduce_axis((0, K), 'k')
        if trans_a is True and trans_b is False:
            # For example, A: (6, 7, 8), B: (6, 7, 9), so the length is n = 3
            # C = A' * B : (6, 8, 9), A' means the transpose of A
            # indices means the space of (6, 8, 9), k = 7
            # x_indices = indices[:1]+(7, )+indices[1:2] = (6, 7, 8)
            # y_indices = indices[:1]+(7, )+indices[2:] = (6, 7, 9)
            x_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 2):
                                                                (n_len - 1)]
            y_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 1):]
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
        elif not trans_a and trans_b:
            # For example, A: (6, 7, 8), B: (6, 9, 8), C = A * B' : (6, 7, 9)
            # indices means the space of (6, 7, 9), n=3, k = 8
            # x_indices = indices[:2]+(8, ) = (6, 7, 8)
            # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 9, 8)
            x_indices = indices[:(n_len - 1)] + (k, )
            y_indices = indices[:(n_len - 2)] + indices[(n_len - 1):] + (k, )
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
        elif trans_a and trans_b:
            # For example, A: (6, 8, 10), B: (6, 12, 8), C = A' * B' : \
            # (6, 10, 12)
            # indices means the space of (6, 10, 12), n=3, k = 8
            # x_indices = indices[:1]+(8, )+indices[1:2] = (6, 8, 10)
            # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 12, 8)
            x_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 2):
                                                                (n_len - 1)]
            y_indices = indices[:(n_len - 2)] + indices[(n_len - 1):] + (k, )
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
        else:
            # For example, A: (6, 15, 16), B: (6, 16, 18), C = A * B : \
            # (6, 15, 18)
            # indices means the space of (6, 15, 18), n=3, k = 16
            # x_indices = indices[:2]+(16, ) = (6, 15, 16)
            # y_indices = indices[:1]+(16, )+indices[2:] = (6, 16, 18)
            x_indices = indices[:(n_len - 1)] + (k, )
            y_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 1):]
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)

    def check_supportted_shape_size(shape_x, shape_y, limit, trans_a, trans_b):
        """
        check shape size for the operator

        Parameters
        ----------
        shape_x, shape_y : shapes of the input tensors

        limit : limit of the product of the combined shape

        trans_a, trans_b : transpose flags for the inner two dimensions

        Returns
        -------
        None
        """
        # This function is used to check whether the shape is too large to \
        # cause a timeout.
        # shape_x = (a,b,c,d,e,k)  shape_y = (a,b,c,d,k,f)
        # t_1 : time consumed by each addition operation
        # t_2 : time consumed by each multiplication operation
        # t_all : time consumed by a complete calculation
        # t_all is approximately equal to (a*b*c*d)*(e*k*f)*(t_1+t_2)
        # As (t_1 + t_2) is a constant, so t_all is proportional to \
        # (a * b * c * d * e * k * f)

        len_x = len(shape_x)
        len_y = len(shape_y)
        if (len_x < 2) or (len_y < 2):
            raise RuntimeError("Only tensors of rank>=2 are supported!")

        shape_x = list(shape_x)
        shape_y = list(shape_y)

        tmp_shape_x = shape_x[:]
        if trans_a:
            tmp_shape_x = shape_x[:-2] + [shape_x[-1], shape_x[-2]]

        tmp_shape_y = shape_y[:]
        if trans_b:
            tmp_shape_y = shape_y[:-2] + [shape_y[-1], shape_y[-2]]

        union_shape = tmp_shape_x + [tmp_shape_y[-1]]

        union_size = reduce(lambda i, j: i * j, union_shape)

        if union_size > limit:
            raise RuntimeError("the shape is too large to calculate")

    if data_dtype in ["float16", "float32", "int32"]:
        type_shape_map = {
            'float16': SHAPE_SIZE_FP16_LIMIT,
            'float32': SHAPE_SIZE_FP32_LIMIT,
            'int32': SHAPE_SIZE_INT32_LIMIT
        }

        check_supportted_shape_size(shape_x, shape_y,
                                    type_shape_map[data_dtype], trans_a,
                                    trans_b)

    x_size = len(shape_x)
    y_size = len(shape_y)
    shape_a = shape_x
    shape_b = shape_y
    if trans_a is True:
        shape_x = transpose_tensor(shape_x, x_size)

    if trans_b is True:
        shape_y = transpose_tensor(shape_y, y_size)

    check_matmul(shape_x, shape_y)
    last_axis = shape_x[x_size - 1]

    x_temp = tvm.placeholder(shape_a, name="input_1", dtype=data_dtype)
    y_temp = tvm.placeholder(shape_b, name="input_2", dtype=data_dtype)

    # output shape
    output_shape = ()
    for i in range(x_size - 1):
        output_shape = output_shape + (shape_x[i], )
    output_shape = output_shape + (shape_y[x_size - 1], )
    result = tvm.compute(
        output_shape,
        lambda *indices: _compute(output_shape, x_temp, y_temp, last_axis,
                                  trans_a, trans_b, *indices),
        name="result")
    schedule = tvm.create_schedule(result.op)

    if need_print:
        with build_config:
            print(
                tvm.lower(schedule, [x_temp, y_temp, result],
                          simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [x_temp, y_temp, result],
                      "cce",
                      name=kernel_name)
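
A NumPy reference for the batched semantics spelled out in the docstring: transpose the last two axes of an operand when its flag is set, then multiply slice by slice. The shapes below are arbitrary:

import numpy as np

def batch_matmul_ref(x, y, trans_a=False, trans_b=False):
    if trans_a:
        x = np.swapaxes(x, -1, -2)
    if trans_b:
        y = np.swapaxes(y, -1, -2)
    return np.matmul(x, y)

a = np.random.rand(6, 8, 7).astype(np.float32)
b = np.random.rand(6, 9, 7).astype(np.float32)
print(batch_matmul_ref(a, b, trans_b=True).shape)   # (6, 8, 9)
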
Code example #18
File: custom_pow.py, Project: huaweiatlas-test/AtkOps
def custom_pow(shape,
               shape_y,
               dtype,
               kernel_name="cce_tf_pow",
               need_build=False,
               need_print=False):
    """
    calculate x^y, calculating data type is float16, float32 or int32
    when x < 0, the output is a meaningless value.

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support
    float16, float32, int32

    kernel_name : cce kernel name, default value is "cce_tf_pow"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    supported_dtypes = ["float16", "float32", "int32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not dtype.lower() in supported_dtypes:
        raise RuntimeError("tf_pow_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_lhs = tvm.placeholder(shape, name="data_lhs", dtype=inp_dtype)
    data_rhs = tvm.placeholder(shape, name="data_rhs", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([0], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([0], inp_dtype, "p_power")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape,
        [data_lhs, data_rhs, p_scale, p_shift, p_power, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[2].access_ptr("r"),  # scale
            ins[3].access_ptr("r"),  # shift
            ins[4].access_ptr("r"),  # power
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            v_ndim,
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[1].access_ptr("r"),  # input y
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(
                tvm.lower(schedule, [data_lhs, data_rhs, output],
                          simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_lhs, data_rhs, output],
                      "cce",
                      name=kernel_name)
Code example #19
def lars_v2_update(weight,
                   grad,
                   weight_s,
                   grad_s,
                   weight_decay,
                   learning_rate,
                   out,
                   hyperparam=0.001,
                   epsilon=1e-5,
                   use_clip=False,
                   kernel_name="lars_update"):
    """
    the operator's compute
    hyper_weight_norm = hyperparam * sqrt(weight_s)
    grad_weight_norm = sqrt(grad_s) + weight_decay*sqrt(weight_s) + epsilon
    grad_weight = grad + weight_decay * weight

    if use_clip == True:
        coeff = hyper_weight_norm / grad_weight_norm
        coeff = min(coeff / learning_rate, 1)
        coeff = max(coeff, 0)
    else:
        coeff = hyper_weight_norm / grad_weight_norm

    grad_new = coeff * grad_weight

    Parameters:
    ----------
    weight: dict
        input tensor contains shape and dtype attributes.
        only support float32.
    grad: dict
        input tensor contains shape and dtype attributes.
        Must have the same dtype and shape as 'weight'.
    weight_s: dict
        input tensor contains shape and dtype attributes.
        Must have the same dtype as 'weight'.
    grad_s: dict
        input tensor contains shape and dtype attributes.
        Must have the same dtype as 'weight'.
    weight_decay: dict
        input tensor contains shape and dtype attributes.
        Must have the same dtype as 'weight'.
    learning_rate: dict
        input tensor contains shape and dtype attributes.
        Must have the same dtype as 'weight'.
    out: dict
        output tensor contains shape and dtype attributes.
        Must have the same dtype and shape  as 'weight'.
    hyperparam: float
        default value is 0.001
    epsilon: float
        default value is 1e-5
    use_clip: bool
        default value is "False".
    kernel_name : str
        kernel name, default value is "lars_update"

    Returns:
    None
    """

    check_list = ("float16", "float32")
    inputs = [weight, grad, weight_s, grad_s, weight_decay, learning_rate]

    weight_shape = weight.get("shape")
    grad_shape = grad.get("shape")
    weight_dtype = weight.get("dtype")
    grad_dtype = grad.get("dtype")
    if list(weight_shape) != list(grad_shape):
        raise RuntimeError("weight and grad must be the same shape")

    if grad_dtype != weight_dtype:
        raise RuntimeError("weight and grad must be the same dtype")

    vdiv_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vdiv", "float32")
    if weight_dtype == "float32" and not vdiv_support:
        raise RuntimeError(
            "Input dtype is float32, but do not support on the platform")

    input_place_holders = []
    for i, input_val in enumerate(inputs):
        input_dtype = input_val.get("dtype").lower()
        input_shape = input_val.get("shape")
        op_utils.check_shape(input_shape)
        op_utils.check_dtype(input_dtype, check_list)
        shape_one_dim = (functools_reduce(operator.mul, input_shape), )
        input_place_holders.append(
            tvm.placeholder(shape_one_dim,
                            name="input_data_%d" % i,
                            dtype=input_dtype))

    res = lars_v2_update_compute(input_place_holders, hyperparam, epsilon,
                                 use_clip, out, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    data = input_place_holders
    data.append(res)

    new_config = build_config_update(build_config, "dummy_placeholder", True)
    with new_config:
        tvm.build(schedule, data, "cce", name=kernel_name)
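
The coefficient formula quoted in the docstring can be checked with a small NumPy reference (illustrative values only; the real operator works on flattened tensors through the TBE DSL):

import numpy as np

def lars_update_ref(weight, grad, weight_s, grad_s, weight_decay,
                    learning_rate, hyperparam=0.001, epsilon=1e-5,
                    use_clip=False):
    hyper_weight_norm = hyperparam * np.sqrt(weight_s)
    grad_weight_norm = np.sqrt(grad_s) + weight_decay * np.sqrt(weight_s) + epsilon
    coeff = hyper_weight_norm / grad_weight_norm
    if use_clip:
        coeff = np.maximum(np.minimum(coeff / learning_rate, 1.0), 0.0)
    return coeff * (grad + weight_decay * weight)

w = np.full((4,), 2.0, dtype=np.float32)
g = np.full((4,), 0.5, dtype=np.float32)
print(lars_update_ref(w, g, np.sum(w * w), np.sum(g * g), 1e-4, 0.1))
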
Code example #20
def strided_slice_d(input_x,
                    output_x,
                    begin,
                    end,
                    strides=None,
                    begin_mask=0,
                    end_mask=0,
                    ellipsis_mask=0,
                    new_axis_mask=0,
                    shrink_axis_mask=0,
                    kernel_name="strided_slice_d"):
    """
    Extracts a strided slice of a tensor (generalized python array indexing).
    Roughly speaking, this op extracts a slice of size (end-begin)/stride
    from the given input tensor. Starting at the location specified by begin,
    the slice continues by adding stride to the index until all dimensions
    are not less than end. Note that a stride can be negative, which causes
    a reverse slice.

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_x : dict
        shape and dtype of out
    begin: list.
        represents the index of the first value to select.
    end: list.
        represents the index of the last value to select.
    strides: list or tuple.
        step length to select.
    begin_mask: int
        a bitmask where a bit i being 1 means to ignore the begin
        value and instead use the
        largest interval possible.
    end_mask: int
        analogous to `begin_mask`.
    ellipsis_mask: int
        a bitmask where bit `i` being 1 means the `i`th position
        is actually an ellipsis.
    new_axis_mask: int
        a bitmask where bit `i` being 1 means the `i`th specification creates a
        new shape 1 dimension.
    shrink_axis_mask: int
        a bitmask where bit `i` implies that the `i`th specification
        should shrink the dimensionality.
    kernel_name : str
        cce kernel name, default value is "strided_slice_d"

    Returns
    -------
    None
    """
    input_shape = input_x.get("shape")
    input_dtype = input_x.get("dtype").lower()
    check_list = ("float16", "float32", "int32", "uint8", "bool", "int8")

    check_dtype(input_dtype, check_list, param_name="input_x")
    check_shape(input_shape, param_name="input_x")

    begin = list(begin)
    end = list(end)

    if not _check_parameter(input_shape, begin, end, strides, ellipsis_mask,
                            new_axis_mask, shrink_axis_mask):
        raise RuntimeError("Parameter Invalid!")

    if strides is None:
        strides = _fill_list_with_ones(len(input_shape))
    else:
        strides = list(strides)

    input_tensor = tvm.placeholder(input_shape,
                                   dtype=input_dtype,
                                   name='input_tensor')

    [output, out_shape] = strided_slice_d_compute(input_tensor,
                                                  output_x,
                                                  begin,
                                                  end,
                                                  strides,
                                                  begin_mask,
                                                  end_mask,
                                                  ellipsis_mask,
                                                  new_axis_mask,
                                                  shrink_axis_mask,
                                                  kernel_name=kernel_name)

    # pylint: disable=locally-disabled,unnecessary-lambda
    out_tensor = tvm.compute(out_shape,
                             lambda *i: output(*i),
                             name='out_tensor',
                             tag='strided_slice_d|3')

    input_size = functools_reduce(lambda x, y: x * y, input_shape[0:])
    out_size = functools_reduce(lambda x, y: x * y, out_shape[0:])

    output_dtype = output_x.get("dtype").lower()
    output_shape = output_x.get("shape")
    if input_size == out_size:
        if output_dtype == "bool":
            input_x["dtype"] = "int8"
            output_x["dtype"] = "int8"
        if len(output_shape) == 0:
            output_x["shape"] = (1, )
        copy_only(input_x, output_x, kernel_name)
        return
    output_shape_one = list(output_shape)
    if ellipsis_mask == 0 and shrink_axis_mask != 0:
        for i, _ in enumerate(list(input_shape)):
            if (shrink_axis_mask & 2**i) == 2**i:
                output_shape_one.insert(i, 1)
    output_shape = tuple(output_shape_one)

    # for RL tune getting res
    fusion_manager.set_op_res(out_tensor)

    ret, sch = rl_bank.query_rl_bank([out_tensor])
    if ret and sch:
        with build_config:
            tvm.build(sch, [input_tensor, out_tensor], "cce", name=kernel_name)
        return

    sch = tvm.create_schedule(out_tensor.op)
    sch[output].set_scope(tbe_platform.scope_ubuf)

    sch_input_shape = []
    for dim in output.shape:
        sch_input_shape.append(dim.value)
    check_result = _check_last_axis_situation(sch_input_shape, begin, end,
                                              strides)
    if check_result:
        _schedule_last_axis(sch, sch_input_shape, output, out_tensor,
                            input_dtype)
        with build_config:
            tvm.build(sch, [input_tensor, out_tensor], "cce", name=kernel_name)
        return

    if _check_tik_branch(input_shape, output_shape, begin, end, strides):
        begin_shape = copy.deepcopy(begin)
        end_shape = copy.deepcopy(end)
        stride_shape = list(strides)
        stride_shape = copy.deepcopy(stride_shape)
        input_list = list(input_shape)
        # update begin_shape, end_shape
        begin_shape, end_shape, stride_shape = _init_parameter(
            input_list, begin_shape, end_shape, stride_shape, begin_mask,
            end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask)
        head_size = 1
        for i in range(0, (len(input_shape) - 1)):
            head_size = head_size * input_shape[i]
        if input_dtype == "float32" and input_shape[-1] == 2 and \
           begin_shape[len(begin_shape) - 1] == 0  and end_shape[len(begin_shape) - 1] == 1 \
           and head_size > 128:
            strided_slice_two_turn_one(input_x, output_x, kernel_name)
            return
        if input_list[-1] > 80 and output_shape[-1] == 80:
            res1 = strided_slice_last_dim_only(input_shape, input_dtype,
                                               output_shape, begin_shape,
                                               kernel_name)
            if res1:
                return
        if input_list[-1] >= 32 and input_list[-1] < 7500 and len(output_shape) > 1 and \
           output_shape[-1] >= 32:
            res = strided_slice_last_dim_mte(input_shape, input_dtype,
                                             output_shape, begin_shape,
                                             kernel_name)
            if res:
                return
        res = strided_slice_last_dim(input_shape, input_dtype, output_shape,
                                     begin_shape, end_shape, stride_shape,
                                     kernel_name)
        if res:
            return
        else:
            res1 = strided_slice_last_dim_one(input_shape, input_dtype,
                                              output_shape, begin_shape,
                                              kernel_name)
            if res1:
                return

    split_axis, split_factor = _tilling_axis(out_shape, dtype=input_dtype)
    core_state = _get_multicore(out_shape, input_dtype, split_axis,
                                split_factor)
    axis_outer, axis_inner = sch[out_tensor].split(
        out_tensor.op.axis[split_axis], factor=split_factor)

    if split_axis == 0:
        core_num = _get_target_core_num(out_shape[split_axis] // split_factor)
        axis_outer_outer, axis_outer_inter = sch[out_tensor].split(
            axis_outer, nparts=core_num)
    else:
        core_num = _get_target_core_num(out_shape[0])
        axis_outer_outer, axis_outer_inter = sch[out_tensor].split(
            out_tensor.op.axis[0], nparts=core_num)
        for i in range(1, split_axis):
            axis_outer_inter = sch[out_tensor].fuse(axis_outer_inter,
                                                    out_tensor.op.axis[i])
        axis_outer_inter = sch[out_tensor].fuse(axis_outer_inter, axis_outer)

    sch[output].compute_at(sch[out_tensor], axis_outer_inter)

    sch[output].emit_insn(output.op.axis[0], insn_cmd.DMA_COPY)  # gm-ub
    if len(out_shape) >= 2:
        # convert bits to bytes
        dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(
            input_dtype) // 8
        # 32 means one block size(32 Bytes), divide by 32 to
        # get the numbers of data that
        # can be stored in one block.
        element = 32 // dtype_bytes_size
        align_axis = _get_align_axis(out_shape)
        sch[output].storage_align(output.op.axis[align_axis], element, 0)

    if core_state:
        thread_block = tvm.thread_axis("blockIdx.x")
        sch[out_tensor].bind(axis_outer_outer, thread_block)

    sch[out_tensor].emit_insn(axis_inner, insn_cmd.DMA_COPY)  # ub-gm

    with build_config:
        tvm.build(sch, [input_tensor, out_tensor], "cce", name=kernel_name)
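
Setting the mask arguments aside, the slice the operator extracts matches ordinary Python/NumPy begin:end:stride indexing per axis; a toy check of the shape arithmetic:

import numpy as np

x = np.arange(24, dtype=np.float16).reshape(2, 3, 4)
begin, end, strides = [0, 1, 0], [2, 3, 4], [1, 1, 2]

slices = tuple(slice(b, e, s) for b, e, s in zip(begin, end, strides))
print(x[slices].shape)   # (2, 2, 2), i.e. roughly (end - begin) / stride per axis
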
Code example #21
def unpack(x, y, num=None, axis=0, kernel_name="unpack"):
    """
    unpacks the given dimension of a rank R tensor into rank (R-1) tensors.

    Parameters
    ----------
    x : dict.
        shape, dtype and format of value to be unpacked.
    y: tuple or list
        the list of output tensor.
    num : int.
        the length of the dim axis, automatically inferred if None(default).
    axis: int.
        the axis to unpack along.
    kernel_name : str
        cce kernel name, default value is "unpack".

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype").lower()
    data_format = x.get("format")
    _check_params(shape, num, axis, data_format, dtype, kernel_name)

    # infer the value of num
    real_axis = axis + len(shape) if axis < 0 else axis
    num = shape[real_axis]

    # turn the input shape into three dimensions (a, b, c), so axis = 1
    beferdim = 1
    for befer_dim in shape[0:real_axis]:
        beferdim *= befer_dim
    afterdim = 1
    for after_dim in shape[real_axis + 1:]:
        afterdim *= after_dim
    reshape = (beferdim, shape[real_axis], afterdim)

    _, _, is_use_split = check_use_special_optimize(dtype,
                                                    afterdim,
                                                    flag=False)
    reshape_input = x.copy()
    reshape_input["shape"] = reshape
    real_axis = 1
    # only 1 output tensor, so output equals to input
    if num == 1:
        copy_only(reshape_input, reshape_input, kernel_name)
    # use split
    elif is_use_split:
        split_d(reshape_input,
                y,
                split_dim=real_axis,
                num_split=num,
                kernel_name=kernel_name)
    else:
        new_dtype, afterdim, _ = check_use_special_optimize(dtype,
                                                            afterdim,
                                                            flag=False)
        new_shape = (beferdim, reshape[real_axis], afterdim)

        input_place = tvm.placeholder(new_shape,
                                      name="input_place",
                                      dtype=new_dtype)
        sch, build_list = _unpack_schedule(input_place, reshape, y, num,
                                           real_axis, dtype)

        with build_config:
            tvm.build(sch, build_list, "cce", name=kernel_name)
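
The reshape to (beferdim, shape[real_axis], afterdim) collapses everything before and after the unpack axis into single dimensions; a NumPy sketch of the equivalent result for a made-up shape:

import numpy as np

shape, axis = (2, 3, 4, 5), 2
x = np.arange(np.prod(shape)).reshape(shape)

beferdim = int(np.prod(shape[:axis]))       # product of the leading dims
afterdim = int(np.prod(shape[axis + 1:]))   # product of the trailing dims
reshaped = x.reshape(beferdim, shape[axis], afterdim)

outputs = [np.squeeze(t, axis=1) for t in np.split(reshaped, shape[axis], axis=1)]
print(len(outputs), outputs[0].shape)       # 4 (6, 5)
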
Code example #22
def custom_Upsample(shape,
                    dtype,
                    scale,
                    data_format="channels_last",
                    kernel_name="cce_darknet_upsample",
                    need_build=False,
                    need_print=False):
    """
    Parameters
    ----------
    shape: input tensor's shape

    dtype: input tensor's dtype, support: float16, float32, int32, int8, uint8

    scale: the upsampling factor

    data_format: "channels_last" or "channels_first"

    kernel_name : kernel name, default value is "cce_darknet_upsample"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    """
    TODO:
    Please refer to the TE DSL Manual, And code here with TE DSL.
    """
    inp_dtype = dtype.lower()
    check_list = ["float16", "float32", "int32", "int8", "uint8"]
    if inp_dtype not in check_list:
        raise RuntimeError("upsample only support %s while dtype is %s" %
                           (",".join(check_list), dtype))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    size = (scale, scale)

    shape_size = len(shape)
    if not (shape_size == 4 or shape_size == 5):
        raise RuntimeError(
            "upsample only support 4D or 5D while len(shape):%d" % len(shape))

    input_tensor = tvm.placeholder(shape, name="input_tensor", dtype=inp_dtype)

    res = None
    if shape_size == 5:
        # shape_size == 5, 5-D special format (N, C1, H, W, C0)
        output_shape = (shape[0], shape[1], shape[2] * size[0],
                        shape[3] * size[1], shape[4])
        res = tvm.compute(
            output_shape, lambda n, c0, h, w, c: input_tensor[n, c0, h // size[
                0], w // size[1], c])
    else:
        if data_format == "channels_last":
            output_shape = (shape[0], shape[1] * size[0], shape[2] * size[1],
                            shape[3])
            res = tvm.compute(
                output_shape, lambda n, h, w, c: input_tensor[n, h // size[0],
                                                              w // size[1], c])
        elif data_format == "channels_first":
            output_shape = (shape[0], shape[1], shape[2] * size[0],
                            shape[3] * size[1])
            res = tvm.compute(
                output_shape, lambda n, c, h, w: input_tensor[n, c, h // size[
                    0], w // size[1]])
        else:
            raise RuntimeError(
                "upsample only supports channels_last|channels_first "
                "while data_format is %s" % data_format)

    schedule = tvm.create_schedule(res.op)
    if need_print:
        with build_config:
            print(tvm.lower(schedule, [input_tensor, res], simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(schedule, [input_tensor, res], "cce", name=kernel_name)
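The compute above implements nearest-neighbour upsampling purely through integer-division indexing (h // scale, w // scale). A small NumPy sketch of the same idea for the NHWC case follows; it is a reference under that assumption, not the TE kernel, and the helper name is made up for illustration.

import numpy as np

def upsample_nearest_nhwc(x, scale):
    """Nearest-neighbour upsample: out[n, h, w, c] = x[n, h // scale, w // scale, c]."""
    n, h, w, c = x.shape
    hi = np.arange(h * scale) // scale   # map each output row to a source row
    wi = np.arange(w * scale) // scale   # map each output column to a source column
    return x[:, hi[:, None], wi[None, :], :]

x = np.arange(2 * 2 * 2 * 1, dtype=np.float32).reshape(2, 2, 2, 1)
y = upsample_nearest_nhwc(x, scale=2)
assert y.shape == (2, 4, 4, 1)
assert np.array_equal(y[0, :2, :2, 0], np.full((2, 2), x[0, 0, 0, 0]))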
コード例 #23
0
def avg_pool_grad_d(input_grad,
                    mean_matrix,
                    kernel_matrix,
                    out_grad,
                    orig_input_shape,
                    ksize,
                    strides,
                    padding,
                    data_format='NHWC',
                    kernel_name="cce_avg_pool_grad_dilation"):
    """
    computes average pooling backwards gradients.

    Parameters:
    ----------

    input_grad: a dict, global model support 'NHWC' or 'NCHW'
                and padding valid, common model support 'NHWC'
                and float16

    mean_matrix: a dict or nonetype, global model support 'NHWC' or 'NCHW'
                and padding valid, common model support 'NHWC'
                and float16

    kernel_matrix: a dict or nonetype, global model support 'NHWC' or 'NCHW'
                and padding valid, common model support 'NHWC'
                and float16

    out_grad: a dict, global model support 'NHWC' or 'NCHW'
                and padding valid, common model support 'NHWC'
                and float16

    orig_input_shape: forward input shape, 4-D list, global model
                     support 'NHWC' or 'NCHW' and padding valid,
                     common model support 'NHWC'

    ksize: filter window size, int or 4-D list, support 'NHWC'

    strides: strides over h and w axis, int or 4-D list,
             support 'NHWC' or 'NCHW'

    padding: padding algorithm, the global model only supports 'VALID'

    data_format: support 'NHWC' or 'NCHW'

    kernel_name : cce kernel name, default value is "cce_avg_pool_grad_dilation"

    Returns
    -------
    None
    """

    input_grad_ori_format = input_grad.get('ori_format')
    if input_grad_ori_format == "NHWC":
        kernel_h = ksize[1]
        kernel_w = ksize[2]
        stride_h = strides[1]
        stride_w = strides[2]
        # transfer 4D to 5D orig_input_shape
        ON, OHH, OWW, OC = orig_input_shape
    elif input_grad_ori_format == "NCHW":
        kernel_h = ksize[2]
        kernel_w = ksize[3]
        stride_h = strides[2]
        stride_w = strides[3]
        # transfer 4D to 5D orig_input_shape
        ON, OC, OHH, OWW = orig_input_shape
    OC1 = _ceil(OC) // BLOCK_SIZE
    OC0 = BLOCK_SIZE
    orig_input_shape = ON, OC1, OHH, OWW, OC0
    input_grad_shape = input_grad.get("shape")
    out_grad_shape = out_grad.get("shape")
    dtype = input_grad.get("dtype").lower()

    op_utils.check_shape(input_grad_shape,
                         min_rank=INPUT_DIM,
                         max_rank=INPUT_DIM)
    op_utils.check_shape(orig_input_shape,
                         min_rank=INPUT_DIM,
                         max_rank=INPUT_DIM)
    op_utils.check_shape(strides, min_rank=SHAPE_SIZE, max_rank=SHAPE_SIZE)
    op_utils.check_shape(ksize, min_rank=SHAPE_SIZE, max_rank=SHAPE_SIZE)

    if list(out_grad_shape) != list(orig_input_shape):
        raise RuntimeError("out_grad shape must equal orig_input_shape")
    if stride_h < 1 or stride_w < 1:
        raise RuntimeError("stride should be >= 1")

    data_dtype = dtype.lower()
    op_utils.check_dtype(data_dtype, ('float16', ))

    _, _, HH, WW, _ = orig_input_shape

    if (HH == kernel_h and WW == kernel_w and input_grad_shape[2] == 1
            and input_grad_shape[3] == 1 and padding == 'VALID'):
        # for the mobileV2 net, only VALID padding is supported.
        if padding != 'VALID':
            raise RuntimeError("global model, padding only supports VALID")
        else:
            pad_top, pad_left, pad_bottom, pad_right = 0, 0, 0, 0

        input_grad = tvm.placeholder(input_grad_shape,
                                     name="input_grad",
                                     dtype=data_dtype)

        # input_grad is overlapped result
        filter_num_h = (HH - kernel_h + pad_top + pad_bottom) // stride_h + 1
        filter_num_w = (WW - kernel_w + pad_left + pad_right) // stride_w + 1

        # global_avgpool, input FMAP size equals kernel size, kernel number=1
        if not (filter_num_h == 1 and filter_num_w == 1):
            raise RuntimeError("global average pooling, input_grad_h "
                               "and input_grad_w must equal 1")

        kernel_size_reciprocal = 1.0 / (kernel_h * kernel_w)

        with tvm.target.cce():
            input_grad_fp32 = te.lang.cce.cast_to(input_grad, "float32")
            grad_tmp = te.lang.cce.vmuls(input_grad_fp32,
                                         kernel_size_reciprocal)
            if data_dtype == "float16":
                grad_tmp = te.lang.cce.cast_to(grad_tmp, "float16")
            res = te.lang.cce.broadcast(grad_tmp, orig_input_shape)
            sch = generic.auto_schedule(res)
        config = {"name": kernel_name, "tensor_list": [input_grad, res]}
        te.lang.cce.cce_build_code(sch, config)
    else:
        shape_in = orig_input_shape
        shape_in_n, shape_in_c1, shape_in_h, shape_in_w, shape_in_c0 = shape_in
        shape_k = (shape_in_c1, kernel_h, kernel_w, 1, BLOCK_SIZE, BLOCK_SIZE)
        shape_out = input_grad_shape
        shape_out_n, shape_out_c1, shape_out_h, shape_out_w, \
        shape_out_c0 = shape_out
        # strides dim is two
        strides = stride_h, stride_w

        parameter_check(shape_in, shape_k, shape_out, dtype, strides, padding,
                        kernel_name)

        shape_in = shape_in_n, shape_in_c1, 1, \
                   shape_in_h, shape_in_w, shape_in_c0
        shape_k = (shape_out_c1, kernel_h * kernel_w, 1, BLOCK_SIZE,
                   BLOCK_SIZE)
        shape_out = shape_out_n, shape_out_c1, 1, \
                    shape_out_h, shape_out_w, shape_out_c0
        kernel_placeholder = tvm.placeholder(shape_k,
                                             dtype=dtype,
                                             name='kernel')
        dout_placeholder = tvm.placeholder(shape_out, dtype=dtype, name='dout')

        value_mean_placeholder = tvm.placeholder(shape_out,
                                                 dtype=dtype,
                                                 name='value_mean')
        res = avg_pool_grad_compute(shape_in, kernel_placeholder,
                                    dout_placeholder, value_mean_placeholder,
                                    [kernel_h, kernel_w], strides, padding)

        s = avg_pool_grad_schedule(res)

        with tbe_platform.build_config:
            tvm.build(s, [
                dout_placeholder, value_mean_placeholder, kernel_placeholder,
                res
            ],
                      "cce",
                      name=kernel_name)
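For the global-pooling branch above, the backward pass is just a scale-and-broadcast: each incoming gradient value is divided by the window size kernel_h * kernel_w and copied over the whole window. A hedged NumPy sketch of that branch, assuming an NHWC layout for readability (the helper is illustrative, not the device kernel):

import numpy as np

def global_avg_pool_grad_nhwc(grad, orig_h, orig_w):
    """grad has shape (N, 1, 1, C); spread grad / (H * W) over every spatial position."""
    n, gh, gw, c = grad.shape
    assert gh == 1 and gw == 1, "global pooling expects a 1x1 gradient map"
    scaled = grad / float(orig_h * orig_w)
    return np.broadcast_to(scaled, (n, orig_h, orig_w, c)).copy()

g = np.ones((2, 1, 1, 3), dtype=np.float32)
dx = global_avg_pool_grad_nhwc(g, orig_h=4, orig_w=4)
assert dx.shape == (2, 4, 4, 3) and np.allclose(dx, 1.0 / 16)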
コード例 #24
0
def histogram_fixed_width_d(x,
                            range,
                            y,
                            nbins,
                            dtype="int32",
                            kernel_name='histogram_fixed_width_d'):
    """this operation returns a rank 1 histogram counting
     the number of entries in `values` that fell into every bin.
      The bins are equal width and determined by the arguments
    `value_range` and `nbins`.

    Parameters
    ----------
    x: dict
        dict info of input value, must include the keys(shape and dtype).
    range: dict
        dict info of input value_range, must include the keys(shape and dtype).
                        the shape must be (2,) or [2]
    y: dict
        dict info of output
    nbins: int
        number of histogram bins.
    dtype: str
        data type for returned histogram.
    kernel_name: str
        cce kernel name, default value is "histogram_fixed_width_d"


    Returns
    -------
    None
    """
    input_shape_list = [x.get("shape"), range.get("shape")]
    input_dtype = x.get("dtype")
    dtype_input = input_dtype.lower()

    check_shape(input_shape_list[0], param_name="x")
    check_shape(input_shape_list[1], param_name="range")
    util.compare_tensor_dict_key(x, range, "dtype")
    data_shape_size = util.check_tensor_shape_size(list(input_shape_list[0]))
    data_range_shape_size = util.check_tensor_shape_size(
        list(input_shape_list[1]))

    check_dtype(dtype_input, ("float16", "float32", "int32"), param_name="x")

    if data_range_shape_size != 2:
        raise RuntimeError("the shape of range must be (2,) or [2]")

    if nbins <= 0:
        raise RuntimeError("the nbins must be > 0")

    data = tvm.placeholder([data_shape_size],
                           dtype=dtype_input,
                           name="input_data")
    range_data = tvm.placeholder([data_range_shape_size],
                                 dtype=dtype_input,
                                 name="input_range_data")

    res = histogram_fixed_width_d_compute(data, range_data, y, nbins,
                                          kernel_name)
    sch = tvm.create_schedule(res.op)
    with build_config:
        tvm.build(sch, [data, range_data, res], "cce", name=kernel_name)
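The operator buckets each value into one of nbins equal-width bins spanning the given range. A minimal NumPy reference of that bucketing rule is sketched below, assuming the usual TensorFlow-style behaviour of clamping out-of-range values into the first and last bins; the helper name is illustrative.

import numpy as np

def histogram_fixed_width_ref(values, value_range, nbins):
    lo, hi = float(value_range[0]), float(value_range[1])
    width = (hi - lo) / nbins
    # bucket index, clamped so values outside the range land in the edge bins
    idx = np.floor((np.asarray(values) - lo) / width).astype(np.int64)
    idx = np.clip(idx, 0, nbins - 1)
    return np.bincount(idx, minlength=nbins).astype(np.int32)

vals = np.array([-1.0, 0.0, 1.5, 2.0, 5.0, 15.0])
print(histogram_fixed_width_ref(vals, (0.0, 10.0), nbins=5))  # [3 1 1 0 1]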
コード例 #25
0
ファイル: cast.py プロジェクト: gekowa/ascend-opp
def cast(input_x, output_y, dst_type, kernel_name="cast"):
    """
    cast a tensor/scalar with input shape from src data type to dst data
    type. Restrictions of the input algorithms are as follows;
    only the type conversions below are supported for tensor processing:
        float16->float32
        float16->int32
        float32->float16
        float32->int32
        int8->float32
        uint8->float32
        int8->float16
        uint8->float16
        int8->int32
        uint8->int32
        int32->uint8 // numbers out of [0,255] can give unexpected results
        int32->int8 // numbers out of [-128,127] can give unexpected results
        int32->float32 // for conversion via fp16, only numbers
                        in [-1023,1023] are guaranteed correct
        int32->float16 // only numbers
                        in [-1023,1023] are guaranteed correct
    scalar conversion support (only shape [1,] is supported):
        int64->int32
        int64->float32

    Parameters
    ----------
    input_x : dict
        shape and dtype of input; supported dtypes are listed above
    output_y: dict
        shape and dtype of output, should be same shape as input,
        and the dtype is the dst dtype need to cast
    kernel_name : str
        cce kernel name, default value is cast

    Returns
    -------
    None
    """
    shape = util.scalar2tensor_one(input_x.get("shape"))
    src_type = input_x.get("dtype").lower()
    check_shape(shape, param_name="input_x")

    if src_type == "bool":
        src_type = "int8"

    dst_type = _cast_dsttype_conversion(dst_type)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=src_type)
    if src_type == "int64":
        check_dtype(dst_type, ("float32", "int32"), param_name="dst_type")
        res = tvm.extern(
            [fuseshape], [data],
            lambda ins, outs: _kernel_ir(outs, ins, dst_type, "int64"),
            name="res",
            dtype=dst_type)
        tensor_list = [data, res]
        schedule = tvm.create_schedule(res.op)
        with build_config:
            tvm.build(schedule, tensor_list, "cce", name=kernel_name)
    else:
        with tvm.target.cce():
            res = cast_compute(data, output_y, dst_type, kernel_name)
            sch = generic.auto_schedule(res)
        config = {
            "print_ir": False,
            "name": kernel_name,
            "tensor_list": [data, res]
        }
        te.lang.cce.cce_build_code(sch, config)
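Before scheduling, the operator collapses the input into a one-element shape list (fuseshape), because an elementwise cast only cares about the total element count, not the layout. A small sketch of that flattening step in plain Python/NumPy, offered as an assumption-level illustration:

from functools import reduce

import numpy as np

def fuse_shape(shape):
    """Collapse an arbitrary shape into a 1-D shape with the same element count."""
    return [reduce(lambda a, b: a * b, shape, 1)]

shape = (2, 3, 4)
flat = fuse_shape(shape)                                # [24]
x = np.arange(24, dtype=np.float32).reshape(shape)
y = x.reshape(flat).astype(np.int32).reshape(shape)     # cast elementwise on the flat view
assert y.shape == shape and y.dtype == np.int32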
コード例 #26
0
def SpatialTransformer(input_shape, out_shape, dtype="float32", kernel_name="SpatialTransformer", need_build = True, need_print = False):
    """Spatial Transformer Layer
    
    Implements a spatial transformer layer as described in [1]_.
    Based on [2]_.
    
    Parameters
    ----------
    input_shape : 
        the shape of input tensor
        [num_batch, height, width, num_channels]
        
    out_shape: tuple or list of two ints
        the height and width of the output tensor
        [out_height, out_width]
        
    dtype: data type    
        
    kernel_name : kernel name, default value is "SpatialTransformer"

    need_build : if need to build CCEC kernel, default value is True

    need_print : if need to print the ir, default value is False
    
    Returns
    -------
    None

    References
    ----------
    .. [1]  Spatial Transformer Networks
            Max Jaderberg, Karen Simonyan, Andrew Zisserman, Koray Kavukcuoglu
    .. [2]  https://github.com/tensorflow/models/tree/master/research/transformer
    """
    
    def _meshgrid(height, width):
		
        y0 = tvm.compute((height,), lambda i: -1 + i * 2.0 / (height - 1), name = 'y0')
        x0 = tvm.compute((width,), lambda i: -1 + i * 2.0 / (width - 1), name = 'x0')
        
        y = tvm.compute((height * width,), lambda i: y0[i // width], name = 'y')
        x = tvm.compute((height * width,), lambda i: x0[i % width], name = 'x')
        
        y = topi.reshape(y, (1, height * width))
        x = topi.reshape(x, (1, height * width))
        ones = tvm.compute((1, height * width), lambda i,j:1, name = 'ones')
         
        grid = tvm.compute((3, height * width),lambda i,j: 0.5 * (i - 1) * (i - 2) * x[0,j] + i * (2 - i) * y[0,j] + 0.5 * i * (i-1) * ones[0,j], name = 'grid')
        
        # grid = topi.concatenate((x, y, ones), 0)  # topi.concatenate cannot be used here
        return grid       

    def _interpolate(im, im_shape, x, y, out_size, dtype):
        
        num_batch = im_shape[0]
        height = im_shape[1]
        width = im_shape[2]
        channels = im_shape[3]
            
        out_height = out_size[0]
        out_width = out_size[1]
        max_y = int(im_shape[1] - 1)
        max_x = int(im_shape[2] - 1)
               
        #[-1,1] -> [0, width-1]
        x = topi.multiply(topi.add(x, tvm.const(1, dtype=dtype)), width / tvm.const(2, dtype=dtype))
        y = topi.multiply(topi.add(y, tvm.const(1, dtype=dtype)), height / tvm.const(2, dtype=dtype))
            
        # do sampling
        dim3 = out_height * out_width * num_batch
            
        x0 = topi.cast(topi.floor(x), 'int32')  
        y0 = topi.cast(topi.floor(y), 'int32')
        x1 = topi.add(x0,tvm.const(1, dtype="int32"))
        y1 = topi.add(y0,tvm.const(1, dtype="int32"))

        x0 = topi.clip(x0, 0, max_x)
        x1 = topi.clip(x1, 0, max_x)
        y0 = topi.clip(y0, 0, max_y)
        y1 = topi.clip(y1, 0, max_y)

        dim2 = width
        dim1 = width * height

        base = tvm.compute((dim3,),lambda i:(i // (out_height * out_width)) * width * height, name = 'base')        
        base_y0 = topi.add(base, topi.multiply(y0, dim2))
        base_y1 = topi.add(base, topi.multiply(y1, dim2))

        idx_a = topi.add(base_y0, x0)
        idx_b = topi.add(base_y1, x0)
        idx_c = topi.add(base_y0, x1)
        idx_d = topi.add(base_y1, x1)
                
        im_flat = topi.reshape(im, (num_batch * height * width, channels))
        im_flat = topi.cast(im_flat, dtype)
        
        Ia = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_a[i], j], name = 'Ia')       
        Ib = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_b[i], j], name = 'Ib') 
        Ic = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_c[i], j], name = 'Ic')
        Id = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_d[i], j], name = 'Id')
            
        x0_f = topi.cast(x0, dtype)
        x1_f = topi.cast(x1, dtype)
        y0_f = topi.cast(y0, dtype)
        y1_f = topi.cast(y1, dtype)
        wa = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x), topi.subtract(y1_f, y)), 1)
        wb = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x), topi.subtract(y, y0_f)), 1)
        wc = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f), topi.subtract(y1_f, y)), 1)
        wd = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f), topi.subtract(y, y0_f)), 1)
 
        output = topi.add(topi.add(topi.add(topi.multiply(wa, Ia), topi.multiply(wb, Ib)),topi.multiply(wc, Ic)), topi.multiply(wd, Id))
        
        return output

    def _transform(theta, input_dim, out_size, input_shape, dtype):
        
        num_batch = input_shape[0]
        height = input_shape[1]
        width = input_shape[2]
        num_channels = input_shape[3]

        theta = topi.reshape(theta, (num_batch, 2, 3))
        theta = topi.cast(theta, dtype)

        out_height = out_size[0]
        out_width = out_size[1]
                
        grid = _meshgrid(out_height, out_width)       
        grid = topi.reshape(grid, (num_batch, 3, out_height*out_width))
        grid = topi.cast(grid, dtype=dtype)
        
        k = tvm.reduce_axis((0, 3), 'k')
        T_g = tvm.compute((num_batch, 2, out_height*out_width),lambda b, y, x: tvm.sum(theta[b, y, k] * grid[b, k, x], axis = k), name = 'T_g')
              
        x_s = tvm.compute((num_batch, 1, out_height*out_width), lambda i,j,k:T_g[i,0,k], name = 'x_s')
        y_s = tvm.compute((num_batch, 1, out_height*out_width), lambda i,j,k:T_g[i,1,k], name = 'y_s')
              
        x_s_flat = topi.reshape(x_s, (num_batch*out_height*out_width,))
        y_s_flat = topi.reshape(y_s, (num_batch*out_height*out_width,))
                      
        input_transformed = _interpolate(input_dim, input_shape, x_s_flat, y_s_flat, out_size, dtype)
        output = topi.reshape(input_transformed, [num_batch, out_height, out_width, num_channels])
        return output 
    
    num_batch = input_shape[0]
    input_height = input_shape[1]
    input_width = input_shape[2]
    channel = input_shape[3]
    
    U = tvm.placeholder((num_batch, input_height, input_width, channel), name="U", dtype=dtype)    
    theta = tvm.placeholder((num_batch, 6, 1, 1), dtype=dtype)    
    output = _transform(theta, U, out_shape, input_shape, dtype)       
    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [U, theta, output], simple_mode=True))
            
    if need_build:
        with build_config:
            tvm.build(s, [U, theta, output], "cce", name=kernel_name)
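The _interpolate helper above is standard bilinear sampling: for every target coordinate it gathers the four neighbouring pixels and blends them with the weights wa, wb, wc, wd. A compact NumPy sketch of that weighting for a single-channel image follows; it mirrors the clipping behaviour of the code above and is an illustration, not the TE implementation.

import numpy as np

def bilinear_sample(im, xs, ys):
    """im: (H, W) image; xs, ys: float pixel coordinates. Returns sampled values."""
    h, w = im.shape
    x0 = np.clip(np.floor(xs).astype(np.int64), 0, w - 1)
    x1 = np.clip(x0 + 1, 0, w - 1)
    y0 = np.clip(np.floor(ys).astype(np.int64), 0, h - 1)
    y1 = np.clip(y0 + 1, 0, h - 1)
    wa = (x1 - xs) * (y1 - ys)   # weight of the top-left neighbour (y0, x0)
    wb = (x1 - xs) * (ys - y0)   # bottom-left (y1, x0)
    wc = (xs - x0) * (y1 - ys)   # top-right (y0, x1)
    wd = (xs - x0) * (ys - y0)   # bottom-right (y1, x1)
    return (wa * im[y0, x0] + wb * im[y1, x0] +
            wc * im[y0, x1] + wd * im[y1, x1])

im = np.array([[0.0, 1.0], [2.0, 3.0]])
print(bilinear_sample(im, np.array([0.5]), np.array([0.5])))  # [1.5]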
コード例 #27
0
def custom_truncatemod(shape1, shape2, dtype, kernel_name="cce_tf_truncatemod",
                       need_build=False, need_print=False):
    """
    do element-wise truncatemod operation between two input tensors

    Parameters:
    ----------
    shape1 : shape of input data1

    shape2 : shape of input data2

    dtype : source data type, support float16,float32,int32

    kernel_name : cce kernel name, default value is "cce_tf_truncatemod"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    max_dim = 8
    shape1_len = len(shape1)
    shape2_len = len(shape2)
    if shape1_len > max_dim or shape2_len > max_dim:
        raise RuntimeError(
            "mod_cce only supports up to %d dimensions while the shapes' "
            "dimensions are %d and %d" % (max_dim, shape1_len, shape2_len))
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape1)
    util.check_shape_rule(shape2)

    util.check_shape_size(shape1, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape2, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    device_api_map = {"float16": "cc_device_truncatemod_float16",
                      "float32": "cc_device_truncatemod_float",
                      "int32": "cc_device_truncatemod_int32"}

    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_truncatemod_cce only support %s while dtype is %s" % (
                ",".join(check_list), dtype))

    shape1, shape2, shape_out = util.produce_shapes(shape1, shape2)
    util.check_shape_size(shape_out, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    device_api = device_api_map[inp_dtype]

    # block
    block_num = "block_num"
    block_idx = "block_idx"
    # x param
    v_xndim_cnt = tvm.const(len(shape1), "int32")
    p_xshape = util.create_param_ptr(shape1, "int32", "p_xshape")
    xpad_c0 = tvm.const(0, "int32")
    data_input_x = tvm.placeholder(shape1, name="data_input_x",
                                   dtype=inp_dtype)
    # y param
    v_yndim_cnt = tvm.const(len(shape2), "int32")
    p_yshape = util.create_param_ptr(shape2, "int32", "p_yshape")
    ypad_c0 = tvm.const(0, "int32")
    data_input_y = tvm.placeholder(shape2, name="data_input_y",
                                   dtype=inp_dtype)
    # output
    v_out_ndim_cnt = tvm.const(len(shape_out), "int32")
    p_out_shape = util.create_param_ptr(shape_out, "int32", "p_out_shape")
    out_padc0 = tvm.const(0, "int32")

    output = tvm.extern(shape_out,
                        [p_xshape, data_input_x, p_yshape, data_input_y,
                         p_out_shape], lambda ins, outs:
                        tvm.call_extern("int32_t", device_api,
                                        block_num,
                                        block_idx,
                                        v_xndim_cnt,
                                        ins[0].access_ptr("r"),  # shape x
                                        xpad_c0,
                                        ins[1].access_ptr("r"),  # input x
                                        v_yndim_cnt,
                                        ins[2].access_ptr("r"),  # shape y
                                        ypad_c0,
                                        ins[3].access_ptr("r"),  # input y
                                        v_out_ndim_cnt,
                                        ins[4].access_ptr("r"),  # shape out
                                        out_padc0,
                                        outs[0].access_ptr("w")),
                        name="output", dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    # print IR
    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input_x, data_input_y, output],
                            simple_mode=True))
            # Compile to generate the cce file
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input_x, data_input_y, output], "cce",
                      name=kernel_name)
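Truncated modulo follows the C convention: the result takes the sign of the dividend, i.e. x - trunc(x / y) * y, which differs from Python's floor-based %. A short NumPy sketch of that definition, offered only as a reference for what the device API is expected to compute:

import numpy as np

def truncatemod_ref(x, y):
    """C-style remainder: x - trunc(x / y) * y (sign follows the dividend x)."""
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    return x - np.trunc(x / y) * y

print(truncatemod_ref([7.0, -7.0, 7.0, -7.0], [3.0, 3.0, -3.0, -3.0]))
# -> [ 1. -1.  1. -1.]   (floor-based % would give [1., 2., -2., -1.])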
コード例 #28
0
def max_pool_grad_grad_with_argmax(x,
                                   grad,
                                   argmax,
                                   y,
                                   ksize,
                                   strides,
                                   padding="VALID",
                                   kernel_name="cce_max_pool_grad_grad"
                                   "_with_argmax"):
    """
    Computes second-order gradients of the maxpooling function.

    Parameters
    ----------
        x: dict
             Include info about ori_input,
             format, ori_format, shape, ori_shape, dtype.
        grad: dict
             Include info about grad of ori_input,
             format, ori_format, shape, ori_shape, dtype.
        argmax: dict
             Include info about ori_input,
             format, ori_format, shape, ori_shape, dtype.
        y: dict
             Include info about result of function,
             format, ori_format, shape, ori_shape, dtype.
        ksize: list or tuple
            The size of the window for each dimension of the input tensor.
        strides: list or tuple
            The stride of the sliding window of the input tensor.
        padding: str
            The type of padding algorithm to use.
            Only support "VALID" or "SAME"
        kernel_name: str
            Cce kernel name,
            default value is "cce_max_pool_grad_grad_with_argmax"
    Returns
    -------
    None
    """
    check_shape_and_format_vailded(x, grad, argmax, y, ksize, strides, padding,
                                   kernel_name)
    shape_x = x.get("shape")
    shape_grad = grad.get("shape")
    shape_argmax = argmax.get("shape")
    shape_argmax = (shape_argmax[0], shape_argmax[1], shape_argmax[2],
                    shape_argmax[3] * shape_argmax[4], 1)

    dtype_x = x.get("dtype").lower()
    dtype_grad = grad.get("dtype").lower()

    ori_format_x = x.get("ori_format")

    x_tensor = tvm.placeholder(shape_x, dtype=dtype_x, name="input_x")

    # argmax is a contiguous bool mask; the real storage type is uint16
    _, _, _, howo, _ = shape_argmax
    shape_argmax_boolean = (shape_argmax[0], shape_argmax[1] * shape_argmax[2],
                            howo // 16, 16, shape_argmax[4])
    shape_argmax_boolean = list(shape_argmax_boolean[:-1]) + list(
        [shape_argmax_boolean[-1] * 16])
    argmax_tensor = tvm.placeholder(shape_argmax_boolean,
                                    dtype="bool",
                                    name="argmax")

    grad_tensor = tvm.placeholder(shape_grad,
                                  dtype=dtype_grad,
                                  name="input_grad")

    compute_list = _max_pool_grad_grad_with_argmax_compute(
        [x_tensor, argmax_tensor, grad_tensor], x, argmax, grad, y, ksize,
        strides, padding, ori_format_x, kernel_name)

    res = compute_list[-1]
    sch = tvm.create_schedule(res.op)
    _max_pool_grad_grad_with_argmax_schedule(compute_list, [sch])

    tensor_list = [x_tensor, grad_tensor, argmax_tensor, res]
    new_config = build_config_update(build_config, "dummy_placeholder", True)
    with new_config:
        tvm.build(sch, tensor_list, "cce", name=kernel_name)
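Conceptually, the second-order gradient of max pooling routes each window's incoming grad value through the position that won the original max, which is exactly what the argmax bitmask encodes. A simplified NumPy sketch of that routing for one channel and non-overlapping windows; this illustrates the idea only, not the tiled 5-D kernel above.

import numpy as np

def max_pool_grad_grad_ref(x, grad, ksize, stride):
    """For each window, pick grad at the argmax location of x. x, grad: (H, W)."""
    h, w = x.shape
    oh, ow = (h - ksize) // stride + 1, (w - ksize) // stride + 1
    out = np.zeros((oh, ow), dtype=grad.dtype)
    for i in range(oh):
        for j in range(ow):
            win_x = x[i * stride:i * stride + ksize, j * stride:j * stride + ksize]
            win_g = grad[i * stride:i * stride + ksize, j * stride:j * stride + ksize]
            out[i, j] = win_g.flat[np.argmax(win_x)]
    return out

x = np.array([[1., 5., 2., 0.],
              [3., 4., 7., 6.],
              [0., 2., 1., 1.],
              [9., 8., 3., 4.]])
g = np.arange(16, dtype=np.float64).reshape(4, 4)
print(max_pool_grad_grad_ref(x, g, ksize=2, stride=2))  # [[ 1.  6.] [12. 15.]]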
コード例 #29
0
def custom_Exp(shape,
               dtype,
               gamma,
               alpha,
               beta,
               kernel_name="cce_exp",
               need_build=False,
               need_print=False):
    """
    calculate gamma ** (alpha * data + beta),
    i.e. exp(log(gamma) * alpha * data) * (gamma ** beta)

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support \
    float16, float32

    gamma : the data type must be the same as the dtype parameter
        base in gamma ** (alpha * data + beta)

    alpha : the data type must be the same as the dtype parameter
        scale in gamma ** (alpha * data + beta)

    beta : the data type must be the same as the dtype parameter
        shift in gamma ** (alpha * data + beta)

    kernel_name : cce kernel name, default value is "cce_exp"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not dtype.lower() in supported_dtypes:
        raise RuntimeError(
            "caffe_exp_layer_cce only support %s while dtype is %s" %
            (",".join(supported_dtypes), dtype))

    if gamma != -1 and gamma <= 0:
        # the api cc_device_exp_c handles gamma == -1 as base e
        raise ValueError(
            "please ensure gamma is greater than 0, where gamma = %s" %
            str(gamma))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([gamma], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    # scale --> alpha, shift --> beta, base --> gamma
    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input, output], "cce", name=kernel_name)
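The device call evaluates gamma ** (alpha * x + beta) by rewriting it as exp(log(gamma) * alpha * x) * gamma ** beta, with gamma == -1 standing for base e. A tiny NumPy check of that identity, purely as a sanity sketch with made-up sample parameters:

import numpy as np

def exp_layer_ref(x, gamma, alpha, beta):
    """Caffe-style Exp layer: gamma ** (alpha * x + beta); gamma == -1 means base e."""
    if gamma == -1:
        return np.exp(alpha * x + beta)
    return np.power(gamma, alpha * x + beta)

x = np.linspace(-1.0, 1.0, 5)
gamma, alpha, beta = 2.0, 0.5, 1.0
direct = exp_layer_ref(x, gamma, alpha, beta)
rewritten = np.exp(np.log(gamma) * alpha * x) * gamma ** beta
assert np.allclose(direct, rewritten)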
コード例 #30
0
def custom_expm1(shape,
                 dtype,
                 kernel_name="cce_tf_expm1",
                 need_build=False,
                 need_print=False):
    """
    algorithm: expm1

    calculating data's expm1, y = (e ** x) - 1, dtype is float16 or float32.

    Parameters
    ----------
    shape : shape of data.

    dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32.

    kernel_name : cce kernel name, default value is "cce_tf_expm1".

    need_build : if need to build CCEC kernel, default value is False.

    need_print : if need to print the ir, default value is False.

    Returns
    -------
    None

    """

    # [aicpu] int32_t cc_device_exp(uint32_t blockNum, uint32_t blockIdx, int32_t dataType, const void *scale, const void *shift,
    # const void *base, int32_t dimCnt, int32_t *shape, uint32_t padC0, const void *x, void *y);

    supported_dtypes = ["float16", "float32"]

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError("tf_expm1_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    # step 1. calculate y = e ** x via the aicpu api
    device_api = "DeviceExp"
    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0
    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output_exp = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output_exp",
        dtype=inp_dtype)

    offset = tvm.const((-1), dtype=inp_dtype)

    # step 2. calculate y = (e ** x) - 1 by tvm
    output = tvm.compute(
        shape,
        lambda *indice: output_exp(*indice) + offset.astype(inp_dtype),
        name="output")

    # step 3. schedule the computation by tvm
    s = tvm.create_schedule(output.op)

    # step 4. build by tvm
    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
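The kernel above composes expm1 from two steps: a device exp followed by adding the constant -1. A brief NumPy sketch of that composition, with a reminder that a fused expm1 is numerically more accurate for small x; this is an observation about the math, not a claim about the device kernel's accuracy.

import numpy as np

def expm1_two_step(x):
    """Mimic the kernel's structure: first y = exp(x), then y - 1."""
    return np.exp(x) - 1.0

x = np.array([1e-8, 1e-4, 1.0], dtype=np.float64)
print(expm1_two_step(x))   # loses precision for the smallest input
print(np.expm1(x))         # fused form keeps full precision near zero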