Example #1
    def __init__(self, shape, dtype, kernel_name):
        """
        init the parameters

        Parameters
        ----------
        shape: tuple or list
            the shape of input tensor
        dtype: string
            the dtype of input tensor
        kernel_name: str
            kernel name, default value is "reverse_ext2"

        Returns
        -------
        None
        """
        self.tik_instance = tik.Tik(tik.Dprofile())

        self.aicore_num = cce.cce_conf.get_soc_spec(cce.cce_conf.CORE_NUM)

        self.shape = list(shape)
        self.dtype = dtype
        self.kernel_name = kernel_name

        block_byte_size = 32
        dtype_byte_size = cce.cce_intrin.get_bit_len(dtype) // 8

        self.data_each_block = block_byte_size // dtype_byte_size

        ub_byte_size = (cce.cce_conf.get_soc_spec(cce.cce_conf.UB_SIZE) -
                        block_byte_size)

        self.ub_element_number = (ub_byte_size // dtype_byte_size //
                                  self.data_each_block * self.data_each_block)
        self.input_total_num = functools_reduce(lambda x, y: x * y, shape)

        self.data_num_each_core = self.input_total_num // self.aicore_num
        self.last_data_num = self.input_total_num % self.aicore_num

        self.input_gm = self.tik_instance.Tensor(self.dtype,
                                                 self.shape,
                                                 name="input_gm",
                                                 scope=tik.scope_gm)
        self.output_gm = self.tik_instance.Tensor(self.dtype,
                                                  self.shape,
                                                  name="output_gm",
                                                  scope=tik.scope_gm)
        self.input_ub = None
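The buffer arithmetic in this constructor can be checked without a device. A minimal pure-Python sketch, assuming a 32-byte block and an illustrative 248 KiB UB size (the real values come from get_soc_spec):

BLOCK_BYTE_SIZE = 32
UB_BYTE_SIZE = 248 * 1024 - BLOCK_BYTE_SIZE  # reserve one block, as above

def ub_elements(dtype_byte_size):
    # elements per 32B block, then round UB capacity down to whole blocks
    data_each_block = BLOCK_BYTE_SIZE // dtype_byte_size
    return (UB_BYTE_SIZE // dtype_byte_size
            // data_each_block * data_each_block)

print(ub_elements(2))  # float16: 126960 elements, a multiple of 16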
Example #2
    def init_tik_instance(self):
        """
        init the tik_instance

        Parameters
        ----------

        Returns
        -------
        None
        """
        profile = tik.Dprofile()
        self.tik_instance = tik.Tik(profile)
        self.real_core_num = profile.get_aicore_num()
        self.l1_buffer_size = profile.get_l1_buffer_size()
Example #3
    def __init__(self, input_dict):
        """
      init the Crop parameters

      Parameters
      ----------
        input_dict: input_dict is a dict, the keys as follow:
                x1: dict,shape and datatype,datatype supports int8,uint8,
                    int16,uint16,int32,uint32,int64,uint64,float16,float32
                x2: dict,shape and datatype,datatype supports int8,uint8,
                    int16,uint16,int32,uint32,int64,uint64,float16,float32
                y: dict,shape and datatype,datatype supports int8,uint8,
                    int16,uint16,int32,uint32,int64,uint64,float16,float32
                axis: crop start with axis
                offsets: crop start offset of each axis
                kernel_name: cce kernel name, default value is "crop"
      Returns
      -------
      None
      """
        self.instance = tik.Tik(tik.Dprofile())
        self.dtype = input_dict.get("x1").get("dtype").lower()
        self.dsize = common_util.get_data_size(self.dtype)
        total_size = tbe_platform.cce_conf.get_soc_spec(
            tbe_platform.cce_conf.UB_SIZE)
        ub_size = (total_size - RESERVE_SIZE) // (2 * self.dsize)
        burnest_len = constant.BLOCK_SIZE // self.dsize
        ub_size = ((ub_size + burnest_len - 1) // burnest_len) * burnest_len
        self.one_max_size = ub_size
        x1_len = get_shape_total_number(input_dict.get("x1").get("shape"))
        x1_len = ((x1_len + burnest_len - 1) // burnest_len) * burnest_len
        mod = input_dict.get("y").get("shape")[-1] % burnest_len
        if mod != 0:
            x1_len = x1_len + burnest_len
        self.x1_gm = self.instance.Tensor(self.dtype, (x1_len, ),
                                          name="x1_gm",
                                          scope=tik.scope_gm)
        self.x2_gm = self.instance.Tensor(self.dtype, (32, ),
                                          name="x2_gm",
                                          scope=tik.scope_gm)
        y_len = get_shape_total_number(input_dict.get("y").get("shape"))
        y_len = ((y_len + burnest_len - 1) // burnest_len) * burnest_len
        if mod != 0:
            y_len = y_len + burnest_len
        self.y_gm = self.instance.Tensor(self.dtype, (y_len, ),
                                         name="y_gm",
                                         scope=tik.scope_gm)
        self.input_dict = input_dict
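The rounding pattern ((x + burnest_len - 1) // burnest_len) * burnest_len appears twice above. The same idiom standalone, rounding a length up to the next burst boundary:

def align_up(length, burst_len):
    # smallest multiple of burst_len that is >= length
    return (length + burst_len - 1) // burst_len * burst_len

assert align_up(100, 16) == 112  # 100 fp16 elements need 7 full 32B bursts
assert align_up(112, 16) == 112  # already aligned, unchanged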
Example #4
 def __init__(self, input_dict, stride_h, stride_w):
     self.dprofile = tik.Dprofile()
     self.tik_instance = tik.Tik(self.dprofile)
     self.ub_size = self.dprofile.get_unified_buffer_size()
     self.dtype = input_dict.get("x").get("dtype").lower()
     self.x_shape = input_dict.get("x").get("shape")
     self.dsize = get_data_size(self.dtype)
     self.y_shape = cal_out_shape(self.x_shape, stride_h, stride_w)
     self.x_gm = self.tik_instance.Tensor(self.dtype,
                                          self.x_shape,
                                          name="x_gm",
                                          scope=tik.scope_gm)
     self.y_gm = self.tik_instance.Tensor(self.dtype,
                                          self.y_shape,
                                          name="y_gm",
                                          scope=tik.scope_gm)
Example #5
def map_index(x_dic, data_seq_dic, level_index_dic, y_dic,
              kernel_name="map_index"):
    """
    :param x_dic:
    :param data_seq_dic:
    :param level_index_dic:
    :param y_dic:
    :param kernel_name:
    :return:
    """

    check_list = ["int32"]
    x_shape = x_dic.get("shape")
    x_dtype = x_dic.get("dtype")
    check_dtype(x_dtype.lower(), check_list, param_name="x")

    data_seq_shape = data_seq_dic.get("shape")
    data_seq_dtype = data_seq_dic.get("dtype")
    check_dtype(data_seq_dtype.lower(), check_list,
                param_name="data_seq")

    y_dtype = y_dic.get("dtype")
    check_dtype(y_dtype.lower(), check_list, param_name="y")

    if x_shape[0] > 8:
        raise RuntimeError("the length of x should "
                           "be less than or equal to 8")

    if data_seq_shape[0] % x_shape[0] != 0:
        raise RuntimeError("the length of data_seq must "
                           "be multiple of the length of x")

    tik_instance = tik.Tik(tik.Dprofile())

    if level_index_dic:
        level_index_dtype = level_index_dic.get("dtype")
        check_dtype(level_index_dtype.lower(), check_list,
                    param_name="level_index")

        map_index_result = MapIndexProcess((tik_instance, x_dic, data_seq_dic,
                                            y_dic, level_index_dic))
    else:
        map_index_result = MapIndexProcess((tik_instance, x_dic, data_seq_dic,
                                            y_dic))

    return map_index_result.cce_map_index(kernel_name)
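The two shape constraints enforced above can be exercised without a device; a pure-Python sketch of the same checks (the helper name is ours, not part of the operator):

def validate_map_index_shapes(x_shape, data_seq_shape):
    if x_shape[0] > 8:
        raise RuntimeError("the length of x should be less than or equal to 8")
    if data_seq_shape[0] % x_shape[0] != 0:
        raise RuntimeError("the length of data_seq must be a multiple "
                           "of the length of x")

validate_map_index_shapes((8,), (64,))  # passes: 64 is a multiple of 8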
Example #6
 def __init__(self, input_dict):
     """
     init the permute parameters
     """
     self.instance = tik.Tik(tik.Dprofile())
     self.dtype = input_dict.get("x").get("dtype").lower()
     self.dsize = 2
     size = get_shape_size(input_dict.get("x").get("shape"))
     self.x_gm = self.instance.Tensor(self.dtype, (size, ),
                                      name="x_gm",
                                      scope=tik.scope_gm)
     self.y_gm = self.instance.Tensor(self.dtype, (size, ),
                                      name="y_gm",
                                      scope=tik.scope_gm)
     ub_size = (UB_SIZE_B - 1024) // 4 // self.dsize // 256 * 256
     self.ub_size = ub_size
     self.input_dict = input_dict
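The UB budget line above packs several steps into one expression. A standalone sketch, assuming the division by 4 covers two tensors with double buffering (an assumption; the snippet does not say) and an illustrative 256 KiB UB:

UB_SIZE_B = 256 * 1024  # illustrative; the real value comes from the platform
dsize = 2               # fp16

# reserve 1 KiB, split four ways, round down to a 256-element repeat boundary
ub_size = (UB_SIZE_B - 1024) // 4 // dsize // 256 * 256
print(ub_size)          # 32512 elements per buffer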
Example #7
    def init_param(self, pooled_hw, dicts, spatial_scale_list, kernel_name):
        """
        init parameters

        Parameters
        ----------
        pooled_hw: (pooled_h, pooled_w)
        dicts: (x_dict, rois_dict, actual_dict, y_dict)
        spatial_scale_list: (spatial_scale_h, spatial_scale_w)
        kernel_name: kernel name

        Returns
        -------
        None
        """
        self.tik_instance = tik.Tik(tik.Dprofile())
        self.pooled_h = pooled_hw[0]
        self.pooled_w = pooled_hw[1]

        self.dtype = dicts[0].get("dtype").lower()
        self.shape = dicts[0].get("shape")

        self.rois_dtype = dicts[1].get("dtype").lower()
        self.rois_shape = dicts[1].get("shape")

        self.output_shape = dicts[3].get("shape")

        self.spatial_scale_h = spatial_scale_list[0]
        self.spatial_scale_w = spatial_scale_list[1]

        self.roi_actual_num_effect = (dicts[2] is not None)

        self.kernel_name = kernel_name

        self.feature_batch = self.shape[0]
        self.fm_c1 = self.shape[1]
        self.fm_h = self.shape[2]
        self.fm_w = self.shape[3]
        self.fm_c0 = self.shape[4]

        self.device_core_num = \
            tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.CORE_NUM)

        self.proposal_num_per_tiling = 128
        self.roi_max_num = self.rois_shape[2]
Example #8
def clip_boxes_d_compute(boxes_input, img_w, img_h, kernel_name="clip_boxes"):
    """
    the compute process of clip_boxes
    input:
     boxes_input:a dict, include shape, and dtype
     img_w: width of the image
     img_h: height of the image
     kernel_name: the kernel name
    return:
     the tik container
    """

    const_num = ConstList()
    tiling_para = TilingFunc(boxes_input.get("shape"))

    #  start the TIK container
    tik_instance = tik.Tik(tik.Dprofile(), True)

    anchors = tik_instance.Tensor("float16",
                                  (tiling_para.tot_of_blk*const_num.num_d,
                                   const_num.num_d),
                                  name="anchors",
                                  scope=tik.scope_gm)
    res_anchors = tik_instance.Tensor("float16",
                                      (tiling_para.tot_of_blk*const_num.num_d,
                                       const_num.num_d),
                                      name="res_anchors",
                                      scope=tik.scope_gm)

    with tik_instance.for_range(0, tiling_para.loop_time - CONFIG_ONE,
                                thread_num=tiling_para.thread_num) as loop_i:
        processing_one_loop(tik_instance,
                            (anchors, res_anchors),
                            tiling_para,
                            (img_h, img_w),
                            loop_i)

    # the tail processing
    processing_tail(tik_instance,
                    (anchors, res_anchors),
                    tiling_para,
                    (img_h, img_w))

    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=[anchors],
                          outputs=[res_anchors])
    return tik_instance
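The loop-plus-tail structure above (loop_time - CONFIG_ONE full iterations, then processing_tail) is a common tiling pattern. A plain-Python sketch of the same split, with hypothetical names:

def tile_ranges(total, tile):
    # full tiles in the loop, remainder handled once as the tail
    loop_time = (total + tile - 1) // tile
    for i in range(loop_time - 1):
        yield i * tile, tile
    yield (loop_time - 1) * tile, total - (loop_time - 1) * tile

print(list(tile_ranges(10, 4)))  # [(0, 4), (4, 4), (8, 2)]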
Example #9
 def __init__(self, input_data, block_size):
     """
     init space_to_depth base parameters
     Parameters
     ----------
     input_data: shape and data type,data type supports float16,float32,
                 int32,uint32,int16,uint16,int8,uint8,int64,uint64
     block_size: must be greater than one. It indicates the block size
     """
     self.input_shape = input_data.get("shape")
     self.dtype = input_data.get("dtype").lower()
     self.dtype_size = common_util.get_data_size(self.dtype)
     self.block_size = block_size
     self.tik_instance = tik.Tik(tik.Dprofile())
     self.output_shape = (self.input_shape[0],
                          self.input_shape[1] // block_size,
                          self.input_shape[2] // block_size,
                          self.input_shape[3] * block_size * block_size)
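The output-shape rule above is easy to sanity-check in plain Python (NHWC layout, as the indexing suggests):

def space_to_depth_shape(shape, block_size):
    n, h, w, c = shape
    return (n, h // block_size, w // block_size, c * block_size * block_size)

assert space_to_depth_shape((1, 4, 4, 3), 2) == (1, 2, 2, 12)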
Example #10
def check_shape_dtype_format(input_shape, input_dtype, input_format):
    """
    check shape, dtype and format

    Parameters
    ----------
    input_shape: input dic shape
    input_dtype: input dtype
    input_format: input format, NC1HWC0
    The common check rule for tensor shape, just for 5hd

    Returns
    -------
    None
    """
    tik_name = tik.Dprofile().get_product_name()
    if tik_name == "hisi-es":
        check_list = ["float16"]
    else:
        check_list = ["float16", "float32"]
    if input_dtype not in check_list:
        raise RuntimeError("upsample only support %s while dtype is %s"
                           % (",".join(check_list), input_dtype))

    util.check_shape_rule(input_shape)
    if len(input_shape) != DIM_5HD:
        raise RuntimeError(
            "The dim of tensor must be %d"
            ", actual dim is %d" % (DIM_5HD, len(input_shape)))
    n, c1, h, w, c0 = input_shape
    shape_c0 = C0
    if input_shape[DIM_5HD - 1] != shape_c0:
        raise RuntimeError(
            "The value of C0 must be 16")

    if input_format != "NC1HWC0":
        raise RuntimeError(
            "The format must be NC1HWC0, while actual format is %s" %(input_format))
Example #11
    def __init__(self, kernel_name):
        self.hidden_size = 32
        self.feature_size = 32
        self.block_size = 16
        self.feature_block_num = self.feature_size // self.block_size
        self.hidden_block_size = self.hidden_size // self.block_size
        self.batch_size = 32
        self.batch_blocks = self.batch_size // self.block_size
        self.num_step = 16

        self.forget_bias = 1.0

        self.use_fixpipe = True
        self.matmul_init_l1out = self.use_fixpipe

        self.feature_hidden_size = self.feature_size + self.hidden_size
        self.feature_hidden_block = self.feature_hidden_size // self.block_size

        self.tik_instance = tik.Tik(tik.Dprofile())

        self.fixpipe_workspace = self.tik_instance.Tensor(
            "float16", (1, 4 * self.hidden_block_size, self.batch_blocks,
                        self.block_size, self.block_size),
            name="fixpipe_workspace",
            scope=tik.scope_gm,
            is_workspace=True)

        self.declare_gm_tensor()

        self.init_core()

        self.tik_instance.BuildCCE(
            kernel_name,
            inputs=[
                self.gm_x, self.gm_init_h, self.gm_init_c, self.gm_weight,
                self.gm_b
            ],
            outputs=[self.gm_output_h, self.gm_output_c])
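All dimensions above are multiples of the 16-wide cube block, so every block count divides exactly; checked standalone:

block_size = 16
feature_size, hidden_size, batch_size = 32, 32, 32

feature_hidden_size = feature_size + hidden_size
assert feature_hidden_size // block_size == 4   # feature_hidden_block
assert batch_size // block_size == 2            # batch_blocks
assert 4 * (hidden_size // block_size) == 8     # gate dim of fixpipe_workspace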
Example #12
    def __init__(self, shape_info, param_info):
        classes = param_info['classes']
        coords = param_info['coords']
        boxes = param_info['boxes']
        dtype = param_info['dtype']
        batch = shape_info['batch']
        height = shape_info['height']
        width = shape_info['width']
        dtype_size = 2 if (param_info['dtype'] == "float16") else 4
        self.product_name = ""
        self.total_ub_size = 0

        self.tik_inst = tik.Tik(tik.Dprofile())

        # pad the batch so the last data_move still has a full 32B block to read
        batch_padding = 32 // (
            (boxes * (coords + 1 + classes)) * height * width * dtype_size) + 1
        self.yolo_din = \
            self.tik_inst.Tensor(dtype, (batch+batch_padding,
                                         boxes*(coords+1+classes),
                                         height, width),
                                 scope=tik.scope_gm, name="yolo_din")
        # shape is sized as fp16 because the dtype cannot be determined at infer-shape time
        self.crd_dout = \
            self.tik_inst.Tensor(dtype, (batch, boxes*coords,
                                         ceil_x(height*width*2+32, 32)//2),
                                 scope=tik.scope_gm, name="crd_dout")

        self.obj_dout = \
            self.tik_inst.Tensor(dtype, (batch, ceil_x(boxes*height*width *
                                                       2+32, 32)//2),
                                 scope=tik.scope_gm, name="obj_dout")

        self.cls_dout = \
            self.tik_inst.Tensor(dtype, (batch, classes,
                                         ceil_x(boxes*height*width *
                                                2+32, 32)//2),
                                 scope=tik.scope_gm, name="cls_dout")
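ceil_x is not defined in this snippet; a plausible definition, assuming it rounds a byte count up to the given alignment (here 32B blocks):

def ceil_x(value, align):
    return (value + align - 1) // align * align

# e.g. the obj_dout row length above, for boxes=5, height=13, width=13:
print(ceil_x(5 * 13 * 13 * 2 + 32, 32) // 2)  # 864 fp16 elements, padded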
Example #13
    def __init__(self, input_dict):
        """
      init the ShuffleChannel parameters

      Parameters
      ----------
        input_dict: input_dict is a dict, the keys as follow:
            x: dict,shape and datatype,datatype supports int8,uint8,int16,
              uint16,int32,uint32,int64,uint64,float16,float32
            y: dict,shape and datatype,datatype supports int8,uint8,int16,
              uint16,int32,uint32,int64,uint64,float16,float32
            group: the number of groups to split the channels into
            kernel_name: cce kernel name, default value is "shuffle_channel"
      Returns
      -------
      None
      """
        self.instance = tik.Tik(tik.Dprofile())
        self.dtype = input_dict.get("x").get("dtype").lower()
        self.dsize = common_util.get_data_size(self.dtype)
        total_size = tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.UB_SIZE)
        ub_size = (total_size - RESERVE_SIZE) // (2 * self.dsize)
        burnest_len = constant.BLOCK_SIZE // self.dsize
        ub_size = ((ub_size + burnest_len - 1) // burnest_len) * burnest_len
        self.one_max_size = ub_size
        x_len = get_shape_total_number(input_dict.get("x").get("shape"))
        x_len = ((x_len + burnest_len - 1) // burnest_len) * burnest_len
        hw = input_dict.get("y").get("shape")[2] * \
             input_dict.get("y").get("shape")[3]
        mod = hw % burnest_len
        if mod != 0:
            x_len = x_len + burnest_len
        self.x_gm = self.instance.Tensor(self.dtype, (x_len,), name="x_gm",
                                         scope=tik.scope_gm)
        self.y_gm = self.instance.Tensor(self.dtype, (x_len,), name="y_gm",
                                         scope=tik.scope_gm)
        self.input_dict = input_dict
Example #14
    def __init__(self, x, y, split_dim, num_split, kernel_name):
        """
        Init split_d parameters

        Parameters
        ----------
        x: dict
            the dict of input tensor.
        y: list or tuple
            the list of output tensor.
        split_dim: int
            the dimension along which to split_d.
        num_split: int
            an integer indicating the number of split_d along `split_dim`.
        kernel_name: str
            cce kernel name, default value is "split_d".

        Returns
        -------
        None
        """
        self.tik_instance = tik.Tik(tik.Dprofile())
        self.split_dim = split_dim
        self.num_split = num_split
        self.kernel_name = kernel_name
        self.input_dtype = x.get("dtype").lower()
        self.output_dtype = y[0].get("dtype").lower()
        self.input_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(
            self.input_dtype) // EIGHT_BIT
        self.input_data_each_block = BLOCK_BYTES // self.input_dtype_bytes_size
        self.core_num = tbe_platform.cce_conf.get_soc_spec(
            tbe_platform.cce_conf.CORE_NUM)
        self.ub_size = tbe_platform.cce_conf.get_soc_spec(
            tbe_platform.cce_conf.UB_SIZE) - RESERVED_UB_SIZE
        self.ub_number = self.ub_size // self.input_dtype_bytes_size
        self.ub_number = (self.ub_number // self.input_data_each_block
                          ) * self.input_data_each_block
        self.tiling_gm, self.input_gm, self.outs_gm = self.init_gm_tensor()
        self.check_input_params()

        self.ub_number_new = None
        self.input_ub = None
        self.temp_ub = None
        self.tiling_ub = None
        self.select_mode = None
        self.input_size_split = None
        self.output_size_split = None
        self.act_core_num = None
        self.loop_each_core = None
        self.loop_last_core = None
        self.data_each_core = None
        self.data_last_core = None
        self.loop_num = None
        self.last_num = None
        self.loop_num_last_core = None
        self.last_num_last_core = None
        self.input_num = None
        self.loop_each = None
        self.loop_last = None
        self.loop_each_last_core = None
        self.loop_last_last_core = None
        self.loop_burst_len = None
Example #15
    def __init__(self, var, indices, updates, var_out, axis, kernel_name, compute_type):
        """
        Init scatter axis parameters

        Parameters
        ----------
        var: dict
            data of input
            datatype supports float32,float16,int32,int8,uint8
        indices: dict
            data of indices
            datatype supports int32
        updates: dict
            data of updates
            datatype supports float32,float16,int32,int8,uint8
        var_out: dict
            data of input
        axis: int
            the axis along which to scatter
        kernel_name: str
            the name of the operator
        compute_type: str
            the compute type of scatter
        Returns
        -------
        None

        Example: var(2, 6, 8, 8), axis=1
            the processing unit is one var[axis:] slice of shape (6, 8, 8)
            the small slice shape is var[axis+1:], i.e. (8, 8)
            the slice num is 2, divided across the cores for processing
            each core processes slice data of shape (6, 8, 8)
            the updates data is copied in according to the indices info
        """
        self.tik_instance = tik.Tik(tik.Dprofile())
        self.var_shape = var.get("shape")
        self.var_dtype = var.get("dtype").lower()
        self.indices_shape = indices.get("shape")
        self.indices_dtype = indices.get("dtype").lower()
        self.updates_shape = updates.get("shape")
        self.updates_dtype = updates.get("dtype").lower()
        self.var_ele_num = functools_reduce(lambda x, y: x * y, self.var_shape)
        self.indices_num = functools_reduce(lambda x, y: x * y, self.indices_shape)
        self.updates_num = functools_reduce(lambda x, y: x * y, self.updates_shape)
        self.axis = axis
        self.kernel_name = kernel_name
        self.compute_type = compute_type

        self.ub_size_bytes = (tik.Dprofile().get_unified_buffer_size() - UB_RESERVE_SIZE)
        self.ai_core_num = tik.Dprofile().get_aicore_num()

        self.var_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(self.var_dtype) // 8
        self.indices_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(self.indices_dtype) // 8
        self.var_data_each_block = 32 // self.var_dtype_bytes_size
        self.indices_data_each_block = 32 // self.indices_dtype_bytes_size

        self.check_param(var_out)

        # indices buf size in ub
        self.indices_ub_number = 0
        # var and updates buf size in ub
        self.updates_ub_number = 0

        # a slice is var[axis:], one unit of processing
        if axis == 0:
            self.slice_num = 1
        else:
            self.slice_num = functools_reduce(lambda x, y: x * y, self.var_shape[0:axis])
        self.slice_shape = self.var_shape[axis:]
        self.slice_data_num = functools_reduce(lambda x, y: x * y, self.var_shape[axis:])
        self.small_elem_num = self.slice_data_num // self.var_shape[axis]
        self.slice_size = self.slice_data_num * self.var_dtype_bytes_size

        self.max_num_one_repeat = 128
        if self.var_dtype in ("float32", "int32"):
            self.max_num_one_repeat = 64

        # decide block num
        if self.slice_num == 1:
            self.block_num = 1
            self.slice_step = 0
        else:
            self.slice_step = math.ceil(self.slice_num / self.ai_core_num)
            self.block_num = math.ceil(self.slice_num / self.slice_step)

        # each loop's data buffer holds one slice of var[axis:] data
        self.update_data_num = self.slice_data_num
        self.vconv_dst_dtype = "float16"

        self.init_gm_tensor()
        self.init_ub_tensor_para()
        self.init_scalar_val()
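The core-division rule above, standalone: spread slice_num slices over at most ai_core_num cores, each core taking slice_step consecutive slices (helper name is ours):

import math

def divide_cores(slice_num, ai_core_num):
    if slice_num == 1:
        return 1, 0                       # a single block, no stepping
    slice_step = math.ceil(slice_num / ai_core_num)
    block_num = math.ceil(slice_num / slice_step)
    return block_num, slice_step

print(divide_cores(100, 32))  # (25, 4): 25 cores, 4 slices each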
Example #16
def CusMatMulCubeFraczLeftCast(input_x1,
                               input_x2,
                               bias=None,
                               output_y={},
                               trans_a=False,
                               trans_b=False,
                               kernel_name="CusMatMulCubeFraczLeftCast"):
    """
    calculating matrix multiplication with bias, C = A*B + bias; supports
    input data in fractal format.

    Parameters:
    shape_a: list or tuple
            Shape of the first tensor a with rank > 1
    shape_b:  list or tuple
            Shape of the second tensor b with the same type with a,
            and shape_a, shape_b must be 2 dims
    src_dtype: str
            The data type of input, support "float32", "float16"
    dst_dtype: str
            The data type of output, support "float32", "float16"
    trans_a: bool
            If True, tensor a is transposed before multiplication
    trans_b: bool
            If True, tensor b is transposed before multiplication
    is_fractal: bool
            If True, the input data format of a and b must be fractal format
    shape_bias: list or tuple
            Shape of bias, only support the input data format with ND

    Returns
    -------
    None
    """
    shape_a = input_x1.get("ori_shape")
    shape_b = input_x2.get("ori_shape")
    print("============")
    print(input_x1.get("format"), input_x2.get("format"))
    print(shape_a, shape_b)
    print("============")
    if input_x2.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_b
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]

    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_a = [n, c1 * h * w * c0]
        shape_b = [c1 * h * w * c0, c1 * h * w * c0]

    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]
        shape_b = shape_b

    if input_x1.get("format") == "FRACTAL_NZ":
        shape_a = shape_a
        shape_b = [shape_a[1], shape_a[1]]

    shape_a = list(shape_a)
    shape_b = list(shape_b)

    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)

    shape_a = [shape_a[1], shape_a[0]]
    trans_a = bool(1 - trans_a)

    shape_b = [shape_b[1], shape_b[0]]
    trans_b = bool(1 - trans_b)

    shape_bias = ()
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)

    src_dtype = input_x1.get("dtype").lower()
    _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)

    m_shape = shape_a[len(shape_a) - 2]
    km_shape = shape_a[len(shape_a) - 1]
    kn_shape = shape_b[len(shape_a) - 2]
    n_shape = shape_b[len(shape_a) - 1]

    if src_dtype == "float16":
        block_reduce = cce.BLOCK_REDUCE

    block_in = cce.BLOCK_IN
    block_out = cce.BLOCK_OUT

    if trans_a and km_shape == 1:
        block_in = cce.BLOCK_VECTOR

    if not trans_a and m_shape == 1:
        block_in = cce.BLOCK_VECTOR

    if trans_b and kn_shape == 1:
        block_out = cce.BLOCK_VECTOR

    if not trans_b and n_shape == 1:
        block_out = cce.BLOCK_VECTOR

    if trans_a:
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in,
                        block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce,
                        block_in, block_reduce)

    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce,
                        block_reduce, block_out)
    else:
        shape_b_temp = (kn_shape // block_reduce, n_shape // block_out,
                        block_out, block_reduce)
    shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2],
                    shape_a_temp[3])
    shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2],
                    shape_b_temp[3])

    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
    input_x1 = tik_instance.Tensor(input_x1.get("dtype"),
                                   shape_a_temp,
                                   name="left_matrix",
                                   scope=tik.scope_gm)
    input_x2 = tik_instance.Tensor(input_x2.get("dtype"),
                                   shape_b_temp,
                                   name="right_matrix",
                                   scope=tik.scope_gm)
    res_matmul = tik_instance.Tensor(output_y.get("dtype"),
                                     output_y.get("shape"),
                                     name="output",
                                     scope=tik.scope_gm)
    DIAG_SIZE = 128
    mo_tile, ko_tile, no_tile, diag_opt = get_cus_tile_info(
        input_x1, input_x2, DIAG_SIZE)
    cus_cube_matmul_cast(tik_instance,
                         input_x1,
                         trans_a,
                         input_x2,
                         trans_b,
                         res_matmul,
                         mo_tile=mo_tile,
                         ko_tile=ko_tile,
                         no_tile=no_tile,
                         diag_opt=diag_opt,
                         diag_size=DIAG_SIZE)
    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=[input_x1, input_x2],
                          outputs=[res_matmul])
    return tik_instance
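Before building the fractal shapes, the function above swaps each 2-D shape and inverts its trans flag, treating the stored layout as transposed. The same step in isolation:

def swap_and_flip(shape, trans):
    # (rows, cols) -> (cols, rows), with the transpose flag inverted
    return [shape[1], shape[0]], not trans

print(swap_and_flip([64, 128], False))  # ([128, 64], True)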
Example #17
def CusCholeskyTrsm(input_x, output, kernel_name):
    """CusCholeskyTrsm"""
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    split_dim = 128
    matrix_dim = input_x_shape[0]
    split_dim = min(matrix_dim, split_dim)
    vector_repeat_times = int(split_dim // 64)
    blocks = int(matrix_dim // split_dim)
    if blocks == 0:
        blocks = 1
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))

    input_x = tik_instance.Tensor("float32",
                                  input_x_shape,
                                  name="input_x",
                                  scope=tik.scope_gm)
    res = tik_instance.Tensor("float32",
                              output_shape,
                              name="res",
                              scope=tik.scope_gm)
    with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
        input_x_ub = tik_instance.Tensor("float32", (split_dim, split_dim),
                                         name="input_x_ub",
                                         scope=tik.scope_ubuf)
        temp_ub = tik_instance.Tensor("float32", (split_dim, split_dim),
                                      name="temp_ub",
                                      scope=tik.scope_ubuf)
        assist_1_ub = tik_instance.Tensor("float32", (split_dim, ),
                                          name="assist_1_ub",
                                          scope=tik.scope_ubuf)
        assist_2_ub = tik_instance.Tensor("float32", (split_dim, ),
                                          name="assist_2_ub",
                                          scope=tik.scope_ubuf)
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.data_move(
                input_x_ub[i, 0], input_x[block_index * split_dim + i,
                                          block_index * split_dim], 0, 1,
                vector_repeat_times * 8, 0, 0)
        scalar1 = tik_instance.Scalar("float32", init_value=-0.5)
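        # the vln/vmuls/vexp sequence below computes exp(-0.5 * ln(x)), i.e.
        # x**-0.5, the inverse square root of the diagonal element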

        with tik_instance.for_range(0, split_dim) as i:
            scalar2 = tik_instance.Scalar("float32")
            tik_instance.vln(64, assist_1_ub[0], input_x_ub[i, 0],
                             vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vmuls(64, assist_2_ub[0], assist_1_ub[0], scalar1,
                               vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vexp(64, assist_1_ub[0], assist_2_ub[0],
                              vector_repeat_times, 1, 1, 8, 8)
            scalar2.set_as(assist_1_ub[i])
            tik_instance.vmuls(64, input_x_ub[i, 0], input_x_ub[i, 0], scalar2,
                               vector_repeat_times, 1, 1, 8, 8)
            with tik_instance.for_range(i + 1, split_dim) as j:
                scalar3 = tik_instance.Scalar("float32")
                scalar3.set_as(input_x_ub[i, j])
                tik_instance.vmuls(64, temp_ub[j, 0], input_x_ub[i, 0],
                                   scalar3, vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vsub(64, input_x_ub[i + 1, 0], input_x_ub[i + 1, 0],
                              temp_ub[i + 1, 0],
                              (split_dim - 1 - i) * vector_repeat_times, 1, 1,
                              1, 8, 8, 8)

        zero = tik_instance.Scalar("float32")
        zero.set_as(0.0)
        one = tik_instance.Scalar("float32")
        one.set_as(1.0)
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.vector_dup(64, temp_ub[i, 0], zero,
                                    vector_repeat_times, 1, 8)
            temp_ub.__setitem__(i * split_dim + i, one)

        chol_diag_element_final = tik_instance.Scalar("float32")
        chol_diag_element_final.set_as(input_x_ub[split_dim * split_dim - 1])
        trsm_diag_element = tik_instance.Scalar("float32")
        trsm_diag_element.set_as(1.0 / chol_diag_element_final)
        temp_ub.__setitem__(split_dim * split_dim - 1, trsm_diag_element)

        with tik_instance.for_range(1, split_dim) as i:
            index = split_dim - i - 1
            tik_instance.vector_dup(64, assist_1_ub, zero, vector_repeat_times,
                                    1, 8)
            with tik_instance.for_range(0, i) as j:
                chol_diag_element_loop = tik_instance.Scalar("float32")
                chol_diag_element_loop.set_as(input_x_ub[index, index + 1 + j])
                tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0],
                                   chol_diag_element_loop, vector_repeat_times,
                                   1, 1, 8, 8)
                tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub,
                                  vector_repeat_times, 1, 1, 1, 8, 8, 8)
            temp_scalar = tik_instance.Scalar("float32")
            temp_scalar.set_as(input_x_ub[index, index])
            chol_diag_element = tik_instance.Scalar("float32")
            chol_diag_element.set_as(1.0 / temp_scalar)
            tik_instance.vsub(64, temp_ub[index, 0], temp_ub[index,
                                                             0], assist_1_ub,
                              vector_repeat_times, 1, 1, 1, 8, 8, 8)
            tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0],
                               chol_diag_element, vector_repeat_times, 1, 1, 8,
                               8)

        tik_instance.data_move(res[block_index, 0, 0], temp_ub, 0, 1,
                               8 * vector_repeat_times * split_dim, 0, 0)

    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=[input_x],
                          outputs=[res])
    return tik_instance
Example #18
    def __init__(self, params_dict, indices_dict, axis_dict, y_dict, kernel_name):
        """
        constructor of GatherV2

        Parameters
        ----------
        params_dict: dict
            shape and dtype of input params
        indices_dict: dict
            shape and dtype of input indices
        axis_dict: dict
            shape and dtype of input axis
        y_dict: dict
            shape and dtype of output, should be same dtype as input
        kernel_name: str
            kernel name, default value is "GatherV2"

        Returns
        -------
        None
        """
        self.params_dtype = params_dict.get("dtype").lower()
        self.indices_dtype = indices_dict.get("dtype").lower()
        self.axis_dtype = axis_dict.get("dtype").lower()
        self.y_dtype = y_dict.get("dtype").lower()
        self.tiling_dtype = INT32
        dtype_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                      "uint32", "uint64", "float16", "float32")
        indices_support_dtype_list = ("int32", "int64")
        check_dtype(self.params_dtype, dtype_list, param_name="x")
        check_dtype(self.indices_dtype, indices_support_dtype_list, param_name="indices")
        check_dtype(self.axis_dtype, (INT32,), param_name="axis")
        if self.y_dtype != self.params_dtype:
            error_manager_vector.raise_err_inputs_dtype_not_equal(kernel_name, "y", "x",
                                                                  self.y_dtype, self.params_dtype)

        profile = tik.Dprofile()
        self.ub_size = profile.get_unified_buffer_size()
        self.l1_size = profile.get_l1_buffer_size()
        self.core_num = profile.get_aicore_num()
        self.tik_instance = tik.Tik(profile, disable_debug=True)
        self.kernel_name = kernel_name

        self.axis_shape = (1,)
        self.x_shape = (PARAMS_SIZE,)
        self.indices_shape = (INDICES_NUM,)
        self.y_shape = (PARAMS_SIZE,)

        self.params_dsize = TYPE_LEN_DICT.get(self.params_dtype)
        self.indices_dsize = TYPE_LEN_DICT.get(self.indices_dtype)
        self.block_elem = BLOCK_SIZE // self.params_dsize

        self.x = None
        self.indices = None
        self.axis = None
        self.tiling_gm = None
        self.y = None

        self.params_pre = None
        self.params_axis = None
        self.params_row = None
        self.indices_num = None

        self.cache_params = None
        self.need_core_num = None
        self.tail_process_core = None
        self.indices_num_each_core = None
        self.indices_num_remaining = None
        self.indices_loop_num = None
        self.indices_row_num_once = None
        self.indices_row_num_last = None

        self.row_num_once_ub = None
        self.row_num_once_tail_ub = None
        self.inner_loop_num = None
        self.row_num_last_ub = None
        self.row_num_last_tail_ub = None
        self.inner_loop_num_last = None
Example #19
    def __init__(self, padding, dtype, kernel_name, tik_obj, fuse_mark):
        """
        Function: store pad_d's compile-time parameters
        """
        self.dtype = dtype.lower()
        self.ori_padding = padding.copy()
        self.padding = padding.copy()
        self.kernel_name = kernel_name
        self.num_bit = tbe_platform.cce_intrin.get_bit_len(self.dtype) // 8
        self.fuse_mark = fuse_mark

        self.mask = 128
        if self.num_bit == 4:
            self.mask = 64
        self.max_ub_size = tik.Dprofile().get_unified_buffer_size() - 1024
        self.max_core = tik.Dprofile().get_aicore_num()

        self.tiling_gm = None
        self.input_gm = None
        self.output_gm = None

        self.tiling_buf = None
        self.tiling_buf_size = None
        self.buf = None
        self.buf_size = None
        self.help_buf = None
        self.tik_instance = tik_obj

        # circulation
        self.axis_amount = None
        self.branch = None
        self.depth = None

        self.top_vol = None
        self.top_address = None
        self.top_div_core = None
        self.top_total_core = None
        self.top_core_vol_0 = None
        self.top_core_vol_1 = None
        self.top_core_gap_0 = None
        self.top_core_gap_1 = None

        self.bottom_vol = None
        self.bottom_address = None
        self.bottom_div_core = None
        self.bottom_total_core = None
        self.bottom_core_vol_0 = None
        self.bottom_core_vol_1 = None
        self.bottom_core_gap_0 = None
        self.bottom_core_gap_1 = None

        # recursion
        self.recur_total_core = None
        self.recur_div_core = None
        self.recur_in_vol = None
        self.recur_loop_0 = None
        self.recur_loop_1 = None
        self.recur_gap_0 = None
        self.recur_gap_1 = None
        self.recur_cond = None
        self.recur_start_address = None

        self.new_in_shape = None
        self.new_out_shape = None
        self.new_padding_top = None
        self.new_padding_bottom = None
        self.recur_model = None
        self.recur_dup_mk = None
        self.recur_gm2buf_mk = None
        self.prod_new_in = None
        self.prod_new_out = None

        self.tiling_arg_kind = None
        self.tiling_arg_num = None
        self.tiling_arg_idx = None
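The mask choice above reflects the vector unit's 256-byte granularity: 128 lanes for 2-byte types, 64 lanes for 4-byte types. Standalone:

def vector_mask(num_bit):
    # one vector repeat covers 256 bytes: 128 x 2B lanes or 64 x 4B lanes
    return 64 if num_bit == 4 else 128

assert vector_mask(2) == 128  # float16
assert vector_mask(4) == 64   # float32 / int32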
Example #20
def CusMatMulCubeDenseLeft(input_x1,
                           input_x2,
                           bias=None,
                           output_y={},
                           trans_a=False,
                           trans_b=False,
                           kernel_name="matmulcube"):
    """
    calculating matrix multiplication with bias, C = A*B + bias; supports
    input data in fractal format.

    Parameters:
    shape_a: list or tuple
            Shape of the first tensor a with rank > 1
    shape_b:  list or tuple
            Shape of the second tensor b with the same type with a,
            and shape_a, shape_b must be 2 dims
    src_dtype: str
            The data type of input, support "float32", "float16"
    dst_dtype: str
            The data type of output, support "float32", "float16"
    trans_a: bool
            If True, tensor a is transposed before multiplication
    trans_b: bool
            If True, tensor b is transposed before multiplication
    is_fractal: bool
            If True, the input data format of a and b must be fractal format
    shape_bias: list or tuple
            Shape of bias, only support the input data format with ND

    Returns
    -------
    None
    """
    print("!!!!come into zzt~~~~~~~!!!!")
    shape_a = input_x1.get("ori_shape")
    shape_b = input_x2.get("ori_shape")
    shape_output = output_y.get("ori_shape")
    print("============")
    print(input_x1.get("format"), input_x2.get("format"))
    print(shape_a, shape_b)
    print("============")
    if input_x2.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_b
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]

    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_a = [n, c1 * h * w * c0]
        shape_b = [c1 * h * w * c0, c1 * h * w * c0]

    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]
        shape_b = shape_b

    if input_x1.get("format") == "FRACTAL_NZ":
        shape_a = shape_a
        shape_b = [shape_a[1], shape_a[1]]

    shape_a = list(shape_a)
    shape_b = list(shape_b)

    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)

    shape_a = [shape_a[1], shape_a[0]]
    trans_a = bool(1 - trans_a)

    shape_b = [shape_b[1], shape_b[0]]
    trans_b = bool(1 - trans_b)

    shape_bias = ()
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)

    src_dtype = input_x1.get("dtype").lower()
    dst_dtype = output_y.get("dtype").lower()
    _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)

    m_shape = shape_a[len(shape_a) - 2]
    km_shape = shape_a[len(shape_a) - 1]
    kn_shape = shape_b[len(shape_a) - 2]
    n_shape = shape_b[len(shape_a) - 1]

    if src_dtype == "float16":
        block_reduce = cce.BLOCK_REDUCE

    block_in = cce.BLOCK_IN
    block_out = cce.BLOCK_OUT

    if trans_a and km_shape == 1:
        block_in = cce.BLOCK_VECTOR

    if not trans_a and m_shape == 1:
        block_in = cce.BLOCK_VECTOR

    if trans_b and kn_shape == 1:
        block_out = cce.BLOCK_VECTOR

    if not trans_b and n_shape == 1:
        block_out = cce.BLOCK_VECTOR

    if trans_a:
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in,
                        block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce,
                        block_in, block_reduce)

    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce,
                        block_reduce, block_out)
    else:
        shape_b_temp = (kn_shape // block_reduce, n_shape // block_out,
                        block_out, block_reduce)
    shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2],
                    shape_a_temp[3])
    format_a = "FRACTAL_NZ"
    shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2],
                    shape_b_temp[3])
    format_b = "FRACTAL_NZ"

    print("=======================================")
    print(shape_a_temp, shape_b_temp)
    print(format_a, format_b)
    print("=======================================")
    tensor_bias = None
    tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a', dtype=src_dtype)
    tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype)

    if shape_bias:
        tensor_bias = tvm.placeholder(shape_bias,
                                      name='tensor_bias',
                                      dtype=dst_dtype)

    if shape_a_temp[0] == 63 and shape_a_temp[1] == 63 and shape_b_temp[
            0] == 128 and shape_b_temp[1] == 63:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
        else:
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))

        input_x1 = tik_instance.Tensor("float16",
                                       shape_a_temp,
                                       name="left_matrix",
                                       scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16",
                                       shape_b_temp,
                                       name="right_matrix",
                                       scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float16",
                                        shape_output,
                                        name="output",
                                        scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256, ),
                                                     scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB_local_L0C = tik_instance.Tensor(
                "float32", (128 * 256, ),
                scope=tik.scope_cc,
                name="resMatmul_local_UB")
            input_1_local_L1_local_L0A = tik_instance.Tensor(
                "float16", (128 * 128, ),
                scope=tik.scope_ca,
                name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256, ),
                                                   scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128, ),
                                                   scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_2_local_L1_local_L0B = tik_instance.Tensor(
                "float16", (128 * 256, ),
                scope=tik.scope_cb,
                name="input_2_local_L1_local_L0B")
            core_m_idx = block_index % 8
            core_n_idx = block_index // 8
            with tik_instance.if_scope(core_m_idx != 7):
                tik_instance.data_move(
                    input_1_local_L1,
                    input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128,
                    55 * 16, 0)
                tik_instance.data_move(
                    input_2_local_L1,
                    input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                    0, 32, 128, 55 * 16, 0)
                with tik_instance.for_range(0, 8) as cc12:
                    tik_instance.load2dv1(
                        input_1_local_L1_local_L0A[cc12 * 2048],
                        input_1_local_L1[cc12 * 256], 0, 8, 8, 0, False)
                with tik_instance.for_range(0, 2) as cc6:
                    with tik_instance.for_range(0, 8) as cc121:
                        tik_instance.load2dv1(
                            input_2_local_L1_local_L0B[cc121 * 4096],
                            input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16,
                            8, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C,
                                      input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 128, 128,
                                      256, 0)
                    tik_instance.data_move(resMatmul_local_UB,
                                           resMatmul_local_UB_local_L0C, 0, 1,
                                           128, 0, 0, 1)
                    tik_instance.data_move(
                        resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 +
                                  core_n_idx * 512 * 1008], resMatmul_local_UB,
                        0, 16, 256 // 2, 0, 55 * 16 * 2 // 2)
            with tik_instance.else_scope():
                tik_instance.data_move(
                    input_1_local_L1,
                    input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112,
                    56 * 16, 0)
                tik_instance.data_move(
                    input_2_local_L1,
                    input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                    0, 32, 112, 56 * 16, 0)
                with tik_instance.for_range(0, 7) as cc10:
                    tik_instance.load2dv1(
                        input_1_local_L1_local_L0A[cc10 * 1792],
                        input_1_local_L1[cc10 * 256], 0, 7, 7, 0, False)
                with tik_instance.for_range(0, 2) as cc5:
                    with tik_instance.for_range(0, 7) as cc101:
                        tik_instance.load2dv1(
                            input_2_local_L1_local_L0B[cc101 * 4096],
                            input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16,
                            7, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C,
                                      input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 112, 112,
                                      256, 0)
                    tik_instance.data_move(resMatmul_local_UB,
                                           resMatmul_local_UB_local_L0C, 0, 1,
                                           112, 0, 0, 1)
                    tik_instance.data_move(
                        resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 +
                                  core_n_idx * 512 * 1008], resMatmul_local_UB,
                        0, 16, 224 // 2, 0, 56 * 16 * 2 // 2)
        tik_instance.BuildCCE(kernel_name=kernel_name,
                              inputs=[input_x1, input_x2],
                              outputs=[resMatmul])
        return tik_instance

    print("come into tbe, shape is error!")
    result = te.lang.cce.matmul(tensor_a,
                                tensor_b,
                                trans_a,
                                trans_b,
                                format_a=format_a,
                                format_b=format_b,
                                dst_dtype=dst_dtype,
                                tensor_bias=tensor_bias)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    tensor_list = [tensor_a, tensor_b, result]
    if shape_bias:
        tensor_list = [tensor_a, tensor_b, tensor_bias, result]

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(schedule, config)
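The non-transposed branch above tiles an (m, k) fp16 matrix into 16 x 16 fractal blocks. A sketch of that reshape, showing that the (63, 63, 16, 16) shape special-cased above corresponds to a 1008 x 1008 matrix:

def to_fractal(m, k, block_in=16, block_reduce=16):
    # (m, k) -> (m // 16, k // 16, 16, 16) tiles for the cube unit
    return (m // block_in, k // block_reduce, block_in, block_reduce)

assert to_fractal(1008, 1008) == (63, 63, 16, 16)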
Example #21
def decode_bbox(box_predictions,
                anchors,
                decoded_boxes,
                decode_clip,
                kernel_name="decode_bbox"):
    """
    calculating data

    Parameters
    ----------
    box_predictions : shape and dtype of input
    anchors : shape and dtype of input
    decoded_boxes : shape and dtype of output,
                    should be same shape and type as input
    decode_clip : decode_clip
    kernel_name : kernel name, default value is "decode_bbox"
    Returns
    -------
    None
    """

    # check param & data
    shape_box_predictions = box_predictions.get("shape")
    shape_anchors = anchors.get("shape")
    shape_decoded_boxes = decoded_boxes.get("shape")
    util.check_kernel_name(kernel_name)
    format_box_predictions = box_predictions.get("format")
    format_anchors = anchors.get("format")
    format_decoded_boxes = decoded_boxes.get("format")
    check_format_shape(format_box_predictions, format_anchors,
                       format_decoded_boxes)
    util.check_shape_rule(shape_box_predictions, CONFIG_THREE, CONFIG_FOUR,
                          None)
    util.check_shape_rule(shape_anchors, CONFIG_THREE, CONFIG_FOUR, None)
    util.check_shape_rule(shape_decoded_boxes, CONFIG_TWO, CONFIG_TWO, None)
    util.check_shape_size(shape_box_predictions, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_anchors, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_decoded_boxes, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(box_predictions.get("dtype").lower(), ("float16", ))
    util.check_dtype_rule(anchors.get("dtype").lower(), ("float16", ))
    util.check_dtype_rule(decoded_boxes.get("dtype").lower(), ("float16", ))
    if shape_box_predictions != shape_anchors:
        raise RuntimeError("the input shape_box_predictions and anchors)"
                           "must be same")
    if (reduce(lambda x, y: x * y, shape_box_predictions[:])) \
            != (reduce(lambda x, y: x * y, shape_decoded_boxes[:])):
        raise RuntimeError("the input shape (box_predictions and anchors"
                           "is not equal to out shape(decoded_boxes)")
    if (shape_box_predictions[-1] == CONFIG_FOUR
            and len(shape_box_predictions) == CONFIG_THREE):
        if shape_decoded_boxes[1] != CONFIG_FOUR:
            raise RuntimeError("the output shape_decoded_boxes must be 4")
    else:
        if (shape_box_predictions[0] == CONFIG_FOUR
                and len(shape_box_predictions) == CONFIG_FOUR):
            if shape_decoded_boxes[0] != CONFIG_FOUR:
                raise RuntimeError("the output shape_decoded_boxes must be 4")
        else:
            raise RuntimeError("the input shape not in {(4,C,H,W), (H,W,4)}")
    if not isinstance(decode_clip, (float, int)):
        raise RuntimeError("input param type of decode_clip should be Float")
    if decode_clip < 0 or decode_clip > 10:
        raise RuntimeError(
            "input param decode_clip can't be negtive and shoud be [0,10]! ")
    # init the tiling shape
    print("shape_box_predictions", shape_box_predictions)
    shape = TilingFunc(shape_box_predictions)
    # calculate the deocede_bbox
    tik_instance = tik.Tik(tik.Dprofile())
    data_tensor = InitTensor(tik_instance, shape)
    if shape.input_shape[-1] == CONFIG_FOUR \
            and len(shape.input_shape) == CONFIG_THREE:
        decode_bbox_compute(tik_instance, shape, data_tensor, decode_clip,
                            kernel_name)
    if shape.input_shape[0] == CONFIG_FOUR \
            and len(shape.input_shape) == CONFIG_FOUR:
        decode_bbox_compute_transpose(tik_instance, shape, data_tensor,
                                      decode_clip, kernel_name)
    return tik_instance
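The element-count check inside decode_bbox (input boxes vs. decoded output) in isolation:

from functools import reduce

def same_element_count(shape_in, shape_out):
    def count(s):
        return reduce(lambda x, y: x * y, s)
    return count(shape_in) == count(shape_out)

assert same_element_count((4, 16, 16), (256, 4))  # 1024 elements each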
Example #22
    def __init__(self,
                 input0,
                 gamma0,
                 beta0,
                 output0,
                 kernel_name="BatchNorm"):

        self.tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))

        self.sclar_gamma = self.tik_instance.Scalar("float16")
        self.sclar_beta = self.tik_instance.Scalar("float16")
        #
        self.input_n = self.tik_instance.InputScalar(dtype="int32",
                                                     name="inputscalar_n")
        self.input_c = self.tik_instance.InputScalar(dtype="int32",
                                                     name="inputscalar_c")
        self.input_h = self.tik_instance.InputScalar(dtype="int32",
                                                     name="inputscalar_h")
        self.input_w = self.tik_instance.InputScalar(dtype="int32",
                                                     name="inputscalar_w")
        self.inputtype = \
            self.tik_instance.InputScalar(dtype="int32",
                                          name="inputscalar_dtype")
        self.output_n = self.tik_instance.InputScalar(dtype="int32",
                                                      name="outputscalar_n")
        self.output_c = self.tik_instance.InputScalar(dtype="int32",
                                                      name="outputscalar_c")
        self.output_h = self.tik_instance.InputScalar(dtype="int32",
                                                      name="outputscalar_h")
        self.output_w = self.tik_instance.InputScalar(dtype="int32",
                                                      name="outputscalar_w")
        self.outputtype = \
            self.tik_instance.InputScalar(dtype="int32",
                                          name="outputscalar_dtype")
        self.gamma_c = self.tik_instance.InputScalar(dtype="int32",
                                                     name="gammascalar")
        self.gammatype = \
            self.tik_instance.InputScalar(dtype="int32",
                                          name="gammascalar_dtype")
        self.beta_c = self.tik_instance.InputScalar(dtype="int32",
                                                    name="betascalar")
        self.betatype = self.tik_instance.InputScalar(dtype="int32",
                                                      name="betascalar_dtype")
        self.param1 = self.tik_instance.InputScalar(dtype="int32",
                                                    name="param1")
        self.param2 = self.tik_instance.InputScalar(dtype="int32",
                                                    name="param2")
        self.param3 = self.tik_instance.InputScalar(dtype="int32",
                                                    name="param3")
        self.param4 = self.tik_instance.InputScalar(dtype="int32",
                                                    name="param4")
        self.param5 = self.tik_instance.InputScalar(dtype="int32",
                                                    name="param5")
        self.param6 = self.tik_instance.InputScalar(dtype="int32",
                                                    name="param6")
        self.param7 = self.tik_instance.InputScalar(dtype="int32",
                                                    name="param7")
        self.param8 = self.tik_instance.InputScalar(dtype="int32",
                                                    name="param8")
        self.param9 = self.tik_instance.InputScalar(dtype="int32",
                                                    name="param9")
        self.param10 = self.tik_instance.InputScalar(dtype="int32",
                                                     name="param10")

        self.byte_fp16 = 2
        self.input_dtype = "float16"
        self.kernel_name = kernel_name

        # gm buffer
        self.gamma_gm = self.tik_instance.Tensor("float16", (MAX_CHANNEL, ),
                                                 name="gamma_gm",
                                                 scope=tik.scope_gm)
        self.beta_gm = self.tik_instance.Tensor("float16", (MAX_CHANNEL, ),
                                                name="beta_gm",
                                                scope=tik.scope_gm)
        self.input_gm = self.tik_instance.\
            Tensor("float16",
                   (MAX_BATCH*MAX_CHANNEL*MAX_HEIGHT*MAX_WIDTH,),
                   name="input_gm", scope=tik.scope_gm)
        self.output_gm = self.tik_instance.\
            Tensor("float16",
                   (MAX_BATCH*MAX_CHANNEL*MAX_HEIGHT*MAX_WIDTH,),
                   name="output_gm", scope=tik.scope_gm)
        self.gamma_ub = self.tik_instance.\
            Tensor("float16", (MAX_CHANNEL, ),
                   name="gamma_ub", scope=tik.scope_ubuf)
        self.beta_ub = self.tik_instance.\
            Tensor("float16", (MAX_CHANNEL, ),
                   name="beta_ub", scope=tik.scope_ubuf)

        align_c = ceil_div_mul(self.input_c, 16)
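        # ceil_div_mul is assumed to round input_c up to a multiple of 16,
        # e.g. input_c = 100 -> align_c = 112 (so align_c // 16 = 7 bursts of 32B)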

        # clear gamma_ub and beta_ub to zero
        self.tik_instance.vec_muls(128, self.gamma_ub, self.gamma_ub, 0,
                                   MAX_CHANNEL // 128, 8, 8)
        self.tik_instance.vec_muls(128, self.beta_ub, self.beta_ub, 0,
                                   MAX_CHANNEL // 128, 8, 8)

        self.tik_instance.data_move(self.gamma_ub, self.gamma_gm, 0, 1,
                                    align_c // 16, 0, 0)
        self.tik_instance.data_move(self.beta_ub, self.beta_gm, 0, 1,
                                    align_c // 16, 0, 0)

        # reciprocal: beta_ub (loaded with var) now holds 1/var
        self.tik_instance.vrec(16, self.beta_ub, self.beta_ub, align_c // 16,
                               1, 1, 1, 1)
        # negate: gamma_ub (loaded with mean) now holds -mean
        self.tik_instance.vec_muls(16, self.gamma_ub, self.gamma_ub, -1.0,
                                   align_c // 16, 1, 1)
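The two vector instructions above only prepare the normalization constants. As a plain-Python sketch of that arithmetic (an assumption inferred from the 1/var and -mean comments; the gamma/beta scale-and-shift is applied elsewhere in the class):

def batchnorm_normalize_reference(x, mean, var):
    neg_mean = -1.0 * mean   # vec_muls(..., -1.0): gamma_ub ends up holding -mean
    inv_var = 1.0 / var      # vrec(...): beta_ub ends up holding 1/var
    return (x + neg_mean) * inv_var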
Example #23
0
def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"):
    """CusBatchMatMul"""
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
    x1_shape = input_x1.get("shape")
    dtype = input_x1.get("dtype").lower()
    x2_shape = input_x2.get("shape")
    if dtype != input_x2.get("dtype").lower():
        raise RuntimeError("dtype of input_x1 and input_x2 must be same, but got %s vs %s" % (
            dtype, input_x2.get("dtype").lower()))
    input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b)
    support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True),
                     ((36, 128, 128), (36, 128, 128), "float32", False, True),
                     ((5, 128, 128), (5, 128, 128), "float32", False, True),
                     ((18, 128, 128), (18, 128, 128), "float32", False, True),
                     ((16, 128, 128), (16, 128, 128), "float32", False, True),
                     ((9, 128, 128), (9, 128, 128), "float32", False, True),
                     ((1, 64, 64), (1, 64, 64), "float32", False, True),
                     ((1, 128, 128), (1, 128, 128), "float32", False, True),
                     ((4, 128, 128), (4, 128, 128), "float32", False, True),
                     ((2, 128, 128), (2, 128, 128), "float32", False, True)]
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    # only the (transpose_a=False, transpose_b=True) layout is handled below
    batch, m, k = x1_shape

    input1_shape = _get_flattern_shape(x1_shape)
    input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm)
    input2_shape = _get_flattern_shape(x2_shape)
    input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm)

    output_shape = x1_shape
    res_shape = _get_flattern_shape(output_shape)
    res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm)

    if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 2) as cc0:
                with tik_instance.for_range(0, 128, thread_num=2) as cc1:
                    input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    input2_index = block_idx * 32768 + cc0 * 16384
                    res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    _inner_matmul_new(tik_instance, dtype,
                                      input1, input1_index,
                                      input2, input2_index,
                                      res, res_index)
    if input_shape == ((5, 128, 128), (5, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 30, block_num=30) as block_idx:
            with tik_instance.for_range(0, 11) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as thread_idx:
                    with tik_instance.if_scope(((((block_idx % 6) * 22) + (cc1_db * 2) + thread_idx) < 128)):
                        input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB",
                                                               scope=tik.scope_ubuf)
                        t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB",
                                                             scope=tik.scope_ubuf)
                        tik_instance.data_move(input_1_local_UB, input1[
                            (block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 + thread_idx * 128], 0, 1,
                                               16, 0, 0)
                        with tik_instance.for_range(0, 2) as vec_i:
                            tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64], 0,
                                               64, 1, 1, 16, 0)
                        with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2:
                            input_2_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="input_2_local_UB",
                                                                   scope=tik.scope_ubuf)
                            t_1_local_UB = input_2_local_UB
                            bisec_last_axis_local_UB = input_2_local_UB
                            matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64],
                                                                             name="matmul_hybrid_f_t_local_UB",
                                                                             scope=tik.scope_ubuf)
                            matmul_hybrid_f_t_local_UB_dst_tmp = tik_instance.Tensor(dtype, [64],
                                                                                     name="matmul_hybrid_f_t_local_UB_dst_tmp",
                                                                                     scope=tik.scope_ubuf)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8)
                            tik_instance.data_move(input_2_local_UB,
                                                   input2[(block_idx // 6) * 16384 + thread_idx2 * 8192], 0, 1,
                                                   1024, 0, 0)
                            tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB, 128, 1, 1, 1, 8, 8, 8)
                            tik_instance.vadd(64, bisec_last_axis_local_UB, t_1_local_UB, t_1_local_UB[64], 64, 1, 1, 1,
                                              16, 16, 16)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8)
                            with tik_instance.for_range(0, 64) as cc6:
                                tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6],
                                                   bisec_last_axis_local_UB[cc6 * 128],
                                                   1, 1, 1, 8)
                            tik_instance.vadd(64, matmul_hybrid_f_t_local_UB, matmul_hybrid_f_t_local_UB_dst_tmp,
                                              matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8)
                            tik_instance.data_move(
                                res[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 +
                                    thread_idx * 128 + thread_idx2 * 64],
                                matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)

    if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 128, thread_num=2) as cc0:
                input1_index = block_idx * 16384 + cc0 * 128
                input2_index = block_idx * 16384
                res_index = block_idx * 16384 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)

    if input_shape == ((9, 128, 128), (9, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 27, block_num=27) as block_idx:
            with tik_instance.for_range(0, 42, thread_num=2) as cc0:
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)
            with tik_instance.if_scope((block_idx % 3) < 2):
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)

    if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 2, thread_num=2) as cc0:
                input1_index = block_idx * 128 + cc0 * 64
                input2_index = 0
                res_index = block_idx * 128 + cc0 * 64
                _inner_matmul_new_1_64_32_64(tik_instance, dtype,
                                             input1, input1_index,
                                             input2, input2_index,
                                             res, res_index)

    input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True),
                        ((2, 128, 128), (2, 128, 128), "float32", False, True),
                        ((4, 128, 128), (4, 128, 128), "float32", False, True),
                        ((8, 128, 128), (8, 128, 128), "float32", False, True),
                        ((16, 128, 128), (16, 128, 128), "float32", False, True)
                        ]
    if input_shape in input_shape_list:
        block_num = 32
        input1_unit_size = 128
        input2_unint_size = 128 * 128
        with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx:
            block_process_ele_num = (batch * m * k) // block_num
            loop_time = (batch * m * k) // block_num // input1_unit_size
            thread_num = 2
            with tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0:
                input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                if batch > 1:
                    input2_index = block_idx // (block_num // batch) * input2_unint_size
                else:
                    input2_index = 0
                res_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)

    tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res])
    return tik_instance
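As a cross-check for small inputs, the supported layout (transpose_a=False, transpose_b=True) reduces to the following NumPy reference:

import numpy as np

def batch_matmul_reference(x1, x2):
    # x1: (B, M, K), x2: (B, N, K); transpose_b=True -> result (B, M, N)
    return np.einsum("bmk,bnk->bmn", x1, x2)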
Example #24
0
def sort(x, y1, y2, axis=-1, descending=False, kernel_name="sort"):
    """
    Function: Sorts the elements of the input tensor along a given dimension
    in ascending order by value (or descending order when `descending` is True).
    Modify : 2020-08-03

    Init base parameters
    Parameters
    ----------
    input(x): dict
        data of input
    output(y1): dict
        data of output
    indices(y2): dict
        data of indices
    dim(axis): int
        the dimension to sort along
    descending: bool
        whether to sort in descending order
    kernel_name: str
        the name of the operator

    Returns
    -------
    tik_instance
    """
    shape, dtype, allnum, num = cheak(x, y1, y2, axis, kernel_name)

    tik_instance = tik.Tik(tik.Dprofile())

    add16 = (16 - (num % 16)) % 16
    total = num + add16
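    # pad the sort axis up to a multiple of 16, e.g. num = 100 ->
    # add16 = (16 - 100 % 16) % 16 = 12, total = 112, so float16 rows move
    # in whole 32-byte blocks (16 elements each)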

    big_shape = list(shape)
    big_shape[-1] = total

    input_gm = tik_instance.Tensor(dtype, shape, name="x", scope=tik.scope_gm)
    data_out = tik_instance.Tensor(dtype,
                                   big_shape,
                                   name="data_out",
                                   scope=tik.scope_gm,
                                   is_workspace=True)
    data_indices = tik_instance.Tensor("int32",
                                       big_shape,
                                       name="data_indices",
                                       scope=tik.scope_gm,
                                       is_workspace=True)
    data_out_ = tik_instance.Tensor(dtype,
                                    shape,
                                    name="data_out_",
                                    scope=tik.scope_gm)
    data_indices_ = tik_instance.Tensor("int32",
                                        shape,
                                        name="data_indices_",
                                        scope=tik.scope_gm)

    # to figure the index of input_gm
    L = len(shape)
    distance = []
    big_distance = []
    tmp = allnum
    big_tmp = allnum // num * total

    for i in range(L - 1):
        tmp = tmp // shape[i]
        distance.append(tmp)
        big_tmp = big_tmp // shape[i]
        big_distance.append(big_tmp)

    rounds = allnum // num

    available_aicore_num = tik.Dprofile().get_aicore_num()
    used_aicore_num = min(rounds, available_aicore_num)
    batch_num_per_aicore_process = rounds // used_aicore_num
    batch_tail = rounds % used_aicore_num

    with tik_instance.for_range(0, used_aicore_num,
                                block_num=used_aicore_num) as i:
        with tik_instance.for_range(0, batch_num_per_aicore_process) as k:
            data_out, data_indices = sort_compute(tik_instance, dtype, total,
                                                  i + k * used_aicore_num,
                                                  descending, num, distance,
                                                  shape, big_distance,
                                                  data_out, data_indices,
                                                  input_gm, L)
        with tik_instance.if_scope(i < batch_tail):
            data_out, data_indices = sort_compute(
                tik_instance, dtype, total,
                batch_num_per_aicore_process * used_aicore_num + i, descending,
                num, distance, shape, big_distance, data_out, data_indices,
                input_gm, L)

    float_ub = tik_instance.Tensor("float16", [total],
                                   name="float_ub",
                                   scope=tik.scope_ubuf)
    int_ub = tik_instance.Tensor("int32", [total],
                                 name="int_ub",
                                 scope=tik.scope_ubuf)

    with tik_instance.for_range(0, rounds) as i:
        tik_instance.data_move(float_ub[0], data_out[i * total], 0, 1,
                               total // 16, 0, 0)
        tik_instance.data_move(data_out_[i * num], float_ub[0], 0, 1,
                               total // 16, 0, 0)

        tik_instance.data_move(int_ub[0], data_indices[i * total], 0, 1,
                               total // 8, 0, 0)
        tik_instance.data_move(data_indices_[i * num], int_ub[0], 0, 1,
                               total // 8, 0, 0)

    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=[input_gm],
                          outputs=[data_out_, data_indices_])

    return tik_instance
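Assuming y1 receives the sorted values and y2 the int32 indices, as the output tensors above suggest, the intended semantics match this NumPy sketch:

import numpy as np

def sort_reference(x, axis=-1, descending=False):
    key = -x if descending else x
    idx = np.argsort(key, axis=axis, kind="stable")
    return np.take_along_axis(x, idx, axis=axis), idx.astype(np.int32)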
Example #25
0
def safe_check(dicts, kernel_name):
    """
    check if the inputs are legal

    Parameters
    ----------
    dicts: (x_dict, rois_dict, actual_dict, y_dict)
    kernel_name: kernel name

    Returns
    -------
    None
    """
    x_shape = dicts[0].get("shape")
    x_dtype = dicts[0].get("dtype").lower()
    rois_shape = dicts[1].get("shape")
    rois_dtype = dicts[1].get("dtype").lower()

    y_dtype = dicts[3].get("dtype").lower()
    y_shape = dicts[3].get("shape")

    profile = tik.Dprofile()
    tik_name_check = tbe_platform.cce_conf.get_soc_spec("SOC_VERSION")
    if tik_name_check in ("Ascend310", "Ascend910", "Hi3796CV300ES",
                          "Hi3796CV300CS"):
        op_utils.check_dtype(x_dtype, ["float16"], param_name="input_x")
        op_utils.check_dtype(rois_dtype, ["float16"], param_name="input_rois")
    else:
        op_utils.check_dtype(x_dtype, ["float16", "float32"],
                             param_name="input_x")
        op_utils.check_dtype(rois_dtype, ["float16", "float32"],
                             param_name="input_rois")

    if x_dtype != rois_dtype or x_dtype != y_dtype:
        error_info = {}
        error_info['errCode'] = 'E81012'
        error_info['op_name'] = 'roi_pooling'
        error_info['real_dtypes'] = str((x_dtype, rois_dtype, y_dtype))
        raise RuntimeError(
            error_info,
            "In op[roi_pooling], the dtype of tensor x, rois and y should be the same, but actually they are [%s]."
            % error_info['real_dtypes'])

    op_utils.check_shape(x_shape, min_rank=5, max_rank=5, param_name="input_x")
    op_utils.check_shape(rois_shape,
                         min_rank=3,
                         max_rank=3,
                         param_name="input_rois")
    op_utils.check_shape(y_shape,
                         min_rank=5,
                         max_rank=5,
                         param_name="output_y")

    roi_max_num = rois_shape[2]
    if roi_max_num > 6000 or roi_max_num % 16 != 0:
        error_info = {}
        error_info['errCode'] = 'E81013'
        error_info['real_rois_shape[2]'] = str(rois_shape[2])
        raise RuntimeError(
            error_info,
            "In op[roi_pooling], rois_shape[2] must not exceed 6000 and must be divisible by 16, but actually is [%s]."
            % error_info['real_rois_shape[2]'])
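The rois constraint reduces to a simple predicate; a hypothetical helper for illustration:

def rois_num_ok(roi_max_num):
    # rois_shape[2] must fit the 6000-roi budget and stay 16-aligned
    return roi_max_num <= 6000 and roi_max_num % 16 == 0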
Example #26
0
def conv2d_tik_compute(params):
    te_set_l2_mode(1)
    tik_instance = tik.Tik(tik.Dprofile(params["arch"], params["version"]),
                           err_msg_level=1)
    n, c1, h, w, c0 = params["fm_shape"]
    c1, kh, kw, cout, c0 = params["weight_shape"]
    stride_h, stride_w = params["stride_list"]
    dilation_h, dilation_w = params["dilation_list"]
    pad_top, pad_bot, pad_left, pad_right = params["pad_list"]
    kh_dilation = (kh - 1) * dilation_h + 1
    kw_dilation = (kw - 1) * dilation_w + 1
    ho = int(np.ceil((h + pad_top + pad_bot - kh_dilation + 1) / stride_h))
    wo = int(np.ceil((w + pad_right + pad_left - kw_dilation + 1) / stride_w))
    round_howo = ceil_div(ho * wo, 16) * 16
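    # e.g. h = w = 56, pad = 1 on each side, kh = kw = 3, stride = 1, dilation = 1:
    #   kh_dilation = 3, ho = wo = ceil((56 + 2 - 3 + 1) / 1) = 56,
    #   round_howo = ceil_div(56 * 56, 16) * 16 = 3136 (already 16-aligned)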

    fm_gm = tik_instance.Tensor(params['fm_dtype'], (n, c1, h, w, c0),
                                name='fm_gm',
                                scope=tik.scope_gm)
    weight_gm = tik_instance.Tensor(params['weight_type'],
                                    (c1, kh, kw, cout, c0),
                                    name='weight_gm',
                                    scope=tik.scope_gm)

    if params['dst_gm_type'] in ("int8", "uint8"):
        dst_gm = tik_instance.Tensor(params['dst_gm_type'],
                                     [n, cout // 32, ho, wo, 32],
                                     name='dst_gm',
                                     scope=tik.scope_gm)
    else:
        dst_gm = tik_instance.Tensor(params['dst_gm_type'],
                                     [n, cout // 16, ho, wo, 16],
                                     name='dst_gm',
                                     scope=tik.scope_gm)

    core_num = 2
    pre_core_cout = cout // core_num
    cout_iter_num = pre_core_cout // params["cout_split_factor"]
    Cin_blocks = c1

    with tik_instance.for_range(0, core_num, block_num=core_num) as cout_o:
        with tik_instance.for_range(0, cout_iter_num, thread_num=1) as cout_i:
            weight_L1 = tik_instance.Tensor(
                params['weight_type'],
                (Cin_blocks, kh, kw, params["cout_split_factor"], c0),
                name='weight_l1',
                scope=tik.scope_cbuf)
            tik_instance.data_move(
                weight_L1,
                weight_gm.flatten()[cout_o * pre_core_cout * c0 +
                                    params["cout_split_factor"] * cout_i * c0],
                0, Cin_blocks * kh * kw, params["cout_split_factor"],
                (cout - params["cout_split_factor"]), 0)

            with tik_instance.for_range(0, n, thread_num=2) as n_index:
                feature_map_l1 = tik_instance.Tensor(params['fm_dtype'],
                                                     (c1, h, w, c0),
                                                     name='feature_map_l1',
                                                     scope=tik.scope_cbuf)
                tik_instance.data_move(feature_map_l1,
                                       fm_gm[n_index, :, :, :, :], 0, 1,
                                       c1 * h * w, 0, 0)
                dst_l0c = tik_instance.Tensor(
                    params['dst_l0c_type'],
                    [params["cout_split_factor"] // 16, round_howo, 16],
                    name='dst_l0c',
                    scope=tik.scope_cbuf_out)

                tik_instance.conv2d(
                    dst_l0c, feature_map_l1, weight_L1, (c1, h, w, c0),
                    (Cin_blocks, kh, kw, params["cout_split_factor"], c0),
                    params['stride_list'], params['pad_list'],
                    params['dilation_list'], params['pad_value'])

                tik_instance.fixpipe(
                    dst_gm[n_index, (cout_o * pre_core_cout +
                                     params["cout_split_factor"] * cout_i) //
                           (32 // DTYPE_SIZE[params['dst_gm_type']]), 0, 0, 0],
                    dst_l0c,
                    params["cout_split_factor"] // 16,
                    ho * wo * 16 * DTYPE_SIZE[params['dst_l0c_type']] // 32,
                    0,
                    0,
                    extend_params={
                        "bias": None,
                        "quantize_params": params["quantize_params"]
                    })

    tik_instance.BuildCCE(kernel_name=params["kernel_name"],
                          inputs=[fm_gm, weight_gm],
                          outputs=[dst_gm])

    return tik_instance
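A hypothetical params dict covering exactly the keys conv2d_tik_compute reads; every value is illustrative, not a validated configuration:

conv2d_params = {
    "arch": "v100", "version": "mini", "kernel_name": "conv2d_tik",
    "fm_shape": (1, 4, 56, 56, 16),       # N, C1, H, W, C0
    "weight_shape": (4, 3, 3, 64, 16),    # C1, KH, KW, COUT, C0
    "stride_list": (1, 1), "dilation_list": (1, 1),
    "pad_list": (1, 1, 1, 1),             # top, bottom, left, right
    "fm_dtype": "float16", "weight_type": "float16",
    "dst_gm_type": "float16", "dst_l0c_type": "float32",
    "pad_value": 0, "cout_split_factor": 16, "quantize_params": None,
}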
Example #27
0
def decode_boundaries_target(boundary_predictions, anchors, boundary_encoded,
                             kernel_name="cce_decode_boundaries_target_fpLINE"):
    """
    calculating data

    Parameters
    ----------
    boundary_predictions : dict
        shape and dtype of input
    anchors : dict
        shape and dtype of input
    boundary_encoded : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "decode_boundaries_target"

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    input_info = InputInfo(
        shape_boundary_predictions=boundary_predictions.get("shape"),
        shape_anchors=anchors.get("shape"),
        dtype_boundary_predictions=boundary_predictions.get("dtype").lower(),
        dtype_anchors=anchors.get("dtype").lower()
    )
    input_info.set_nmax(n_max=NMAX)
    output_info = Output()

    total_handling_times, last_handling_n = check_input(
        boundary_predictions=boundary_predictions,
        anchors=anchors,
        boundary_encoded=boundary_encoded,
        n_max=input_info.n_max)
    tik_instance = tik.Tik(tik.Dprofile(), True)
    # tensor init
    data_boundary_predictions, data_anchors, \
        data_z = get_gm(tik_instance=tik_instance,
                        dtype=input_info.dtype_anchors,
                        shape1=input_info.shape_boundary_predictions,
                        shape2=input_info.shape_anchors,
                        name1="data_boundary_predictions",
                        name2="data_anchors",
                        name3="data_z",
                        scope=tik.scope_gm)

    if total_handling_times > 0:
        with tik_instance.for_range(0, total_handling_times) as current_handling_times:
            # current_handling_times:
            output_info.set_burst_num(burst_num=input_info.n_max)

            # number of LINE*LINE
            output_info.update(
                n_vector=int_ceil_div(output_info.burst_num, MATRIX_NUM),
                n_matrix=int_ceil_div(output_info.burst_num * FOUR, MATRIX_NUM)
            )
            output_info.update(
                shape_vector=(output_info.n_vector, LINE, LINE),
                shape_matrix=(output_info.n_matrix * FOUR, LINE, LINE)
            )

            # number of bursts to move x_gm / y_gm to UB
            input_info.update(
                burst_x=int_ceil_div(output_info.burst_num, LINE),
                burst_y=int_ceil_div(output_info.burst_num * FOUR, LINE)
            )

            output_info.update(
                rep=output_info.burst_num // VECTOR,
                overflow=0
            )

            process_calculate(tik_instance=tik_instance,
                              input_info=input_info,
                              output_info=output_info,
                              current_handling_times=current_handling_times,
                              data_boundary_predictions=data_boundary_predictions,
                              data_anchors=data_anchors,
                              data_z=data_z)

    current_handling_times = total_handling_times
    if last_handling_n > 0:
        output_info.set_burst_num(burst_num=last_handling_n)

        # number of LINE*LINE
        output_info.update(
            n_vector=int_ceil_div(output_info.burst_num, MATRIX_NUM),
            n_matrix=int_ceil_div(output_info.burst_num * FOUR, MATRIX_NUM)
        )
        output_info.update(
            shape_vector=(output_info.n_vector, LINE, LINE),
            shape_matrix=(output_info.n_matrix * FOUR, LINE, LINE)
        )

        # number of bursts to move x_gm / y_gm to UB
        input_info.update(
            burst_x=int_ceil_div(output_info.burst_num, LINE),
            burst_y=int_ceil_div(output_info.burst_num * FOUR, LINE)
        )

        output_info.update(
            rep=0,
            overflow=output_info.burst_num - VECTOR *
            (output_info.burst_num // VECTOR)
        )

        process_end(tik_instance=tik_instance,
                    input_info=input_info,
                    output_info=output_info,
                    current_handling_times=current_handling_times,
                    data_boundary_predictions=data_boundary_predictions,
                    data_anchors=data_anchors,
                    data_z=data_z)

    # build_cce
    tik_instance.BuildCCE(
        kernel_name=kernel_name,
        inputs=[data_boundary_predictions, data_anchors],
        outputs=[data_z])

    return tik_instance
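int_ceil_div above is assumed to be the usual ceiling-division helper, roughly:

def int_ceil_div(value, factor):
    # ceiling division without floats: int_ceil_div(10, 3) == 4
    return (value + factor - 1) // factor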
Example #28
0
    def __init__(self, var, indices, updates, var_out, nd_flag, kernel_name,
                 compute_type):
        """
        Init scatter base parameters

        Parameters
        ----------
        var: dict
            data of input
            datatype supports float32,float16,int32,int8,uint8
        indices: dict
            data of indices
            datatype supports int32
        updates: dict
            data of updates
            datatype supports float32,float16,int32,int8,uint8
        var_out: dict
            data of input
        nd_flag: bool
            if this op is nd operator
        kernel_name: str
            the name of the operator
        compute_type: str
            the compute type of scatter
        Returns
        -------
        None
        """
        self.tik_instance = tik.Tik(tik.Dprofile())
        self.nd_flag = nd_flag
        self.var_shape = var.get("shape")
        self.var_dtype = var.get("dtype").lower()
        self.indices_shape = indices.get("shape")
        self.indices_dtype = indices.get("dtype").lower()
        self.updates_shape = updates.get("shape")
        self.updates_dtype = updates.get("dtype").lower()
        self.var_ele_num = functools_reduce(lambda x, y: x * y, self.var_shape)
        self.indices_num = functools_reduce(lambda x, y: x * y,
                                            self.indices_shape)
        self.updates_num = functools_reduce(lambda x, y: x * y,
                                            self.updates_shape)
        self.kernel_name = kernel_name

        if self.indices_shape == (1,) and \
                len(self.var_shape)-len(self.updates_shape) == 1:
            if not nd_flag:
                self.updates_shape = (1, ) + self.updates_shape

        self.check_param(var_out)
        if nd_flag:
            if self.indices_shape[-1] == len(self.var_shape):
                self.update_data_num = 1
            else:
                self.update_data_num = functools_reduce(
                    lambda x, y: x * y,
                    self.var_shape[self.indices_shape[-1]:])
            self.max_indice = functools_reduce(
                lambda x, y: x * y, self.var_shape[0:self.indices_shape[-1]])
            self.index_dims = self.indices_shape[-1]
        else:
            if len(self.var_shape) > 1:
                self.update_data_num = functools_reduce(
                    lambda x, y: x * y, self.var_shape[1:])
            else:
                self.update_data_num = 1
            self.max_indice = self.var_shape[0]
            self.index_dims = 1

        self.compute_type = compute_type

        self.ub_size_bytes = (
            tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.UB_SIZE) -
            8192)
        self.var_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(
            self.var_dtype) // 8
        self.indices_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(
            self.indices_dtype) // 8
        self.var_data_each_block = 32 // self.var_dtype_bytes_size
        self.indices_data_each_block = 32 // self.indices_dtype_bytes_size
        self.indices_ub_number = 0
        self.updates_ub_number = 0

        self.index_loop_num = 0

        self.max_num_one_repeat = 128
        if self.var_dtype in ("float32", "int32"):
            self.max_num_one_repeat = 64

        if self.update_data_num < self.var_data_each_block:
            self.block_num = 1
        else:
            ai_core_num = tbe_platform.cce_conf.get_soc_spec(
                tbe_platform.cce_conf.CORE_NUM)
            self.indice_step = math.ceil(self.max_indice / ai_core_num)
            self.block_num = math.ceil(self.max_indice / self.indice_step)
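            # e.g. max_indice = 1000 on a 32-core SoC:
            #   indice_step = ceil(1000 / 32) = 32 indices per core,
            #   block_num   = ceil(1000 / 32) = 32 cores launched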

        self.var_gm = self.tik_instance.Tensor(self.var_dtype,
                                               self.var_shape,
                                               name="var_gm",
                                               scope=tik.scope_gm)
        self.indices_gm = self.tik_instance.Tensor(self.indices_dtype,
                                                   self.indices_shape,
                                                   name="indices_gm",
                                                   scope=tik.scope_gm)
        self.updates_gm = self.tik_instance.Tensor(self.updates_dtype,
                                                   self.updates_shape,
                                                   name="updates_gm",
                                                   scope=tik.scope_gm)
        self.out_gm = self.tik_instance.Tensor(self.var_dtype,
                                               self.var_shape,
                                               name="out_gm",
                                               scope=tik.scope_gm)

        self.vconv_dst_dtype = "float16"

        self.init_ub_tensor_para()
        self.var_vconv_ub = None
        self.updates_vconv_ub = None
        self.var_tile_vconv_ub = None
        self.updates_tile_vconv_ub = None

        self.var_ub = None
        self.updates_ub = None
        self.indices_ub = None
        self.var_tile_ub = None
        self.updates_tile_ub = None

        self.var_read_index = None
        self.updates_read_index = None
        self.indices_loop_index = None
        self.indices_tmp = None
Example #29
0
def CusMatMulCubeDenseRight(input_x1,
                            input_x2,
                            input_x3,
                            bias=None,
                            output_y={},
                            trans_a=False,
                            trans_b=False,
                            kernel_name="matmulcube"):
    """CusMatMulCubeDenseRight"""
    shape_a_temp = (128, 63, 16, 16)
    shape_b_temp = (128, 128, 16, 16)
    shape_output = output_y.get("shape")
    matrix_max_shape = (1, )
    support_shape = [
        (shape_a_temp, shape_b_temp, matrix_max_shape),
    ]
    shape_a_input = input_x1.get("shape")
    shape_b_input = input_x2.get("shape")
    matrix_max_input = input_x3.get("shape")
    input_shape = (tuple(shape_a_input), tuple(shape_b_input),
                   tuple(matrix_max_input))
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" %
                           str(input_shape))

    if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and shape_b_temp[
            0] == 128 and shape_b_temp[1] == 128:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
        else:
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
        input_x1 = tik_instance.Tensor("float16",
                                       shape_a_temp,
                                       name="left_matrix",
                                       scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16",
                                       shape_b_temp,
                                       name="right_matrix",
                                       scope=tik.scope_gm)
        input_x3 = tik_instance.Tensor("float32", [
            1,
        ],
                                       name="matrix_max",
                                       scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float32",
                                        shape_output,
                                        name="output",
                                        scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            core_m_idx = block_index // 16
            core_n_idx = block_index % 16
            matrix_max_scalar = tik_instance.Scalar("float32")
            matrix_max_local_UB = tik_instance.Tensor(
                "float32", (8, ),
                scope=tik.scope_ubuf,
                name="matrix_max_local_UB")
            tik_instance.data_move(matrix_max_local_UB, input_x3, 0, 1, 1, 0,
                                   0)
            matrix_max_scalar.set_as(matrix_max_local_UB[0])

            resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128, ),
                                                     scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB1 = tik_instance.Tensor(
                "float32", (240 * 128, ),
                scope=tik.scope_ubuf,
                name="resMatmul_local_UB1")

            resMatmul_local_UB_local_L0C = tik_instance.Tensor(
                "float32", (256 * 128, ),
                scope=tik.scope_cc,
                name="resMatmul_local_UB_local_L0C")
            resMatmul_local_UB_local_L0C1 = tik_instance.Tensor(
                "float32", (240 * 128, ),
                scope=tik.scope_cc,
                name="resMatmul_local_UB_local_L0C1")

            input_1_local_L1_local_L0A = tik_instance.Tensor(
                "float16", (256 * 128, ),
                scope=tik.scope_ca,
                name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16, ),
                                                   scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_2_local_L11 = tik_instance.Tensor("float16",
                                                    (8 * 128 * 16, ),
                                                    scope=tik.scope_cbuf,
                                                    name="input_2_local_L11")

            input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16, ),
                                                   scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_1_local_L11 = tik_instance.Tensor("float16",
                                                    (8 * 240 * 16, ),
                                                    scope=tik.scope_cbuf,
                                                    name="input_1_local_L11")

            input_2_local_L1_local_L0B = tik_instance.Tensor(
                "float16", (128 * 128, ),
                scope=tik.scope_cb,
                name="input_2_local_L1_local_L0B")
            input_2_local_L1_local_L0B1 = tik_instance.Tensor(
                "float16", (128 * 128, ),
                scope=tik.scope_cb,
                name="input_2_local_L1_local_L0B1")

            with tik_instance.if_scope(core_m_idx == 0):
                with tik_instance.for_range(0, 2) as cc1:
                    tik_instance.data_move(
                        input_2_local_L1,
                        input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0,
                        8, 128, 1920, 0)
                    tik_instance.data_move(
                        input_1_local_L1,
                        input_x1[core_n_idx * 129024 + cc1 * 4096], 0, 8, 256,
                        752, 0)
                    with tik_instance.for_range(0, 8) as cc10:
                        tik_instance.load2dv1(
                            input_2_local_L1_local_L0B[cc10 * 2048],
                            input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True)
                    with tik_instance.for_range(0, 16) as cc101:
                        tik_instance.load2dv1(
                            input_1_local_L1_local_L0A[cc101 * 2048],
                            input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False)

                    tik_instance.mmad(resMatmul_local_UB_local_L0C,
                                      input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 256, 128,
                                      128, 0)
                    tik_instance.data_move(resMatmul_local_UB,
                                           resMatmul_local_UB_local_L0C, 0, 1,
                                           128, 0, 0)
                    tik_instance.vmuls(64, resMatmul_local_UB,
                                       resMatmul_local_UB, matrix_max_scalar,
                                       255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[255 * 64],
                                       resMatmul_local_UB[255 * 64],
                                       matrix_max_scalar, 255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[510 * 64],
                                       resMatmul_local_UB[510 * 64],
                                       matrix_max_scalar, 2, 1, 1, 8, 8)

                    tik_instance.data_move(
                        resMatmul[core_n_idx * 129024 + cc1 * 4096],
                        resMatmul_local_UB, 0, 8, 512, 0, 1504)
            with tik_instance.else_scope():
                tik_instance.data_move(
                    input_2_local_L1,
                    input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8,
                    128, 1920, 0)
                tik_instance.data_move(
                    input_1_local_L1, input_x1[core_n_idx * 129024 + 2 * 4096],
                    0, 8, 256, 752, 0)
                with tik_instance.for_range(0, 8) as cc10:
                    tik_instance.load2dv1(
                        input_2_local_L1_local_L0B[cc10 * 2048],
                        input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True)
                with tik_instance.for_range(0, 16) as cc101:
                    tik_instance.load2dv1(
                        input_1_local_L1_local_L0A[cc101 * 2048],
                        input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False)

                tik_instance.mmad(resMatmul_local_UB_local_L0C,
                                  input_1_local_L1_local_L0A,
                                  input_2_local_L1_local_L0B, 256, 128, 128, 0)
                tik_instance.data_move(resMatmul_local_UB,
                                       resMatmul_local_UB_local_L0C, 0, 1, 128,
                                       0, 0)
                tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB,
                                   matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[255 * 64],
                                   resMatmul_local_UB[255 * 64],
                                   matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[510 * 64],
                                   resMatmul_local_UB[510 * 64],
                                   matrix_max_scalar, 2, 1, 1, 8, 8)

                tik_instance.data_move(
                    resMatmul[core_n_idx * 129024 + 2 * 4096],
                    resMatmul_local_UB, 0, 8, 512, 0, 1504)

                tik_instance.data_move(
                    input_2_local_L11,
                    input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8,
                    128, 1920, 0)
                tik_instance.data_move(input_1_local_L11,
                                       input_x1[core_n_idx * 129024 + 12288],
                                       0, 8, 240, 768, 0)

                with tik_instance.for_range(0, 8) as cc102:
                    tik_instance.load2dv1(
                        input_2_local_L1_local_L0B1[cc102 * 2048],
                        input_2_local_L11[cc102 * 256], 0, 8, 8, 0, True)
                with tik_instance.for_range(0, 16) as cc103:
                    tik_instance.load2dv1(
                        input_1_local_L1_local_L0A[cc103 * 2048],
                        input_1_local_L11[cc103 * 256], 0, 8, 15, 0, False)

                tik_instance.mmad(resMatmul_local_UB_local_L0C1,
                                  input_1_local_L1_local_L0A,
                                  input_2_local_L1_local_L0B1, 240, 128, 128,
                                  0)
                tik_instance.data_move(resMatmul_local_UB1,
                                       resMatmul_local_UB_local_L0C1, 0, 1,
                                       120, 0, 0)

                tik_instance.vmuls(64, resMatmul_local_UB1,
                                   resMatmul_local_UB1, matrix_max_scalar, 255,
                                   1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB1[255 * 64],
                                   resMatmul_local_UB1[255 * 64],
                                   matrix_max_scalar, 225, 1, 1, 8, 8)

                tik_instance.data_move(resMatmul[core_n_idx * 129024 + 12288],
                                       resMatmul_local_UB1, 0, 8, 480, 0, 1536)

        tik_instance.BuildCCE(kernel_name=kernel_name,
                              inputs=[input_x1, input_x2, input_x3],
                              outputs=[resMatmul])
        return tik_instance
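Stripped of the fractal (C1, N1, 16, 16) tiling, the kernel's arithmetic amounts to a matmul scaled by the scalar read from input_x3; a NumPy sketch under that assumption:

import numpy as np

def matmul_dense_right_reference(a, b, matrix_max):
    # 2-D view of the kernel: (A @ B) scaled by the scalar from matrix_max
    return (a @ b) * matrix_max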
Example #30
0
def CusTranspose02314(input_x, output, kernel_name="transpose021354"):
    """CusTranspose02314"""
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    perm = (0, 2, 3, 1, 4)
    input_x_shape = tuple(input_x_shape)
    support_shape = [(32, 128, 7, 7, 16),
                     (32, 32, 7, 7, 16),
                     (32, 32, 14, 14, 16),
                     (32, 64, 14, 14, 16),
                     (32, 16, 14, 14, 16),
                     (32, 16, 28, 28, 16),
                     (32, 32, 28, 28, 16),
                     (32, 8, 28, 28, 16),
                     (32, 8, 56, 56, 16),
                     (32, 16, 56, 56, 16),
                     (32, 4, 56, 56, 16),
                     (32, 4, 112, 112, 16)]
    if input_x_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_x_shape))

    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))

    input_x = tik_instance.Tensor("float16", input_x_shape, name="input_x", scope=tik.scope_gm)
    res = tik_instance.Tensor("float16", output_shape, name="res", scope=tik.scope_gm)

    dtype = "float16"
    if tuple(input_x_shape) == (32, 4, 112, 112, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 14) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    zero = tik_instance.Scalar(dtype="float16", init_value=0)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 802816 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448,
                                           12096, 0)
                    with tik_instance.for_range(0, 448) as cc7:
                        with tik_instance.for_range(0, 4) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16],
                                               input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 4, 56, 56, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 3) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 200704 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448,
                                           2688, 0)
                    with tik_instance.for_range(0, 448) as cc7:
                        with tik_instance.for_range(0, 4) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16],
                                               input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)

            input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf)
            T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2",
                                                        scope=tik.scope_ubuf)
            tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 43008], 0, 4, 448, 2688, 0)
            with tik_instance.for_range(0, 448) as cc72:
                with tik_instance.for_range(0, 4) as cc82:
                    tik_instance.vadds(16, T_transpose_local_UB2[cc72 * 64 + cc82 * 16],
                                       input_1_local_UB2[7168 * cc82 + cc72 * 16], zero, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 16, 56, 56, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 14) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 802816 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112,
                                           3024, 0)
                    with tik_instance.for_range(0, 112) as cc7:
                        with tik_instance.for_range(0, 16) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16],
                                               input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 8, 56, 56, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 7) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 401408 + cc1_db * 7168 + 3584 * db_idx], 0, 8, 224, 2912,
                                           0)
                    with tik_instance.for_range(0, 224) as cc7:
                        with tik_instance.for_range(0, 16) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16],
                                               input_1_local_UB[3584 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 8, 28, 28, 16):
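        # (N=32, C1=8, H=W=28, C0=16): 100352 elements per core, 2 x 2
        # double-buffered tiles of 7 H rows x 8 C1 slices (25088 elements).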
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 2) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 100352 + cc1_db * 6272 + 3136 * db_idx],
                                           0, 8, 196, 588, 0)
                    with tik_instance.for_range(0, 196) as cc7:
                        with tik_instance.for_range(0, 8) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16],
                                               input_1_local_UB[3136 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 100352 + cc1_db * 50176 + 25088 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1568, 0, 0)
    elif tuple(input_x_shape) == (32, 32, 28, 28, 16):
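        # (N=32, C1=32, H=W=28, C0=16): 401408 elements per core, 7 x 2
        # double-buffered tiles of 2 H rows x 32 C1 slices (28672 elements).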
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 7) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 1792 + 896 * db_idx],
                                           0, 32, 56, 728, 0)
                    with tik_instance.for_range(0, 56) as cc7:
                        with tik_instance.for_range(0, 32) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 512 + cc8 * 16],
                                               input_1_local_UB[896 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 16, 28, 28, 16):
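        # (N=32, C1=16, H=W=28, C0=16): 200704 elements per core. The 3 x 2
        # tiled loop covers the first 172032 elements (6 tiles of 4 H rows);
        # the tail block below handles the rest.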
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 3) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 200704 + cc1_db * 3584 + 1792 * db_idx],
                                           0, 16, 112, 672, 0)
                    with tik_instance.for_range(0, 112) as cc7:
                        with tik_instance.for_range(0, 16) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16],
                                               input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)

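            # Tail: transpose the last 4 H rows of each C1 slice (28672
            # elements) into the final output chunk at offset 172032.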
            input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf)
            T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2",
                                                        scope=tik.scope_ubuf)
            tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 10752], 0, 16, 112, 672, 0)
            with tik_instance.for_range(0, 112) as cc7:
                with tik_instance.for_range(0, 16) as cc8:
                    tik_instance.vadds(16, T_transpose_local_UB2[cc7 * 256 + cc8 * 16],
                                       input_1_local_UB2[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0)

    elif tuple(input_x_shape) == (32, 16, 14, 14, 16):
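        # (N=32, C1=16, H=W=14, C0=16): 50176 elements per core, two
        # double-buffered halves of 7 H rows x 16 C1 slices (25088 elements).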
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", scope=tik.scope_ubuf)
                T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB",
                                                           scope=tik.scope_ubuf)
                tik_instance.data_move(input_1_local_UB, input_x[block_idx * 50176 + 1568 * db_idx], 0, 16, 98, 98, 0)
                with tik_instance.for_range(0, 98) as cc7:
                    with tik_instance.for_range(0, 16) as cc8:
                        tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16],
                                           input_1_local_UB[1568 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                tik_instance.data_move(res[block_idx * 50176 + 25088 * db_idx], T_transpose_local_UB, 0, 1, 1568, 0, 0)
    elif tuple(input_x_shape) == (32, 128, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
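        # (N=32, C1=128, H=W=7, C0=16) with explicit perm/dtype checks: each
        # inner iteration transposes one H row (7 W positions x 128 C1 slices,
        # 14336 elements); the immediate 0 in vadds again makes it a copy.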
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 7, thread_num=2) as cc1:
                input_x_ub = tik_instance.Tensor(dtype, [1, 128, 1, 7, 16], name="input_1_local_UB",
                                                 scope=tik.scope_ubuf)
                transpose_ub = tik_instance.Tensor(dtype, [1, 1, 7, 128, 16], name="transpose_local_UB",
                                                   scope=tik.scope_ubuf)
                tik_instance.data_move(input_x_ub, input_x[block_idx, 0, cc1, 0, 0], 0, 128, 7, 42, 0)
                with tik_instance.for_range(0, 7) as cc7:
                    with tik_instance.for_range(0, 128) as cc8:
                        tik_instance.vadds(16, transpose_ub[0, 0, cc7, cc8, 0], input_x_ub[0, cc8, 0, cc7, 0], 0,
                                           1, 1, 1, 0, 0)
                tik_instance.data_move(res[block_idx * 100352 + 14336 * cc1], transpose_ub, 0, 1, 896, 0, 0)

    elif tuple(input_x_shape) == (32, 32, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
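        # (N=32, C1=32, H=W=7, C0=16): the whole 25088-element batch fits in
        # UB, so each core transposes its batch in a single pass.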
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            input_x_ub = tik_instance.Tensor(dtype, [1, 32, 7, 7, 16], name="input_1_local_UB",
                                             scope=tik.scope_ubuf)
            transpose_ub = tik_instance.Tensor(dtype, [1, 7, 7, 32, 16], name="transpose_local_UB",
                                               scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, 0, 0, 0], 0, 1, 1568, 0, 0)
            with tik_instance.for_range(0, 7) as cc1:
                with tik_instance.for_range(0, 7) as cc2:
                    with tik_instance.for_range(0, 32) as cc3:
                        tik_instance.vadds(16, transpose_ub[0, cc1, cc2, cc3, 0], input_x_ub[0, cc3, cc1, cc2, 0], 0,
                                           1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 25088], transpose_ub, 0, 1, 1568, 0, 0)

    elif tuple(input_x_shape) == (32, 32, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
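        # (N=32, C1=32, H=W=14, C0=16): the per-tile work is factored into a
        # helper that handles 2 H rows (14336 elements) per call; block_idx is
        # captured from the enclosing for_range scope.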
        def _inner_compute(split_index):
            input_x_ub = tik_instance.Tensor(dtype, [1, 32, 2, 14, 16], name="input_1_local_UB",
                                             scope=tik.scope_ubuf)
            transpose_ub = tik_instance.Tensor(dtype, [1, 2, 14, 32, 16], name="transpose_local_UB",
                                               scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, split_index * 2, 0, 0], 0, 32, 28, 168, 0)
            with tik_instance.for_range(0, 2) as cc2:
                with tik_instance.for_range(0, 14) as cc3:
                    with tik_instance.for_range(0, 32) as cc4:
                        tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0],
                                           0, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 100352 + split_index * 2 * 7168], transpose_ub, 0, 1, 896, 0, 0)

        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 6, thread_num=2) as cc1:
                _inner_compute(cc1)
            _inner_compute(6)
    elif tuple(input_x_shape) == (32, 64, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
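        # (N=32, C1=64, H=W=14, C0=16): same scheme, 2 H rows (28672 elements)
        # per call; here block_idx is passed explicitly rather than captured.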
        def _inner_compute(split_index, block_idx):
            input_x_ub = tik_instance.Tensor(dtype, [1, 64, 2, 14, 16], name="input_1_local_UB",
                                             scope=tik.scope_ubuf)
            transpose_ub = tik_instance.Tensor(dtype, [1, 2, 14, 64, 16], name="transpose_local_UB",
                                               scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, split_index * 2, 0, 0], 0, 64, 28, 168, 0)
            with tik_instance.for_range(0, 2) as cc2:
                with tik_instance.for_range(0, 14) as cc3:
                    with tik_instance.for_range(0, 64) as cc4:
                        tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0],
                                           0, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 200704 + split_index * 2 * 14336], transpose_ub, 0, 1, 1792, 0, 0)

        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 6, thread_num=2) as cc1:
                _inner_compute(cc1, block_idx)
            _inner_compute(6, block_idx)

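    # Compile the kernel: input_x and res are the GM tensors declared earlier.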
    tik_instance.BuildCCE(kernel_name, inputs=[input_x], outputs=[res])
    return tik_instance