Example #1
    def __init__(self, input_param, tik_instance):
        """
        init scatter_nd parameters

        Parameters
        ----------
        input_param: a tuple of (indices, updates, output_y, shape)
               input_param[0] is indices: dict with shape and datatype,
               datatype supports int32
               input_param[1] is updates: dict with shape and datatype,
               datatype supports float32, float16, int32, int8, uint8
               input_param[2] is output_y: dict with shape and datatype,
               datatype supports float32, float16, int32, int8, uint8
               input_param[3] is shape: the output shape
        tik_instance: tik_instance

        Returns
        -------
        None
        """
        super(ScatterNd, self).__init__(input_param, tik_instance)
        updates = input_param[1]
        oneburst_num = constant.BLOCK_SIZE // common_util.get_data_size(
            input_param[0].get("dtype").lower())
        ind_shape = input_param[0].get("shape")
        indices_gm_num = get_gm_number(oneburst_num, ind_shape)
        indices_dtype = input_param[0].get("dtype").lower()

        if indices_gm_num > MAX_UB_ELEMENT_NUMBER and MAX_UB_ELEMENT_NUMBER % \
                ind_shape[-1] != 0:
            ind_ub_size = (MAX_UB_ELEMENT_NUMBER //
                           ind_shape[-1]) * ind_shape[-1]
            last_ub = indices_gm_num % ind_ub_size

            if last_ub % oneburst_num != 0:
                last_size = (last_ub // oneburst_num + 1) * oneburst_num
                indices_gm_num = indices_gm_num - last_ub + last_size
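                # e.g. (hypothetical) int32 indices: oneburst_num = 32 // 4
                # = 8 elements per burst; a tail of 12 leftover elements is
                # padded to (12 // 8 + 1) * 8 = 16 so the last burst of the
                # GM tensor stays 32B aligned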

        self.input_indices_gm = \
            self.tik_instance.Tensor(indices_dtype, (indices_gm_num,),
                                     name="input_indices_gm",
                                     scope=tik.scope_gm)
        oneburst_num = constant.BLOCK_SIZE // common_util.get_data_size(
            updates.get("dtype").lower())
        update_gm_num = self.get_last_alignment_gm_num(
            self.updates.get("shape"), oneburst_num)

        updates_dtype = updates.get("dtype").lower()
        self.input_updates_gm = \
            self.tik_instance.Tensor(updates_dtype, (update_gm_num,),
                                     name="input_updates_gm",
                                     scope=tik.scope_gm)

        out_gm_num = self.get_last_alignment_gm_num(input_param[3],
                                                    oneburst_num)
        self.output_y_gm = self.tik_instance.Tensor(updates_dtype,
                                                    (out_gm_num, ),
                                                    name="output_y_gm",
                                                    scope=tik.scope_gm)
Example #2
    def __init__(self,
                 x,
                 boxes,
                 box_index,
                 crop_size,
                 y,
                 extrapolation_value,
                 method):
        """
        Init CropAndResize base parameters

        Returns
        -------
        None
        """
        self.image_shape = x.get("shape")
        self.image_type = x.get("dtype")
        self.boxes_shape = boxes.get("shape")
        self.boxes_type = boxes.get("dtype")
        self.boxes_index_shape = box_index.get("shape")
        self.boxes_index_type = box_index.get("dtype")
        self.crop_size = crop_size
        self.extrapolation_value = extrapolation_value
        self.method = method
        self.output_shape = y.get("shape")
        self.output_type = y.get("dtype")

        # init tik_instance
        self.tik_instance = tik.Tik()
        self.aicore_num = \
            tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.CORE_NUM)
        self.input_gm_list = []
        self.output_gm_list = []

        # parsing input
        self.crop_height, self.crop_width = crop_size
        (self.batch_size, self.image_c1, self.image_height,
         self.image_width, self.image_c0) = self.image_shape
        self.num_boxes, _ = self.boxes_shape
        byte_num_one = common_util.get_data_size(self.image_type)
        self.image_block_num = 32 // byte_num_one
        self.image_vector_num = self.image_block_num*8
        byte_num_one = common_util.get_data_size(self.boxes_type)
        self.boxes_block_num = 32 // byte_num_one
        self.boxes_vector_num = self.boxes_block_num*8
        self.block_num = self.boxes_block_num
        self.vector_num = self.boxes_vector_num
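        # e.g. (hypothetical) float16 boxes (2 bytes each):
        # boxes_block_num = 32 // 2 = 16 elements per 32B block and
        # boxes_vector_num = 16 * 8 = 128 elements per vector instruction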

        self.index_ub = None
        self.height_mask_list = None
        self.width_mask_list = None
Example #3
def sort_within_ub(instance: tik.Tik, src, cols):
    """
    sort_within_ub
    """
    with instance.new_stmt_scope():
        dst = instance.Tensor(src.dtype,
                              src.shape,
                              scope=tik.scope_ubuf,
                              name="ub_sort_within")
        _vrpsort16(instance, dst, src, cnt=cols)
        if cols > 16:
            result_ub = _merge_region(instance,
                                      out_ub=src,
                                      dst=src,
                                      src=dst,
                                      rows=1,
                                      cols=cols)
        else:
            result_ub = dst

        if result_ub.name != src.name:
            burst = math.ceil(cols * src.shape[1] *
                              common_util.get_data_size(src.dtype) / 32)
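            # e.g. (hypothetical) cols = 32 proposals of src.shape[1] = 8
            # fp16 elements each: burst = ceil(32 * 8 * 2 / 32) = 16 blocks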
            instance.data_move(src, result_ub, 0, 1, burst, 0, 0)
    return src
Example #4
    def __init__(self, input_param, tik_instance):
        """
        init scatter_nd base parameters

        Parameters
        ----------
        input_param: a tuple of (indices, updates, output_y, shape)
               input_param[0] is indices: dict with shape and datatype,
               datatype supports int32
               input_param[1] is updates: dict with shape and datatype,
               datatype supports float32, float16, int32, int8, uint8
               input_param[2] is output_y: dict with shape and datatype,
               datatype supports float32, float16, int32, int8, uint8
               input_param[3] is shape: the output shape
        tik_instance: tik_instance

        Returns
        -------
        None
        """
        self.tik_instance = tik_instance
        self.indices = input_param[0]
        self.updates = input_param[1]
        self.output_y = input_param[2]
        self.shape = input_param[3]
        self.data_size = common_util.get_data_size(
            self.updates.get("dtype").lower())
Example #5
    def __init__(self, input_values, axis, kernel_name):
        self.tik_instance = tik.Tik()
        self.tik_profiling = tik.Dprofile()
        self.tiling_param = self.TilingParam(input_values, self.tik_instance)
        self.aicore_num = self.tik_profiling.get_aicore_num()
        self.kernel_name = kernel_name
        self.axis = axis

        self.dtype = input_values[0].get("dtype").lower()
        self.output_shape = (MAX_SIZE, )
        self.input_shape = (MAX_SIZE, )

        self.input_tensors, self.output_tensor = self._init_gm_tensor(
            self.input_shape, self.output_shape, len(input_values), self.dtype)

        dtype_bytes_size = common_util.get_data_size(self.dtype)
        self.ele_each_block = constant.BLOCK_SIZE // dtype_bytes_size
        valid_ub_size = self.tik_profiling.get_unified_buffer_size()
        valid_ub_size -= self.tiling_param.need_ub_size()
        self.ub_buffer_length = valid_ub_size

        # reserve one block for data that is not 32B aligned
        self.ub_buffer_length -= constant.BLOCK_SIZE

        # make ub_buffer_length 32B aligned
        self.ub_buffer_length //= constant.BLOCK_SIZE
        self.ub_buffer_length *= constant.BLOCK_SIZE

        self.ub_buffer_length //= dtype_bytes_size
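        # e.g. (hypothetical) valid_ub_size = 100000 bytes with float32 data:
        # reserving one block leaves 99968 bytes, already 32B aligned, and
        # 99968 // 4 = 24992 elements of ub_buffer_length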
Example #6
    def __init__(self, input_x, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask,
                 kernel_name="strided_slice"):
        self.strides = strides
        self.begin_mask = begin_mask
        self.end_mask = end_mask
        self.ellipsis_mask = ellipsis_mask
        self.new_axis_mask = new_axis_mask
        self.shrink_axis_mask = shrink_axis_mask
        self.kernel_name = kernel_name

        inst = tik.Tik()
        self.tik_instance = inst
        self.tik_profiling = tik.Dprofile()
        self.tiling_param = self.TilingParam(input_x.get("shape"), inst)
        self.dtype = input_x.get("dtype").lower()
        self.dtype_size = common_util.get_data_size(self.dtype)
        self.input_gm = inst.Tensor(self.dtype, (MAX_SIZE,), name="input_gm", scope=tik.scope_gm)
        self.begin_gm = inst.Tensor(self.dtype, (MAX_SIZE,), name="begin_gm", scope=tik.scope_gm)
        self.end_gm = inst.Tensor(self.dtype, (MAX_SIZE,), name="end_gm", scope=tik.scope_gm)
        self.strides_gm = inst.Tensor(self.dtype, (MAX_SIZE,), name="strides_gm", scope=tik.scope_gm)
        self.output_gm = inst.Tensor(self.dtype, (MAX_SIZE,), name="output_gm", scope=tik.scope_gm)
        self.aicore_num = self.tik_profiling.get_aicore_num()
        self.block_element = constant.BLOCK_SIZE // self.dtype_size
        self.reserve_ub_size = 0
        self.ub_size = (self.tik_profiling.get_unified_buffer_size() // self.dtype_size // self.block_element *
                        self.block_element) - self.reserve_ub_size
        self.max_gap = 65535 * self.block_element
        self.max_last_dim = (self.max_gap + self.ub_size) // self.block_element
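        # max_gap assumes 65535 blocks is the largest gap a single data_move
        # stride field can express; max_last_dim is then the largest last
        # dimension that one move plus the UB staging buffer can cover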
Example #7
def get_blockdim_and_loop_cycle(updates, shape_out, update_each_size):
    """
    get blockdim and loop cycle

    Parameters
    ----------
    updates: dict with shape and datatype; datatype supports float32,
       float16, int32, int8, uint8
    shape_out: dict with shape and datatype; datatype supports float32,
       float16, int32, int8, uint8
    update_each_size: the number of elements in each update

    Returns
    -------
    blockdim and the loop cycle each core processes
    """
    blockdim = tbe_platform.cce_conf.get_soc_spec(
        tbe_platform.cce_conf.CORE_NUM)
    data_size = common_util.get_data_size(updates.get("dtype").lower())
    output_total = scatter_nd_d_help.get_shape_total_number(shape_out)
    output_splits = output_total // update_each_size

    # if update_each_size is less than 32B, use a single core: blocks that
    # are not 32B aligned could be overwritten when several cores write
    # concurrently
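    # e.g. (hypothetical) float16 updates with update_each_size = 8:
    # 8 * 2 = 16 bytes < 32 bytes, so a single core is used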
    if update_each_size * data_size < constant.BLOCK_SIZE:
        return 1, output_splits
    if output_splits < blockdim:
        return output_splits, 1

    return blockdim, output_splits // blockdim
Example #8
    def __init__(self, grad, argmax, input_x, ksize, strides, padding,
                 dilation, ceil_mode):
        """
        init compare and bit pack base parameters
        Parameters
        ----------
        input_x: input of maxpool, unused for maxpool grad
        grad: input of maxpoolgrad or output of maxpool
        argmax: output mask or index of maxpool
        strides: stride, minimum length is 4,
                 just like [1, poolingStrideH, poolingStrideW, 1]
        padding: pad mode, only supports "SAME" or "VALID"
        Returns
        -------
        None
        """
        self.blocknum = tbe_platform.cce_conf.get_soc_spec(
            tbe_platform.cce_conf.CORE_NUM)
        self.ub_size = tbe_platform.cce_conf.get_soc_spec(
            tbe_platform.cce_conf.UB_SIZE)

        self.input_gard_shape = grad.get("shape")
        self.argmax_shape = argmax.get("shape")
        self.y_shape = input_x.get("shape")
        self.dtype = grad.get("dtype").lower()
        self.dtype_size = common_util.get_data_size(self.dtype)
        self.nc1 = 1
        self.block = self.input_gard_shape[0] * self.input_gard_shape[1]
        self.tik_instance = tik.Tik()
        self.ksize = ksize
        self.strides = strides
        self.padding = padding
        self.ceil_mode = ceil_mode
        self.dilation = dilation
        dyh, dyw = self.input_gard_shape[2:4]
        dxh, dxw = self.y_shape[2:4]
        strideh, stridew = self.strides[1:3]
        windowh, windoww = self.ksize[1:3]
        pad_h, pad_w = self.padding[1:3]
        if self.ceil_mode is False:
            pad_top = pad_h
            pad_bottom = pad_h
            pad_left = pad_w
            pad_right = pad_w
        else:
            pad_top = pad_h
            pad_bottom = pad_h + strideh - 1
            pad_left = pad_w
            pad_right = pad_w + stridew - 1
        self.pad = (pad_top, pad_bottom, pad_left, pad_right)

        self.hoverlap = 0
        if windowh > strideh:
            self.hoverlap = windowh - strideh
        self.woverlap = 0
        if windoww > stridew:
            self.woverlap = windoww - stridew
Example #9
    def __init__(self, input_x, output_y):
        """
        init population_count base parameters

        Parameters
        ----------
        input_x: shape and data type,datatype supports int16,uint16
        output_y: shape and data type,data type supports uint8

        Returns
        -------
        None
        """
        self.input_shape, self.input_dtype = self.get_input_params(input_x)
        self.output_shape, self.output_dtype = self.get_output_params(output_y)
        # the output dtype is always uint8
        self.output_dtype = "uint8"
        self.input_data_size = common_util.get_data_size(self.input_dtype)
        self.output_data_size = common_util.get_data_size(self.output_dtype)
        self.tik_instance = tik.Tik()
Example #10
    def __init__(self, input_dict):
        """
      init the Crop parameters

      Parameters
      ----------
        input_dict: a dict, the keys are as follows:
                x1: dict, shape and datatype; datatype supports int8, uint8,
                    int16, uint16, int32, uint32, int64, uint64, float16,
                    float32
                x2: dict, shape and datatype; datatype supports int8, uint8,
                    int16, uint16, int32, uint32, int64, uint64, float16,
                    float32
                y: dict, shape and datatype; datatype supports int8, uint8,
                    int16, uint16, int32, uint32, int64, uint64, float16,
                    float32
                axis: the axis cropping starts from
                offsets: the crop start offset of each axis
                kernel_name: cce kernel name, default value is "crop"
      Returns
      -------
      None
      """
        self.instance = tik.Tik(tik.Dprofile())
        self.dtype = input_dict.get("x1").get("dtype").lower()
        self.dsize = common_util.get_data_size(self.dtype)
        total_size = tbe_platform.cce_conf.get_soc_spec(
            tbe_platform.cce_conf.UB_SIZE)
        ub_size = (total_size - RESERVE_SIZE) // (2 * self.dsize)
        burnest_len = constant.BLOCK_SIZE // self.dsize
        ub_size = ((ub_size + burnest_len - 1) // burnest_len) * burnest_len
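        # e.g. (hypothetical) total_size = 262144 bytes, RESERVE_SIZE = 2048
        # and float16 data (dsize = 2): ub_size = 260096 // 4 = 65024
        # elements, already a multiple of burnest_len = 32 // 2 = 16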
        self.one_max_size = ub_size
        x1_len = get_shape_total_number(input_dict.get("x1").get("shape"))
        x1_len = ((x1_len + burnest_len - 1) // burnest_len) * burnest_len
        mod = input_dict.get("y").get("shape")[-1] % burnest_len
        if mod != 0:
            x1_len = x1_len + burnest_len
        self.x1_gm = self.instance.Tensor(self.dtype, (x1_len, ),
                                          name="x1_gm",
                                          scope=tik.scope_gm)
        self.x2_gm = self.instance.Tensor(self.dtype, (32, ),
                                          name="x2_gm",
                                          scope=tik.scope_gm)
        y_len = get_shape_total_number(input_dict.get("y").get("shape"))
        y_len = ((y_len + burnest_len - 1) // burnest_len) * burnest_len
        if mod != 0:
            y_len = y_len + burnest_len
        self.y_gm = self.instance.Tensor(self.dtype, (y_len, ),
                                         name="y_gm",
                                         scope=tik.scope_gm)
        self.input_dict = input_dict
Example #11
 def __init__(self, input_data, block_size):
     """
     init space_to_depth base parameters
     Parameters
     ----------
     input_data: shape and data type,data type supports float16,float32,
                 int32,uint32,int16,uint16,int8,uint8,int64,uint64
     block_size: must be greater than one. It indicates the block size
     """
     self.input_shape = input_data.get("shape")
     self.dtype = input_data.get("dtype").lower()
     self.dtype_size = common_util.get_data_size(self.dtype)
     self.block_size = block_size
     self.tik_instance = tik.Tik(tik.Dprofile())
     self.output_shape = (self.input_shape[0],
                          self.input_shape[1] // block_size,
                          self.input_shape[2] // block_size,
                          self.input_shape[3] * block_size * block_size)
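     # e.g. (hypothetical) input_shape = (1, 4, 4, 3), block_size = 2:
     # output_shape = (1, 2, 2, 12)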
Example #12
        def __init__(self, input_values, inst: tik.Tik):
            self.tik_instance = inst
            dtype = "int64"

            # data in tiling_gm is laid out as:
            # 0---- 1----    2----          3----
            # axis, out_dim, max_inner_dim, min_inner_dim,
            # 4----                5----
            # output_inner_length, input_count
            # 6----    7----
            # reserve, reserve
            # 8----             9----
            # first_inner_dims, first_output_idx,
            # second_inner_dims, second_output_idx
            # ...
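            # e.g. for two inputs the tiling data reads (hypothetical order
            # per the layout above):
            # [axis, out_dim, max_inner_dim, min_inner_dim,
            #  output_inner_length, input_count, reserve, reserve,
            #  first_inner_dims, first_output_idx,
            #  second_inner_dims, second_output_idx]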
            self.dtype = dtype
            self.input_values = input_values
            self.axis = inst.Scalar(dtype, name="axis")
            self.out_dim = inst.Scalar(dtype, name="out_dim")
            self.max_inner_dim = inst.Scalar(dtype, name="max_inner_dim")
            self.min_inner_dim = inst.Scalar(dtype, name="min_inner_dim")
            self.output_inner_length = inst.Scalar(dtype,
                                                   name="output_inner_length")

            tiling_ub_size = max(len(input_values) * 2, 8)
            tiling_gm_size = 8 + tiling_ub_size
            tiling_gm_size = ceil_32bytes_align_count(tiling_gm_size, dtype)
            tiling_ub_size = ceil_32bytes_align_count(tiling_ub_size, dtype)
            self.tiling_ub_size = tiling_ub_size
            self.tiling_gm = inst.Tensor(dtype, (tiling_gm_size, ),
                                         name="tiling_gm",
                                         scope=tik.scope_gm)

            self._need_ub_size = (self.tiling_ub_size *
                                  common_util.get_data_size(dtype))
            self._tiling_ub = None
            self._out_dim = None
            self._inner_dim = None
Example #13
    def __init__(self, shape, axis, dtype):
        """
        init the base param of cumsum

        Parameters
        ----------
        shape: the shape of tensor
        axis: cumulative axis
        dtype: the data type of tensor

        Returns
        -------
        None

        """
        self.tik_instance = tik.Tik()
        self.each_loop = shape[axis]
        self.dsize = get_data_size(dtype)
        self.each, self.each_tail = self.get_each(shape, axis)
        self.reserved = self.get_reserved()
        self.dtype = dtype
        self.axis = axis
        self.is_last_axis = (axis == len(shape) - 1)
Example #14
 def __init__(self, input_data, shape, kernel_name):
     """
     init parallel_concat base parameters
     Parameters
     ----------
     input_data: shape and data type,data type supports float16,float32,
                 int32,uint32,int16,uint16,int8,uint8,int64,uint64
     shape: list of output shape
     kernel_name: cce kernel name, default value is "parallel_concat"
     """
     self.data_shape = []
     self.data_dtype = []
      for input_dict in input_data:
         shape_input = input_dict.get("shape")
         dtype_input = (input_dict.get("dtype")).lower()
         self.data_shape.append(shape_input)
         self.data_dtype.append(dtype_input)
     self.output_shape = shape
     self.kernel_name = kernel_name
     self.dtype_size = common_util.get_data_size(self.data_dtype[0])
     self.product_core_num = tbe_platform.cce_conf.get_soc_spec(
         tbe_platform.cce_conf.CORE_NUM)
     self.tik_instance = tik.Tik()
Example #15
    def __init__(self, input_dict):
        """
      init the ShuffleChannel parameters

      Parameters
      ----------
        input_dict: a dict, the keys are as follows:
            x: dict, shape and datatype; datatype supports int8, uint8, int16,
              uint16, int32, uint32, int64, uint64, float16, float32
            y: dict, shape and datatype; datatype supports int8, uint8, int16,
              uint16, int32, uint32, int64, uint64, float16, float32
            group: the number of groups to split the channel dimension into
            kernel_name: cce kernel name, default value is "shuffle_channel"
      Returns
      -------
      None
      """
        self.instance = tik.Tik(tik.Dprofile())
        self.dtype = input_dict.get("x").get("dtype").lower()
        self.dsize = common_util.get_data_size(self.dtype)
        total_size = tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.UB_SIZE)
        ub_size = (total_size - RESERVE_SIZE) // (2 * self.dsize)
        burnest_len = constant.BLOCK_SIZE // self.dsize
        ub_size = ((ub_size + burnest_len - 1) // burnest_len) * burnest_len
        self.one_max_size = ub_size
        x_len = get_shape_total_number(input_dict.get("x").get("shape"))
        x_len = ((x_len + burnest_len - 1) // burnest_len) * burnest_len
        hw = input_dict.get("y").get("shape")[2] * \
             input_dict.get("y").get("shape")[3]
        mod = hw % burnest_len
        if mod != 0:
            x_len = x_len + burnest_len
        self.x_gm = self.instance.Tensor(self.dtype, (x_len,), name="x_gm",
                                         scope=tik.scope_gm)
        self.y_gm = self.instance.Tensor(self.dtype, (x_len,), name="y_gm",
                                         scope=tik.scope_gm)
        self.input_dict = input_dict
Example #16
    def __init__(self,
                 shape,
                 dtype,
                 depth_radius=5,
                 bias=1,
                 alpha=1,
                 beta=0.5,
                 kernel_name="lrn_grad"):
        self.shape = shape
        dtype = dtype.lower()
        self.dtype = dtype

        self.batch = shape[0]
        self.channels = shape[1]
        self.height = shape[2]
        self.width = shape[3]
        self.depth_radius = depth_radius
        self.bias = bias
        self.alpha = alpha
        self.beta = beta
        self.kernel_name = kernel_name

        tik_instance = tik.Tik()
        self.is_mini = True
        self.ub_dtype = "float16"
        if tbe_platform.cce_conf.api_check_support("tik.vln", "float32"):
            self.is_mini = False
            self.ub_dtype = "float32"

        self.ub_dtype_size = common_util.get_data_size(self.ub_dtype)
        self.tik_instance = tik_instance
        self.aicore_num = tik_instance.d_profiling.get_aicore_num()

        gm_size = 1
        for item in shape:
            gm_size *= item

        gm_shape = (gm_size, )

        self.data_input_grads = tik_instance.Tensor(dtype,
                                                    gm_shape,
                                                    name="input_grads",
                                                    scope=tik.scope_gm)
        self.data_input_image = tik_instance.Tensor(dtype,
                                                    gm_shape,
                                                    name="input_image",
                                                    scope=tik.scope_gm)
        self.data_output_image = tik_instance.Tensor(dtype,
                                                     gm_shape,
                                                     name="output_image",
                                                     scope=tik.scope_gm)
        self.data_output = tik_instance.Tensor(dtype,
                                               gm_shape,
                                               name="output",
                                               scope=tik.scope_gm)

        need_ub_segment_count = 6
        if self.dtype != self.ub_dtype:
            if self.is_mini:
                # mini only supports float16, so float32 data must be
                # converted; sizeof(float32) == 2 * sizeof(float16), so an
                # extra buffer of 2x the float16 buffer is needed
                need_ub_segment_count += 2
            else:
                # otherwise float16 data is converted to float32 for compute;
                # the float16 staging buffer is 0.5x the float32 buffer
                need_ub_segment_count += 0.5

        self.need_ub_segment_count = need_ub_segment_count
        self.ub_segment_size = _get_ub_segment_size(need_ub_segment_count)
        self.dtype_size = common_util.get_data_size(dtype)
        self.small_hw = False
        if self.width * self.height * self.dtype_size < constant.BLOCK_SIZE:
            self.small_hw = True

        self.ub_shape = (self.ub_segment_size // self.ub_dtype_size, )
Example #17
def _gm2ub(tik_instance: tik.Tik, dest: tik.Tensor, src: tik.Tensor, count):
    dtype_size = common_util.get_data_size(src.dtype)
    burst = math.ceil(count * dtype_size / constant.BLOCK_SIZE)
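    # e.g. (hypothetical) count = 100 float16 elements:
    # burst = ceil(100 * 2 / 32) = 7 blocks of 32 bytes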
    tik_instance.data_move(dest, src, 0, 1, burst, 0, 0)
Example #18
def ceil_32bytes_align_count(count, dtype):
    type_size = common_util.get_data_size(dtype)
    block_count = math.ceil(count * type_size / constant.BLOCK_SIZE)
    return block_count * constant.BLOCK_SIZE // type_size
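
A quick numeric check of this rounding, as a plain-Python sketch (assuming
`constant.BLOCK_SIZE` is 32 bytes and `common_util.get_data_size` returns the
element byte width; `_align_demo` is a hypothetical stand-in, not part of the
original module):

import math

def _align_demo(count, type_size, block_size=32):
    """Round count up to a whole number of 32-byte blocks of elements."""
    block_count = math.ceil(count * type_size / block_size)
    return block_count * block_size // type_size

assert _align_demo(10, 2) == 16  # float16: 10 elements -> 16
assert _align_demo(33, 4) == 40  # float32: 33 elements -> 40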
Example #19
 def _data_move(self, dest: tik.Tensor, src: tik.Tensor, count: tik.Scalar):
     dtype_size = common_util.get_data_size(src.dtype)
     burst = self._ceil_div(count * dtype_size, constant.BLOCK_SIZE)
     self.tik_instance.data_move(dest, src, 0, 1, burst, 0, 0)
Example #20
    def __init__(self, bbox_tensor, img_metas, valid_tensor, kernel_name):
        self.kernel_name = kernel_name
        self.bbox_shape = bbox_tensor.get("shape")
        self.bbox_dtype = bbox_tensor.get("dtype").lower()
        self.bbox_dtype_size = common_util.get_data_size(self.bbox_dtype)

        self.valid_shape = valid_tensor.get("shape")
        self.valid_dtype_size = 1

        # select operations only handle 128 elements at a time.
        self.__default_rows_per_job = 32 * 4 * 1

        self.job_num = self.__calc_job_num()

        self.img_metas = img_metas
        self.tik_instance = tik.Tik()

        # buffer for threshold extract
        self.img_metas_gm = self.tik_instance.Tensor("float16", (16, ),
                                                     name="img_metas_gm",
                                                     scope=tik.scope_gm)
        self.img_metas_ub = self.tik_instance.Tensor("float16", (16, ),
                                                     name="img_metas_ub",
                                                     scope=tik.scope_ubuf)
        self.threshold_h = self.tik_instance.Scalar("float16", "threshold_h")
        self.threshold_w = self.tik_instance.Scalar("float16", "threshold_w")
        self.__extract_threshold_as_scalar()

        # input bbox tensor from caller (fp16: 128 per repeat; fp32: 64)
        self.bbox_tensor_gm = self.tik_instance.Tensor("float16",
                                                       self.bbox_shape,
                                                       name="bbox_tensor_gm",
                                                       scope=tik.scope_gm)

        # return buffer; the GM tensor holds the whole result
        self.data_ret_int8_gm = self.tik_instance.Tensor(
            "int8",
            self.valid_shape,
            name="data_ret_int8_gm",
            scope=tik.scope_gm)
        self.padded_bytes = 0
        self.last_job_row_aligned = self.__calc_last_job_row()

        # the maximum number of buffer rows each job uses
        self.job_buf_row = self.get_job_buffer_row()

        self.quad_flag_ub = self.tik_instance.Tensor("float16",
                                                     (self.job_buf_row * 4, ),
                                                     name="quad_flag_ub",
                                                     scope=tik.scope_ubuf)
        self.quad_flags_sum_ub = self.tik_instance.Tensor(
            "float16", (self.job_buf_row * 4, ),
            name="quad_flags_sum_ub",
            scope=tik.scope_ubuf)
        # must be re-initialized before each use!
        # this buffer is reused multiple times: it holds the threshold and
        # the transform temporary for the result.
        self.quad_threshold_ub = self.tik_instance.Tensor(
            "float16", (self.job_buf_row * 4, ),
            name="quad_threshold_ub",
            scope=tik.scope_ubuf)

        self.ones_ub = self.tik_instance.Tensor("float16",
                                                (self.job_buf_row, 4),
                                                name="ones_ub",
                                                scope=tik.scope_ubuf)
        self.zeros_ub = self.tik_instance.Tensor("float16",
                                                 (self.job_buf_row, 4),
                                                 name="zeros_ub",
                                                 scope=tik.scope_ubuf)

        _repeat_time = max(4 * self.job_buf_row // 128, 1)
        _process_elem_count = self.get_handle_num_with_clip_128(4)

        self.tik_instance.vector_dup(_process_elem_count, self.ones_ub, 1,
                                     _repeat_time, 1, 8)
        self.tik_instance.vector_dup(_process_elem_count, self.zeros_ub, 0,
                                     _repeat_time, 1, 8)

        self.data_ret_int8_ub = self.tik_instance.Tensor(
            "int8", (self.job_buf_row, 1),
            name="data_ret_int8_ub",
            scope=tik.scope_ubuf)
        self.data_ret_mask_ub = self.tik_instance.Tensor(
            "uint16", (self.job_buf_row * 4 // 16, ),
            name="data_ret_mask_ub",
            scope=tik.scope_ubuf)
        self.data_ret_ub = self.tik_instance.Tensor("float16",
                                                    (self.job_buf_row, 1),
                                                    name="data_ret_ub",
                                                    scope=tik.scope_ubuf)
        self.ret_unfold_half_ub = self.tik_instance.Tensor(
            "float16", (self.job_buf_row * 2, ),
            name="ret_unfold_half_ub",
            scope=tik.scope_ubuf)

        self.bbox_tensor_ub = self.tik_instance.Tensor("float16",
                                                       (self.job_buf_row, 4),
                                                       name="bbox_tensor_ub",
                                                       scope=tik.scope_ubuf)
Example #21
def check_param(input_dict):
    """
      check parameters

      Parameters
      ----------
      input_dict: a dict, the keys are as follows:
                  box1_info, box2_info, box3_info, biases1, biases2, biases3,
                  coords, boxes, classes, relative, obj_threshold, post_top_k,
                  nms_threshold, pre_nms_topn, max_box_number_per_batch,
                  kernel_name; for more details, please check the
                  yolov3_detection_output function

      Returns
      -------
      None
      """

    pre_nms_topn = input_dict.get("pre_nms_topn")

    if tbe_platform.cce_conf.get_soc_spec("SOC_VERSION") in (
            "Ascend310", "Ascend910", "Hi3796CV300ES"):
        op_utils.check_dtype(input_dict.get("box1_info").get("dtype"), ["float16"], param_name="box1_info")
        op_utils.check_dtype(input_dict.get("box2_info").get("dtype"), ["float16"], param_name="box2_info")
        op_utils.check_dtype(input_dict.get("box3_info").get("dtype"), ["float16"], param_name="box3_info")
    else:
        op_utils.check_dtype(input_dict.get("box1_info").get("dtype"), ["float16", "float32"], param_name="box1_info")
        op_utils.check_dtype(input_dict.get("box2_info").get("dtype"), ["float16", "float32"], param_name="box2_info")
        op_utils.check_dtype(input_dict.get("box3_info").get("dtype"), ["float16", "float32"], param_name="box3_info")

    util.check_kernel_name(input_dict.get("kernel_name"))
    coords = input_dict.get("coords")
    post_top_k = input_dict.get("post_top_k")
    if coords != 4:
        error_info = {}
        error_info['errCode'] = 'E80017'
        error_info['opname'] = 'yolo_v3_detection_output_d'
        error_info['param_name'] = 'coords'
        error_info['expect_value'] = '4'
        error_info['real_value'] = str(coords)
        raise RuntimeError(error_info, 
            "In op[%s], the parameter[%s] should be [%s], but actually is [%s]."
            % (error_info['opname'], error_info['param_name'], error_info['expect_value'],
               error_info['real_value']))
    max_box_number_per_batch = input_dict.get("max_box_number_per_batch")
    dtype = input_dict.get("box1_info").get("dtype")
    if tbe_platform.cce_conf.get_soc_spec("SOC_VERSION") in (
            "Hi3796CV300ES", "Hi3796CV300CS") \
            or dtype == constant.DATA_TYPE_FP32:
        if pre_nms_topn > PRE_NMS_TOPN // 2:
            check_param_range("pre_nms_topn", 1, PRE_NMS_TOPN // 2 - 1, pre_nms_topn)
    else:
        if pre_nms_topn > PRE_NMS_TOPN:
            check_param_range("pre_nms_topn", 1, PRE_NMS_TOPN - 1, pre_nms_topn)

    if max_box_number_per_batch > PRE_NMS_TOPN or max_box_number_per_batch <= 0:
        check_param_range("max_box_number_per_batch", 1, PRE_NMS_TOPN - 1, max_box_number_per_batch)

    if max_box_number_per_batch % 16 != 0:
        error_info = {}
        error_info['errCode'] = 'E81011'
        error_info['opname'] = 'yolo_v3_detection_output_d'
        error_info['real_value'] = str(max_box_number_per_batch)
        raise RuntimeError(error_info, 
            "In op[%s], max_box_number_per_batch should be a multiple of 16, but actually is [%s]."
            % (error_info['opname'], error_info['real_value']))

    if max_box_number_per_batch < pre_nms_topn or pre_nms_topn <= 0:
        check_param_range("pre_nms_topn", 1, max_box_number_per_batch-1, pre_nms_topn)

    if max_box_number_per_batch < post_top_k or post_top_k <= 0:
        check_param_range("post_top_k", 1, max_box_number_per_batch-1, post_top_k)

    dsize = common.get_data_size(input_dict.get("box1_info").get("dtype"))
    height = input_dict.get("box1_info").get("shape")[2]
    width = input_dict.get("box1_info").get("shape")[3]
    if height * width * dsize < constant.BLOCK_SIZE:
        raise RuntimeError(
            "box1_info's height[%d] multiplied by width[%d] must occupy "
            "at least 32B" % (height, width))

    height = input_dict.get("box2_info").get("shape")[2]
    width = input_dict.get("box2_info").get("shape")[3]
    if height * width * dsize < constant.BLOCK_SIZE:
        raise RuntimeError(
            "box2_info's height[%d] multiplied by width[%d] must occupy "
            "at least 32B" % (height, width))
    height = input_dict.get("box3_info").get("shape")[2]
    width = input_dict.get("box3_info").get("shape")[3]
    if height * width * dsize < constant.BLOCK_SIZE:
        raise RuntimeError(
            "box3_info's height[%d] multiplied by width[%d] must occupy "
            "at least 32B" % (height, width))
Example #22
def _merge_recur(instance: tik.Tik,
                 out_ub,
                 dst_ub,
                 src_ub,
                 last_dim,
                 total_region_list,
                 level,
                 region_offset=0):
    """
    _merge_recur
    merge multi sorted region proposal list to one sorted region proposal list
    """

    # vmrgsort4 can merge at most 4 sorted region lists
    def is_next_to_last_merge():
        return 1 < math.ceil(total_region_list / 4) <= 4

    loops = total_region_list // 4
    remain = total_region_list % 4
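    # e.g. (hypothetical) total_region_list = 16 at level 1: each sorted run
    # holds merge_n0 = 16 * 4**0 = 16 proposals, so loops = 4 and remain = 0;
    # the next level then merges four sorted runs of 64 proposals each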

    if is_next_to_last_merge() and dst_ub.name == out_ub.name:
        dst_ub = instance.Tensor(out_ub.dtype,
                                 out_ub.shape,
                                 scope=tik.scope_ubuf,
                                 name="ub_merge_recur")

    merge_n0 = 16 * (4**(level - 1))
    merge_n1 = merge_n0
    merge_n2 = merge_n0
    merge_n3 = merge_n0
    merge_repeat = loops
    need_tail_process = False
    if loops > 0 and remain == 0:
        if merge_n0 * 4 * loops > last_dim:
            merge_repeat = loops - 1
            n012 = merge_n0 + merge_n1 + merge_n2
            merge_left = last_dim - ((merge_n0 * 4 * (loops - 1)) + n012)
            need_tail_process = True
    if merge_repeat > 0:
        ub_offset = region_offset
        src_list = (src_ub[ub_offset], src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, merge_n2, merge_n3)
        valid_bit = 15
        instance.vmrgsort4(dst_ub[ub_offset], src_list, element_count_list,
                           False, valid_bit, merge_repeat)

    if need_tail_process:
        tail_offset = 4 * merge_n0 * merge_repeat * 8
        ub_offset = region_offset + tail_offset
        src_list = (src_ub[ub_offset], src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, merge_n2, merge_left)
        valid_bit = 15
        instance.vmrgsort4(dst_ub[ub_offset],
                           src_list,
                           element_count_list,
                           False,
                           valid_bit,
                           repeat_times=1)

    if loops > 0:
        offset = 4 * loops * 16 * (4**(level - 1))
    else:
        offset = 0

    if remain == 3:
        merge_n0 = 16 * (4**(level - 1))
        merge_n1 = merge_n0
        merge_n2 = last_dim - (offset + merge_n0 + merge_n1)
        ub_offset = region_offset + offset * 8
        src_list = (src_ub[ub_offset], src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, merge_n2, 0)
        valid_bit = 2**remain - 1
        instance.vmrgsort4(dst_ub[ub_offset],
                           src_list,
                           element_count_list,
                           False,
                           valid_bit,
                           repeat_times=1)
    elif remain == 2:
        merge_n0 = 16 * (4**(level - 1))
        merge_n1 = last_dim - (offset + merge_n0)
        ub_offset = region_offset + offset * 8
        src_list = (src_ub[ub_offset], src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, 0, 0)
        valid_bit = 2**remain - 1
        instance.vmrgsort4(dst_ub[ub_offset],
                           src_list,
                           element_count_list,
                           False,
                           valid_bit,
                           repeat_times=1)
    elif remain == 1:
        merge_n0 = last_dim - offset
        num_blocks_write = (
            merge_n0 * 8 * common_util.get_data_size(src_ub.dtype) + 31) // 32
        ub_offset = region_offset + offset * 8
        instance.data_move(dst_ub[ub_offset], src_ub[ub_offset], 0, 1,
                           num_blocks_write, 0, 0)

    next_total_region_list = math.ceil(total_region_list / 4)
    if next_total_region_list <= 1:
        return dst_ub

    if is_next_to_last_merge():
        src_ub = out_ub

    return _merge_recur(instance, out_ub, src_ub, dst_ub, last_dim,
                        next_total_region_list, level + 1, region_offset)
Example #23
def _get_rep_stride(mask, dtype):
    return mask * common_util.get_data_size(dtype) // constant.BLOCK_SIZE
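# e.g. a full vector mask of 128 float16 elements (2 bytes each):
# 128 * 2 // 32 = 8 blocks per repeat, the usual TIK repeat stride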
Example #24
    def __init__(self, input_x, ksize, strides, padding):
        """
        init MaxPoolWithargmax parameters

        Parameters
        ----------
        input_x: dict
            shape and datatype
        ksize: list or tuple
            The size of the window for each dimension of the input tensor.
        strides: list or tuple
            The stride of the sliding window of the input tensor.
        padding: str
            The type of padding algorithm to use.

        Returns
        -------
        None
        """
        self.input_shape = input_x.get("shape")
        self.input_dtype = input_x.get("dtype").lower()
        self.input_type_size = common_util.get_data_size(self.input_dtype)
        self.tik_instance = tik.Tik()

        self.ksize = ksize
        self.strides = strides
        self.padding = padding
        self.batch_size = self.input_shape[0]
        self.c1_size = self.input_shape[1]
        self.in_size_h = self.input_shape[2]
        self.in_size_w = self.input_shape[3]
        self.c_block_size = self.input_shape[4]

        self.window_h = self.ksize[1]
        self.window_w = self.ksize[2]
        self.stride_h = self.strides[1]
        self.stride_w = self.strides[2]
        self.nc1 = self.batch_size * self.c1_size
        # scalar for load3d
        self.scalar_source_h = self.tik_instance.Scalar(dtype="int64")
        self.scalar_source_w = self.tik_instance.Scalar(dtype="int64")

        # calculate pad and output size
        self.pad, self.out_size_h, self.out_size_w = \
            self._calc_out_size_and_pad()
        # output_shape
        self.fmap_img2col_h = self.out_size_h * self.out_size_w
        self.fmap_img2col_w = self.window_h * self.window_w
        self.fmap_img2col_h_num = _ceil_div(self.fmap_img2col_h,
                                            self.c_block_size)
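        # e.g. (hypothetical) out_size_h = out_size_w = 14 with a 3x3 window:
        # fmap_img2col_h = 196, fmap_img2col_w = 9 and
        # fmap_img2col_h_num = ceil(196 / 16) = 13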
        if self.input_dtype == "float16":
            self.pad_value = MIN_VALUE_FP16
        # fmap is NC1HWC0 format
        fmap_gm_shape = (self.batch_size, self.c1_size, self.in_size_h,
                         self.in_size_w, self.c_block_size)

        output_gm_shape = (self.batch_size, self.c1_size, self.out_size_h,
                           self.out_size_w, self.c_block_size)
        output_mask_gm_shape = (self.batch_size, self.c1_size,
                                self.fmap_img2col_w,
                                (self.fmap_img2col_h_num + 1) *
                                self.c_block_size)
        # input and output
        self.input_fmap_gm = self.tik_instance.Tensor(self.input_dtype,
                                                      fmap_gm_shape,
                                                      name="input_fmap_gm",
                                                      scope=tik.scope_gm)
        self.output_max_gm = self.tik_instance.Tensor(self.input_dtype,
                                                      output_gm_shape,
                                                      name="output_max_gm",
                                                      scope=tik.scope_gm)
        self.output_mask_gm = self.tik_instance.Tensor("uint16",
                                                       output_mask_gm_shape,
                                                       name="output_mask_gm",
                                                       scope=tik.scope_gm)