def __init__(self, input_param, tik_instance):
    """
    Init scatter_nd parameters and allocate the GM tensors.

    Parameters
    ----------
    input_param: a tuple of (indices, updates, output_y, shape)
        input_param[0] is indices: dict with shape and datatype,
            datatype supports int32
        input_param[1] is updates: dict with shape and datatype,
            datatype supports float32, float16, int32, int8, uint8
        input_param[2] is output_y: dict with shape and datatype,
            datatype supports float32, float16, int32, int8, uint8
        input_param[3] is shape: output shape
    tik_instance: tik_instance

    Returns
    -------
    None
    """
    super(ScatterNd, self).__init__(input_param, tik_instance)
    updates = input_param[1]
    # number of indices elements that fit in one 32-byte burst
    oneburst_num = constant.BLOCK_SIZE // common_util.get_data_size(
        input_param[0].get("dtype").lower())
    ind_shape = input_param[0].get("shape")
    indices_gm_num = get_gm_number(oneburst_num, ind_shape)
    indices_dtype = input_param[0].get("dtype").lower()
    # when the indices exceed one UB pass and a UB pass cannot hold a
    # whole number of index tuples, pad the GM element count so the last
    # partial UB chunk can still be moved in whole bursts
    if indices_gm_num > MAX_UB_ELEMENT_NUMBER and MAX_UB_ELEMENT_NUMBER % \
            ind_shape[-1] != 0:
        # largest UB chunk holding only complete index tuples
        ind_ub_size = (MAX_UB_ELEMENT_NUMBER // ind_shape[-1]) * ind_shape[-1]
        last_ub = indices_gm_num % ind_ub_size
        if last_ub % oneburst_num != 0:
            # round the tail up to a whole burst
            last_size = (last_ub // oneburst_num + 1) * oneburst_num
            indices_gm_num = indices_gm_num - last_ub + last_size
    self.input_indices_gm = \
        self.tik_instance.Tensor(indices_dtype, (indices_gm_num,),
                                 name="input_indices_gm",
                                 scope=tik.scope_gm)
    # number of updates elements per 32-byte burst
    oneburst_num = constant.BLOCK_SIZE // common_util.get_data_size(
        updates.get("dtype").lower())
    update_gm_num = self.get_last_alignment_gm_num(
        self.updates.get("shape"), oneburst_num)
    updates_dtype = updates.get("dtype").lower()
    self.input_updates_gm = \
        self.tik_instance.Tensor(updates_dtype, (update_gm_num,),
                                 name="input_updates_gm",
                                 scope=tik.scope_gm)
    out_gm_num = self.get_last_alignment_gm_num(input_param[3],
                                                oneburst_num)
    self.output_y_gm = self.tik_instance.Tensor(updates_dtype,
                                                (out_gm_num, ),
                                                name="output_y_gm",
                                                scope=tik.scope_gm)
def __init__(self, x, boxes, box_index, crop_size, y, extrapolation_value,
             method):
    """
    Init CropAndResize base parameters.

    Parameters
    ----------
    x: dict, image tensor shape and dtype
    boxes: dict, box coordinates shape and dtype
    box_index: dict, per-box batch index shape and dtype
    crop_size: (crop_height, crop_width)
    y: dict, output tensor shape and dtype
    extrapolation_value: value used outside the image range
    method: interpolation method

    Returns
    -------
    None
    """
    # raw input/output descriptions
    self.image_shape = x.get("shape")
    self.image_type = x.get("dtype")
    self.boxes_shape = boxes.get("shape")
    self.boxes_type = boxes.get("dtype")
    self.boxes_index_shape = box_index.get("shape")
    self.boxes_index_type = box_index.get("dtype")
    self.crop_size = crop_size
    self.extrapolation_value = extrapolation_value
    self.method = method
    self.output_shape = y.get("shape")
    self.output_type = y.get("dtype")
    # init tik_instance
    self.tik_instance = tik.Tik()
    self.aicore_num = \
        tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.CORE_NUM)
    self.input_gm_list = []
    self.output_gm_list = []
    # parsing input
    self.crop_height, self.crop_width = crop_size
    # assumes a 5-element image shape (N, C1, H, W, C0) -- TODO confirm
    self.batch_size, self.image_c1, self.image_height, self.image_width, self.image_c0 = self.image_shape
    self.num_boxes, _ = self.boxes_shape
    # elements per 32-byte block / per 8-block vector for the image dtype
    byte_num_one = common_util.get_data_size(self.image_type)
    self.image_block_num = 32 // byte_num_one
    self.image_vector_num = self.image_block_num*8
    # same element counts for the boxes dtype
    byte_num_one = common_util.get_data_size(self.boxes_type)
    self.boxes_block_num = 32 // byte_num_one
    self.boxes_vector_num = self.boxes_block_num*8
    # default working granularity follows the boxes dtype
    self.block_num = self.boxes_block_num
    self.vector_num = self.boxes_vector_num
    # scratch resources created later
    self.index_ub = None
    self.height_mask_list = None
    self.width_mask_list = None
def sort_within_ub(instance: tik.Tik, src, cols):
    """
    Sort the first `cols` region proposals of `src` in place.

    A scratch UB tensor receives the 16-wide presort; for more than 16
    proposals the runs are merged back toward `src`. If the sorted data
    ends up in a different buffer, it is copied back so the caller
    always gets `src`.
    """
    with instance.new_stmt_scope():
        scratch = instance.Tensor(src.dtype, src.shape,
                                  scope=tik.scope_ubuf,
                                  name="ub_sort_within")
        # sort each group of 16 proposals into the scratch buffer
        _vrpsort16(instance, scratch, src, cnt=cols)
        if cols <= 16:
            sorted_ub = scratch
        else:
            # merge the 16-element runs, targeting src
            sorted_ub = _merge_region(instance, out_ub=src, dst=src,
                                      src=scratch, rows=1, cols=cols)
        if sorted_ub.name != src.name:
            # copy the result back to src, rounding up to whole bursts
            total_bytes = (cols * src.shape[1] *
                           common_util.get_data_size(src.dtype))
            burst = math.ceil(total_bytes / 32)
            instance.data_move(src, sorted_ub, 0, 1, burst, 0, 0)
    return src
def __init__(self, input_param, tik_instance):
    """
    Init scatter_nd base parameters.

    Parameters
    ----------
    input_param: a tuple of (indices, updates, output_y, shape); the
        first three are dicts carrying shape and datatype (indices
        supports int32; updates/output_y support float32, float16,
        int32, int8, uint8), the last is the output shape
    tik_instance: tik_instance

    Returns
    -------
    None
    """
    self.tik_instance = tik_instance
    # unpack the four components of the parameter tuple
    (self.indices, self.updates,
     self.output_y, self.shape) = input_param[:4]
    # element size in bytes of the updates dtype
    updates_dtype = self.updates.get("dtype").lower()
    self.data_size = common_util.get_data_size(updates_dtype)
def __init__(self, input_values, axis, kernel_name):
    """
    Init concat parameters, GM tensors and the usable UB budget.

    Parameters
    ----------
    input_values: list of dicts with shape and dtype of the inputs
    axis: concat axis
    kernel_name: cce kernel name
    """
    self.tik_instance = tik.Tik()
    self.tik_profiling = tik.Dprofile()
    self.tiling_param = self.TilingParam(input_values, self.tik_instance)
    self.aicore_num = self.tik_profiling.get_aicore_num()
    self.kernel_name = kernel_name
    self.axis = axis
    self.dtype = input_values[0].get("dtype").lower()
    # GM tensors are over-allocated; real extents come from tiling data
    self.output_shape = (MAX_SIZE, )
    self.input_shape = (MAX_SIZE, )
    self.input_tensors, self.output_tensor = self._init_gm_tensor(
        self.input_shape, self.output_shape, len(input_values), self.dtype)
    dtype_bytes_size = common_util.get_data_size(self.dtype)
    self.ele_each_block = constant.BLOCK_SIZE // dtype_bytes_size
    # UB bytes left after the tiling data, minus one block reserved for
    # non-32-byte-aligned tails, rounded down to a whole block
    buffer_bytes = (self.tik_profiling.get_unified_buffer_size()
                    - self.tiling_param.need_ub_size()
                    - constant.BLOCK_SIZE)
    buffer_bytes = (buffer_bytes // constant.BLOCK_SIZE) * constant.BLOCK_SIZE
    # convert the byte budget to an element count
    self.ub_buffer_length = buffer_bytes // dtype_bytes_size
def __init__(self, input_x, strides, begin_mask, end_mask, ellipsis_mask,
             new_axis_mask, shrink_axis_mask, kernel_name="strided_slice"):
    """
    Init strided_slice parameters, GM tensors and UB budget.

    Parameters
    ----------
    input_x: dict, shape and dtype of the input tensor
    strides: strides of the slice
    begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask:
        strided-slice bit masks
    kernel_name: cce kernel name, default value is "strided_slice"
    """
    self.strides = strides
    self.begin_mask = begin_mask
    self.end_mask = end_mask
    self.ellipsis_mask = ellipsis_mask
    self.new_axis_mask = new_axis_mask
    self.shrink_axis_mask = shrink_axis_mask
    self.kernel_name = kernel_name
    inst = tik.Tik()
    self.tik_instance = inst
    self.tik_profiling = tik.Dprofile()
    self.tiling_param = self.TilingParam(input_x.get("shape"), inst)
    self.dtype = input_x.get("dtype").lower()
    self.dtype_size = common_util.get_data_size(self.dtype)
    # GM tensors are over-allocated to MAX_SIZE; real extents come from
    # the tiling data
    self.input_gm = inst.Tensor(self.dtype, (MAX_SIZE,),
                                name="input_gm",
                                scope=tik.scope_gm)
    # NOTE(review): begin/end/strides hold indices but are declared with
    # the input data dtype -- confirm this is intended
    self.begin_gm = inst.Tensor(self.dtype, (MAX_SIZE,),
                                name="begin_gm",
                                scope=tik.scope_gm)
    self.end_gm = inst.Tensor(self.dtype, (MAX_SIZE,),
                              name="end_gm",
                              scope=tik.scope_gm)
    self.strides_gm = inst.Tensor(self.dtype, (MAX_SIZE,),
                                  name="strides_gm",
                                  scope=tik.scope_gm)
    self.output_gm = inst.Tensor(self.dtype, (MAX_SIZE,),
                                 name="output_gm",
                                 scope=tik.scope_gm)
    self.aicore_num = self.tik_profiling.get_aicore_num()
    # elements per 32-byte block
    self.block_element = constant.BLOCK_SIZE // self.dtype_size
    self.reserve_ub_size = 0
    # usable UB in elements, rounded down to a whole block
    self.ub_size = (self.tik_profiling.get_unified_buffer_size() //
                    self.dtype_size //
                    self.block_element * self.block_element) - \
        self.reserve_ub_size
    # 65535 blocks -- presumably the hardware stride-field limit for
    # data_move; TODO confirm
    self.max_gap = 65535 * self.block_element
    self.max_last_dim = (self.max_gap + self.ub_size) // self.block_element
def get_blockdim_and_loop_cycle(updates, shape_out, update_each_size):
    """
    Get the block dim (core count) and per-core loop cycle.

    Parameters
    ----------
    updates: dict, shape and datatype; datatype supports float32,
        float16, int32, int8, uint8
    shape_out: output shape
    update_each_size: the element count of each update slice

    Returns
    -------
    (blockdim, loop_cycle): number of cores to use and how many slices
    each core processes
    """
    core_num = tbe_platform.cce_conf.get_soc_spec(
        tbe_platform.cce_conf.CORE_NUM)
    data_size = common_util.get_data_size(updates.get("dtype").lower())
    total_elements = scatter_nd_d_help.get_shape_total_number(shape_out)
    splits = total_elements // update_each_size
    # a slice smaller than 32 bytes must run on a single core, because
    # sub-32B tail writes from multiple cores could overwrite each other
    if update_each_size * data_size < constant.BLOCK_SIZE:
        return 1, splits
    if splits < core_num:
        # fewer slices than cores: one slice per core
        # (splits // splits kept as-is to preserve original behavior)
        return splits, splits // splits
    return core_num, splits // core_num
def __init__(self, grad, argmax, input_x, ksize, strides, padding, dilation,
             ceil_mode):
    """
    Init compare and bit pack base parameters.

    Parameters
    ----------
    grad: input of maxpoolgrad or output of maxpool
    argmax: output of maxpool mask or index
    input_x: input of maxpool, useless for maxpool grad
    ksize: window size, minimum length is 4,
        just like [1, poolingWindowH, poolingWindowW, 1]
    strides: stride, minimum length is 4,
        just like [1, poolingStrideH, poolingStrideW, 1]
    padding: pad values, indexed as [_, pad_h, pad_w, ...]
        (NOTE(review): not a "SAME"/"VALID" mode string -- the code
        indexes padding[1:3])
    dilation: dilation of the pooling window
    ceil_mode: whether output size was computed with ceil

    Returns
    -------
    None
    """
    self.blocknum = tbe_platform.cce_conf.get_soc_spec(
        tbe_platform.cce_conf.CORE_NUM)
    self.ub_size = tbe_platform.cce_conf.get_soc_spec(
        tbe_platform.cce_conf.UB_SIZE)
    self.input_gard_shape = grad.get("shape")
    self.argmax_shape = argmax.get("shape")
    self.y_shape = input_x.get("shape")
    self.dtype = grad.get("dtype").lower()
    self.dtype_size = common_util.get_data_size(self.dtype)
    self.nc1 = 1
    # total number of N*C1 planes to process
    self.block = self.input_gard_shape[0] * self.input_gard_shape[1]
    self.tik_instance = tik.Tik()
    self.ksize = ksize
    self.strides = strides
    self.padding = padding
    self.ceil_mode = ceil_mode
    self.dilation = dilation
    dyh, dyw = self.input_gard_shape[2:4]
    dxh, dxw = self.y_shape[2:4]
    strideh, stridew = self.strides[1:3]
    windowh, windoww = self.ksize[1:3]
    pad_h, pad_w = self.padding[1:3]
    # with ceil_mode, bottom/right pads are enlarged by stride-1 --
    # presumably so the ceil-rounded output grid is fully covered;
    # TODO confirm
    if self.ceil_mode is False:
        pad_top = pad_h
        pad_bottom = pad_h
        pad_left = pad_w
        pad_right = pad_w
    else:
        pad_top = pad_h
        pad_bottom = pad_h + strideh - 1
        pad_left = pad_w
        pad_right = pad_w + stridew - 1
    self.pad = (pad_top, pad_bottom, pad_left, pad_right)
    # overlap of adjacent pooling windows (0 when stride >= window)
    self.hoverlap = 0
    if windowh > strideh:
        self.hoverlap = windowh - strideh
    self.woverlap = 0
    if windoww > stridew:
        self.woverlap = windoww - stridew
def __init__(self, input_x, output_y):
    """
    Init population_count base parameters.

    Parameters
    ----------
    input_x: shape and data type, datatype supports int16, uint16
    output_y: shape and data type, data type supports uint8

    Returns
    -------
    None
    """
    self.input_shape, self.input_dtype = self.get_input_params(input_x)
    self.output_shape, self.output_dtype = self.get_output_params(output_y)
    # the kernel always produces uint8, whatever the caller declared
    self.output_dtype = "uint8"
    self.input_data_size = common_util.get_data_size(self.input_dtype)
    self.output_data_size = common_util.get_data_size(self.output_dtype)
    self.tik_instance = tik.Tik()
def __init__(self, input_dict):
    """
    Init the Crop parameters.

    Parameters
    ----------
    input_dict: input_dict is a dict, the keys as follow:
        x1: dict, shape and datatype, datatype supports int8, uint8,
            int16, uint16, int32, uint32, int64, uint64, float16, float32
        x2: dict, shape and datatype, same supported datatypes as x1
        y: dict, shape and datatype, same supported datatypes as x1
        axis: crop start with axis
        offsets: crop start offset of each axis
        kernel_name: cce kernel name, default value is "crop"

    Returns
    -------
    None
    """
    self.instance = tik.Tik(tik.Dprofile())
    self.dtype = input_dict.get("x1").get("dtype").lower()
    self.dsize = common_util.get_data_size(self.dtype)
    total_size = tbe_platform.cce_conf.get_soc_spec(
        tbe_platform.cce_conf.UB_SIZE)
    # half of the remaining UB, in elements -- presumably split between
    # input and output staging; TODO confirm
    ub_size = (total_size - RESERVE_SIZE) // (2 * self.dsize)
    # elements per 32-byte burst
    burnest_len = constant.BLOCK_SIZE // self.dsize
    ub_size = ((ub_size + burnest_len - 1) // burnest_len) * burnest_len
    self.one_max_size = ub_size
    # x1 GM length rounded up to whole bursts
    x1_len = get_shape_total_number(input_dict.get("x1").get("shape"))
    x1_len = ((x1_len + burnest_len - 1) // burnest_len) * burnest_len
    # if the output's last dim is not burst-aligned, pad one extra burst
    mod = input_dict.get("y").get("shape")[-1] % burnest_len
    if mod != 0:
        x1_len = x1_len + burnest_len
    self.x1_gm = self.instance.Tensor(self.dtype, (x1_len, ),
                                      name="x1_gm",
                                      scope=tik.scope_gm)
    self.x2_gm = self.instance.Tensor(self.dtype, (32, ),
                                      name="x2_gm",
                                      scope=tik.scope_gm)
    # y GM length padded the same way as x1
    y_len = get_shape_total_number(input_dict.get("y").get("shape"))
    y_len = ((y_len + burnest_len - 1) // burnest_len) * burnest_len
    if mod != 0:
        y_len = y_len + burnest_len
    self.y_gm = self.instance.Tensor(self.dtype, (y_len, ),
                                     name="y_gm",
                                     scope=tik.scope_gm)
    self.input_dict = input_dict
def __init__(self, input_data, block_size):
    """
    Init space_to_depth base parameters.

    Parameters
    ----------
    input_data: shape and data type; data type supports float16,
        float32, int32, uint32, int16, uint16, int8, uint8, int64, uint64
    block_size: must be greater than one; the spatial block size
    """
    self.input_shape = input_data.get("shape")
    self.dtype = input_data.get("dtype").lower()
    self.dtype_size = common_util.get_data_size(self.dtype)
    self.block_size = block_size
    self.tik_instance = tik.Tik(tik.Dprofile())
    # output: spatial dims shrink by block_size, channels grow by
    # block_size**2
    batch = self.input_shape[0]
    height = self.input_shape[1]
    width = self.input_shape[2]
    channels = self.input_shape[3]
    self.output_shape = (batch,
                         height // block_size,
                         width // block_size,
                         channels * block_size * block_size)
def __init__(self, input_values, inst: tik.Tik):
    """Hold the concat tiling scalars and the GM tiling buffer."""
    self.tik_instance = inst
    dtype = "int64"
    # data in tiling_gm likes:
    # 0---- 1---- 2---- 3----
    # axis, out_dim, max_inner_dim, min_inner_dim,
    # 4---- 5----
    # output_inner_length, input_count
    # 6---- 7----
    # reserve, reserve
    # 8---- 9----
    # first_inner_dims, first_output_idx,
    # second_inner_dims, second_output_idx
    # ...
    self.dtype = dtype
    self.input_values = input_values
    self.axis = inst.Scalar(dtype, name="axis")
    self.out_dim = inst.Scalar(dtype, name="out_dim")
    self.max_inner_dim = inst.Scalar(dtype, name="max_inner_dim")
    self.min_inner_dim = inst.Scalar(dtype, name="min_inner_dim")
    self.output_inner_length = inst.Scalar(dtype,
                                           name="output_inner_length")
    # two words per input (inner_dims, output_idx), at least the
    # 8-word header size
    tiling_ub_size = max(len(input_values) * 2, 8)
    # GM also carries the 8-word header in front
    tiling_gm_size = 8 + tiling_ub_size
    tiling_gm_size = ceil_32bytes_align_count(tiling_gm_size, dtype)
    tiling_ub_size = ceil_32bytes_align_count(tiling_ub_size, dtype)
    self.tiling_ub_size = tiling_ub_size
    self.tiling_gm = inst.Tensor(dtype, (tiling_gm_size, ),
                                 name="tiling_gm",
                                 scope=tik.scope_gm)
    # UB bytes the tiling data will occupy
    self._need_ub_size = (self.tiling_ub_size *
                          common_util.get_data_size(dtype))
    # lazily created working state
    self._tiling_ub = None
    self._out_dim = None
    self._inner_dim = None
def __init__(self, shape, axis, dtype):
    """
    Init the base param of cumsum.

    Parameters
    ----------
    shape: the shape of the tensor
    axis: cumulative axis
    dtype: the data type of the tensor

    Returns
    -------
    None
    """
    self.tik_instance = tik.Tik()
    # length of the cumulative axis
    self.each_loop = shape[axis]
    self.dsize = get_data_size(dtype)
    self.each, self.each_tail = self.get_each(shape, axis)
    self.reserved = self.get_reserved()
    self.dtype = dtype
    self.axis = axis
    # whether the cumulative axis is the innermost one
    self.is_last_axis = (axis - len(shape)) == -1
def __init__(self, input_data, shape, kernel_name):
    """
    Init parallel_concat base parameters.

    Parameters
    ----------
    input_data: list of dicts with shape and data type; data type
        supports float16, float32, int32, uint32, int16, uint16,
        int8, uint8, int64, uint64
    shape: list of output shape
    kernel_name: cce kernel name, default value is "parallel_concat"
    """
    # collect every input's shape and lower-cased dtype
    self.data_shape = [item.get("shape") for item in input_data]
    self.data_dtype = [item.get("dtype").lower() for item in input_data]
    self.output_shape = shape
    self.kernel_name = kernel_name
    # element size taken from the first input's dtype
    self.dtype_size = common_util.get_data_size(self.data_dtype[0])
    self.product_core_num = tbe_platform.cce_conf.get_soc_spec(
        tbe_platform.cce_conf.CORE_NUM)
    self.tik_instance = tik.Tik()
def __init__(self, input_dict):
    """
    Init the ShuffleChannel parameters.

    Parameters
    ----------
    input_dict: input_dict is a dict, the keys as follow:
        x: dict, shape and datatype, datatype supports int8, uint8,
           int16, uint16, int32, uint32, int64, uint64, float16, float32
        y: dict, shape and datatype, same supported datatypes as x
        group: channel group count, default 1
        kernel_name: cce kernel name, default value is "shuffle_channel"

    Returns
    -------
    None
    """
    self.instance = tik.Tik(tik.Dprofile())
    self.dtype = input_dict.get("x").get("dtype").lower()
    self.dsize = common_util.get_data_size(self.dtype)
    total_size = tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.UB_SIZE)
    # half of the remaining UB, in elements -- presumably split between
    # input and output staging; TODO confirm
    ub_size = (total_size - RESERVE_SIZE) // (2 * self.dsize)
    # elements per 32-byte burst
    burnest_len = constant.BLOCK_SIZE // self.dsize
    ub_size = ((ub_size + burnest_len - 1) // burnest_len) * burnest_len
    self.one_max_size = ub_size
    # GM length rounded up to whole bursts
    x_len = get_shape_total_number(input_dict.get("x").get("shape"))
    x_len = ((x_len + burnest_len - 1) // burnest_len) * burnest_len
    # H*W plane size of the output
    hw = input_dict.get("y").get("shape")[2] * \
         input_dict.get("y").get("shape")[3]
    # pad one extra burst when the plane is not burst-aligned
    mod = hw % burnest_len
    if mod != 0:
        x_len = x_len + burnest_len
    self.x_gm = self.instance.Tensor(self.dtype, (x_len,),
                                     name="x_gm",
                                     scope=tik.scope_gm)
    self.y_gm = self.instance.Tensor(self.dtype, (x_len,),
                                     name="y_gm",
                                     scope=tik.scope_gm)
    self.input_dict = input_dict
def __init__(self, shape, dtype, depth_radius=5, bias=1, alpha=1, beta=0.5,
             kernel_name="lrn_grad"):
    """
    Init lrn_grad base parameters, GM tensors and UB budget.

    Parameters
    ----------
    shape: input shape as (batch, channels, height, width)
    dtype: input data type
    depth_radius: half-width of the normalization window
    bias, alpha, beta: LRN hyper-parameters
    kernel_name: cce kernel name, default value is "lrn_grad"
    """
    self.shape = shape
    dtype = dtype.lower()
    self.dtype = dtype
    self.batch = shape[0]
    self.channels = shape[1]
    self.height = shape[2]
    self.width = shape[3]
    self.depth_radius = depth_radius
    self.bias = bias
    self.alpha = alpha
    self.beta = beta
    self.kernel_name = kernel_name
    tik_instance = tik.Tik()
    # chips without float32 vln support ("mini") must compute in float16
    self.is_mini = True
    self.ub_dtype = "float16"
    if tbe_platform.cce_conf.api_check_support("tik.vln", "float32"):
        self.is_mini = False
        self.ub_dtype = "float32"
    self.ub_dtype_size = common_util.get_data_size(self.ub_dtype)
    self.tik_instance = tik_instance
    # NOTE(review): other ops in this codebase obtain the core count via
    # tik.Dprofile(); confirm tik_instance.d_profiling is valid here
    self.aicore_num = tik_instance.d_profiling.get_aicore_num()
    # flattened element count of the NCHW tensor
    gm_size = 1
    for item in shape:
        gm_size *= item
    gm_shape = (gm_size, )
    self.data_input_grads = tik_instance.Tensor(dtype, gm_shape,
                                                name="input_grads",
                                                scope=tik.scope_gm)
    self.data_input_image = tik_instance.Tensor(dtype, gm_shape,
                                                name="input_image",
                                                scope=tik.scope_gm)
    self.data_output_image = tik_instance.Tensor(dtype, gm_shape,
                                                 name="output_image",
                                                 scope=tik.scope_gm)
    self.data_output = tik_instance.Tensor(dtype, gm_shape,
                                           name="output",
                                           scope=tik.scope_gm)
    need_ub_segment_count = 6
    if self.dtype != self.ub_dtype:
        if self.is_mini:
            # mini only support float16, need conv float32 to float16,
            # sizeof(float32) == 2 * sizeof(float16), so need another
            # buffer of size of 2 * buffer_float16
            need_ub_segment_count += 2
        else:
            # if not mini, need conv float16 to float32 to calculate,
            # sizeof(float32) == 2 * sizeof(float16), so need another
            # buffer of size of 0.5 * buffer_float32
            need_ub_segment_count += 0.5
    self.need_ub_segment_count = need_ub_segment_count
    self.ub_segment_size = _get_ub_segment_size(need_ub_segment_count)
    self.dtype_size = common_util.get_data_size(dtype)
    # an H*W plane smaller than one 32-byte block needs special handling
    self.small_hw = False
    if self.width * self.height * self.dtype_size < constant.BLOCK_SIZE:
        self.small_hw = True
    self.ub_shape = (self.ub_segment_size // self.ub_dtype_size, )
def _gm2ub(tik_instance: tik.Tik, dest: tik.Tensor, src: tik.Tensor, count):
    """Copy `count` elements from GM `src` to UB `dest`, rounded up to
    whole 32-byte bursts."""
    total_bytes = count * common_util.get_data_size(src.dtype)
    burst_len = math.ceil(total_bytes / constant.BLOCK_SIZE)
    tik_instance.data_move(dest, src, 0, 1, burst_len, 0, 0)
def ceil_32bytes_align_count(count, dtype):
    """Round `count` elements of `dtype` up to a whole number of
    32-byte blocks and return the resulting element count."""
    elem_size = common_util.get_data_size(dtype)
    blocks = math.ceil(count * elem_size / constant.BLOCK_SIZE)
    return blocks * constant.BLOCK_SIZE // elem_size
def _data_move(self, dest: tik.Tensor, src: tik.Tensor, count: tik.Scalar):
    """Move `count` elements from `src` to `dest`, rounding the burst
    length up to whole 32-byte blocks."""
    elem_bytes = common_util.get_data_size(src.dtype)
    burst_len = self._ceil_div(count * elem_bytes, constant.BLOCK_SIZE)
    self.tik_instance.data_move(dest, src, 0, 1, burst_len, 0, 0)
def __init__(self, bbox_tensor, img_metas, valid_tensor, kernel_name):
    """
    Init check-valid base parameters and all GM/UB buffers.

    Parameters
    ----------
    bbox_tensor: dict, shape and dtype of the input bboxes
    img_metas: image meta info used to extract the h/w thresholds
    valid_tensor: dict, shape and dtype of the int8 validity output
    kernel_name: cce kernel name
    """
    self.kernel_name = kernel_name
    self.bbox_shape = bbox_tensor.get("shape")
    self.bbox_dtype = bbox_tensor.get("dtype").lower()
    self.bbox_dtype_size = common_util.get_data_size(self.bbox_dtype)
    self.valid_shape = valid_tensor.get("shape")
    # output is int8, one byte per element
    self.valid_dtype_size = 1
    # select operations only handle 128 elements once time.
    self.__default_rows_per_job = 32 * 4 * 1
    self.job_num = self.__calc_job_num()
    self.img_metas = img_metas
    self.tik_instance = tik.Tik()
    # buffer for threshold extract
    self.img_metas_gm = self.tik_instance.Tensor("float16", (16, ),
                                                 name="img_metas_gm",
                                                 scope=tik.scope_gm)
    self.img_metas_ub = self.tik_instance.Tensor("float16", (16, ),
                                                 name="img_metas_ub",
                                                 scope=tik.scope_ubuf)
    self.threshold_h = self.tik_instance.Scalar("float16", "threshold_h")
    self.threshold_w = self.tik_instance.Scalar("float16", "threshold_w")
    self.__extract_threshold_as_scalar()
    # input bbox tensor from caller fp16, 128; fp32, 64
    self.bbox_tensor_gm = self.tik_instance.Tensor("float16",
                                                   self.bbox_shape,
                                                   name="bbox_tensor_gm",
                                                   scope=tik.scope_gm)
    # return buffer, gm be whole
    self.data_ret_int8_gm = self.tik_instance.Tensor(
        "int8", self.valid_shape, name="data_ret_int8_gm",
        scope=tik.scope_gm)
    self.padded_bytes = 0
    self.last_job_row_aligned = self.__calc_last_job_row()
    # each job used buffer maximum
    self.job_buf_row = self.get_job_buffer_row()
    # per-coordinate comparison flags (4 flags per box row)
    self.quad_flag_ub = self.tik_instance.Tensor(
        "float16", (self.job_buf_row * 4, ),
        name="quad_flag_ub", scope=tik.scope_ubuf)
    self.quad_flags_sum_ub = self.tik_instance.Tensor(
        "float16", (self.job_buf_row * 4, ),
        name="quad_flags_sum_ub", scope=tik.scope_ubuf)
    # need set value before each-times using!
    # this buffer will used in multi-time.
    # contains threshold and the transform tmp for return.
    self.quad_threshold_ub = self.tik_instance.Tensor(
        "float16", (self.job_buf_row * 4, ),
        name="quad_threshold_ub", scope=tik.scope_ubuf)
    # constant buffers of 1.0 / 0.0 used by select
    self.ones_ub = self.tik_instance.Tensor("float16",
                                            (self.job_buf_row, 4),
                                            name="ones_ub",
                                            scope=tik.scope_ubuf)
    self.zeros_ub = self.tik_instance.Tensor("float16",
                                             (self.job_buf_row, 4),
                                             name="zeros_ub",
                                             scope=tik.scope_ubuf)
    # fill the constant buffers: 128 fp16 lanes per vector_dup repeat
    _repeat_time = max(4 * self.job_buf_row // 128, 1)
    _process_elem_count = self.get_handle_num_with_clip_128(4)
    self.tik_instance.vector_dup(_process_elem_count, self.ones_ub, 1,
                                 _repeat_time, 1, 8)
    self.tik_instance.vector_dup(_process_elem_count, self.zeros_ub, 0,
                                 _repeat_time, 1, 8)
    # staging buffers for the int8 result and its mask/fp16 forms
    self.data_ret_int8_ub = self.tik_instance.Tensor(
        "int8", (self.job_buf_row, 1), name="data_ret_int8_ub",
        scope=tik.scope_ubuf)
    self.data_ret_mask_ub = self.tik_instance.Tensor(
        "uint16", (self.job_buf_row * 4 // 16, ),
        name="data_ret_mask_ub", scope=tik.scope_ubuf)
    self.data_ret_ub = self.tik_instance.Tensor("float16",
                                                (self.job_buf_row, 1),
                                                name="data_ret_ub",
                                                scope=tik.scope_ubuf)
    self.ret_unfold_half_ub = self.tik_instance.Tensor(
        "float16", (self.job_buf_row * 2, ),
        name="ret_unfold_half_ub", scope=tik.scope_ubuf)
    self.bbox_tensor_ub = self.tik_instance.Tensor("float16",
                                                   (self.job_buf_row, 4),
                                                   name="bbox_tensor_ub",
                                                   scope=tik.scope_ubuf)
def check_param(input_dict):
    """
    Check parameters.

    Parameters
    ----------
    input_dict: input_dict is a dict, the keys as follow:
        box1_info, box2_info, box3_info, biases1, biases2, biases3,
        coords, boxes, classes, relative, obj_threshold, post_top_k,
        nms_threshold, pre_nms_topn, max_box_number_per_batch,
        kernel_name; for more details, please check the
        yolov3_detection_output function

    Returns
    -------
    None

    Raises
    ------
    RuntimeError: when a parameter is out of its supported range
    """
    pre_nms_topn = input_dict.get("pre_nms_topn")
    # on these SoCs only float16 box info is accepted
    if tbe_platform.cce_conf.get_soc_spec("SOC_VERSION") in (
            "Ascend310", "Ascend910", "Hi3796CV300ES"):
        op_utils.check_dtype(input_dict.get("box1_info").get("dtype"),
                             ["float16"], param_name="box1_info")
        op_utils.check_dtype(input_dict.get("box2_info").get("dtype"),
                             ["float16"], param_name="box2_info")
        op_utils.check_dtype(input_dict.get("box3_info").get("dtype"),
                             ["float16"], param_name="box3_info")
    else:
        op_utils.check_dtype(input_dict.get("box1_info").get("dtype"),
                             ["float16", "float32"], param_name="box1_info")
        op_utils.check_dtype(input_dict.get("box2_info").get("dtype"),
                             ["float16", "float32"], param_name="box2_info")
        op_utils.check_dtype(input_dict.get("box3_info").get("dtype"),
                             ["float16", "float32"], param_name="box3_info")
    util.check_kernel_name(input_dict.get("kernel_name"))
    coords = input_dict.get("coords")
    post_top_k = input_dict.get("post_top_k")
    # only 4-coordinate boxes are supported
    if coords != 4:
        error_info = {}
        error_info['errCode'] = 'E80017'
        error_info['opname'] = 'yolo_v3_detection_output_d'
        error_info['param_name'] = 'coords'
        error_info['expect_value'] = '4'
        error_info['real_value'] = str(coords)
        raise RuntimeError(error_info,
                           "In op[%s], the parameter[%s] should be [%s], but actually is [%s]."
                           % (error_info['opname'],
                              error_info['param_name'],
                              error_info['expect_value'],
                              error_info['real_value']))
    max_box_number_per_batch = input_dict.get("max_box_number_per_batch")
    dtype = input_dict.get("box1_info").get("dtype")
    # float32 input (or ES/CS SoCs) halves the supported pre-NMS count
    if tbe_platform.cce_conf.get_soc_spec("SOC_VERSION") in (
            "Hi3796CV300ES", "Hi3796CV300CS") \
            or dtype == constant.DATA_TYPE_FP32:
        if pre_nms_topn > PRE_NMS_TOPN // 2:
            check_param_range("pre_nms_topn", 1, PRE_NMS_TOPN // 2 - 1,
                              pre_nms_topn)
    else:
        if pre_nms_topn > PRE_NMS_TOPN:
            check_param_range("pre_nms_topn", 1, PRE_NMS_TOPN - 1,
                              pre_nms_topn)
    if max_box_number_per_batch > PRE_NMS_TOPN or max_box_number_per_batch <= 0:
        check_param_range("max_box_number_per_batch", 1, PRE_NMS_TOPN - 1,
                          max_box_number_per_batch)
    # max_box_number_per_batch must be 16-aligned
    if max_box_number_per_batch % 16 != 0:
        error_info = {}
        error_info['errCode'] = 'E81011'
        error_info['opname'] = 'yolo_v3_detection_output_d'
        error_info['real_value'] = str(max_box_number_per_batch)
        raise RuntimeError(error_info,
                           "In op[%s], max_box_number_per_batch should be a multiple of 16, but actually is [%s]."
                           % (error_info['opname'],
                              error_info['real_value']))
    # topn/topk values must not exceed max_box_number_per_batch
    if max_box_number_per_batch < pre_nms_topn or pre_nms_topn <= 0:
        check_param_range("pre_nms_topn", 1, max_box_number_per_batch-1,
                          pre_nms_topn)
    if max_box_number_per_batch < post_top_k or post_top_k <= 0:
        check_param_range("post_top_k", 1, max_box_number_per_batch-1,
                          post_top_k)
    # each box feature map's H*W plane must span at least one 32B block
    dsize = common.get_data_size(input_dict.get("box1_info").get("dtype"))
    height = input_dict.get("box1_info").get("shape")[2]
    width = input_dict.get("box1_info").get("shape")[3]
    if height * width * dsize < constant.BLOCK_SIZE:
        raise RuntimeError(
            "box1_info's height[%d] multi with width[%d]'s size \
must bigger than 32b" % (height, width))
    height = input_dict.get("box2_info").get("shape")[2]
    width = input_dict.get("box2_info").get("shape")[3]
    if height * width * dsize < constant.BLOCK_SIZE:
        raise RuntimeError(
            "box2_info's height[%d] multi with width[%d]'s size \
must bigger than 32b" % (height, width))
    height = input_dict.get("box3_info").get("shape")[2]
    width = input_dict.get("box3_info").get("shape")[3]
    if height * width * dsize < constant.BLOCK_SIZE:
        raise RuntimeError(
            "box3_info's height[%d] multi with width[%d]'s size\
must bigger than 32b" % (height, width))
def _merge_recur(instance: tik.Tik,
                 out_ub,
                 dst_ub,
                 src_ub,
                 last_dim,
                 total_region_list,
                 level,
                 region_offset=0):
    """
    _merge_recur: merge multiple sorted region-proposal lists into one.

    Recursively merges `total_region_list` sorted runs held in `src_ub`
    (each run is 16 * 4**(level-1) proposals, 8 elements per proposal)
    into `dst_ub`, ping-ponging between the buffers until a single
    sorted list of `last_dim` proposals remains.
    """

    # vmrgsort4 can merger at most 4 sorted region list
    def is_next_to_last_merge():
        # True when exactly one merge pass will remain after this one
        return 1 < math.ceil(total_region_list / 4) <= 4

    loops = total_region_list // 4
    remain = total_region_list % 4
    # keep the final pass writing into out_ub: if the next-to-last pass
    # would also write there, redirect it to a scratch tensor
    if is_next_to_last_merge() and dst_ub.name == out_ub.name:
        dst_ub = instance.Tensor(out_ub.dtype, out_ub.shape,
                                 scope=tik.scope_ubuf,
                                 name="ub_merge_recur")
    # run length at this level, in proposals
    merge_n0 = 16 * (4**(level - 1))
    merge_n1 = merge_n0
    merge_n2 = merge_n0
    merge_n3 = merge_n0
    merge_repeat = loops
    need_tail_process = False
    # when the last full 4-way group would run past last_dim, shorten
    # its fourth run and merge that group separately
    if loops > 0 and remain == 0:
        if merge_n0 * 4 * loops > last_dim:
            merge_repeat = loops - 1
            n012 = merge_n0 + merge_n1 + merge_n2
            merge_left = last_dim - ((merge_n0 * 4 * (loops - 1)) + n012)
            need_tail_process = True
    if merge_repeat > 0:
        ub_offset = region_offset
        # offsets are in elements: 8 elements per proposal
        src_list = (src_ub[ub_offset],
                    src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, merge_n2, merge_n3)
        valid_bit = 15
        instance.vmrgsort4(dst_ub[ub_offset], src_list,
                           element_count_list, False, valid_bit,
                           merge_repeat)
    if need_tail_process:
        tail_offset = 4 * merge_n0 * merge_repeat * 8
        ub_offset = region_offset + tail_offset
        src_list = (src_ub[ub_offset],
                    src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, merge_n2, merge_left)
        valid_bit = 15
        instance.vmrgsort4(dst_ub[ub_offset], src_list,
                           element_count_list, False, valid_bit,
                           repeat_times=1)
    # element offset (in proposals) where the leftover runs start
    if loops > 0:
        offset = 4 * loops * 16 * (4**(level - 1))
    else:
        offset = 0
    if remain == 3:
        merge_n0 = 16 * (4**(level - 1))
        merge_n1 = merge_n0
        # last run may be shorter than a full-level run
        merge_n2 = last_dim - (offset + merge_n0 + merge_n1)
        ub_offset = region_offset + offset * 8
        src_list = (src_ub[ub_offset],
                    src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, merge_n2, 0)
        valid_bit = 2**remain - 1
        instance.vmrgsort4(dst_ub[ub_offset], src_list,
                           element_count_list, False, valid_bit,
                           repeat_times=1)
    elif remain == 2:
        merge_n0 = 16 * (4**(level - 1))
        merge_n1 = last_dim - (offset + merge_n0)
        ub_offset = region_offset + offset * 8
        # NOTE(review): merge_n2 here is stale from the loop section;
        # the 3rd/4th sources are masked out by valid_bit (=3), but
        # confirm the computed addresses cannot go out of bounds
        src_list = (src_ub[ub_offset],
                    src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, 0, 0)
        valid_bit = 2**remain - 1
        instance.vmrgsort4(dst_ub[ub_offset], src_list,
                           element_count_list, False, valid_bit,
                           repeat_times=1)
    elif remain == 1:
        # a single leftover run: copy it through unchanged
        merge_n0 = last_dim - offset
        num_blocks_write = (
            merge_n0 * 8 * common_util.get_data_size(src_ub.dtype) +
            31) // 32
        ub_offset = region_offset + offset * 8
        instance.data_move(dst_ub[ub_offset], src_ub[ub_offset], 0, 1,
                           num_blocks_write, 0, 0)
    next_total_region_list = math.ceil(total_region_list / 4)
    if next_total_region_list <= 1:
        return dst_ub
    # swap buffers for the next level; route through out_ub when the
    # next pass is the one before the final merge
    if is_next_to_last_merge():
        src_ub = out_ub
    return _merge_recur(instance, out_ub, src_ub, dst_ub, last_dim,
                        next_total_region_list, level + 1, region_offset)
def _get_rep_stride(mask, dtype):
    """Return the repeat stride in 32-byte blocks for a vector op that
    covers `mask` elements of `dtype`."""
    bytes_per_repeat = mask * common_util.get_data_size(dtype)
    return bytes_per_repeat // constant.BLOCK_SIZE
def __init__(self, input_x, ksize, strides, padding):
    """
    Init MaxPoolWithargmax parameters.

    Parameters
    ----------
    input_x: dict
        shape and datatype of the NC1HWC0 input
    ksize: list or tuple
        The size of the window for each dimension of the input tensor.
    strides: list or tuple
        The stride of the sliding window of the input tensor.
    padding: str
        The type of padding algorithm to use.

    Returns
    -------
    None
    """
    self.input_shape = input_x.get("shape")
    self.input_dtype = input_x.get("dtype").lower()
    self.input_type_size = common_util.get_data_size(self.input_dtype)
    self.tik_instance = tik.Tik()
    self.ksize = ksize
    self.strides = strides
    self.padding = padding
    # input layout is NC1HWC0
    self.batch_size = self.input_shape[0]
    self.c1_size = self.input_shape[1]
    self.in_size_h = self.input_shape[2]
    self.in_size_w = self.input_shape[3]
    self.c_block_size = self.input_shape[4]
    self.window_h = self.ksize[1]
    self.window_w = self.ksize[2]
    self.stride_h = self.strides[1]
    self.stride_w = self.strides[2]
    # total N*C1 planes to process
    self.nc1 = self.batch_size * self.c1_size
    # scalar for load3d
    self.scalar_source_h = self.tik_instance.Scalar(dtype="int64")
    self.scalar_source_w = self.tik_instance.Scalar(dtype="int64")
    # caculate pad and output size
    self.pad, self.out_size_h, self.out_size_w = \
        self._calc_out_size_and_pad()
    # output_shape
    # img2col layout: one row per output position, one column per
    # window element
    self.fmap_img2col_h = self.out_size_h * self.out_size_w
    self.fmap_img2col_w = self.window_h * self.window_w
    self.fmap_img2col_h_num = _ceil_div(self.fmap_img2col_h,
                                        self.c_block_size)
    # pad value is the fp16 minimum (presumably so padding never wins
    # the max -- TODO confirm)
    if self.input_dtype == "float16":
        self.pad_value = MIN_VALUE_FP16
    # fmap is NC1HWC0 format
    fmap_gm_shape = (self.batch_size, self.c1_size, self.in_size_h,
                     self.in_size_w, self.c_block_size)
    output_gm_shape = (self.batch_size, self.c1_size, self.out_size_h,
                       self.out_size_w, self.c_block_size)
    # mask output keeps one extra c_block_size row per window element
    output_mask_gm_shape = (self.batch_size, self.c1_size,
                            self.fmap_img2col_w,
                            (self.fmap_img2col_h_num + 1) *
                            self.c_block_size)
    # input and output
    self.input_fmap_gm = self.tik_instance.Tensor(self.input_dtype,
                                                  fmap_gm_shape,
                                                  name="input_fmap_gm",
                                                  scope=tik.scope_gm)
    self.output_max_gm = self.tik_instance.Tensor(self.input_dtype,
                                                  output_gm_shape,
                                                  name="output_max_gm",
                                                  scope=tik.scope_gm)
    self.output_mask_gm = self.tik_instance.Tensor("uint16",
                                                   output_mask_gm_shape,
                                                   name="output_mask_gm",
                                                   scope=tik.scope_gm)