def sort_within_ub(instance: tik.Tik, src, cols):
    """
    Sort ``cols`` region proposals resident in UB, in place with respect
    to the caller: the sorted result always ends up back in ``src``.

    :param instance: tik instance
    :param src: UB tensor holding the proposals to sort
    :param cols: number of proposals to sort
    :return: ``src``, now holding the sorted proposals
    """
    with instance.new_stmt_scope():
        # Scratch UB buffer receiving the 16-wide sorted runs.
        scratch = instance.Tensor(src.dtype, src.shape,
                                  scope=tik.scope_ubuf,
                                  name="ub_sort_within")
        _vrpsort16(instance, scratch, src, cnt=cols)
        if cols > 16:
            # Several sorted 16-runs: merge them into one sorted list.
            result_ub = _merge_region(instance, out_ub=src, dst=src,
                                      src=scratch, rows=1, cols=cols)
        else:
            # Single run: the scratch buffer already holds the answer.
            result_ub = scratch
        if result_ub.name != src.name:
            # Result landed in a different buffer; copy it back to src.
            elem_size = common_util.get_data_size(src.dtype)
            burst = math.ceil(cols * src.shape[1] * elem_size / 32)
            instance.data_move(src, result_ub, 0, 1, burst, 0, 0)
    return src
def __init__(self, input_values, inst: tik.Tik):
    """
    Tiling parameters decoded from tiling_gm.

    Layout of tiling_gm (int64 words):
      0: axis                  1: out_dim
      2: max_inner_dim         3: min_inner_dim
      4: output_inner_length   5: input_count
      6-7: reserved
      8+: per-input pairs of (inner_dims, output_idx)
    """
    self.tik_instance = inst
    self.dtype = "int64"
    self.input_values = input_values
    # Scalars populated later from the tiling buffer.
    self.axis = inst.Scalar(self.dtype, name="axis")
    self.out_dim = inst.Scalar(self.dtype, name="out_dim")
    self.max_inner_dim = inst.Scalar(self.dtype, name="max_inner_dim")
    self.min_inner_dim = inst.Scalar(self.dtype, name="min_inner_dim")
    self.output_inner_length = inst.Scalar(self.dtype,
                                           name="output_inner_length")
    # Two words per input, with a minimum of 8; GM adds the 8-word header.
    ub_words = max(len(input_values) * 2, 8)
    gm_words = ceil_32bytes_align_count(8 + ub_words, self.dtype)
    ub_words = ceil_32bytes_align_count(ub_words, self.dtype)
    self.tiling_ub_size = ub_words
    self.tiling_gm = inst.Tensor(self.dtype, (gm_words, ),
                                 name="tiling_gm", scope=tik.scope_gm)
    self._need_ub_size = (self.tiling_ub_size *
                          common_util.get_data_size(self.dtype))
    # Lazily allocated working state.
    self._tiling_ub = None
    self._out_dim = None
    self._inner_dim = None
def __init__(self, input_x_shape, inst: tik.Tik):
    """
    Tiling parameters for a strided-slice style op.

    :param input_x_shape: input shape
    :param inst: tik instance
    """
    self.tik_instance = inst
    self.dtype = "int64"
    rank = len(input_x_shape)
    # Five int64 words per dimension:
    # input_shape, output_shape, begin, end, stride.
    self.tiling_gm = inst.Tensor(self.dtype, (rank * 5,),
                                 name="tiling_gm", scope=tik.scope_gm)

    def _dim_scalar(prefix, idx):
        # One named int64 scalar per dimension, e.g. "begin_0".
        return inst.Scalar(self.dtype, name=prefix + str(idx))

    self.input_shape = tuple(_dim_scalar("input_dim", i)
                             for i in range(rank))
    self.begin = tuple(_dim_scalar("begin_", i) for i in range(rank))
    self.end = tuple(_dim_scalar("end_", i) for i in range(rank))
    self.stride = tuple(_dim_scalar("stride_", i) for i in range(rank))
    self.output_shape = tuple(_dim_scalar("out_dim_", i)
                              for i in range(rank))
    self.out_dim = inst.Scalar(self.dtype, name="out_dim")
def _merge_recur(instance: tik.Tik,
                 out_ub,
                 dst_ub,
                 src_ub,
                 last_dim,
                 total_region_list,
                 level,
                 region_offset=0):
    """
    _merge_recur merge multi sorted region proposal list
    to one sorted region proposal list

    Recursively merges `total_region_list` sorted runs held in `src_ub`
    into `dst_ub`, ping-ponging buffers each level until one run remains.
    Offsets are in elements; each region proposal occupies 8 elements
    (hence the `* 8` scaling throughout).

    :param instance: tik instance
    :param out_ub: UB tensor that must hold the final merged result
    :param dst_ub: destination buffer for this merge level
    :param src_ub: source buffer holding the sorted runs of this level
    :param last_dim: total number of proposals being merged
    :param total_region_list: number of sorted runs at this level
    :param level: 1-based merge level; run length is 16 * 4**(level-1)
    :param region_offset: element offset of this region inside the buffers
    :return: the buffer (dst_ub of the final level) holding the result
    """
    # vmrgsort4 can merger at most 4 sorted region list
    def is_next_to_last_merge():
        # True when exactly one more merge level follows this one.
        return 1 < math.ceil(total_region_list / 4) <= 4
    loops = total_region_list // 4
    remain = total_region_list % 4
    # If the penultimate level would write into out_ub, detour through a
    # scratch buffer so the final level can target out_ub.
    if is_next_to_last_merge() and dst_ub.name == out_ub.name:
        dst_ub = instance.Tensor(out_ub.dtype, out_ub.shape,
                                 scope=tik.scope_ubuf,
                                 name="ub_merge_recur")
    # Nominal run length at this level; all four merge inputs start equal.
    merge_n0 = 16 * (4**(level - 1))
    merge_n1 = merge_n0
    merge_n2 = merge_n0
    merge_n3 = merge_n0
    merge_repeat = loops
    need_tail_process = False
    if loops > 0 and remain == 0:
        if merge_n0 * 4 * loops > last_dim:
            # Last group of 4 runs is partial: do loops-1 full repeats and
            # handle the final group separately with a shortened 4th run.
            merge_repeat = loops - 1
            n012 = merge_n0 + merge_n1 + merge_n2
            merge_left = last_dim - ((merge_n0 * 4 * (loops - 1)) + n012)
            need_tail_process = True
    if merge_repeat > 0:
        # Merge groups of 4 full runs; src offsets step by run length * 8.
        ub_offset = region_offset
        src_list = (src_ub[ub_offset],
                    src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, merge_n2, merge_n3)
        valid_bit = 15  # all four source lists valid
        instance.vmrgsort4(dst_ub[ub_offset], src_list, element_count_list,
                           False, valid_bit, merge_repeat)
    if need_tail_process:
        # Final group of 4 runs where the 4th run is shorter (merge_left).
        tail_offset = 4 * merge_n0 * merge_repeat * 8
        ub_offset = region_offset + tail_offset
        src_list = (src_ub[ub_offset],
                    src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, merge_n2, merge_left)
        valid_bit = 15
        instance.vmrgsort4(dst_ub[ub_offset], src_list, element_count_list,
                           False, valid_bit, repeat_times=1)
    # Element offset (in proposals) past all fully-merged groups of 4.
    if loops > 0:
        offset = 4 * loops * 16 * (4**(level - 1))
    else:
        offset = 0
    if remain == 3:
        # Three leftover runs; third may be shorter than a full run.
        merge_n0 = 16 * (4**(level - 1))
        merge_n1 = merge_n0
        merge_n2 = last_dim - (offset + merge_n0 + merge_n1)
        ub_offset = region_offset + offset * 8
        src_list = (src_ub[ub_offset],
                    src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, merge_n2, 0)
        valid_bit = 2**remain - 1  # 7: only first three lists valid
        instance.vmrgsort4(dst_ub[ub_offset], src_list, element_count_list,
                           False, valid_bit, repeat_times=1)
    elif remain == 2:
        # Two leftover runs; second may be shorter than a full run.
        merge_n0 = 16 * (4**(level - 1))
        merge_n1 = last_dim - (offset + merge_n0)
        ub_offset = region_offset + offset * 8
        # NOTE(review): the 4th src entry below uses merge_n2 left over
        # from earlier code, but valid_bit = 3 masks lists 2 and 3 out,
        # so it is never read — confirm this is intentional.
        src_list = (src_ub[ub_offset],
                    src_ub[ub_offset + merge_n0 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8],
                    src_ub[ub_offset + merge_n0 * 8 + merge_n1 * 8 +
                           merge_n2 * 8])
        element_count_list = (merge_n0, merge_n1, 0, 0)
        valid_bit = 2**remain - 1  # 3: only first two lists valid
        instance.vmrgsort4(dst_ub[ub_offset], src_list, element_count_list,
                           False, valid_bit, repeat_times=1)
    elif remain == 1:
        # Single leftover run: nothing to merge, copy it through unchanged.
        merge_n0 = last_dim - offset
        num_blocks_write = (
            merge_n0 * 8 * common_util.get_data_size(src_ub.dtype) + 31) // 32
        ub_offset = region_offset + offset * 8
        instance.data_move(dst_ub[ub_offset], src_ub[ub_offset], 0, 1,
                           num_blocks_write, 0, 0)
    next_total_region_list = math.ceil(total_region_list / 4)
    if next_total_region_list <= 1:
        # Fully merged: dst_ub holds the single sorted list.
        return dst_ub
    if is_next_to_last_merge():
        # Route the final level's output into out_ub (becomes dst below).
        src_ub = out_ub
    # Swap buffers and merge the next, 4x-longer runs.
    return _merge_recur(instance, out_ub, src_ub, dst_ub, last_dim,
                        next_total_region_list, level + 1, region_offset)