def get_nms_all_class_result(self, batch, topk_num_ecah_class):
    """
    handle nms result: zero-fill the tail beyond keep_top_k, sort all
    classes, then write the final box count and adjusted coordinates.

    Parameters
    ----------
    batch: batch num
    topk_num_ecah_class: nms result (total box count over all classes)

    Returns
    -------
    None
    """
    # Only when keep_top_k limiting is enabled (> -1) and the nms result
    # overflows it: clear one tail block after position topk_num_ecah_class
    # so stale data is not sorted/output later.
    with self.instance.if_scope(
            tik.all(self.keep_top_k > -1,
                    topk_num_ecah_class > self.keep_top_k)):
        with self.instance.new_stmt_scope():
            # 128-element UB scratch filled with zeros, used as the
            # source of the tail-clearing data_move below.
            # NOTE(review): tensor debug name "topk2_in_data_tmp_ub"
            # differs from the variable name — harmless, debug-only.
            topk2_tail_init_tmp_ub = self.instance.Tensor(
                self.dtype, (128, ),
                name="topk2_in_data_tmp_ub",
                scope=tik.scope_ubuf)
            self.instance.vector_dup(self.mask, topk2_tail_init_tmp_ub, 0,
                                     128 // self.mask, 1, 8)
            # Fixed tail element count of 16; converted to a burst length
            # by the helper. NOTE(review): scalar debug name reuses the
            # parameter name "topk_num_ecah_class" — debug-only mismatch.
            topk2_tail_num = self.instance.Scalar("int32",
                                                  "topk_num_ecah_class", 16)
            burst_tail_scalar = self.instance.Scalar(
                "int32", "burst_tail_scalar", 0)
            self.get_tersor_data_burst_val(True, topk2_tail_num,
                                           burst_tail_scalar)
            # Zero the tail of both topk2 and topk3 staging buffers in GM.
            self.instance.data_move(
                self.topk2_in_gm[batch, topk_num_ecah_class, 0],
                topk2_tail_init_tmp_ub, 0, 1, burst_tail_scalar, 0, 0)
            self.instance.data_move(
                self.topk3_in_gm[batch, topk_num_ecah_class, 0],
                topk2_tail_init_tmp_ub, 0, 1, burst_tail_scalar, 0, 0)
    self.sort_all_class(batch)
    self.sort_for_get_label(batch)
    with self.instance.new_stmt_scope():
        # set out box num and tensor
        out_box_num_ub = self.instance.Tensor("int32", (8, ),
                                              name="out_box_num_ub",
                                              scope=tik.scope_ubuf)
        # Output box count is clamped to topk2_num when keep_top_k applies,
        # otherwise the raw nms count is reported.
        with self.instance.if_scope(
                tik.all(self.keep_top_k > -1,
                        topk_num_ecah_class > self.keep_top_k)):
            out_box_num_ub[0].set_as(self.topk2_num)
        with self.instance.else_scope():
            out_box_num_ub[0].set_as(topk_num_ecah_class)
        self.instance.data_move(self.out_box_num_gm[batch, 0],
                                out_box_num_ub, 0, 1, 1, 0, 0)
    self.adjust_topk_crood(batch, topk_num_ecah_class)
def pad_case1(self, tik_instance):
    """
    Pad case 1: split the whole output volume across AI cores in
    MINI_UNIT-sized chunks and let each core run _pad_case1_main.

    Parameters
    ----------
    tik_instance: TIK instance used to build the kernel

    Returns
    -------
    None
    """
    in_num = _prod(self.ou_shape)
    # Number of MINI_UNIT transfer units needed for the whole output.
    total_num = math.ceil(in_num * self.num_bit / MINI_UNIT)
    core_num = total_num
    if core_num > MAX_CORE:
        core_num = MAX_CORE
    # split_core_index: last core of the "before" group;
    # cores up to it process core_loop_before units each, the rest
    # process core_loop_after units each.
    split_core_index, \
    core_loop_before, \
    core_loop_after = _cal_core(total_num, core_num, MAX_CORE)
    # Element counts (not bytes) handled per core in each group.
    ac_num_one = (MINI_UNIT // self.num_bit) * core_loop_before
    ac_num_two = (MINI_UNIT // self.num_bit) * core_loop_after
    with tik_instance.for_range(0, core_num, block_num=core_num) as blk_idx:
        if split_core_index + 1 == core_num:
            # All cores carry the same load; no second group exists.
            with tik_instance.if_scope(blk_idx <= core_num - 1):
                begin_index = blk_idx * ac_num_one
                self._pad_case1_main(tik_instance, ac_num_one,
                                     begin_index, self.ubuf)
        else:
            with tik_instance.if_scope(blk_idx <= split_core_index):
                begin_index = blk_idx * ac_num_one
                self._pad_case1_main(tik_instance, ac_num_one,
                                     begin_index, self.ubuf)
            with tik_instance.if_scope(
                    tik.all(blk_idx > split_core_index,
                            blk_idx < core_num)):
                # Offset past everything the first core group handled,
                # then index within the second group.
                begin_index = ac_num_one * (split_core_index + 1)
                block_index = blk_idx - (split_core_index + 1)
                begin_index += block_index * ac_num_two
                self._pad_case1_main(tik_instance, ac_num_two,
                                     begin_index, self.ubuf)
def proposal_pooling_h(self, block_id, proposal_id, fm_c1_index):
    """
    load the pooled_h * fm_width size featuremap to ub. maxpooling
    accroing to h direction

    Parameters
    ----------
    block_id: aicore id
    proposal_id: which proposal is now being processed
    fm_c1_index: c1 index of the feature map
        (NOTE(review): unused in this method body — kept for interface
        compatibility; verify against callers)

    Returns
    -------
    None
    """
    with self.tik_instance.for_range(0, self.pooled_h) as poolh:
        # UB staging buffer for one pooling bin: up to
        # fm_h // pooled_h + 2 rows of the (aligned) feature map width.
        proposal_fm_data = \
            self.tik_instance.Tensor(self.dtype,
                                     (self.fm_h//self.pooled_h+2,
                                      self.fm_w_align, self.fm_c0),
                                     name="proposal_data",
                                     scope=tik.scope_ubuf)
        # Per-bin ROI geometry, read from precomputed tables.
        scalar_roi_start_h = self.tik_instance.Scalar("int32")
        scalar_roi_start_h.set_as(self.roi_start_h[poolh, proposal_id])
        scalar_roi_start_w = self.tik_instance.Scalar("int32")
        scalar_roi_start_w.set_as(self.roi_start_w[0, proposal_id])
        scalar_roi_width = self.tik_instance.Scalar("int32")
        scalar_roi_width.set_as(self.roi_width[proposal_id])
        scalar_roi_bin_h = self.tik_instance.Scalar("int32")
        scalar_roi_bin_h.set_as(self.roi_bin_h[poolh, proposal_id])
        # Skip empty bins (zero height or width).
        with self.tik_instance.if_scope(
                tik.all(scalar_roi_bin_h != 0, scalar_roi_width != 0)):
            # coeff: 32-byte bursts per feature-map "pixel" column (c0).
            coeff = self.fm_c0 * TYPELEN_DICT[self.dtype] // 32
            # Strided copy of the bin's rows from the feature map into UB;
            # src stride skips the rest of the fm row, dst stride pads to
            # the aligned width.
            self.tik_instance.data_move(
                proposal_fm_data,
                self.fm_c0_data[scalar_roi_start_h,
                                scalar_roi_start_w, 0], 0,
                scalar_roi_bin_h, scalar_roi_width * coeff,
                (self.fm_w - scalar_roi_width) * coeff,
                (self.fm_w_align - scalar_roi_width) * coeff)
            # Elements per 16-byte chunk for this dtype.
            ceil_loop = 16 // TYPELEN_DICT[self.dtype]
            with self.tik_instance.for_range(
                    0, ceil_div(scalar_roi_width, ceil_loop)) as \
                    loop_w:
                # Reduce over the bin's rows (repeat = scalar_roi_bin_h)
                # into the per-bin result, max along H.
                self.tik_instance.vec_max(
                    256 // TYPELEN_DICT[self.dtype],
                    self.pooled_h_res[poolh, ceil_loop * loop_w, 0],
                    proposal_fm_data[0, ceil_loop * loop_w, 0],
                    self.pooled_h_res[poolh, ceil_loop * loop_w, 0],
                    scalar_roi_bin_h, 0, self.fm_w_align * coeff, 0)
def _do_vec_dup(pattern, obj, max_num, blk_idx, mark, axis):
    """
    Vector-dup a padding region (top or bottom) and copy it to GM,
    splitting the copy work over two groups of physical cores.

    Params:
    top_address: start address for top padding.
    top_div_core: dividing line between two types of cores in top padding.
    top_total_core: physical cores for top padding.
    top_core_vol_x: volume of data processed by each core(type_x) for top
                    padding.
    top_core_gap_x: gap between different cores(type_x) for top padding.

    Solution: MAX_CORE = 32
    in_shape is [34,16,16,16,...],func will work in [0, ] only.
    in_shape is [16,16,16,16,...],func will work in [0, 1].
    """
    # Select the top or bottom parameter set for this axis.
    if pattern == "top":
        begin_index = obj.top_address[axis]
        division_core = obj.top_div_core[axis]
        total_core = obj.top_total_core[axis]
        core_data_0 = obj.top_core_vol_0[axis]
        core_data_1 = obj.top_core_vol_1[axis]
        core_gap_0 = obj.top_core_gap_0[axis]
        core_gap_1 = obj.top_core_gap_1[axis]
    else:
        begin_index = obj.bottom_address[axis]
        division_core = obj.bottom_div_core[axis]
        total_core = obj.bottom_total_core[axis]
        core_data_0 = obj.bottom_core_vol_0[axis]
        core_data_1 = obj.bottom_core_vol_1[axis]
        core_gap_0 = obj.bottom_core_gap_0[axis]
        core_gap_1 = obj.bottom_core_gap_1[axis]
    vir_num, block_index = max_num, blk_idx

    # vector_dup: all physical cores (skipped when mark == 1, i.e. the
    # buffer was already filled by an earlier step).
    with obj.tik_instance.if_scope(mark != 1):
        set_vector_dup(obj, vir_num, 0)

    # data_move: part of physical cores.
    # Group 0: cores [0, division_core] copy core_data_0 each.
    with obj.tik_instance.if_scope(block_index <= division_core):
        dst_idx = begin_index + block_index * core_gap_0
        copy_buf2gm_circulation(obj.tik_instance, obj.num_bit, core_data_0,
                                vir_num, obj.buf, obj.output_gm, dst_idx)
    # Group 1: cores (division_core, total_core) copy core_data_1 each,
    # offset past everything group 0 wrote.
    with obj.tik_instance.if_scope(
            tik.all(block_index > division_core,
                    block_index < total_core)):
        begin_index += core_gap_0 * (division_core + 1)
        block_index = block_index - (division_core + 1)
        dst_idx = begin_index + block_index * core_gap_1
        copy_buf2gm_circulation(obj.tik_instance, obj.num_bit, core_data_1,
                                vir_num, obj.buf, obj.output_gm, dst_idx)
def row_in_core_exp(self, block_idx):
    """
    Build the TIK expression testing whether the current row belongs
    to the slice of rows assigned to the given core.

    Parameters
    ----------
    block_idx: core index

    Returns
    -------
    expression
    """
    # Core's row range is [block_idx * indices_step, reg_core_last_rows).
    lower_bound = block_idx * self.indices_step
    not_below = self.reg_cur_row >= lower_bound
    not_above = self.reg_cur_row < self.reg_core_last_rows
    return tik.all(not_below, not_above)
def move_ub_to_gm_with_tail(self, input_dict):
    """
    move data from ub to gm when c < 16

    The last rows of the output are accumulated into ub_tail and written
    with a 32-byte-overlap trick so multi-core writes never touch bytes
    outside this core's range.
    """
    shape = input_dict.get("shape")
    dst_ub = input_dict.get("dst_ub")
    ub_tail = input_dict.get("ub_tail")
    tail_offset = input_dict.get("tail_offset")
    tail_num = input_dict.get("tail_num")
    block_num = input_dict.get("block_num")
    row_index = input_dict.get("row_index")
    out_index = input_dict.get("out_index")
    tail_start = input_dict.get("tail_start")
    total_loop = input_dict.get("total_loop")
    r_i = input_dict.get("r_i")
    num = input_dict.get("num")
    _, col_len, row_len = shape
    # Row length rounded up to a 16-element boundary (UB layout stride).
    col_len_align = (col_len + 15) // 16 * 16
    # Rows at/after `num` on a multi-core run go through the tail path:
    # they are gathered element-wise into ub_tail first.
    with self.instance.if_scope(tik.all(row_index >= num, block_num > 1)):
        scalar = self.instance.Scalar(ub_tail.dtype)
        with self.instance.for_range(0, col_len) as time:
            scalar.set_as(dst_ub[r_i * col_len_align + time])
            ub_tail[tail_offset + time].set_as(scalar)
        tail_offset.set_as(tail_offset + col_len)
        # Flush ub_tail once the very last row has been gathered.
        with self.instance.if_scope(row_index == total_loop * row_len - 1):
            each_burst_num = 32 // self.dsize
            n_burst = self.instance.Scalar("int32")
            n_burst.set_as((tail_num * self.dsize) // 32)
            mod = self.instance.Scalar("int32")
            mod.set_as((tail_num * self.dsize) % 32)
            # 32b alignment
            with self.instance.if_scope(mod == 0):
                self.instance.data_move(self.y_gm[tail_start], ub_tail, 0,
                                        1, n_burst, 0, 0)
            # bigger than 32b
            with self.instance.else_scope():
                # Write the aligned part, then rewrite the final 32 bytes
                # shifted back so the last burst ends exactly at
                # tail_start + tail_num (overlap trick for alignment).
                self.instance.data_move(self.y_gm[tail_start], ub_tail,
                                        0, 1, n_burst, 0, 0)
                offset = tail_num - each_burst_num
                scalar = self.instance.Scalar(ub_tail.dtype)
                with self.instance.for_range(0, each_burst_num) as time:
                    scalar.set_as(ub_tail[offset + time])
                    ub_tail[time].set_as(scalar)
                self.instance.data_move(self.y_gm[tail_start + offset],
                                        ub_tail, 0, 1, 1, 0, 0)
    # Non-tail rows: plain aligned burst straight from dst_ub.
    with self.instance.else_scope():
        burst_len = col_len_align // 16
        self.instance.data_move(self.y_gm[out_index],
                                dst_ub[r_i * col_len_align], 0, 1,
                                burst_len, 0, 0)
def indices_inner_gather_last_1(self, indices_ub, res_ub, row_num_last,
                                inner_indices_offset, gm_offset_base,
                                output_offset, burst_len_row,
                                burst_len_res):
    """
    process last indices for tiling mode 1

    Parameters
    ----------
    indices_ub: cache indices data in UB
    res_ub: cache result data in UB
    row_num_last: the last indices num
    inner_indices_offset: inner indices num offset
    gm_offset_base: base of gm offset
    output_offset: output offset
    burst_len_row: burst length of one params row
    burst_len_res: burst length of result

    Returns
    -------
    None
    """
    tik_instance = self.tik_instance
    block_ub = tik_instance.Tensor(self.params_dtype, (self.block_elem,),
                                   name="block_ub", scope=tik.scope_ubuf)
    # Gather each indexed params row via block_ub, then copy it
    # element-wise into its slot of res_ub.
    with tik_instance.for_range(0, row_num_last, thread_num=1) as row_i:
        indices_i_value = tik_instance.Scalar(dtype=self.indices_dtype,
                                              name="indices_i_value")
        indices_i_value.set_as(indices_ub[inner_indices_offset + row_i])
        gm_offset_i = (gm_offset_base + indices_i_value) * self.params_row
        # copy params row to block_ub from gm
        tik_instance.data_move(block_ub, self.x[gm_offset_i], 0, 1,
                               burst_len_row, 0, 0)
        res_ub_offset = row_i * self.params_row
        with tik_instance.for_range(0, self.params_row) as i:
            res_ub[res_ub_offset + i].set_as(block_ub[i])

    # copy result data to gm from ub
    tail_elem = (row_num_last * self.params_row) % self.block_elem
    # Unaligned tail with more than one burst: write all full bursts,
    # then rewrite the last block_elem elements as one aligned burst
    # ending exactly at the end of the result (overlap trick).
    with tik_instance.if_scope(tik.all(tail_elem != 0, burst_len_res > 1)):
        with tik_instance.for_range(0, self.block_elem) as num_i:
            block_ub[num_i].set_as(res_ub[row_num_last * self.params_row
                                          - self.block_elem + num_i])
        tik_instance.data_move(self.y[output_offset], res_ub, 0, 1,
                               burst_len_res - 1, 0, 0)
        tik_instance.data_move(
            self.y[output_offset
                   + (row_num_last * self.params_row - self.block_elem)],
            block_ub, 0, 1, 1, 0, 0)
    with tik_instance.else_scope():
        tik_instance.data_move(self.y[output_offset], res_ub, 0, 1,
                               burst_len_res, 0, 0)
def row_in_ub_exp(self):
    """
    Build the TIK expression testing whether the current row already
    resides in the rows cached on UB.

    Parameters
    ----------
    None

    Returns
    -------
    expression
    """
    # Cached window is [reg_row_start, reg_row_start + num_multi_rows).
    window_end = self.reg_row_start + self.num_multi_rows
    in_lower = self.reg_cur_row >= self.reg_row_start
    in_upper = self.reg_cur_row < window_end
    return tik.all(in_lower, in_upper)
def calc_multi_indices(self, indices_ub, indices_num,
                       burst_len_multi_row, ub_tuples, ub_block_tuples):
    """
    calculate multi rows, multi rows will read at one to avoid loading
    little data from gm to ubuf at a high frequency

    Parameters
    ----------
    indices_ub: indices_ub
    indices_num: how many indices to calculate
    burst_len_multi_row: burst length of multi row
    ub_tuples: contains var_ub, accum_ub, linear_ub, grad_ub, tmp_ub,
               tmp2_ub
    ub_block_tuples: contains var_ub_block, accum_ub_block,
                     linear_ub_block, grad_ub_block

    Returns
    -------
    None
    """
    tik_instance = self.tik_instance
    with tik_instance.for_range(0, indices_num) as indices_i:
        self.var_cur_row.set_as(indices_ub[indices_i])
        # check whether current indices is within the processing range of
        # the core
        with tik_instance.if_scope(
                tik.all(self.var_cur_row >= self.core_rows_start_index,
                        self.var_cur_row < self.core_rows_end_index)):
            # check whether the var, accum, linear corresponding to
            # current indices is cached in the UB
            with tik_instance.if_scope(
                    tik.all(self.var_cur_row >=
                            self.cached_rows_start_index,
                            self.var_cur_row <
                            self.cached_rows_start_index +
                            self.num_multi_rows)):
                self.calc_a_small_row(indices_i, ub_tuples,
                                      ub_block_tuples)
            with tik_instance.else_scope():
                # Cache miss: write back the currently cached rows (if
                # any are valid) before loading the new window.
                with tik_instance.if_scope(
                        self.cached_rows_start_index < self.var_rows):
                    self.save_multi_rows(ub_tuples, burst_len_multi_row)
                self.load_multi_rows(ub_tuples, burst_len_multi_row)
                self.calc_a_small_row(indices_i, ub_tuples,
                                      ub_block_tuples)
    # Final flush of any rows still cached after the loop.
    with tik_instance.if_scope(
            self.cached_rows_start_index < self.var_rows):
        self.save_multi_rows(ub_tuples, burst_len_multi_row)
def adjust_topk_crood(self, batch, topk_num_ecah_class):
    """
    modify x1 and y1 value

    Parameters
    ----------
    batch: batch num
    topk_num_ecah_class: out box data num

    Returns
    -------
    None
    """
    with self.instance.new_stmt_scope():
        # UB buffers sized to the maximum output box count (out_box_gm
        # row dimension), 8 values per box.
        box_data_ub = self.instance.Tensor(self.dtype,
                                           (self.out_box_gm.shape[1], 8),
                                           name="box_data_ub",
                                           scope=tik.scope_ubuf)
        topk3_out_ub = self.instance.Tensor(self.dtype,
                                            (self.out_box_gm.shape[1], 8),
                                            name="topk3_out_ub",
                                            scope=tik.scope_ubuf)
        burst_val_tmp_scalar = self.instance.Scalar(
            "int32", "burst_val_tmp_scalar", 0)
        # keep_top_k limiting active: read the clamped result buffers.
        with self.instance.if_scope(
                tik.all(self.keep_top_k > -1,
                        topk_num_ecah_class > self.keep_top_k)):
            # Python-level guard: keep_top_k is a compile-time constant,
            # so a non-positive value emits no copy at all.
            if self.keep_top_k > 0:
                self.instance.data_move(
                    box_data_ub, self.out_box_gm_tmp[batch, 0, 0], 0, 1,
                    math.ceil(self.keep_top_k * 8 / self.burnest_len),
                    0, 0)
                self.instance.data_move(
                    topk3_out_ub, self.topk3_out_gm[batch, 0, 0], 0, 1,
                    math.ceil(self.keep_top_k * 8 / self.burnest_len),
                    0, 0)
        # Otherwise copy the raw nms output (topk2 staging buffer).
        with self.instance.else_scope():
            self.get_tersor_data_burst_val(True, topk_num_ecah_class,
                                           burst_val_tmp_scalar)
            with self.instance.if_scope(burst_val_tmp_scalar > 0):
                self.instance.data_move(box_data_ub,
                                        self.topk2_in_gm[batch, 0, 0],
                                        0, 1, burst_val_tmp_scalar, 0, 0)
        self.set_crood_data_order(batch, topk_num_ecah_class, box_data_ub,
                                  topk3_out_ub)
def format_transfer_case_one(self, tik_instance):
    """
    the transfer process when UB can not put in N1 * N0 * X0 data

    The N dimension is tiled into loops of loop_n (plus an optional
    remainder), and X0 blocks are spread across cores.
    """
    # Usable UB element count, rounded down to a CUBE_SIZE^2 multiple.
    ub_ori_data = self.ub_memory - self.ub_memory % (CUBE_SIZE * CUBE_SIZE)
    ub_trans_data = ub_ori_data
    loop_n, loop_remainder = _cal_core_loop_python(
        CUBE_SIZE * CUBE_SIZE, self.dst_shape[1], ub_ori_data)
    # divide the core according to X0
    total_core_loop_num = self.dst_shape[0]
    core_number = _set_core_num(total_core_loop_num)
    with tik_instance.for_range(0, core_number, block_num=core_number) \
            as num_core:
        ub_ori = tik_instance.Tensor(self.dtype, (ub_ori_data, ),
                                     name="ub_ori", scope=tik.scope_ubuf)
        ub_trans = tik_instance.Tensor(self.dtype, (ub_trans_data, ),
                                       name="ub_trans",
                                       scope=tik.scope_ubuf)
        core_loop, sum_core = _cal_core(tik_instance, total_core_loop_num,
                                        num_core, core_number)
        with tik_instance.for_range(0, core_loop) as num_core_loop:
            total_core_loop = sum_core + num_core_loop
            num_x0 = total_core_loop
            # is_last marks the final N-tile so the callee can handle
            # edge padding/cleanup.
            is_last = tik_instance.Scalar("uint64", init_value=0)
            with tik_instance.for_range(0, self.dst_shape[1] // loop_n) \
                    as num_n_loop:
                # Only the last full loop is "last" when N divides
                # evenly; otherwise the remainder call below is last.
                with tik_instance.if_scope(
                        tik.all(
                            num_n_loop == self.dst_shape[1] // loop_n - 1,
                            self.dst_shape[1] % loop_n == 0)):
                    is_last.set_as(1)
                self.data_move_case_two(tik_instance, ub_ori, ub_trans,
                                        is_last, num_x0, num_n_loop,
                                        loop_n, loop_n)
            if loop_remainder != 0:
                is_last.set_as(1)
                self.data_move_case_two(tik_instance, ub_ori, ub_trans,
                                        is_last, num_x0,
                                        self.dst_shape[1] // loop_n,
                                        loop_n, loop_remainder)
    return tik_instance
def compute_mode_1(self, block_id):
    """
    compute for tiling mode 1 of 32B aligned for var row

    Parameters
    ----------
    block_id: id of ai core

    Returns
    -------
    None
    """
    tik_instance = self.tik_instance
    indices_ub = tik_instance.Tensor(self.indices_dtype,
                                     (self.indices_nums_once,),
                                     name="indices_ub",
                                     scope=tik.scope_ubuf)
    # Working buffers, one var-row partition each.
    var_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                 name="var_ub", scope=tik.scope_ubuf)
    accum_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                   name="accum_ub", scope=tik.scope_ubuf)
    linear_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                    name="linear_ub", scope=tik.scope_ubuf)
    grad_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                  name="grad_ub", scope=tik.scope_ubuf)
    tmp_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                 name="tmp_ub", scope=tik.scope_ubuf)
    tmp2_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                  name="tmp2_ub", scope=tik.scope_ubuf)
    ub_tuples = (var_ub, accum_ub, linear_ub, grad_ub, tmp_ub, tmp2_ub)
    self.var_cur_row = tik_instance.Scalar(dtype=self.tiling_dtype,
                                           name="var_cur_row")
    self.var_row_repeat = ceil_value(self.var_row_elem, self.vector_elem)
    # process indices_num_each_core:
    #   indices_nums_once * indices_loop_num + indices_nums_last
    with tik_instance.for_range(0, self.indices_loop_num) \
            as indices_loop_i:
        indices_num_offset = block_id * self.indices_num_each_core + \
                             indices_loop_i * self.indices_nums_once
        self.process_num_indices(ub_tuples, indices_ub,
                                 self.indices_nums_once,
                                 indices_num_offset)
    # Tail batch of indices for this core.
    with tik_instance.if_scope(self.indices_nums_last > 0):
        indices_num_offset = block_id * self.indices_num_each_core + \
                             self.indices_loop_num * self.indices_nums_once
        self.process_num_indices(ub_tuples, indices_ub,
                                 self.indices_nums_last,
                                 indices_num_offset)
    # Remaining indices that did not divide evenly over cores: the first
    # indices_num_remaining cores each take exactly one extra index.
    with tik_instance.if_scope(
            tik.all(self.indices_num_remaining > 0,
                    block_id < self.indices_num_remaining)):
        indices_num_offset = self.indices_num_each_core * \
                             self.need_core_num + block_id
        self.process_num_indices(ub_tuples, indices_ub, 1,
                                 indices_num_offset)
def pad_case0(self, tik_instance, split_core_idx, core_loop_list,
              model_list):
    """
    Pad case 0: handle outermost (top/bottom) padding, then run the main
    padding body on two groups of cores split at split_core_idx.

    Parameters
    ----------
    tik_instance: TIK instance
    split_core_idx: last core index of the first core group
    core_loop_list: per-group loop counts [group0, group1]
    model_list: per-group processing models; model "ub_reorder" means the
                buffer will be rewritten, so vec_dup cannot be skipped
    """
    with tik_instance.for_range(0, MAX_CORE, block_num=MAX_CORE) \
            as blk_idx:
        # use as many as possible core (MAX_CORE)
        # outermost padding (top, bottom)
        # vec_mark: pad_vec_dup_outermost had worked
        # and model_list[0][x] is 'ub_reorder',it will be 'True'
        # if vec_mark = True, the followed
        # computation will not vec_dup again
        in_num_top = self.in_paddings[0][0] * _prod(self.ou_shape[1:])
        in_num_bottom = self.in_paddings[0][1] * _prod(self.ou_shape[1:])
        vec_mark = [False, False]
        if max(in_num_top, in_num_bottom) > 0:
            self.pad_vec_dup_outermost(tik_instance, in_num_top,
                                       in_num_bottom, blk_idx)
            # vec_dup doesn't care about core
            # different core must obey the same rule
            if model_list[0][0] != "ub_reorder":
                vec_mark[0] = True
            if model_list[0][1] != "ub_reorder":
                vec_mark[1] = True
        # Group 0: cores [0, split_core_idx].
        with tik_instance.if_scope(blk_idx <= split_core_idx):
            src_gm = 0
            dst_gm = in_num_top
            self._pad_case0_main(tik_instance, core_loop_list[0],
                                 model_list[0], blk_idx, src_gm, dst_gm,
                                 vec_mark[0])
        # Group 1 exists only when the two groups carry different loads.
        # NOTE(review): src_gm/dst_gm below reuse the values assigned in
        # the group-0 branch — valid because if_scope bodies execute
        # unconditionally at kernel-build (trace) time, so the Python
        # variables leak out of the `with` block.
        if core_loop_list[0] != core_loop_list[1]:
            with tik_instance.if_scope(
                    tik.all(blk_idx > split_core_idx,
                            blk_idx < self.core)):
                # Offset past everything group 0 processed.
                processed_in_shape = self.in_shape.copy()
                processed_in_shape[0] = core_loop_list[0]
                processed_ou_shape = self.ou_shape.copy()
                processed_ou_shape[0] = core_loop_list[0]
                src_gm += (split_core_idx + 1) * _prod(processed_in_shape)
                dst_gm += (split_core_idx + 1) * _prod(processed_ou_shape)
                blk_idx = blk_idx - split_core_idx - 1
                self._pad_case0_main(tik_instance, core_loop_list[1],
                                     model_list[1], blk_idx, src_gm,
                                     dst_gm, vec_mark[1])
def check_batch(num_elements, elements_offset):
    """
    Load a batch of indices/updates from GM and accumulate each update
    into the core-local var cache when its index falls in this core's
    range [core_start, core_end).

    Parameters
    ----------
    num_elements: number of index/update pairs in this batch
    elements_offset: GM element offset of the batch start

    Returns
    -------
    None
    """
    # Burst counts rounded up to whole 32-byte blocks per dtype.
    idx_blocks = (num_elements + indices_block_len - 1) // indices_block_len
    uds_blocks = (num_elements + updates_block_len - 1) // updates_block_len
    tik_instance.data_move(indices_ub, indices_gm[elements_offset], 0, 1,
                           idx_blocks, 0, 0)
    tik_instance.data_move(updates_ub, updates_gm[elements_offset], 0, 1,
                           uds_blocks, 0, 0)
    with tik_instance.for_range(0, num_elements) as k:
        indices_var.set_as(indices_ub[k])
        # Only accumulate rows owned by this core.
        with tik_instance.if_scope(
                tik.all(indices_var >= core_start,
                        indices_var < core_end)):
            cur_var.set_as(var_ub[indices_var - core_start])
            cur_update.set_as(updates_ub[k])
            acc_var.set_as(cur_var + cur_update)
            # Use set_as for the element store, consistent with every
            # other tensor-element write in this file (direct item
            # assignment was inconsistent with the TIK idiom used
            # elsewhere).
            var_ub[indices_var - core_start].set_as(acc_var)
def _recursion_compute(obj, blk_idx):
    """
    recur_cond: condition that torch off stride between different cores.
    recur_gap_x: gap_x between in diff cores.
    recur_loop_x: work times by each core(type_x).
    recur_in_vol: volume of input_data by each core do once.
    recur_div_core: dividing line between two types of core.
    recur_total_core: physical cores in recursion.
    recur_start_address: start address in recursion
    """
    tik_instance = obj.tik_instance
    cond, gap0, gap1 = obj.recur_cond[0], obj.recur_gap_0[0], obj.recur_gap_1[
        0]
    loop0, loop1, in_vol = obj.recur_loop_0[0], obj.recur_loop_1[
        0], obj.recur_in_vol[0]
    max_num = obj.tik_instance.Scalar("int32", name="max_num_")

    def _main(processed, loop, block_index):
        # Run `loop` recursion steps for one core; `processed` is how
        # many steps earlier cores already consumed.
        src_ub = 0
        dst_ub = 0
        dst_gm = obj.recur_start_address[0]
        src_gm = 0
        axis = 0
        with tik_instance.for_range(0, loop) as idx:
            sum_core = processed + block_index * loop + idx
            # Destination advances by gap0 per `cond` steps plus gap1 for
            # the remainder within the current group.
            dst_gm += sum_core / cond * gap0 + sum_core % cond * gap1
            src_gm += sum_core * in_vol
            _recursion(obj, axis, dst_gm, src_gm, src_ub, dst_ub, max_num,
                       False)

    # Core group 0: [0, recur_div_core] run loop0 steps each.
    with tik_instance.if_scope(blk_idx <= obj.recur_div_core[0]):
        pro = 0
        _main(pro, loop0, blk_idx)
    # Core group 1: (recur_div_core, recur_total_core) run loop1 steps
    # each, offset past group 0's work.
    with tik_instance.if_scope(
            tik.all(blk_idx > obj.recur_div_core[0],
                    blk_idx < obj.recur_total_core[0])):
        pro = (obj.recur_div_core[0] + 1) * loop0
        blk_idx = blk_idx - obj.recur_div_core[0] - 1
        _main(pro, loop1, blk_idx)
def _do_vec_dup(ac_num, vir_num, begin_index, block_index, mark):
    """
    Fill a GM region of ac_num elements with the dup value, splitting
    the copy-out across up to MAX_CORE cores.

    Parameters
    ----------
    ac_num: total element count of the region
    vir_num: element count held in the UB staging buffer
    begin_index: GM start offset of the region
    block_index: current core index (TIK expression)
    mark: when truthy, UB was already vector_dup'ed by an earlier call
    """
    # MINI_UNIT-sized work units over the region.
    total_num = ac_num // (MINI_UNIT // self.num_bit)
    if total_num >= MAX_CORE:
        core_num = MAX_CORE
    else:
        core_num = total_num
    split_core_index, \
    core_loop_before, \
    core_loop_after = _cal_core(total_num, core_num, MAX_CORE)
    # Elements written per core in each of the two core groups.
    ac_num_one = (MINI_UNIT // self.num_bit) * core_loop_before
    ac_num_two = (MINI_UNIT // self.num_bit) * core_loop_after
    if not mark:
        self.set_vector_dup(tik_instance, vir_num, self.ubuf, 0)
    if split_core_index + 1 == core_num:
        # Single uniform group of cores.
        with tik_instance.if_scope(block_index <= split_core_index):
            begin_index += block_index * ac_num_one
            self.copy_ubuf_2_gm_case01(tik_instance, ac_num_one, vir_num,
                                       self.ubuf, 0, begin_index)
    else:
        with tik_instance.if_scope(block_index <= split_core_index):
            begin_index_new = begin_index + block_index * ac_num_one
            self.copy_ubuf_2_gm_case01(tik_instance, ac_num_one, vir_num,
                                       self.ubuf, 0, begin_index_new)
        with tik_instance.if_scope(
                tik.all(block_index > split_core_index,
                        block_index < core_num)):
            # Offset past group-0 work, then index within group 1.
            begin_index += ac_num_one * (split_core_index + 1)
            block_index = block_index - (split_core_index + 1)
            # NOTE(review): local name "begin_inde_new" is a typo of
            # "begin_index_new" — harmless, local only.
            begin_inde_new = begin_index + block_index * ac_num_two
            self.copy_ubuf_2_gm_case01(tik_instance, ac_num_two, vir_num,
                                       self.ubuf, 0, begin_inde_new)
def process_each_indices(self, process, loop_param, output_offset):
    """
    process each indices

    Parameters
    ----------
    process: ScatterProcess class,which used to store scatter nd
             parameters
    loop_param: a tupe keep the loop params
    output_offset: the offset of output data

    Returns
    -------
    None
    """
    update_offset = self.tik_instance.Scalar("int32")
    update_offset.set_as(0)
    # Hoisted loop invariant: index tuple width of each indices entry.
    each_size = process.get_each_size()
    with self.tik_instance.for_range(0, loop_param[2]) as ind_cycle:
        # Flattened output address computed from the current index tuple.
        start_address = self.tik_instance.Scalar("int32")
        start_address.set_as(0)
        for k in range(each_size):
            # (fixed: removed a dead set_as(0) that was immediately
            # overwritten, and corrected the 'saclar' typo in the name)
            indices_scalar = self.tik_instance.Scalar("int32")
            indices_scalar.set_as(ind_cycle * each_size + k)
            indices_scalar.set_as(process.input_indices_ub[indices_scalar])
            start_address.set_as(start_address + \
                                 indices_scalar * \
                                 process.elem_of_each_dim[k])
        # Last valid output address covered by this loop chunk.
        end_address = self.tik_instance.Scalar("int32")
        end_address.set_as(output_offset + (loop_param[3] - 1)
                           * process.update_each_size)
        # Only scatter updates whose target lies in this chunk's range.
        with self.tik_instance.if_scope(tik.all(start_address \
                                                >= output_offset,
                                                start_address \
                                                <= end_address)):
            update_offset.set_as((loop_param[0] * loop_param[1] // \
                                  each_size + \
                                  ind_cycle) * process.update_each_size)
            self.update_each_slice(process, update_offset, start_address)
def move_without_transform(self, shape):
    """
    when C = 1 or H*W = 1, directly move data in and out

    The flat input is split over cores, then over UB-sized chunks; an
    unaligned multi-core tail is written with a 16-element overlap so no
    core writes outside its range.
    """
    # UB elements available per buffer (double-buffered halves), rounded
    # down to a 16-element boundary.
    ub_size = (UB_SIZE_B - 1024) // 2 // self.dsize // 16 * 16
    if shape[0] <= 16:
        block_num = 1
    else:
        all_block_num = shape[0] // 16
        block_num = AICORE_NUM
        if all_block_num < AICORE_NUM:
            block_num = all_block_num
    each_len = shape[0] // block_num
    each_mod = shape[0] % block_num
    thread_num = 1
    # Double buffering only pays off with more than one full UB loop.
    if each_len // ub_size > 1:
        thread_num = 2
    with self.instance.for_range(0, block_num, block_num=block_num) \
            as block_id:
        # Last core also absorbs the division remainder.
        each_size = self.instance.Scalar("int32")
        each_size.set_as(each_len)
        with self.instance.if_scope(block_id == block_num - 1):
            each_size.set_as(each_len + each_mod)
        ub_loop = each_size // ub_size
        ub_mod = each_size % ub_size
        # Full UB-sized chunks: straight in/out copies.
        with self.instance.for_range(0, ub_loop,
                                     thread_num=thread_num) as loop_id:
            src_ub = self.instance.Tensor(self.dtype, (ub_size, ),
                                          name="src_ub",
                                          scope=tik.scope_ubuf)
            burst_len = ub_size // 16
            self.instance.data_move(
                src_ub,
                self.x_gm[each_len * block_id + loop_id * ub_size],
                0, 1, burst_len, 0, 0)
            self.instance.data_move(
                self.y_gm[each_len * block_id + loop_id * ub_size],
                src_ub, 0, 1, burst_len, 0, 0)
        # Tail smaller than one UB chunk.
        with self.instance.if_scope(ub_mod > 0):
            src_ub = self.instance.Tensor(self.dtype, (ub_size, ),
                                          name="src_ub",
                                          scope=tik.scope_ubuf)
            # Multi-core with a non-16-aligned tail: must not write past
            # this core's range, so finish with an overlapping 16-element
            # burst that ends exactly at the tail's end.
            with self.instance.if_scope(
                    tik.all(block_num > 1, ub_mod % 16 != 0)):
                src_ub_1 = self.instance.Tensor(self.dtype, (16, ),
                                                name="src_ub_1",
                                                scope=tik.scope_ubuf)
                index = each_len * block_id + ub_loop * ub_size
                with self.instance.if_scope(ub_mod >= 16):
                    # Copy the aligned part, then re-copy the final 16
                    # elements shifted back to cover the remainder.
                    burst_len = ub_mod // 16
                    self.instance.data_move(src_ub, self.x_gm[index],
                                            0, 1, burst_len, 0, 0)
                    self.instance.data_move(self.y_gm[index], src_ub,
                                            0, 1, burst_len, 0, 0)
                    offset = index + burst_len * 16 - 16 + ub_mod % 16
                    self.instance.data_move(src_ub_1, self.x_gm[offset],
                                            0, 1, 1, 0, 0)
                    self.instance.data_move(self.y_gm[offset], src_ub_1,
                                            0, 1, 1, 0, 0)
                with self.instance.else_scope():
                    # Tail < 16: single overlapping 16-element burst
                    # reaching back into the previous chunk.
                    offset = index - 16 + ub_mod % 16
                    self.instance.data_move(src_ub_1,
                                            self.x_gm[offset], 0, 1, 1,
                                            0, 0)
                    self.instance.data_move(self.y_gm[offset], src_ub_1,
                                            0, 1, 1, 0, 0)
            # Single core (or aligned tail): rounding the burst up is
            # safe because no other core writes behind this range.
            with self.instance.else_scope():
                burst_len = (ub_mod + 15) // 16
                self.instance.data_move(
                    src_ub,
                    self.x_gm[each_len * block_id + ub_loop * ub_size],
                    0, 1, burst_len, 0, 0)
                self.instance.data_move(
                    self.y_gm[each_len * block_id + ub_loop * ub_size],
                    src_ub, 0, 1, burst_len, 0, 0)
def compute_mode_3(self, half_ub_size, block_id):
    """
    compute for tiling mode 3

    Parameters
    ----------
    half_ub_size: bytes of half UB
    block_id: id of ai core

    Returns
    -------
    None
    """
    tik_instance = self.tik_instance
    indices_dsize = self.indices_dsize
    params_dsize = self.params_dsize

    with tik_instance.if_scope(block_id < self.need_core_num):
        # Half of UB for indices, half for the gathered result rows.
        indices_ub = tik_instance.Tensor(
            self.indices_dtype, (half_ub_size // indices_dsize,),
            name="indices_ub", scope=tik.scope_ubuf)
        res_ub = tik_instance.Tensor(
            self.params_dtype, (half_ub_size // params_dsize,),
            name="res_ub", scope=tik.scope_ubuf)
        burst_len_row = self.params_row * params_dsize // BLOCK_SIZE

        # Outer loop over the leading (pre-axis) dimension of params.
        with tik_instance.for_range(0, self.params_pre) as pre_i:
            gm_offset_base = pre_i * self.params_axis
            indices_offset = tik_instance.Scalar(
                dtype=self.indices_dtype, name="indices_offset")

            # Full batches of indices loaded per UB round.
            with tik_instance.for_range(0, self.indices_loop_num) \
                    as indices_loop_i:
                indices_offset.set_as(
                    block_id * self.indices_num_each_core +
                    indices_loop_i * self.indices_row_num_once)
                # copy indices data to ub from gm
                tik_instance.data_move(
                    indices_ub, self.indices[indices_offset], 0, 1,
                    ceil_value(self.indices_row_num_once * indices_dsize,
                               BLOCK_SIZE), 0, 0)
                burst_len_res = self.row_num_once_ub * \
                    self.params_row * params_dsize // BLOCK_SIZE
                inner_indices_offset = tik_instance.Scalar(
                    dtype=self.indices_dtype, name="inner_indices_offset")
                output_offset = tik_instance.Scalar(
                    dtype=self.indices_dtype, name="output_offset")
                # Inner loop over row_num_once_ub-sized result chunks.
                with tik_instance.for_range(0, self.inner_loop_num) \
                        as inner_loop_i:
                    inner_indices_offset.set_as(
                        inner_loop_i * self.row_num_once_ub)
                    output_offset.set_as(
                        (pre_i * self.indices_num +
                         block_id * self.indices_num_each_core +
                         indices_loop_i * self.indices_row_num_once +
                         inner_loop_i * self.row_num_once_ub) *
                        self.params_row)
                    self.indices_inner_gather(
                        indices_ub, res_ub, self.row_num_once_ub,
                        inner_indices_offset, gm_offset_base,
                        output_offset, burst_len_row, burst_len_res)
                # Tail chunk of this indices batch.
                with tik_instance.if_scope(
                        self.row_num_once_tail_ub > 0):
                    burst_len_res = self.row_num_once_tail_ub * \
                        self.params_row * params_dsize // BLOCK_SIZE
                    inner_indices_offset.set_as(
                        self.inner_loop_num * self.row_num_once_ub)
                    output_offset.set_as(
                        (pre_i * self.indices_num +
                         block_id * self.indices_num_each_core +
                         indices_loop_i * self.indices_row_num_once +
                         self.inner_loop_num * self.row_num_once_ub) *
                        self.params_row)
                    self.indices_inner_gather(
                        indices_ub, res_ub, self.row_num_once_tail_ub,
                        inner_indices_offset, gm_offset_base,
                        output_offset, burst_len_row, burst_len_res)

            # Last (partial) batch of indices for this core.
            with tik_instance.if_scope(self.indices_row_num_last > 0):
                burst_len_res = self.row_num_last_ub * \
                    self.params_row * params_dsize // BLOCK_SIZE
                indices_offset.set_as(
                    block_id * self.indices_num_each_core +
                    self.indices_loop_num * self.indices_row_num_once)
                # copy indices data to ub from gm
                tik_instance.data_move(
                    indices_ub, self.indices[indices_offset], 0, 1,
                    ceil_value(self.indices_row_num_last * indices_dsize,
                               BLOCK_SIZE), 0, 0)
                inner_indices_offset = tik_instance.Scalar(
                    dtype=self.indices_dtype, name="inner_indices_offset")
                output_offset = tik_instance.Scalar(
                    dtype=self.indices_dtype, name="output_offset")
                with tik_instance.for_range(0, self.inner_loop_num_last) \
                        as inner_loop_i:
                    inner_indices_offset.set_as(
                        inner_loop_i * self.row_num_last_ub)
                    output_offset.set_as(
                        (pre_i * self.indices_num +
                         block_id * self.indices_num_each_core +
                         self.indices_loop_num *
                         self.indices_row_num_once +
                         inner_loop_i * self.row_num_last_ub) *
                        self.params_row)
                    self.indices_inner_gather(
                        indices_ub, res_ub, self.row_num_last_ub,
                        inner_indices_offset, gm_offset_base,
                        output_offset, burst_len_row, burst_len_res)
                # Tail chunk of the last batch.
                with tik_instance.if_scope(
                        self.row_num_last_tail_ub > 0):
                    burst_len_res = self.row_num_last_tail_ub * \
                        self.params_row * params_dsize // BLOCK_SIZE
                    inner_indices_offset.set_as(
                        self.inner_loop_num_last * self.row_num_last_ub)
                    output_offset.set_as(
                        (pre_i * self.indices_num +
                         block_id * self.indices_num_each_core +
                         self.indices_loop_num *
                         self.indices_row_num_once +
                         self.inner_loop_num_last *
                         self.row_num_last_ub) * self.params_row)
                    self.indices_inner_gather(
                        indices_ub, res_ub, self.row_num_last_tail_ub,
                        inner_indices_offset, gm_offset_base,
                        output_offset, burst_len_row, burst_len_res)

            # Indices that did not divide evenly over cores: handled by
            # the designated tail core only.
            with tik_instance.if_scope(
                    tik.all(self.indices_num_remaining > 0,
                            block_id == self.tail_process_core)):
                indices_offset.set_as(
                    self.need_core_num * self.indices_num_each_core)
                # copy indices data to ub from gm
                tik_instance.data_move(
                    indices_ub, self.indices[indices_offset], 0, 1,
                    ceil_value(self.indices_num_remaining * indices_dsize,
                               BLOCK_SIZE), 0, 0)
                output_offset = tik_instance.Scalar(
                    dtype=self.indices_dtype, name="output_offset")
                output_offset.set_as(
                    (pre_i * self.indices_num +
                     self.need_core_num * self.indices_num_each_core) *
                    self.params_row)
                burst_len_res_tail = self.indices_num_remaining * \
                    self.params_row * params_dsize // BLOCK_SIZE
                self.indices_inner_gather(
                    indices_ub, res_ub, self.indices_num_remaining, 0,
                    gm_offset_base, output_offset, burst_len_row,
                    burst_len_res_tail)
def data_move_case_one(self, tik_instance, ub_ori, ub_trans, core_loop,
                       sum_core, align_loop, remainder, num_data_one_loop):
    """
    the data_move process when UB can put in N1 * N0 * X0 data
    and N % 16 != 0

    Stages up to ``align_loop`` X0 blocks in ``ub_ori``, rearranges them
    into ``ub_trans`` via data_rearrange_case_zero, and flushes the batch
    to ``dst_gm``; the final partial batch (``remainder`` blocks) is
    flushed separately.

    Parameters
    ----------
    tik_instance : TIK instance used to emit instructions
    ub_ori : UB tensor holding raw (un-rearranged) input blocks
    ub_trans : UB tensor holding rearranged output blocks
    core_loop : number of loop iterations handled by this core
    sum_core : global loop-iteration offset of this core
    align_loop : number of X0 blocks staged in UB before a flush
    remainder : number of X0 blocks in the final (partial) flush
    num_data_one_loop : element count of one X0 block

    Returns
    -------
    None
    """
    # Flag consumed by data_rearrange_case_zero: set to 1 when the last
    # X0 block contains zero padding in the N direction.
    is_x_padding = tik_instance.Scalar("uint64", init_value=0)
    with tik_instance.for_range(0, core_loop) as num_core_loop:
        total_core_loop = sum_core + num_core_loop
        num_x0 = total_core_loop
        # zero padding if C != 4: clear the staging buffer at the start
        # of every align_loop batch so padded lanes read as zero
        with tik_instance.if_scope(num_core_loop % align_loop == 0):
            if self.src_shape[2] != C0:
                self.vector_dup_zero(tik_instance, ub_ori,
                                     align_loop * num_data_one_loop, 0)
        src_gm_index = num_x0 * self.src_shape[3] * self.src_shape[2] * \
                       CUBE_SIZE // C0
        src_ub_index = (num_core_loop % align_loop) * num_data_one_loop
        if C0 * self.src_shape[0] * self.src_shape[1] % CUBE_SIZE != 0:
            # N is not a multiple of CUBE_SIZE: the very last X0 block
            # only holds (src_shape[0]*src_shape[1]) % (CUBE_SIZE // C0)
            # real CN groups; the rest stays zero-padded.
            with tik_instance.if_scope(num_x0 == self.dst_shape[0] - 1):
                is_x_padding.set_as(1)
                with tik_instance.for_range(
                        0, self.src_shape[0] * self.src_shape[1] %
                        (CUBE_SIZE // C0)) as num_cn:
                    with tik_instance.for_range(0, self.src_shape[2])\
                            as num_row:
                        tik_instance.data_move(
                            ub_ori[src_ub_index +
                                   (num_cn * C0 + num_row) *
                                   self.dst_shape[1] * self.dst_shape[2]],
                            self.src_gm[src_gm_index +
                                        (num_cn * self.src_shape[2] +
                                         num_row) * self.src_shape[3]],
                            0, 1,
                            self.dst_shape[1] * self.dst_shape[2] //
                            self.num_data, 0, 0)
            with tik_instance.else_scope():
                # Full X0 block: copy all CUBE_SIZE // C0 CN groups.
                with tik_instance.for_range(0, CUBE_SIZE // C0) as num_cn:
                    with tik_instance.for_range(0, self.src_shape[2])\
                            as num_row:
                        tik_instance.data_move(
                            ub_ori[src_ub_index +
                                   (num_cn * C0 + num_row) *
                                   self.dst_shape[1] * self.dst_shape[2]],
                            self.src_gm[src_gm_index +
                                        (num_cn * self.src_shape[2] +
                                         num_row) * self.src_shape[3]],
                            0, 1,
                            self.dst_shape[1] * self.dst_shape[2] //
                            self.num_data, 0, 0)
        else:
            # N divides CUBE_SIZE evenly: every X0 block is full.
            with tik_instance.for_range(0, CUBE_SIZE // C0) as num_cn:
                with tik_instance.for_range(0, self.src_shape[2])\
                        as num_row:
                    tik_instance.data_move(
                        ub_ori[src_ub_index +
                               (num_cn * C0 + num_row) *
                               self.dst_shape[1] * self.dst_shape[2]],
                        self.src_gm[src_gm_index +
                                    (num_cn * self.src_shape[2] +
                                     num_row) * self.src_shape[3]],
                        0, 1,
                        self.dst_shape[1] * self.dst_shape[2] //
                        self.num_data, 0, 0)
        # Flush a full batch of align_loop staged blocks (but not on the
        # final iteration, which is handled by the remainder flush below).
        with tik_instance.if_scope(
                tik.all((num_core_loop + 1) % align_loop == 0,
                        num_core_loop != core_loop - 1)):
            self.data_rearrange_case_zero(tik_instance, ub_ori, ub_trans,
                                          align_loop, is_x_padding)
            dst_gm_index = (num_x0 - (align_loop - 1)) * num_data_one_loop
            tik_instance.data_move(
                self.dst_gm[dst_gm_index], ub_trans[0], 0, 1,
                align_loop * num_data_one_loop // self.num_data, 0, 0)
        # Final iteration: flush whatever is staged (remainder blocks).
        with tik_instance.if_scope(num_core_loop == core_loop - 1):
            self.data_rearrange_case_zero(tik_instance, ub_ori, ub_trans,
                                          remainder, is_x_padding)
            dst_gm_index = (num_x0 - (remainder - 1)) * num_data_one_loop
            tik_instance.data_move(
                self.dst_gm[dst_gm_index], ub_trans[0], 0, 1,
                remainder * num_data_one_loop // self.num_data, 0, 0)
def compute_mode_1(self, half_ub_size, block_id):
    """
    compute for tiling mode 1

    Gathers rows from the params tensor according to the indices tensor.
    Indices for this core are processed in UB-sized chunks; each chunk is
    further split into full inner loops plus a tail, and a final
    cross-core remainder is handled by the tail-process core only.

    Parameters
    ----------
    half_ub_size: bytes of half UB
    block_id: id of ai core

    Returns
    -------
    None
    """
    tik_instance = self.tik_instance
    indices_dsize = self.indices_dsize
    params_dsize = self.params_dsize
    with tik_instance.if_scope(block_id < self.need_core_num):
        # One UB buffer for indices and one for gathered rows; the extra
        # 256 / BLOCK_SIZE bytes give headroom for aligned bursts.
        indices_ub = tik_instance.Tensor(
            self.indices_dtype, ((half_ub_size + 256) // indices_dsize,),
            name="indices_ub", scope=tik.scope_ubuf)
        res_ub = tik_instance.Tensor(
            self.params_dtype,
            ((half_ub_size + BLOCK_SIZE) // params_dsize,),
            name="res_ub", scope=tik.scope_ubuf)
        burst_len_row = ceil_value(self.params_row * params_dsize,
                                   BLOCK_SIZE)
        with tik_instance.for_range(0, self.params_pre) as pre_i:
            gm_offset_base = pre_i * self.params_axis
            # indices_num_each_core =
            #     indices_row_num_once * indices_loop_num + indices_row_num_last
            with tik_instance.for_range(
                    0, self.indices_loop_num) as indices_loop_i:
                indices_offset = block_id * self.indices_num_each_core + \
                                 indices_loop_i * self.indices_row_num_once
                # copy indices data to ub from gm
                tik_instance.data_move(
                    indices_ub, self.indices[indices_offset], 0, 1,
                    ceil_value(self.indices_row_num_once * indices_dsize,
                               BLOCK_SIZE), 0, 0)
                # indices_row_num_once =
                #     row_num_once_ub * inner_loop_num + row_num_once_tail_ub
                # a1. row_num_once_ub * inner_loop_num
                burst_len_res = ceil_value(
                    self.row_num_once_ub * self.params_row * params_dsize,
                    BLOCK_SIZE)
                with tik_instance.for_range(
                        0, self.inner_loop_num) as inner_loop_i:
                    inner_indices_offset = inner_loop_i * self.row_num_once_ub
                    output_offset = (pre_i * self.indices_num +
                                     block_id * self.indices_num_each_core +
                                     indices_loop_i *
                                     self.indices_row_num_once +
                                     inner_loop_i * self.row_num_once_ub) * \
                                    self.params_row
                    self.indices_inner_gather_1(
                        indices_ub, res_ub, self.row_num_once_ub,
                        inner_indices_offset, gm_offset_base,
                        output_offset, burst_len_row, burst_len_res)
                # a2. row_num_once_tail_ub
                with tik_instance.if_scope(self.row_num_once_tail_ub > 0):
                    burst_len_res = ceil_value(
                        self.row_num_once_tail_ub * self.params_row *
                        params_dsize, BLOCK_SIZE)
                    inner_indices_offset = self.inner_loop_num * \
                                           self.row_num_once_ub
                    output_offset = (pre_i * self.indices_num +
                                     block_id * self.indices_num_each_core +
                                     indices_loop_i *
                                     self.indices_row_num_once +
                                     self.inner_loop_num *
                                     self.row_num_once_ub) * self.params_row
                    self.indices_inner_gather_last_1(
                        indices_ub, res_ub, self.row_num_once_tail_ub,
                        inner_indices_offset, gm_offset_base,
                        output_offset, burst_len_row, burst_len_res)
            # b. indices_row_num_last: the last (partial) indices chunk
            with tik_instance.if_scope(self.indices_row_num_last > 0):
                burst_len_res = ceil_value(
                    self.row_num_last_ub * self.params_row * params_dsize,
                    BLOCK_SIZE)
                indices_offset = block_id * self.indices_num_each_core + \
                                 self.indices_loop_num * \
                                 self.indices_row_num_once
                # copy indices data to ub from gm
                tik_instance.data_move(
                    indices_ub, self.indices[indices_offset], 0, 1,
                    ceil_value(self.indices_row_num_last * indices_dsize,
                               BLOCK_SIZE), 0, 0)
                with tik_instance.for_range(
                        0, self.inner_loop_num_last) as inner_loop_i:
                    inner_indices_offset = inner_loop_i * self.row_num_last_ub
                    output_offset = (pre_i * self.indices_num +
                                     block_id * self.indices_num_each_core +
                                     self.indices_loop_num *
                                     self.indices_row_num_once +
                                     inner_loop_i * self.row_num_last_ub) * \
                                    self.params_row
                    self.indices_inner_gather_1(
                        indices_ub, res_ub, self.row_num_last_ub,
                        inner_indices_offset, gm_offset_base,
                        output_offset, burst_len_row, burst_len_res)
                with tik_instance.if_scope(self.row_num_last_tail_ub > 0):
                    burst_len_res = ceil_value(
                        self.row_num_last_tail_ub * self.params_row *
                        params_dsize, BLOCK_SIZE)
                    inner_indices_offset = self.inner_loop_num_last * \
                                           self.row_num_last_ub
                    output_offset = (pre_i * self.indices_num +
                                     block_id * self.indices_num_each_core +
                                     self.indices_loop_num *
                                     self.indices_row_num_once +
                                     self.inner_loop_num_last *
                                     self.row_num_last_ub) * self.params_row
                    self.indices_inner_gather_last_1(
                        indices_ub, res_ub, self.row_num_last_tail_ub,
                        inner_indices_offset, gm_offset_base,
                        output_offset, burst_len_row, burst_len_res)
            # c. cross-core remainder: indices that did not divide evenly
            # across cores are handled by the designated tail core only.
            with tik_instance.if_scope(
                    tik.all(self.indices_num_remaining > 0,
                            block_id == self.tail_process_core)):
                indices_offset = self.need_core_num * \
                                 self.indices_num_each_core
                # copy indices data to ub from gm
                tik_instance.data_move(
                    indices_ub, self.indices[indices_offset], 0, 1,
                    ceil_value(self.indices_num_remaining * indices_dsize,
                               BLOCK_SIZE), 0, 0)
                output_offset = (pre_i * self.indices_num +
                                 self.need_core_num *
                                 self.indices_num_each_core) * \
                                self.params_row
                burst_len_res_tail = ceil_value(
                    self.indices_num_remaining * self.params_row *
                    params_dsize, BLOCK_SIZE)
                self.indices_inner_gather_1(
                    indices_ub, res_ub, self.indices_num_remaining, 0,
                    gm_offset_base, output_offset, burst_len_row,
                    burst_len_res_tail)
def data_move(self, input_dict):
    """
    Copy one contiguous segment from the input GM tensor to the output
    GM tensor, staging through a UB buffer and looping whenever the
    segment is larger than one UB-sized chunk.

    Parameters
    ----------
    input_dict: dict with the following keys:
        x_ub: UB tensor used as the staging buffer
        src_start: start offset (in elements) inside the source tensor
        dest_start: start offset (in elements) inside the dest tensor
        element_num: length of this continuous segment in elements
        block_num: number of AI cores participating in the copy

    Returns
    -------
    None
    """
    stage_ub = input_dict.get("x_ub")
    segment_len = input_dict.get("element_num")
    core_count = input_dict.get("block_num")
    loop_times, tail_elems = get_loop_param(segment_len,
                                            self.one_max_size)

    # Per-iteration sizes; both shrink on the final loop iteration.
    bytes_cur = self.instance.Scalar("int32")
    bytes_cur.set_as(self.one_max_size * self.dsize)
    elems_cur = self.instance.Scalar("int32")
    elems_cur.set_as(self.one_max_size)
    src_pos = self.instance.Scalar("int32")
    src_pos.set_as(input_dict.get("src_start"))
    dst_pos = self.instance.Scalar("int32")
    dst_pos.set_as(input_dict.get("dest_start"))
    elems_per_block = constant.BLOCK_SIZE // self.dsize

    with self.instance.for_range(0, loop_times) as loop_i:
        with self.instance.if_scope(loop_i == loop_times - 1):
            bytes_cur.set_as(tail_elems * self.dsize)
            elems_cur.set_as(tail_elems)
        burst_cnt = common_util.get_datamove_nburst(self.instance,
                                                    bytes_cur)
        tail_bytes = bytes_cur % constant.BLOCK_SIZE
        with self.instance.if_scope(
                tik.all(loop_i == loop_times - 1, tail_bytes != 0,
                        core_count > 1)):
            # Multi-core run with a 32B-unaligned tail: copy the last
            # aligned window through a tiny dedicated buffer first so
            # the final burst never writes past this core's range.
            tail_buf = self.instance.Tensor(self.dtype, (32,),
                                            name="x_ub_tail",
                                            scope=tik.scope_ubuf)
            self.instance.data_move(
                tail_buf,
                self.x_gm[src_pos + elems_cur - elems_per_block],
                constant.SID, constant.DEFAULT_NBURST, 1,
                constant.STRIDE_ZERO, constant.STRIDE_ZERO)
            self.instance.data_move(
                self.y_gm[dst_pos + elems_cur - elems_per_block],
                tail_buf, constant.SID, constant.DEFAULT_NBURST, 1,
                constant.STRIDE_ZERO, constant.STRIDE_ZERO)
            with self.instance.if_scope(
                    bytes_cur > constant.BLOCK_SIZE):
                # Move the aligned front part with one burst fewer;
                # the tail burst above already covered the rest.
                self.instance.data_move(
                    stage_ub, self.x_gm[src_pos], constant.SID,
                    constant.DEFAULT_NBURST, burst_cnt - 1,
                    constant.STRIDE_ZERO, constant.STRIDE_ZERO)
                self.instance.data_move(
                    self.y_gm[dst_pos], stage_ub, constant.SID,
                    constant.DEFAULT_NBURST, burst_cnt - 1,
                    constant.STRIDE_ZERO, constant.STRIDE_ZERO)
        with self.instance.else_scope():
            # Aligned (or single-core) path: plain chunk copy.
            self.instance.data_move(
                stage_ub, self.x_gm[src_pos], constant.SID,
                constant.DEFAULT_NBURST, burst_cnt,
                constant.STRIDE_ZERO, constant.STRIDE_ZERO)
            self.instance.data_move(
                self.y_gm[dst_pos], stage_ub, constant.SID,
                constant.DEFAULT_NBURST, burst_cnt,
                constant.STRIDE_ZERO, constant.STRIDE_ZERO)
        src_pos.set_as(src_pos + elems_cur)
        dst_pos.set_as(dst_pos + elems_cur)
def proposal_pooling_fp32(self, proposal_id, c1_loop_index):
    """
    max pooling from the h direction, then max pooling from the
    w direction, for fp32

    Parameters
    ----------
    proposal_id: which proposal is now being processed
    c1_loop_index: c1 index of the feature map 4C0

    Returns
    -------
    None
    """
    scalar_roi_start_w = self.tik_instance.Scalar("int32", \
        name="scalar_roi_start_w")
    scalar_roi_start_w.set_as(self.roi_start_w[0, proposal_id])
    scalar_roi_start_h = self.tik_instance.Scalar("int32", \
        name="scalar_roi_start_h")
    scalar_roi_bin_h = self.tik_instance.Scalar("int32",
                                                name="scalar_roi_bin_h")
    scalar_roi_width = self.tik_instance.Scalar("int32",
                                                name="scalar_roi_width")
    scalar_roi_width.set_as(self.roi_width[proposal_id])
    scalar_roi_height = self.tik_instance.Scalar("int32",
                                                 name="scalar_roi_height")
    scalar_roi_height.set_as(self.roi_height[proposal_id])
    # Accumulator for the final pooled output of this proposal.
    pooled_res = self.tik_instance.Tensor(FP32, \
        shape=(FOUR_C0, self.pooled_h, self.pooled_w, self.fm_c0), \
        scope=tik.scope_ubuf, name="pooled_res")
    res_size = FOUR_C0 * self.pooled_h * self.pooled_w * self.fm_c0
    # Zero-fill pooled_res in 64-element repeats, plus a partial tail.
    if res_size // DIGIT_64 >= 1:
        self.tik_instance.vec_dup(DIGIT_256 // self.dsize,
                                  pooled_res[0, 0, 0, 0], 0,
                                  res_size // DIGIT_64, DIGIT_8)
    if res_size % DIGIT_64 != 0:
        # tail
        self.tik_instance.vec_dup(
            res_size % DIGIT_64,
            pooled_res[res_size // DIGIT_64 * DIGIT_64], 0,
            DIGIT_1, DIGIT_8)
    # Intermediate buffer holding one pooled row (h-direction max).
    pooled_h_res = self.tik_instance.Tensor(FP32, \
        shape=(FOUR_C0, 1, self.fm_w_align, self.fm_c0), \
        scope=tik.scope_ubuf, name="pooled_h_res")
    pooled_h_res_size = FOUR_C0 * 1 * self.fm_w_align * self.fm_c0
    with self.tik_instance.for_range(0, self.pooled_h) as pooled_h_i:
        scalar_roi_start_h.set_as(self.roi_start_h[pooled_h_i,
                                                   proposal_id])
        scalar_roi_bin_h.set_as(self.roi_bin_h[pooled_h_i, proposal_id])
        # Only pool non-empty bins; empty bins keep the zero initial
        # value written above.
        with self.tik_instance.if_scope(
                tik.all(scalar_roi_bin_h != 0, scalar_roi_width != 0)):
            self.tik_instance.vec_dup(DIGIT_256 // self.dsize,
                                      pooled_h_res[0, 0, 0, 0], 0,
                                      pooled_h_res_size // DIGIT_64,
                                      DIGIT_8)
            if self.fm_h * self.fm_w < DIGIT_128:
                # Small feature map: strides fit in vmax's 8-bit block
                # stride fields, so all FOUR_C0 groups go in one call
                # per w column.
                with self.tik_instance.for_range(0, scalar_roi_width) \
                        as w_index:
                    self.tik_instance.vmax(
                        FOUR_C0 * self.fm_c0,
                        pooled_h_res[0, 0, w_index, 0],
                        self.proposal_fm_data[0, scalar_roi_start_h,
                                              scalar_roi_start_w +
                                              w_index, 0],
                        pooled_h_res[0, 0, w_index, 0],
                        scalar_roi_bin_h,
                        self.fm_w_align * C0 * self.dsize // BLOCK_SIZE,
                        self.fm_h * self.fm_w * C0 * self.dsize //
                        BLOCK_SIZE,
                        self.fm_w_align * C0 * self.dsize // BLOCK_SIZE,
                        0,
                        self.fm_w * C0 * self.dsize // BLOCK_SIZE, 0)
            else:
                # Large feature map: process each C0 group separately,
                # pooling up to 4 w columns per vec_max call.
                with self.tik_instance.for_range(0, FOUR_C0) as c0_i:
                    with self.tik_instance.if_scope(
                            scalar_roi_width <= DIGIT_4):
                        self.tik_instance.vec_max(
                            scalar_roi_width * self.fm_c0,
                            pooled_h_res[c0_i, 0, 0, 0],
                            self.proposal_fm_data[c0_i,
                                                  scalar_roi_start_h,
                                                  scalar_roi_start_w, 0],
                            pooled_h_res[c0_i, 0, 0, 0],
                            scalar_roi_bin_h, 0,
                            self.fm_w * C0 * self.dsize // BLOCK_SIZE, 0)
                    with self.tik_instance.else_scope():
                        with self.tik_instance.for_range(0, \
                                scalar_roi_width // DIGIT_4) as loop_4w_i:
                            self.tik_instance.vec_max(
                                DIGIT_256 // self.dsize,
                                pooled_h_res[c0_i, 0,
                                             DIGIT_4 * loop_4w_i, 0],
                                self.proposal_fm_data[
                                    c0_i, scalar_roi_start_h,
                                    scalar_roi_start_w +
                                    DIGIT_4 * loop_4w_i, 0],
                                pooled_h_res[c0_i, 0,
                                             DIGIT_4 * loop_4w_i, 0],
                                scalar_roi_bin_h, 0,
                                self.fm_w * C0 * self.dsize //
                                BLOCK_SIZE, 0)
                        # Remaining (width % 4) columns.
                        with self.tik_instance.if_scope(
                                scalar_roi_width % DIGIT_4 != 0):
                            tmp_w = scalar_roi_width // DIGIT_4 * DIGIT_4
                            self.tik_instance.vec_max(
                                (scalar_roi_width - tmp_w)*self.fm_c0,
                                pooled_h_res[c0_i, 0, tmp_w, 0],
                                self.proposal_fm_data[c0_i, \
                                    scalar_roi_start_h, \
                                    scalar_roi_start_w + tmp_w, 0],
                                pooled_h_res[c0_i, 0, tmp_w, 0],
                                scalar_roi_bin_h, 0,
                                self.fm_w*C0*self.dsize // BLOCK_SIZE, 0)
            # w-direction max over the h-pooled row into pooled_res.
            self.proposal_pooling_w(proposal_id, pooled_h_i, pooled_res,
                                    pooled_h_res)
    # move result to out
    with self.tik_instance.if_scope(
            c1_loop_index != self.c1_looptime - 1):
        self.tik_instance.data_move(
            self.y[self.ouput_proposal_offset + self.calced_rois +
                   proposal_id, c1_loop_index * FOUR_C0, 0, 0, 0],
            pooled_res[0, 0, 0, 0], 0, 1,
            FOUR_C0 * self.pooled_h * self.pooled_w * C0 *
            self.dsize // BLOCK_SIZE, 0, 0)
    with self.tik_instance.else_scope():
        # tail: the final c1 loop may carry fewer than FOUR_C0 groups.
        self.tik_instance.data_move(
            self.y[self.ouput_proposal_offset +
                   self.calced_rois+proposal_id,
                   (self.c1_looptime - 1) * FOUR_C0, 0, 0, 0],
            pooled_res[0, 0, 0, 0], 0, 1,
            self.tail_c0_num * self.pooled_h * self.pooled_w * \
            C0 * self.dsize // BLOCK_SIZE, 0, 0)
def do_crop_and_resize_compute_one_core(box_num_sigment, obj,
                                        box_num_offset):
    """do crop and resize in one core
    step 1 read boxes from boxes and calc
           h_top_index/h_bottom_index/h_lerp/w_left_index/
           w_right_index/w_lerp
    step 2 read input_batch_num from box_index
    step 3 copy 4 data(Total C(C1*C0)) in ub use
           use input_batch_num/h_top_index/h_bottom_index/
           w_left_index/w_right_index
    step 4 calcu the out
           top = top_left + (top_right - top_left) * x_lerp
           bottom = bottom_left + (bottom_right - bottom_left) * x_lerp
           out = top + (bottom - top) * y_lerp;

    Parameters:
    ----------
    box_num_sigment : int.
        the crop boxes num for one core
    obj : class.
        crop_and_resize par object
    box_num_offset: int
        copy boxes offset

    Returns
    -------
    None
    """
    tik_instance = obj.get_tik_instance()
    # get float32 index ub
    index_ub = obj.index_ub
    # NOTE(review): boxes appear to be packed 4 floats per box
    # (y1, x1, y2, x2) — confirm against the boxes input layout.
    men_len = get_ceil_int(box_num_sigment*4,
                           obj.boxes_vector_num) * obj.boxes_vector_num
    # apply ub mem for index
    boxes_ub_small = obj.apply_mem((men_len,), "boxes_ub_h1",
                                   tik.scope_ubuf, obj.boxes_type)
    boxes_ub_big = obj.apply_mem((men_len,), "boxes_ub_h2",
                                 tik.scope_ubuf, obj.boxes_type)
    boxes_ub_scale = obj.apply_mem((men_len,), "boxes_ub_scale",
                                   tik.scope_ubuf, obj.boxes_type)
    copy_burst_len = get_ceil_int(box_num_sigment*4, obj.boxes_block_num)
    # init ub for input offset (per-batch, per-row, per-pixel strides
    # broadcast into vectors for vmul below)
    batch_offset_ub = obj.apply_mem((obj.boxes_vector_num,),
                                    "batch_offset_ub", tik.scope_ubuf,
                                    "int32")
    height_offset_ub = obj.apply_mem((obj.boxes_vector_num,),
                                     "height_offset_ub", tik.scope_ubuf,
                                     "int32")
    width_offset_ub = obj.apply_mem((obj.boxes_vector_num,),
                                    "width_offset_ub", tik.scope_ubuf,
                                    "int32")
    tik_instance.vector_dup(
        obj.boxes_vector_num, batch_offset_ub,
        obj.image_c1*obj.image_c0*obj.image_height*obj.image_width,
        1, 1, 8)
    tik_instance.vector_dup(obj.boxes_vector_num, height_offset_ub,
                            obj.image_c0*obj.image_width, 1, 1, 8)
    tik_instance.vector_dup(obj.boxes_vector_num, width_offset_ub,
                            obj.image_c0, 1, 1, 8)
    # copy boxes in boxes_ub_small
    tik_instance.data_move(boxes_ub_small,
                           obj.input_gm_list[1][box_num_offset*4],
                           0, 1, copy_burst_len, 0, 0)
    copy_burst_len = get_ceil_int(box_num_sigment*4 - 2,
                                  obj.boxes_block_num)
    # copy boxes shifted by 2 (y2, x2 ...) into boxes_ub_big
    tik_instance.data_move(boxes_ub_big,
                           obj.input_gm_list[1][box_num_offset*4 + 2],
                           0, 1, copy_burst_len, 0, 0)
    # calc boxes[2] - boxes means y2 - y1 and x2 - x1
    tik_func_vcomple(tik_instance, "vsub", boxes_ub_scale, boxes_ub_big,
                     boxes_ub_small, men_len)
    if obj.crop_height <= 1 or obj.crop_width <= 1:
        # degenerate crop: keep (y1 + y2) / (x1 + x2) around for the
        # center-point formulas below
        tik_func_vcomple(tik_instance, "vadd", boxes_ub_big,
                         boxes_ub_big, boxes_ub_small, men_len)
    # calc resize scale for h and w
    repeat_time = get_ceil_int(box_num_sigment*4, obj.boxes_vector_num)
    if obj.crop_height > 1:
        # to get scale_h: scale * (image_height - 1) / (crop_height - 1)
        tik_instance.vmuls([obj.height_mask_list[0],
                            obj.height_mask_list[1]],
                           boxes_ub_scale, boxes_ub_scale,
                           (obj.image_height - 1) / (obj.crop_height - 1),
                           repeat_time, 1, 1, 8, 8)
    if obj.crop_width > 1:
        # to get scale_w: scale * (image_width - 1) / (crop_width - 1)
        tik_instance.vmuls(obj.width_mask_list, boxes_ub_scale,
                           boxes_ub_scale,
                           (obj.image_width - 1) / (obj.crop_width - 1),
                           repeat_time, 1, 1, 8, 8)
    # to get h_small: h_small * (image_height - 1)
    if obj.crop_height > 1:
        # to get h_small: h_small * (image_height - 1)
        tik_instance.vmuls(obj.height_mask_list, boxes_ub_small,
                           boxes_ub_small, obj.image_height - 1,
                           repeat_time, 1, 1, 8, 8)
    else:
        # to get h_small: (h_small + h_big) * (image_height - 1) * 0.5
        tik_instance.vmuls(obj.height_mask_list, boxes_ub_small,
                           boxes_ub_big, 0.5, repeat_time, 1, 1, 8, 8)
        tik_instance.vmuls(obj.height_mask_list, boxes_ub_small,
                           boxes_ub_small, obj.image_height - 1,
                           repeat_time, 1, 1, 8, 8)
    if obj.crop_width > 1:
        # to get w_small: w_small * (image_width - 1)
        tik_instance.vmuls(obj.width_mask_list, boxes_ub_small,
                           boxes_ub_small, obj.image_width - 1,
                           repeat_time, 1, 1, 8, 8)
    else:
        # to get w_small: (w_small + w_big) * (image_width - 1) * 0.5
        tik_instance.vmuls(obj.width_mask_list,
                           boxes_ub_small, boxes_ub_big, 0.5,
                           repeat_time, 1, 1, 8, 8)
        tik_instance.vmuls(obj.width_mask_list, boxes_ub_small,
                           boxes_ub_small, obj.image_width - 1,
                           repeat_time, 1, 1, 8, 8)
    # box_index process for one segment
    box_index_ub = obj.apply_mem(
        (get_ceil_int(box_num_sigment,
                      obj.boxes_block_num)*obj.boxes_block_num,),
        "box_index_ub", tik.scope_ubuf, "int32")
    copy_burst_len = get_ceil_int(box_num_sigment, obj.boxes_block_num)
    tik_instance.data_move(box_index_ub,
                           obj.input_gm_list[2][box_num_offset],
                           0, 1, copy_burst_len, 0, 0)
    # convert batch index -> element offset of that batch in the image
    tik_func_vcomple(tik_instance, "vmul", box_index_ub, box_index_ub,
                     batch_offset_ub, box_num_sigment, src1_rep=0)
    with tik_instance.for_range(0, box_num_sigment) as _box_idx:
        _out_batch_idx = _box_idx + box_num_offset
        scaler_h_small = tik_instance.Scalar(dtype=boxes_ub_small.dtype)
        scaler_w_small = tik_instance.Scalar(dtype=boxes_ub_small.dtype)
        scaler_h_scale = tik_instance.Scalar(dtype=boxes_ub_small.dtype)
        scaler_w_scale = tik_instance.Scalar(dtype=boxes_ub_small.dtype)
        # read scale for h and w
        scaler_h_small.set_as(boxes_ub_small[_box_idx*4])
        scaler_w_small.set_as(boxes_ub_small[_box_idx*4 + 1])
        scaler_h_scale.set_as(boxes_ub_scale[_box_idx*4])
        scaler_w_scale.set_as(boxes_ub_scale[_box_idx*4 + 1])
        # source sample coordinates: start + index * scale
        input_boxes_in_h = obj.apply_mem(
            (get_ceil_int(obj.crop_height, obj.boxes_block_num) *
             obj.boxes_block_num,),
            "input_boxes_in_h", tik.scope_ubuf, obj.boxes_type)
        tik_instance.vmuls(obj.crop_height, input_boxes_in_h, index_ub,
                           scaler_h_scale, 1, 1, 1, 8, 8)
        input_boxes_in_w = obj.apply_mem(
            (get_ceil_int(obj.crop_width, obj.boxes_block_num) *
             obj.boxes_block_num,),
            "input_boxes_in_w", tik.scope_ubuf, obj.boxes_type)
        tik_instance.vmuls(obj.crop_width, input_boxes_in_w, index_ub,
                           scaler_w_scale, 1, 1, 1, 8, 8)
        tik_instance.vadds(obj.crop_height, input_boxes_in_h,
                           input_boxes_in_h, scaler_h_small,
                           1, 1, 1, 8, 8)
        tik_instance.vadds(obj.crop_width, input_boxes_in_w,
                           input_boxes_in_w, scaler_w_small,
                           1, 1, 1, 8, 8)
        h_top_index = \
            obj.apply_mem((get_ceil_int(obj.crop_height,
                                        obj.boxes_block_num) *
                           obj.boxes_block_num,),
                          "h_top_index", tik.scope_ubuf, "int32")
        w_left_index = \
            obj.apply_mem((get_ceil_int(obj.crop_width,
                                        obj.boxes_block_num) *
                           obj.boxes_block_num,),
                          "w_left_index", tik.scope_ubuf, "int32")
        h_index_post = \
            obj.apply_mem((get_ceil_int(obj.crop_height,
                                        obj.boxes_block_num) *
                           obj.boxes_block_num,),
                          "h_index_post", tik.scope_ubuf, "int32")
        w_index_post = \
            obj.apply_mem((get_ceil_int(obj.crop_width,
                                        obj.boxes_block_num) *
                           obj.boxes_block_num,),
                          "w_index_post", tik.scope_ubuf, "int32")
        # whether the chip supports direct fp32 -> int32 vconv
        cast_flag = tbe_platform.cce_conf.api_check_support("tik.vconv",
                                                            "f322s32r")
        with tik_instance.new_stmt_scope():
            tmp_float_ub_0 = obj.apply_mem(
                (get_ceil_int(obj.crop_height, obj.boxes_block_num) *
                 obj.boxes_block_num,),
                "tmp_float_ub_0", tik.scope_ubuf, obj.boxes_type)
            if not cast_flag:
                tik_func_vconv(tik_instance, h_top_index,
                               input_boxes_in_h, obj.crop_height,
                               mode="floor", mini_mid_ub=tmp_float_ub_0)
            else:
                tik_func_vconv(tik_instance, h_top_index,
                               input_boxes_in_h, obj.crop_height,
                               mode="floor")
            # h_top_index vconv from int32 to float32
            tik_func_vconv(tik_instance, tmp_float_ub_0, h_top_index,
                           obj.crop_height)
            # h_top_index becomes a row offset (index * row stride)
            tik_func_vcomple(tik_instance, "vmul", h_top_index,
                             h_top_index, height_offset_ub,
                             obj.crop_height, src1_rep=0)
            # do: h_lerp = input_boxes_in_h - tmp_float_ub
            tik_func_vcomple(tik_instance, "vsub", input_boxes_in_h,
                             input_boxes_in_h, tmp_float_ub_0,
                             obj.crop_height)
            # ceil of the fractional part: 1 when a second row is needed
            tik_func_vconv(tik_instance, h_index_post, input_boxes_in_h,
                           obj.crop_height, mode="ceil")
            tmp_float_ub_1 = obj.apply_mem(
                (get_ceil_int(obj.crop_width, obj.boxes_block_num) *
                 obj.boxes_block_num,),
                "tmp_float_ub_1", tik.scope_ubuf, obj.boxes_type)
            if not cast_flag:
                tik_func_vconv(tik_instance, w_left_index,
                               input_boxes_in_w, obj.crop_width,
                               mode="floor", mini_mid_ub=tmp_float_ub_1)
            else:
                tik_func_vconv(tik_instance, w_left_index,
                               input_boxes_in_w, obj.crop_width,
                               mode="floor")
            # w_left_index vconv from int32 to float32
            tik_func_vconv(tik_instance, tmp_float_ub_1, w_left_index,
                           obj.crop_width)
            # w_left_index becomes a pixel offset (index * C0)
            tik_func_vcomple(tik_instance, "vmul", w_left_index,
                             w_left_index, width_offset_ub,
                             obj.crop_width, src1_rep=0)
            # do: w_lerp = input_boxes_in_w - tmp_float_ub
            tik_func_vcomple(tik_instance, "vsub", input_boxes_in_w,
                             input_boxes_in_w, tmp_float_ub_1,
                             obj.crop_width)
            tik_func_vconv(tik_instance, w_index_post, input_boxes_in_w,
                           obj.crop_width, mode="ceil")
        # read input batch index and calc input offset
        input_batch_offset = tik_instance.Scalar(dtype="int32")
        input_batch_offset.set_as(box_index_ub[_box_idx])
        input_h_offset = tik_instance.Scalar(dtype="int32")
        input_h_post = tik_instance.Scalar(dtype="int32")
        h_lerp = tik_instance.Scalar(dtype=boxes_ub_small.dtype)
        c0_block_num = obj.image_c0 // obj.block_num
        image_gm = obj.input_gm_list[0]
        output_gm = obj.output_gm_list[0]
        with tik_instance.for_range(0, obj.crop_height) \
                as _crop_height_idx:
            input_h_offset.set_as(h_top_index[_crop_height_idx])
            input_h_post.set_as(h_index_post[_crop_height_idx])
            real_h_offset = input_h_offset + input_h_post
            # in-bounds rows only; otherwise extrapolate the whole row
            with tik_instance.if_scope(
                    tik.all(input_h_offset >= 0,
                            real_h_offset <=
                            (obj.image_height - 1) *
                            obj.image_c0*obj.image_width)):
                h_lerp.set_as(input_boxes_in_h[_crop_height_idx])
                thread_num = 2
                if obj.crop_width <= 1:
                    thread_num = 1
                with tik_instance.for_range(
                        0, obj.crop_width,
                        thread_num=thread_num) as _crop_width_idx:
                    input_w_offset = tik_instance.Scalar(dtype="int32")
                    input_w_post = tik_instance.Scalar(dtype="int32")
                    w_lerp = tik_instance.Scalar(
                        dtype=boxes_ub_small.dtype)
                    input_w_offset.set_as(w_left_index[_crop_width_idx])
                    input_w_post.set_as(w_index_post[_crop_width_idx])
                    real_w_offset = input_w_offset + input_w_post
                    with tik_instance.if_scope(
                            tik.all(input_w_offset >= 0,
                                    real_w_offset <=
                                    (obj.image_width - 1) *
                                    obj.image_c0)):
                        w_lerp.set_as(input_boxes_in_w[_crop_width_idx])
                        # copy all C data in ub: two neighbouring pixels
                        # for the top row (h0) and bottom row (h1)
                        with tik_instance.new_stmt_scope():
                            h0_w_ub = obj.apply_mem(
                                (obj.image_c1*obj.image_c0*2,),
                                "h0_w_ub",
                                tik.scope_ubuf, "float32")
                            h1_w_ub = obj.apply_mem(
                                (obj.image_c1*obj.image_c0*2,),
                                "h1_w_ub", tik.scope_ubuf, "float32")
                            if obj.image_block_num == obj.block_num:
                                # when input is fp32, just copy
                                if obj.image_width > 1:
                                    tik_instance.data_move(
                                        h0_w_ub,
                                        image_gm[input_batch_offset +
                                                 input_h_offset +
                                                 input_w_offset],
                                        0, obj.image_c1, c0_block_num*2,
                                        obj.image_height *
                                        obj.image_width*c0_block_num -
                                        c0_block_num*2, 0)
                                    if obj.image_height > 1:
                                        tik_instance.data_move(
                                            h1_w_ub,
                                            image_gm[
                                                input_batch_offset +
                                                input_h_offset +
                                                input_w_offset +
                                                obj.image_width *
                                                obj.image_c0],
                                            0, obj.image_c1,
                                            c0_block_num*2,
                                            obj.image_height *
                                            obj.image_width *
                                            c0_block_num -
                                            c0_block_num*2, 0)
                                    else:
                                        tik_func_vector(
                                            tik_instance, h1_w_ub, 0,
                                            obj.image_c1 *
                                            obj.image_c0*2)
                                else:
                                    # single-column image: duplicate the
                                    # only pixel into both w slots
                                    tik_instance.data_move(
                                        h0_w_ub,
                                        image_gm[input_batch_offset +
                                                 input_h_offset +
                                                 input_w_offset],
                                        0, obj.image_c1, c0_block_num,
                                        obj.image_height *
                                        obj.image_width*c0_block_num -
                                        c0_block_num, c0_block_num)
                                    tik_instance.data_move(
                                        h0_w_ub[obj.image_c0],
                                        image_gm[input_batch_offset +
                                                 input_h_offset +
                                                 input_w_offset],
                                        0, obj.image_c1, c0_block_num,
                                        obj.image_height *
                                        obj.image_width*c0_block_num -
                                        c0_block_num, c0_block_num)
                                    if obj.image_height > 1:
                                        tik_instance.data_move(
                                            h1_w_ub,
                                            image_gm[
                                                input_batch_offset +
                                                input_h_offset +
                                                input_w_offset +
                                                obj.image_width *
                                                obj.image_c0],
                                            0, obj.image_c1,
                                            c0_block_num,
                                            obj.image_height *
                                            obj.image_width *
                                            c0_block_num -
                                            c0_block_num, c0_block_num)
                                        tik_instance.data_move(
                                            h1_w_ub[obj.image_c0],
                                            image_gm[
                                                input_batch_offset +
                                                input_h_offset +
                                                input_w_offset +
                                                obj.image_width *
                                                obj.image_c0],
                                            0, obj.image_c1,
                                            c0_block_num,
                                            obj.image_height *
                                            obj.image_width *
                                            c0_block_num -
                                            c0_block_num, c0_block_num)
                                    else:
                                        tik_func_vector(
                                            tik_instance, h1_w_ub, 0,
                                            obj.image_c1 *
                                            obj.image_c0*2)
                            else:
                                # when input is fp16, will copy and cast
                                # to fp32
                                with tik_instance.new_stmt_scope():
                                    h0_w_ub_fp16 = obj.apply_mem(
                                        (obj.image_c1*obj.image_c0*2,),
                                        "h0_w_ub_fp16", tik.scope_ubuf)
                                    h1_w_ub_fp16 = \
                                        obj.apply_mem(
                                            (obj.image_c1 *
                                             obj.image_c0*2,),
                                            "h1_w_ub_fp16",
                                            tik.scope_ubuf)
                                    # one fp16 C0 is exactly one 32B
                                    # block
                                    c0_block_fp16 = 1
                                    if obj.image_width > 1:
                                        tik_instance.data_move(
                                            h0_w_ub_fp16,
                                            image_gm[
                                                input_batch_offset +
                                                input_h_offset +
                                                input_w_offset],
                                            0, obj.image_c1,
                                            c0_block_fp16*2,
                                            obj.image_height *
                                            obj.image_width *
                                            c0_block_fp16 -
                                            c0_block_fp16*2, 0)
                                        tik_func_vconv(
                                            tik_instance, h0_w_ub,
                                            h0_w_ub_fp16,
                                            obj.image_c1 *
                                            obj.image_c0*2)
                                        if obj.image_height > 1:
                                            tik_instance.data_move(
                                                h1_w_ub_fp16,
                                                image_gm[
                                                    input_batch_offset +
                                                    input_h_offset +
                                                    input_w_offset +
                                                    obj.image_width *
                                                    obj.image_c0],
                                                0, obj.image_c1,
                                                c0_block_fp16*2,
                                                obj.image_height *
                                                obj.image_width *
                                                c0_block_fp16 -
                                                c0_block_fp16*2, 0)
                                            tik_func_vconv(
                                                tik_instance, h1_w_ub,
                                                h1_w_ub_fp16,
                                                obj.image_c1 *
                                                obj.image_c0*2)
                                        else:
                                            tik_func_vector(
                                                tik_instance, h1_w_ub,
                                                0,
                                                obj.image_c1 *
                                                obj.image_c0*2)
                                    else:
                                        tik_instance.data_move(
                                            h0_w_ub_fp16,
                                            image_gm[
                                                input_batch_offset +
                                                input_h_offset +
                                                input_w_offset],
                                            0, obj.image_c1,
                                            c0_block_fp16,
                                            obj.image_height *
                                            obj.image_width *
                                            c0_block_fp16 -
                                            c0_block_fp16,
                                            c0_block_fp16)
                                        tik_instance.data_move(
                                            h0_w_ub_fp16[
                                                c0_block_fp16 *
                                                obj.image_c0],
                                            image_gm[
                                                input_batch_offset +
                                                input_h_offset +
                                                input_w_offset],
                                            0, obj.image_c1,
                                            c0_block_fp16,
                                            obj.image_height *
                                            obj.image_width *
                                            c0_block_fp16 -
                                            c0_block_fp16,
                                            c0_block_fp16)
                                        tik_func_vconv(
                                            tik_instance, h0_w_ub,
                                            h0_w_ub_fp16,
                                            obj.image_c1 *
                                            obj.image_c0*2)
                                        if obj.image_height > 1:
                                            tik_instance.data_move(
                                                h1_w_ub_fp16,
                                                image_gm[
                                                    input_batch_offset +
                                                    input_h_offset +
                                                    input_w_offset +
                                                    obj.image_width *
                                                    obj.image_c0],
                                                0, obj.image_c1,
                                                c0_block_fp16,
                                                obj.image_height *
                                                obj.image_width *
                                                c0_block_fp16 -
                                                c0_block_fp16,
                                                c0_block_fp16)
                                            tik_instance.data_move(
                                                h1_w_ub_fp16[
                                                    c0_block_fp16 *
                                                    obj.image_c0],
                                                image_gm[
                                                    input_batch_offset +
                                                    input_h_offset +
                                                    input_w_offset +
                                                    obj.image_width *
                                                    obj.image_c0],
                                                0, obj.image_c1,
                                                c0_block_fp16,
                                                obj.image_height *
                                                obj.image_width *
                                                c0_block_fp16 -
                                                c0_block_fp16,
                                                c0_block_fp16)
                                            tik_func_vconv(
                                                tik_instance, h1_w_ub,
                                                h1_w_ub_fp16,
                                                obj.image_c1 *
                                                obj.image_c0*2)
                                        else:
                                            tik_func_vector(
                                                tik_instance, h1_w_ub,
                                                0,
                                                obj.image_c1 *
                                                obj.image_c0*2)
                            # bilinear blend in h:
                            # h0 = h0 + (h1 - h0) * h_lerp
                            tik_func_vcomple(tik_instance, "vsub",
                                             h1_w_ub, h1_w_ub, h0_w_ub,
                                             obj.image_c1 *
                                             obj.image_c0*2)
                            tik_func_vmuls(tik_instance, h1_w_ub,
                                           h1_w_ub, h_lerp,
                                           obj.image_c1*obj.image_c0*2)
                            tik_func_vcomple(tik_instance, "vadd",
                                             h0_w_ub, h1_w_ub, h0_w_ub,
                                             obj.image_c1 *
                                             obj.image_c0*2)
                            # blend in w:
                            # out = left + (right - left) * w_lerp,
                            # strided so left/right pixel pairs line up
                            tik_fun = tik_instance.vsub
                            tik_fun(obj.image_c0, h1_w_ub, h0_w_ub[16],
                                    h0_w_ub, obj.image_c1,
                                    1, 1, 1, 2, 4, 4)
                            tik_func_vmuls(tik_instance, h1_w_ub,
                                           h1_w_ub, w_lerp,
                                           obj.image_c1*obj.image_c0)
                            tik_fun = tik_instance.vadd
                            tik_fun(obj.image_c0,
                                    h1_w_ub[obj.image_c1 *
                                            obj.image_c0:],
                                    h1_w_ub, h0_w_ub, obj.image_c1,
                                    1, 1, 1, 2, 2, 4)
                            output_offset = \
                                _out_batch_idx*obj.image_c1 * \
                                obj.crop_width*obj.crop_height * \
                                obj.image_c0 \
                                + _crop_height_idx*obj.crop_width * \
                                obj.image_c0 \
                                + _crop_width_idx*obj.image_c0
                            tik_instance.data_move(
                                output_gm[output_offset],
                                h1_w_ub[obj.image_c1*obj.image_c0:],
                                0, obj.image_c1, c0_block_num, 0,
                                obj.crop_height*obj.crop_width *
                                c0_block_num - c0_block_num)
                    with tik_instance.else_scope():
                        # w out of range: write the extrapolation value
                        # for this output pixel
                        with tik_instance.new_stmt_scope():
                            h1_w_ub = obj.apply_mem(
                                (get_ceil_int(obj.image_c1 *
                                              obj.image_c0,
                                              obj.vector_num) *
                                 obj.vector_num,),
                                "h1_w_ub", tik.scope_ubuf, "float32")
                            tik_func_vector(tik_instance, h1_w_ub,
                                            obj.extrapolation_value,
                                            obj.image_c1*obj.image_c0)
                            output_offset = \
                                _out_batch_idx*obj.image_c1 * \
                                obj.crop_width*obj.crop_height * \
                                obj.image_c0 \
                                + _crop_height_idx*obj.crop_width * \
                                obj.image_c0 \
                                + _crop_width_idx*obj.image_c0
                            tik_instance.data_move(
                                output_gm[output_offset], h1_w_ub, 0,
                                obj.image_c1, c0_block_num, 0,
                                obj.crop_height*obj.crop_width *
                                c0_block_num - c0_block_num)
            with tik_instance.else_scope():
                # h out of range: write the extrapolation value for the
                # whole output row at once
                with tik_instance.new_stmt_scope():
                    h0_w_ub = obj.apply_mem(
                        (get_ceil_int(obj.image_c1*obj.crop_width *
                                      obj.image_c0,
                                      obj.vector_num) *
                         obj.vector_num,),
                        "h0_w_ub", tik.scope_ubuf, "float32")
                    tik_func_vector(tik_instance, h0_w_ub,
                                    obj.extrapolation_value,
                                    obj.image_c1*obj.image_c0 *
                                    obj.crop_width)
                    output_offset = \
                        _out_batch_idx*obj.image_c1*obj.crop_width * \
                        obj.crop_height*obj.image_c0 \
                        + _crop_height_idx*obj.crop_width*obj.image_c0
                    tik_instance.data_move(
                        output_gm[output_offset], h0_w_ub, 0,
                        obj.image_c1, c0_block_num*obj.crop_width, 0,
                        obj.crop_height*obj.crop_width*c0_block_num -
                        c0_block_num*obj.crop_width)
def data_move(self, input_dict):
    """
    Copy a contiguous run of elements from the x1 GM tensor to the output
    GM tensor, staging through a UB buffer in chunks of at most
    ``self.one_max_size`` elements.

    Parameters
    ----------
    input_dict: dict with the following keys:
        x1_ub: UB tensor used as the GM->UB->GM staging buffer
        x1_offset: element offset into ``self.x1_gm`` to read from
        out_offset: element offset into ``self.y_gm`` to write to
        element_num: total number of contiguous elements to copy
        block_num: number of cores running the kernel; > 1 triggers the
            misaligned-tail handling below

    Returns
    -------
    None
    """
    x1_ub = input_dict.get("x1_ub")
    out_offset = input_dict.get("out_offset")
    element_num = input_dict.get("element_num")
    block_num = input_dict.get("block_num")
    # Split the copy into full UB-sized chunks plus a final partial chunk.
    loop_cycle, last_ub_num = get_loop_param(element_num, self.one_max_size)
    # total_size is the current chunk size in bytes, ub_size in elements;
    # both are shrunk on the last iteration.
    total_size = self.instance.Scalar("int32")
    total_size.set_as(self.one_max_size * self.dsize)
    ub_size = self.instance.Scalar("int32")
    ub_size.set_as(self.one_max_size)
    # Running element offsets into the source and destination GM tensors.
    offset_x1 = self.instance.Scalar("int32")
    offset_x1.set_as(input_dict.get("x1_offset"))
    offset_out = self.instance.Scalar("int32")
    offset_out.set_as(out_offset)
    # Number of elements in one 32-byte DMA block.
    each_burst_num = constant.BLOCK_SIZE // self.dsize
    with self.instance.for_range(0, loop_cycle) as cycle:
        with self.instance.if_scope(cycle == loop_cycle - 1):
            # Last chunk: use the leftover size instead of a full UB load.
            total_size.set_as(last_ub_num * self.dsize)
            ub_size.set_as(last_ub_num)
        nburst = common_util.get_datamove_nburst(self.instance,
                                                 total_size)
        with self.instance.if_scope(
                tik.all(cycle == loop_cycle - 1,
                        total_size % constant.BLOCK_SIZE != 0,
                        block_num > 1)):
            # Last chunk is not 32B aligned and other cores own the GM
            # region just past it: first copy the final 32B window ending
            # exactly at the chunk's last element...
            x1_ub_tmp = self.instance.Tensor(self.dtype, (32, ),
                                             name="x1_ub_tmp",
                                             scope=tik.scope_ubuf)
            self.instance.data_move(
                x1_ub_tmp,
                self.x1_gm[offset_x1 + ub_size - each_burst_num],
                constant.SID, constant.DEFAULT_NBURST, 1,
                constant.STRIDE_ZERO, constant.STRIDE_ZERO)
            self.instance.data_move(
                self.y_gm[offset_out + ub_size - each_burst_num],
                x1_ub_tmp, constant.SID, constant.DEFAULT_NBURST, 1,
                constant.STRIDE_ZERO, constant.STRIDE_ZERO)
            with self.instance.if_scope(total_size > constant.BLOCK_SIZE):
                # ...then copy everything before the tail window with
                # nburst - 1 full bursts (the tail is already written),
                # so nothing past the chunk end is touched.
                self.instance.data_move(x1_ub, self.x1_gm[offset_x1],
                                        constant.SID,
                                        constant.DEFAULT_NBURST,
                                        nburst - 1,
                                        constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)
                self.instance.data_move(self.y_gm[offset_out], x1_ub,
                                        constant.SID,
                                        constant.DEFAULT_NBURST,
                                        nburst - 1,
                                        constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)
        with self.instance.else_scope():
            # Aligned chunk (or single-core): plain bulk copy of nburst
            # bursts through UB.
            self.instance.data_move(x1_ub, self.x1_gm[offset_x1],
                                    constant.SID,
                                    constant.DEFAULT_NBURST,
                                    nburst, constant.STRIDE_ZERO,
                                    constant.STRIDE_ZERO)
            self.instance.data_move(self.y_gm[offset_out], x1_ub,
                                    constant.SID,
                                    constant.DEFAULT_NBURST,
                                    nburst, constant.STRIDE_ZERO,
                                    constant.STRIDE_ZERO)
        # Advance both GM cursors by the elements consumed this iteration.
        offset_x1.set_as(offset_x1 + ub_size)
        offset_out.set_as(offset_out + ub_size)
def compute_crop(self):
    """
    Top-level crop computation: split the output slices across cores,
    then for each slice decode its flat index into per-dimension
    coordinates, add the crop offsets, and copy the matching x1 region
    to the output.

    Parameters
    ----------
    None

    Returns
    -------
    None
    """
    # block_num cores; each runs `loop` slices, with the first `tail`
    # cores taking one extra slice.
    block_num, each_block_size, loop, tail = \
        self.get_blockdim_and_loop_cycle()
    shape_out = self.input_dict.get("y").get("shape")
    shape_out_len = get_shape_total_number(shape_out)
    # Per-dimension crop start offsets (attribute input).
    offset_in = self.input_dict.get("offset")
    shape = self.input_dict.get("x1").get("shape")
    element_num, shape_len = self.get_element_num()
    # Per-dimension flat strides — x1 strides index into the source,
    # x2 strides decode the output slice index.
    x1_shape_list = get_elem_of_each_dim(shape, len(shape))
    shape = self.input_dict.get("x2").get("shape")
    x2_shape_list = get_elem_of_each_dim(shape, shape_len - 1)
    thread_n = self.get_thread_num(block_num, loop, element_num)
    with self.instance.for_range(0, block_num, block_num=block_num) \
            as block_id:
        # Small scratch tensor preloaded from x2 (kept for later reuse
        # by the <32B move-out path; see move_out_less_than32b).
        ub_tmp = self.instance.Tensor(self.dtype, (256, ),
                                      name="ub_tmp",
                                      scope=tik.scope_ubuf)
        self.instance.data_move(ub_tmp, self.x2_gm[0], constant.SID,
                                constant.DEFAULT_NBURST, 1,
                                constant.STRIDE_ZERO,
                                constant.STRIDE_ZERO)
        # count: slices processed by this core; each_loop: its quota.
        count = self.instance.Scalar("int32")
        count.set_as(0)
        each_loop = self.instance.Scalar("int32")
        each_loop.set_as(loop)
        offset = self.instance.Scalar("int32")
        if tail > 0:
            with self.instance.if_scope(block_id < tail):
                # First `tail` cores take one extra slice.
                each_loop.set_as(each_loop + 1)
        offset.set_as(block_id * each_loop)
        with self.instance.if_scope(tik.all(block_id >= tail, tail > 0)):
            # Cores past the tail region start after the extra slices:
            # block_id * loop + tail.
            offset.set_as(block_id * (each_loop + 1) - (block_id - tail))
        out_offset = self.instance.Scalar("int32")
        out_offset.set_as(offset * element_num)
        # Total number of output slices of element_num elements each.
        cycles = shape_out_len // element_num
        tmp_offset = self.instance.Scalar("int32")
        tmp_offset.set_as(0)
        with self.instance.for_range(offset, cycles,
                                     thread_num=thread_n) as times:
            with self.instance.if_scope(count < each_loop):
                x1_ub = self.instance.Tensor(self.dtype,
                                             (self.one_max_size, ),
                                             name="x1_ub",
                                             scope=tik.scope_ubuf)
                # Decode flat slice index `times` dimension by dimension:
                # strip higher-dimension remainders, divide by this
                # dimension's stride, shift by the crop offset, and
                # accumulate into the x1 flat offset.
                x1_offset = self.instance.Scalar("int32")
                x1_offset.set_as(0)
                for q in range(shape_len):
                    mod = times
                    for s in range(q):
                        mod %= x2_shape_list[s]
                    mod = mod // x2_shape_list[q] + offset_in[q]
                    x1_offset.set_as(x1_offset + mod * x1_shape_list[q])
                if element_num * self.dsize < constant.BLOCK_SIZE \
                        and block_num > 1:
                    # Slice smaller than one 32B block with multiple
                    # cores: use the collecting path that batches slices
                    # to avoid cross-core overwrites.
                    input_dict = {
                        "x1_ub": x1_ub,
                        "ub_tmp": ub_tmp,
                        "x1_offset": x1_offset,
                        "out_offset": out_offset,
                        "tmp_offset": tmp_offset,
                        "element_num": element_num,
                        "each_block_size": each_block_size,
                        "count": count,
                        "each_loop": each_loop,
                    }
                    self.move_out_less_than32b(input_dict)
                    out_offset.set_as(out_offset + element_num)
                else:
                    # Normal path: chunked GM->UB->GM copy.
                    input_dict = {
                        "x1_ub": x1_ub,
                        "x1_offset": x1_offset,
                        "out_offset": out_offset,
                        "element_num": element_num,
                        "block_num": block_num,
                    }
                    self.data_move(input_dict)
                    out_offset.set_as(out_offset + element_num)
                count.set_as(count + 1)
def tik_instance_cut_nc1_cut_one_h(self, kernel_name):
    """
    Build the max-pool-grad-with-argmax kernel for the tiling that cuts
    along N*C1 and processes one output-gradient row (dy height line) per
    inner iteration: select gradients through the argmax bitmask, widen
    to fp32, scatter with col2img, and DMA the accumulated rows to GM.

    Parameters
    ----------
    kernel_name: cce kernel name, default value is "maxpoolGradWithArgmax"

    Returns
    -------
    self.tik_instance after BuildCCE
    """
    batch, channel1, dyh, dyw, channel = self.input_gard_shape
    dxh, dxw = self.y_shape[2:4]
    strideh, stridew = self.strides[1:3]
    # Strides are clamped to the input spatial size.
    if strideh > dxh:
        strideh = dxh
    if stridew > dxw:
        stridew = dxw
    dtype = self.dtype
    dtype_size = self.dtype_size
    windowh, windoww = self.ksize[1:3]
    block = self.block
    pad_top = self.pad[0]
    pad_left = self.pad[2]
    hoverlap = self.hoverlap
    # col2img working height covers at least one window (or one stride).
    col2img_h = windowh
    if col2img_h < strideh:
        col2img_h = strideh
    # dy width rounded up to a multiple of 16 (one fp16 block).
    col2img_dyw = (dyw + 15) // 16 * 16
    if self.woverlap == 0:
        col2img_w = col2img_dyw * stridew
    else:
        col2img_w = (col2img_dyw - 1) * stridew + windoww
    # One argmax bitmask window, 16-aligned with one extra block of slack.
    mask_one_window = ((dyh * dyw + 15) // 16 + 1) * 16
    # vector_repeat_time
    v_rep_time = col2img_dyw * channel * dtype_size // ONE_REPEAT
    v_rep_cycle_fp32 = 2 * v_rep_time // V_MAX_REPEAT
    # v_rep_last
    v_rep_last_fp32 = 2 * v_rep_time % V_MAX_REPEAT
    # when every looph move data after, then dup col2img data
    v_rep_afmv = (windowh - hoverlap) * channel *\
                 col2img_w * dtype_size * 2 // ONE_REPEAT
    v_rep_afmv_cycle = v_rep_afmv // V_MAX_REPEAT
    v_rep_afmv_last = v_rep_afmv % V_MAX_REPEAT
    # Repeats needed to cover the whole fp32 col2img buffer.
    v_rep_time_col = (2 * col2img_w * channel * col2img_h * \
                      dtype_size + ONE_REPEAT - 1) // ONE_REPEAT
    v_rep_cycle_col = v_rep_time_col // V_MAX_REPEAT
    v_rep_last_col = v_rep_time_col % V_MAX_REPEAT
    # GM tensors: gradient input, argmax bitmask, output (atomic-add for
    # non-SAME padding), and the original forward input.
    data_input = self.tik_instance.Tensor(dtype, self.input_gard_shape,
                                          name="data_input",
                                          scope=tik.scope_gm)
    data_mask = self.tik_instance.Tensor(
        "uint16",
        (batch * channel1 * windowh * windoww * mask_one_window,),
        name="data_mask", scope=tik.scope_gm)
    if self.padding == "SAME":
        data_output = self.tik_instance.Tensor(dtype, self.y_shape,
                                               name="data_output",
                                               scope=tik.scope_gm)
    else:
        data_output = self.tik_instance.Tensor(dtype, self.y_shape,
                                               name="data_output",
                                               scope=tik.scope_gm,
                                               is_atomic_add=True)
    data_input_origin = self.tik_instance.Tensor(dtype, self.y_shape,
                                                 name="data_input_origin",
                                                 scope=tik.scope_gm)
    real_block, block_cycle, block_index = self.get_block_param(block)
    with self.tik_instance.for_range(0, real_block,
                                     block_num=real_block) as block_id:
        # Distribute N*C1 slices over cores: the first block_index cores
        # run block_cycle + 1 slices, the rest block_cycle.
        real_cycle = self.tik_instance.Scalar("int32")
        block_base = self.tik_instance.Scalar("int32")
        block_num = self.tik_instance.Scalar("int32")
        with self.tik_instance.if_scope(block_id < block_index):
            real_cycle.set_as(block_cycle + 1)
            block_base.set_as(block_id * real_cycle)
        with self.tik_instance.else_scope():
            real_cycle.set_as(block_cycle)
            block_base.set_as(block_index + block_id * block_cycle)
        with self.tik_instance.for_range(0, real_cycle) as cycle_id:
            # block_num is the absolute N*C1 slice index for this pass.
            block_num.set_as(block_base + cycle_id)
            data_vsel_scalar = self.tik_instance.Scalar(dtype)
            data_vsel_scalar.set_as(0)
            # Zero source operand for vsel (cleared below).
            data_vsel_ub_zero = self.tik_instance.Tensor(
                dtype, (128,), name="data_vsel_ub_zero",
                scope=tik.scope_ubuf)
            self.tik_instance.data_move(data_vsel_ub_zero[0],
                                        data_input_origin[0],
                                        constant.SID,
                                        constant.DEFAULT_NBURST,
                                        constant.DEFAULT_BURST_LEN,
                                        constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)
            self.clean_fp16_one_repeat(data_vsel_ub_zero, dtype)
            # vector_dup ub every time
            dxh_address_offset = self.tik_instance.Scalar("int32")
            dxh_address_offset.set_as(0)
            dxh_calcline = self.tik_instance.Scalar("int32")
            dxh_calcline.set_as(0)
            data_max_ub = self.tik_instance.Tensor(
                dtype, (col2img_dyw * channel,), name="data_max_ub",
                scope=tik.scope_ubuf)
            # Clear the dy row buffer when the 16-padding of dyw would
            # otherwise leave stale values in the VALID+overlap case.
            if self.woverlap > 0 and dyw % 16 != 0 and \
                    self.padding == "VALID":
                self.clean_max_ub(data_max_ub, dtype)
            # fp32 accumulation buffer for col2img and its fp16 mirror
            # used for the DMA to GM (small slack tails of 64/128 elems).
            data_vmul_ub_col2img_fp32 = \
                self.tik_instance.Tensor(
                    "float32", (col2img_w * channel * col2img_h + 64,),
                    name="data_vmul_ub_col2img_fp32",
                    scope=tik.scope_ubuf)
            data_vmul_ub_col2img_fp16 = \
                self.tik_instance.Tensor(
                    dtype, (col2img_w * channel * col2img_h + 128,),
                    name="data_vmul_ub_col2img_fp16",
                    scope=tik.scope_ubuf)
            self.clean_fp32_multi_repeat(data_vmul_ub_col2img_fp32,
                                         dtype_size * 2)
            with self.tik_instance.for_range(0, dyh) as looph:
                # dy copy gm to ub
                self.tik_instance.data_move(
                    data_max_ub,
                    data_input[(block_num * dyh + looph) * dyw * channel],
                    constant.SID, constant.DEFAULT_NBURST,
                    dyw * channel * dtype_size // BLOCK_SIZE,
                    constant.STRIDE_ZERO, constant.STRIDE_ZERO)
                # mask define
                data_mask_ub = self.tik_instance.Tensor(
                    "uint16", (col2img_dyw,), name="data_mask_ub",
                    scope=tik.scope_ubuf)
                with self.tik_instance.for_range(
                        0, windowh * windoww) as mask_id:
                    # mask copy gm to ub
                    self.tik_instance.data_move(
                        data_mask_ub,
                        data_mask[block_num * mask_one_window *
                                  windoww * windowh + looph * dyw +
                                  mask_id * mask_one_window],
                        constant.SID, 1,
                        col2img_dyw * dtype_size // BLOCK_SIZE,
                        constant.STRIDE_ZERO, constant.STRIDE_ZERO)
                    data_vsel_ub = self.tik_instance.Tensor(
                        dtype, (col2img_dyw * channel,),
                        name="data_vsel_ub", scope=tik.scope_ubuf)
                    data_vsel_ub_fp32 = self.tik_instance.Tensor(
                        "float32", (col2img_dyw * channel,),
                        name="data_vsel_ub_fp32", scope=tik.scope_ubuf)
                    # Select dy where the argmax bit is set, zero
                    # elsewhere: per-position gradient of this window tap.
                    if v_rep_time > 0:
                        with self.tik_instance.for_range(
                                0, v_rep_time, thread_num=1) as cycle:
                            cmpmask = \
                                self.tik_instance.mov_tensor_to_cmpmask(
                                    data_mask_ub[cycle * MASK_MAX])
                            self.tik_instance.vsel(
                                constant.MASK128, 0,
                                data_vsel_ub[cycle * FP16_MAX],
                                cmpmask,
                                data_max_ub[cycle * FP16_MAX],
                                data_vsel_ub_zero[0],
                                constant.REPEAT_TIME_ONCE,
                                constant.STRIDE_ONE,
                                constant.STRIDE_ONE,
                                constant.STRIDE_ONE,
                                constant.REPEAT_STRIDE_EIGHT,
                                constant.REPEAT_STRIDE_EIGHT,
                                constant.REPEAT_STRIDE_EIGHT)
                    # fp16 to fp32
                    if v_rep_cycle_fp32 > 0:
                        with self.tik_instance.for_range(
                                0, v_rep_cycle_fp32,
                                thread_num=1) as cycle:
                            self.tik_instance.vconv(
                                constant.MASK64, "",
                                data_vsel_ub_fp32[
                                    cycle * V_MAX_REPEAT * FP32_MAX],
                                data_vsel_ub[
                                    cycle * V_MAX_REPEAT * FP16_MAX],
                                V_MAX_REPEAT, constant.STRIDE_ONE,
                                constant.STRIDE_ONE,
                                constant.REPEAT_STRIDE_EIGHT,
                                constant.REPEAT_STRIDE_FOUR)
                    if v_rep_last_fp32 != 0:
                        # NOTE(review): the source index uses FP32_MAX
                        # while the loop above strides the fp16 source by
                        # FP16_MAX — looks inconsistent when
                        # v_rep_cycle_fp32 > 0; confirm intended.
                        self.tik_instance.vconv(
                            constant.MASK64, "",
                            data_vsel_ub_fp32[
                                v_rep_cycle_fp32 * V_MAX_REPEAT *
                                FP32_MAX],
                            data_vsel_ub[
                                v_rep_cycle_fp32 * V_MAX_REPEAT *
                                FP32_MAX],
                            v_rep_last_fp32, constant.STRIDE_ONE,
                            constant.STRIDE_ONE,
                            constant.REPEAT_STRIDE_EIGHT,
                            constant.REPEAT_STRIDE_FOUR)
                    # col2img
                    fetch_filter_w = mask_id % windoww
                    fetch_filter_h = mask_id // windoww
                    left_top_w = 0
                    left_top_h = 0
                    # Scatter-accumulate this window tap's gradients into
                    # the fp32 image buffer.
                    self.tik_instance.col2img(
                        data_vmul_ub_col2img_fp32[0],
                        data_vsel_ub_fp32[0],
                        (0, 0, 0, 0), col2img_h, col2img_w,
                        fetch_filter_w, fetch_filter_h,
                        left_top_w, left_top_h, stridew, strideh,
                        windoww, windowh, 1, 1, col2img_dyw // 16)
                # Narrow the accumulated fp32 image back to fp16 for the
                # DMA to GM.
                if v_rep_cycle_col > 0:
                    with self.tik_instance.for_range(
                            0, v_rep_cycle_col, thread_num=1) as cycle:
                        self.tik_instance.vconv(
                            constant.MASK64, "",
                            data_vmul_ub_col2img_fp16[
                                cycle * V_MAX_REPEAT * FP32_MAX],
                            data_vmul_ub_col2img_fp32[
                                cycle * V_MAX_REPEAT * FP32_MAX],
                            V_MAX_REPEAT, constant.STRIDE_ONE,
                            constant.STRIDE_ONE,
                            constant.REPEAT_STRIDE_FOUR,
                            constant.REPEAT_STRIDE_EIGHT)
                if v_rep_last_col != 0:
                    self.tik_instance.vconv(
                        constant.MASK64, "",
                        data_vmul_ub_col2img_fp16[
                            v_rep_cycle_col * V_MAX_REPEAT * FP32_MAX],
                        data_vmul_ub_col2img_fp32[
                            v_rep_cycle_col * V_MAX_REPEAT * FP32_MAX],
                        v_rep_last_col, constant.STRIDE_ONE,
                        constant.STRIDE_ONE,
                        constant.REPEAT_STRIDE_FOUR,
                        constant.REPEAT_STRIDE_EIGHT)
                # DMA parameters for the UB->GM copy of this row band.
                src_address = self.tik_instance.Scalar("int32")
                dst_address = self.tik_instance.Scalar("int32")
                nburst = self.tik_instance.Scalar("int32")
                burst_len = self.tik_instance.Scalar("int32")
                src_stride = self.tik_instance.Scalar("int32")
                dst_stride = self.tik_instance.Scalar("int32")
                if hoverlap == 0:
                    # move ub to gm
                    # Non-overlapping windows: each looph owns a disjoint
                    # col2img_h band of the output.
                    src_address.set_as(pad_left * channel)
                    dst_address.set_as(
                        block_num * dxh * dxw * channel +
                        (looph * col2img_h - pad_top) * dxw * channel)
                    nburst.set_as(col2img_h)
                    burst_len.set_as(self.offset_w)
                    src_stride.set_as(col2img_w - self.offset_w)
                    dst_stride.set_as(dxw - self.offset_w)
                    with self.tik_instance.if_scope(looph == 0):
                        # First band: skip the top-padding rows.
                        src_address.set_as(
                            src_address +
                            pad_top * col2img_w * channel)
                        dst_address.set_as(
                            block_num * dxh * dxw * channel)
                        nburst.set_as(nburst - pad_top)
                        with self.tik_instance.if_scope(
                                looph == dyh - 1):
                            with self.tik_instance.if_scope(
                                    self.padding == "SAME"):
                                nburst.set_as(dxh)
                    with self.tik_instance.else_scope():
                        with self.tik_instance.if_scope(
                                looph == dyh - 1):
                            # Last band: clamp to the remaining rows
                            # (SAME) or a full window (VALID).
                            with self.tik_instance.if_scope(
                                    self.padding == "SAME"):
                                nburst.set_as(dxh - col2img_h * looph +
                                              pad_top)
                            with self.tik_instance.else_scope():
                                nburst.set_as(windowh)
                    self.tik_instance.data_move(
                        data_output[dst_address],
                        data_vmul_ub_col2img_fp16[src_address],
                        constant.SID, nburst, burst_len,
                        src_stride, dst_stride)
                    # Reset the whole fp32 accumulator for the next row.
                    data_clean_scalar_fp32 = \
                        self.tik_instance.Scalar("float32")
                    data_clean_scalar_fp32.set_as(0)
                    if v_rep_cycle_col > 0:
                        with self.tik_instance.for_range(
                                0, v_rep_cycle_col,
                                thread_num=1) as cycle:
                            self.tik_instance.vector_dup(
                                constant.MASK64,
                                data_vmul_ub_col2img_fp32[
                                    cycle * V_MAX_REPEAT * FP32_MAX],
                                data_clean_scalar_fp32, V_MAX_REPEAT,
                                constant.STRIDE_ONE,
                                constant.REPEAT_STRIDE_EIGHT)
                    if v_rep_last_col != 0:
                        self.tik_instance.vector_dup(
                            constant.MASK64,
                            data_vmul_ub_col2img_fp32[
                                v_rep_cycle_col * \
                                V_MAX_REPEAT * FP32_MAX],
                            data_clean_scalar_fp32, v_rep_last_col,
                            constant.STRIDE_ONE,
                            constant.REPEAT_STRIDE_EIGHT)
                else:
                    # Overlapping windows: emit only the strideh rows
                    # that are final, then slide the overlap region to
                    # the top of the accumulator.
                    with self.tik_instance.if_scope(
                            (looph + 1) * strideh > pad_top):
                        src_address.set_as(pad_left * channel)
                        dst_address.set_as(
                            block_num * dxh * dxw * channel +
                            dxh_address_offset)
                        nburst.set_as(strideh)
                        with self.tik_instance.if_scope(
                                looph * strideh < pad_top):
                            # Band straddles the top padding: trim the
                            # padded rows.
                            nburst.set_as((looph + 1) * strideh -
                                          pad_top)
                            src_address.set_as(
                                src_address +
                                (pad_top - looph * strideh) *
                                col2img_w * channel)
                        with self.tik_instance.if_scope(
                                tik.all(dxh_calcline < dxh,
                                        looph == dyh - 1)):
                            # Final dy row: flush whatever output rows
                            # remain (SAME) or one full window (VALID).
                            with self.tik_instance.if_scope(
                                    self.padding == "SAME"):
                                nburst.set_as(dxh - dxh_calcline)
                            with self.tik_instance.else_scope():
                                nburst.set_as(windowh)
                        burst_len.set_as(self.offset_w)
                        src_stride.set_as(col2img_w - self.offset_w)
                        dst_stride.set_as(dxw - self.offset_w)
                        self.tik_instance.data_move(
                            data_output[dst_address],
                            data_vmul_ub_col2img_fp16[src_address],
                            constant.SID, nburst, burst_len,
                            src_stride, dst_stride)
                        dxh_address_offset.set_as(
                            dxh_address_offset + \
                            nburst * dxw * channel)
                        dxh_calcline.set_as(dxh_calcline + nburst)
                    # dma_copy ub to ub
                    # Move the hoverlap rows up to the buffer start so
                    # the next iteration accumulates on top of them.
                    self.tik_instance.data_move(
                        data_vmul_ub_col2img_fp32[0],
                        data_vmul_ub_col2img_fp32[
                            strideh * channel * col2img_w],
                        constant.SID, hoverlap, 2 * col2img_w,
                        constant.STRIDE_ZERO, constant.STRIDE_ZERO)
                    # Clear everything below the preserved overlap rows.
                    data_clean_scalar_fp32 = \
                        self.tik_instance.Scalar("float32")
                    data_clean_scalar_fp32.set_as(0)
                    if v_rep_afmv_cycle > 0:
                        with self.tik_instance.for_range(
                                0, v_rep_afmv_cycle,
                                thread_num=1) as cycle:
                            self.tik_instance.vector_dup(
                                constant.MASK64,
                                data_vmul_ub_col2img_fp32[
                                    hoverlap * channel * col2img_w +
                                    cycle * V_MAX_REPEAT * FP32_MAX],
                                data_clean_scalar_fp32, V_MAX_REPEAT,
                                constant.STRIDE_ONE,
                                constant.REPEAT_STRIDE_EIGHT)
                    if v_rep_afmv_last != 0:
                        self.tik_instance.vector_dup(
                            constant.MASK64,
                            data_vmul_ub_col2img_fp32[
                                hoverlap * channel * \
                                col2img_w + \
                                v_rep_afmv_cycle * \
                                V_MAX_REPEAT * FP32_MAX],
                            data_clean_scalar_fp32,
                            v_rep_afmv_last,
                            constant.STRIDE_ONE,
                            constant.REPEAT_STRIDE_EIGHT)
    self.tik_instance.BuildCCE(kernel_name=kernel_name,
                               inputs=(data_input_origin, data_input,
                                       data_mask),
                               outputs=(data_output), enable_l2=False)
    return self.tik_instance
def scatter_nd_d(indices, x, y, shape, kernel_name="scatter_nd_d"):
    """
    the main function of scatter_nd_d

    Parameters
    ----------
    indices: dict, shape and datatype, datatype supports int32
    x: dict, shape and datatype, datatype supports float32, float16,
       int32, int8, uint8
    y: dict, shape and datatype, datatype supports float32, float16,
       int32, int8, uint8
    shape: output shape
    kernel_name: cce kernel name, default value is "scatter_nd_d"

    Returns
    -------
    tik_instance: tik_instance
    """
    check_param(indices, x, y, shape, kernel_name)
    # Fast path for 1-D updates handled by a dedicated kernel.
    if _check_1d_updates(indices, x, y):
        return _scatter_nd_d_1d(indices, x, y, kernel_name)

    indices_shape = indices.get("shape")
    indice_len = scatter_nd_d_help.get_indice_len(indices_shape)
    # Elements in one update slice (updates total / number of indices).
    update_each_size = scatter_nd_d_help.get_shape_total_number(
        x.get("shape")) // indice_len
    block_dim, loop_cycle = get_blockdim_and_loop_cycle(
        x, shape, update_each_size)
    output_shape = scatter_nd_d_help.get_shape_total_number(shape)
    # Output split into slices of update_each_size elements; last_spilt
    # is the remainder that does not divide evenly over block_dim cores.
    output_spilts = output_shape // update_each_size
    last_spilt = output_spilts - output_spilts // block_dim * block_dim
    tik_instance = tik.Tik()
    input_param = (indices, x, y, shape)
    scatter = scatter_nd_d_help.ScatterNd(input_param, tik_instance)
    with tik_instance.for_range(0, block_dim,
                                block_num=block_dim) as block_id:
        process = scatter_nd_d_help.ScatterProcess(scatter.tik_instance,
                                                   scatter.updates,
                                                   scatter.indices,
                                                   scatter.shape)
        cycle_each_block = tik_instance.Scalar("int32")
        cycle_each_block.set_as(loop_cycle)
        output_offset = tik_instance.Scalar("int32")
        output_size = tik_instance.Scalar("int32")
        output_size.set_as(cycle_each_block * process.update_each_size)
        with tik_instance.if_scope(tik.all(block_dim == \
                                           constant.MAX_BLOCK_NUMBER,
                                           last_spilt != 0,
                                           block_id < last_spilt)):
            # Fully-used device: the first last_spilt cores each take one
            # extra output slice.
            cycle_each_block.set_as(loop_cycle + 1)
            output_size.set_as(cycle_each_block *
                               process.update_each_size)
            output_offset.set_as(block_id * output_size)
        with tik_instance.else_scope():
            # Cores after the extra-slice region start past those
            # last_spilt extra slices.
            # NOTE(review): when block_dim < MAX_BLOCK_NUMBER this branch
            # still adds last_spilt * update_each_size — presumably
            # get_blockdim_and_loop_cycle guarantees last_spilt == 0 in
            # that case; confirm.
            output_offset.set_as(block_id*output_size + \
                                 last_spilt*process.update_each_size)
        scatter.initial_output(process, output_offset, output_size)
        scatter.update_data(process, cycle_each_block, output_offset)
    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=(scatter.input_indices_gm,
                                  scatter.input_updates_gm),
                          outputs=(scatter.output_y_gm),
                          enable_l2=False)
    return tik_instance
def _do_vec_dup(pattern, obj, max_num, blk_idx, mark, axis):
    """
    Fill the top or bottom padding region of one axis with the pad value,
    spreading the work over physical cores.

    Params:
    top_address: start address for top padding.
    top_div_core: dividing line between two types of cores in top padding.
    top_total_core: physical cores for top padding.
    top_core_vol_x: volume of data processed by each core(type_x) for top
    padding.
    top_core_gap_x: gap between different cores(type_x) for top padding.

    Solution: MAX_CORE = 32
    in_shape is [34,16,16,16,...], func will work in [0, ] only.
    in_shape is [16,16,16,16,...], func will work in [0, 1].
    """
    # Select the per-axis layout parameters for the requested side.
    if pattern == "top":
        begin_index = obj.top_address[axis]
        division_core = obj.top_div_core[axis]
        total_core = obj.top_total_core[axis]
        core_data_0 = obj.top_core_vol_0[axis]
        core_data_1 = obj.top_core_vol_1[axis]
        core_gap_0 = obj.top_core_gap_0[axis]
        core_gap_1 = obj.top_core_gap_1[axis]
        pad_data = obj.top_vol[axis]
    else:
        begin_index = obj.bottom_address[axis]
        division_core = obj.bottom_div_core[axis]
        total_core = obj.bottom_total_core[axis]
        core_data_0 = obj.bottom_core_vol_0[axis]
        core_data_1 = obj.bottom_core_vol_1[axis]
        core_gap_0 = obj.bottom_core_gap_0[axis]
        core_gap_1 = obj.bottom_core_gap_1[axis]
        pad_data = obj.bottom_vol[axis]

    # discriminate first layer or not.
    # offset is the residue left after type-0 and type-1 cores take their
    # shares; it is folded into the dividing core's volume below.
    offset = obj.tik_instance.Scalar("int64", name="cir_offset_")
    offset_value = pad_data - core_data_0 * (division_core + 1) \
                   - core_data_1 * (total_core - division_core - 1)
    offset.set_as(offset_value)
    with obj.tik_instance.if_scope(pad_data - core_data_0 == 0):
        # not the first layer
        offset.set_as(0)

    vir_num, block_index = max_num, blk_idx

    # vector_dup: all physical cores.
    # mark == 1 means the UB buffer already holds the pad value; skip the
    # redundant fill.
    with obj.tik_instance.if_scope(mark != 1):
        set_vector_dup(obj, vir_num, 0)

    # data_move
    # Cores [0, division_core): type-0 volume at type-0 gaps.
    with obj.tik_instance.if_scope(block_index < division_core):
        dst_idx = begin_index + block_index * core_gap_0
        copy_buf2gm_circulation(obj, core_data_0, vir_num, dst_idx)
    # Core == division_core: type-0 volume plus the residue.
    with obj.tik_instance.if_scope(block_index == division_core):
        dst_idx = begin_index + division_core * core_gap_0
        copy_buf2gm_circulation(obj, core_data_0 + offset, vir_num,
                                dst_idx)
    # Cores (division_core, total_core): type-1 volume at type-1 gaps,
    # starting after everything the type-0 cores wrote.
    with obj.tik_instance.if_scope(
            tik.all(block_index > division_core,
                    block_index < total_core)):
        begin_index += core_gap_0 * (division_core + 1) + offset
        block_index = block_index - (division_core + 1)
        dst_idx = begin_index + block_index * core_gap_1
        copy_buf2gm_circulation(obj, core_data_1, vir_num, dst_idx)
def update_each_slice(self, process, update_offset, start_address):
    """
    Apply one update slice to the output: load the current output and the
    update data into UB, accumulate duplicate indices, and write the
    result back, chunked by MAX_UB_ELEMENT_NUMBER elements.

    Parameters
    ----------
    process: ScatterProcess instance holding the scatter_nd buffers and
        loop parameters
    update_offset: Scalar, element offset into the updates GM tensor
        (advanced in place as chunks are consumed)
    start_address: element offset into the output GM tensor where this
        slice begins

    Returns
    -------
    None
    """
    output_offset = self.tik_instance.Scalar("int32")
    output_offset.set_as(start_address)
    # Current chunk size in bytes; shrunk on the final iteration.
    total_size = self.tik_instance.Scalar("int32")
    total_size.set_as(MAX_UB_ELEMENT_NUMBER * self.data_size)
    with self.tik_instance.for_range(0,
                                     process.loop_update) as update_cycle:
        with self.tik_instance.if_scope(update_cycle == \
                                        process.loop_update - 1):
            total_size.set_as(process.last_update_ub_size *
                              self.data_size)
        nburst = common_util.get_datamove_nburst(self.tik_instance,
                                                 total_size)
        repeats = common_util.get_vector_repeat_times(
            self.tik_instance, total_size)
        # Load the update chunk and the matching output chunk into UB.
        self.tik_instance.data_move(process.input_update_ub,
                                    self.input_updates_gm[update_offset],
                                    constant.SID,
                                    constant.DEFAULT_NBURST, nburst,
                                    constant.STRIDE_ZERO,
                                    constant.STRIDE_ZERO)
        self.tik_instance.data_move(process.input_ub,
                                    self.output_y_gm[output_offset],
                                    constant.SID,
                                    constant.DEFAULT_NBURST, nburst,
                                    constant.STRIDE_ZERO,
                                    constant.STRIDE_ZERO)
        dtype = process.input_ub.dtype
        input_ub_fp16 = None
        # int8/uint8 cannot be added directly on the vector unit: widen
        # the loaded output chunk to fp16 first.
        if dtype == constant.DATA_TYPE_INT8 \
                or dtype == constant.DATA_TYPE_UINT8:
            input_shape = process.input_update_ub.shape
            total_number = constant.DATA_SIZE_TWO * \
                           get_shape_total_number(input_shape)
            input_ub_fp16 = self.tik_instance.Tensor(
                constant.DATA_TYPE_FP16, (total_number, ),
                name="input_ub_fp16", scope=tik.scope_ubuf)
            self.tik_instance.vconv(
                constant.MASK128, "", input_ub_fp16, process.input_ub,
                repeats * constant.DATA_SIZE_TWO, constant.STRIDE_ONE,
                constant.STRIDE_ONE, constant.REPEAT_STRIDE_EIGHT,
                constant.REPEAT_STRIDE_FOUR)
        element_num = self.tik_instance.Scalar("int32")
        element_num.set_as(total_size // self.data_size)
        # Accumulate updates that target the same indices into input_ub.
        self.add_same_indices(process, repeats, input_ub_fp16,
                              element_num)
        with self.tik_instance.if_scope(tik.all(total_size % \
                                                constant.BLOCK_SIZE != 0, \
                                                process.update_each_size * \
                                                self.data_size >= \
                                                constant.BLOCK_SIZE)):
            # Chunk end is not 32B aligned: use the safe move-out that
            # avoids overwriting the neighbouring GM region.
            self.move_out_non32_alignment(process, output_offset,
                                          element_num)
        with self.tik_instance.else_scope():
            self.tik_instance.data_move(self.output_y_gm[output_offset],
                                        process.input_ub,
                                        constant.SID,
                                        constant.DEFAULT_NBURST, nburst,
                                        constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)
        # Advance both cursors by the elements consumed this chunk.
        output_offset.set_as(output_offset + element_num)
        update_offset.set_as(update_offset + element_num)