Example #1
    def get_nms_all_class_result(self, batch, topk_num_ecah_class):
        """
        handle the NMS result

        Parameters
        ----------
        batch: batch index
        topk_num_ecah_class: the number of NMS results

        Returns
        -------
        None
        """
        with self.instance.if_scope(
                tik.all(self.keep_top_k > -1,
                        topk_num_ecah_class > self.keep_top_k)):
            with self.instance.new_stmt_scope():
                topk2_tail_init_tmp_ub = self.instance.Tensor(
                    self.dtype, (128, ),
                    name="topk2_in_data_tmp_ub",
                    scope=tik.scope_ubuf)
                self.instance.vector_dup(self.mask, topk2_tail_init_tmp_ub, 0,
                                         128 // self.mask, 1, 8)

                topk2_tail_num = self.instance.Scalar("int32",
                                                      "topk_num_ecah_class",
                                                      16)
                burst_tail_scalar = self.instance.Scalar(
                    "int32", "burst_tail_scalar", 0)
                self.get_tersor_data_burst_val(True, topk2_tail_num,
                                               burst_tail_scalar)

                self.instance.data_move(
                    self.topk2_in_gm[batch, topk_num_ecah_class, 0],
                    topk2_tail_init_tmp_ub, 0, 1, burst_tail_scalar, 0, 0)
                self.instance.data_move(
                    self.topk3_in_gm[batch, topk_num_ecah_class, 0],
                    topk2_tail_init_tmp_ub, 0, 1, burst_tail_scalar, 0, 0)

            self.sort_all_class(batch)
            self.sort_for_get_label(batch)

        with self.instance.new_stmt_scope():
            # set the output box number and move it out to gm
            out_box_num_ub = self.instance.Tensor("int32", (8, ),
                                                  name="out_box_num_ub",
                                                  scope=tik.scope_ubuf)
            with self.instance.if_scope(
                    tik.all(self.keep_top_k > -1,
                            topk_num_ecah_class > self.keep_top_k)):
                out_box_num_ub[0].set_as(self.topk2_num)
            with self.instance.else_scope():
                out_box_num_ub[0].set_as(topk_num_ecah_class)
            self.instance.data_move(self.out_box_num_gm[batch, 0],
                                    out_box_num_ub, 0, 1, 1, 0, 0)

        self.adjust_topk_crood(batch, topk_num_ecah_class)
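
Every example on this page hinges on the same primitive: tik.all(...) folds several runtime conditions into one expression that if_scope / else_scope can branch on. A minimal sketch of that pattern, assuming the Ascend te.tik module (names and shapes are illustrative, and the Tik() constructor arguments vary by CANN version):

from te import tik

tik_inst = tik.Tik()
x_gm = tik_inst.Tensor("float16", (128,), name="x_gm", scope=tik.scope_gm)
y_gm = tik_inst.Tensor("float16", (128,), name="y_gm", scope=tik.scope_gm)
keep_top_k = tik_inst.Scalar("int32", name="keep_top_k", init_value=64)
num = tik_inst.Scalar("int32", name="num", init_value=100)

# The branch is taken only when every condition inside tik.all holds at runtime.
with tik_inst.if_scope(tik.all(keep_top_k > -1, num > keep_top_k)):
    buf = tik_inst.Tensor("float16", (128,), name="buf", scope=tik.scope_ubuf)
    tik_inst.data_move(buf, x_gm, 0, 1, 8, 0, 0)  # 128 fp16 = 8 x 32B bursts
    tik_inst.data_move(y_gm, buf, 0, 1, 8, 0, 0)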
Example #2
    def pad_case1(self, tik_instance):
        in_num = _prod(self.ou_shape)
        total_num = math.ceil(in_num * self.num_bit / MINI_UNIT)
        core_num = total_num
        if core_num > MAX_CORE:
            core_num = MAX_CORE

        split_core_index, \
        core_loop_before, \
        core_loop_after = _cal_core(total_num, core_num, MAX_CORE)
        ac_num_one = (MINI_UNIT // self.num_bit) * core_loop_before
        ac_num_two = (MINI_UNIT // self.num_bit) * core_loop_after

        with tik_instance.for_range(0, core_num,
                                    block_num=core_num) as blk_idx:
            if split_core_index + 1 == core_num:
                with tik_instance.if_scope(blk_idx <= core_num - 1):
                    begin_index = blk_idx * ac_num_one
                    self._pad_case1_main(tik_instance, ac_num_one, begin_index,
                                         self.ubuf)

            else:
                with tik_instance.if_scope(blk_idx <= split_core_index):
                    begin_index = blk_idx * ac_num_one
                    self._pad_case1_main(tik_instance, ac_num_one, begin_index,
                                         self.ubuf)

                with tik_instance.if_scope(
                        tik.all(blk_idx > split_core_index,
                                blk_idx < core_num)):
                    begin_index = ac_num_one * (split_core_index + 1)
                    block_index = blk_idx - (split_core_index + 1)
                    begin_index += block_index * ac_num_two
                    self._pad_case1_main(tik_instance, ac_num_two, begin_index,
                                         self.ubuf)
    def proposal_pooling_h(self, block_id, proposal_id, fm_c1_index):
        """
        load a pooled_h * fm_width slice of the feature map to UB, then
        max-pool along the h direction

        Parameters
        ----------
        block_id:  aicore id
        proposal_id: which proposal is now being processed
        fm_c1_index: c1 index of the feature map
        Returns
        -------
        None
        """
        with self.tik_instance.for_range(0, self.pooled_h) as poolh:
            proposal_fm_data = \
                self.tik_instance.Tensor(self.dtype,
                                         (self.fm_h//self.pooled_h+2,
                                          self.fm_w_align, self.fm_c0),
                                         name="proposal_data",
                                         scope=tik.scope_ubuf)

            scalar_roi_start_h = self.tik_instance.Scalar("int32")
            scalar_roi_start_h.set_as(self.roi_start_h[poolh, proposal_id])
            scalar_roi_start_w = self.tik_instance.Scalar("int32")
            scalar_roi_start_w.set_as(self.roi_start_w[0, proposal_id])
            scalar_roi_width = self.tik_instance.Scalar("int32")
            scalar_roi_width.set_as(self.roi_width[proposal_id])
            scalar_roi_bin_h = self.tik_instance.Scalar("int32")
            scalar_roi_bin_h.set_as(self.roi_bin_h[poolh, proposal_id])

            with self.tik_instance.if_scope(
                    tik.all(scalar_roi_bin_h != 0, scalar_roi_width != 0)):
                coeff = self.fm_c0 * TYPELEN_DICT[self.dtype] // 32
                self.tik_instance.data_move(
                    proposal_fm_data, self.fm_c0_data[scalar_roi_start_h,
                                                      scalar_roi_start_w, 0],
                    0, scalar_roi_bin_h, scalar_roi_width * coeff,
                    (self.fm_w - scalar_roi_width) * coeff,
                    (self.fm_w_align - scalar_roi_width) * coeff)

                ceil_loop = 16 // TYPELEN_DICT[self.dtype]
                with self.tik_instance.for_range(0,
                                                 ceil_div(scalar_roi_width,
                                                          ceil_loop)) as \
                        loop_w:
                    self.tik_instance.vec_max(
                        256 // TYPELEN_DICT[self.dtype],
                        self.pooled_h_res[poolh, ceil_loop * loop_w, 0],
                        proposal_fm_data[0, ceil_loop * loop_w, 0],
                        self.pooled_h_res[poolh, ceil_loop * loop_w, 0],
                        scalar_roi_bin_h, 0, self.fm_w_align * coeff, 0)
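
_cal_core itself is not shown on this page. Judging only from how pad_case1 consumes its three outputs, a plausible pure-Python reconstruction of the split looks like this (a sketch, not the original helper, which also receives MAX_CORE):

def _cal_core_sketch(total_num, core_num):
    # Hypothetical equivalent of _cal_core, inferred from its call sites:
    # cores 0..split_core_index run core_loop_before units each, and the
    # remaining cores run core_loop_after units each.
    core_loop_before = (total_num + core_num - 1) // core_num  # ceil
    core_loop_after = total_num // core_num                    # floor
    if core_loop_before == core_loop_after:  # evenly divisible
        split_core_index = core_num - 1
    else:
        split_core_index = total_num - core_loop_after * core_num - 1
    return split_core_index, core_loop_before, core_loop_after

# 10 units on 4 cores: cores 0-1 do 3 units each, cores 2-3 do 2 each.
assert _cal_core_sketch(10, 4) == (1, 3, 2)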
Example #4
def _do_vec_dup(pattern, obj, max_num, blk_idx, mark, axis):
    """
    Params:
    top_address: start address for top padding.
    top_div_core: dividing line between two types of cores in top padding.
    top_total_core: physical cores for top padding.
    top_core_vol_x: volume of data processed by each core(type_x) for top padding.
    top_core_gap_x: gap between different cores(type_x) for top padding.

    Solution: MAX_CORE = 32
    in_shape is [34,16,16,16,...],func will work in [0, ] only.
    in_shape is [16,16,16,16,...],func will work in [0, 1].
    """
    if pattern == "top":
        begin_index = obj.top_address[axis]
        division_core = obj.top_div_core[axis]
        total_core = obj.top_total_core[axis]
        core_data_0 = obj.top_core_vol_0[axis]
        core_data_1 = obj.top_core_vol_1[axis]
        core_gap_0 = obj.top_core_gap_0[axis]
        core_gap_1 = obj.top_core_gap_1[axis]
    else:
        begin_index = obj.bottom_address[axis]
        division_core = obj.bottom_div_core[axis]
        total_core = obj.bottom_total_core[axis]
        core_data_0 = obj.bottom_core_vol_0[axis]
        core_data_1 = obj.bottom_core_vol_1[axis]
        core_gap_0 = obj.bottom_core_gap_0[axis]
        core_gap_1 = obj.bottom_core_gap_1[axis]

    vir_num, block_index = max_num, blk_idx

    # vector_dup: all physical cores.
    with obj.tik_instance.if_scope(mark != 1):
        set_vector_dup(obj, vir_num, 0)

    # data_move: part of physical cores.
    with obj.tik_instance.if_scope(block_index <= division_core):
        dst_idx = begin_index + block_index * core_gap_0

        copy_buf2gm_circulation(obj.tik_instance, obj.num_bit, core_data_0,
                                vir_num, obj.buf, obj.output_gm, dst_idx)

    with obj.tik_instance.if_scope(
            tik.all(block_index > division_core, block_index < total_core)):
        begin_index += core_gap_0 * (division_core + 1)
        block_index = block_index - (division_core + 1)
        dst_idx = begin_index + block_index * core_gap_1

        copy_buf2gm_circulation(obj.tik_instance, obj.num_bit, core_data_1,
                                vir_num, obj.buf, obj.output_gm, dst_idx)
Example #5
    def row_in_core_exp(self, block_idx):
        """
        expression of whether current row is processed in the core

        Parameters
        ----------
        block_idx: core index

        Returns
        -------
        expression
        """
        return tik.all(self.reg_cur_row >= block_idx * self.indices_step,
                       self.reg_cur_row < self.reg_core_last_rows)
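
The expression is a plain interval-membership test: core block_idx owns the rows [block_idx * indices_step, reg_core_last_rows). The same check in plain Python, with illustrative numbers:

indices_step, core_last_rows = 8, 16   # core 1 owns rows [8, 16)
block_idx, cur_row = 1, 11
in_core = block_idx * indices_step <= cur_row < core_last_rows
assert in_core                         # row 11 is processed by core 1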
Example #6
 def move_ub_to_gm_with_tail(self, input_dict):
     """
     move data from ub to gm when c < 16
     """
     shape = input_dict.get("shape")
     dst_ub = input_dict.get("dst_ub")
     ub_tail = input_dict.get("ub_tail")
     tail_offset = input_dict.get("tail_offset")
     tail_num = input_dict.get("tail_num")
     block_num = input_dict.get("block_num")
     row_index = input_dict.get("row_index")
     out_index = input_dict.get("out_index")
     tail_start = input_dict.get("tail_start")
     total_loop = input_dict.get("total_loop")
     r_i = input_dict.get("r_i")
     num = input_dict.get("num")
     _, col_len, row_len = shape
     col_len_align = (col_len + 15) // 16 * 16
     with self.instance.if_scope(tik.all(row_index >= num, block_num > 1)):
         scalar = self.instance.Scalar(ub_tail.dtype)
         with self.instance.for_range(0, col_len) as time:
             scalar.set_as(dst_ub[r_i * col_len_align + time])
             ub_tail[tail_offset + time].set_as(scalar)
         tail_offset.set_as(tail_offset + col_len)
         with self.instance.if_scope(row_index == total_loop * row_len - 1):
             each_burst_num = 32 // self.dsize
             n_burst = self.instance.Scalar("int32")
             n_burst.set_as((tail_num * self.dsize) // 32)
             mod = self.instance.Scalar("int32")
             mod.set_as((tail_num * self.dsize) % 32)
              # tail is exactly 32B aligned
             with self.instance.if_scope(mod == 0):
                 self.instance.data_move(self.y_gm[tail_start], ub_tail, 0,
                                         1, n_burst, 0, 0)
              # not 32B aligned: rewrite the last 32B block with overlap
             with self.instance.else_scope():
                 self.instance.data_move(self.y_gm[tail_start], ub_tail, 0,
                                         1, n_burst, 0, 0)
                 offset = tail_num - each_burst_num
                 scalar = self.instance.Scalar(ub_tail.dtype)
                 with self.instance.for_range(0, each_burst_num) as time:
                     scalar.set_as(ub_tail[offset + time])
                     ub_tail[time].set_as(scalar)
                 self.instance.data_move(self.y_gm[tail_start + offset],
                                         ub_tail, 0, 1, 1, 0, 0)
     with self.instance.else_scope():
         burst_len = col_len_align // 16
         self.instance.data_move(self.y_gm[out_index],
                                 dst_ub[r_i * col_len_align], 0, 1,
                                 burst_len, 0, 0)
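
The tail branch above uses a standard TIK trick: when the tail byte count is not a multiple of 32, write the full 32B bursts first, then rewind so the final write is exactly one 32B block that overlaps already-written data instead of overrunning the output. The arithmetic in plain Python, with illustrative sizes:

dsize = 2                             # bytes per element (fp16)
each_burst_num = 32 // dsize          # elements per 32B block -> 16
tail_num = 37                         # tail elements, not 32B aligned
n_burst = (tail_num * dsize) // 32    # full 32B bursts -> 2 (32 elements)
mod = (tail_num * dsize) % 32         # leftover bytes -> 10, so overlap
offset = tail_num - each_burst_num    # 21: start of the overlapping block
# Elements [0, 32) go out in n_burst bursts; [21, 37) is rewritten as one
# block, so [21, 32) is written twice with identical values.
assert offset + each_burst_num == tail_num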
Example #7
    def indices_inner_gather_last_1(self, indices_ub, res_ub, row_num_last, inner_indices_offset, gm_offset_base,
                                    output_offset, burst_len_row, burst_len_res):
        """
        process last indices for tiling mode 1

        Parameters
        ----------
        indices_ub: cache indices data in UB
        res_ub: cache result data in UB
        row_num_last: the last indices num
        inner_indices_offset: inner indices num offset
        gm_offset_base: base of gm offset
        output_offset: output offset
        burst_len_row: burst length of one params row
        burst_len_res: burst length of result

        Returns
        -------
        None
        """
        tik_instance = self.tik_instance
        block_ub = tik_instance.Tensor(self.params_dtype, (self.block_elem,), name="block_ub", scope=tik.scope_ubuf)

        with tik_instance.for_range(0, row_num_last, thread_num=1) as row_i:
            indices_i_value = tik_instance.Scalar(dtype=self.indices_dtype, name="indices_i_value")
            indices_i_value.set_as(indices_ub[inner_indices_offset + row_i])

            gm_offset_i = (gm_offset_base + indices_i_value) * self.params_row

            # copy params row to block_ub from gm
            tik_instance.data_move(block_ub, self.x[gm_offset_i],
                                   0, 1, burst_len_row, 0, 0)

            res_ub_offset = row_i * self.params_row
            with tik_instance.for_range(0, self.params_row) as i:
                res_ub[res_ub_offset + i].set_as(block_ub[i])

        # copy result data to gm from ub
        tail_elem = (row_num_last * self.params_row) % self.block_elem
        with tik_instance.if_scope(tik.all(tail_elem != 0, burst_len_res > 1)):
            with tik_instance.for_range(0, self.block_elem) as num_i:
                block_ub[num_i].set_as(res_ub[row_num_last * self.params_row - self.block_elem + num_i])
            tik_instance.data_move(self.y[output_offset], res_ub, 0,
                                   1, burst_len_res - 1, 0, 0)
            tik_instance.data_move(self.y[output_offset + (row_num_last * self.params_row - self.block_elem)],
                                   block_ub, 0, 1, 1, 0, 0)
        with tik_instance.else_scope():
            tik_instance.data_move(self.y[output_offset], res_ub,
                                   0, 1, burst_len_res, 0, 0)
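
The gather tail plays the same overlap trick in two steps: back up the last block_elem elements into block_ub, write burst_len_res - 1 full bursts of the result, then drop the backed-up block at the overlapping offset. A NumPy model with made-up sizes shows the outcome is identical to a plain copy:

import numpy as np

block_elem = 8                          # elements per 32B block (fp32)
total = 21                              # row_num_last * params_row, unaligned
res_ub = np.arange(total, dtype=np.float32)   # staged result ("UB")
y = np.zeros(64, dtype=np.float32)            # output ("GM"), padded

block_ub = res_ub[total - block_elem:total].copy()   # back up last block
burst_len_res = -(-total // block_elem)              # ceil -> 3 bursts
head = (burst_len_res - 1) * block_elem
y[:head] = res_ub[:head]                             # bulk copy
y[total - block_elem:total] = block_ub               # overlapping final block
assert np.array_equal(y[:total], res_ub)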
Example #8
    def row_in_ub_exp(self):
        """
        expression of whether current row is already loaded on ubuf

        Parameters
        ----------
        None

        Returns
        -------
        expression
        """
        return tik.all(
            self.reg_cur_row >= self.reg_row_start,
            self.reg_cur_row < self.reg_row_start + self.num_multi_rows)
    def calc_multi_indices(self, indices_ub, indices_num, burst_len_multi_row, ub_tuples, ub_block_tuples):
        """
        calculate multiple rows; multiple rows are read at once to avoid
        frequently loading small pieces of data from gm to ubuf

        Parameters
        ----------
        indices_ub: UB tensor caching the indices data
        indices_num: how many indices to calculate
        burst_len_multi_row: burst length of multi row
        ub_tuples: contains var_ub, accum_ub, linear_ub, grad_ub, tmp_ub, tmp2_ub
        ub_block_tuples: contains var_ub_block, accum_ub_block, linear_ub_block, grad_ub_block

        Returns
        -------
        None
        """
        tik_instance = self.tik_instance
        with tik_instance.for_range(0, indices_num) as indices_i:
            self.var_cur_row.set_as(indices_ub[indices_i])

            # check whether current indices is within the processing range of the core
            with tik_instance.if_scope(tik.all(self.var_cur_row >= self.core_rows_start_index,
                                               self.var_cur_row < self.core_rows_end_index)):
                # check whether the var, accum, linear corresponding to current indices is cached in the UB
                with tik_instance.if_scope(tik.all(self.var_cur_row >= self.cached_rows_start_index,
                                                   self.var_cur_row < self.cached_rows_start_index +
                                                   self.num_multi_rows)):
                    self.calc_a_small_row(indices_i, ub_tuples, ub_block_tuples)
                with tik_instance.else_scope():
                    with tik_instance.if_scope(self.cached_rows_start_index < self.var_rows):
                        self.save_multi_rows(ub_tuples, burst_len_multi_row)
                    self.load_multi_rows(ub_tuples, burst_len_multi_row)
                    self.calc_a_small_row(indices_i, ub_tuples, ub_block_tuples)
        with tik_instance.if_scope(self.cached_rows_start_index < self.var_rows):
            self.save_multi_rows(ub_tuples, burst_len_multi_row)
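
calc_multi_indices is managing a small software cache: a window of num_multi_rows consecutive rows held in UB, flushed (save_multi_rows) and refilled (load_multi_rows) whenever an index falls outside it. A pure-Python sketch of just the cache policy; the window placement after a load is an assumption, since the real kernel decides it inside load_multi_rows:

class RowCacheSketch:
    """Illustrative model of the UB row-cache policy used above."""

    def __init__(self, num_multi_rows, var_rows):
        self.num_multi_rows = num_multi_rows
        self.var_rows = var_rows
        # var_rows doubles as a sentinel meaning "nothing cached yet".
        self.cached_rows_start_index = var_rows

    def access(self, row):
        hit = (self.cached_rows_start_index <= row <
               self.cached_rows_start_index + self.num_multi_rows)
        if not hit:
            if self.cached_rows_start_index < self.var_rows:
                self.save_multi_rows()          # flush the current window
            self.cached_rows_start_index = row  # load a window starting here
        # ... calc_a_small_row(row) then works against the cached window ...

    def save_multi_rows(self):
        pass  # data_move UB -> GM in the real kernel

cache = RowCacheSketch(num_multi_rows=4, var_rows=100)
cache.access(10)                        # miss: loads window [10, 14)
cache.access(12)                        # hit: window unchanged
assert cache.cached_rows_start_index == 10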
Example #10
    def adjust_topk_crood(self, batch, topk_num_ecah_class):
        """
        modify the x1 and y1 values

        Parameters
        ----------
        batch: batch index
        topk_num_ecah_class: the number of output boxes

        Returns
        -------
        None
        """
        with self.instance.new_stmt_scope():

            box_data_ub = self.instance.Tensor(self.dtype,
                                               (self.out_box_gm.shape[1], 8),
                                               name="box_data_ub",
                                               scope=tik.scope_ubuf)
            topk3_out_ub = self.instance.Tensor(self.dtype,
                                                (self.out_box_gm.shape[1], 8),
                                                name="topk3_out_ub",
                                                scope=tik.scope_ubuf)
            burst_val_tmp_scalar = self.instance.Scalar(
                "int32", "burst_val_tmp_scalar", 0)

            with self.instance.if_scope(
                    tik.all(self.keep_top_k > -1,
                            topk_num_ecah_class > self.keep_top_k)):
                if self.keep_top_k > 0:
                    self.instance.data_move(
                        box_data_ub, self.out_box_gm_tmp[batch, 0, 0], 0, 1,
                        math.ceil(self.keep_top_k * 8 / self.burnest_len), 0,
                        0)
                    self.instance.data_move(
                        topk3_out_ub, self.topk3_out_gm[batch, 0, 0], 0, 1,
                        math.ceil(self.keep_top_k * 8 / self.burnest_len), 0,
                        0)

            with self.instance.else_scope():
                self.get_tersor_data_burst_val(True, topk_num_ecah_class,
                                               burst_val_tmp_scalar)
                with self.instance.if_scope(burst_val_tmp_scalar > 0):
                    self.instance.data_move(box_data_ub,
                                            self.topk2_in_gm[batch, 0, 0], 0,
                                            1, burst_val_tmp_scalar, 0, 0)
            self.set_crood_data_order(batch, topk_num_ecah_class, box_data_ub,
                                      topk3_out_ub)
Example #11
    def format_transfer_case_one(self, tik_instance):
        """
        the transfer process when UB cannot hold N1 * N0 * X0 data
        """
        ub_ori_data = self.ub_memory - self.ub_memory % (CUBE_SIZE * CUBE_SIZE)
        ub_trans_data = ub_ori_data
        loop_n, loop_remainder = _cal_core_loop_python(CUBE_SIZE * CUBE_SIZE,
                                                       self.dst_shape[1],
                                                       ub_ori_data)
        # divide the core according to X0
        total_core_loop_num = self.dst_shape[0]
        core_number = _set_core_num(total_core_loop_num)

        with tik_instance.for_range(0, core_number, block_num=core_number) \
                as num_core:
            ub_ori = tik_instance.Tensor(self.dtype, (ub_ori_data, ),
                                         name="ub_ori",
                                         scope=tik.scope_ubuf)
            ub_trans = tik_instance.Tensor(self.dtype, (ub_trans_data, ),
                                           name="ub_trans",
                                           scope=tik.scope_ubuf)
            core_loop, sum_core = _cal_core(tik_instance, total_core_loop_num,
                                            num_core, core_number)
            with tik_instance.for_range(0, core_loop) as num_core_loop:
                total_core_loop = sum_core + num_core_loop
                num_x0 = total_core_loop
                is_last = tik_instance.Scalar("uint64", init_value=0)
                with tik_instance.for_range(0, self.dst_shape[1] // loop_n) \
                        as num_n_loop:
                    with tik_instance.if_scope(
                            tik.all(
                                num_n_loop == self.dst_shape[1] // loop_n - 1,
                                self.dst_shape[1] % loop_n == 0)):
                        is_last.set_as(1)
                    self.data_move_case_two(tik_instance, ub_ori, ub_trans,
                                            is_last, num_x0, num_n_loop,
                                            loop_n, loop_n)

                if loop_remainder != 0:
                    is_last.set_as(1)
                    self.data_move_case_two(tik_instance, ub_ori, ub_trans,
                                            is_last, num_x0,
                                            self.dst_shape[1] // loop_n,
                                            loop_n, loop_remainder)

        return tik_instance
    def compute_mode_1(self, block_id):
        """
        compute for tiling mode 1, where the var row is 32B aligned

        Parameters
        ----------
        block_id: id of ai core

        Returns
        -------
        None
        """
        tik_instance = self.tik_instance
        indices_ub = tik_instance.Tensor(self.indices_dtype, (self.indices_nums_once,),
                                         name="indices_ub", scope=tik.scope_ubuf)
        var_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                     name="var_ub", scope=tik.scope_ubuf)
        accum_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                       name="accum_ub", scope=tik.scope_ubuf)
        linear_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                        name="linear_ub", scope=tik.scope_ubuf)
        grad_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                      name="grad_ub", scope=tik.scope_ubuf)
        tmp_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                     name="tmp_ub", scope=tik.scope_ubuf)
        tmp2_ub = tik_instance.Tensor(self.var_dtype, (self.one_part_elem,),
                                      name="tmp2_ub", scope=tik.scope_ubuf)
        ub_tuples = (var_ub, accum_ub, linear_ub, grad_ub, tmp_ub, tmp2_ub)

        self.var_cur_row = tik_instance.Scalar(dtype=self.tiling_dtype, name="var_cur_row")
        self.var_row_repeat = ceil_value(self.var_row_elem, self.vector_elem)

        # process indices_num_each_core: indices_nums_once * indices_loop_num + indices_nums_last
        with tik_instance.for_range(0, self.indices_loop_num) as indices_loop_i:
            indices_num_offset = block_id * self.indices_num_each_core + indices_loop_i * self.indices_nums_once
            self.process_num_indices(ub_tuples, indices_ub, self.indices_nums_once, indices_num_offset)

        with tik_instance.if_scope(self.indices_nums_last > 0):
            indices_num_offset = block_id * self.indices_num_each_core + \
                                 self.indices_loop_num * self.indices_nums_once
            self.process_num_indices(ub_tuples, indices_ub, self.indices_nums_last, indices_num_offset)

        with tik_instance.if_scope(tik.all(self.indices_num_remaining > 0, block_id < self.indices_num_remaining)):
            indices_num_offset = self.indices_num_each_core * self.need_core_num + block_id
            self.process_num_indices(ub_tuples, indices_ub, 1, indices_num_offset)
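
The comment above spells out the per-core decomposition, indices_num_each_core = indices_nums_once * indices_loop_num + indices_nums_last. With illustrative numbers:

indices_num_each_core = 1000        # indices assigned to this core
indices_nums_once = 256             # indices that fit in UB per pass
indices_loop_num = indices_num_each_core // indices_nums_once   # 3 full passes
indices_nums_last = indices_num_each_core % indices_nums_once   # 232 tail
assert (indices_nums_once * indices_loop_num + indices_nums_last
        == indices_num_each_core)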
Example #13
    def pad_case0(self, tik_instance, split_core_idx, core_loop_list,
                  model_list):

        with tik_instance.for_range(0, MAX_CORE,
                                    block_num=MAX_CORE) as blk_idx:
            # use as many cores as possible (MAX_CORE)
            # outermost padding (top, bottom)
            # vec_mark[x] becomes True once pad_vec_dup_outermost has run
            # and model_list[0][x] is not 'ub_reorder'; in that case the
            # computation that follows will not vec_dup again
            in_num_top = self.in_paddings[0][0] * _prod(self.ou_shape[1:])
            in_num_bottom = self.in_paddings[0][1] * _prod(self.ou_shape[1:])
            vec_mark = [False, False]
            if max(in_num_top, in_num_bottom) > 0:
                self.pad_vec_dup_outermost(tik_instance, in_num_top,
                                           in_num_bottom, blk_idx)
                # vec_dup is core-independent:
                # every core must follow the same rule
                if model_list[0][0] != "ub_reorder":
                    vec_mark[0] = True
                if model_list[0][1] != "ub_reorder":
                    vec_mark[1] = True

            with tik_instance.if_scope(blk_idx <= split_core_idx):
                src_gm = 0
                dst_gm = in_num_top
                self._pad_case0_main(tik_instance, core_loop_list[0],
                                     model_list[0], blk_idx, src_gm, dst_gm,
                                     vec_mark[0])

            if core_loop_list[0] != core_loop_list[1]:
                with tik_instance.if_scope(
                        tik.all(blk_idx > split_core_idx,
                                blk_idx < self.core)):
                    processed_in_shape = self.in_shape.copy()
                    processed_in_shape[0] = core_loop_list[0]
                    processed_ou_shape = self.ou_shape.copy()
                    processed_ou_shape[0] = core_loop_list[0]
                    src_gm += (split_core_idx + 1) * _prod(processed_in_shape)
                    dst_gm += (split_core_idx + 1) * _prod(processed_ou_shape)
                    blk_idx = blk_idx - split_core_idx - 1
                    self._pad_case0_main(tik_instance, core_loop_list[1],
                                         model_list[1], blk_idx, src_gm,
                                         dst_gm, vec_mark[1])
Example #14
 def check_batch(num_elements, elements_offset):
     idx_blocks = (num_elements + indices_block_len -
                   1) // indices_block_len
     uds_blocks = (num_elements + updates_block_len -
                   1) // updates_block_len
     tik_instance.data_move(indices_ub, indices_gm[elements_offset], 0, 1,
                            idx_blocks, 0, 0)
     tik_instance.data_move(updates_ub, updates_gm[elements_offset], 0, 1,
                            uds_blocks, 0, 0)
     with tik_instance.for_range(0, num_elements) as k:
         indices_var.set_as(indices_ub[k])
         with tik_instance.if_scope(
                 tik.all(indices_var >= core_start,
                         indices_var < core_end)):
             cur_var.set_as(var_ub[indices_var - core_start])
             cur_update.set_as(updates_ub[k])
             acc_var.set_as(cur_var + cur_update)
             var_ub[indices_var - core_start] = acc_var
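
check_batch is a per-core scatter-add: each core owns the row range [core_start, core_end) and accumulates only the updates whose index lands in that range, which is exactly what the tik.all guard enforces. The NumPy equivalent of the loop body:

import numpy as np

core_start, core_end = 4, 8                 # rows owned by this core
var = np.zeros(core_end - core_start)       # var_ub: this core's slice
indices = np.array([2, 5, 5, 7, 9])         # indices_ub
updates = np.array([1., 1., 2., 3., 4.])    # updates_ub

for k in range(len(indices)):
    if core_start <= indices[k] < core_end:  # the tik.all guard above
        var[indices[k] - core_start] += updates[k]

assert var.tolist() == [0., 3., 0., 3.]      # row 5 got 1+2, row 7 got 3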
Example #15
def _recursion_compute(obj, blk_idx):
    """
    recur_cond: condition that torch off stride between different cores.
    recur_gap_x: gap_x between in diff cores.
    recur_loop_x: work times by each core(type_x).
    recur_in_vol: volume of input_data by each core do once.
    recur_div_core: dividing line between two types of core.
    recur_total_core: physical cores in recursion.
    recur_start_address: start address in recursion
    """
    tik_instance = obj.tik_instance
    cond, gap0, gap1 = (obj.recur_cond[0], obj.recur_gap_0[0],
                        obj.recur_gap_1[0])
    loop0, loop1, in_vol = (obj.recur_loop_0[0], obj.recur_loop_1[0],
                            obj.recur_in_vol[0])
    max_num = obj.tik_instance.Scalar("int32", name="max_num_")

    def _main(processed, loop, block_index):
        src_ub = 0
        dst_ub = 0
        dst_gm = obj.recur_start_address[0]
        src_gm = 0
        axis = 0
        with tik_instance.for_range(0, loop) as idx:
            sum_core = processed + block_index * loop + idx
            dst_gm += sum_core / cond * gap0 + sum_core % cond * gap1
            src_gm += sum_core * in_vol
            _recursion(obj, axis, dst_gm, src_gm, src_ub, dst_ub, max_num,
                       False)

    with tik_instance.if_scope(blk_idx <= obj.recur_div_core[0]):
        pro = 0
        _main(pro, loop0, blk_idx)

    with tik_instance.if_scope(
            tik.all(blk_idx > obj.recur_div_core[0],
                    blk_idx < obj.recur_total_core[0])):

        pro = (obj.recur_div_core[0] + 1) * loop0
        blk_idx = blk_idx - obj.recur_div_core[0] - 1
        _main(pro, loop1, blk_idx)
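
The one subtle line in _main is the destination address, dst_gm += sum_core / cond * gap0 + sum_core % cond * gap1: every cond work items advance the address by gap0, and the position inside the current group advances it by gap1 (on TIK integer scalars the / above behaves as integer division). A worked example in plain Python:

cond, gap0, gap1 = 4, 100, 10        # illustrative values
for sum_core in (0, 3, 4, 9):
    dst = sum_core // cond * gap0 + sum_core % cond * gap1
    print(sum_core, "->", dst)       # 0 -> 0, 3 -> 30, 4 -> 100, 9 -> 210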
Example #16
        def _do_vec_dup(ac_num, vir_num, begin_index, block_index, mark):
            total_num = ac_num // (MINI_UNIT // self.num_bit)
            if total_num >= MAX_CORE:
                core_num = MAX_CORE
            else:
                core_num = total_num

            split_core_index, \
            core_loop_before, \
            core_loop_after = _cal_core(total_num, core_num, MAX_CORE)
            ac_num_one = (MINI_UNIT // self.num_bit) * core_loop_before
            ac_num_two = (MINI_UNIT // self.num_bit) * core_loop_after

            if not mark:
                self.set_vector_dup(tik_instance, vir_num, self.ubuf, 0)

            if split_core_index + 1 == core_num:
                with tik_instance.if_scope(block_index <= split_core_index):
                    begin_index += block_index * ac_num_one
                    self.copy_ubuf_2_gm_case01(tik_instance, ac_num_one,
                                               vir_num, self.ubuf, 0,
                                               begin_index)

            else:
                with tik_instance.if_scope(block_index <= split_core_index):
                    begin_index_new = begin_index + block_index * ac_num_one
                    self.copy_ubuf_2_gm_case01(tik_instance, ac_num_one,
                                               vir_num, self.ubuf, 0,
                                               begin_index_new)

                with tik_instance.if_scope(
                        tik.all(block_index > split_core_index,
                                block_index < core_num)):
                    begin_index += ac_num_one * (split_core_index + 1)
                    block_index = block_index - (split_core_index + 1)
                    begin_index_new = begin_index + block_index * ac_num_two
                    self.copy_ubuf_2_gm_case01(tik_instance, ac_num_two,
                                               vir_num, self.ubuf, 0,
                                               begin_index_new)
    def process_each_indices(self, process, loop_param, output_offset):
        """
        process each indices

        Parameters
        ----------
        process: ScatterProcess instance, which stores the scatter_nd parameters
        loop_param: a tuple keeping the loop params
        output_offset: the offset of output data

        Returns
        -------
        None
        """
        update_offset = self.tik_instance.Scalar("int32")
        update_offset.set_as(0)
        with self.tik_instance.for_range(0, loop_param[2]) as ind_cycle:
            start_address = self.tik_instance.Scalar("int32")
            start_address.set_as(0)
            for k in range(process.get_each_size()):
                indices_scalar = self.tik_instance.Scalar("int32")
                indices_scalar.set_as(ind_cycle * process.get_each_size() + k)
                indices_scalar.set_as(process.input_indices_ub[indices_scalar])
                start_address.set_as(start_address +
                                     indices_scalar *
                                     process.elem_of_each_dim[k])
            end_address = self.tik_instance.Scalar("int32")
            end_address.set_as(output_offset +
                               (loop_param[3] - 1) * process.update_each_size)
            with self.tik_instance.if_scope(
                    tik.all(start_address >= output_offset,
                            start_address <= end_address)):
                update_offset.set_as((loop_param[0] * loop_param[1] //
                                      process.get_each_size() +
                                      ind_cycle) * process.update_each_size)
                self.update_each_slice(process, update_offset, start_address)
Example #18
    def move_without_transform(self, shape):
        """
        when C = 1 or H*W = 1, directly move data in and out
        """
        ub_size = (UB_SIZE_B - 1024) // 2 // self.dsize // 16 * 16
        if shape[0] <= 16:
            block_num = 1
        else:
            all_block_num = shape[0] // 16
            block_num = AICORE_NUM
            if all_block_num < AICORE_NUM:
                block_num = all_block_num
        each_len = shape[0] // block_num
        each_mod = shape[0] % block_num
        thread_num = 1
        if each_len // ub_size > 1:
            thread_num = 2

        with self.instance.for_range(0, block_num, block_num=block_num) \
                as block_id:
            each_size = self.instance.Scalar("int32")
            each_size.set_as(each_len)
            with self.instance.if_scope(block_id == block_num - 1):
                each_size.set_as(each_len + each_mod)
            ub_loop = each_size // ub_size
            ub_mod = each_size % ub_size
            with self.instance.for_range(0, ub_loop,
                                         thread_num=thread_num) as loop_id:
                src_ub = self.instance.Tensor(self.dtype, (ub_size, ),
                                              name="src_ub",
                                              scope=tik.scope_ubuf)
                burst_len = ub_size // 16
                self.instance.data_move(
                    src_ub, self.x_gm[each_len * block_id + loop_id * ub_size],
                    0, 1, burst_len, 0, 0)
                self.instance.data_move(
                    self.y_gm[each_len * block_id + loop_id * ub_size], src_ub,
                    0, 1, burst_len, 0, 0)
            with self.instance.if_scope(ub_mod > 0):
                src_ub = self.instance.Tensor(self.dtype, (ub_size, ),
                                              name="src_ub",
                                              scope=tik.scope_ubuf)
                with self.instance.if_scope(
                        tik.all(block_num > 1, ub_mod % 16 != 0)):
                    src_ub_1 = self.instance.Tensor(self.dtype, (16, ),
                                                    name="src_ub_1",
                                                    scope=tik.scope_ubuf)
                    index = each_len * block_id + ub_loop * ub_size
                    with self.instance.if_scope(ub_mod >= 16):
                        burst_len = ub_mod // 16
                        self.instance.data_move(src_ub, self.x_gm[index], 0, 1,
                                                burst_len, 0, 0)
                        self.instance.data_move(self.y_gm[index], src_ub, 0, 1,
                                                burst_len, 0, 0)
                        offset = index + burst_len * 16 - 16 + ub_mod % 16
                        self.instance.data_move(src_ub_1, self.x_gm[offset], 0,
                                                1, 1, 0, 0)
                        self.instance.data_move(self.y_gm[offset], src_ub_1, 0,
                                                1, 1, 0, 0)
                    with self.instance.else_scope():
                        offset = index - 16 + ub_mod % 16
                        self.instance.data_move(src_ub_1, self.x_gm[offset], 0,
                                                1, 1, 0, 0)
                        self.instance.data_move(self.y_gm[offset], src_ub_1, 0,
                                                1, 1, 0, 0)
                with self.instance.else_scope():
                    burst_len = (ub_mod + 15) // 16
                    self.instance.data_move(
                        src_ub,
                        self.x_gm[each_len * block_id + ub_loop * ub_size], 0,
                        1, burst_len, 0, 0)
                    self.instance.data_move(
                        self.y_gm[each_len * block_id + ub_loop * ub_size],
                        src_ub, 0, 1, burst_len, 0, 0)
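
The header of move_without_transform packs the whole tiling into a few integer divisions: how many cores to launch, how many elements each core handles, and whether to double-buffer. Extracted into a standalone sketch with assumed constants (UB_SIZE_B and AICORE_NUM here are illustrative):

def tiling_sketch(shape0, ub_size_b=248 * 1024, aicore_num=32, dsize=2):
    """Sketch of the tiling arithmetic in move_without_transform."""
    # Elements one UB working buffer can hold, rounded down to 16.
    ub_size = (ub_size_b - 1024) // 2 // dsize // 16 * 16
    if shape0 <= 16:
        block_num = 1                       # too small to split across cores
    else:
        block_num = min(shape0 // 16, aicore_num)
    each_len = shape0 // block_num          # per-core element count
    each_mod = shape0 % block_num           # remainder goes to the last core
    thread_num = 2 if each_len // ub_size > 1 else 1   # double buffering
    return block_num, each_len, each_mod, thread_num

print(tiling_sketch(100000))                # (32, 3125, 0, 1)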
Example #19
    def compute_mode_3(self, half_ub_size, block_id):
        """
        compute for tiling mode 3

        Parameters
        ----------
        half_ub_size: bytes of half UB
        block_id: id of ai core

        Returns
        -------
        None
        """
        tik_instance = self.tik_instance
        indices_dsize = self.indices_dsize
        params_dsize = self.params_dsize

        with tik_instance.if_scope(block_id < self.need_core_num):
            indices_ub = tik_instance.Tensor(self.indices_dtype, (half_ub_size // indices_dsize,),
                                             name="indices_ub", scope=tik.scope_ubuf)
            res_ub = tik_instance.Tensor(self.params_dtype, (half_ub_size // params_dsize,),
                                         name="res_ub", scope=tik.scope_ubuf)

            burst_len_row = self.params_row * params_dsize // BLOCK_SIZE

            with tik_instance.for_range(0, self.params_pre) as pre_i:
                gm_offset_base = pre_i * self.params_axis

                indices_offset = tik_instance.Scalar(dtype=self.indices_dtype, name="indices_offset")

                with tik_instance.for_range(0, self.indices_loop_num) as indices_loop_i:
                    indices_offset.set_as(block_id * self.indices_num_each_core +
                                          indices_loop_i * self.indices_row_num_once)
                    # copy indices data to ub from gm
                    tik_instance.data_move(indices_ub, self.indices[indices_offset], 0, 1,
                                           ceil_value(self.indices_row_num_once * indices_dsize, BLOCK_SIZE), 0, 0)

                    burst_len_res = self.row_num_once_ub * self.params_row * params_dsize // BLOCK_SIZE
                    inner_indices_offset = tik_instance.Scalar(dtype=self.indices_dtype, name="inner_indices_offset")
                    output_offset = tik_instance.Scalar(dtype=self.indices_dtype, name="output_offset")
                    with tik_instance.for_range(0, self.inner_loop_num) as inner_loop_i:
                        inner_indices_offset.set_as(inner_loop_i * self.row_num_once_ub)
                        output_offset.set_as((pre_i * self.indices_num +
                                              block_id * self.indices_num_each_core +
                                              indices_loop_i * self.indices_row_num_once +
                                              inner_loop_i * self.row_num_once_ub)
                                             * self.params_row)

                        self.indices_inner_gather(indices_ub, res_ub, self.row_num_once_ub,
                                                  inner_indices_offset, gm_offset_base, output_offset,
                                                  burst_len_row, burst_len_res)

                    with tik_instance.if_scope(self.row_num_once_tail_ub > 0):
                        burst_len_res = self.row_num_once_tail_ub * self.params_row * params_dsize // BLOCK_SIZE
                        inner_indices_offset.set_as(self.inner_loop_num * self.row_num_once_ub)
                        output_offset.set_as((pre_i * self.indices_num +
                                              block_id * self.indices_num_each_core +
                                              indices_loop_i * self.indices_row_num_once +
                                              self.inner_loop_num * self.row_num_once_ub)
                                             * self.params_row)

                        self.indices_inner_gather(indices_ub, res_ub, self.row_num_once_tail_ub,
                                                  inner_indices_offset, gm_offset_base, output_offset,
                                                  burst_len_row, burst_len_res)

                with tik_instance.if_scope(self.indices_row_num_last > 0):
                    burst_len_res = self.row_num_last_ub * self.params_row * params_dsize // BLOCK_SIZE
                    indices_offset.set_as(block_id * self.indices_num_each_core +
                                          self.indices_loop_num * self.indices_row_num_once)
                    # copy indices data to ub from gm
                    tik_instance.data_move(indices_ub, self.indices[indices_offset], 0, 1,
                                           ceil_value(self.indices_row_num_last * indices_dsize, BLOCK_SIZE), 0, 0)

                    inner_indices_offset = tik_instance.Scalar(dtype=self.indices_dtype, name="inner_indices_offset")
                    output_offset = tik_instance.Scalar(dtype=self.indices_dtype, name="output_offset")
                    with tik_instance.for_range(0, self.inner_loop_num_last) as inner_loop_i:
                        inner_indices_offset.set_as(inner_loop_i * self.row_num_last_ub)
                        output_offset.set_as((pre_i * self.indices_num +
                                              block_id * self.indices_num_each_core +
                                              self.indices_loop_num * self.indices_row_num_once +
                                              inner_loop_i * self.row_num_last_ub)
                                             * self.params_row)

                        self.indices_inner_gather(indices_ub, res_ub, self.row_num_last_ub,
                                                  inner_indices_offset, gm_offset_base, output_offset,
                                                  burst_len_row, burst_len_res)

                    with tik_instance.if_scope(self.row_num_last_tail_ub > 0):
                        burst_len_res = self.row_num_last_tail_ub * self.params_row * params_dsize // BLOCK_SIZE
                        inner_indices_offset.set_as(self.inner_loop_num_last * self.row_num_last_ub)
                        output_offset.set_as((pre_i * self.indices_num +
                                              block_id * self.indices_num_each_core +
                                              self.indices_loop_num * self.indices_row_num_once +
                                              self.inner_loop_num_last * self.row_num_last_ub)
                                             * self.params_row)

                        self.indices_inner_gather(indices_ub, res_ub, self.row_num_last_tail_ub,
                                                  inner_indices_offset, gm_offset_base, output_offset,
                                                  burst_len_row, burst_len_res)

                with tik_instance.if_scope(tik.all(self.indices_num_remaining > 0,
                                                   block_id == self.tail_process_core)):
                    indices_offset.set_as(self.need_core_num * self.indices_num_each_core)
                    # copy indices data to ub from gm
                    tik_instance.data_move(indices_ub, self.indices[indices_offset], 0, 1,
                                           ceil_value(self.indices_num_remaining * indices_dsize, BLOCK_SIZE), 0, 0)

                    output_offset = tik_instance.Scalar(dtype=self.indices_dtype, name="output_offset")
                    output_offset.set_as((pre_i * self.indices_num +
                                          self.need_core_num * self.indices_num_each_core)
                                         * self.params_row)
                    burst_len_res_tail = self.indices_num_remaining * self.params_row * params_dsize // BLOCK_SIZE

                    self.indices_inner_gather(indices_ub, res_ub, self.indices_num_remaining,
                                              0, gm_offset_base, output_offset, burst_len_row, burst_len_res_tail)
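
Note that every burst length in this mode divides by BLOCK_SIZE exactly (plain // with no rounding up), which is only safe when a row's byte size is a multiple of 32B; presumably mode 3 is reserved for such rows. The unaligned path (compute_mode_1 below) instead rounds up with ceil_value and patches the tail with an overlapping write. Both conventions side by side, with ceil_value reconstructed from how it is used (an assumption, not the original helper):

BLOCK_SIZE = 32

def ceil_value(value, factor):
    # assumed to match the helper's use in these kernels: ceiling division
    return (value + factor - 1) // factor

aligned_row_bytes = 16 * 4      # 16 fp32 elements = 64B
ragged_row_bytes = 7 * 4        # 7 fp32 elements = 28B
assert aligned_row_bytes // BLOCK_SIZE == 2           # mode-3 style: exact
assert ceil_value(ragged_row_bytes, BLOCK_SIZE) == 1  # mode-1 style: round up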
Example #20
    def data_move_case_one(self, tik_instance, ub_ori, ub_trans, core_loop,
                           sum_core, align_loop, remainder, num_data_one_loop):
        """
        the data_move process when UB can hold N1 * N0 * X0 data and
        N % 16 != 0
        """
        is_x_padding = tik_instance.Scalar("uint64", init_value=0)
        with tik_instance.for_range(0, core_loop) as num_core_loop:

            total_core_loop = sum_core + num_core_loop
            num_x0 = total_core_loop
            # zero padding if C != 4
            with tik_instance.if_scope(num_core_loop % align_loop == 0):
                if self.src_shape[2] != C0:
                    self.vector_dup_zero(tik_instance, ub_ori,
                                         align_loop * num_data_one_loop, 0)

            src_gm_index = num_x0 * self.src_shape[3] * self.src_shape[2] * \
                           CUBE_SIZE // C0
            src_ub_index = (num_core_loop % align_loop) * num_data_one_loop
            if C0 * self.src_shape[0] * self.src_shape[1] % CUBE_SIZE != 0:
                with tik_instance.if_scope(num_x0 == self.dst_shape[0] - 1):
                    is_x_padding.set_as(1)
                    with tik_instance.for_range(
                            0, self.src_shape[0] * self.src_shape[1] %
                        (CUBE_SIZE // C0)) as num_cn:
                        with tik_instance.for_range(0, self.src_shape[2])\
                                as num_row:
                            tik_instance.data_move(
                                ub_ori[src_ub_index + (num_cn * C0 + num_row) *
                                       self.dst_shape[1] * self.dst_shape[2]],
                                self.src_gm[src_gm_index +
                                            (num_cn * self.src_shape[2] +
                                             num_row) * self.src_shape[3]], 0,
                                1, self.dst_shape[1] * self.dst_shape[2] //
                                self.num_data, 0, 0)
                with tik_instance.else_scope():
                    with tik_instance.for_range(0, CUBE_SIZE // C0) as num_cn:
                        with tik_instance.for_range(0, self.src_shape[2])\
                                as num_row:
                            tik_instance.data_move(
                                ub_ori[src_ub_index + (num_cn * C0 + num_row) *
                                       self.dst_shape[1] * self.dst_shape[2]],
                                self.src_gm[src_gm_index +
                                            (num_cn * self.src_shape[2] +
                                             num_row) * self.src_shape[3]], 0,
                                1, self.dst_shape[1] * self.dst_shape[2] //
                                self.num_data, 0, 0)
            else:
                with tik_instance.for_range(0, CUBE_SIZE // C0) as num_cn:
                    with tik_instance.for_range(0, self.src_shape[2])\
                            as num_row:
                        tik_instance.data_move(
                            ub_ori[src_ub_index + (num_cn * C0 + num_row) *
                                   self.dst_shape[1] * self.dst_shape[2]],
                            self.src_gm[src_gm_index +
                                        (num_cn * self.src_shape[2] +
                                         num_row) * self.src_shape[3]], 0, 1,
                            self.dst_shape[1] * self.dst_shape[2] //
                            self.num_data, 0, 0)

            with tik_instance.if_scope(
                    tik.all((num_core_loop + 1) % align_loop == 0,
                            num_core_loop != core_loop - 1)):
                self.data_rearrange_case_zero(tik_instance, ub_ori, ub_trans,
                                              align_loop, is_x_padding)
                dst_gm_index = (num_x0 - (align_loop - 1)) * num_data_one_loop
                tik_instance.data_move(
                    self.dst_gm[dst_gm_index], ub_trans[0], 0, 1,
                    align_loop * num_data_one_loop // self.num_data, 0, 0)

            with tik_instance.if_scope(num_core_loop == core_loop - 1):
                self.data_rearrange_case_zero(tik_instance, ub_ori, ub_trans,
                                              remainder, is_x_padding)
                dst_gm_index = (num_x0 - (remainder - 1)) * num_data_one_loop
                tik_instance.data_move(
                    self.dst_gm[dst_gm_index], ub_trans[0], 0, 1,
                    remainder * num_data_one_loop // self.num_data, 0, 0)
Example #21
    def compute_mode_1(self, half_ub_size, block_id):
        """
        compute for tiling mode 1

        Parameters
        ----------
        half_ub_size: bytes of half UB
        block_id: id of ai core

        Returns
        -------
        None
        """
        tik_instance = self.tik_instance
        indices_dsize = self.indices_dsize
        params_dsize = self.params_dsize

        with tik_instance.if_scope(block_id < self.need_core_num):
            indices_ub = tik_instance.Tensor(self.indices_dtype, ((half_ub_size + 256) // indices_dsize,),
                                             name="indices_ub", scope=tik.scope_ubuf)
            res_ub = tik_instance.Tensor(self.params_dtype, ((half_ub_size + BLOCK_SIZE) // params_dsize,),
                                         name="res_ub", scope=tik.scope_ubuf)

            burst_len_row = ceil_value(self.params_row * params_dsize, BLOCK_SIZE)

            with tik_instance.for_range(0, self.params_pre) as pre_i:
                gm_offset_base = pre_i * self.params_axis

                # indices_num_each_core = indices_row_num_once * indices_loop_num + indices_row_num_last
                with tik_instance.for_range(0, self.indices_loop_num) as indices_loop_i:
                    indices_offset = block_id * self.indices_num_each_core + \
                                     indices_loop_i * self.indices_row_num_once
                    # copy indices data to ub from gm
                    tik_instance.data_move(indices_ub, self.indices[indices_offset], 0, 1,
                                           ceil_value(self.indices_row_num_once * indices_dsize, BLOCK_SIZE), 0, 0)

                    # indices_row_num_once = row_num_once_ub * inner_loop_num + row_num_once_tail_ub
                    # a1. row_num_once_ub * inner_loop_num
                    burst_len_res = ceil_value(self.row_num_once_ub * self.params_row * params_dsize, BLOCK_SIZE)

                    with tik_instance.for_range(0, self.inner_loop_num) as inner_loop_i:
                        inner_indices_offset = inner_loop_i * self.row_num_once_ub
                        output_offset = (pre_i * self.indices_num +
                                         block_id * self.indices_num_each_core +
                                         indices_loop_i * self.indices_row_num_once +
                                         inner_loop_i * self.row_num_once_ub) * self.params_row

                        self.indices_inner_gather_1(indices_ub, res_ub, self.row_num_once_ub,
                                                    inner_indices_offset, gm_offset_base, output_offset,
                                                    burst_len_row, burst_len_res)

                    # a2. row_num_once_tail_ub
                    with tik_instance.if_scope(self.row_num_once_tail_ub > 0):
                        burst_len_res = ceil_value(self.row_num_once_tail_ub * self.params_row * params_dsize,
                                                   BLOCK_SIZE)
                        inner_indices_offset = self.inner_loop_num * self.row_num_once_ub
                        output_offset = (pre_i * self.indices_num +
                                         block_id * self.indices_num_each_core +
                                         indices_loop_i * self.indices_row_num_once +
                                         self.inner_loop_num * self.row_num_once_ub) * self.params_row

                        self.indices_inner_gather_last_1(indices_ub, res_ub, self.row_num_once_tail_ub,
                                                         inner_indices_offset, gm_offset_base, output_offset,
                                                         burst_len_row, burst_len_res)

                with tik_instance.if_scope(self.indices_row_num_last > 0):
                    burst_len_res = ceil_value(self.row_num_last_ub * self.params_row * params_dsize, BLOCK_SIZE)
                    indices_offset = block_id * self.indices_num_each_core + \
                                     self.indices_loop_num * self.indices_row_num_once
                    # copy indices data to ub from gm
                    tik_instance.data_move(indices_ub, self.indices[indices_offset], 0, 1,
                                           ceil_value(self.indices_row_num_last * indices_dsize, BLOCK_SIZE), 0, 0)

                    with tik_instance.for_range(0, self.inner_loop_num_last) as inner_loop_i:
                        inner_indices_offset = inner_loop_i * self.row_num_last_ub
                        output_offset = (pre_i * self.indices_num +
                                         block_id * self.indices_num_each_core +
                                         self.indices_loop_num * self.indices_row_num_once +
                                         inner_loop_i * self.row_num_last_ub) * self.params_row

                        self.indices_inner_gather_1(indices_ub, res_ub, self.row_num_last_ub,
                                                    inner_indices_offset, gm_offset_base, output_offset,
                                                    burst_len_row, burst_len_res)

                    with tik_instance.if_scope(self.row_num_last_tail_ub > 0):
                        burst_len_res = ceil_value(self.row_num_last_tail_ub * self.params_row * params_dsize,
                                                   BLOCK_SIZE)
                        inner_indices_offset = self.inner_loop_num_last * self.row_num_last_ub
                        output_offset = (pre_i * self.indices_num +
                                         block_id * self.indices_num_each_core +
                                         self.indices_loop_num * self.indices_row_num_once +
                                         self.inner_loop_num_last * self.row_num_last_ub) * self.params_row

                        self.indices_inner_gather_last_1(indices_ub, res_ub, self.row_num_last_tail_ub,
                                                         inner_indices_offset, gm_offset_base, output_offset,
                                                         burst_len_row, burst_len_res)

                with tik_instance.if_scope(tik.all(self.indices_num_remaining > 0,
                                                   block_id == self.tail_process_core)):
                    indices_offset = self.need_core_num * self.indices_num_each_core
                    # copy indices data to ub from gm
                    tik_instance.data_move(indices_ub, self.indices[indices_offset], 0, 1,
                                           ceil_value(self.indices_num_remaining * indices_dsize, BLOCK_SIZE), 0, 0)

                    output_offset = (pre_i * self.indices_num +
                                     self.need_core_num * self.indices_num_each_core) * self.params_row
                    burst_len_res_tail = ceil_value(self.indices_num_remaining * self.params_row * params_dsize,
                                                    BLOCK_SIZE)

                    self.indices_inner_gather_1(indices_ub, res_ub, self.indices_num_remaining,
                                                0, gm_offset_base, output_offset, burst_len_row, burst_len_res_tail)
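
All the output_offset expressions above are one flattening formula: the result is laid out as [params_pre, indices_num, params_row], so the offset is (pre_i * indices_num + global index position) * params_row, where the global position accumulates the per-core, per-loop, and inner offsets. A quick check with toy sizes:

params_pre, indices_num, params_row = 3, 10, 4
pre_i, global_idx = 2, 7     # global_idx = block_id*each_core + loops + inner
output_offset = (pre_i * indices_num + global_idx) * params_row
assert output_offset == 108  # row 27 of the flattened output, times 4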
Example #22
    def data_move(self, input_dict):
        """
      move data from ub to gm

      Parameters
      ----------
        input_dict: input_dict is a dict, the keys as follow:
                x_ub: x_ub is a tensor,store data from gm
                src_start: the start address of src tensor
                dest_start: the start address of dest tensor
                element_num: each continuous segment
                block_num: blcok number
      Returns
      -------
      None
      """
        x_ub = input_dict.get("x_ub")
        element_num = input_dict.get("element_num")
        block_num = input_dict.get("block_num")
        loop_num, last_ub_num = get_loop_param(element_num,
                                               self.one_max_size)
        cur_size = self.instance.Scalar("int32")
        cur_size.set_as(self.one_max_size * self.dsize)
        ub_num = self.instance.Scalar("int32")
        ub_num.set_as(self.one_max_size)
        offset_in = self.instance.Scalar("int32")
        offset_in.set_as(input_dict.get("src_start"))
        offset_out = self.instance.Scalar("int32")
        offset_out.set_as(input_dict.get("dest_start"))
        each_burst_num = constant.BLOCK_SIZE // self.dsize
        with self.instance.for_range(0, loop_num) as cycle:
            with self.instance.if_scope(cycle == loop_num - 1):
                cur_size.set_as(last_ub_num * self.dsize)
                ub_num.set_as(last_ub_num)
            n_burst = common_util.get_datamove_nburst(self.instance,
                                                      cur_size)
            mod = cur_size % constant.BLOCK_SIZE
            with self.instance.if_scope(
                    tik.all(cycle == loop_num - 1, mod != 0, block_num > 1)):
                x_ub_tail = self.instance.Tensor(self.dtype, (32,),
                                                 name="x_ub_tail",
                                                 scope=tik.scope_ubuf)
                self.instance.data_move(x_ub_tail,
                                        self.x_gm[offset_in +
                                                  ub_num - each_burst_num],
                                        constant.SID,
                                        constant.DEFAULT_NBURST, 1,
                                        constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)
                self.instance.data_move(self.y_gm[offset_out +
                                                  ub_num - each_burst_num],
                                        x_ub_tail,
                                        constant.SID,
                                        constant.DEFAULT_NBURST, 1,
                                        constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)
                with self.instance.if_scope(cur_size > constant.BLOCK_SIZE):
                    self.instance.data_move(x_ub,
                                            self.x_gm[offset_in],
                                            constant.SID,
                                            constant.DEFAULT_NBURST,
                                            n_burst - 1,
                                            constant.STRIDE_ZERO,
                                            constant.STRIDE_ZERO)
                    self.instance.data_move(self.y_gm[offset_out],
                                            x_ub,
                                            constant.SID,
                                            constant.DEFAULT_NBURST,
                                            n_burst - 1,
                                            constant.STRIDE_ZERO,
                                            constant.STRIDE_ZERO)
            with self.instance.else_scope():
                self.instance.data_move(x_ub,
                                        self.x_gm[offset_in],
                                        constant.SID, constant.DEFAULT_NBURST,
                                        n_burst, constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)
                self.instance.data_move(self.y_gm[offset_out],
                                        x_ub,
                                        constant.SID,
                                        constant.DEFAULT_NBURST, n_burst,
                                        constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)
            offset_in.set_as(offset_in + ub_num)
            offset_out.set_as(offset_out + ub_num)
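
The tail branch above exists because a gm burst always covers whole 32-byte
blocks: when several cores write adjacent output segments, a padded final
burst would spill into the next core's region. The fix is to re-copy a full
block that ends exactly at the segment boundary. A rough standalone sketch
under that assumption (names are illustrative):

def tail_burst_start(segment_elems, dsize, block_size=32):
    # start index of a full final burst ending exactly at the segment end
    each_burst_num = block_size // dsize
    return segment_elems - each_burst_num

# fp16 segment of 100 elements: re-copy elements [84, 100) as one full
# block, then move the leading part with n_burst - 1 ordinary bursts
start = tail_burst_start(100, 2)  # -> 84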
Exemple #23
    def proposal_pooling_fp32(self, proposal_id, c1_loop_index):
        """
        max pooling from the h direction, then max pooling from the w
        direction, for fp32

        Parameters
        ----------
        proposal_id: which proposal is now being processed
        c1_loop_index: c1 index of the feature map 4C0

        Returns
        -------
        None
        """
        scalar_roi_start_w = self.tik_instance.Scalar("int32", \
                name="scalar_roi_start_w")
        scalar_roi_start_w.set_as(self.roi_start_w[0, proposal_id])

        scalar_roi_start_h = self.tik_instance.Scalar("int32", \
                name="scalar_roi_start_h")

        scalar_roi_bin_h = self.tik_instance.Scalar("int32",
                                                    name="scalar_roi_bin_h")

        scalar_roi_width = self.tik_instance.Scalar("int32",
                                                    name="scalar_roi_width")
        scalar_roi_width.set_as(self.roi_width[proposal_id])

        scalar_roi_height = self.tik_instance.Scalar("int32",
                                                     name="scalar_roi_height")
        scalar_roi_height.set_as(self.roi_height[proposal_id])

        pooled_res = self.tik_instance.Tensor(FP32, \
              shape=(FOUR_C0, self.pooled_h, self.pooled_w, self.fm_c0), \
              scope=tik.scope_ubuf, name="pooled_res")
        res_size = FOUR_C0 * self.pooled_h * self.pooled_w * self.fm_c0

        if res_size // DIGIT_64 >= 1:
            self.tik_instance.vec_dup(DIGIT_256 // self.dsize,
                                      pooled_res[0, 0, 0, 0], 0,
                                      res_size // DIGIT_64, DIGIT_8)
        if res_size % DIGIT_64 != 0:  # tail
            self.tik_instance.vec_dup(
                res_size % DIGIT_64,
                pooled_res[res_size // DIGIT_64 * DIGIT_64], 0, DIGIT_1,
                DIGIT_8)

        pooled_h_res = self.tik_instance.Tensor(FP32, \
                shape=(FOUR_C0, 1, self.fm_w_align, self.fm_c0), \
                scope=tik.scope_ubuf, name="pooled_h_res")
        pooled_h_res_size = FOUR_C0 * 1 * self.fm_w_align * self.fm_c0

        with self.tik_instance.for_range(0, self.pooled_h) as pooled_h_i:
            scalar_roi_start_h.set_as(self.roi_start_h[pooled_h_i,
                                                       proposal_id])
            scalar_roi_bin_h.set_as(self.roi_bin_h[pooled_h_i, proposal_id])

            with self.tik_instance.if_scope(
                    tik.all(scalar_roi_bin_h != 0, scalar_roi_width != 0)):
                self.tik_instance.vec_dup(DIGIT_256 // self.dsize,
                                          pooled_h_res[0, 0, 0, 0], 0,
                                          pooled_h_res_size // DIGIT_64,
                                          DIGIT_8)

                if self.fm_h * self.fm_w < DIGIT_128:
                    with self.tik_instance.for_range(0, scalar_roi_width) \
                            as w_index:
                        self.tik_instance.vmax(
                            FOUR_C0 * self.fm_c0, pooled_h_res[0, 0, w_index,
                                                               0],
                            self.proposal_fm_data[0, scalar_roi_start_h,
                                                  scalar_roi_start_w + w_index,
                                                  0],
                            pooled_h_res[0, 0, w_index, 0], scalar_roi_bin_h,
                            self.fm_w_align * C0 * self.dsize // BLOCK_SIZE,
                            self.fm_h * self.fm_w * C0 * self.dsize //
                            BLOCK_SIZE,
                            self.fm_w_align * C0 * self.dsize // BLOCK_SIZE, 0,
                            self.fm_w * C0 * self.dsize // BLOCK_SIZE, 0)
                else:
                    with self.tik_instance.for_range(0, FOUR_C0) as c0_i:
                        with self.tik_instance.if_scope(
                                scalar_roi_width <= DIGIT_4):
                            self.tik_instance.vec_max(
                                scalar_roi_width * self.fm_c0,
                                pooled_h_res[c0_i, 0, 0, 0],
                                self.proposal_fm_data[c0_i, scalar_roi_start_h,
                                                      scalar_roi_start_w, 0],
                                pooled_h_res[c0_i, 0, 0,
                                             0], scalar_roi_bin_h, 0,
                                self.fm_w * C0 * self.dsize // BLOCK_SIZE, 0)
                        with self.tik_instance.else_scope():
                            with self.tik_instance.for_range(0, \
                                    scalar_roi_width // DIGIT_4) as loop_4w_i:
                                self.tik_instance.vec_max(
                                    DIGIT_256 // self.dsize,
                                    pooled_h_res[c0_i, 0, DIGIT_4 * loop_4w_i,
                                                 0],
                                    self.proposal_fm_data[c0_i,
                                                          scalar_roi_start_h,
                                                          scalar_roi_start_w +
                                                          DIGIT_4 * loop_4w_i,
                                                          0],
                                    pooled_h_res[c0_i, 0, DIGIT_4 * loop_4w_i,
                                                 0], scalar_roi_bin_h, 0,
                                    self.fm_w * C0 * self.dsize // BLOCK_SIZE,
                                    0)
                            with self.tik_instance.if_scope(
                                    scalar_roi_width % DIGIT_4 != 0):
                                tmp_w = scalar_roi_width // DIGIT_4 * DIGIT_4
                                self.tik_instance.vec_max(
                                    (scalar_roi_width - tmp_w)*self.fm_c0,
                                    pooled_h_res[c0_i, 0, tmp_w, 0],
                                    self.proposal_fm_data[c0_i, \
                                          scalar_roi_start_h, \
                                          scalar_roi_start_w + tmp_w, 0],
                                    pooled_h_res[c0_i, 0, tmp_w, 0],
                                    scalar_roi_bin_h,
                                    0, self.fm_w*C0*self.dsize // BLOCK_SIZE, 0)

                self.proposal_pooling_w(proposal_id, pooled_h_i, pooled_res,
                                        pooled_h_res)

        # move result to out
        with self.tik_instance.if_scope(c1_loop_index != self.c1_looptime - 1):
            self.tik_instance.data_move(
                self.y[self.ouput_proposal_offset + self.calced_rois +
                       proposal_id, c1_loop_index * FOUR_C0, 0, 0, 0],
                pooled_res[0, 0, 0, 0], 0, 1, FOUR_C0 * self.pooled_h *
                self.pooled_w * C0 * self.dsize // BLOCK_SIZE, 0, 0)
        with self.tik_instance.else_scope():  # tail
            self.tik_instance.data_move(
                self.y[self.ouput_proposal_offset +
                       self.calced_rois+proposal_id,
                       (self.c1_looptime - 1) * FOUR_C0, 0, 0, 0],
                pooled_res[0, 0, 0, 0],
                0,
                1,
                self.tail_c0_num * self.pooled_h * self.pooled_w * \
                    C0 * self.dsize // BLOCK_SIZE,
                0,
                0)
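
The kernel reduces each ROI bin in two passes: vmax over the bin's h extent
into pooled_h_res, then a w reduction via proposal_pooling_w into pooled_res.
A NumPy sketch of the same two-stage reduction, using an illustrative
(H, W, C0) layout rather than the kernel's real 4C0 tiling:

import numpy as np

def roi_bin_max(fm, h_start, bin_h, w_start, bin_w):
    # fm: (H, W, C0) feature-map slice for one group of channels
    h_res = fm[h_start:h_start + bin_h].max(axis=0)    # max over h -> (W, C0)
    return h_res[w_start:w_start + bin_w].max(axis=0)  # max over w -> (C0,)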
Exemple #24
def do_crop_and_resize_compute_one_core(box_num_sigment, obj, box_num_offset):
    """do crop and resize in one core
        step 1 read boxes from boxes and calc h_top_index/h_bottom_index/h_lerp/w_left_index/w_right_index/w_lerp
        step 2 read input_batch_num from box_index
        step 3 copy the 4 neighbor points (total C = C1*C0) into ub
               using input_batch_num/h_top_index/h_bottom_index/w_left_index/w_right_index
        step 4 calculate the output
               top = top_left + (top_right - top_left) * x_lerp
               bottom = bottom_left + (bottom_right - bottom_left) * x_lerp
               out = top + (bottom - top) * y_lerp

    Parameters:
    ----------
    box_num_sigment : int.
        the number of crop boxes processed by this core
    obj : class.
        crop_and_resize parameter object
    box_num_offset: int
        offset into the boxes tensor for this core

    Returns
    -------
    None
    """
    tik_instance = obj.get_tik_instance()
    # get float32 index ub
    index_ub = obj.index_ub
    men_len = get_ceil_int(box_num_sigment*4, obj.boxes_vector_num) * obj.boxes_vector_num

    # apply ub mem for index
    boxes_ub_small = obj.apply_mem((men_len,), "boxes_ub_h1", tik.scope_ubuf, obj.boxes_type)
    boxes_ub_big = obj.apply_mem((men_len,), "boxes_ub_h2", tik.scope_ubuf, obj.boxes_type)
    boxes_ub_scale = obj.apply_mem((men_len,), "boxes_ub_scale", tik.scope_ubuf, obj.boxes_type)
    copy_burst_len = get_ceil_int(box_num_sigment*4, obj.boxes_block_num)

    # init ub for input offset
    batch_offset_ub = obj.apply_mem((obj.boxes_vector_num,), "batch_offset_ub", tik.scope_ubuf, "int32")
    height_offset_ub = obj.apply_mem((obj.boxes_vector_num,), "height_offset_ub", tik.scope_ubuf, "int32")
    width_offset_ub = obj.apply_mem((obj.boxes_vector_num,), "width_offset_ub", tik.scope_ubuf, "int32")
    tik_instance.vector_dup(obj.boxes_vector_num, batch_offset_ub,
                            obj.image_c1*obj.image_c0*obj.image_height*obj.image_width, 1, 1, 8)
    tik_instance.vector_dup(obj.boxes_vector_num, height_offset_ub,
                            obj.image_c0*obj.image_width, 1, 1, 8)
    tik_instance.vector_dup(obj.boxes_vector_num, width_offset_ub,
                            obj.image_c0, 1, 1, 8)

    # copy boxes in boxes_ub_small
    tik_instance.data_move(boxes_ub_small, obj.input_gm_list[1][box_num_offset*4],
                           0, 1, copy_burst_len, 0, 0)
    copy_burst_len = get_ceil_int(box_num_sigment*4 - 2, obj.boxes_block_num)
    # copy boxes[2:] into boxes_ub_big
    tik_instance.data_move(boxes_ub_big, obj.input_gm_list[1][box_num_offset*4 + 2],
                           0, 1, copy_burst_len, 0, 0)
    # calc boxes[2:] - boxes, i.e. y2 - y1 and x2 - x1
    tik_func_vcomple(tik_instance, "vsub", boxes_ub_scale, boxes_ub_big, boxes_ub_small, men_len)
    if obj.crop_height <= 1 or obj.crop_width <= 1:
        tik_func_vcomple(tik_instance, "vadd", boxes_ub_big, boxes_ub_big, boxes_ub_small, men_len)

    # calc resize scale for h and w
    repeat_time = get_ceil_int(box_num_sigment*4, obj.boxes_vector_num)
    if obj.crop_height > 1:
        # to get scale_h: scale * (image_height - 1) / (crop_height - 1)
        tik_instance.vmuls([obj.height_mask_list[0], obj.height_mask_list[1]],
                           boxes_ub_scale, boxes_ub_scale, (obj.image_height - 1) / (obj.crop_height - 1),
                           repeat_time, 1, 1, 8, 8)
    if obj.crop_width > 1:
        # to get scale_w:  scale * (image_width - 1) / (crop_width - 1)
        tik_instance.vmuls(obj.width_mask_list,
                           boxes_ub_scale, boxes_ub_scale, (obj.image_width - 1) / (obj.crop_width - 1),
                           repeat_time, 1, 1, 8, 8)
    if obj.crop_height > 1:
        # to get h_small: h_small * (image_height - 1)
        tik_instance.vmuls(obj.height_mask_list,
                           boxes_ub_small, boxes_ub_small, obj.image_height - 1,
                           repeat_time, 1, 1, 8, 8)
    else:
        # to get h_small: (h_small + h_big) * (image_height - 1) * 0.5
        tik_instance.vmuls(obj.height_mask_list,
                           boxes_ub_small, boxes_ub_big, 0.5,
                           repeat_time, 1, 1, 8, 8)
        tik_instance.vmuls(obj.height_mask_list,
                           boxes_ub_small, boxes_ub_small, obj.image_height - 1,
                           repeat_time, 1, 1, 8, 8)

    if obj.crop_width > 1:
        # to get w_small: w_small * (image_width - 1)
        tik_instance.vmuls(obj.width_mask_list,
                           boxes_ub_small, boxes_ub_small, obj.image_width - 1,
                           repeat_time, 1, 1, 8, 8)
    else:
        # to get w_small: (w_small + w_big) * (image_width - 1) * 0.5
        tik_instance.vmuls(obj.width_mask_list,
                           boxes_ub_small, boxes_ub_big, 0.5,
                           repeat_time, 1, 1, 8, 8)
        tik_instance.vmuls(obj.width_mask_list,
                           boxes_ub_small, boxes_ub_small, obj.image_width - 1,
                           repeat_time, 1, 1, 8, 8)

    # box_index process for one segment
    box_index_ub = obj.apply_mem((get_ceil_int(box_num_sigment, obj.boxes_block_num)*obj.boxes_block_num,),
                                 "box_index_ub", tik.scope_ubuf, "int32")
    copy_burst_len = get_ceil_int(box_num_sigment, obj.boxes_block_num)
    tik_instance.data_move(box_index_ub, obj.input_gm_list[2][box_num_offset],
                           0, 1, copy_burst_len, 0, 0)
    tik_func_vcomple(tik_instance, "vmul", box_index_ub, box_index_ub, batch_offset_ub, box_num_sigment, src1_rep=0)

    with tik_instance.for_range(0, box_num_sigment) as _box_idx:
        _out_batch_idx = _box_idx + box_num_offset
        scaler_h_small = tik_instance.Scalar(dtype=boxes_ub_small.dtype)
        scaler_w_small = tik_instance.Scalar(dtype=boxes_ub_small.dtype)
        scaler_h_scale = tik_instance.Scalar(dtype=boxes_ub_small.dtype)
        scaler_w_scale = tik_instance.Scalar(dtype=boxes_ub_small.dtype)
        # read scale for h and w
        scaler_h_small.set_as(boxes_ub_small[_box_idx*4])
        scaler_w_small.set_as(boxes_ub_small[_box_idx*4 + 1])
        scaler_h_scale.set_as(boxes_ub_scale[_box_idx*4])
        scaler_w_scale.set_as(boxes_ub_scale[_box_idx*4 + 1])

        input_boxes_in_h = obj.apply_mem((get_ceil_int(obj.crop_height, obj.boxes_block_num) * obj.boxes_block_num,),
                                         "input_boxes_in_h", tik.scope_ubuf, obj.boxes_type)
        tik_instance.vmuls(obj.crop_height,
                           input_boxes_in_h, index_ub, scaler_h_scale,
                           1, 1, 1, 8, 8)
        input_boxes_in_w = obj.apply_mem((get_ceil_int(obj.crop_width, obj.boxes_block_num) * obj.boxes_block_num,),
                                         "input_boxes_in_w", tik.scope_ubuf, obj.boxes_type)
        tik_instance.vmuls(obj.crop_width,
                           input_boxes_in_w, index_ub, scaler_w_scale,
                           1, 1, 1, 8, 8)
        tik_instance.vadds(obj.crop_height,
                           input_boxes_in_h, input_boxes_in_h, scaler_h_small,
                           1, 1, 1, 8, 8)
        tik_instance.vadds(obj.crop_width,
                           input_boxes_in_w, input_boxes_in_w, scaler_w_small,
                           1, 1, 1, 8, 8)

        h_top_index = \
            obj.apply_mem((get_ceil_int(obj.crop_height, obj.boxes_block_num) * obj.boxes_block_num,),
                          "h_top_index", tik.scope_ubuf, "int32")
        w_left_index = \
            obj.apply_mem((get_ceil_int(obj.crop_width, obj.boxes_block_num) * obj.boxes_block_num,),
                          "w_left_index", tik.scope_ubuf, "int32")
        h_index_post = \
            obj.apply_mem((get_ceil_int(obj.crop_height, obj.boxes_block_num) * obj.boxes_block_num,),
                          "h_index_post", tik.scope_ubuf, "int32")
        w_index_post = \
            obj.apply_mem((get_ceil_int(obj.crop_width, obj.boxes_block_num) * obj.boxes_block_num,),
                          "w_index_post", tik.scope_ubuf, "int32")

        cast_flag = tbe_platform.cce_conf.api_check_support("tik.vconv", "f322s32r")
        with tik_instance.new_stmt_scope():
            tmp_float_ub_0 = obj.apply_mem((get_ceil_int(obj.crop_height,
                                                         obj.boxes_block_num)
                                            * obj.boxes_block_num,),
                                           "tmp_float_ub_0", tik.scope_ubuf, obj.boxes_type)
            if not cast_flag:
                tik_func_vconv(tik_instance, h_top_index, input_boxes_in_h, obj.crop_height,
                               mode="floor", mini_mid_ub=tmp_float_ub_0)
            else:
                tik_func_vconv(tik_instance, h_top_index, input_boxes_in_h, obj.crop_height,
                               mode="floor")
                # h_top_index vconv from int32 to float32
                tik_func_vconv(tik_instance, tmp_float_ub_0, h_top_index, obj.crop_height)
            tik_func_vcomple(tik_instance, "vmul", h_top_index, h_top_index, height_offset_ub, obj.crop_height,
                             src1_rep=0)

            # do: h_lerp = input_boxes_in_h - tmp_float_ub_0
            tik_func_vcomple(tik_instance, "vsub", input_boxes_in_h,
                             input_boxes_in_h, tmp_float_ub_0, obj.crop_height)
            tik_func_vconv(tik_instance, h_index_post, input_boxes_in_h, obj.crop_height,
                           mode="ceil")
            tmp_float_ub_1 = obj.apply_mem((get_ceil_int(obj.crop_width,
                                                         obj.boxes_block_num)
                                            * obj.boxes_block_num,),
                                           "tmp_float_ub_1", tik.scope_ubuf, obj.boxes_type)
            if not cast_flag:
                tik_func_vconv(tik_instance, w_left_index, input_boxes_in_w, obj.crop_width,
                               mode="floor", mini_mid_ub=tmp_float_ub_1)
            else:
                tik_func_vconv(tik_instance, w_left_index, input_boxes_in_w, obj.crop_width,
                               mode="floor")
                # w_left_index vconv from int32 to float32
                tik_func_vconv(tik_instance, tmp_float_ub_1, w_left_index, obj.crop_width)
            tik_func_vcomple(tik_instance, "vmul", w_left_index, w_left_index, width_offset_ub, obj.crop_width,
                             src1_rep=0)
            # do: w_lerp = input_boxes_in_w - tmp_float_ub_1
            tik_func_vcomple(tik_instance, "vsub", input_boxes_in_w,
                             input_boxes_in_w, tmp_float_ub_1, obj.crop_width)
            tik_func_vconv(tik_instance, w_index_post, input_boxes_in_w, obj.crop_width,
                           mode="ceil")

        # read input batch index and calc input offset
        input_batch_offset = tik_instance.Scalar(dtype="int32")
        input_batch_offset.set_as(box_index_ub[_box_idx])
        input_h_offset = tik_instance.Scalar(dtype="int32")
        input_h_post = tik_instance.Scalar(dtype="int32")
        h_lerp = tik_instance.Scalar(dtype=boxes_ub_small.dtype)
        c0_block_num = obj.image_c0 // obj.block_num
        image_gm = obj.input_gm_list[0]
        output_gm = obj.output_gm_list[0]
        with tik_instance.for_range(0, obj.crop_height) as _crop_height_idx:
            input_h_offset.set_as(h_top_index[_crop_height_idx])
            input_h_post.set_as(h_index_post[_crop_height_idx])
            real_h_offset = input_h_offset + input_h_post
            with tik_instance.if_scope(
                    tik.all(input_h_offset >= 0,
                            real_h_offset <= (obj.image_height - 1)*obj.image_c0*obj.image_width)):
                h_lerp.set_as(input_boxes_in_h[_crop_height_idx])
                thread_num = 2
                if obj.crop_width <= 1:
                    thread_num = 1
                with tik_instance.for_range(0, obj.crop_width, thread_num=thread_num) as _crop_width_idx:
                    input_w_offset = tik_instance.Scalar(dtype="int32")
                    input_w_post = tik_instance.Scalar(dtype="int32")
                    w_lerp = tik_instance.Scalar(dtype=boxes_ub_small.dtype)
                    input_w_offset.set_as(w_left_index[_crop_width_idx])
                    input_w_post.set_as(w_index_post[_crop_width_idx])
                    real_w_offset = input_w_offset + input_w_post
                    with tik_instance.if_scope(tik.all(input_w_offset >= 0,
                                                       real_w_offset <= (obj.image_width - 1) * obj.image_c0)):
                        w_lerp.set_as(input_boxes_in_w[_crop_width_idx])
                        # copy all C data in ub
                        with tik_instance.new_stmt_scope():
                            h0_w_ub = obj.apply_mem((obj.image_c1*obj.image_c0*2,),
                                                    "h0_w_ub", tik.scope_ubuf, "float32")
                            h1_w_ub = obj.apply_mem((obj.image_c1*obj.image_c0*2,),
                                                    "h1_w_ub", tik.scope_ubuf, "float32")

                            if obj.image_block_num == obj.block_num:
                                # when input is fp32, just copy
                                if obj.image_width > 1:
                                    tik_instance.data_move(
                                        h0_w_ub, image_gm[input_batch_offset + input_h_offset + input_w_offset],
                                        0, obj.image_c1, c0_block_num*2,
                                        obj.image_height*obj.image_width*c0_block_num - c0_block_num*2, 0)
                                    if obj.image_height > 1:
                                        tik_instance.data_move(
                                            h1_w_ub,
                                            image_gm[input_batch_offset + input_h_offset + input_w_offset
                                                     + obj.image_width * obj.image_c0],
                                            0, obj.image_c1, c0_block_num*2,
                                            obj.image_height*obj.image_width*c0_block_num - c0_block_num*2, 0)
                                    else:
                                        tik_func_vector(tik_instance, h1_w_ub, 0, obj.image_c1*obj.image_c0*2)
                                else:
                                    tik_instance.data_move(
                                        h0_w_ub, image_gm[input_batch_offset + input_h_offset + input_w_offset],
                                        0, obj.image_c1, c0_block_num,
                                        obj.image_height*obj.image_width*c0_block_num - c0_block_num, c0_block_num)
                                    tik_instance.data_move(
                                        h0_w_ub[obj.image_c0],
                                        image_gm[input_batch_offset + input_h_offset + input_w_offset],
                                        0, obj.image_c1, c0_block_num,
                                        obj.image_height*obj.image_width*c0_block_num - c0_block_num, c0_block_num)
                                    if obj.image_height > 1:
                                        tik_instance.data_move(
                                            h1_w_ub,
                                            image_gm[input_batch_offset + input_h_offset + input_w_offset
                                                     + obj.image_width * obj.image_c0],
                                            0, obj.image_c1, c0_block_num,
                                            obj.image_height*obj.image_width*c0_block_num - c0_block_num, c0_block_num)
                                        tik_instance.data_move(
                                            h1_w_ub[obj.image_c0],
                                            image_gm[input_batch_offset + input_h_offset + input_w_offset
                                                     + obj.image_width * obj.image_c0],
                                            0, obj.image_c1, c0_block_num,
                                            obj.image_height*obj.image_width*c0_block_num - c0_block_num, c0_block_num)
                                    else:
                                        tik_func_vector(tik_instance, h1_w_ub, 0, obj.image_c1*obj.image_c0*2)
                            else:
                                # when input is fp16, copy then cast to fp32
                                with tik_instance.new_stmt_scope():
                                    h0_w_ub_fp16 = obj.apply_mem((obj.image_c1*obj.image_c0*2,),
                                                                 "h0_w_ub_fp16", tik.scope_ubuf)
                                    h1_w_ub_fp16 = obj.apply_mem((obj.image_c1*obj.image_c0*2,),
                                                                 "h1_w_ub_fp16", tik.scope_ubuf)
                                    c0_block_fp16 = 1
                                    if obj.image_width > 1:
                                        tik_instance.data_move(
                                            h0_w_ub_fp16,
                                            image_gm[input_batch_offset + input_h_offset + input_w_offset],
                                            0, obj.image_c1, c0_block_fp16*2,
                                            obj.image_height*obj.image_width*c0_block_fp16 - c0_block_fp16*2, 0)
                                        tik_func_vconv(tik_instance, h0_w_ub, h0_w_ub_fp16,
                                                       obj.image_c1*obj.image_c0*2)
                                        if obj.image_height > 1:
                                            tik_instance.data_move(
                                                h1_w_ub_fp16,
                                                image_gm[input_batch_offset + input_h_offset + input_w_offset
                                                         + obj.image_width * obj.image_c0],
                                                0, obj.image_c1, c0_block_fp16*2,
                                                obj.image_height*obj.image_width*c0_block_fp16 - c0_block_fp16*2, 0)
                                            tik_func_vconv(tik_instance, h1_w_ub, h1_w_ub_fp16,
                                                           obj.image_c1*obj.image_c0*2)
                                        else:
                                            tik_func_vector(tik_instance, h1_w_ub, 0, obj.image_c1*obj.image_c0*2)
                                    else:
                                        tik_instance.data_move(
                                            h0_w_ub_fp16,
                                            image_gm[input_batch_offset + input_h_offset + input_w_offset],
                                            0, obj.image_c1, c0_block_fp16,
                                            obj.image_height*obj.image_width*c0_block_fp16 - c0_block_fp16,
                                            c0_block_fp16)
                                        tik_instance.data_move(
                                            h0_w_ub_fp16[c0_block_fp16*obj.image_c0],
                                            image_gm[input_batch_offset + input_h_offset + input_w_offset],
                                            0, obj.image_c1, c0_block_fp16,
                                            obj.image_height*obj.image_width*c0_block_fp16 - c0_block_fp16,
                                            c0_block_fp16)
                                        tik_func_vconv(tik_instance, h0_w_ub, h0_w_ub_fp16,
                                                       obj.image_c1*obj.image_c0*2)
                                        if obj.image_height > 1:
                                            tik_instance.data_move(
                                                h1_w_ub_fp16,
                                                image_gm[input_batch_offset + input_h_offset + input_w_offset
                                                         + obj.image_width * obj.image_c0],
                                                0, obj.image_c1, c0_block_fp16,
                                                obj.image_height*obj.image_width*c0_block_fp16 - c0_block_fp16,
                                                c0_block_fp16)
                                            tik_instance.data_move(
                                                h1_w_ub_fp16[c0_block_fp16*obj.image_c0],
                                                image_gm[input_batch_offset + input_h_offset + input_w_offset
                                                         + obj.image_width * obj.image_c0],
                                                0, obj.image_c1, c0_block_fp16,
                                                obj.image_height*obj.image_width*c0_block_fp16 - c0_block_fp16,
                                                c0_block_fp16)
                                            tik_func_vconv(tik_instance, h1_w_ub, h1_w_ub_fp16,
                                                           obj.image_c1*obj.image_c0*2)
                                        else:
                                            tik_func_vector(tik_instance, h1_w_ub, 0, obj.image_c1*obj.image_c0*2)

                            tik_func_vcomple(tik_instance, "vsub", h1_w_ub,
                                             h1_w_ub, h0_w_ub,
                                             obj.image_c1*obj.image_c0*2)

                            tik_func_vmuls(tik_instance, h1_w_ub,
                                           h1_w_ub, h_lerp, obj.image_c1*obj.image_c0*2)

                            tik_func_vcomple(tik_instance, "vadd", h0_w_ub,
                                             h1_w_ub, h0_w_ub,
                                             obj.image_c1*obj.image_c0*2)

                            tik_fun = tik_instance.vsub
                            tik_fun(obj.image_c0, h1_w_ub, h0_w_ub[16],
                                    h0_w_ub, obj.image_c1, 1, 1, 1, 2, 4, 4)
                            tik_func_vmuls(tik_instance, h1_w_ub,
                                           h1_w_ub, w_lerp, obj.image_c1*obj.image_c0)

                            tik_fun = tik_instance.vadd
                            tik_fun(obj.image_c0, h1_w_ub[obj.image_c1*obj.image_c0:],
                                    h1_w_ub, h0_w_ub, obj.image_c1,
                                    1, 1, 1, 2, 2, 4)
                            output_offset = \
                                _out_batch_idx*obj.image_c1*obj.crop_width*obj.crop_height*obj.image_c0 \
                                + _crop_height_idx*obj.crop_width*obj.image_c0 + _crop_width_idx*obj.image_c0
                            tik_instance.data_move(output_gm[output_offset],
                                                   h1_w_ub[obj.image_c1*obj.image_c0:], 0, obj.image_c1, c0_block_num,
                                                   0, obj.crop_height*obj.crop_width*c0_block_num - c0_block_num)
                    with tik_instance.else_scope():
                        with tik_instance.new_stmt_scope():
                            h1_w_ub = obj.apply_mem((get_ceil_int(obj.image_c1*obj.image_c0, obj.vector_num)
                                                     * obj.vector_num,),
                                                    "h1_w_ub", tik.scope_ubuf, "float32")
                            tik_func_vector(tik_instance, h1_w_ub, obj.extrapolation_value,
                                            obj.image_c1*obj.image_c0)

                            output_offset = \
                                _out_batch_idx*obj.image_c1*obj.crop_width*obj.crop_height*obj.image_c0 \
                                + _crop_height_idx*obj.crop_width*obj.image_c0 + _crop_width_idx*obj.image_c0
                            tik_instance.data_move(output_gm[output_offset],
                                                   h1_w_ub, 0, obj.image_c1, c0_block_num,
                                                   0, obj.crop_height*obj.crop_width*c0_block_num - c0_block_num)
            with tik_instance.else_scope():
                with tik_instance.new_stmt_scope():
                    h0_w_ub = obj.apply_mem((get_ceil_int(obj.image_c1*obj.crop_width*obj.image_c0,
                                                          obj.vector_num)
                                             * obj.vector_num,),
                                            "h0_w_ub", tik.scope_ubuf, "float32")
                    tik_func_vector(tik_instance, h0_w_ub, obj.extrapolation_value,
                                    obj.image_c1*obj.image_c0*obj.crop_width)
                    output_offset = \
                        _out_batch_idx*obj.image_c1*obj.crop_width*obj.crop_height*obj.image_c0 \
                        + _crop_height_idx*obj.crop_width*obj.image_c0
                    tik_instance.data_move(output_gm[output_offset],
                                           h0_w_ub, 0, obj.image_c1, c0_block_num*obj.crop_width, 0,
                                           obj.crop_height*obj.crop_width*c0_block_num - c0_block_num*obj.crop_width)
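
Step 4 in the docstring is plain bilinear interpolation; the vsub/vmuls/vadd
sequence above evaluates it vectorized across all C1*C0 channels. For
reference, the scalar form (a sketch, not the kernel's code):

def bilinear_lerp(top_left, top_right, bottom_left, bottom_right,
                  x_lerp, y_lerp):
    # interpolate along w first, then along h
    top = top_left + (top_right - top_left) * x_lerp
    bottom = bottom_left + (bottom_right - bottom_left) * x_lerp
    return top + (bottom - top) * y_lerp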
Exemple #25
    def data_move(self, input_dict):
        """
        move data between gm tensors through ub

        Parameters
        ----------
        input_dict: a dict with the following keys:
                x1_ub: tensor that stages the data copied from gm
                x1_offset: x1 gm data offset
                out_offset: output data offset
                element_num: length of each continuous segment
                block_num: block number
        Returns
        -------
        None
        """
        x1_ub = input_dict.get("x1_ub")
        out_offset = input_dict.get("out_offset")
        element_num = input_dict.get("element_num")
        block_num = input_dict.get("block_num")
        loop_cycle, last_ub_num = get_loop_param(element_num,
                                                 self.one_max_size)
        total_size = self.instance.Scalar("int32")
        total_size.set_as(self.one_max_size * self.dsize)
        ub_size = self.instance.Scalar("int32")
        ub_size.set_as(self.one_max_size)
        offset_x1 = self.instance.Scalar("int32")
        offset_x1.set_as(input_dict.get("x1_offset"))
        offset_out = self.instance.Scalar("int32")
        offset_out.set_as(out_offset)
        each_burst_num = constant.BLOCK_SIZE // self.dsize
        with self.instance.for_range(0, loop_cycle) as cycle:
            with self.instance.if_scope(cycle == loop_cycle - 1):
                total_size.set_as(last_ub_num * self.dsize)
                ub_size.set_as(last_ub_num)
            nburst = common_util.get_datamove_nburst(self.instance, total_size)
            with self.instance.if_scope(
                    tik.all(cycle == loop_cycle - 1,
                            total_size % constant.BLOCK_SIZE != 0,
                            block_num > 1)):
                x1_ub_tmp = self.instance.Tensor(self.dtype, (32, ),
                                                 name="x1_ub_tmp",
                                                 scope=tik.scope_ubuf)
                self.instance.data_move(
                    x1_ub_tmp,
                    self.x1_gm[offset_x1 + ub_size - each_burst_num],
                    constant.SID, constant.DEFAULT_NBURST, 1,
                    constant.STRIDE_ZERO, constant.STRIDE_ZERO)
                self.instance.data_move(
                    self.y_gm[offset_out + ub_size - each_burst_num],
                    x1_ub_tmp, constant.SID, constant.DEFAULT_NBURST, 1,
                    constant.STRIDE_ZERO, constant.STRIDE_ZERO)
                with self.instance.if_scope(total_size > constant.BLOCK_SIZE):
                    self.instance.data_move(x1_ub, self.x1_gm[offset_x1],
                                            constant.SID,
                                            constant.DEFAULT_NBURST,
                                            nburst - 1, constant.STRIDE_ZERO,
                                            constant.STRIDE_ZERO)
                    self.instance.data_move(self.y_gm[offset_out], x1_ub,
                                            constant.SID,
                                            constant.DEFAULT_NBURST,
                                            nburst - 1, constant.STRIDE_ZERO,
                                            constant.STRIDE_ZERO)
            with self.instance.else_scope():
                self.instance.data_move(x1_ub, self.x1_gm[offset_x1],
                                        constant.SID, constant.DEFAULT_NBURST,
                                        nburst, constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)
                self.instance.data_move(self.y_gm[offset_out], x1_ub,
                                        constant.SID, constant.DEFAULT_NBURST,
                                        nburst, constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)
            offset_x1.set_as(offset_x1 + ub_size)
            offset_out.set_as(offset_out + ub_size)
Exemple #26
    def compute_crop(self):
        """
        compute crop

        Parameters
        ----------
        None
        Returns
        -------
        None
        """
        block_num, each_block_size, loop, tail = \
            self.get_blockdim_and_loop_cycle()
        shape_out = self.input_dict.get("y").get("shape")
        shape_out_len = get_shape_total_number(shape_out)
        offset_in = self.input_dict.get("offset")
        shape = self.input_dict.get("x1").get("shape")
        element_num, shape_len = self.get_element_num()
        x1_shape_list = get_elem_of_each_dim(shape, len(shape))
        shape = self.input_dict.get("x2").get("shape")
        x2_shape_list = get_elem_of_each_dim(shape, shape_len - 1)
        thread_n = self.get_thread_num(block_num, loop, element_num)

        with self.instance.for_range(0, block_num, block_num=block_num) \
                as block_id:
            ub_tmp = self.instance.Tensor(self.dtype, (256, ),
                                          name="ub_tmp",
                                          scope=tik.scope_ubuf)
            self.instance.data_move(ub_tmp, self.x2_gm[0], constant.SID,
                                    constant.DEFAULT_NBURST, 1,
                                    constant.STRIDE_ZERO, constant.STRIDE_ZERO)
            count = self.instance.Scalar("int32")
            count.set_as(0)
            each_loop = self.instance.Scalar("int32")
            each_loop.set_as(loop)
            offset = self.instance.Scalar("int32")
            if tail > 0:
                with self.instance.if_scope(block_id < tail):
                    each_loop.set_as(each_loop + 1)
            offset.set_as(block_id * each_loop)
            with self.instance.if_scope(tik.all(block_id >= tail, tail > 0)):
                offset.set_as(block_id * (each_loop + 1) - (block_id - tail))
            out_offset = self.instance.Scalar("int32")
            out_offset.set_as(offset * element_num)
            cycles = shape_out_len // element_num
            tmp_offset = self.instance.Scalar("int32")
            tmp_offset.set_as(0)
            with self.instance.for_range(offset, cycles,
                                         thread_num=thread_n) as times:
                with self.instance.if_scope(count < each_loop):
                    x1_ub = self.instance.Tensor(self.dtype,
                                                 (self.one_max_size, ),
                                                 name="x1_ub",
                                                 scope=tik.scope_ubuf)
                    x1_offset = self.instance.Scalar("int32")
                    x1_offset.set_as(0)
                    for q in range(shape_len):
                        mod = times
                        for s in range(q):
                            mod %= x2_shape_list[s]
                        mod = mod // x2_shape_list[q] + offset_in[q]
                        x1_offset.set_as(x1_offset + mod * x1_shape_list[q])
                    if element_num * self.dsize < constant.BLOCK_SIZE \
                            and block_num > 1:
                        input_dict = {
                            "x1_ub": x1_ub,
                            "ub_tmp": ub_tmp,
                            "x1_offset": x1_offset,
                            "out_offset": out_offset,
                            "tmp_offset": tmp_offset,
                            "element_num": element_num,
                            "each_block_size": each_block_size,
                            "count": count,
                            "each_loop": each_loop,
                        }
                        self.move_out_less_than32b(input_dict)
                        out_offset.set_as(out_offset + element_num)
                    else:
                        input_dict = {
                            "x1_ub": x1_ub,
                            "x1_offset": x1_offset,
                            "out_offset": out_offset,
                            "element_num": element_num,
                            "block_num": block_num,
                        }
                        self.data_move(input_dict)
                        out_offset.set_as(out_offset + element_num)
                    count.set_as(count + 1)
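
The offset arithmetic above spreads loop iterations over block_num cores,
giving the first tail cores one extra iteration each. A plain-Python sketch
of that split (helper name is ours):

def core_split(block_id, loop, tail):
    # first tail cores run loop + 1 iterations, the rest run loop
    if block_id < tail:
        each_loop = loop + 1
        offset = block_id * each_loop
    else:
        each_loop = loop
        # equals block_id * loop + tail
        offset = block_id * (loop + 1) - (block_id - tail)
    return each_loop, offset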
Exemple #27
    def tik_instance_cut_nc1_cut_one_h(self, kernel_name):
        """
        compute max pooling grad with argmax for the cut-nc1, cut-one-h tiling case

        Parameters
        ----------
        kernel_name: cce kernel name, default value is "maxpoolGradWithArgmax"
        Returns
        -------
        None
        """
        batch, channel1, dyh, dyw, channel = self.input_gard_shape
        dxh, dxw = self.y_shape[2:4]
        strideh, stridew = self.strides[1:3]
        if strideh > dxh:
            strideh = dxh

        if stridew > dxw:
            stridew = dxw

        dtype = self.dtype
        dtype_size = self.dtype_size
        windowh, windoww = self.ksize[1:3]
        block = self.block
        pad_top = self.pad[0]
        pad_left = self.pad[2]

        hoverlap = self.hoverlap
        col2img_h = windowh
        if col2img_h < strideh:
            col2img_h = strideh
        col2img_dyw = (dyw + 15) // 16 * 16
        if self.woverlap == 0:
            col2img_w = col2img_dyw * stridew
        else:
            col2img_w = (col2img_dyw - 1) * stridew + windoww

        mask_one_window = ((dyh * dyw + 15) // 16 + 1) * 16

        # vector_repeat_time
        v_rep_time = col2img_dyw * channel * dtype_size // ONE_REPEAT
        v_rep_cycle_fp32 = 2 * v_rep_time // V_MAX_REPEAT
        # v_rep_last
        v_rep_last_fp32 = 2 * v_rep_time % V_MAX_REPEAT

        # after each looph's result is moved out, dup the col2img buffer again
        v_rep_afmv = (windowh - hoverlap) * channel *\
                     col2img_w * dtype_size * 2 // ONE_REPEAT
        v_rep_afmv_cycle = v_rep_afmv // V_MAX_REPEAT
        v_rep_afmv_last = v_rep_afmv % V_MAX_REPEAT

        v_rep_time_col = (2 * col2img_w * channel * col2img_h * \
                          dtype_size + ONE_REPEAT - 1) // ONE_REPEAT
        v_rep_cycle_col = v_rep_time_col // V_MAX_REPEAT
        v_rep_last_col = v_rep_time_col % V_MAX_REPEAT

        data_input = self.tik_instance.Tensor(dtype, self.input_gard_shape, name="data_input",
                                              scope=tik.scope_gm)
        data_mask = self.tik_instance.Tensor("uint16", (batch * channel1 * windowh * windoww *
                                                        mask_one_window,),
                                             name="data_mask", scope=tik.scope_gm)
        if self.padding == "SAME":
            data_output = self.tik_instance.Tensor(dtype, self.y_shape, name="data_output",
                                                   scope=tik.scope_gm)
        else:
            data_output = self.tik_instance.Tensor(dtype, self.y_shape, name="data_output",
                                                   scope=tik.scope_gm, is_atomic_add=True)

        data_input_origin = self.tik_instance.Tensor(dtype, self.y_shape, name="data_input_origin",
                                                     scope=tik.scope_gm)

        real_block, block_cycle, block_index = self.get_block_param(block)
        with self.tik_instance.for_range(0, real_block, block_num=real_block) as block_id:
            real_cycle = self.tik_instance.Scalar("int32")
            block_base = self.tik_instance.Scalar("int32")
            block_num = self.tik_instance.Scalar("int32")
            with self.tik_instance.if_scope(block_id < block_index):
                real_cycle.set_as(block_cycle + 1)
                block_base.set_as(block_id * real_cycle)
            with self.tik_instance.else_scope():
                real_cycle.set_as(block_cycle)
                block_base.set_as(block_index + block_id * block_cycle)
            with self.tik_instance.for_range(0, real_cycle) as cycle_id:
                block_num.set_as(block_base + cycle_id)
                data_vsel_scalar = self.tik_instance.Scalar(dtype)
                data_vsel_scalar.set_as(0)
                data_vsel_ub_zero = self.tik_instance.Tensor(dtype, (128,),
                                                             name="data_vsel_ub_zero",
                                                             scope=tik.scope_ubuf)
                self.tik_instance.data_move(data_vsel_ub_zero[0],
                                            data_input_origin[0],
                                            constant.SID,
                                            constant.DEFAULT_NBURST,
                                            constant.DEFAULT_BURST_LEN,
                                            constant.STRIDE_ZERO,
                                            constant.STRIDE_ZERO)
                self.clean_fp16_one_repeat(data_vsel_ub_zero, dtype)
                # vector_dup ub every time
                dxh_address_offset = self.tik_instance.Scalar("int32")
                dxh_address_offset.set_as(0)
                dxh_calcline = self.tik_instance.Scalar("int32")
                dxh_calcline.set_as(0)

                data_max_ub = self.tik_instance.Tensor(dtype, (col2img_dyw * channel,),
                                                       name="data_max_ub",
                                                       scope=tik.scope_ubuf)
                if self.woverlap > 0 and dyw % 16 != 0 and self.padding == "VALID":
                    self.clean_max_ub(data_max_ub, dtype)
                data_vmul_ub_col2img_fp32 = \
                    self.tik_instance.Tensor("float32",
                                             (col2img_w * channel * col2img_h + 64,),
                                             name="data_vmul_ub_col2img_fp32",
                                             scope=tik.scope_ubuf)
                data_vmul_ub_col2img_fp16 = \
                    self.tik_instance.Tensor(dtype,
                                             (col2img_w * channel * col2img_h + 128,),
                                             name="data_vmul_ub_col2img_fp16",
                                             scope=tik.scope_ubuf)
                self.clean_fp32_multi_repeat(data_vmul_ub_col2img_fp32, dtype_size * 2)
                with self.tik_instance.for_range(0, dyh) as looph:
                    # dy copy gm to ub
                    self.tik_instance.data_move(data_max_ub,
                                                data_input[(block_num * dyh + looph) *
                                                           dyw * channel],
                                                constant.SID, constant.DEFAULT_NBURST,
                                                dyw * channel * dtype_size // BLOCK_SIZE,
                                                constant.STRIDE_ZERO,
                                                constant.STRIDE_ZERO)
                    # mask define
                    data_mask_ub = self.tik_instance.Tensor("uint16", (col2img_dyw,),
                                                            name="data_mask_ub",
                                                            scope=tik.scope_ubuf)
                    with self.tik_instance.for_range(0, windowh * windoww) as mask_id:
                        # mask copy gm to ub
                        self.tik_instance.data_move(data_mask_ub,
                                                    data_mask[block_num * mask_one_window *
                                                              windoww * windowh +
                                                              looph * dyw + mask_id *
                                                              mask_one_window],
                                                    constant.SID, 1,
                                                    col2img_dyw * dtype_size // BLOCK_SIZE,
                                                    constant.STRIDE_ZERO, constant.STRIDE_ZERO)
                        data_vsel_ub = self.tik_instance.Tensor(dtype, (col2img_dyw * channel,),
                                                                name="data_vsel_ub",
                                                                scope=tik.scope_ubuf)
                        data_vsel_ub_fp32 = self.tik_instance.Tensor("float32", (col2img_dyw *
                                                                                 channel,),
                                                                     name="data_vsel_ub_fp32",
                                                                     scope=tik.scope_ubuf)
                        if v_rep_time > 0:
                            with self.tik_instance.for_range(0, v_rep_time,
                                                             thread_num=1) as cycle:
                                cmpmask = self.tik_instance.mov_tensor_to_cmpmask(
                                    data_mask_ub[cycle * MASK_MAX])
                                self.tik_instance.vsel(constant.MASK128, 0,
                                                       data_vsel_ub[cycle * FP16_MAX],
                                                       cmpmask,
                                                       data_max_ub[cycle * FP16_MAX],
                                                       data_vsel_ub_zero[0],
                                                       constant.REPEAT_TIME_ONCE,
                                                       constant.STRIDE_ONE,
                                                       constant.STRIDE_ONE,
                                                       constant.STRIDE_ONE,
                                                       constant.REPEAT_STRIDE_EIGHT,
                                                       constant.REPEAT_STRIDE_EIGHT,
                                                       constant.REPEAT_STRIDE_EIGHT)

                        # fp16 to fp32
                        if v_rep_cycle_fp32 > 0:
                            with self.tik_instance.for_range(0, v_rep_cycle_fp32,
                                                             thread_num=1) as cycle:
                                self.tik_instance.vconv(constant.MASK64, "",
                                                        data_vsel_ub_fp32[cycle * V_MAX_REPEAT *
                                                                          FP32_MAX],
                                                        data_vsel_ub[cycle * V_MAX_REPEAT *
                                                                     FP16_MAX],
                                                        V_MAX_REPEAT, constant.STRIDE_ONE,
                                                        constant.STRIDE_ONE,
                                                        constant.REPEAT_STRIDE_EIGHT,
                                                        constant.REPEAT_STRIDE_FOUR)
                        if v_rep_last_fp32 != 0:
                            self.tik_instance.vconv(constant.MASK64, "", data_vsel_ub_fp32[
                                v_rep_cycle_fp32 * V_MAX_REPEAT * FP32_MAX],
                                                    data_vsel_ub[
                                                        v_rep_cycle_fp32 * V_MAX_REPEAT * FP32_MAX],
                                                    v_rep_last_fp32, constant.STRIDE_ONE,
                                                    constant.STRIDE_ONE,
                                                    constant.REPEAT_STRIDE_EIGHT,
                                                    constant.REPEAT_STRIDE_FOUR)
                        # col2img
                        fetch_filter_w = mask_id % windoww
                        fetch_filter_h = mask_id // windoww
                        left_top_w = 0
                        left_top_h = 0
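                        # mask_id enumerates positions inside the pooling
                        # window: column = mask_id % windoww, row =
                        # mask_id // windoww. col2img scatters the selected
                        # gradients back from that window position.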
                        self.tik_instance.col2img(data_vmul_ub_col2img_fp32[0],
                                                  data_vsel_ub_fp32[0],
                                                  (0, 0, 0, 0),
                                                  col2img_h, col2img_w, fetch_filter_w,
                                                  fetch_filter_h, left_top_w, left_top_h,
                                                  stridew, strideh,
                                                  windoww, windowh, 1, 1,
                                                  col2img_dyw // 16)

                    if v_rep_cycle_col > 0:
                        with self.tik_instance.for_range(0, v_rep_cycle_col,
                                                         thread_num=1) as cycle:
                            self.tik_instance.vconv(constant.MASK64, "",
                                                    data_vmul_ub_col2img_fp16[
                                                        cycle * V_MAX_REPEAT * FP32_MAX],
                                                    data_vmul_ub_col2img_fp32[
                                                        cycle * V_MAX_REPEAT * FP32_MAX],
                                                    V_MAX_REPEAT, constant.STRIDE_ONE,
                                                    constant.STRIDE_ONE,
                                                    constant.REPEAT_STRIDE_FOUR,
                                                    constant.REPEAT_STRIDE_EIGHT)
                    if v_rep_last_col != 0:
                        self.tik_instance.vconv(constant.MASK64, "", data_vmul_ub_col2img_fp16[
                            v_rep_cycle_col * V_MAX_REPEAT * FP32_MAX],
                                                data_vmul_ub_col2img_fp32[
                                                    v_rep_cycle_col * V_MAX_REPEAT * FP32_MAX],
                                                v_rep_last_col, constant.STRIDE_ONE,
                                                constant.STRIDE_ONE,
                                                constant.REPEAT_STRIDE_FOUR,
                                                constant.REPEAT_STRIDE_EIGHT)

                    src_address = self.tik_instance.Scalar("int32")
                    dst_address = self.tik_instance.Scalar("int32")
                    nburst = self.tik_instance.Scalar("int32")
                    burst_len = self.tik_instance.Scalar("int32")
                    src_stride = self.tik_instance.Scalar("int32")
                    dst_stride = self.tik_instance.Scalar("int32")
                    if hoverlap == 0:
                        # move ub to gm
                        src_address.set_as(pad_left * channel)
                        dst_address.set_as(block_num * dxh * dxw * channel +
                                           (looph * col2img_h - pad_top) * dxw * channel)
                        nburst.set_as(col2img_h)
                        burst_len.set_as(self.offset_w)
                        src_stride.set_as(col2img_w - self.offset_w)
                        dst_stride.set_as(dxw - self.offset_w)
                        with self.tik_instance.if_scope(looph == 0):
                            src_address.set_as(src_address + pad_top * col2img_w * channel)
                            dst_address.set_as(block_num * dxh * dxw * channel)
                            nburst.set_as(nburst - pad_top)
                            with self.tik_instance.if_scope(looph == dyh - 1):
                                with self.tik_instance.if_scope(self.padding == "SAME"):
                                    nburst.set_as(dxh)
                        with self.tik_instance.else_scope():
                            with self.tik_instance.if_scope(looph == dyh - 1):
                                with self.tik_instance.if_scope(self.padding == "SAME"):
                                    nburst.set_as(dxh - col2img_h * looph + pad_top)
                                with self.tik_instance.else_scope():
                                    nburst.set_as(windowh)
                        self.tik_instance.data_move(data_output[dst_address],
                                                    data_vmul_ub_col2img_fp16[src_address],
                                                    constant.SID, nburst, burst_len,
                                                    src_stride, dst_stride)
                        data_clean_scalar_fp32 = self.tik_instance.Scalar("float32")
                        data_clean_scalar_fp32.set_as(0)
                        if v_rep_cycle_col > 0:
                            with self.tik_instance.for_range(0, v_rep_cycle_col,
                                                             thread_num=1) as cycle:
                                self.tik_instance.vector_dup(constant.MASK64,
                                                             data_vmul_ub_col2img_fp32[
                                                                 cycle * V_MAX_REPEAT *
                                                                 FP32_MAX],
                                                             data_clean_scalar_fp32,
                                                             V_MAX_REPEAT,
                                                             constant.STRIDE_ONE,
                                                             constant.REPEAT_STRIDE_EIGHT)
                        if v_rep_last_col != 0:
                            self.tik_instance.vector_dup(constant.MASK64,
                                                         data_vmul_ub_col2img_fp32[
                                                             v_rep_cycle_col * \
                                                             V_MAX_REPEAT * FP32_MAX],
                                                         data_clean_scalar_fp32,
                                                         v_rep_last_col,
                                                         constant.STRIDE_ONE,
                                                         constant.REPEAT_STRIDE_EIGHT)
                    else:
                        with self.tik_instance.if_scope((looph + 1) * strideh > pad_top):
                            src_address.set_as(pad_left * channel)
                            dst_address.set_as(block_num * dxh * dxw * channel +
                                               dxh_address_offset)
                            nburst.set_as(strideh)
                            with self.tik_instance.if_scope(looph * strideh < pad_top):
                                nburst.set_as((looph + 1) * strideh - pad_top)
                                src_address.set_as(src_address +
                                                   (pad_top - looph * strideh) *
                                                   col2img_w * channel)
                            with self.tik_instance.if_scope(
                                    tik.all(dxh_calcline < dxh, looph == dyh - 1)):
                                with self.tik_instance.if_scope(self.padding == "SAME"):
                                    nburst.set_as(dxh - dxh_calcline)
                                with self.tik_instance.else_scope():
                                    nburst.set_as(windowh)
                            burst_len.set_as(self.offset_w)
                            src_stride.set_as(col2img_w - self.offset_w)
                            dst_stride.set_as(dxw - self.offset_w)
                            self.tik_instance.data_move(data_output[dst_address],
                                                        data_vmul_ub_col2img_fp16[src_address],
                                                        constant.SID, nburst,
                                                        burst_len, src_stride,
                                                        dst_stride)
                            dxh_address_offset.set_as(dxh_address_offset + \
                                                      nburst * dxw * channel)
                            dxh_calcline.set_as(dxh_calcline + nburst)

                        # dma_copy ub to ub
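                        # (the hoverlap rows at the tail of the col2img buffer
                        # are moved to its head so the next looph iteration can
                        # accumulate on top of them)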
                        self.tik_instance.data_move(data_vmul_ub_col2img_fp32[0],
                                                    data_vmul_ub_col2img_fp32[
                                                        strideh * channel * col2img_w],
                                                    constant.SID, hoverlap, 2 * col2img_w,
                                                    constant.STRIDE_ZERO,
                                                    constant.STRIDE_ZERO)
                        data_clean_scalar_fp32 = self.tik_instance.Scalar("float32")
                        data_clean_scalar_fp32.set_as(0)
                        if v_rep_afmv_cycle > 0:
                            with self.tik_instance.for_range(0, v_rep_afmv_cycle,
                                                             thread_num=1) as cycle:
                                self.tik_instance.vector_dup(constant.MASK64,
                                                             data_vmul_ub_col2img_fp32[
                                                                 hoverlap * channel *
                                                                 col2img_w +
                                                                 cycle * V_MAX_REPEAT *
                                                                 FP32_MAX],
                                                             data_clean_scalar_fp32,
                                                             V_MAX_REPEAT,
                                                             constant.STRIDE_ONE,
                                                             constant.REPEAT_STRIDE_EIGHT)
                        if v_rep_afmv_last != 0:
                            self.tik_instance.vector_dup(constant.MASK64,
                                                         data_vmul_ub_col2img_fp32[
                                                             hoverlap * channel * \
                                                             col2img_w + \
                                                             v_rep_afmv_cycle * \
                                                             V_MAX_REPEAT * FP32_MAX],
                                                         data_clean_scalar_fp32,
                                                         v_rep_afmv_last,
                                                         constant.STRIDE_ONE,
                                                         constant.REPEAT_STRIDE_EIGHT)
        self.tik_instance.BuildCCE(kernel_name=kernel_name,
                                   inputs=(data_input_origin, data_input, data_mask),
                                   outputs=(data_output,), enable_l2=False)
        return self.tik_instance
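
# --- Hedged sketch (not part of the kernel above): how repeat counts such
# as v_rep_cycle_fp32 / v_rep_last_fp32 for the vconv calls are typically
# derived. V_MAX_REPEAT = 255 and 64 fp32 elements per MASK64 repeat are
# assumptions stated here for illustration.
V_MAX_REPEAT = 255     # hardware cap on repeats per vector instruction
ELEMS_PER_REPEAT = 64  # fp32 elements converted per MASK64 repeat

def split_repeats(total_elements):
    """Split a conversion into full 255-repeat cycles plus a tail."""
    total_repeats = total_elements // ELEMS_PER_REPEAT
    return total_repeats // V_MAX_REPEAT, total_repeats % V_MAX_REPEAT

# e.g. 20480 elements -> 320 repeats -> (1, 65): one full cycle of 255
# repeats, then one final vconv with repeat count 65.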
Exemple #28
0
def scatter_nd_d(indices, x, y, shape, kernel_name="scatter_nd_d"):
    """
    the main function of scatter_nd_d

    Parameters
    ----------
    indices: dict, shape and datatype; datatype supports int32
    x: dict, shape and datatype; datatype supports float32, float16,
       int32, int8, uint8
    y: dict, shape and datatype; datatype supports float32, float16,
       int32, int8, uint8
    shape: output shape
    kernel_name: cce kernel name, default value is "scatter_nd_d"

    Returns
    -------
    tik_instance: tik_instance
    """
    check_param(indices, x, y, shape, kernel_name)

    if _check_1d_updates(indices, x, y):
        return _scatter_nd_d_1d(indices, x, y, kernel_name)

    indices_shape = indices.get("shape")
    indice_len = scatter_nd_d_help.get_indice_len(indices_shape)
    update_each_size = scatter_nd_d_help.get_shape_total_number(
        x.get("shape")) // indice_len
    block_dim, loop_cycle = get_blockdim_and_loop_cycle(
        x, shape, update_each_size)
    output_shape = scatter_nd_d_help.get_shape_total_number(shape)
    output_spilts = output_shape // update_each_size
    last_spilt = output_spilts - output_spilts // block_dim * block_dim
    tik_instance = tik.Tik()
    input_param = (indices, x, y, shape)
    scatter = scatter_nd_d_help.ScatterNd(input_param, tik_instance)

    with tik_instance.for_range(0, block_dim, block_num=block_dim) as block_id:
        process = scatter_nd_d_help.ScatterProcess(scatter.tik_instance,
                                                   scatter.updates,
                                                   scatter.indices,
                                                   scatter.shape)
        cycle_each_block = tik_instance.Scalar("int32")
        cycle_each_block.set_as(loop_cycle)
        output_offset = tik_instance.Scalar("int32")
        output_size = tik_instance.Scalar("int32")
        output_size.set_as(cycle_each_block * process.update_each_size)

        with tik_instance.if_scope(
                tik.all(block_dim == constant.MAX_BLOCK_NUMBER,
                        last_spilt != 0,
                        block_id < last_spilt)):
            cycle_each_block.set_as(loop_cycle + 1)
            output_size.set_as(cycle_each_block * process.update_each_size)
            output_offset.set_as(block_id * output_size)
        with tik_instance.else_scope():
            output_offset.set_as(block_id * output_size +
                                 last_spilt * process.update_each_size)

        scatter.initial_output(process, output_offset, output_size)
        scatter.update_data(process, cycle_each_block, output_offset)

    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=(scatter.input_indices_gm,
                                  scatter.input_updates_gm),
                          outputs=(scatter.output_y_gm,),
                          enable_l2=False)
    return tik_instance
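
# --- Hedged usage sketch for scatter_nd_d; the dicts below use made-up
# shapes and dtypes purely for illustration, not taken from a real model.
if __name__ == "__main__":
    indices = {"shape": (4, 1), "dtype": "int32"}
    x = {"shape": (4, 16), "dtype": "float16"}
    y = {"shape": (33, 16), "dtype": "float16"}
    # builds the CCE kernel and returns the tik instance
    scatter_nd_d(indices, x, y, (33, 16), kernel_name="scatter_nd_d_demo")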
Exemple #29
0
def _do_vec_dup(pattern, obj, max_num, blk_idx, mark, axis):
    """
    Params:
    top_address: start address for top padding.
    top_div_core: dividing line between two types of cores in top padding.
    top_total_core: physical cores for top padding.
    top_core_vol_x: volume of data processed by each core (type_x) for top padding.
    top_core_gap_x: gap between different cores (type_x) for top padding.

    Example: MAX_CORE = 32
    if in_shape is [34, 16, 16, 16, ...], the func will work on axis [0, ] only.
    if in_shape is [16, 16, 16, 16, ...], the func will work on axes [0, 1].
    """
    if pattern == "top":
        begin_index = obj.top_address[axis]
        division_core = obj.top_div_core[axis]
        total_core = obj.top_total_core[axis]
        core_data_0 = obj.top_core_vol_0[axis]
        core_data_1 = obj.top_core_vol_1[axis]
        core_gap_0 = obj.top_core_gap_0[axis]
        core_gap_1 = obj.top_core_gap_1[axis]
        pad_data = obj.top_vol[axis]
    else:
        begin_index = obj.bottom_address[axis]
        division_core = obj.bottom_div_core[axis]
        total_core = obj.bottom_total_core[axis]
        core_data_0 = obj.bottom_core_vol_0[axis]
        core_data_1 = obj.bottom_core_vol_1[axis]
        core_gap_0 = obj.bottom_core_gap_0[axis]
        core_gap_1 = obj.bottom_core_gap_1[axis]
        pad_data = obj.bottom_vol[axis]

    # determine whether this is the first layer or not.
    offset = obj.tik_instance.Scalar("int64", name="cir_offset_")
    offset_value = pad_data - core_data_0 * (division_core + 1) \
                   - core_data_1 * (total_core - division_core - 1)
    offset.set_as(offset_value)
    with obj.tik_instance.if_scope(pad_data - core_data_0 == 0):
        # not the first layer
        offset.set_as(0)

    vir_num, block_index = max_num, blk_idx

    # vector_dup: all physical cores.
    with obj.tik_instance.if_scope(mark != 1):
        set_vector_dup(obj, vir_num, 0)

    # data_move
    with obj.tik_instance.if_scope(block_index < division_core):
        dst_idx = begin_index + block_index * core_gap_0
        copy_buf2gm_circulation(obj, core_data_0, vir_num, dst_idx)

    with obj.tik_instance.if_scope(block_index == division_core):
        dst_idx = begin_index + division_core * core_gap_0
        copy_buf2gm_circulation(obj, core_data_0 + offset, vir_num, dst_idx)

    with obj.tik_instance.if_scope(
            tik.all(block_index > division_core, block_index < total_core)):
        begin_index += core_gap_0 * (division_core + 1) + offset
        block_index = block_index - (division_core + 1)
        dst_idx = begin_index + block_index * core_gap_1
        copy_buf2gm_circulation(obj, core_data_1, vir_num, dst_idx)
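
# --- Rough Python model (an assumption, not the TIK path above) of how
# _do_vec_dup splits pad_data across the two core types: cores before
# division_core copy core_data_0 each, the division core also absorbs the
# remainder (offset), and the remaining cores copy core_data_1 each.
def split_pad_volume(pad_data, division_core, total_core,
                     core_data_0, core_data_1):
    offset = (pad_data - core_data_0 * (division_core + 1)
              - core_data_1 * (total_core - division_core - 1))
    per_core = []
    for blk_idx in range(total_core):
        if blk_idx < division_core:
            per_core.append(core_data_0)
        elif blk_idx == division_core:
            per_core.append(core_data_0 + offset)
        else:
            per_core.append(core_data_1)
    return per_core  # sums back to pad_data
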
    def update_each_slice(self, process, update_offset, start_address):
        """
        apply one slice of the update data to the output

        Parameters
        ----------
        process: ScatterProcess class, which is used to store scatter_nd parameters
        update_offset: the offset into the GM update data
        start_address: the start address of the output data

        Returns
        -------
        None
        """
        output_offset = self.tik_instance.Scalar("int32")
        output_offset.set_as(start_address)
        total_size = self.tik_instance.Scalar("int32")
        total_size.set_as(MAX_UB_ELEMENT_NUMBER * self.data_size)
        with self.tik_instance.for_range(0,
                                         process.loop_update) as update_cycle:
            with self.tik_instance.if_scope(
                    update_cycle == process.loop_update - 1):
                total_size.set_as(process.last_update_ub_size * self.data_size)
            nburst = common_util.get_datamove_nburst(self.tik_instance,
                                                     total_size)
            repeats = common_util.get_vector_repeat_times(
                self.tik_instance, total_size)
            self.tik_instance.data_move(process.input_update_ub,
                                        self.input_updates_gm[update_offset],
                                        constant.SID, constant.DEFAULT_NBURST,
                                        nburst, constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)

            self.tik_instance.data_move(process.input_ub,
                                        self.output_y_gm[output_offset],
                                        constant.SID, constant.DEFAULT_NBURST,
                                        nburst, constant.STRIDE_ZERO,
                                        constant.STRIDE_ZERO)

            dtype = process.input_ub.dtype
            input_ub_fp16 = None
            if dtype == constant.DATA_TYPE_INT8 \
                    or dtype == constant.DATA_TYPE_UINT8:
                input_shape = process.input_update_ub.shape
                total_number = constant.DATA_SIZE_TWO * \
                               get_shape_total_number(input_shape)
                input_ub_fp16 = self.tik_instance.Tensor(
                    constant.DATA_TYPE_FP16, (total_number, ),
                    name="input_ub_fp16",
                    scope=tik.scope_ubuf)
                self.tik_instance.vconv(
                    constant.MASK128, "", input_ub_fp16, process.input_ub,
                    repeats * constant.DATA_SIZE_TWO, constant.STRIDE_ONE,
                    constant.STRIDE_ONE, constant.REPEAT_STRIDE_EIGHT,
                    constant.REPEAT_STRIDE_FOUR)
            element_num = self.tik_instance.Scalar("int32")
            element_num.set_as(total_size // self.data_size)

            self.add_same_indices(process, repeats, input_ub_fp16, element_num)
            with self.tik_instance.if_scope(
                    tik.all(total_size % constant.BLOCK_SIZE != 0,
                            process.update_each_size * self.data_size >=
                            constant.BLOCK_SIZE)):
                self.move_out_non32_alignment(process, output_offset,
                                              element_num)
            with self.tik_instance.else_scope():
                self.tik_instance.data_move(self.output_y_gm[output_offset],
                                            process.input_ub, constant.SID,
                                            constant.DEFAULT_NBURST, nburst,
                                            constant.STRIDE_ZERO,
                                            constant.STRIDE_ZERO)
            output_offset.set_as(output_offset + element_num)
            update_offset.set_as(update_offset + element_num)
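
# --- Hedged sketch of the tail-alignment decision in update_each_slice:
# the non-32B-aligned move-out path is taken only when the slice ends off
# a block boundary and one update row spans at least one full block.
# BLOCK_SIZE = 32 bytes is an assumption stated here for illustration.
BLOCK_SIZE = 32

def needs_non32_path(total_size, update_each_size, data_size):
    """Mirror of the tik.all(...) condition above, in plain Python."""
    return (total_size % BLOCK_SIZE != 0
            and update_each_size * data_size >= BLOCK_SIZE)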