Ejemplo n.º 1
0
def _sel_data(ir_builder, src_data, alloc_mem, offset_length):
    """
    Emit the vsel-based selection sequence on the buffers in alloc_mem.

    :param ir_builder: IR builder the intrinsic calls are emitted into
    :param src_data: source tensor; only its dtype is inspected here
    :param alloc_mem: list of buffers, indexed like the caller's alloc_res
    :param offset_length: offset applied to every access of alloc_mem[4]
    """
    # alloc_mem slots hold different buffers, ref: list alloc_res
    condition_src = alloc_mem[0]
    result_buf = alloc_mem[4]

    if src_data.dtype == 'float16':
        # float16: one vsel that reads and writes the result buffer in place
        ir_builder.emit(
            tvm.call_extern(
                src_data.dtype, 'vsel',
                result_buf.access_ptr('w', offset=offset_length),
                result_buf.access_ptr('r', offset=offset_length),
                condition_src.access_ptr('r', offset=0),
                1, 1, 1, 1, 0, 0, 0))
        return

    # any other dtype: vsel in float16, convert the result to float32,
    # then multiply it into the result buffer
    fp16_in = alloc_mem[1]
    fp16_selected = alloc_mem[2]
    fp32_selected = alloc_mem[3]

    ir_builder.emit(
        tvm.call_extern(
            'float16', 'vsel',
            fp16_selected.access_ptr('w'),
            fp16_in.access_ptr('r'),
            condition_src.access_ptr('r'),
            1, 1, 1, 1, 0, 0, 0))
    ir_builder.emit(
        tvm.call_extern(
            "float32", "vconv_f162f32",
            fp32_selected.access_ptr("rw"),
            fp16_selected.access_ptr("r"),
            2, 1, 1, 8, 4))
    ir_builder.emit(
        tvm.call_extern(
            "float32", "vmul",
            result_buf.access_ptr("rw", offset=offset_length),
            result_buf.access_ptr("r", offset=offset_length),
            fp32_selected.access_ptr("r"),
            2, 1, 1, 1, 8, 8, 8))
Ejemplo n.º 2
0
    def _out_put_one_time(out_begin, block_index, sub_num, tail_core=False):
        """
        Fill params.output_ub for one batch of sub cycles and copy it to
        output_gm with a single copy_ubuf_to_gm.

        :param out_begin : multi core output begin address
        :param block_index : block index
        :param sub_num : sub cycle num
        :param tail_core : when True and channel is not a multiple of
            channel0, extra elements are copied after the sub_num loop and
            the output length is reduced by the channel padding
        """
        # difference between channel and _ceil_fill(channel, channel0)
        # (presumably channel rounded up to a multiple of channel0 - confirm)
        c0_pad_len = _ceil_fill(channel, channel0) - channel
        # initialise the output buffer before any element is copied in
        _do_vector_dup((params.output_ub, 0), len_params["output_data_len"],
                       params.dtype, params)
        with params.ib_.for_range(0, sub_num, for_type="serial",
                                  name="i") as sub_i:
            # i is the global index, sub_i the index inside this batch
            i = out_begin + block_index * len_params["one_output_num"] + sub_i
            _do_data_copy(i, sub_i, out_begin, c0_pad_len)

        core_out_len = sub_num * channel0

        if channel % channel0 != 0 and tail_core:
            out_begin_offset = params.ib_.allocate("int32", (1, ),
                                                   name="out_begin_offset",
                                                   scope=cce_params.scope_reg)
            out_begin_offset[0] = (out_begin // channel1) * c0_pad_len
            # drop the padding accumulated between out_begin and the end of
            # this batch from the length that will be written out
            core_out_len -= ((
                (out_begin + block_index * len_params["one_output_num"] +
                 sub_num) // channel1) * c0_pad_len - out_begin_offset[0])
            # elements missing to reach a multiple of cp_align_len
            pad_len = _ceil_fill(core_out_len,
                                 params.cp_align_len) - core_out_len
            # the loop variable is named "j" in the IR but bound to sub_i here
            with params.ib_.for_range(0,
                                      params.cp_align_len,
                                      for_type="serial",
                                      name="j") as sub_i:
                i = out_begin + block_index * len_params[
                    "one_output_num"] + sub_i + sub_num
                # NOTE(review): the third argument here includes the
                # block_index term, while the loop above passes plain
                # out_begin - confirm this difference is intended
                _do_data_copy(
                    i, sub_i + sub_num,
                    out_begin + block_index * len_params["one_output_num"],
                    c0_pad_len)

                # padding-adjusted length gained by copying up to element i,
                # measured from the end of the regular sub_num batch
                real_pad_len = (
                    (i + 1) * channel0 - ((i + 1) // channel1) * c0_pad_len
                ) - ((out_begin + block_index * len_params["one_output_num"] +
                      sub_num) * channel0 -
                     ((out_begin + block_index * len_params["one_output_num"] +
                       sub_num) // channel1) * c0_pad_len)

                # emit a 'break' extern once enough extra data was copied
                with params.ib_.if_scope(real_pad_len >= pad_len):
                    params.ib_.emit(tvm.call_extern(params.dtype, 'break'))

        # number of copy units of cp_align_len elements to write out
        num_cp = _ceil_div(core_out_len, params.cp_align_len)
        # destination offset in output_gm, with the channel padding removed
        out_gm_offset = (
            out_begin +
            block_index * len_params["one_output_num"]) * channel0 - (
                (out_begin + block_index * len_params["one_output_num"]) //
                channel1) * c0_pad_len
        params.ib_.emit(
            tvm.call_extern(params.dtype, 'copy_ubuf_to_gm',
                            output_gm.access_ptr("rw", offset=out_gm_offset),
                            params.output_ub.access_ptr("r", offset=0), 0, 1,
                            num_cp, 0, 0))
Ejemplo n.º 3
0
 def _out_put_one_time(out_begin, block_index, sub_num):
     """
     Fill params.output_ub for sub_num sub cycles, then copy it to output_gm.

     :param out_begin : multi core output begin address
     :param block_index : block index
     :param sub_num : sub cycle num
     """
     ib_ = params.ib_
     batch_first = out_begin + block_index * one_output_num

     # initialise the output buffer before the element-wise stores
     _do_vector_dup((params.output_ub, 0), output_data_len, params.dtype,
                    params)

     with ib_.for_range(0, sub_num, for_type="serial",
                        name="sub_i") as sub_i:
         i_tmp = batch_first + sub_i
         # split the global index into a c1 part and an h*w part
         c1_index = i_tmp // (hight * weight)
         i_hw = i_tmp % (hight * weight)
         with ib_.for_range(0, channel0, for_type="serial",
                            name="c0_index") as c0_index:
             # only positions that map to an existing channel are written
             with ib_.if_scope(channel0 * c1_index + c0_index < channel):
                 input_offset = i_hw * channel + channel0 * c1_index + c0_index
                 output_offset = sub_i * channel0 * channel0 + c0_index * channel0 + c0_index
                 value = params.input_ub.vload(input_offset)
                 ib_.emit(params.output_ub.vstore(output_offset, value))

     num_cp = _ceil_div(sub_num * channel0 * channel0, params.cp_align_len)
     out_gm_offset = batch_first * channel0 * channel0
     gm_dst = output_gm.access_ptr("rw", offset=out_gm_offset)
     ub_src = params.output_ub.access_ptr("r", offset=0)
     ib_.emit(
         tvm.call_extern(params.dtype, 'copy_ubuf_to_gm', gm_dst, ub_src,
                         0, 1, num_cp, 0, 0))
Ejemplo n.º 4
0
 def set_pipe_barrier(self, val):
     """
     Emit a pipe_barrier call for the given pipe name.

     :param val : "PIPE_ALL", "PIPE_MTE3", "PIPE_MTE2", "PIPE_MTE1",
     "PIPE_M", "PIPE_V", "PIPE_S"
     """
     # the pipe name is wrapped in tvm_cce_string_print before being passed on
     barrier = tvm.call_extern(
         'int32', 'pipe_barrier',
         tvm.call_pure_intrin("int32", "tvm_cce_string_print", val))
     self.ib_.emit(barrier)
Ejemplo n.º 5
0
 def _do_cmp_calcu(repeat, cal_offset, nbins_index):
     """
     Emit the vadds / vmax / vmin / vmuls sequence on params.vcadd_ub.

     :param repeat : repeat count passed to every vector instruction
     :param cal_offset : read offset into calc_ub_info[0]
     :param nbins_index : index multiplied with params.reg[3] on the
         platforms that take the scalar directly
     """
     builder = params.ir_builder
     acc_ub = params.vcadd_ub
     acc_dtype = acc_ub.dtype

     def _acc_ptr(mode):
         # every access to the accumulator buffer starts at offset 0
         return acc_ub.access_ptr(mode, offset=0)

     if params.compile_plat not in ("Ascend910", "Ascend610", "Ascend710"):
         # scalar can not support int32 to float32 in mini, so the scalar
         # is taken from index_ub and index_ub is advanced with a vadds
         builder.emit(
             tvm.call_extern('int32', 'pipe_barrier', params.args_str))
         params.reg[5] = params.index_ub.vload(0, params.mid_dtype)
         kernel_api.kernel_scalar_to_one_fuc(
             builder,
             [[params.index_ub, 0], [params.index_ub, 0]],
             [1, params.mid_vec_align_len], ["vadds", params.reg[3]])
         scalar = params.reg[5]
     else:
         scalar = nbins_index * params.reg[3] + params.reg[0]

     builder.emit(
         tvm.call_extern(
             acc_dtype, "vadds",
             _acc_ptr("rw"),
             calc_ub_info[0].access_ptr("r", offset=cal_offset),
             scalar, repeat, 1, 1, 8, 8))

     # vmax against range0_ub at offset 0, then vmin against range0_ub at
     # offset mid_vec_align_len (presumably lower / upper bound - confirm)
     for insn, bound_offset in (("vmax", 0),
                                ("vmin", params.mid_vec_align_len)):
         builder.emit(
             tvm.call_extern(
                 acc_dtype, insn,
                 _acc_ptr("rw"),
                 _acc_ptr("r"),
                 params.range0_ub.access_ptr("r", offset=bound_offset),
                 repeat, 1, 1, 1, 8, 8, 0))

     # three successive in-place scalings: 2**38, 2**44 and 2**44
     for exponent in (38, 44, 44):
         builder.emit(
             tvm.call_extern(
                 acc_dtype, "vmuls",
                 _acc_ptr("rw"),
                 _acc_ptr("r"),
                 tvm.const(2**exponent, dtype=acc_dtype),
                 repeat, 1, 1, 8, 8))
Ejemplo n.º 6
0
 def _dump(data_len, cycle_offset):
     """
     Emit one vector_dup that writes val into buf.

     :param data_len : length to dup
     :param cycle_offset : offset added to buf_offset for this call
     """
     dst_ptr = buf.access_ptr("rw", offset=buf_offset + cycle_offset)
     fill_value = tvm.const(val, dtype)
     # repeat count: data_len in units of the vector alignment length
     repeat_times = _ceil_div(data_len, _get_vec_align_len(dtype))
     dup_call = tvm.call_extern(dtype, 'vector_dup', dst_ptr, fill_value,
                                repeat_times, 1, 1, 8, 8)
     params.ib_.emit(dup_call)
Ejemplo n.º 7
0
def _temp_ir(dst, data):
    """
    Build IR that copies data to dst through a staging buffer data_ub.

    When the whole tensor fits into data_ub a single copy pair is emitted;
    otherwise the tensor is moved in chunks of ub_ele elements, followed by
    one copy pair for the remainder.

    :param dst: 4-D destination buffer (its shape drives the element count)
    :param data: source buffer (its dtype drives the element size)
    :return: the statement produced by the IR builder
    """
    tvm_ib = tvm.ir_builder.create()
    elem_bytes = cce.cce_intrin.get_bit_len(data.dtype) // 8
    cp_align_len = cce_params.BLOCK_REDUCE_INT8 // elem_bytes

    # capacity of the staging buffer, in elements
    ub_ele = UB_SIZE_B // elem_bytes

    n_i, c_i, h_i, w_i = dst.shape
    shape_ele = n_i * c_i * h_i * w_i

    data_ub = _new_alloc(tvm_ib,
                         dst.dtype,
                         ub_ele,
                         "data_ub",
                         scope=cce.scope_ubuf)

    def _relay(gm_offset, ele_count):
        # move ele_count elements: data[gm_offset:] -> data_ub -> dst[gm_offset:]
        burst_len = _ceil_div(ele_count, cp_align_len)
        tvm_ib.emit(
            tvm.call_extern(data_ub.dtype, "copy_gm_to_ubuf",
                            data_ub.access_ptr("w", offset=0),
                            data.access_ptr('r', offset=gm_offset),
                            0, 1, burst_len, 0, 0))
        tvm_ib.emit(
            tvm.call_extern(dst.dtype, "copy_ubuf_to_gm",
                            dst.access_ptr('w', offset=gm_offset),
                            data_ub.access_ptr("r", offset=0),
                            0, 1, burst_len, 0, 0))

    # everything fits: one round trip
    with tvm_ib.if_scope(shape_ele <= ub_ele):
        _relay(0, shape_ele)

    # too large: full chunks first, then whatever is left over
    with tvm_ib.if_scope(shape_ele > ub_ele):
        full_chunks = shape_ele // ub_ele
        leftover = shape_ele % ub_ele
        with tvm_ib.for_range(0, full_chunks, name="num_p") as num_p:
            _relay(num_p * ub_ele, ub_ele)
        with tvm_ib.if_scope(leftover > 0):
            _relay(full_chunks * ub_ele, leftover)

    return tvm_ib.get()
Ejemplo n.º 8
0
    def collapse(ir_b, buffer, current_size):
        """Emit vadd instructions over buffer and return current_size // 2.

        A main vadd covers the whole repeats; when current_size // 2 is not a
        multiple of vector_inst_one_repeat_size an extra masked vadd handles
        the remaining elements.
        """
        half_size = current_size // 2
        exact_repeat = half_size / vector_inst_one_repeat_size
        # a fractional repeat count means a tail is left after the main vadd
        tail_flag = not exact_repeat.is_integer()
        repeat = int(exact_repeat)

        main_vadd = tvm.call_extern(
            buffer.dtype, "vadd",
            buffer.access_ptr("rw", offset=0),
            buffer.access_ptr("r", offset=0),
            buffer.access_ptr("r", offset=8),
            repeat, 1, 2, 2, 8, 16, 16)
        ir_b.emit(main_vadd)

        # solve tail vadd
        if tail_flag:
            tail_dst_offset = repeat * vector_inst_one_repeat_size
            tail_src_offset = repeat * 2 * vector_inst_one_repeat_size
            tail_mask = (current_size - tail_src_offset) // 2
            # NOTE(review): the mask calls use ir_builder / in_buffer from the
            # enclosing scope rather than ir_b / buffer - confirm they refer
            # to the same objects
            te.platform.cce_intrin_md.reset_mask_insn(ir_builder,
                                                      in_buffer.dtype,
                                                      tail_mask)
            tail_vadd = tvm.call_extern(
                buffer.dtype, "vadd",
                buffer.access_ptr("rw", offset=tail_dst_offset),
                buffer.access_ptr("r", offset=tail_src_offset),
                buffer.access_ptr("r", offset=tail_src_offset + 8),
                1, 1, 2, 2, 0, 0, 0)
            ir_b.emit(tail_vadd)
            # second reset_mask_insn call, this time without a mask argument
            te.platform.cce_intrin_md.reset_mask_insn(ir_builder,
                                                      in_buffer.dtype)
        return half_size
Ejemplo n.º 9
0
def _do_cp_input_gm(input_gm, data_len, offset, params):
    """
    Copy data from input_gm into params.input_ub, then emit a PIPE_ALL barrier.

    :param input_gm: gm input buf
    :param data_len : number of elements to copy
    :param offset: gm offset
    :param params : parameters
    """
    ub_dst = params.input_ub.access_ptr("rw", offset=0)
    gm_src = input_gm.access_ptr("r", offset=offset)
    # copy length expressed in units of cp_align_len elements
    burst_len = _ceil_div(data_len, params.cp_align_len)
    copy_call = tvm.call_extern(params.dtype, 'copy_gm_to_ubuf', ub_dst,
                                gm_src, 0, 1, burst_len, 0, 0)
    params.ib_.emit(copy_call)

    params.set_pipe_barrier('PIPE_ALL')
Ejemplo n.º 10
0
def _special_ir(dst, data):
    """
    Build IR that moves data to dst in c_i // 16 loop steps through data_ub.

    Each step loads one contiguous piece of data into data_ub and writes it
    to dst as n_i bursts separated by a stride.

    :param dst: 4-D destination buffer; only its first two dims are used
    :param data: source buffer
    :return: the statement produced by the IR builder
    """
    tvm_ib = tvm.ir_builder.create()
    elem_bytes = cce.cce_intrin.get_bit_len(data.dtype) // 8
    cp_align_len = cce_params.BLOCK_REDUCE_INT8 // elem_bytes

    n_i, c_i, _, _ = dst.shape
    c_0 = 16
    # n_i passed through _ceil_fill with c_0 (presumably rounded up - confirm)
    n_true = _ceil_fill(n_i, c_0)
    ub_max = 3968 * 16

    data_ub = _new_alloc(tvm_ib,
                         dst.dtype,
                         ub_max,
                         "data_ub",
                         scope=cce.scope_ubuf)

    # none of these depend on the loop variable
    load_burst = n_i * c_0 // cp_align_len
    store_burst = c_0 // cp_align_len
    dst_stride = (c_i - c_0) // cp_align_len
    loop = c_i // c_0

    with tvm_ib.for_range(0, loop, name="n_loop") as n_loop:
        src_offset = n_loop * c_0 * n_true
        tvm_ib.emit(
            tvm.call_extern(data_ub.dtype, "copy_gm_to_ubuf",
                            data_ub.access_ptr("w", offset=0),
                            data.access_ptr('r', offset=src_offset),
                            0, 1, load_burst, 0, 0))

        out_offset = n_loop * c_0
        tvm_ib.emit(
            tvm.call_extern(dst.dtype, "copy_ubuf_to_gm",
                            dst.access_ptr('w', offset=out_offset),
                            data_ub.access_ptr("r", offset=0),
                            0, n_i, store_burst, 0, dst_stride))

    return tvm_ib.get()
Ejemplo n.º 11
0
    def _core_func(out_begin, out_end, element_num_of_core):
        """
        Process the elements of one core and copy the result to output_gm.

        :param out_begin : multi core output begin address
        :param out_end : multi core output end address
        :param element_num_of_core : element num of one core
        """
        # the core ending at total_element only handles the remainder, unless
        # a single core covers everything
        core_cal_num = (
            element_num_of_core
            if out_end != total_element or element_num_of_core == total_element
            else total_element % element_num_of_core)

        # nothing to do for this core
        if core_cal_num == 0:
            return

        ib_ = params.ib_

        _do_vector_dup((params.output_ub, 0), output_data_len, params.dtype,
                       params)
        _do_cp_input_gm(input_gm, input_data_len, 0, params)

        with ib_.for_range(0, core_cal_num, for_type="serial",
                           name="i") as i:
            i_tmp = out_begin + i
            # split the global index into a c1 part and an h*w part
            c1_index = i_tmp // (hight * weight)
            i_hw = i_tmp % (hight * weight)
            with ib_.for_range(0, channel0, for_type="serial",
                               name="c0_index") as c0_index:
                # only positions that map to an existing channel are written
                with ib_.if_scope(channel0 * (c1_index) + c0_index < channel):
                    output_offset = i * channel0 * channel0 + c0_index * channel0 + c0_index
                    value = params.input_ub.vload(i_hw * channel +
                                                  channel0 * c1_index +
                                                  c0_index)
                    ib_.emit(params.output_ub.vstore(output_offset, value))

        num_cp = _ceil_div((core_cal_num * channel0 * channel0),
                           params.cp_align_len)
        gm_dst = output_gm.access_ptr("rw",
                                      offset=out_begin * channel0 * channel0)
        ub_src = params.output_ub.access_ptr("r", offset=0)
        ib_.emit(
            tvm.call_extern(params.dtype, 'copy_ubuf_to_gm', gm_dst, ub_src,
                            0, 1, num_cp, 0, 0))
Ejemplo n.º 12
0
def _kernel_ir(dst, src, dst_type, src_type):
    """
    Build IR that moves one value from src[0] to dst[0] through a register,
    converting it from src_type to dst_type.
    NOTICE: SCALAR ONLY

    :param dst: sequence whose first item is the output tensor
    :param src: sequence whose first item is the input tensor
    :param dst_type: dtype of the output side (b_ub and the register)
    :param src_type: dtype of the input side (a_ub)
    :return: the statement produced by the IR builder
    """
    ir_builder = tvm.ir_builder.create()
    in_tensor, out_tensor = src[0], dst[0]

    def _alloc_ub(dtype, name):
        # both staging buffers take the shape of the input tensor
        return _new_alloc(ir_builder,
                          dtype,
                          in_tensor.shape,
                          name,
                          scope=tbe_platform.scope_ubuf)

    a_ub = _alloc_ub(src_type, "a_ub")
    b_ub = _alloc_ub(dst_type, "b_ub")
    reg = ir_builder.allocate(dst_type, (1, ),
                              name='reg',
                              scope=tbe_platform.scope_reg)

    def _reg_operand():
        return tvm.call_extern(dst_type, "reg", reg[0])

    # gm -> a_ub -> reg -> b_ub -> gm
    steps = (
        tvm.call_extern(src_type, "copy_gm_to_ubuf", a_ub.access_ptr("w"),
                        in_tensor.access_ptr("r"), 0, 1, 1, 0, 0),
        tvm.call_extern(src_type, "reg_mov", _reg_operand(),
                        a_ub.access_ptr('r', offset=0)),
        tvm.call_extern(dst_type, "reg_mov", b_ub.access_ptr('w', offset=0),
                        _reg_operand()),
        tvm.call_extern(dst_type, "copy_ubuf_to_gm",
                        out_tensor.access_ptr('w'), b_ub.access_ptr("r"),
                        0, 1, 1, 0, 0),
    )
    for step in steps:
        ir_builder.emit(step)

    return ir_builder.get()
Ejemplo n.º 13
0
    def get_block_offset_one_core(self):
        """Set the output range of the current core and, on Ascend310,
        load the matching start value into self.reg[6].

        self.out_begin and self.out_end are set to
        block.var * out_num_per_core and that value plus out_num_per_core.
        On Ascend310 the block index is additionally converted through
        int32 -> float16 -> float32 vector casts, scaled and shifted in
        self.index_ub, and its first element is loaded into self.reg[6].

        Parameters
        ----------
        self : self

        Returns
        -------
        None
        """
        self.out_begin = self.block.var * tvm.const(self.out_num_per_core,
                                                    "int32")
        self.out_end = \
            self.block.var*tvm.const(self.out_num_per_core, "int32") \
            + tvm.const(self.out_num_per_core, "int32")
        # on Ascend310 the block index is converted with vector casts
        # (vconv_deq, then vconv_f162f32) and stored in index_ub
        if self.compile_plat in ("Ascend310", ):
            # initialise index_ub with the value 1
            kernel_api.kernel_vector_dup_fuc(self.ir_builder,
                                             [self.index_ub, 0], 1,
                                             [1, self.mid_vec_align_len])
            # scratch buffers for the int32 -> float16 conversion
            int_ub = kernel_api.ib_new_alloc(self.ir_builder,
                                             "int32", [8],
                                             "int_ub",
                                             scope=tbe_platform.scope_ubuf)
            fp16_ub = kernel_api.ib_new_alloc(self.ir_builder,
                                              "float16", [16],
                                              "fp16_ub",
                                              scope=tbe_platform.scope_ubuf)
            int_reg = self.ir_builder.allocate("int32", (1, ),
                                               name="int_data",
                                               scope=cce_params.scope_reg)
            self.ir_builder.emit(
                tvm.call_extern('int32', 'pipe_barrier', self.args_str))
            # only cores whose first output index is within nbins do the work
            with self.ir_builder.if_scope(self.out_begin <= self.nbins):
                int_reg[0] = self.block.var
                self.ir_builder.emit(
                    tvm.call_extern('int32', 'pipe_barrier', self.args_str))
                # broadcast the block index into int_ub
                kernel_api.kernel_vector_dup_fuc(self.ir_builder, [int_ub, 0],
                                                 int_reg[0], [1, 64])
                # address list pairs fp16_ub with int_ub
                # (presumably [destination, source] - confirm in kernel_api)
                _addr_list = [[fp16_ub, 0], [int_ub, 0]]
                self.ir_builder.emit(
                    tvm.call_extern('int32', 'set_deqscale', self.deqscale))
                kernel_api.kernel_cast_to_fuc(self.ir_builder, _addr_list,
                                              [1, 64], "vconv_deq")
                _addr_list = [[self.index_ub, 0], [fp16_ub, 0]]
                # fp16 to fp32 (vconv_f162f32)
                kernel_api.kernel_cast_to_fuc(self.ir_builder, _addr_list,
                                              [1, self.mid_vec_align_len],
                                              "vconv_f162f32")
                # index_ub = index_ub * out_num_per_core * reg[3] + reg[0],
                # applied in place as two vmuls followed by one vadds
                kernel_api.kernel_scalar_to_one_fuc(
                    self.ir_builder, [[self.index_ub, 0], [self.index_ub, 0]],
                    [1, self.mid_vec_align_len],
                    ["vmuls", self.out_num_per_core])
                kernel_api.kernel_scalar_to_one_fuc(
                    self.ir_builder, [[self.index_ub, 0], [self.index_ub, 0]],
                    [1, self.mid_vec_align_len], ["vmuls", self.reg[3]])
                kernel_api.kernel_scalar_to_one_fuc(
                    self.ir_builder, [[self.index_ub, 0], [self.index_ub, 0]],
                    [1, self.mid_vec_align_len], ["vadds", self.reg[0]])
                self.ir_builder.emit(
                    tvm.call_extern('int32', 'pipe_barrier', self.args_str))
                # read the first element of index_ub back into reg[6]
                self.reg[6] = self.index_ub.vload(0, self.mid_dtype)
Ejemplo n.º 14
0
def custom_truncatemod(shape1, shape2, dtype, kernel_name="cce_tf_truncatemod",
                       need_build=False, need_print=False):
    """
    do element-wise truncatemod operation between two input tensors

    Parameters:
    ----------
    shape1 : shape of input data1

    shape2 : shape of input data2

    dtype : source data type, support float16,float32,int32

    kernel_name : cce kernel name, default value is "cce_tf_truncatemod"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if either shape has more than 8 dimensions, or dtype is not one of
        float16, float32, int32
    """
    max_dim = 8
    shape1_len = len(shape1)
    shape2_len = len(shape2)
    if shape1_len > max_dim or shape2_len > max_dim:
        # adjacent literals instead of a backslash inside the string, so the
        # source indentation does not leak into the message
        raise RuntimeError(
            "mod_cce only support up to %d dimensions while the shape's "
            "dimensions is %d, %d" % (max_dim, shape1_len, shape2_len))
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape1)
    util.check_shape_rule(shape2)

    util.check_shape_size(shape1, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape2, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    device_api_map = {"float16": "cc_device_truncatemod_float16",
                      "float32": "cc_device_truncatemod_float",
                      "int32": "cc_device_truncatemod_int32"}

    inp_dtype = dtype.lower()
    if inp_dtype not in check_list:
        raise RuntimeError(
            "tf_truncatemod_cce only support %s while dtype is %s" % (
                ",".join(check_list), inp_dtype))

    # the two input shapes are replaced by the shapes returned from
    # produce_shapes, together with the output shape
    shape1, shape2, shape_out = util.produce_shapes(shape1, shape2)
    util.check_shape_size(shape_out, SHAPE_SIZE_LIMIT)

    device_api = device_api_map[inp_dtype]

    # block
    block_num = "block_num"
    block_idx = "block_idx"
    # x param
    v_xndim_cnt = tvm.const(len(shape1), "int32")
    p_xshape = util.create_param_ptr(shape1, "int32", "p_xshape")
    xpad_c0 = tvm.const(0, "int32")
    data_input_x = tvm.placeholder(shape1, name="data_input_x",
                                   dtype=inp_dtype)
    # y param
    v_yndim_cnt = tvm.const(len(shape2), "int32")
    p_yshape = util.create_param_ptr(shape2, "int32", "p_yshape")
    ypad_c0 = tvm.const(0, "int32")
    data_input_y = tvm.placeholder(shape2, name="data_input_y",
                                   dtype=inp_dtype)
    # output: gets its own name so it no longer shares "p_yshape" with the
    # y-shape pointer
    v_out_ndim_cnt = tvm.const(len(shape_out), "int32")
    p_out_shape = util.create_param_ptr(shape_out, "int32", "p_out_shape")
    out_padc0 = tvm.const(0, "int32")

    output = tvm.extern(shape_out,
                        [p_xshape, data_input_x, p_yshape, data_input_y,
                         p_out_shape], lambda ins, outs:
                        tvm.call_extern("int32_t", device_api,
                                        block_num,
                                        block_idx,
                                        v_xndim_cnt,
                                        ins[0].access_ptr("r"),  # shape x
                                        xpad_c0,
                                        ins[1].access_ptr("r"),  # input x
                                        v_yndim_cnt,
                                        ins[2].access_ptr("r"),  # shape y
                                        ypad_c0,
                                        ins[3].access_ptr("r"),  # input y
                                        v_out_ndim_cnt,
                                        ins[4].access_ptr("r"),  # shape out
                                        out_padc0,
                                        outs[0].access_ptr("w")),
                        name="output", dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    # print IR
    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input_x, data_input_y, output],
                            simple_mode=True))
    # Compile to generate the cce file
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input_x, data_input_y, output], "cce",
                      name=kernel_name)
Ejemplo n.º 15
0
def custom_round(shape,
                 dtype,
                 kernel_name="cce_round",
                 need_build=False,
                 need_print=False):
    """
    doing round operations, calculating data type is float16 or float32 or int32

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype

    kernel_name : cce kernel name, default value is "cce_round"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if shape has more than 8 dimensions or dtype is not supported
    """
    check_list = ["float16", "float32", "int32"]
    device_api_map = {
        "float16": "cc_device_round_float16",
        "float32": "cc_device_round_float",
        "int32": "cc_device_round_int32"
    }

    # reject shapes with too many dimensions before any other validation
    max_dim = 8
    rank = len(shape)
    if rank > max_dim:
        raise RuntimeError(
            "round_cce only support up to %d dimensions while the shape's dimension is %d"
            % (max_dim, rank))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    if inp_dtype not in check_list:
        raise RuntimeError("round_cce only support %s while dtype is %s" %
                           (",".join(check_list), dtype))

    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    device_api = device_api_map[inp_dtype]

    block_num = "block_num"
    block_idx = "block_idx"
    ndim_const = tvm.const(len(shape), "int32")
    pad_c0 = tvm.const(0, "int32")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    def _device_call(ins, outs):
        # ins[0] is the input data, ins[1] the shape parameter pointer
        return tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            ndim_const,
            ins[1].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w"))

    output = tvm.extern(shape,
                        [data_input, p_shape],
                        _device_call,
                        name="output",
                        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)
    tensors = [data_input, output]

    if need_print:
        with build_config:
            print(tvm.lower(schedule, tensors, simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, tensors, "cce", name=kernel_name)
Ejemplo n.º 16
0
def custom_Power(shape,
                 dtype,
                 gamma,
                 alpha,
                 beta,
                 kernel_name="cce_caffe_power",
                 need_build=False,
                 need_print=False):
    """
    calculate (alpha * data + beta) ** gamma, calculation method
    exp(gamma * log(alpha * data + beta)).
    when alpha * data + beta < 0 , the output is a meaningless value.

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32

    gamma : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    alpha : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    beta : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    kernel_name : string
        kernel name in generated CCE kernel. default value is "cce_caffe_power"

    need_build : bool
        if need to build CCEC kernel

    need_print : bool
        if need to print Halide IR

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if dtype is not float16 or float32
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    if inp_dtype not in supported_dtypes:
        raise RuntimeError("power_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    ndim_x = len(shape)
    # there is no second operand: its arguments are zeros and "nullptr"
    ndim_y = 0
    shape_y = 0
    input_y = "nullptr"
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0

    # scale --> alpha, shift --> beta, power --> gamma
    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([gamma], inp_dtype, "p_power")
    p_shape_x = util.create_param_ptr(shape, "int32", "p_shape_x")

    def _device_call(ins, outs):
        # ins order: input x, scale, shift, power, shape of x
        # NOTE(review): ndim_y is passed twice in a row below, exactly as in
        # the existing call - confirm against the cc_device_pow signature
        return tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # power
            ndim_x,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            ndim_y,
            ndim_y,
            shape_y,
            pad_c0,
            input_y,
            outs[0].access_ptr("w"))

    output = tvm.extern(shape,
                        [data_input, p_scale, p_shift, p_power, p_shape_x],
                        _device_call,
                        name="output",
                        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)
    tensors = [data_input, output]

    if need_print:
        with build_config:
            print(tvm.lower(schedule, tensors, simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, tensors, "cce", name=kernel_name)
Ejemplo n.º 17
0
def _do_operation(ir_builder, place_holders, plantform_paras, loops_remains,
                  const_1, block_offset, shape_each_core, num_remain_by_128,
                  is_not_align):
    """
    Emit the IR that processes the slice of data starting at block_offset.

    The slice is handled in two phases:
    1. loops_remains[0] full iterations of plantform_paras[0] elements each;
    2. when loops_remains[2] is truthy, one extra pass over the remaining
       elements.
    In each phase the data, the mask and the keep_prob value are copied from
    place_holders[0], [1] and [3] into unified-buffer allocations, the data
    is multiplied in place by the scalar held in reg[0] ('vmuls'), the mask
    is applied through 'set_cmpmask' + _sel_data, and the result is copied
    to place_holders[2].

    Parameters
    ----------
    ir_builder: IR builder used for allocate / emit / for_range
    place_holders: sequence of buffers
        [0] input data (its dtype selects the float16 / float32 path),
        [1] mask, [2] output, [3] keep_prob
    plantform_paras: sequence
        [0] number of data elements handled per full iteration,
        [1], [2] factors used to compute the tail offsets inside the data
        and mask buffers
    loops_remains: list, MODIFIED IN PLACE
        [0] number of full iterations, [1] select-loop count per iteration,
        [2] truthy when a remainder pass is needed;
        [1] and [3] are overwritten in the remainder pass
    const_1: object whose dtype is used for the constant-one buffer and for
        the mask resets
    block_offset: element offset of this slice in the data; the mask offset
        is block_offset // 8
    shape_each_core: element count used to size the remainder buffers in
        the aligned case
    num_remain_by_128: when non-zero together with is_not_align, the
        remainder size is derived from the placeholder shapes instead
    is_not_align: see num_remain_by_128

    Returns
    -------
    None
    """
    # alloc_res (7 entries):
    #   0: data_zero_ub            1: data_fp16one_ub
    #   2: data_fp16_all1_mask_ub  3: data_fp32one_ub
    #   4: data_tensor_ub          5: data_mask_ub
    #   6: keep_prob_tensor_ub
    # offsets (6 entries):
    #   0: total_gm_data_offset  1: total_gm_mask_offset
    #   2: offset_gm_data        3: offset_gm_mask
    #   4: total_ub_data_offset  5: total_ub_mask_offset
    # repeates (6 entries):
    #   0: repeate_ub_data  1: repeate_ub_mask  2: repeate_ub_vector
    #   3: repeate_d        4: repeate_m        5: repeate_v

    reg = ir_builder.allocate(place_holders[0].dtype, (1, ),
                              name="reg",
                              scope=tbe_platform.scope_reg)
    [alloc_res, offsets, repeates] = [[None] * 7, [0] * 6, [0] * 6]
    [offsets[0],
     offsets[1]] = [offsets[0] + block_offset, offsets[1] + block_offset // 8]

    # The float32 path needs three extra scratch buffers (used by _sel_data);
    # the float16 path only needs the zero buffer.
    [alloc_res[0], alloc_res[1], alloc_res[2], alloc_res[3]] = [
        _new_alloc(ir_builder,
                   'float16', (ELEMS_BATCH_PROCESS_FP16, ),
                   "data_zero_ub",
                   scope=tbe_platform.scope_ubuf),
        _new_alloc(ir_builder,
                   'float16', (ELEMS_BATCH_PROCESS_FP16, ),
                   "data_fp16one_ub",
                   scope=tbe_platform.scope_ubuf),
        _new_alloc(ir_builder,
                   'float16', (ELEMS_BATCH_PROCESS_FP16, ),
                   "data_fp16_all1_mask_ub",
                   scope=tbe_platform.scope_ubuf),
        _new_alloc(ir_builder,
                   'float32', (ELEMS_BATCH_PROCESS_FP16, ),
                   "data_fp32one_ub",
                   scope=tbe_platform.scope_ubuf)
    ] if (place_holders[0].dtype == 'float32') else [
        _new_alloc(ir_builder,
                   'float16', (ELEMS_BATCH_PROCESS_FP16, ),
                   "data_zero_ub",
                   scope=tbe_platform.scope_ubuf), None, None, None
    ]

    # Buffers for the full iterations; only allocated when there is at least
    # one full iteration.
    [alloc_res[4], alloc_res[5], alloc_res[6]] = [
        _new_alloc(ir_builder,
                   place_holders[0].dtype, (plantform_paras[0], ),
                   "data_tensor_ub",
                   scope=tbe_platform.scope_ubuf),
        _new_alloc(ir_builder,
                   place_holders[1].dtype, (plantform_paras[0] // 8, ),
                   "data_mask_ub",
                   scope=tbe_platform.scope_ubuf),
        _new_alloc(ir_builder,
                   place_holders[3].dtype, (1, ),
                   "keep_prob_tensor_ub",
                   scope=tbe_platform.scope_ubuf)
    ] if (loops_remains[0] > 0) else [None, None, None]
    const_buf = _new_alloc(ir_builder,
                           const_1.dtype, (ELEMS_BATCH_PROCESS_FP16, ),
                           "const_1_ub",
                           scope=tbe_platform.scope_ubuf)
    if loops_remains[0] > 0:
        with ir_builder.for_range(0, loops_remains[0],
                                  name='index0') as index0:
            # Offsets of the current iteration inside the data / mask inputs.
            [offsets[2], offsets[3]] = [
                block_offset + plantform_paras[0] * index0,
                block_offset // 8 + plantform_paras[0] // 8 * index0
            ]
            # 16: number of fp16 elements moved per copy burst
            # 32: number of uint8 elements moved per copy burst
            # 64: number of fp32 elements handled per vector repeat
            # (the fp32 copy burst of 8 elements follows the same scheme)
            [repeates[0], repeates[1], repeates[2]] = [
                plantform_paras[0] // 16, plantform_paras[0] // 8 //
                32, plantform_paras[0] // ELEMS_BATCH_PROCESS_FP16
            ] if (place_holders[0].dtype == 'float16') else [
                plantform_paras[0] // 8, plantform_paras[0] // 8 //
                32, plantform_paras[0] // 64
            ]

            # Fill the constant buffers: zeros, ones (and fp16 ones for the
            # float32 path).
            ir_builder.emit(
                tvm.call_extern('float16', "vector_dup",
                                alloc_res[0].access_ptr("rw"),
                                tvm.const(0.0,
                                          dtype='float16'), 1, 1, 1, 8, 8))
            ir_builder.emit(
                tvm.call_extern(const_1.dtype, "vector_dup",
                                const_buf.access_ptr("rw"),
                                tvm.const(1.0, dtype=const_1.dtype), 1, 1, 1,
                                8, 8))

            if place_holders[0].dtype == 'float32':
                ir_builder.emit(
                    tvm.call_extern('float16', "vector_dup",
                                    alloc_res[1].access_ptr("rw"),
                                    tvm.const(1.0,
                                              dtype='float16'), 1, 1, 1, 8, 8))

            # Load mask, data and keep_prob into the unified buffer.
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[1].dtype, "copy_gm_to_ubuf",
                    alloc_res[5].access_ptr("w"),
                    place_holders[1].access_ptr("r", offset=offsets[3]), 0, 1,
                    repeates[1], 0, 0))

            ir_builder.emit(
                tvm.call_extern(
                    place_holders[0].dtype, "copy_gm_to_ubuf",
                    alloc_res[4].access_ptr("w"),
                    place_holders[0].access_ptr("r", offset=offsets[2]), 0, 1,
                    repeates[0], 0, 0))
            ir_builder.emit(
                tvm.call_extern(place_holders[3].dtype, "copy_gm_to_ubuf",
                                alloc_res[6].access_ptr("w"),
                                place_holders[3].access_ptr("r", offset=0), 0,
                                1, 1, 0, 0))
            # With a one-element mask, divide the ones buffer by keep_prob
            # (presumably yielding 1 / keep_prob in alloc_res[6][0]).
            cce_intrin_md.reset_mask_insn(ir_builder,
                                          const_1.dtype,
                                          bits=1,
                                          mask_func=None)
            ir_builder.emit(
                tvm.call_extern(place_holders[3].dtype, 'vdiv',
                                alloc_res[6].access_ptr('w'),
                                const_buf.access_ptr('r'),
                                alloc_res[6].access_ptr('r'), 1, 1, 1, 1, 8, 8,
                                8))

            # Restore the full-width mask and move the scalar into reg[0].
            cce_intrin_md.reset_mask_insn(ir_builder,
                                          const_1.dtype,
                                          bits=ELEMS_BATCH_PROCESS_FP16,
                                          mask_func=None)
            ir_builder.emit(
                tvm.call_extern(place_holders[3].dtype, "reg_mov",
                                tvm.call_extern(reg.dtype, "reg", reg[0]),
                                alloc_res[6].access_ptr("r", offset=0)))

            # A single vmuls is issued with at most 255 repeats, so the
            # buffer is scaled in chunks of 255 repeats plus one tail call.
            offset_src = 64 * 255 if place_holders[
                0].dtype == "float32" else 128 * 255
            repeate_vmuls = repeates[2] // 255
            repeat_left = repeates[2] % 255
            for i in range(repeate_vmuls):
                # Source and destination use the same chunk offset so that
                # every chunk is scaled in place from its own data.
                ir_builder.emit(
                    tvm.call_extern(
                        place_holders[0].dtype, 'vmuls',
                        alloc_res[4].access_ptr('w', offset=offset_src * i),
                        alloc_res[4].access_ptr('r', offset=offset_src * i),
                        reg[0], 255, 1, 1, 8, 8))
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[0].dtype, 'vmuls',
                    alloc_res[4].access_ptr('w',
                                            offset=offset_src * repeate_vmuls),
                    alloc_res[4].access_ptr('r',
                                            offset=offset_src * repeate_vmuls),
                    reg[0], repeat_left, 1, 1, 8, 8))

            # Apply the mask batch by batch.
            with ir_builder.for_range(0, loops_remains[1],
                                      name='index1') as index1:
                ir_builder.emit(
                    tvm.call_extern(
                        place_holders[1].dtype, 'set_cmpmask',
                        alloc_res[5].access_ptr('r', offset=16 * index1)))
                _sel_data(ir_builder, place_holders[0], alloc_res,
                          ELEMS_BATCH_PROCESS_FP16 * index1)

            # Write the finished iteration to the output.
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[2].dtype, "copy_ubuf_to_gm",
                    place_holders[2].access_ptr('w', offset=offsets[2]),
                    alloc_res[4].access_ptr("r"), 0, 1, repeates[0], 0, 0))

        # Advance the running offsets past all full iterations.
        [offsets[0], offsets[1]] = [
            offsets[0] + plantform_paras[0] * loops_remains[0],
            offsets[1] + plantform_paras[0] * loops_remains[0] // 8
        ]

    if loops_remains[2]:
        # remain_shapes: 0: data_shape  1: mask_shape
        if num_remain_by_128 != 0 and is_not_align:
            remain_shapes = ((int(place_holders[0].shape[0]) -
                              plantform_paras[0] * loops_remains[0], ),
                             (int(place_holders[1].shape[0]) -
                              plantform_paras[0] // 8 * loops_remains[0], ))
        else:
            remain_shapes = ((shape_each_core -
                              plantform_paras[0] * loops_remains[0], ),
                             (shape_each_core // 8 -
                              plantform_paras[0] // 8 * loops_remains[0], ))
        # Re-allocate the working buffers sized for the remainder.
        [alloc_res[4], alloc_res[5], alloc_res[6]] = [
            _new_alloc(ir_builder,
                       place_holders[0].dtype,
                       remain_shapes[0],
                       "data_tensor_ub",
                       scope=tbe_platform.scope_ubuf),
            _new_alloc(ir_builder,
                       place_holders[1].dtype,
                       remain_shapes[1],
                       "data_mask_ub",
                       scope=tbe_platform.scope_ubuf),
            _new_alloc(ir_builder,
                       place_holders[3].dtype, (1, ),
                       "keep_prob_tensor_ub",
                       scope=tbe_platform.scope_ubuf)
        ]

        # Copy bursts are rounded up; the vector repeat count is rounded
        # down (the leftover elements are handled by the tail code below).
        [repeates[3], repeates[4], repeates[5]] = [
            int(math.ceil(remain_shapes[0][0] * 1.0 / 8)),
            int(math.ceil(remain_shapes[1][0] * 1.0 / 32)),
            int(remain_shapes[0][0] * 1.0 / 64)
        ] if (place_holders[0].dtype == 'float32') else [
            int(math.ceil(remain_shapes[0][0] * 1.0 / 16)),
            int(math.ceil(remain_shapes[1][0] * 1.0 / 32)),
            int(remain_shapes[0][0] * 1.0 / ELEMS_BATCH_PROCESS_FP16)
        ]

        ir_builder.emit(
            tvm.call_extern('float16', "vector_dup",
                            alloc_res[0].access_ptr("rw"),
                            tvm.const(0.0, dtype='float16'), 1, 1, 1, 8, 8))
        ir_builder.emit(
            tvm.call_extern(const_1.dtype, "vector_dup",
                            const_buf.access_ptr("rw"),
                            tvm.const(1.0, dtype=const_1.dtype), 1, 1, 1, 8,
                            8))

        if place_holders[0].dtype == 'float32':
            ir_builder.emit(
                tvm.call_extern('float16', "vector_dup",
                                alloc_res[1].access_ptr("rw"),
                                tvm.const(1.0,
                                          dtype='float16'), 1, 1, 1, 8, 8))
        ir_builder.emit(
            tvm.call_extern(
                place_holders[1].dtype, "copy_gm_to_ubuf",
                alloc_res[5].access_ptr("w"),
                place_holders[1].access_ptr("r", offset=offsets[1]), 0, 1,
                repeates[4], 0, 0))

        ir_builder.emit(
            tvm.call_extern(
                place_holders[0].dtype, "copy_gm_to_ubuf",
                alloc_res[4].access_ptr("w"),
                place_holders[0].access_ptr("r", offset=offsets[0]), 0, 1,
                repeates[3], 0, 0))

        ir_builder.emit(
            tvm.call_extern(place_holders[3].dtype, "copy_gm_to_ubuf",
                            alloc_res[6].access_ptr("w"),
                            place_holders[3].access_ptr("r", offset=0), 0, 1,
                            1, 0, 0))
        cce_intrin_md.reset_mask_insn(ir_builder,
                                      const_1.dtype,
                                      bits=1,
                                      mask_func=None)

        ir_builder.emit(
            tvm.call_extern(place_holders[3].dtype, 'vdiv',
                            alloc_res[6].access_ptr('w'),
                            const_buf.access_ptr('r'),
                            alloc_res[6].access_ptr('r'), 1, 1, 1, 1, 8, 8, 8))

        cce_intrin_md.reset_mask_insn(ir_builder,
                                      const_1.dtype,
                                      bits=ELEMS_BATCH_PROCESS_FP16,
                                      mask_func=None)
        ir_builder.emit(
            tvm.call_extern(place_holders[0].dtype, "reg_mov",
                            tvm.call_extern(reg.dtype, "reg", reg[0]),
                            alloc_res[6].access_ptr("r", offset=0)))

        offset_src = 64 * 255 if place_holders[
            0].dtype == "float32" else 128 * 255
        repeate_vmuls = repeates[5] // 255
        repeat_left = repeates[5] % 255
        for i in range(repeate_vmuls):
            # Same chunking as in the full iterations: source and
            # destination share the chunk offset.
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[0].dtype, 'vmuls',
                    alloc_res[4].access_ptr('w', offset=offset_src * i),
                    alloc_res[4].access_ptr('r', offset=offset_src * i),
                    reg[0], 255, 1, 1, 8, 8))
        ir_builder.emit(
            tvm.call_extern(
                place_holders[0].dtype, 'vmuls',
                alloc_res[4].access_ptr('w',
                                        offset=offset_src * repeate_vmuls),
                alloc_res[4].access_ptr('r',
                                        offset=offset_src * repeate_vmuls),
                reg[0], repeat_left, 1, 1, 8, 8))

        # Split the remainder into whole vector batches and a partial tail.
        remains_divs = ELEMS_BATCH_PROCESS_FP16 if place_holders[0].dtype == 'float16' \
            else 64
        [loops_remains[1], loops_remains[3]] = [
            remain_shapes[0][0] // remains_divs,
            remain_shapes[0][0] % remains_divs
        ]

        loops = ((loops_remains[1]) // 2) + 1 if place_holders[0].dtype == 'float32' \
            else loops_remains[1]
        with ir_builder.for_range(0, loops, name='index2') as index2:
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[1].dtype, 'set_cmpmask',
                    alloc_res[5].access_ptr('r', offset=16 * index2)))
            _sel_data(ir_builder, place_holders[0], alloc_res,
                      ELEMS_BATCH_PROCESS_FP16 * index2)

        # Offsets of the partial tail inside the data / mask buffers; the
        # formula is the same for both dtypes.
        [offsets[4], offsets[5]] = [
            plantform_paras[1] * loops_remains[1],
            plantform_paras[2] * loops_remains[1]
        ]

        if loops_remains[3]:
            # Narrow the vector mask to the leftover element count and scale
            # the tail with a single repeat.
            cce_intrin_md.reset_mask_insn(ir_builder,
                                          place_holders[0].dtype,
                                          bits=loops_remains[3],
                                          mask_func=None)

            ir_builder.emit(
                tvm.call_extern(
                    place_holders[0].dtype, 'vmuls',
                    alloc_res[4].access_ptr('w', offset=offsets[4]),
                    alloc_res[4].access_ptr('r', offset=offsets[4]), reg[0], 1,
                    1, 1, 8, 8))

            if place_holders[0].dtype == 'float16' or loops_remains[1] == 0:
                ir_builder.emit(
                    tvm.call_extern(
                        place_holders[1].dtype, 'set_cmpmask',
                        alloc_res[5].access_ptr('r', offset=offsets[5])))
                _sel_data(ir_builder, place_holders[0], alloc_res, offsets[4])
                cce_intrin_md.reset_mask_insn(ir_builder,
                                              place_holders[0].dtype,
                                              bits=ELEMS_BATCH_PROCESS_FP16,
                                              mask_func=None)

        # Write the remainder to the output.
        ir_builder.emit(
            tvm.call_extern(
                place_holders[2].dtype, "copy_ubuf_to_gm",
                place_holders[2].access_ptr('w', offset=offsets[0]),
                alloc_res[4].access_ptr("r"), 0, 1, repeates[3], 0, 0))
Ejemplo n.º 18
0
def _func_more_row(args):
    """
    Emit the IR that moves several source rows handled by one core from
    `data` to `dst`.

    Rows are first copied into `data_ub` with _func_gm_to_ub, then
    rearranged element by element (through `reg`, using 'reg_mov') into
    `data_res`, and finally handed to _res_to_gm_more_row.

    Two runtime cases are emitted:
    - the rows of this core fit into the c0 blocks left at the current
      h*w position (num_row_cur_core <= num_c0_dst_cur_hw);
    - otherwise the rows are split into a head part (finishing the current
      h*w position), num_g_mid complete groups of c_1 rows, and a tail part.

    Parameters
    ----------
    args: tuple
        (tvm_ib, param, data, dst, data_ub, data_res, data_tail, reg,
        reg_addr, num_g, num_row_cur_core, c_0).
        `data` is 4-D with shape (_, n_no, n_ni, c_0) and `dst` is 4-D with
        shape (h_i, w_i, c_i, n_i). `param` is read with .get() for the keys
        "num_row_one_group", "block_index", "num_row_one_core" and
        "cp_align_len".

    Returns
    -------
    None
    """
    tvm_ib, param, data, dst, data_ub, data_res, data_tail, reg, reg_addr,\
    num_g, num_row_cur_core, c_0 = args

    # c_0 taken from args is overwritten here by the last dim of data.shape.
    _, n_no, n_ni, c_0 = data.shape
    # Number of elements in one source row.
    row_ele = n_no*n_ni*c_0
    h_i, w_i, c_i, n_i = dst.shape
    # Number of c_0-sized blocks needed to cover c_i.
    c_1 = _ceil_div(c_i, c_0)
    h_w = h_i*w_i
    # Rows handled before this core in the current group layout.
    num_row_before_core = num_g*param.get("num_row_one_group")\
                          + param.get("block_index")\
                          * param.get("num_row_one_core")
    # Split the row index into (h*w position, c0 block inside that position).
    num_hw_dst_before_core = num_row_before_core // c_1
    num_c0_dst_cur_hw_before = num_row_before_core % c_1
    # c0 blocks still open at the current h*w position.
    num_c0_dst_cur_hw = c_1 - num_c0_dst_cur_hw_before
    # Number of reg entries used per batch in the unrolled path below.
    reg_count = 8

    # Case 1: all rows of this core belong to the current h*w position.
    with tvm_ib.if_scope(num_row_cur_core <= num_c0_dst_cur_hw):
        data_offset = num_c0_dst_cur_hw_before*h_w*row_ele\
                      + num_hw_dst_before_core*row_ele
        n_burst = num_row_cur_core
        burst_len_data = _ceil_div(row_ele, param.get("cp_align_len"))
        src_stride = _ceil_div((h_w - 1)*row_ele, param.get("cp_align_len"))
        args = tvm_ib, param, data, data_ub, data_offset, 0, n_burst,\
               burst_len_data, src_stride, 0
        _func_gm_to_ub(args)

        # Valid channel count of the first c0 block (clipped at c_i).
        c_t = tvm.min((num_c0_dst_cur_hw_before + 1) * c_0, c_i)
        c_cur = c_t - (num_c0_dst_cur_hw_before*c_0)
        with tvm_ib.for_range(0, num_row_cur_core, name="num_tr") as num_tr:
            with tvm_ib.for_range(0, c_cur, name="num_c")  as num_c:
                with tvm_ib.for_range(0, n_no, name="num_no") as num_no:
                    # Valid n count of this n_ni block (clipped at n_i).
                    n_t = tvm.min((num_no + 1)*n_ni, n_i)
                    n_cur = n_t - num_no*n_ni
                    # When n_cur is a multiple of reg_count, move reg_count
                    # elements per iteration: load them all, then store them.
                    with tvm_ib.if_scope(n_cur % reg_count == 0):
                        n_cur_times_8 = n_cur // reg_count
                        reg_list = [n for n in range(reg_count)]
                        with tvm_ib.for_range(0, n_cur_times_8,
                                              name="num_nc") as num_nc:
                            for reg_idx in reg_list:
                                tvm_ib.emit(tvm.call_extern(
                                    data_ub.dtype,
                                    "reg_mov",
                                    tvm.call_extern(reg.dtype,
                                                    "reg",
                                                    reg[reg_idx]),
                                    data_ub.access_ptr(
                                        'r',
                                        offset=(num_tr * row_ele +
                                                num_no * n_ni * c_0 +
                                                (reg_idx +
                                                 num_nc * reg_count) * c_0 +
                                                num_c))
                                ))

                            for reg_idx in reg_list:
                                tvm_ib.emit(tvm.call_extern(
                                    data_res.dtype,
                                    "reg_mov",
                                    data_res.access_ptr(
                                        'w',
                                        offset=(num_tr * c_0 * n_i +
                                                num_c * n_i +
                                                num_no * n_ni +
                                                (reg_idx +
                                                 num_nc * reg_count))),
                                    tvm.call_extern(reg.dtype,
                                                    "reg",
                                                    reg[reg_idx])
                                ))
                    # Otherwise move one element at a time through reg[0].
                    with tvm_ib.else_scope():
                        with tvm_ib.for_range(0, n_cur,
                                              name="num_nc") as num_nc:
                            tvm_ib.emit(tvm.call_extern(
                                data_ub.dtype, "reg_mov",
                                tvm.call_extern(reg.dtype, "reg",
                                                reg[0]),
                                data_ub.access_ptr(
                                    'r',
                                    offset=(
                                        num_tr * row_ele +
                                        num_no * n_ni * c_0 +
                                        num_nc * c_0 + num_c))
                            ))

                            tvm_ib.emit(tvm.call_extern(
                                data_res.dtype, "reg_mov",
                                data_res.access_ptr(
                                    'w',
                                    offset=(
                                        num_tr * c_0 * n_i +
                                        num_c * n_i +
                                        num_no * n_ni +
                                        num_nc)),
                                tvm.call_extern(reg.dtype, "reg",
                                                reg[0])
                            ))

        # Total number of result elements produced by this core, clipped
        # at c_i.
        c_t = tvm.min((num_c0_dst_cur_hw_before+num_row_cur_core)*c_0, c_i)
        c_cur = c_t - (num_c0_dst_cur_hw_before * c_0)
        total_len = c_cur * n_i
        # The length is stored in reg_addr[5] and read back from there when
        # passed on (presumably to materialize the runtime value - confirm).
        reg_addr[5] = total_len
        dst_offset = num_hw_dst_before_core*c_i*n_i\
                     + num_c0_dst_cur_hw_before*c_0*n_i
        args = tvm_ib, param, dst, data_res, data_tail, reg, reg_addr, 0, 0,\
               dst_offset, reg_addr[5]
        _res_to_gm_more_row(args)
    # Case 2: the rows of this core span more than the current h*w position.
    with tvm_ib.if_scope(num_row_cur_core > num_c0_dst_cur_hw):
        # head: rows finishing the current h*w position
        # mid : num_g_mid complete groups of c_1 rows
        # tail: num_c0_tail leftover rows
        num_c0_head = num_c0_dst_cur_hw
        num_row_after = num_row_cur_core - num_c0_head
        reg_addr[2] = num_row_after
        num_g_mid = reg_addr[2] // c_1
        num_c0_tail = reg_addr[2] % c_1

        # gm to ub to ub_res
        with tvm_ib.if_scope(num_c0_head > 0):
            data_offset = num_c0_dst_cur_hw_before * h_w * row_ele\
                          + num_hw_dst_before_core * row_ele
            n_burst = num_c0_head
            burst_len_data = _ceil_div(row_ele, param.get("cp_align_len"))
            src_stride = _ceil_div((h_w - 1) * row_ele,
                                   param.get("cp_align_len"))
            args = tvm_ib, param, data, data_ub, data_offset, 0, n_burst,\
                   burst_len_data, src_stride, 0
            _func_gm_to_ub(args)
            # ub to ub_res
            with tvm_ib.for_range(0, num_c0_head, name="num_c0") as num_c0:
                # Valid channel count of this c0 block (clipped at c_i).
                c_t = tvm.min((num_c0_dst_cur_hw_before + num_c0 + 1) * c_0,
                              c_i)
                c_cur = c_t - (num_c0_dst_cur_hw_before + num_c0)*c_0
                with tvm_ib.for_range(0, c_cur, name="num_cr") as num_cr:
                    with tvm_ib.for_range(0, n_no, name="num_no") as num_no:
                        n_t = tvm.min((num_no + 1) * n_ni, n_i)
                        n_cur = n_t - num_no*n_ni
                        with tvm_ib.for_range(0, n_cur, name="num_nc")\
                                as num_nc:
                            tvm_ib.emit(tvm.call_extern(
                                data_ub.dtype, "reg_mov",
                                tvm.call_extern(reg.dtype, "reg", reg[0]),
                                data_ub.access_ptr('r',
                                                   offset=(num_c0*row_ele
                                                           + num_no*n_ni*c_0
                                                           + num_nc*c_0
                                                           + num_cr))
                            ))
                            tvm_ib.emit(tvm.call_extern(
                                data_res.dtype, "reg_mov",
                                data_res.access_ptr('w', offset=(
                                    num_c0*c_0*n_i + num_cr*n_i
                                    + num_no*n_ni + num_nc)),
                                tvm.call_extern(reg.dtype, "reg", reg[0])
                            ))

        with tvm_ib.if_scope(num_g_mid > 0):
            num_row_before_mid = num_row_before_core + num_c0_head
            reg_addr[3] = num_row_before_mid
            num_hw_dst_before_mid = reg_addr[3] // c_1
            n_burst = c_1
            burst_len_data = _ceil_div(row_ele, param.get("cp_align_len"))
            src_stride = _ceil_div((h_w - 1) * row_ele,
                                   param.get("cp_align_len"))
            # The mid rows are placed in data_ub right after the head rows.
            ub_offset_mid_begin = num_c0_head*row_ele
            with tvm_ib.for_range(0, num_g_mid, name="num_mg") as num_mg:
                data_offset = (num_hw_dst_before_mid + num_mg) * row_ele
                ub_offset = ub_offset_mid_begin + num_mg*c_1*row_ele
                args = tvm_ib, param, data, data_ub, data_offset, ub_offset,\
                       n_burst, burst_len_data, src_stride, 0
                _func_gm_to_ub(args)

            # ub to ub_res
            # Result offset where the mid part starts (just after the head
            # part's elements).
            res_offset_mid_begin = c_i*n_i - num_c0_dst_cur_hw_before*c_0*n_i
            with tvm_ib.for_range(0, num_g_mid, name="num_mg") as num_mg:
                # NOTE(review): num_ci is used as a c0-block index below
                # (multiplied by c_0 and by row_ele), yet the loop bound is
                # c_i rather than c_1; for num_ci >= c_1 the computed c_cur
                # is not positive - confirm whether c_1 was intended.
                with tvm_ib.for_range(0, c_i, name="num_ci") as num_ci:
                    c_t = tvm.min((num_ci + 1) * c_0, c_i)
                    c_cur = c_t - num_ci*c_0
                    with tvm_ib.for_range(0, c_cur, name="num_cr") as num_cr:
                        with tvm_ib.for_range(0, n_no, name="num_no") as num_no:
                            n_t = tvm.min((num_no + 1)*n_ni, n_i)
                            n_cur = n_t - num_no*n_ni
                            with tvm_ib.for_range(0, n_cur, name="num_nc")\
                                    as num_nc:
                                tvm_ib.emit(tvm.call_extern(
                                    data_ub.dtype, "reg_mov",
                                    tvm.call_extern(reg.dtype, "reg", reg[0]),
                                    data_ub.access_ptr(
                                        'r',
                                        offset=(ub_offset_mid_begin
                                                + num_mg*c_1*row_ele +
                                                num_ci*row_ele + num_no*n_ni*c_0
                                                + num_nc*c_0 + num_cr))
                                ))
                                tvm_ib.emit(tvm.call_extern(
                                    data_res.dtype, "reg_mov",
                                    data_res.access_ptr(
                                        'w',
                                        offset=(res_offset_mid_begin
                                                + num_mg*c_i*n_i
                                                + num_ci*c_0*n_i
                                                + num_cr*n_i
                                                + num_no*n_ni + num_nc)),
                                    tvm.call_extern(reg.dtype, "reg", reg[0])
                                ))

        with tvm_ib.if_scope(num_c0_tail > 0):
            num_row_before_tail = num_row_before_core + num_c0_head\
                                  + num_g_mid*c_1
            reg_addr[4] = num_row_before_tail
            num_hw_dst_before_tail = reg_addr[4] // c_1

            # The tail rows are placed in data_ub after the head and mid rows.
            ub_offset_tail_begin = (num_c0_head + num_g_mid*c_1)*row_ele
            data_offset = num_hw_dst_before_tail*row_ele
            n_burst = num_c0_tail
            burst_len_data = _ceil_div(row_ele, param.get("cp_align_len"))
            src_stride = _ceil_div((h_w - 1) * row_ele,
                                   param.get("cp_align_len"))
            args = tvm_ib, param, data, data_ub, data_offset,\
                   ub_offset_tail_begin, n_burst, burst_len_data, src_stride, 0
            _func_gm_to_ub(args)

            # ub to ub_res
            # Result offset where the tail part starts (after head and mid).
            res_offset_tail_begin = c_i*n_i - num_c0_dst_cur_hw_before*c_0*n_i\
                                    + num_g_mid*n_i*c_i
            with tvm_ib.for_range(0, num_c0_tail, name="num_tc") as num_tc:
                c_t = tvm.min((num_tc + 1) * c_0, c_i)
                c_cur = c_t - num_tc*c_0
                with tvm_ib.for_range(0, c_cur, name="num_cr") as num_cr:
                    with tvm_ib.for_range(0, n_no, name="num_no") as num_no:
                        n_t = tvm.min((num_no + 1)*n_ni, n_i)
                        n_cur = n_t - num_no*n_ni
                        with tvm_ib.for_range(0, n_cur, name="num_nc")\
                                as num_nc:
                            tvm_ib.emit(tvm.call_extern(
                                data_ub.dtype, "reg_mov",
                                tvm.call_extern(reg.dtype, "reg", reg[0]),
                                data_ub.access_ptr(
                                    'r',
                                    offset=(ub_offset_tail_begin
                                            + num_tc*row_ele
                                            + num_no*n_ni*c_0
                                            + num_nc*c_0 + num_cr))
                            ))
                            tvm_ib.emit(tvm.call_extern(
                                data_res.dtype, "reg_mov",
                                data_res.access_ptr('w', offset=(
                                    res_offset_tail_begin +
                                    num_tc*c_0*n_i + num_cr * n_i
                                    + num_no*n_ni + num_nc)),
                                tvm.call_extern(reg.dtype, "reg", reg[0])
                            ))
        # ub_res to dst
        # Total result length = head + mid + tail element counts.
        total_len = c_i*n_i - num_c0_dst_cur_hw_before*c_0*n_i\
                    + num_g_mid*n_i*c_i + num_c0_tail*c_0*n_i
        reg_addr[6] = total_len
        dst_offset = num_hw_dst_before_core*c_i*n_i\
                     + num_c0_dst_cur_hw_before*c_0*n_i
        args = tvm_ib, param, dst, data_res, data_tail, reg, reg_addr, 1, 0,\
               dst_offset, reg_addr[6]
        _res_to_gm_more_row(args)
Ejemplo n.º 19
0
def _histogram_fixed_width_ir(dst, src, nbins, shape_list):
    """
    Build the IR statement of the fixed-width histogram kernel.

    The builder runs in four stages: initialise the UB work buffers, load
    ``data_range`` and derive scalar registers from it, process ``data``
    segment by segment, and copy this core's slice of the result to ``dst``.

    Parameters
    ----------
    dst: list
        the placeholders of dst; only dst[0] is used here
    src: list
        the placeholders of src, data, data_range = src
    nbins: int
        number of histogram bins.
    shape_list: list
        the shape list of srcs, data_shape, data_range_shape = shape_list

    Returns
    -------
    stmt
        The statement returned by ``params.ir_builder.get()``.
    """
    data, data_range = src
    ib_create = tvm.ir_builder.create()
    # params init: IrParams receives the builder, the three dtypes, the three
    # shapes and nbins.
    # NOTE(review): params.ir_builder is presumably this same ib_create - confirm
    params = IrParams(ib_create, [data.dtype, data_range.dtype, dst[0].dtype],
                      [shape_list[0], shape_list[1], dst[0].shape], nbins)
    # calc out_begin and out_end  per core
    # init range0_ub: first mid_vec_align_len elements are set to SCALAR_ZERO
    kernel_api.kernel_vector_dup_fuc(
        params.ir_builder, [params.range0_ub, 0], SCALAR_ZERO,
        [params.mid_vec_align_len, params.mid_vec_align_len])
    # second part of range0_ub (offset mid_vec_align_len) is filled with
    # 2**(-126).
    # NOTE(review): 2**(-126) is the smallest positive normal float32;
    # presumably used as an epsilon - confirm against the consumer
    kernel_api.kernel_vector_dup_fuc(
        params.ir_builder, [params.range0_ub, params.mid_vec_align_len],
        2**(-126), [params.mid_vec_align_len, params.mid_vec_align_len])
    # init tensor: output tensor, len=nbins
    kernel_api.kernel_vector_dup_fuc(
        params.ir_builder, [params.des_output_ub, 0], SCALAR_ZERO,
        [params.out_num_per_core, params.output_vec_align_len])
    # the temporary output buffer is zeroed in the same way
    kernel_api.kernel_vector_dup_fuc(
        params.ir_builder, [params.des_tmp_output_ub, 0], SCALAR_ZERO,
        [params.out_num_per_core, params.output_vec_align_len])
    # copy data_range from out to ub
    kernel_api.kernel_cp_fuc(
        params.ir_builder, [[params.range_src_ub, 0], [data_range, 0]],
        [params.data_range_shape[0], params.input_align_len],
        "copy_gm_to_ubuf")
    # vconv input to float32
    # NOTE(review): this call uses input_vec_align_len while the equivalent
    # conversion in _run_fuc below uses input_align_len - confirm which one
    # is intended
    _fuction_data_conv_ir(
        [[params.src_mid_input_range_ub, 0], [params.range_src_ub, 0]], [
            params.data_range_shape[0],
            max(params.input_vec_align_len, params.mid_vec_align_len)
        ], params)
    # barrier before the scalar loads below read the converted range
    params.ir_builder.emit(
        tvm.call_extern('int32', 'pipe_barrier', params.args_str))

    # reg[0] / reg[1] hold elements 0 and 1 of the converted data_range
    params.reg[0] = params.src_mid_input_range_ub.vload(0, params.mid_dtype)
    params.reg[1] = params.src_mid_input_range_ub.vload(1, params.mid_dtype)
    # range1 - range0
    # src_mid_input_ub is first filled with reg[1], then combined with the
    # range buffer by "vsub"; the result is written back to the range buffer
    kernel_api.kernel_vector_dup_fuc(
        params.ir_builder, [params.src_mid_input_ub, 0], params.reg[1],
        [params.mid_align_len, params.mid_align_len])
    _addr_list = [[params.src_mid_input_range_ub, 0],
                  [params.src_mid_input_ub, 0],
                  [params.src_mid_input_range_ub, 0]]
    kernel_api.kernel_two_to_one_common_fuc(params.ir_builder, _addr_list,
                                            [1, params.mid_align_len], "vsub")
    params.ir_builder.emit(
        tvm.call_extern('int32', 'pipe_barrier', params.args_str))

    # reg[2] holds element 0 of the vsub result
    params.reg[2] = params.src_mid_input_range_ub.vload(0, params.mid_dtype)

    # scale the range buffer in place by 1 / nbins ("vmuls")
    _addr_list = [[params.src_mid_input_range_ub, 0],
                  [params.src_mid_input_range_ub, 0]]
    kernel_api.kernel_scalar_to_one_fuc(
        params.ir_builder, _addr_list, [1, params.mid_vec_align_len],
        ["vmuls", float(1.0 / params.nbins)])
    params.ir_builder.emit(
        tvm.call_extern('int32', 'pipe_barrier', params.args_str))
    # reg[3] holds element 0 after the scaling (presumably the bin width)
    params.reg[3] = params.src_mid_input_range_ub.vload(0, params.mid_dtype)
    # get 0-64 mask_value for set_vector_mask
    params.get_mask_list(64)
    params.get_block_offset_one_core()
    # loop_and_mask_list[0] is used below as the number of full segments and
    # loop_and_mask_list[1] == 1 as the "tail segment exists" flag
    loop_and_mask_list = \
        kernel_api.get_loopnum_and_masklist(params.data_shape[0],
                                            SEGMENT_SIZE_COPY_GM_TO_UB)

    def _run_fuc(data_len, data_offset, copy_ub):
        """Process one segment of data: copy in, convert, then histogram."""
        # copy data(len=data_len,offset=data_offset) from out to ub
        kernel_api.kernel_cp_fuc(params.ir_builder,
                                 [[copy_ub, 0], [data, data_offset]],
                                 [data_len, params.input_align_len],
                                 "copy_gm_to_ubuf")
        # convert the copied segment into src_mid_input_ub
        _fuction_data_conv_ir(
            [[params.src_mid_input_ub, 0], [copy_ub, 0]],
            [data_len,
             max(params.input_align_len, params.mid_vec_align_len)], params)
        # fixed process:nbins*(values - value_range[0])/(value_range[1]
        # - value_range[0])
        _data_info_list = [data_len, params.mid_vec_align_len]

        # calc histogram in one SEGMENT_SIZE_COPY_GM_TO_UB and sum to output UB
        _function_histogram_process_ir(params.src_mid_input_ub, 0,
                                       _data_info_list, params)

    # data process SEGMENT_SIZE_COPY_GM_TO_UB by SEGMENT_SIZE_COPY_GM_TO_UB
    with params.ir_builder.for_range(0, loop_and_mask_list[0],
                                     name='m') as pre_index:
        _run_fuc(SEGMENT_SIZE_COPY_GM_TO_UB,
                 pre_index * SEGMENT_SIZE_COPY_GM_TO_UB, params.src_ub)
    # tail_data process; len = data_len % SEGMENT_SIZE_COPY_GM_TO_UB
    if loop_and_mask_list[1] == 1:
        _run_fuc(params.data_shape[0] % SEGMENT_SIZE_COPY_GM_TO_UB,
                 loop_and_mask_list[0] * SEGMENT_SIZE_COPY_GM_TO_UB,
                 params.src_ub)

    # copy result to out by mul cores
    def _copy_out(copy_data_len):
        """Copy copy_data_len results from des_output_ub to dst[0] at out_begin."""
        kernel_api.kernel_cp_fuc(
            params.ir_builder,
            [[dst[0], params.out_begin], [params.des_output_ub, 0]],
            [copy_data_len, params.output_align_len], "copy_ubuf_to_gm")

    # only cores whose out_begin lies inside [0, nbins) write anything
    with params.ir_builder.if_scope(
            params.out_begin < tvm.const(params.nbins, "int32")):
        # this core's whole range fits: copy out_num_per_core results
        with params.ir_builder.if_scope(
                params.out_end <= tvm.const(params.nbins, "int32")):
            _copy_out(params.out_num_per_core)
        # range crosses nbins: copy only the remainder, when it is non-zero
        with params.ir_builder.else_scope():
            tail_core_num = params.nbins % params.out_num_per_core
            if tail_core_num != SCALAR_ZERO:
                _copy_out(tail_core_num)
    return params.ir_builder.get()
Ejemplo n.º 20
0
def elewise_binary_phony_ex(stmt_op):
    """
    Emit the DMA sequence of the phony element-wise binary op.

    Only the first input is moved to the output; the second input tensor is
    never read, which eliminates it completely.

    Parameters
    ----------
    stmt_op: stmt
        statement handed to cce_util.get_dma_buffer to obtain the buffers

    Returns
    -------
    stmt
        The statement produced by the ir_builder.
    """
    ins, outs, _, _ = cce_util.get_dma_buffer(stmt_op)
    ir_builder = tvm.ir_builder.create()
    src = ins[0]
    dst = outs[0]
    src_dtype = src.dtype

    # Element count is the product of the source extents; it stays 0 when
    # the shape is empty.
    total_element = 0
    for extent in src.shape:
        if total_element == 0:
            total_element = extent
        else:
            total_element *= extent

    # Elements per block, number of whole blocks, and leftover elements
    block_elems = ALIGNMENT_BYTES // cce_util.get_align_factor(src_dtype)[1]
    full_blocks = int(total_element) // int(block_elems)
    tail_elems = int(total_element % block_elems)

    if full_blocks > 0:
        # Move every whole block with one copy instruction
        ir_builder.emit(tvm.call_extern(
            src_dtype, "copy_ubuf_to_gm",
            dst.access_ptr("rw"),
            src.access_ptr("r"),
            0, 1, full_blocks, 0, 0))

        if tail_elems > 0:
            # Roll back so that the last block ends exactly on the final
            # element; it overlaps data that has already been written.
            rollback = block_elems - tail_elems
            tail_start = full_blocks * block_elems - rollback

            # One-block staging buffer holding the rolled-back source data
            staging_var = ir_builder.allocate(src_dtype, (block_elems,),
                                              name="copy_part",
                                              scope=cce.scope_ubuf)
            staging = tvm.decl_buffer((block_elems,), staging_var.dtype,
                                      name="copy_part",
                                      scope=cce.scope_ubuf,
                                      data=staging_var)

            # Fill the staging buffer element by element with reg_mov
            with ir_builder.for_range(0, block_elems,
                                      name="reg_idx") as reg_idx:
                ir_builder.emit(tvm.call_extern(
                    src_dtype, "reg_mov",
                    staging.access_ptr("rw", offset=reg_idx),
                    src.access_ptr("r", offset=tail_start + reg_idx)))
            ir_builder.emit(tvm.call_extern(
                src_dtype, "copy_ubuf_to_gm",
                dst.access_ptr("rw", offset=tail_start),
                staging.access_ptr("r"),
                0, 1, 1, 0, 0))
    elif full_blocks == 0 and tail_elems > 0:
        # Less than one block in total: a single one-block copy from offset 0
        ir_builder.emit(tvm.call_extern(
            src_dtype, "copy_ubuf_to_gm",
            dst.access_ptr("rw", offset=0),
            src.access_ptr("r", offset=0),
            0, 1, 1, 0, 0))
    return ir_builder.get()
Ejemplo n.º 21
0
    def _core_func(out_begin, out_end, element_num_of_core):
        """
        Emit the copy sequence executed by one core.

        Relies on names from the enclosing scope (total_element, channel,
        channel0, channel1, params, input_gm, output_gm, input_data_len,
        output_data_len and the _data_copy helper).

        :param out_begin : multi core output begin address
        :param out_end : multi core output end address
        :param element_num_of_core : element num of one core
        """
        # A core handles element_num_of_core elements, except the one whose
        # range ends at total_element (and does not cover everything): it
        # handles the remainder only.
        if out_end != total_element or element_num_of_core == total_element:
            core_cal_num = element_num_of_core
        else:
            core_cal_num = total_element % element_num_of_core

        # nothing to emit for this core
        if core_cal_num == 0:
            return

        # initialise the output UB, then bring the input from GM
        _do_vector_dup((params.output_ub, 0), output_data_len, params.dtype,
                       params)
        _do_cp_input_gm(input_gm, input_data_len, 0, params)

        # number of elements needed to fill channel up to a multiple of
        # channel0
        c0_pad_len = _ceil_fill(channel, channel0) - channel

        def _do_data_copy(i, block_index):
            """
            Copy the element with global index i.

            :param i : global element index (out_begin + local index)
            :param block_index : block_index
            """
            i_hw = i // channel1
            c1_index = i % channel1

            if channel % channel0 == 0:
                # no padding in the channel dimension: always a full channel0
                _data_copy(c1_index, block_index, i_hw, channel0, 0)
            else:
                # padding accumulated since out_begin, passed negated to
                # _data_copy
                offset = ((i // channel1) -
                          (out_begin // channel1)) * c0_pad_len
                with params.ib_.if_scope(c1_index != channel1 - 1):
                    _data_copy(c1_index, block_index, i_hw, channel0, -offset)
                # last c1 block carries only channel % channel0 elements
                with params.ib_.else_scope():
                    _data_copy(c1_index, block_index, i_hw, channel % channel0,
                               -offset)

        # copy every element owned by this core
        with params.ib_.for_range(0, core_cal_num, for_type="serial",
                                  name="j") as j:
            i = out_begin + j
            _do_data_copy(i, j)

        core_out_len = core_cal_num * channel0

        if channel % channel0 != 0:
            # out_begin_offset is kept in a register-scope buffer
            out_begin_offset = params.ib_.allocate("int32", (1, ),
                                                   name="out_begin_offset",
                                                   scope=cce_params.scope_reg)
            out_begin_offset[0] = (out_begin // channel1) * c0_pad_len
            # remove the padding elements lying between out_begin and out_end
            core_out_len -= ((out_end // channel1) * c0_pad_len -
                             out_begin_offset[0])
            # elements missing to reach a multiple of cp_align_len
            pad_len = _ceil_fill(core_out_len,
                                 params.cp_align_len) - core_out_len
            # keep copying the elements that follow this core's range, at
            # most cp_align_len of them
            with params.ib_.for_range(0,
                                      params.cp_align_len,
                                      for_type="serial",
                                      name="j") as j:
                i = out_begin + j + core_cal_num
                _do_data_copy(i, j + core_cal_num)

                # length (without padding) covered by the extra elements
                # copied so far
                real_pad_len = ((i + 1) * channel0 - (
                    (i + 1) // channel1) * c0_pad_len) - (
                        (out_begin + core_cal_num) * channel0 -
                        ((out_begin + core_cal_num) // channel1) * c0_pad_len)
                # emit a 'break' extern call once enough has been copied
                with params.ib_.if_scope(real_pad_len >= pad_len):
                    params.ib_.emit(tvm.call_extern(params.dtype, 'break'))

        # number of cp_align_len-sized units written to GM
        num_cp = _ceil_div(core_out_len, params.cp_align_len)
        # GM offset of this core's output, with the padding removed
        output_offset = out_begin * channel0 - (out_begin //
                                                channel1) * c0_pad_len
        params.ib_.emit(
            tvm.call_extern(params.dtype, 'copy_ubuf_to_gm',
                            output_gm.access_ptr("rw", offset=output_offset),
                            params.output_ub.access_ptr("r", offset=0), 0, 1,
                            num_cp, 0, 0))
Ejemplo n.º 22
0
def bn_reduce_sum(stmt_op):
    """
    Collapse second input tensor to one repeat
    and use vcadd to calculate sum to output

    The second input buffer (ins[1]) is halved in place with repeated
    ``vadd`` instructions, an optional extra masked ``vadd`` handles a
    reduce extent that is not a power-of-two multiple of one repeat, and a
    final ``vcadd`` writes the sums to outs[0].

    Parameters
    ----------
    stmt_op: stmt
        statement whose For loops give the sizes and whose buffers are used

    Returns
    -------
    stmt
        The statement produced by the ir_builder.

    Raises
    ------
    RuntimeError
        when the reduce extent is not a whole number of repeats
    """
    # Get input and output buffers
    # one-element list so that the visitor below can update it
    input_size_list = [1]
    for_extents = []
    ir_builder = tvm.ir_builder.create()
    # return value unused here
    cce_util.get_init_op(stmt_op)

    def _post_order_for(_stmt):
        # multiply up and record the extent of every For node visited
        if isinstance(_stmt, tvm.stmt.For):
            input_size_list[0] = input_size_list[0] * _stmt.extent.value
            for_extents.append(_stmt.extent.value)

    # Used only for its visiting side effect; the transformed stmt is dropped.
    # NOTE(review): the callback is passed as the post-order visitor, so
    # for_extents[0] is presumably the innermost loop extent - confirm
    tvm.ir_pass.IRTransform(stmt_op, None, _post_order_for, ["For"])
    ins, outs = \
        cce_util.get_buffer(stmt_op, need_unique=True, need_origin_adress=True)
    in_buffer = ins[1]
    out_buffer = outs[0]
    input_size = input_size_list[0]

    # Check if input can be collapsed into one repeat
    # elements handled by one repeat of a vector instruction for this dtype
    vector_inst_one_repeat_size = \
        cce_params.VECTOR_INST_BLOCK_WIDTH // \
        cce_util.get_align_factor(in_buffer.dtype)[1]

    # get reduce_axis shape
    # with one loop there is a single row; otherwise the second recorded
    # extent is the number of rows (repeats of the final vcadd)
    if len(for_extents) == 1:
        input_reduce_axis_shape = for_extents[0]
        ub_loop_num = 1
    else:
        input_reduce_axis_shape = for_extents[0]
        ub_loop_num = for_extents[1]

    # number of halving steps needed to reach one repeat (a float)
    # NOTE(review): math.log(x, 2) is not guaranteed to be exact for powers
    # of two, which the is_integer() test below relies on; math.log2 is -
    # consider switching after confirming the supported Python version
    collapse_loop_num = \
        math.log(input_reduce_axis_shape / vector_inst_one_repeat_size, 2)

    # judge reduce_shape is remaining or not after dichotomy add
    remain_flag = False
    collapse_repeat = 0
    if not collapse_loop_num.is_integer():
        # largest power of two not above the repeat count
        collapse_repeat = int(math.pow(2, int(collapse_loop_num)))
        out_of_collapse_repeat = \
            input_reduce_axis_shape / vector_inst_one_repeat_size - \
            collapse_repeat
        # the part left over must itself be a whole number of repeats
        if not out_of_collapse_repeat.is_integer():
            raise RuntimeError("Input size is not aligned:",
                               input_reduce_axis_shape)
        remain_flag = True

    # Do Emit Insn
    def collapse(ir_b, buffer, current_size):
        """Function to do emit insn

        Emits one halving step on ``buffer`` and returns current_size // 2.
        """
        # number of full-mask repeats of this step; '/' must be true
        # division because is_integer() is called on the result
        repeat = current_size // 2 / vector_inst_one_repeat_size
        tail_flag = False
        if not repeat.is_integer():
            tail_flag = True
        repeat = int(repeat)

        # main vadd over the full repeats: sources at offsets 0 and 8
        ir_b.emit(
            tvm.call_extern(buffer.dtype, "vadd",
                            buffer.access_ptr("rw", offset=0),
                            buffer.access_ptr("r", offset=0),
                            buffer.access_ptr("r", offset=8), repeat, 1, 2, 2,
                            8, 16, 16))

        # solve tail vadd
        if tail_flag:
            # elements of the half-size result not covered by full repeats
            tail_mask = \
                (current_size - repeat * 2 * vector_inst_one_repeat_size) // 2
            # NOTE(review): the mask is set on the enclosing ir_builder and
            # in_buffer rather than on ir_b / buffer; identical for the only
            # call site below
            te.platform.cce_intrin_md.reset_mask_insn(ir_builder,
                                                      in_buffer.dtype,
                                                      tail_mask)
            ir_b.emit(
                tvm.call_extern(
                    buffer.dtype, "vadd",
                    buffer.access_ptr("rw",
                                      offset=repeat *
                                      vector_inst_one_repeat_size),
                    buffer.access_ptr("r",
                                      offset=repeat * 2 *
                                      vector_inst_one_repeat_size),
                    buffer.access_ptr(
                        "r",
                        offset=repeat * 2 * vector_inst_one_repeat_size + 8),
                    1, 1, 2, 2, 0, 0, 0))
            # reset the mask (no mask argument given) after the tail
            te.platform.cce_intrin_md.reset_mask_insn(ir_builder,
                                                      in_buffer.dtype)
        return current_size // 2

    # emit vadd
    # one collapse() call per halving step; the loop variable is unused
    cur_size = input_size
    for loop in range(int(collapse_loop_num)):
        cur_size = collapse(ir_builder, in_buffer, cur_size)

    if remain_flag:
        # solve remain repeat
        # elements per row beyond one repeat after the halving steps
        mask_bits = \
            input_reduce_axis_shape / collapse_repeat - \
            vector_inst_one_repeat_size
        # repeat stride used for both the vadd and the vcadd below
        add_repeat_stride = int(8 + mask_bits / 8)
        te.platform.cce_intrin_md.reset_mask_insn(ir_builder, in_buffer.dtype,
                                                  mask_bits)
        # fold the part beyond one repeat onto the first repeat of each row
        ir_builder.emit(
            tvm.call_extern(
                in_buffer.dtype, "vadd", in_buffer.access_ptr("rw", offset=0),
                in_buffer.access_ptr("r", offset=0),
                in_buffer.access_ptr("r", offset=vector_inst_one_repeat_size),
                ub_loop_num, 1, 1, 1, add_repeat_stride, add_repeat_stride,
                add_repeat_stride))

        # emit vcadd for remain
        te.platform.cce_intrin_md.reset_mask_insn(ir_builder, in_buffer.dtype)
        ir_builder.emit(
            tvm.call_extern(in_buffer.dtype, "vcadd",
                            out_buffer.access_ptr("rw", offset=0),
                            in_buffer.access_ptr("r", offset=0), ub_loop_num,
                            1, 1, add_repeat_stride))
    else:
        # emit vcadd for no remain
        ir_builder.emit(
            tvm.call_extern(in_buffer.dtype, "vcadd",
                            out_buffer.access_ptr("rw", offset=0),
                            in_buffer.access_ptr("r", offset=0), ub_loop_num,
                            1, 1, 8))

    return ir_builder.get()
Ejemplo n.º 23
0
def binary_reduce_output(stmt_op):
    """
    Move the two reduce results to their two destinations.

    The second input is written to outs[0] and the first input to the buffer
    registered under "binary_reduce_output_buffer". Data that does not fill
    a whole block is staged through a one-block buffer and written so that
    the last block ends on the final element.

    Parameters
    ----------
    stmt_op: stmt
        statement whose For extents give the size and whose buffers are used

    Returns
    -------
    stmt
        The statement produced by the ir_builder.
    """
    ir_builder = tvm.ir_builder.create()
    # one-element list so that the visitor can accumulate into it
    extent_product = [1]

    def _accumulate_extent(node):
        """Multiply the extent of every For node into extent_product."""
        if isinstance(node, tvm.stmt.For):
            extent_product[0] *= node.extent.value

    def _alloc_block(dtype, name):
        """Allocate a one-block buffer in the unified buffer scope."""
        var = ir_builder.allocate(dtype, (block_unit, ), name=name,
                                  scope=cce_params.scope_ubuf)
        return tvm.decl_buffer((block_unit, ),
                               var.dtype,
                               name=name,
                               scope=cce_params.scope_ubuf,
                               data=var)

    # Run only for the visitor's side effect
    tvm.ir_pass.IRTransform(stmt_op, None, _accumulate_extent, ["For"])
    ins, outs = cce_util.get_buffer(stmt_op)

    # Second destination for the binary collection
    dst_second = cce_emitinsn_params.cceEmitParamsIns.get_param(
        "binary_reduce_output_buffer")
    dst_first = outs[0]
    # Sources are crossed: ins[1] feeds the first destination and ins[0]
    # the second one
    src_for_first = ins[1]
    src_for_second = ins[0]

    output_size = extent_product[0]
    block_unit = cce_util.get_align_factor(ins[0].dtype)[0]
    stage_first = _alloc_block(dst_first.dtype, "copy_part_0")
    stage_second = _alloc_block(dst_second.dtype, "copy_part_1")

    # At least one block is always moved by the main copy
    burst_len = max(output_size // block_unit, 1)
    full_span = burst_len * block_unit
    remains = max(output_size - full_span, 0)
    remains_fill = block_unit - remains

    # Main part
    global_offset = dst_first.elem_offset
    ir_builder.emit(
        tvm.call_extern(dst_first.dtype, "copy_ubuf_to_gm",
                        dst_first.access_ptr("rw"),
                        src_for_first.access_ptr("r"),
                        0, 1, burst_len, 0, 0))
    ir_builder.emit(
        tvm.call_extern(dst_second.dtype, "copy_ubuf_to_gm",
                        dst_second.access_ptr("rw", offset=global_offset),
                        src_for_second.access_ptr("r"),
                        0, 1, burst_len, 0, 0))

    # Remain part
    if remains <= 0:
        return ir_builder.get()

    # Stage the last block of both sources element by element
    with ir_builder.for_range(0, block_unit,
                              name="copy_part_fill_loop") as fill_idx:
        src_pos = full_span - remains_fill + fill_idx
        ir_builder.emit(
            tvm.call_extern(stage_first.dtype, "reg_mov",
                            stage_first.access_ptr("rw", offset=fill_idx),
                            src_for_first.access_ptr("r", offset=src_pos)))
        ir_builder.emit(
            tvm.call_extern(stage_second.dtype, "reg_mov",
                            stage_second.access_ptr("rw", offset=fill_idx),
                            src_for_second.access_ptr("r", offset=src_pos)))

    # Write both staged blocks, rolled back by remains_fill elements
    ir_builder.emit(
        tvm.call_extern(
            dst_first.dtype, "copy_ubuf_to_gm",
            dst_first.access_ptr("rw", offset=full_span - remains_fill),
            stage_first.access_ptr("r"), 0, 1, 1, 0, 0))
    ir_builder.emit(
        tvm.call_extern(
            dst_second.dtype, "copy_ubuf_to_gm",
            dst_second.access_ptr(
                "rw", offset=global_offset + full_span - remains_fill),
            stage_second.access_ptr("r"), 0, 1, 1, 0, 0))
    return ir_builder.get()
Ejemplo n.º 24
0
def custom_expm1(shape,
                 dtype,
                 kernel_name="cce_tf_expm1",
                 need_build=False,
                 need_print=False):
    """
    Build the expm1 operator, y = (e ** x) - 1.

    The exponential is delegated to the "DeviceExp" device api and the
    subtraction is expressed as a tvm compute on top of it.

    Parameters
    ----------
    shape : shape of data.

    dtype : the data type, compared case-insensitively; only float16 and
        float32 are supported. Input and output share this dtype.

    kernel_name : cce kernel name, default value is "cce_tf_expm1".

    need_build : if need to build CCEC kernel, default value is False.

    need_print : if need to print the ir, default value is False.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        when dtype is not one of the supported types
    """
    # [aicpu] int32_t cc_device_exp(uint32_t blockNum, uint32_t blockIdx,
    # int32_t dataType, const void *scale, const void *shift,
    # const void *base, int32_t dimCnt, int32_t *shape, uint32_t padC0,
    # const void *x, void *y);

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    supported_dtypes = ["float16", "float32"]
    inp_dtype = dtype.lower()
    if inp_dtype not in supported_dtypes:
        raise RuntimeError("tf_expm1_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    # step 1. calculate y = exp ** x by aicpu api
    api_dtype = util.get_device_api_dtype(inp_dtype)
    dim_count = len(shape)
    scale_ptr = util.create_param_ptr([1], inp_dtype, "p_scale")
    shift_ptr = util.create_param_ptr([0], inp_dtype, "p_shift")
    base_ptr = util.create_param_ptr([-1], inp_dtype, "p_base")
    shape_ptr = util.create_param_ptr(shape, "int32", "p_shape")

    def _device_exp(ins, outs):
        """Call the device api; argument order follows the prototype above."""
        return tvm.call_extern(
            "int32_t",
            "DeviceExp",
            "block_num",
            "block_idx",
            api_dtype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            dim_count,
            ins[4].access_ptr("r"),  # shape
            0,  # padC0
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w"))

    output_exp = tvm.extern(
        shape,
        [data_input, scale_ptr, shift_ptr, base_ptr, shape_ptr],
        _device_exp,
        name="output_exp",
        dtype=inp_dtype)

    # step 2. calculate y = exp ** x - 1 by tvm
    minus_one = tvm.const((-1), dtype=inp_dtype)

    def _exp_minus_one(*indice):
        """Add the -1 constant to the exp result at the given indices."""
        return output_exp(*indice) + minus_one.astype(inp_dtype)

    output = tvm.compute(shape, _exp_minus_one, name="output")

    # step 3. schedule the computation by tvm
    sch = tvm.create_schedule(output.op)

    # step 4. print and/or build by tvm, both under build_config
    if need_print:
        with build_config:
            print(tvm.lower(sch, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, [data_input, output], "cce", name=kernel_name)
Ejemplo n.º 25
0
def _res_to_gm_split_row(args):
    """
    function of moving data from data_res(UB) to dst(GM) for split row scene

    Three cases are emitted, selected at run time on total_len:
    * total_len is a multiple of cp_align_len: one plain copy;
    * total_len is not a multiple and is larger than cp_align_len: copy the
      leading part, then copy the last cp_align_len elements through
      data_tail;
    * total_len is not a multiple and is not larger than cp_align_len: fill
      data_res up to cp_align_len elements with values fetched from ``data``
      and copy a single unit.

    Parameters
    ----------
    args: tuple
        tvm_ib, param, data, dst, data_res, data_tail, reg, reg_addr,
        index, res_offset, dst_offset, total_len, h_w, row_ele, num_ele_unit,
        c_0, c_i, h_i, n_i, n_ni

    Returns
    -------
    None
    """
    tvm_ib, param, data, dst, data_res, data_tail, reg, reg_addr,\
    index, res_offset, dst_offset, total_len, h_w, row_ele, num_ele_unit,\
    c_0, c_i, h_i, n_i, n_ni = args

    # total_len is not a multiple of cp_align_len
    with tvm_ib.if_scope(total_len % param.get("cp_align_len") > 0):
        with tvm_ib.if_scope(total_len > param.get("cp_align_len")):
            # leading part: everything except the last cp_align_len elements
            total_len_align = total_len - param.get("cp_align_len")
            # stored in reg_addr[index] for the tail copy offset below
            reg_addr[index] = total_len_align
            burst_len = _ceil_div(total_len_align, param.get("cp_align_len"))
            tvm_ib.emit(tvm.call_extern(dst.dtype, "copy_ubuf_to_gm",
                                        dst.access_ptr('w', offset=dst_offset),
                                        data_res.access_ptr("r",
                                                            offset=res_offset),
                                        0, 1, burst_len, 0, 0))
            # move the last cp_align_len elements of data_res into data_tail,
            # one element at a time through reg[0]
            with tvm_ib.for_range(0, param.get("cp_align_len"), name="num_a")\
                    as num_a:
                tvm_ib.emit(tvm.call_extern(
                    data_res.dtype, "reg_mov",
                    tvm.call_extern(reg.dtype, "reg", reg[0]),
                    data_res.access_ptr('r',
                                        offset=res_offset + total_len_align
                                        + num_a)
                ))
                tvm_ib.emit(tvm.call_extern(
                    data_tail.dtype, "reg_mov",
                    data_tail.access_ptr('w', offset=num_a),
                    tvm.call_extern(reg.dtype, "reg", reg[0])
                ))
            # write data_tail so that it ends exactly at total_len
            tvm_ib.emit(
                tvm.call_extern(dst.dtype, "copy_ubuf_to_gm",
                                dst.access_ptr('w', offset=dst_offset
                                               + reg_addr[index]),
                                data_tail.access_ptr("r", offset=0),
                                0, 1, 1, 0, 0))
        with tvm_ib.else_scope():
            # fewer than cp_align_len elements: num_ele elements are missing
            num_ele = param.get("cp_align_len") - total_len
            with tvm_ib.for_range(0, num_ele, name="num_e") as num_e:
                # destination position just past the valid data
                reg_addr[index] = total_len + num_e
                dst_pos = dst_offset + reg_addr[index]
                args = dst_pos, h_w, row_ele, num_ele_unit,\
                       c_0, c_i, h_i, n_i, n_ni
                # map the destination position back to a position in data
                data_pos = _dst_to_data_pos(args)
                # fetch from data (GM) into data_tail, starting at data_pos
                tvm_ib.emit(tvm.call_extern(data_tail.dtype, "copy_gm_to_ubuf",
                                            data_tail.access_ptr("w", offset=0),
                                            data.access_ptr('r',
                                                            offset=data_pos),
                                            0, 1, 1, 0, 0))
                # append its first element to data_res through reg[0]
                tvm_ib.emit(tvm.call_extern(
                    data_tail.dtype, "reg_mov",
                    tvm.call_extern(reg.dtype, "reg", reg[0]),
                    data_tail.access_ptr('r', offset=0)
                ))
                tvm_ib.emit(tvm.call_extern(
                    data_res.dtype, "reg_mov",
                    data_res.access_ptr('w', offset=total_len + num_e),
                    tvm.call_extern(reg.dtype, "reg", reg[0])
                ))
            # NOTE(review): this branch reads and writes data_res from
            # offset 0 and ignores res_offset, unlike the other branches -
            # confirm callers always pass res_offset == 0 here
            tvm_ib.emit(tvm.call_extern(dst.dtype, "copy_ubuf_to_gm",
                                        dst.access_ptr('w', offset=dst_offset),
                                        data_res.access_ptr("r",
                                                            offset=0),
                                        0, 1, 1, 0, 0))
    # total_len is a multiple of cp_align_len: one plain copy
    with tvm_ib.else_scope():
        burst_len = total_len // param.get("cp_align_len")
        tvm_ib.emit(tvm.call_extern(dst.dtype, "copy_ubuf_to_gm",
                                    dst.access_ptr('w', offset=dst_offset),
                                    data_res.access_ptr("r", offset=res_offset),
                                    0, 1, burst_len, 0, 0))
Ejemplo n.º 26
0
def custom_pow(shape,
               shape_y,
               dtype,
               kernel_name="cce_tf_pow",
               need_build=False,
               need_print=False):
    """
    Build the pow operator, x^y, through the "cc_device_pow" device api.

    The original documentation states that the output is a meaningless
    value when x < 0.

    Parameters
    ----------
    shape : shape of data; used for both operands

    shape_y : accepted for interface compatibility; not read by this function

    dtype : the data type, compared case-insensitively; only float16,
        float32 and int32 are supported. Inputs and output share this dtype.

    kernel_name : cce kernel name, default value is "cce_tf_pow"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        when dtype is not one of the supported types
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    supported_dtypes = ["float16", "float32", "int32"]
    inp_dtype = dtype.lower()
    if inp_dtype not in supported_dtypes:
        raise RuntimeError("tf_pow_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    shape = util.shape_refine(shape)
    # NOTE(review): the right operand is declared with shape, not shape_y -
    # confirm this is intended
    data_lhs = tvm.placeholder(shape, name="data_lhs", dtype=inp_dtype)
    data_rhs = tvm.placeholder(shape, name="data_rhs", dtype=inp_dtype)

    api_dtype = util.get_device_api_dtype(inp_dtype)
    dim_count = len(shape)
    scale_ptr = util.create_param_ptr([0], inp_dtype, "p_scale")
    shift_ptr = util.create_param_ptr([0], inp_dtype, "p_shift")
    power_ptr = util.create_param_ptr([0], inp_dtype, "p_power")
    shape_ptr = util.create_param_ptr(shape, "int32", "p_shape")

    def _device_pow(ins, outs):
        """Call the device api with both operands and the output pointer."""
        return tvm.call_extern(
            "int32_t",
            "cc_device_pow",
            "block_num",
            "block_idx",
            api_dtype,
            ins[2].access_ptr("r"),  # scale
            ins[3].access_ptr("r"),  # shift
            ins[4].access_ptr("r"),  # power
            dim_count,
            ins[5].access_ptr("r"),  # shape
            0,  # pad_c0
            ins[0].access_ptr("r"),  # input x
            # NOTE(review): the dimension count is passed twice for the
            # second operand, as in the original - confirm against the
            # cc_device_pow prototype
            dim_count,
            dim_count,
            ins[5].access_ptr("r"),  # shape
            0,  # pad_c0
            ins[1].access_ptr("r"),  # input y
            outs[0].access_ptr("w"))

    output = tvm.extern(
        shape,
        [data_lhs, data_rhs, scale_ptr, shift_ptr, power_ptr, shape_ptr],
        _device_pow,
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    # print and/or build, both under build_config
    if need_print:
        with build_config:
            lowered = tvm.lower(schedule, [data_lhs, data_rhs, output],
                                simple_mode=True)
            print(lowered)
    if need_build:
        with build_config:
            tvm.build(schedule, [data_lhs, data_rhs, output],
                      "cce",
                      name=kernel_name)
Ejemplo n.º 27
0
 def _do_calcu_f32_mini(cal_offset, nbins_index):
     """
     Emit the IR for one `nbins_index` step: put a value into
     params.vcadd_ub, then add it into params.src_output_ub and
     params.src_output_ub_p1.

     `params` is not an argument; it is taken from the surrounding scope.

     Parameters
     ----------
     cal_offset : forwarded unchanged to _do_cmp_calcu

     nbins_index : index that is compared with SCALAR_ZERO / params.nbins
         to choose the shortcut branch, and that (made relative to the
         current core) selects the output offset and the vector mask

     Returns
     -------
     None
     """
     # Shortcut branch: taken when nbins_index is SCALAR_ZERO or params.nbins.
     # No call to _do_cmp_calcu is made here; vcadd_ub is filled directly.
     with params.ir_builder.if_scope(
             tvm.any((nbins_index == SCALAR_ZERO),
                     (nbins_index == params.nbins))):
         with params.ir_builder.if_scope(nbins_index == 0):
             # index 0: vector-dup the constant 0 into vcadd_ub
             kernel_api.kernel_vector_dup_fuc(
                 params.ir_builder, [params.vcadd_ub, 0], 0,
                 [params.mid_vec_align_len, params.mid_vec_align_len])
             # Python-level (build-time) check: the extra "vadds" on index_ub
             # with reg[3] is only emitted when compiling for Ascend310.
             if params.compile_plat in ("Ascend310", ):
                 kernel_api.kernel_scalar_to_one_fuc(
                     params.ir_builder,
                     [[params.index_ub, 0], [params.index_ub, 0]],
                     [1, params.mid_vec_align_len],
                     ["vadds", params.reg[3]])
         with params.ir_builder.else_scope():
             # nbins_index == params.nbins: vector-dup histogram_data_len
             kernel_api.kernel_vector_dup_fuc(
                 params.ir_builder, [params.vcadd_ub, 0],
                 params.histogram_data_len,
                 [params.mid_vec_align_len, params.mid_vec_align_len])
     # General branch: compare, then fold vcadd_ub down to one value.
     with params.ir_builder.else_scope():
         # NOTE(review): mask_paras looks like (number of full vector
         # repeats, 0/1 tail flag, tail mask pair) - confirm against
         # kernel_api.get_loopnum_and_masklist.
         mask_paras = kernel_api.get_loopnum_and_masklist(
             params.histogram_data_len, params.mid_vec_align_len)
         _do_cmp_calcu(mask_paras[0] + mask_paras[1], cal_offset,
                       nbins_index)
         # The following `if`s are plain Python, evaluated while building
         # the IR; they decide which instructions are emitted at all.
         if mask_paras[0] > SCALAR_ONE:
             # vadd with mask_paras[0] - 1 repeats: source 0 starts one
             # vector (mid_vec_align_len) into vcadd_ub, destination and
             # source 1 are the start of vcadd_ub.
             params.ir_builder.emit(
                 tvm.call_extern(
                     params.vcadd_ub.dtype, "vadd",
                     params.vcadd_ub.access_ptr("rw", offset=0),
                     params.vcadd_ub.access_ptr(
                         "r", offset=params.mid_vec_align_len),
                     params.vcadd_ub.access_ptr("r", offset=0),
                     mask_paras[0] - 1, 1, 1, 1, 0, 8, 0))
         if mask_paras[0] > SCALAR_ZERO:
             if mask_paras[1] == SCALAR_ONE:
                 # Add the data located after the mask_paras[0] full vectors
                 # under the mask from mask_paras[2], then restore the
                 # all-ones mask.
                 params.ir_builder.emit(
                     tvm.call_extern("uint64", 'set_vector_mask',
                                     mask_paras[2][1], mask_paras[2][0]))
                 add_offset = mask_paras[0] * params.mid_vec_align_len
                 params.ir_builder.emit(
                     tvm.call_extern(
                         params.vcadd_ub.dtype, "vadd",
                         params.vcadd_ub.access_ptr("rw", offset=0),
                         params.vcadd_ub.access_ptr("r", offset=0),
                         params.vcadd_ub.access_ptr("r", offset=add_offset),
                         1, 1, 1, 1, 8, 8, 8))
                 params.ir_builder.emit(
                     tvm.call_extern("uint64", 'set_vector_mask',
                                     params.uint64_all_one,
                                     params.uint64_all_one))
         # With no full vector (mask_paras[0] == SCALAR_ZERO) the vcadd below
         # runs under the mask from mask_paras[2]; the mask is restored
         # right after it.
         if mask_paras[0] == SCALAR_ZERO:
             params.ir_builder.emit(
                 tvm.call_extern("uint64", 'set_vector_mask',
                                 mask_paras[2][1], mask_paras[2][0]))
         # vcadd with vcadd_ub as both destination and source
         # (presumably sums the lanes into element 0 - TODO confirm).
         params.ir_builder.emit(
             tvm.call_extern(params.vcadd_ub.dtype, "vcadd",
                             params.vcadd_ub.access_ptr("rw", offset=0),
                             params.vcadd_ub.access_ptr("r", offset=0), 1,
                             1, 1, 8))
         if mask_paras[0] == SCALAR_ZERO:
             params.ir_builder.emit(
                 tvm.call_extern("uint64", 'set_vector_mask',
                                 params.uint64_all_one,
                                 params.uint64_all_one))
         # Workaround kept from the original author: a pipe_barrier is
         # emitted before the scalar load because the address is not
         # 32-byte aligned.
         params.ir_builder.emit(
             tvm.call_extern('int32', 'pipe_barrier', params.args_str))
         # Load element 0 of vcadd_ub into reg[4] and vector-dup that value
         # back over vcadd_ub.
         params.reg[4] = params.vcadd_ub.vload(0, params.mid_dtype)
         kernel_api.kernel_vector_dup_fuc(
             params.ir_builder, [params.vcadd_ub, 0], params.reg[4],
             [params.mid_vec_align_len, params.mid_vec_align_len])
     # Add vcadd_ub into src_output_ub.
     # nbins_index_core is the index relative to this core's first output;
     # params.offset rounds it down to a multiple of mid_vec_align_len.
     nbins_index_core = nbins_index - params.block.var * \
                        params.out_num_per_core
     params.offset = \
         (nbins_index_core // params.mid_vec_align_len) * \
         params.mid_vec_align_len
     # NOTE(review): set_mask_list[k] presumably enables only lane k, so
     # only the element for this index is updated - confirm.
     params.ir_builder.emit(
         tvm.call_extern("uint64", 'set_vector_mask', 0,
                         params.set_mask_list[nbins_index_core % 64]))
     params.ir_builder.emit(
         tvm.call_extern(
             params.src_output_ub.dtype, "vadd",
             params.src_output_ub.access_ptr("rw", offset=params.offset),
             params.vcadd_ub.access_ptr("r", offset=0),
             params.src_output_ub.access_ptr("r", offset=params.offset), 1,
             1, 1, 1, 8, 8, 8))
     # Add vcadd_ub into src_output_ub_p1 at position nbins_index_core - 1;
     # skipped when nbins_index_core is SCALAR_ZERO.
     with params.ir_builder.if_scope(nbins_index_core != SCALAR_ZERO):
         # Mask entry for (nbins_index_core - 1) % 64: entry 63 when the
         # index is a multiple of 64, otherwise entry (index % 64) - 1.
         with params.ir_builder.if_scope(nbins_index_core %
                                         64 == SCALAR_ZERO):
             params.ir_builder.emit(
                 tvm.call_extern("uint64", 'set_vector_mask', 0,
                                 params.set_mask_list[63]))
         with params.ir_builder.else_scope():
             params.ir_builder.emit(
                 tvm.call_extern(
                     "uint64", 'set_vector_mask', 0,
                     params.set_mask_list[nbins_index_core % 64 - 1]))
         params.offset = \
             ((nbins_index_core - 1) // params.mid_vec_align_len) * \
             params.mid_vec_align_len
         params.ir_builder.emit(
             tvm.call_extern(
                 params.src_output_ub_p1.dtype, "vadd",
                 params.src_output_ub_p1.access_ptr("rw",
                                                    offset=params.offset),
                 params.vcadd_ub.access_ptr("r", offset=0),
                 params.src_output_ub_p1.access_ptr("r",
                                                    offset=params.offset),
                 1, 1, 1, 1, 8, 8, 8))
     # Leave the vector mask fully enabled for whatever is emitted next.
     params.ir_builder.emit(
         tvm.call_extern("uint64", 'set_vector_mask', params.uint64_all_one,
                         params.uint64_all_one))
Ejemplo n.º 28
0
def _func_gm_to_ub(args):
    """
    Emit the copy_gm_to_ubuf call(s) that move data from data to data_ub.

    Parameters
    ----------
    args : sequence of ten items, in this order:
        tvm_ib, param, data, data_ub, data_offset, ub_offset, ori_nburst,
        burst_len, src_stride, dst_stride

    Emitted behaviour
    -----------------
    The copy is guarded by ori_nburst > 0, 0 < burst_len <= 65535,
    src_stride >= 0 and 0 <= dst_stride <= 65535.

    * src_stride <= 65535 and ori_nburst <= 4095: one copy.
    * src_stride <= 65535 and ori_nburst > 4095: copies of 4095 bursts in
      a loop, followed by one copy for the remaining bursts (if any).
    * src_stride > 65535: one single-burst copy per burst with both
      strides passed as 0; the strides are folded into the offsets.

    Returns
    -------
    None
    """
    (tvm_ib, param, data, data_ub, data_offset, ub_offset, ori_nburst,
     burst_len, src_stride, dst_stride) = args

    # largest burst count passed to a single copy_gm_to_ubuf call here
    max_nburst = 4095

    def _shift(base, stride, *factors):
        # base + (burst_len + stride) * cp_align_len * factor * factor ...
        span = (burst_len + stride) * param.get("cp_align_len")
        for factor in factors:
            span = span * factor
        return base + span

    def _copy(gm_pos, ub_pos, nburst, gap_src, gap_dst):
        # one copy_gm_to_ubuf: data[gm_pos:] -> data_ub[ub_pos:]
        tvm_ib.emit(
            tvm.call_extern(
                data_ub.dtype,
                "copy_gm_to_ubuf",
                data_ub.access_ptr("w", offset=ub_pos),
                data.access_ptr('r', offset=gm_pos),
                0, nburst, burst_len, gap_src, gap_dst))

    # a multi-item `with` nests the scopes exactly like stacked `with`s
    with tvm_ib.if_scope(ori_nburst > 0), \
            tvm_ib.if_scope(burst_len > 0), \
            tvm_ib.if_scope(burst_len <= 65535), \
            tvm_ib.if_scope(src_stride >= 0), \
            tvm_ib.if_scope(dst_stride >= 0), \
            tvm_ib.if_scope(dst_stride <= 65535):
        with tvm_ib.if_scope(src_stride <= 65535):
            with tvm_ib.if_scope(ori_nburst <= max_nburst):
                # everything fits into a single instruction
                _copy(data_offset, ub_offset, ori_nburst,
                      src_stride, dst_stride)
            with tvm_ib.else_scope():
                full_rounds = ori_nburst // max_nburst
                leftover = ori_nburst % max_nburst
                with tvm_ib.for_range(0, full_rounds,
                                      name="num_cy") as num_cy:
                    gm_pos = _shift(data_offset, src_stride,
                                    max_nburst, num_cy)
                    ub_pos = _shift(ub_offset, dst_stride,
                                    max_nburst, num_cy)
                    _copy(gm_pos, ub_pos, max_nburst,
                          src_stride, dst_stride)
                with tvm_ib.if_scope(leftover > 0):
                    # remaining bursts start right after the full rounds
                    gm_pos = _shift(data_offset, src_stride,
                                    max_nburst, full_rounds)
                    ub_pos = _shift(ub_offset, dst_stride,
                                    max_nburst, full_rounds)
                    _copy(gm_pos, ub_pos, leftover,
                          src_stride, dst_stride)
        with tvm_ib.else_scope():
            # src_stride is above 65535: copy burst by burst
            with tvm_ib.for_range(0, ori_nburst, name="num_nb") as num_nb:
                gm_pos = _shift(data_offset, src_stride, num_nb)
                ub_pos = _shift(ub_offset, dst_stride, num_nb)
                _copy(gm_pos, ub_pos, 1, 0, 0)
Ejemplo n.º 29
0
def custom_Exp(shape,
               dtype,
               gamma,
               alpha,
               beta,
               kernel_name="cce_exp",
               need_build=False,
               need_print=False):
    """
    Build the exp kernel y = gamma ** (alpha * data + beta), i.e.
    exp(log(gamma) * alpha * data) * (gamma ** beta), by calling the
    "DeviceExp" device API.

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32 (compared case-insensitively)

    gamma : base of the power; must be greater than 0, or exactly -1,
        which the device API handles as e

    alpha : scale applied to data inside the exponent

    beta : shift added inside the exponent

    kernel_name : cce kernel name, default value is "cce_exp"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Raises
    ------
    RuntimeError : dtype is not one of the supported types
    ValueError : gamma is neither -1 nor greater than 0

    Returns
    -------
    None

    """
    allowed_dtypes = ["float16", "float32"]
    api_name = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    if inp_dtype not in allowed_dtypes:
        raise RuntimeError(
            "caffe_exp_layer_cce only support %s while dtype is %s" %
            (",".join(allowed_dtypes), dtype))

    # api  cc_device_exp_c handle gamma == -1 as e
    if gamma != -1:
        if gamma <= 0:
            raise ValueError(
                "please ensure gamma is greater than 0, where gamma = %s" %
                str(gamma))

    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    api_dtype = util.get_device_api_dtype(inp_dtype)
    rank = len(shape)
    # scale --> alpha, shift --> beta, base --> gamma
    scale_ptr = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    shift_ptr = util.create_param_ptr([beta], inp_dtype, "p_shift")
    base_ptr = util.create_param_ptr([gamma], inp_dtype, "p_base")
    shape_ptr = util.create_param_ptr(shape, "int32", "p_shape")

    def _device_call(ins, outs):
        # ins follows the order of the input list given to tvm.extern
        x_buf, scale_buf, shift_buf, base_buf, shape_buf = ins
        return tvm.call_extern(
            "int32_t",
            api_name,
            "block_num",
            "block_idx",
            api_dtype,
            scale_buf.access_ptr("r"),
            shift_buf.access_ptr("r"),
            base_buf.access_ptr("r"),
            rank,
            shape_buf.access_ptr("r"),
            0,  # pad_c0
            x_buf.access_ptr("r"),
            outs[0].access_ptr("w"))

    output = tvm.extern(
        shape,
        [data_input, scale_ptr, shift_ptr, base_ptr, shape_ptr],
        _device_call,
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            lowered = tvm.lower(schedule, [data_input, output],
                                simple_mode=True)
            print(lowered)
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input, output], "cce", name=kernel_name)
Ejemplo n.º 30
0
def custom_exp(shape,
               dtype,
               kernel_name="cce_tf_exp",
               need_build=False,
               need_print=False):
    """
    algorithm: exp

    Build the kernel computing y = e ** x by calling the "DeviceExp"
    device API with scale 1, shift 0 and base -1.

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32 (compared case-insensitively)

    kernel_name : cce kernel name, default value is "cce_tf_exp"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Raises
    ------
    RuntimeError : dtype is not one of the supported types

    Returns
    -------
    None

    """
    allowed_dtypes = ["float16", "float32"]
    api_name = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    if inp_dtype not in allowed_dtypes:
        raise RuntimeError("tf_exp_cce only support %s while dtype is %s" %
                           (",".join(allowed_dtypes), dtype))

    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    api_dtype = util.get_device_api_dtype(inp_dtype)
    rank = len(shape)
    # fixed parameters of the plain exponential: scale 1, shift 0, base -1
    scale_ptr = util.create_param_ptr([1], inp_dtype, "p_scale")
    shift_ptr = util.create_param_ptr([0], inp_dtype, "p_shift")
    base_ptr = util.create_param_ptr([-1], inp_dtype, "p_base")
    shape_ptr = util.create_param_ptr(shape, "int32", "p_shape")

    def _device_call(ins, outs):
        # ins follows the order of the input list given to tvm.extern
        x_buf, scale_buf, shift_buf, base_buf, shape_buf = ins
        return tvm.call_extern(
            "int32_t",
            api_name,
            "block_num",
            "block_idx",
            api_dtype,
            scale_buf.access_ptr("r"),
            shift_buf.access_ptr("r"),
            base_buf.access_ptr("r"),
            rank,
            shape_buf.access_ptr("r"),
            0,  # pad_c0
            x_buf.access_ptr("r"),
            outs[0].access_ptr("w"))

    output = tvm.extern(
        shape,
        [data_input, scale_ptr, shift_ptr, base_ptr, shape_ptr],
        _device_call,
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            lowered = tvm.lower(schedule, [data_input, output],
                                simple_mode=True)
            print(lowered)
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input, output], "cce", name=kernel_name)