def _sel_data(ir_builder, src_data, alloc_mem, offset_length):
    """Select data from ``src_data`` using the pre-set compare mask.

    Relies on the CMPMASK register having been loaded by a prior
    ``set_cmpmask`` emit (see callers).  For float16 input a single ``vsel``
    against the zero buffer (alloc_mem[0]) writes the result in place into
    alloc_mem[4].  Otherwise (float32 path) the selection is done in fp16
    (select 1.0/0.0 into alloc_mem[2]), converted to fp32 (alloc_mem[3]) and
    multiplied into the fp32 data buffer alloc_mem[4].

    :param ir_builder: tvm ir_builder used to emit the intrinsics
    :param src_data: source placeholder; only its dtype is inspected here
    :param alloc_mem: list of UB buffers, layout documented at the alloc
        site (ref: list alloc_res)
    :param offset_length: element offset into the data buffer alloc_mem[4]
    """
    if src_data.dtype == 'float16':
        # list alloc_mem has diff data adds, ref: list alloc_res
        ir_builder.emit(
            tvm.call_extern(src_data.dtype, 'vsel',
                            alloc_mem[4].access_ptr('w', offset=offset_length),
                            alloc_mem[4].access_ptr('r', offset=offset_length),
                            alloc_mem[0].access_ptr('r', offset=0),
                            1, 1, 1, 1, 0, 0, 0))
    else:
        # fp32 path: vsel only works on fp16, so select 1.0 vs 0.0 in fp16
        # first, then widen and multiply onto the fp32 data.
        ir_builder.emit(
            tvm.call_extern('float16', 'vsel',
                            alloc_mem[2].access_ptr('w'),
                            alloc_mem[1].access_ptr('r'),
                            alloc_mem[0].access_ptr('r'),
                            1, 1, 1, 1, 0, 0, 0))
        # widen the fp16 0/1 mask to fp32 (repeat=2 covers 128 fp16 lanes)
        ir_builder.emit(
            tvm.call_extern("float32", "vconv_f162f32",
                            alloc_mem[3].access_ptr("rw"),
                            alloc_mem[2].access_ptr("r"), 2, 1, 1, 8, 4))
        # data *= mask, in place at offset_length
        ir_builder.emit(
            tvm.call_extern(
                "float32", "vmul",
                alloc_mem[4].access_ptr("rw", offset=offset_length),
                alloc_mem[4].access_ptr("r", offset=offset_length),
                alloc_mem[3].access_ptr("r"), 2, 1, 1, 1, 8, 8, 8))
def _out_put_one_time(out_begin, block_index, sub_num, tail_core=False):
    """Produce one batch of output for the current core and copy it to GM.

    Zero-fills the output UB buffer, copies ``sub_num`` rows via
    ``_do_data_copy`` and finally emits one ``copy_ubuf_to_gm``.  When the
    channel count is not a multiple of channel0 and this is the tail core,
    the GM output length is shrunk by the accumulated C0 padding and extra
    rows are copied until the alignment padding is covered.

    :param out_begin : multi core output begin address
    :param block_index : block index
    :param sub_num : sub cycle num
    :param tail_core : True when this invocation handles the last core's
        tail (enables the padding-compensation path)
    """
    # padding elements appended per C1 group to reach a multiple of channel0
    c0_pad_len = _ceil_fill(channel, channel0) - channel
    _do_vector_dup((params.output_ub, 0), len_params["output_data_len"],
                   params.dtype, params)
    with params.ib_.for_range(0, sub_num, for_type="serial",
                              name="i") as sub_i:
        i = out_begin + block_index * len_params["one_output_num"] + sub_i
        _do_data_copy(i, sub_i, out_begin, c0_pad_len)
    core_out_len = sub_num * channel0
    if channel % channel0 != 0 and tail_core:
        out_begin_offset = params.ib_.allocate("int32", (1, ),
                                               name="out_begin_offset",
                                               scope=cce_params.scope_reg)
        # padding already consumed before this core's output window
        out_begin_offset[0] = (out_begin // channel1) * c0_pad_len
        core_out_len -= ((
            (out_begin + block_index * len_params["one_output_num"] + sub_num)
            // channel1) * c0_pad_len - out_begin_offset[0])
        pad_len = _ceil_fill(core_out_len, params.cp_align_len) - core_out_len
        # copy up to cp_align_len extra rows so the trailing burst is whole;
        # the emitted 'break' stops as soon as enough padding is covered
        with params.ib_.for_range(0, params.cp_align_len, for_type="serial",
                                  name="j") as sub_i:
            i = out_begin + block_index * len_params[
                "one_output_num"] + sub_i + sub_num
            _do_data_copy(
                i, sub_i + sub_num,
                out_begin + block_index * len_params["one_output_num"],
                c0_pad_len)
            real_pad_len = (
                (i + 1) * channel0 - ((i + 1) // channel1) * c0_pad_len
            ) - ((out_begin + block_index * len_params["one_output_num"] +
                  sub_num) * channel0 -
                 ((out_begin + block_index * len_params["one_output_num"] +
                   sub_num) // channel1) * c0_pad_len)
            with params.ib_.if_scope(real_pad_len >= pad_len):
                params.ib_.emit(tvm.call_extern(params.dtype, 'break'))
    num_cp = _ceil_div(core_out_len, params.cp_align_len)
    # GM offset shrunk by the padding skipped before this output window
    out_gm_offset = (
        out_begin + block_index * len_params["one_output_num"]) * channel0 - (
            (out_begin + block_index * len_params["one_output_num"])
            // channel1) * c0_pad_len
    params.ib_.emit(
        tvm.call_extern(params.dtype, 'copy_ubuf_to_gm',
                        output_gm.access_ptr("rw", offset=out_gm_offset),
                        params.output_ub.access_ptr("r", offset=0), 0,
                        1, num_cp, 0, 0))
def _out_put_one_time(out_begin, block_index, sub_num):
    """Build one batch of diagonal output blocks and copy them to GM.

    For every processed row writes, per C0 lane, one element onto the
    diagonal of a channel0 x channel0 block in the output UB buffer
    (positions off the diagonal keep the vector_dup fill value), then emits
    a single ``copy_ubuf_to_gm`` for the whole batch.

    :param out_begin : multi core output begin address
    :param block_index : block index
    :param sub_num : sub cycle num
    """
    _do_vector_dup((params.output_ub, 0), output_data_len, params.dtype,
                   params)
    with params.ib_.for_range(0, sub_num, for_type="serial",
                              name="sub_i") as sub_i:
        i_tmp = out_begin + block_index * one_output_num + sub_i
        with params.ib_.for_range(0, channel0, for_type="serial",
                                  name="c0_index") as c0_index:
            # diagonal position inside this row's channel0 x channel0 block
            output_offset = sub_i * channel0 * channel0 \
                + c0_index * channel0 + c0_index
            c1_index = i_tmp // (hight * weight)
            i_hw = i_tmp % (hight * weight)
            # only real channels contribute; padded C0 lanes stay at the
            # vector_dup fill value
            with params.ib_.if_scope(
                    channel0 * c1_index + c0_index < channel):
                input_offset = i_hw * channel + channel0 * c1_index + c0_index
                value = params.input_ub.vload(input_offset)
                params.ib_.emit(
                    params.output_ub.vstore(output_offset, value))
    num_cp = _ceil_div(sub_num * channel0 * channel0, params.cp_align_len)
    out_gm_offset = (out_begin +
                     block_index * one_output_num) * channel0 * channel0
    params.ib_.emit(
        tvm.call_extern(params.dtype, 'copy_ubuf_to_gm',
                        output_gm.access_ptr("rw", offset=out_gm_offset),
                        params.output_ub.access_ptr("r", offset=0), 0, 1,
                        num_cp, 0, 0))
def set_pipe_barrier(self, val):
    """Emit a ``pipe_barrier`` intrinsic for the selected pipeline.

    :param val : pipeline name string, one of "PIPE_ALL", "PIPE_MTE3",
        "PIPE_MTE2", "PIPE_MTE1", "PIPE_M", "PIPE_V", "PIPE_S"
    """
    # the pipeline selector must be lowered to a CCE string token first
    pipe_token = tvm.call_pure_intrin("int32", "tvm_cce_string_print", val)
    barrier = tvm.call_extern('int32', 'pipe_barrier', pipe_token)
    self.ib_.emit(barrier)
def _do_cmp_calcu(repeat, cal_offset, nbins_index):
    """Shift the input window by the current bin edge and clamp/scale it.

    Emits ``vadds`` with the bin-edge offset (scalar expression directly on
    Ascend910/610/710, via a UB round-trip on mini because scalar int32 to
    float32 conversion is unsupported there), then clamps the result against
    the range buffer (vmax/vmin) and multiplies it up by 2**38 * 2**44 *
    2**44 = 2**126.

    :param repeat: vector instruction repeat count
    :param cal_offset: read offset into calc_ub_info[0]
    :param nbins_index: index of the current bin
    """
    if params.compile_plat in ("Ascend910", "Ascend610", "Ascend710"):
        params.ir_builder.emit(
            tvm.call_extern(
                params.vcadd_ub.dtype, "vadds",
                params.vcadd_ub.access_ptr("rw", offset=0),
                calc_ub_info[0].access_ptr("r", offset=cal_offset),
                nbins_index * params.reg[3] + params.reg[0],
                repeat, 1, 1, 8, 8))
    else:
        # scalar can not supprt int32 to float32 in mini
        params.ir_builder.emit(
            tvm.call_extern('int32', 'pipe_barrier', params.args_str))
        params.reg[5] = params.index_ub.vload(0, params.mid_dtype)
        # advance index_ub by the bin width for the next call
        kernel_api.kernel_scalar_to_one_fuc(
            params.ir_builder,
            [[params.index_ub, 0], [params.index_ub, 0]],
            [1, params.mid_vec_align_len], ["vadds", params.reg[3]])
        params.ir_builder.emit(
            tvm.call_extern(
                params.vcadd_ub.dtype, "vadds",
                params.vcadd_ub.access_ptr("rw", offset=0),
                calc_ub_info[0].access_ptr("r", offset=cal_offset),
                params.reg[5], repeat, 1, 1, 8, 8))
    # clamp against the two range values held in range0_ub
    params.ir_builder.emit(
        tvm.call_extern(params.vcadd_ub.dtype, "vmax",
                        params.vcadd_ub.access_ptr("rw", offset=0),
                        params.vcadd_ub.access_ptr("r", offset=0),
                        params.range0_ub.access_ptr("r", offset=0),
                        repeat, 1, 1, 1, 8, 8, 0))
    params.ir_builder.emit(
        tvm.call_extern(
            params.vcadd_ub.dtype, "vmin",
            params.vcadd_ub.access_ptr("rw", offset=0),
            params.vcadd_ub.access_ptr("r", offset=0),
            params.range0_ub.access_ptr("r",
                                        offset=params.mid_vec_align_len),
            repeat, 1, 1, 1, 8, 8, 0))
    # scale up in three steps (fp32 cannot hold 2**126 as one constant);
    # NOTE(review): presumably this saturates nonzero values so the result
    # acts as a sign/indicator for the later reduction — confirm intent
    params.ir_builder.emit(
        tvm.call_extern(params.vcadd_ub.dtype, "vmuls",
                        params.vcadd_ub.access_ptr("rw", offset=0),
                        params.vcadd_ub.access_ptr("r", offset=0),
                        tvm.const(2**38, dtype=params.vcadd_ub.dtype),
                        repeat, 1, 1, 8, 8))
    params.ir_builder.emit(
        tvm.call_extern(params.vcadd_ub.dtype, "vmuls",
                        params.vcadd_ub.access_ptr("rw", offset=0),
                        params.vcadd_ub.access_ptr("r", offset=0),
                        tvm.const(2**44, dtype=params.vcadd_ub.dtype),
                        repeat, 1, 1, 8, 8))
    params.ir_builder.emit(
        tvm.call_extern(params.vcadd_ub.dtype, "vmuls",
                        params.vcadd_ub.access_ptr("rw", offset=0),
                        params.vcadd_ub.access_ptr("r", offset=0),
                        tvm.const(2**44, dtype=params.vcadd_ub.dtype),
                        repeat, 1, 1, 8, 8))
def _dump(data_len, cycle_offset):
    """Fill ``data_len`` elements of the closure buffer with ``val``.

    :param data_len : length to dup
    :param cycle_offset : cycle_offset
    """
    # one repeat covers a full vector of this dtype; round the length up
    repeat_cnt = _ceil_div(data_len, _get_vec_align_len(dtype))
    dst = buf.access_ptr("rw", offset=buf_offset + cycle_offset)
    params.ib_.emit(
        tvm.call_extern(dtype, 'vector_dup', dst, tvm.const(val, dtype),
                        repeat_cnt, 1, 1, 8, 8))
def _temp_ir(dst, data):
    """Copy ``data`` to ``dst`` unchanged, staging through one UB buffer.

    If the whole tensor fits in UB it is moved in one in/out pair;
    otherwise it is moved in ``ub_ele``-sized chunks plus a tail chunk.

    :param dst: destination GM tensor (4-D: n, c, h, w)
    :param data: source GM tensor, same element count as dst
    :return: the built TVM statement
    """
    tvm_ib = tvm.ir_builder.create()
    float_size = cce.cce_intrin.get_bit_len(data.dtype) // 8
    # elements per 32-byte burst for this dtype
    cp_align_len = cce_params.BLOCK_REDUCE_INT8 // float_size
    n_i, c_i, h_i, w_i = dst.shape
    ub_bytes = UB_SIZE_B
    ub_ele = ub_bytes // float_size
    shape_ele = n_i * c_i * h_i * w_i
    data_ub = _new_alloc(tvm_ib, dst.dtype, ub_ele, "data_ub",
                         scope=cce.scope_ubuf)
    with tvm_ib.if_scope(shape_ele <= ub_ele):
        # whole tensor fits: single round trip through UB
        burst_len = _ceil_div(shape_ele, cp_align_len)
        tvm_ib.emit(
            tvm.call_extern(data_ub.dtype, "copy_gm_to_ubuf",
                            data_ub.access_ptr("w", offset=0),
                            data.access_ptr('r', offset=0), 0, 1,
                            burst_len, 0, 0))
        tvm_ib.emit(
            tvm.call_extern(dst.dtype, "copy_ubuf_to_gm",
                            dst.access_ptr('w', offset=0),
                            data_ub.access_ptr("r", offset=0), 0, 1,
                            burst_len, 0, 0))
    with tvm_ib.if_scope(shape_ele > ub_ele):
        # chunked path: full UB-sized chunks, then the remainder
        loop = shape_ele // ub_ele
        mod = shape_ele % ub_ele
        with tvm_ib.for_range(0, loop, name="num_p") as num_p:
            burst_len = _ceil_div(ub_ele, cp_align_len)
            tvm_ib.emit(
                tvm.call_extern(data_ub.dtype, "copy_gm_to_ubuf",
                                data_ub.access_ptr("w", offset=0),
                                data.access_ptr('r', offset=num_p * ub_ele),
                                0, 1, burst_len, 0, 0))
            tvm_ib.emit(
                tvm.call_extern(dst.dtype, "copy_ubuf_to_gm",
                                dst.access_ptr('w', offset=num_p * ub_ele),
                                data_ub.access_ptr("r", offset=0), 0, 1,
                                burst_len, 0, 0))
        with tvm_ib.if_scope(mod > 0):
            burst_len = _ceil_div(mod, cp_align_len)
            tvm_ib.emit(
                tvm.call_extern(data_ub.dtype, "copy_gm_to_ubuf",
                                data_ub.access_ptr("w", offset=0),
                                data.access_ptr('r', offset=loop * ub_ele),
                                0, 1, burst_len, 0, 0))
            tvm_ib.emit(
                tvm.call_extern(dst.dtype, "copy_ubuf_to_gm",
                                dst.access_ptr('w', offset=loop * ub_ele),
                                data_ub.access_ptr("r", offset=0), 0, 1,
                                burst_len, 0, 0))
    return tvm_ib.get()
def collapse(ir_b, buffer, current_size):
    """Halve ``buffer`` by pairwise vadd (stride-2 reduction step).

    Adds element pairs 8 lanes apart with src strides of 2, writing the
    sums to the front of the buffer, then handles any partial last repeat
    under a reduced vector mask.

    Bug fixed: the tail branch referenced the undefined names ``ir_builder``
    and ``in_buffer`` (the parameters are ``ir_b`` and ``buffer``), which
    raised NameError whenever the size was not a whole number of repeats.

    :param ir_b: tvm ir_builder to emit into
    :param buffer: UB buffer being reduced in place
    :param current_size: element count before this collapse step
    :return: the element count after collapsing (current_size // 2)
    """
    repeat = current_size // 2 / vector_inst_one_repeat_size
    tail_flag = False
    if not repeat.is_integer():
        tail_flag = True
    repeat = int(repeat)
    ir_b.emit(
        tvm.call_extern(buffer.dtype, "vadd",
                        buffer.access_ptr("rw", offset=0),
                        buffer.access_ptr("r", offset=0),
                        buffer.access_ptr("r", offset=8),
                        repeat, 1, 2, 2, 8, 16, 16))
    # solve tail vadd
    if tail_flag:
        tail_mask = \
            (current_size - repeat * 2 * vector_inst_one_repeat_size) // 2
        te.platform.cce_intrin_md.reset_mask_insn(ir_b, buffer.dtype,
                                                  tail_mask)
        ir_b.emit(
            tvm.call_extern(
                buffer.dtype, "vadd",
                buffer.access_ptr("rw",
                                  offset=repeat *
                                  vector_inst_one_repeat_size),
                buffer.access_ptr("r",
                                  offset=repeat * 2 *
                                  vector_inst_one_repeat_size),
                buffer.access_ptr(
                    "r",
                    offset=repeat * 2 * vector_inst_one_repeat_size + 8),
                1, 1, 2, 2, 0, 0, 0))
        # restore the full vector mask for subsequent instructions
        te.platform.cce_intrin_md.reset_mask_insn(ir_b, buffer.dtype)
    return current_size // 2
def _do_cp_input_gm(input_gm, data_len, offset, params):
    """Copy ``data_len`` elements from GM into ``params.input_ub``.

    :param input_gm: gm input buf
    :param data_len : length to be add
    :param offset: gm offset
    :param params : parameters
    """
    burst_cnt = _ceil_div(data_len, params.cp_align_len)
    copy_insn = tvm.call_extern(params.dtype, 'copy_gm_to_ubuf',
                                params.input_ub.access_ptr("rw", offset=0),
                                input_gm.access_ptr("r", offset=offset),
                                0, 1, burst_cnt, 0, 0)
    params.ib_.emit(copy_insn)
    # wait for the copy to land before any pipeline reads input_ub
    params.set_pipe_barrier('PIPE_ALL')
def _special_ir(dst, data):
    """Copy C0-grouped source rows into ``dst`` with a strided write-back.

    Processes ``c_i // c_0`` groups of 16 channels: each group is read
    contiguously from ``data`` and written back with a destination stride of
    ``c_i - c_0`` elements per row, interleaving the groups in the output.

    :param dst: destination GM tensor (4-D; first two dims used)
    :param data: source GM tensor laid out in c_0-sized groups of n_true rows
    :return: the built TVM statement
    """
    tvm_ib = tvm.ir_builder.create()
    float_size = cce.cce_intrin.get_bit_len(data.dtype) // 8
    cp_align_len = cce_params.BLOCK_REDUCE_INT8 // float_size
    n_i, c_i, _, _ = dst.shape
    c_0 = 16
    # n rounded up to a multiple of c_0: source group pitch
    n_true = _ceil_fill(n_i, c_0)
    ub_max = 3968 * 16
    data_ub = _new_alloc(tvm_ib, dst.dtype, ub_max, "data_ub",
                         scope=cce.scope_ubuf)
    loop = c_i // c_0
    with tvm_ib.for_range(0, loop, name="n_loop") as n_loop:
        data_offset = n_loop * c_0 * n_true
        burst_len = n_i * c_0 // cp_align_len
        tvm_ib.emit(
            tvm.call_extern(data_ub.dtype, "copy_gm_to_ubuf",
                            data_ub.access_ptr("w", offset=0),
                            data.access_ptr('r', offset=data_offset), 0, 1,
                            burst_len, 0, 0))
        dst_offset = n_loop * c_0
        burst_len_data = c_0 // cp_align_len
        # skip the other groups' channels between consecutive rows
        dst_stride = (c_i - c_0) // cp_align_len
        tvm_ib.emit(
            tvm.call_extern(dst.dtype, "copy_ubuf_to_gm",
                            dst.access_ptr('w', offset=dst_offset),
                            data_ub.access_ptr("r", offset=0), 0, n_i,
                            burst_len_data, 0, dst_stride))
    return tvm_ib.get()
def _core_func(out_begin, out_end, element_num_of_core):
    """Per-core body: build diagonal channel0 blocks and copy them to GM.

    Determines how many elements this core really handles (the last core
    gets the remainder), loads the input, writes one value per real channel
    onto the diagonal of a channel0 x channel0 output block, and emits the
    final ``copy_ubuf_to_gm``.

    :param out_begin : multi core output begin address
    :param out_end : multi core output end address
    :param element_num_of_core : element num of one core
    """
    if out_end != total_element or element_num_of_core == total_element:
        core_cal_num = element_num_of_core
    else:
        # tail core: only the remainder is processed
        core_cal_num = total_element % element_num_of_core
    if core_cal_num == 0:
        return
    _do_vector_dup((params.output_ub, 0), output_data_len, params.dtype,
                   params)
    _do_cp_input_gm(input_gm, input_data_len, 0, params)
    with params.ib_.for_range(0, core_cal_num, for_type="serial",
                              name="i") as i:
        i_tmp = out_begin + i
        with params.ib_.for_range(0, channel0, for_type="serial",
                                  name="c0_index") as c0_index:
            # diagonal slot of the i-th channel0 x channel0 block
            output_offset = i * channel0 * channel0 \
                + c0_index * channel0 + c0_index
            c1_index = i_tmp // (hight * weight)
            i_hw = i_tmp % (hight * weight)
            # padded C0 lanes beyond the real channel count are skipped
            with params.ib_.if_scope(
                    channel0 * (c1_index) + c0_index < channel):
                value = params.input_ub.vload(i_hw * channel +
                                              channel0 * c1_index + c0_index)
                params.ib_.emit(
                    params.output_ub.vstore(output_offset, value))
    num_cp = _ceil_div((core_cal_num * channel0 * channel0),
                       params.cp_align_len)
    params.ib_.emit(
        tvm.call_extern(
            params.dtype, 'copy_ubuf_to_gm',
            output_gm.access_ptr("rw",
                                 offset=out_begin * channel0 * channel0),
            params.output_ub.access_ptr("r", offset=0), 0, 1, num_cp, 0, 0))
def _kernel_ir(dst, src, dst_type, src_type):
    """Cast a single scalar from ``src_type`` to ``dst_type``.

    NOTICE: SCALE ONLY — moves exactly one element GM -> UB, converts it
    through a register move, and writes it back GM-side.

    :param dst: list with the output tensor at index 0
    :param src: list with the input tensor at index 0
    :param dst_type: destination dtype string
    :param src_type: source dtype string
    :return: the built TVM statement
    """
    ib = tvm.ir_builder.create()
    src_tensor = src[0]
    src_ub = _new_alloc(ib, src_type, src_tensor.shape, "a_ub",
                        scope=tbe_platform.scope_ubuf)
    dst_tensor = dst[0]
    dst_ub = _new_alloc(ib, dst_type, src_tensor.shape, "b_ub",
                        scope=tbe_platform.scope_ubuf)
    scalar = ib.allocate(dst_type, (1, ), name='reg',
                         scope=tbe_platform.scope_reg)
    # GM -> UB: bring the scalar on chip
    ib.emit(
        tvm.call_extern(src_type, "copy_gm_to_ubuf", src_ub.access_ptr("w"),
                        src_tensor.access_ptr("r"), 0, 1, 1, 0, 0))
    # UB -> register: reg_mov performs the dtype conversion
    ib.emit(
        tvm.call_extern(src_type, "reg_mov",
                        tvm.call_extern(dst_type, "reg", scalar[0]),
                        src_ub.access_ptr('r', offset=0)))
    # register -> UB in the destination dtype
    ib.emit(
        tvm.call_extern(dst_type, "reg_mov", dst_ub.access_ptr('w', offset=0),
                        tvm.call_extern(dst_type, "reg", scalar[0])))
    # UB -> GM: store the converted scalar
    ib.emit(
        tvm.call_extern(dst_type, "copy_ubuf_to_gm",
                        dst_tensor.access_ptr('w'), dst_ub.access_ptr("r"),
                        0, 1, 1, 0, 0))
    return ib.get()
def get_block_offset_one_core(self):
    """get_block_offset_one_core

    Computes this core's output window [out_begin, out_end) from the block
    index.  On Ascend310 the block index additionally has to be converted
    to a float bin offset through UB (int32 -> fp16 via vconv_deq -> fp32),
    because the scalar unit there cannot do the conversion directly; the
    result lands in ``self.reg[6]``.

    Parameters
    ----------
    self : self

    Returns
    -------
    None
    """
    self.out_begin = self.block.var * tvm.const(self.out_num_per_core,
                                                "int32")
    self.out_end = \
        self.block.var*tvm.const(self.out_num_per_core, "int32") \
        + tvm.const(self.out_num_per_core, "int32")
    # conv index of onecore to index ubvconv_deq
    if self.compile_plat in ("Ascend310", ):
        kernel_api.kernel_vector_dup_fuc(self.ir_builder,
                                         [self.index_ub, 0], 1,
                                         [1, self.mid_vec_align_len])
        int_ub = kernel_api.ib_new_alloc(self.ir_builder, "int32", [8],
                                         "int_ub",
                                         scope=tbe_platform.scope_ubuf)
        fp16_ub = kernel_api.ib_new_alloc(self.ir_builder, "float16", [16],
                                          "fp16_ub",
                                          scope=tbe_platform.scope_ubuf)
        int_reg = self.ir_builder.allocate("int32", (1, ),
                                           name="int_data",
                                           scope=cce_params.scope_reg)
        self.ir_builder.emit(
            tvm.call_extern('int32', 'pipe_barrier', self.args_str))
        # only cores whose window starts inside the bin range do the
        # conversion; others keep the vector_dup value in index_ub
        with self.ir_builder.if_scope(self.out_begin <= self.nbins):
            int_reg[0] = self.block.var
            self.ir_builder.emit(
                tvm.call_extern('int32', 'pipe_barrier', self.args_str))
            kernel_api.kernel_vector_dup_fuc(self.ir_builder, [int_ub, 0],
                                             int_reg[0], [1, 64])
            _addr_list = [[fp16_ub, 0], [int_ub, 0]]
            # deqscale must be programmed before vconv_deq is issued
            self.ir_builder.emit(
                tvm.call_extern('int32', 'set_deqscale', self.deqscale))
            kernel_api.kernel_cast_to_fuc(self.ir_builder, _addr_list,
                                          [1, 64], "vconv_deq")
            _addr_list = [[self.index_ub, 0], [fp16_ub, 0]]
            # widen fp16 to fp32
            kernel_api.kernel_cast_to_fuc(self.ir_builder, _addr_list,
                                          [1, self.mid_vec_align_len],
                                          "vconv_f162f32")
            # index * out_num_per_core * bin_width + range_start
            kernel_api.kernel_scalar_to_one_fuc(
                self.ir_builder,
                [[self.index_ub, 0], [self.index_ub, 0]],
                [1, self.mid_vec_align_len],
                ["vmuls", self.out_num_per_core])
            kernel_api.kernel_scalar_to_one_fuc(
                self.ir_builder,
                [[self.index_ub, 0], [self.index_ub, 0]],
                [1, self.mid_vec_align_len], ["vmuls", self.reg[3]])
            kernel_api.kernel_scalar_to_one_fuc(
                self.ir_builder,
                [[self.index_ub, 0], [self.index_ub, 0]],
                [1, self.mid_vec_align_len], ["vadds", self.reg[0]])
        self.ir_builder.emit(
            tvm.call_extern('int32', 'pipe_barrier', self.args_str))
        self.reg[6] = self.index_ub.vload(0, self.mid_dtype)
def custom_truncatemod(shape1, shape2, dtype,
                       kernel_name="cce_tf_truncatemod",
                       need_build=False,
                       need_print=False):
    """
    do element-wise truncatemod operation between two input tensors

    Parameters:
    ----------
    shape1 : shape of input data1

    shape2 : shape of input data2

    dtype : source data type, support float16,float32,int32

    kernel_name : cce kernel name, default value is "cce_tf_truncatemod"

    need_buid : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    max_dim = 8
    shape1_len = len(shape1)
    shape2_len = len(shape2)
    if shape1_len > max_dim or shape2_len > max_dim:
        raise RuntimeError(
            "mod_cce only support up to %d dimensions while the shape's \
dimensions is %d, %d" % (max_dim, shape1_len, shape2_len))
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape1)
    util.check_shape_rule(shape2)
    util.check_shape_size(shape1, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape2, SHAPE_SIZE_LIMIT)
    check_list = ["float16", "float32", "int32"]
    device_api_map = {"float16": "cc_device_truncatemod_float16",
                      "float32": "cc_device_truncatemod_float",
                      "int32": "cc_device_truncatemod_int32"}
    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_truncatemod_cce only support %s while dtype is %s" % (
                ",".join(check_list), dtype))
    # broadcast both shapes to the common output shape
    shape1, shape2, shape_out = util.produce_shapes(shape1, shape2)
    util.check_shape_size(shape_out, SHAPE_SIZE_LIMIT)
    inp_dtype = dtype.lower()
    device_api = device_api_map[inp_dtype]
    # block
    block_num = "block_num"
    block_idx = "block_idx"
    # x param
    v_xndim_cnt = tvm.const(len(shape1), "int32")
    p_xshape = util.create_param_ptr(shape1, "int32", "p_xshape")
    xpad_c0 = tvm.const(0, "int32")
    data_input_x = tvm.placeholder(shape1,
                                   name="data_input_x",
                                   dtype=inp_dtype)
    # y param
    v_yndim_cnt = tvm.const(len(shape2), "int32")
    p_yshape = util.create_param_ptr(shape2, "int32", "p_yshape")
    ypad_c0 = tvm.const(0, "int32")
    data_input_y = tvm.placeholder(shape2,
                                   name="data_input_y",
                                   dtype=inp_dtype)
    # output
    v_out_ndim_cnt = tvm.const(len(shape_out), "int32")
    # NOTE(review): buffer name "p_yshape" looks like a copy-paste from the
    # y param above; presumably intended to be "p_out_shape" — confirm
    p_out_shape = util.create_param_ptr(shape_out, "int32", "p_yshape")
    out_padc0 = tvm.const(0, "int32")
    # the whole op is a single device-API call wrapped as an extern
    output = tvm.extern(
        shape_out,
        [p_xshape, data_input_x, p_yshape, data_input_y, p_out_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t", device_api,
            block_num,
            block_idx,
            v_xndim_cnt,
            ins[0].access_ptr("r"),  # shape x
            xpad_c0,
            ins[1].access_ptr("r"),  # input x
            v_yndim_cnt,
            ins[2].access_ptr("r"),  # shape y
            ypad_c0,
            ins[3].access_ptr("r"),  # input y
            v_out_ndim_cnt,
            ins[4].access_ptr("r"),  # shape out
            out_padc0,
            outs[0].access_ptr("w")),
        name="output", dtype=inp_dtype)
    schedule = tvm.create_schedule(output.op)
    # print IR
    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input_x, data_input_y, output],
                            simple_mode=True))
    # Compile to generate the cce file
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input_x, data_input_y, output], "cce",
                      name=kernel_name)
def custom_round(shape, dtype, kernel_name="cce_round", need_build=False,
                 need_print=False):
    """
    doing round operations, calculating data type is float16 or float32 or
    int32

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype

    kernel_name : cce kernel name, default value is "cce_round"

    need_buid : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    check_list = ["float16", "float32", "int32"]
    device_api_map = {
        "float16": "cc_device_round_float16",
        "float32": "cc_device_round_float",
        "int32": "cc_device_round_int32"
    }
    max_dim = 8
    shape_len = len(shape)
    if shape_len > max_dim:
        raise RuntimeError(
            "round_cce only support up to %d dimensions while the shape's dimension is %d" % (max_dim, shape_len))
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    if not (dtype.lower() in check_list):
        raise RuntimeError("round_cce only support %s while dtype is %s" %
                           (",".join(check_list), dtype))
    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    device_api = device_api_map[inp_dtype]
    block_num = "block_num"
    block_idx = "block_idx"
    v_ndim = tvm.const(len(shape), "int32")
    padC0 = tvm.const(0, "int32")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")
    # single device-API call wrapped as an extern op
    output = tvm.extern(
        shape, [data_input, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t", device_api, block_num, block_idx, v_ndim,
            ins[1].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)
    s = tvm.create_schedule(output.op)
    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
def custom_Power(shape, dtype, gamma, alpha, beta,
                 kernel_name="cce_caffe_power",
                 need_build=False,
                 need_print=False):
    """
    calculate (alpha * data + beta) ** gamma, calulation method
    exp(gamma * log(alpha * data + beta)).
    when alpha * data + beta < 0 , the output is a meaningless value.

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support
    float16, float32

    gamma : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    alpha : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    beta : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    kernel_name : string
        kernel name in generated CCE kernal. default value is
        "cce_caffe_power"

    need_buid : bool
        if need to build CCEC kernel

    need_print : bool
        if need to print Halide IR

    Returns
    -------
    None
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "cc_device_pow"
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError("power_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))
    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim_x = len(shape)
    # second operand unused by the pow device API: pass empty placeholders
    v_ndim_y = 0
    p_shape_y = 0
    p_input_y = "nullptr"
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0
    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([gamma], inp_dtype, "p_power")
    p_shape_x = util.create_param_ptr(shape, "int32", "p_shape_x")
    # scale --> alpha, shitf --> beta, power --> gamma
    output = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_power, p_shape_x],
        lambda ins, outs: tvm.call_extern(
            "int32_t", device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # power
            v_ndim_x,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            # NOTE(review): v_ndim_y is passed twice in a row — verify this
            # matches the cc_device_pow argument list and is not a typo
            v_ndim_y,
            v_ndim_y,
            p_shape_y,
            padC0,
            p_input_y,
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)
    s = tvm.create_schedule(output.op)
    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
def _do_operation(ir_builder, place_holders, plantform_paras, loops_remains,
                  const_1, block_offset, shape_each_core, num_remain_by_128,
                  is_not_align):
    """Masked scale-and-select main body (dropout-style) for one core.

    Two phases share the same recipe: copy data/mask/keep_prob to UB,
    compute 1/keep_prob via vdiv into a register, vmuls the data by it
    (in 255-repeat chunks), then per 128-element batch load the bitmask
    with set_cmpmask and run ``_sel_data``, and finally copy the result
    back to GM.  Phase 1 handles ``loops_remains[0]`` full UB tiles;
    phase 2 handles the remaining tail tile (``loops_remains[2]``),
    including a sub-128 remainder under a reduced vector mask.

    :param ir_builder: tvm ir_builder to emit into
    :param place_holders: [0]=data, [1]=mask, [2]=output, [3]=keep_prob
    :param plantform_paras: [0]=tile element count; [1]/[2]=per-batch
        data/mask offsets used for the tail remainder
    :param loops_remains: mutable loop bookkeeping, indices reused across
        phases (see body)
    :param const_1: placeholder supplying the dtype for the all-ones buffer
    :param block_offset: this core's starting element in GM
    :param shape_each_core: elements handled per core
    :param num_remain_by_128: remainder elements (mod 128) for alignment
    :param is_not_align: True when the overall shape is not 128-aligned
    """
    #alloc_res[0:data_zero_ub 1:data_fp16_1
    # 2:data_fp16_mask 3:data_fp32_1 4:data_tensor_ub 5:data_mask_ub]
    #offsets[0:total_gm_data_offset 1:total_gm_mask_offset
    # 2:offset_gm_data 3:offset_gm_mask
    # 4:total_ub_data_offset 5:total_ub_mask_offset]
    #repeates[0:repeate_ub_data 1:repeate_ub_mask 2:repeate_ub_vector
    # 3:repeate_d 4:repeate_m 5:repeate_v]
    # 6 = the list size
    reg = ir_builder.allocate(place_holders[0].dtype, (1, ),
                              name="reg",
                              scope=tbe_platform.scope_reg)
    [alloc_res, offsets, repeates] = [[None] * 7, [0] * 6, [0] * 6]
    # mask is 1 bit per element, hence the // 8 on its offsets
    [offsets[0], offsets[1]] = [offsets[0] + block_offset,
                                offsets[1] + block_offset // 8]
    # fp32 input needs the fp16 scratch buffers for the vsel path;
    # fp16 input only needs the zero buffer
    [alloc_res[0], alloc_res[1], alloc_res[2], alloc_res[3]] = [
        _new_alloc(ir_builder, 'float16', (ELEMS_BATCH_PROCESS_FP16, ),
                   "data_zero_ub", scope=tbe_platform.scope_ubuf),
        _new_alloc(ir_builder, 'float16', (ELEMS_BATCH_PROCESS_FP16, ),
                   "data_fp16one_ub", scope=tbe_platform.scope_ubuf),
        _new_alloc(ir_builder, 'float16', (ELEMS_BATCH_PROCESS_FP16, ),
                   "data_fp16_all1_mask_ub", scope=tbe_platform.scope_ubuf),
        _new_alloc(ir_builder, 'float32', (ELEMS_BATCH_PROCESS_FP16, ),
                   "data_fp32one_ub", scope=tbe_platform.scope_ubuf)
    ] if (place_holders[0].dtype == 'float32') else [
        _new_alloc(ir_builder, 'float16', (ELEMS_BATCH_PROCESS_FP16, ),
                   "data_zero_ub", scope=tbe_platform.scope_ubuf),
        None, None, None
    ]
    [alloc_res[4], alloc_res[5], alloc_res[6]] = [
        _new_alloc(ir_builder, place_holders[0].dtype,
                   (plantform_paras[0], ), "data_tensor_ub",
                   scope=tbe_platform.scope_ubuf),
        _new_alloc(ir_builder, place_holders[1].dtype,
                   (plantform_paras[0] // 8, ), "data_mask_ub",
                   scope=tbe_platform.scope_ubuf),
        _new_alloc(ir_builder, place_holders[3].dtype, (1, ),
                   "keep_prob_tensor_ub", scope=tbe_platform.scope_ubuf)
    ] if (loops_remains[0] > 0) else [None, None, None]
    const_buf = _new_alloc(ir_builder, const_1.dtype,
                           (ELEMS_BATCH_PROCESS_FP16, ), "const_1_ub",
                           scope=tbe_platform.scope_ubuf)
    if loops_remains[0] > 0:
        with ir_builder.for_range(0, loops_remains[0],
                                  name='index0') as index0:
            [offsets[2], offsets[3]] = [
                block_offset + plantform_paras[0] * index0,
                block_offset // 8 + plantform_paras[0] // 8 * index0
            ]
            # 16: fp16 elems can be move by once is 16,
            # lots of '16' below for this reason
            # 32: uint8 elems can be move by once is 32,
            # lots of '32' below for this reason
            # 64: fp32 elems can be process by vector instruction,
            # lots of '64' below for this reason
            [repeates[0], repeates[1], repeates[2]] = [
                plantform_paras[0] // 16, plantform_paras[0] // 8 // 32,
                plantform_paras[0] // ELEMS_BATCH_PROCESS_FP16
            ] if (place_holders[0].dtype == 'float16') else [
                plantform_paras[0] // 8, plantform_paras[0] // 8 // 32,
                plantform_paras[0] // 64
            ]
            ir_builder.emit(
                tvm.call_extern('float16', "vector_dup",
                                alloc_res[0].access_ptr("rw"),
                                tvm.const(0.0, dtype='float16'), 1, 1, 1, 8,
                                8))
            ir_builder.emit(
                tvm.call_extern(const_1.dtype, "vector_dup",
                                const_buf.access_ptr("rw"),
                                tvm.const(1.0, dtype=const_1.dtype), 1, 1, 1,
                                8, 8))
            if place_holders[0].dtype == 'float32':
                ir_builder.emit(
                    tvm.call_extern('float16', "vector_dup",
                                    alloc_res[1].access_ptr("rw"),
                                    tvm.const(1.0, dtype='float16'), 1, 1, 1,
                                    8, 8))
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[1].dtype, "copy_gm_to_ubuf",
                    alloc_res[5].access_ptr("w"),
                    place_holders[1].access_ptr("r", offset=offsets[3]), 0,
                    1, repeates[1], 0, 0))
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[0].dtype, "copy_gm_to_ubuf",
                    alloc_res[4].access_ptr("w"),
                    place_holders[0].access_ptr("r", offset=offsets[2]), 0,
                    1, repeates[0], 0, 0))
            ir_builder.emit(
                tvm.call_extern(place_holders[3].dtype, "copy_gm_to_ubuf",
                                alloc_res[6].access_ptr("w"),
                                place_holders[3].access_ptr("r", offset=0),
                                0, 1, 1, 0, 0))
            # keep_prob is a single scalar: mask down to 1 lane for the vdiv
            cce_intrin_md.reset_mask_insn(ir_builder, const_1.dtype, bits=1,
                                          mask_func=None)
            # compute 1 / keep_prob in place
            ir_builder.emit(
                tvm.call_extern(place_holders[3].dtype, 'vdiv',
                                alloc_res[6].access_ptr('w'),
                                const_buf.access_ptr('r'),
                                alloc_res[6].access_ptr('r'), 1, 1, 1, 1, 8,
                                8, 8))
            cce_intrin_md.reset_mask_insn(ir_builder, const_1.dtype,
                                          bits=ELEMS_BATCH_PROCESS_FP16,
                                          mask_func=None)
            ir_builder.emit(
                tvm.call_extern(place_holders[3].dtype, "reg_mov",
                                tvm.call_extern(reg.dtype, "reg", reg[0]),
                                alloc_res[6].access_ptr("r", offset=0)))
            # vmuls repeat is capped at 255; chunk the scaling accordingly
            offset_src = 64 * 255 if place_holders[
                0].dtype == "float32" else 128 * 255
            repeate_vmuls = repeates[2] // 255
            repeat_left = repeates[2] % 255
            for i in range(repeate_vmuls):
                ir_builder.emit(
                    tvm.call_extern(
                        place_holders[0].dtype, 'vmuls',
                        alloc_res[4].access_ptr('w', offset=offset_src * i),
                        alloc_res[4].access_ptr('r'), reg[0], 255, 1, 1, 8,
                        8))
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[0].dtype, 'vmuls',
                    alloc_res[4].access_ptr('w',
                                            offset=offset_src *
                                            repeate_vmuls),
                    alloc_res[4].access_ptr('r',
                                            offset=offset_src *
                                            repeate_vmuls), reg[0],
                    repeat_left, 1, 1, 8, 8))
            # per 128-element batch: load 128 mask bits (16 bytes) and select
            with ir_builder.for_range(0, loops_remains[1],
                                      name='index1') as index1:
                ir_builder.emit(
                    tvm.call_extern(
                        place_holders[1].dtype, 'set_cmpmask',
                        alloc_res[5].access_ptr('r', offset=16 * index1)))
                _sel_data(ir_builder, place_holders[0], alloc_res,
                          ELEMS_BATCH_PROCESS_FP16 * index1)
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[2].dtype, "copy_ubuf_to_gm",
                    place_holders[2].access_ptr('w', offset=offsets[2]),
                    alloc_res[4].access_ptr("r"), 0, 1, repeates[0], 0, 0))
    # advance the GM offsets past everything phase 1 consumed
    [offsets[0], offsets[1]] = [
        offsets[0] + plantform_paras[0] * loops_remains[0],
        offsets[1] + plantform_paras[0] * loops_remains[0] // 8
    ]
    if loops_remains[2]:
        # 0:data_shape 1:mask_shape
        if num_remain_by_128 != 0 and is_not_align:
            remain_shapes = ((int(place_holders[0].shape[0]) -
                              plantform_paras[0] * loops_remains[0], ),
                             (int(place_holders[1].shape[0]) -
                              plantform_paras[0] // 8 * loops_remains[0], ))
        else:
            remain_shapes = ((shape_each_core -
                              plantform_paras[0] * loops_remains[0], ),
                             (shape_each_core // 8 -
                              plantform_paras[0] // 8 * loops_remains[0], ))
        [alloc_res[4], alloc_res[5], alloc_res[6]] = [
            _new_alloc(ir_builder, place_holders[0].dtype, remain_shapes[0],
                       "data_tensor_ub", scope=tbe_platform.scope_ubuf),
            _new_alloc(ir_builder, place_holders[1].dtype, remain_shapes[1],
                       "data_mask_ub", scope=tbe_platform.scope_ubuf),
            _new_alloc(ir_builder, place_holders[3].dtype, (1, ),
                       "keep_prob_tensor_ub", scope=tbe_platform.scope_ubuf)
        ]
        [repeates[3], repeates[4], repeates[5]] = [
            int(math.ceil(remain_shapes[0][0] * 1.0 / 8)),
            int(math.ceil(remain_shapes[1][0] * 1.0 / 32)),
            int(remain_shapes[0][0] * 1.0 / 64)
        ] if (place_holders[0].dtype == 'float32') else [
            int(math.ceil(remain_shapes[0][0] * 1.0 / 16)),
            int(math.ceil(remain_shapes[1][0] * 1.0 / 32)),
            int(remain_shapes[0][0] * 1.0 / ELEMS_BATCH_PROCESS_FP16)
        ]
        ir_builder.emit(
            tvm.call_extern('float16', "vector_dup",
                            alloc_res[0].access_ptr("rw"),
                            tvm.const(0.0, dtype='float16'), 1, 1, 1, 8, 8))
        ir_builder.emit(
            tvm.call_extern(const_1.dtype, "vector_dup",
                            const_buf.access_ptr("rw"),
                            tvm.const(1.0, dtype=const_1.dtype), 1, 1, 1, 8,
                            8))
        if place_holders[0].dtype == 'float32':
            ir_builder.emit(
                tvm.call_extern('float16', "vector_dup",
                                alloc_res[1].access_ptr("rw"),
                                tvm.const(1.0, dtype='float16'), 1, 1, 1, 8,
                                8))
        ir_builder.emit(
            tvm.call_extern(
                place_holders[1].dtype, "copy_gm_to_ubuf",
                alloc_res[5].access_ptr("w"),
                place_holders[1].access_ptr("r", offset=offsets[1]), 0, 1,
                repeates[4], 0, 0))
        ir_builder.emit(
            tvm.call_extern(
                place_holders[0].dtype, "copy_gm_to_ubuf",
                alloc_res[4].access_ptr("w"),
                place_holders[0].access_ptr("r", offset=offsets[0]), 0, 1,
                repeates[3], 0, 0))
        ir_builder.emit(
            tvm.call_extern(place_holders[3].dtype, "copy_gm_to_ubuf",
                            alloc_res[6].access_ptr("w"),
                            place_holders[3].access_ptr("r", offset=0), 0, 1,
                            1, 0, 0))
        cce_intrin_md.reset_mask_insn(ir_builder, const_1.dtype, bits=1,
                                      mask_func=None)
        ir_builder.emit(
            tvm.call_extern(place_holders[3].dtype, 'vdiv',
                            alloc_res[6].access_ptr('w'),
                            const_buf.access_ptr('r'),
                            alloc_res[6].access_ptr('r'), 1, 1, 1, 1, 8, 8,
                            8))
        cce_intrin_md.reset_mask_insn(ir_builder, const_1.dtype,
                                      bits=ELEMS_BATCH_PROCESS_FP16,
                                      mask_func=None)
        ir_builder.emit(
            tvm.call_extern(place_holders[0].dtype, "reg_mov",
                            tvm.call_extern(reg.dtype, "reg", reg[0]),
                            alloc_res[6].access_ptr("r", offset=0)))
        offset_src = 64 * 255 if place_holders[
            0].dtype == "float32" else 128 * 255
        repeate_vmuls = repeates[5] // 255
        repeat_left = repeates[5] % 255
        for i in range(repeate_vmuls):
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[0].dtype, 'vmuls',
                    alloc_res[4].access_ptr('w', offset=offset_src * i),
                    alloc_res[4].access_ptr('r'), reg[0], 255, 1, 1, 8, 8))
        ir_builder.emit(
            tvm.call_extern(
                place_holders[0].dtype, 'vmuls',
                alloc_res[4].access_ptr('w',
                                        offset=offset_src * repeate_vmuls),
                alloc_res[4].access_ptr('r',
                                        offset=offset_src * repeate_vmuls),
                reg[0], repeat_left, 1, 1, 8, 8))
        remains_divs = ELEMS_BATCH_PROCESS_FP16 \
            if place_holders[0].dtype == 'float16' else 64
        [loops_remains[1], loops_remains[3]] = [
            remain_shapes[0][0] // remains_divs,
            remain_shapes[0][0] % remains_divs
        ]
        # fp32 selects consume 2 x 64-element vectors per 128-bit cmpmask
        loops = ((loops_remains[1]) // 2) + 1 \
            if place_holders[0].dtype == 'float32' else loops_remains[1]
        with ir_builder.for_range(0, loops, name='index2') as index2:
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[1].dtype, 'set_cmpmask',
                    alloc_res[5].access_ptr('r', offset=16 * index2)))
            _sel_data(ir_builder, place_holders[0], alloc_res,
                      ELEMS_BATCH_PROCESS_FP16 * index2)
        # NOTE(review): both ternary branches are identical here —
        # presumably a leftover from a dtype-specific distinction; confirm
        [offsets[4], offsets[5]] = [
            plantform_paras[1] * loops_remains[1],
            plantform_paras[2] * loops_remains[1]
        ] if (place_holders[0].dtype == 'float32') else [
            plantform_paras[1] * loops_remains[1],
            plantform_paras[2] * loops_remains[1]
        ]
        if loops_remains[3]:
            # sub-batch remainder: shrink the vector mask, scale & select
            cce_intrin_md.reset_mask_insn(ir_builder,
                                          place_holders[0].dtype,
                                          bits=loops_remains[3],
                                          mask_func=None)
            ir_builder.emit(
                tvm.call_extern(
                    place_holders[0].dtype, 'vmuls',
                    alloc_res[4].access_ptr('w', offset=offsets[4]),
                    alloc_res[4].access_ptr('r', offset=offsets[4]), reg[0],
                    1, 1, 1, 8, 8))
            if place_holders[0].dtype == 'float16' or loops_remains[1] == 0:
                ir_builder.emit(
                    tvm.call_extern(
                        place_holders[1].dtype, 'set_cmpmask',
                        alloc_res[5].access_ptr('r', offset=offsets[5])))
            _sel_data(ir_builder, place_holders[0], alloc_res, offsets[4])
            cce_intrin_md.reset_mask_insn(ir_builder,
                                          place_holders[0].dtype,
                                          bits=ELEMS_BATCH_PROCESS_FP16,
                                          mask_func=None)
        ir_builder.emit(
            tvm.call_extern(
                place_holders[2].dtype, "copy_ubuf_to_gm",
                place_holders[2].access_ptr('w', offset=offsets[0]),
                alloc_res[4].access_ptr("r"), 0, 1, repeates[3], 0, 0))
def _func_more_row(args):
    """
    Emit IR moving data for the "more row" scene: the rows handled by this
    core may span a head part (finishing the current HW position), a middle
    part (whole groups of c_1 rows) and a tail part (a partial group).

    args unpacks to:
        tvm_ib          -- tvm ir_builder being filled
        param           -- dict of tiling parameters (read via .get)
        data            -- source GM tensor, shape (_, n_no, n_ni, c_0)
        dst             -- destination GM tensor, shape (h_i, w_i, c_i, n_i)
        data_ub/data_res/data_tail -- UB staging buffers
        reg             -- scalar register buffer for reg_mov transposes
        reg_addr        -- register array used to carry runtime-computed sizes
        num_g           -- group index of this core
        num_row_cur_core -- number of rows this core processes
        c_0             -- channel-0 tiling size (also re-read from data.shape)
    """
    tvm_ib, param, data, dst, data_ub, data_res, data_tail, reg, reg_addr,\
        num_g, num_row_cur_core, c_0 = args
    _, n_no, n_ni, c_0 = data.shape
    # elements of one full source row
    row_ele = n_no*n_ni*c_0
    h_i, w_i, c_i, n_i = dst.shape
    c_1 = _ceil_div(c_i, c_0)
    h_w = h_i*w_i
    # rows already consumed by previous groups/cores
    num_row_before_core = num_g*param.get("num_row_one_group")\
        + param.get("block_index")\
        * param.get("num_row_one_core")
    num_hw_dst_before_core = num_row_before_core // c_1
    num_c0_dst_cur_hw_before = num_row_before_core % c_1
    # rows left to finish the current HW position
    num_c0_dst_cur_hw = c_1 - num_c0_dst_cur_hw_before
    # registers used per inner unrolled transpose batch
    reg_count = 8

    # Case 1: everything this core does fits inside the current HW position.
    with tvm_ib.if_scope(num_row_cur_core <= num_c0_dst_cur_hw):
        data_offset = num_c0_dst_cur_hw_before*h_w*row_ele\
            + num_hw_dst_before_core*row_ele
        n_burst = num_row_cur_core
        burst_len_data = _ceil_div(row_ele, param.get("cp_align_len"))
        # stride skips the other HW positions between consecutive rows
        src_stride = _ceil_div((h_w - 1)*row_ele, param.get("cp_align_len"))
        args = tvm_ib, param, data, data_ub, data_offset, 0, n_burst,\
            burst_len_data, src_stride, 0
        _func_gm_to_ub(args)
        c_t = tvm.min((num_c0_dst_cur_hw_before + 1) * c_0, c_i)
        c_cur = c_t - (num_c0_dst_cur_hw_before*c_0)
        # Transpose row-major UB data into destination layout via reg_mov.
        with tvm_ib.for_range(0, num_row_cur_core, name="num_tr") as num_tr:
            with tvm_ib.for_range(0, c_cur, name="num_c") as num_c:
                with tvm_ib.for_range(0, n_no, name="num_no") as num_no:
                    n_t = tvm.min((num_no + 1)*n_ni, n_i)
                    n_cur = n_t - num_no*n_ni
                    # Fast path: move 8 elements per iteration using 8 regs.
                    with tvm_ib.if_scope(n_cur % reg_count == 0):
                        n_cur_times_8 = n_cur // reg_count
                        reg_list = [n for n in range(reg_count)]
                        with tvm_ib.for_range(0, n_cur_times_8,
                                              name="num_nc") as num_nc:
                            for reg_idx in reg_list:
                                tvm_ib.emit(tvm.call_extern(
                                    data_ub.dtype, "reg_mov",
                                    tvm.call_extern(reg.dtype, "reg",
                                                    reg[reg_idx]),
                                    data_ub.access_ptr(
                                        'r',
                                        offset=(num_tr * row_ele
                                                + num_no * n_ni * c_0
                                                + (reg_idx + num_nc
                                                   * reg_count) * c_0
                                                + num_c))
                                ))
                            for reg_idx in reg_list:
                                tvm_ib.emit(tvm.call_extern(
                                    data_res.dtype, "reg_mov",
                                    data_res.access_ptr(
                                        'w',
                                        offset=(num_tr * c_0 * n_i
                                                + num_c * n_i
                                                + num_no * n_ni
                                                + (reg_idx + num_nc
                                                   * reg_count))),
                                    tvm.call_extern(reg.dtype, "reg",
                                                    reg[reg_idx])
                                ))
                    # Slow path: one element at a time through reg[0].
                    with tvm_ib.else_scope():
                        with tvm_ib.for_range(0, n_cur,
                                              name="num_nc") as num_nc:
                            tvm_ib.emit(tvm.call_extern(
                                data_ub.dtype, "reg_mov",
                                tvm.call_extern(reg.dtype, "reg", reg[0]),
                                data_ub.access_ptr(
                                    'r',
                                    offset=(num_tr * row_ele
                                            + num_no * n_ni * c_0
                                            + num_nc * c_0 + num_c))
                            ))
                            tvm_ib.emit(tvm.call_extern(
                                data_res.dtype, "reg_mov",
                                data_res.access_ptr(
                                    'w',
                                    offset=(num_tr * c_0 * n_i + num_c * n_i
                                            + num_no * n_ni + num_nc)),
                                tvm.call_extern(reg.dtype, "reg", reg[0])
                            ))
        c_t = tvm.min((num_c0_dst_cur_hw_before+num_row_cur_core)*c_0, c_i)
        c_cur = c_t - (num_c0_dst_cur_hw_before * c_0)
        total_len = c_cur * n_i
        # carry runtime length through a register for the GM writeback helper
        reg_addr[5] = total_len
        dst_offset = num_hw_dst_before_core*c_i*n_i\
            + num_c0_dst_cur_hw_before*c_0*n_i
        args = tvm_ib, param, dst, data_res, data_tail, reg, reg_addr, 0, 0,\
            dst_offset, reg_addr[5]
        _res_to_gm_more_row(args)

    # Case 2: rows spill past the current HW position -> head/middle/tail.
    with tvm_ib.if_scope(num_row_cur_core > num_c0_dst_cur_hw):
        num_c0_head = num_c0_dst_cur_hw
        num_row_after = num_row_cur_core - num_c0_head
        reg_addr[2] = num_row_after
        num_g_mid = reg_addr[2] // c_1
        num_c0_tail = reg_addr[2] % c_1
        # gm to ub to ub_res
        with tvm_ib.if_scope(num_c0_head > 0):
            data_offset = num_c0_dst_cur_hw_before * h_w * row_ele\
                + num_hw_dst_before_core * row_ele
            n_burst = num_c0_head
            burst_len_data = _ceil_div(row_ele, param.get("cp_align_len"))
            src_stride = _ceil_div((h_w - 1) * row_ele,
                                   param.get("cp_align_len"))
            args = tvm_ib, param, data, data_ub, data_offset, 0, n_burst,\
                burst_len_data, src_stride, 0
            _func_gm_to_ub(args)
            # ub to ub_res
            with tvm_ib.for_range(0, num_c0_head, name="num_c0") as num_c0:
                c_t = tvm.min((num_c0_dst_cur_hw_before + num_c0 + 1) * c_0,
                              c_i)
                c_cur = c_t - (num_c0_dst_cur_hw_before + num_c0)*c_0
                with tvm_ib.for_range(0, c_cur, name="num_cr") as num_cr:
                    with tvm_ib.for_range(0, n_no, name="num_no") as num_no:
                        n_t = tvm.min((num_no + 1) * n_ni, n_i)
                        n_cur = n_t - num_no*n_ni
                        with tvm_ib.for_range(0, n_cur, name="num_nc")\
                                as num_nc:
                            tvm_ib.emit(tvm.call_extern(
                                data_ub.dtype, "reg_mov",
                                tvm.call_extern(reg.dtype, "reg", reg[0]),
                                data_ub.access_ptr(
                                    'r',
                                    offset=(num_c0*row_ele + num_no*n_ni*c_0
                                            + num_nc*c_0 + num_cr))
                            ))
                            tvm_ib.emit(tvm.call_extern(
                                data_res.dtype, "reg_mov",
                                data_res.access_ptr(
                                    'w',
                                    offset=(num_c0*c_0*n_i + num_cr*n_i
                                            + num_no*n_ni + num_nc)),
                                tvm.call_extern(reg.dtype, "reg", reg[0])
                            ))
        with tvm_ib.if_scope(num_g_mid > 0):
            num_row_before_mid = num_row_before_core + num_c0_head
            reg_addr[3] = num_row_before_mid
            num_hw_dst_before_mid = reg_addr[3] // c_1
            n_burst = c_1
            burst_len_data = _ceil_div(row_ele, param.get("cp_align_len"))
            src_stride = _ceil_div((h_w - 1) * row_ele,
                                   param.get("cp_align_len"))
            ub_offset_mid_begin = num_c0_head*row_ele
            with tvm_ib.for_range(0, num_g_mid, name="num_mg") as num_mg:
                data_offset = (num_hw_dst_before_mid + num_mg) * row_ele
                ub_offset = ub_offset_mid_begin + num_mg*c_1*row_ele
                args = tvm_ib, param, data, data_ub, data_offset, ub_offset,\
                    n_burst, burst_len_data, src_stride, 0
                _func_gm_to_ub(args)
            # ub to ub_res
            res_offset_mid_begin = c_i*n_i - num_c0_dst_cur_hw_before*c_0*n_i
            with tvm_ib.for_range(0, num_g_mid, name="num_mg") as num_mg:
                # NOTE(review): loop bound is c_i here while each middle group
                # holds c_1 rows and the read offset advances by
                # num_ci*row_ele -- this looks like it may be intended to be
                # c_1; verify against the tiling that feeds this function.
                with tvm_ib.for_range(0, c_i, name="num_ci") as num_ci:
                    c_t = tvm.min((num_ci + 1) * c_0, c_i)
                    c_cur = c_t - num_ci*c_0
                    with tvm_ib.for_range(0, c_cur, name="num_cr") as num_cr:
                        with tvm_ib.for_range(0, n_no,
                                              name="num_no") as num_no:
                            n_t = tvm.min((num_no + 1)*n_ni, n_i)
                            n_cur = n_t - num_no*n_ni
                            with tvm_ib.for_range(0, n_cur, name="num_nc")\
                                    as num_nc:
                                tvm_ib.emit(tvm.call_extern(
                                    data_ub.dtype, "reg_mov",
                                    tvm.call_extern(reg.dtype, "reg",
                                                    reg[0]),
                                    data_ub.access_ptr(
                                        'r',
                                        offset=(ub_offset_mid_begin
                                                + num_mg*c_1*row_ele
                                                + num_ci*row_ele
                                                + num_no*n_ni*c_0
                                                + num_nc*c_0 + num_cr))
                                ))
                                tvm_ib.emit(tvm.call_extern(
                                    data_res.dtype, "reg_mov",
                                    data_res.access_ptr(
                                        'w',
                                        offset=(res_offset_mid_begin
                                                + num_mg*c_i*n_i
                                                + num_ci*c_0*n_i
                                                + num_cr*n_i
                                                + num_no*n_ni + num_nc)),
                                    tvm.call_extern(reg.dtype, "reg", reg[0])
                                ))
        with tvm_ib.if_scope(num_c0_tail > 0):
            num_row_before_tail = num_row_before_core + num_c0_head\
                + num_g_mid*c_1
            reg_addr[4] = num_row_before_tail
            num_hw_dst_before_tail = reg_addr[4] // c_1
            ub_offset_tail_begin = (num_c0_head + num_g_mid*c_1)*row_ele
            data_offset = num_hw_dst_before_tail*row_ele
            n_burst = num_c0_tail
            burst_len_data = _ceil_div(row_ele, param.get("cp_align_len"))
            src_stride = _ceil_div((h_w - 1) * row_ele,
                                   param.get("cp_align_len"))
            args = tvm_ib, param, data, data_ub, data_offset,\
                ub_offset_tail_begin, n_burst, burst_len_data, src_stride, 0
            _func_gm_to_ub(args)
            # ub to ub_res
            res_offset_tail_begin = c_i*n_i\
                - num_c0_dst_cur_hw_before*c_0*n_i\
                + num_g_mid*n_i*c_i
            with tvm_ib.for_range(0, num_c0_tail, name="num_tc") as num_tc:
                c_t = tvm.min((num_tc + 1) * c_0, c_i)
                c_cur = c_t - num_tc*c_0
                with tvm_ib.for_range(0, c_cur, name="num_cr") as num_cr:
                    with tvm_ib.for_range(0, n_no, name="num_no") as num_no:
                        n_t = tvm.min((num_no + 1)*n_ni, n_i)
                        n_cur = n_t - num_no*n_ni
                        with tvm_ib.for_range(0, n_cur, name="num_nc")\
                                as num_nc:
                            tvm_ib.emit(tvm.call_extern(
                                data_ub.dtype, "reg_mov",
                                tvm.call_extern(reg.dtype, "reg", reg[0]),
                                data_ub.access_ptr(
                                    'r',
                                    offset=(ub_offset_tail_begin
                                            + num_tc*row_ele
                                            + num_no*n_ni*c_0
                                            + num_nc*c_0 + num_cr))
                            ))
                            tvm_ib.emit(tvm.call_extern(
                                data_res.dtype, "reg_mov",
                                data_res.access_ptr(
                                    'w',
                                    offset=(res_offset_tail_begin
                                            + num_tc*c_0*n_i + num_cr * n_i
                                            + num_no*n_ni + num_nc)),
                                tvm.call_extern(reg.dtype, "reg", reg[0])
                            ))
        # ub_res to dst
        total_len = c_i*n_i - num_c0_dst_cur_hw_before*c_0*n_i\
            + num_g_mid*n_i*c_i + num_c0_tail*c_0*n_i
        reg_addr[6] = total_len
        dst_offset = num_hw_dst_before_core*c_i*n_i\
            + num_c0_dst_cur_hw_before*c_0*n_i
        args = tvm_ib, param, dst, data_res, data_tail, reg, reg_addr, 1, 0,\
            dst_offset, reg_addr[6]
        _res_to_gm_more_row(args)
def _histogram_fixed_width_ir(dst, src, nbins, shape_list):
    """
    IR node builder make function for histogram_fixed_width.

    Parameters
    ----------
    dst: list
        the placeholders of dst
    src: list
        the placeholders of src, data, data_range = src
    nbins: int
        number of histogram bins.
    shape_list: list
        the shape list of srcs, data_shape, data_range_shape = shape_list

    Returns
    -------
    ib.get(): stmt
        The result statement.
    """
    data, data_range = src
    ib_create = tvm.ir_builder.create()
    # params init (dtypes/shapes/bin count bundled into the IR param object)
    params = IrParams(ib_create, [data.dtype, data_range.dtype, dst[0].dtype],
                      [shape_list[0], shape_list[1], dst[0].shape], nbins)
    # calc out_begin and out_end per core
    # init src_mid_input_ub: first half zeros, second half smallest normal
    # float32 (2**-126) -- presumably a divide-by-zero guard; verify.
    kernel_api.kernel_vector_dup_fuc(
        params.ir_builder, [params.range0_ub, 0], SCALAR_ZERO,
        [params.mid_vec_align_len, params.mid_vec_align_len])
    kernel_api.kernel_vector_dup_fuc(
        params.ir_builder, [params.range0_ub, params.mid_vec_align_len],
        2**(-126),
        [params.mid_vec_align_len, params.mid_vec_align_len])
    # init tensor: output tensor, len=nbins
    kernel_api.kernel_vector_dup_fuc(
        params.ir_builder, [params.des_output_ub, 0], SCALAR_ZERO,
        [params.out_num_per_core, params.output_vec_align_len])
    kernel_api.kernel_vector_dup_fuc(
        params.ir_builder, [params.des_tmp_output_ub, 0], SCALAR_ZERO,
        [params.out_num_per_core, params.output_vec_align_len])
    # copy data_range from out to ub
    kernel_api.kernel_cp_fuc(
        params.ir_builder, [[params.range_src_ub, 0], [data_range, 0]],
        [params.data_range_shape[0], params.input_align_len],
        "copy_gm_to_ubuf")
    # vconv input to float32
    _fuction_data_conv_ir(
        [[params.src_mid_input_range_ub, 0], [params.range_src_ub, 0]], [
            params.data_range_shape[0],
            max(params.input_vec_align_len, params.mid_vec_align_len)
        ], params)
    params.ir_builder.emit(
        tvm.call_extern('int32', 'pipe_barrier', params.args_str))
    # reg[0]=range_min, reg[1]=range_max
    params.reg[0] = params.src_mid_input_range_ub.vload(0, params.mid_dtype)
    params.reg[1] = params.src_mid_input_range_ub.vload(1, params.mid_dtype)
    # range1 - range0
    kernel_api.kernel_vector_dup_fuc(
        params.ir_builder, [params.src_mid_input_ub, 0], params.reg[1],
        [params.mid_align_len, params.mid_align_len])
    _addr_list = [[params.src_mid_input_range_ub, 0],
                  [params.src_mid_input_ub, 0],
                  [params.src_mid_input_range_ub, 0]]
    kernel_api.kernel_two_to_one_common_fuc(params.ir_builder, _addr_list,
                                            [1, params.mid_align_len],
                                            "vsub")
    params.ir_builder.emit(
        tvm.call_extern('int32', 'pipe_barrier', params.args_str))
    # reg[2] = range span (range1 - range0)
    params.reg[2] = params.src_mid_input_range_ub.vload(0, params.mid_dtype)
    _addr_list = [[params.src_mid_input_range_ub, 0],
                  [params.src_mid_input_range_ub, 0]]
    # reg[3] = bin width = span / nbins
    kernel_api.kernel_scalar_to_one_fuc(
        params.ir_builder, _addr_list, [1, params.mid_vec_align_len],
        ["vmuls", float(1.0 / params.nbins)])
    params.ir_builder.emit(
        tvm.call_extern('int32', 'pipe_barrier', params.args_str))
    params.reg[3] = params.src_mid_input_range_ub.vload(0, params.mid_dtype)
    # get 0-64 mask_value for set_vector_mask
    params.get_mask_list(64)
    params.get_block_offset_one_core()
    loop_and_mask_list = \
        kernel_api.get_loopnum_and_masklist(params.data_shape[0],
                                            SEGMENT_SIZE_COPY_GM_TO_UB)

    def _run_fuc(data_len, data_offset, copy_ub):
        """Histogram one segment: GM->UB copy, convert, then accumulate."""
        # copy data(len=data_len,offset=data_offset) from out to ub
        kernel_api.kernel_cp_fuc(params.ir_builder,
                                 [[copy_ub, 0], [data, data_offset]],
                                 [data_len, params.input_align_len],
                                 "copy_gm_to_ubuf")
        _fuction_data_conv_ir(
            [[params.src_mid_input_ub, 0], [copy_ub, 0]],
            [data_len,
             max(params.input_align_len, params.mid_vec_align_len)],
            params)
        # fixed process: nbins*(values - value_range[0])/(value_range[1]
        # - value_range[0])
        _data_info_list = [data_len, params.mid_vec_align_len]
        # calc histogram in one SEGMENT_SIZE_COPY_GM_TO_UB and sum to
        # output UB
        _function_histogram_process_ir(params.src_mid_input_ub, 0,
                                       _data_info_list, params)

    # data process SEGMENT_SIZE_COPY_GM_TO_UB by SEGMENT_SIZE_COPY_GM_TO_UB
    with params.ir_builder.for_range(0, loop_and_mask_list[0],
                                     name='m') as pre_index:
        _run_fuc(SEGMENT_SIZE_COPY_GM_TO_UB,
                 pre_index * SEGMENT_SIZE_COPY_GM_TO_UB, params.src_ub)
    # tail_data process; len = data_len % SEGMENT_SIZE_COPY_GM_TO_UB
    if loop_and_mask_list[1] == 1:
        _run_fuc(params.data_shape[0] % SEGMENT_SIZE_COPY_GM_TO_UB,
                 loop_and_mask_list[0] * SEGMENT_SIZE_COPY_GM_TO_UB,
                 params.src_ub)

    # copy result to out by mul cores
    def _copy_out(copy_data_len):
        """Write this core's bin counts back to GM at its begin offset."""
        kernel_api.kernel_cp_fuc(
            params.ir_builder,
            [[dst[0], params.out_begin], [params.des_output_ub, 0]],
            [copy_data_len, params.output_align_len], "copy_ubuf_to_gm")

    # only cores whose output window starts inside [0, nbins) write back;
    # the last (partial) core writes only the tail bins
    with params.ir_builder.if_scope(
            params.out_begin < tvm.const(params.nbins, "int32")):
        with params.ir_builder.if_scope(
                params.out_end <= tvm.const(params.nbins, "int32")):
            _copy_out(params.out_num_per_core)
        with params.ir_builder.else_scope():
            tail_core_num = params.nbins % params.out_num_per_core
            if tail_core_num != SCALAR_ZERO:
                _copy_out(tail_core_num)
    return params.ir_builder.get()
def elewise_binary_phony_ex(stmt_op):
    """
    elewise_binary_phony_ex which will eliminate its second input tensor
    completely

    Emits DMA instructions that copy the first input straight to the
    output: one burst for the 32B-aligned body, plus a rolled-back
    single-block copy for any unaligned tail.
    """
    ins, outs, _, _ = cce_util.get_dma_buffer(stmt_op)
    ir_builder = tvm.ir_builder.create()

    def new_alloc(ir_builder, dtype, shape, name, scope):
        """Allocate UB storage and wrap it in a buffer declaration."""
        storage = ir_builder.allocate(dtype, shape, name=name, scope=scope)
        return tvm.decl_buffer(shape, storage.dtype, name=name,
                               scope=scope, data=storage)

    # Move first input to out
    dtype = ins[0].dtype
    total_element = 0
    for dim in ins[0].shape:
        total_element = dim if total_element == 0 else total_element * dim

    _block_unit_size = ALIGNMENT_BYTES // cce_util.get_align_factor(dtype)[1]
    total_block = int(total_element) // int(_block_unit_size)
    remain = int(total_element % _block_unit_size)

    if total_block > 0:
        # Aligned body: one burst of total_block blocks.
        ir_builder.emit(tvm.call_extern(
            ins[0].dtype,
            "copy_ubuf_to_gm",
            outs[0].access_ptr("rw"),
            ins[0].access_ptr("r"),
            0, 1, total_block, 0, 0))
        if remain > 0:
            # Roll back for remaining data: re-copy the last full block
            # shifted so its end lands on the true end of the data.
            roll_back_size = _block_unit_size - remain
            # Allocate reg buffer needed for holding src data
            reg = new_alloc(ir_builder, ins[0].dtype, (_block_unit_size,),
                            "copy_part", scope=cce.scope_ubuf)
            # reg_mov src data into the staging block
            with ir_builder.for_range(0, _block_unit_size,
                                      name="reg_idx") as reg_idx:
                ir_builder.emit(tvm.call_extern(
                    ins[0].dtype,
                    "reg_mov",
                    reg.access_ptr("rw", offset=reg_idx),
                    ins[0].access_ptr(
                        "r",
                        offset=total_block*_block_unit_size
                        - roll_back_size + reg_idx)))
            ir_builder.emit(tvm.call_extern(
                ins[0].dtype,
                "copy_ubuf_to_gm",
                outs[0].access_ptr(
                    "rw",
                    offset=total_block*_block_unit_size - roll_back_size),
                reg.access_ptr("r"),
                0, 1, 1, 0, 0))
    elif remain > 0:
        # Data smaller than one block: a single one-block burst covers it.
        ir_builder.emit(tvm.call_extern(
            ins[0].dtype,
            "copy_ubuf_to_gm",
            outs[0].access_ptr("rw", offset=0),
            ins[0].access_ptr("r", offset=0),
            0, 1, 1, 0, 0))

    return ir_builder.get()
def _core_func(out_begin, out_end, element_num_of_core):
    """
    Per-core copy routine (closure over params/channel*/output_gm etc.).

    :param out_begin : multi core output begin address
    :param out_end : multi core output end address
    :param element_num_of_core : element num of one core
    """
    # Last core may carry only the remainder of the element count.
    if out_end != total_element or element_num_of_core == total_element:
        core_cal_num = element_num_of_core
    else:
        core_cal_num = total_element % element_num_of_core
    if core_cal_num == 0:
        return
    _do_vector_dup((params.output_ub, 0), output_data_len, params.dtype,
                   params)
    _do_cp_input_gm(input_gm, input_data_len, 0, params)
    # elements of zero padding appended per c1 slice when channel is not
    # a multiple of channel0
    c0_pad_len = _ceil_fill(channel, channel0) - channel

    def _do_data_copy(i, block_index):
        """
        Copy one c0 slice; the last c1 slice copies only channel%channel0
        valid channels when padding exists.

        :param block_index : block_index
        """
        i_hw = i // channel1
        c1_index = i % channel1
        if channel % channel0 == 0:
            _data_copy(c1_index, block_index, i_hw, channel0, 0)
        else:
            # compensate for padding already skipped before this core's
            # begin position
            offset = ((i // channel1) - (out_begin // channel1)) * c0_pad_len
            with params.ib_.if_scope(c1_index != channel1 - 1):
                _data_copy(c1_index, block_index, i_hw, channel0, -offset)
            with params.ib_.else_scope():
                _data_copy(c1_index, block_index, i_hw, channel % channel0,
                           -offset)

    with params.ib_.for_range(0, core_cal_num, for_type="serial",
                              name="j") as j:
        i = out_begin + j
        _do_data_copy(i, j)
    core_out_len = core_cal_num * channel0
    if channel % channel0 != 0:
        # subtract the padding removed inside this core's output window
        out_begin_offset = params.ib_.allocate("int32", (1, ),
                                               name="out_begin_offset",
                                               scope=cce_params.scope_reg)
        out_begin_offset[0] = (out_begin // channel1) * c0_pad_len
        core_out_len -= ((out_end // channel1) * c0_pad_len
                         - out_begin_offset[0])
        pad_len = _ceil_fill(core_out_len,
                             params.cp_align_len) - core_out_len
        # copy extra slices past the core's end until the alignment pad is
        # covered, then break out
        with params.ib_.for_range(0, params.cp_align_len, for_type="serial",
                                  name="j") as j:
            i = out_begin + j + core_cal_num
            _do_data_copy(i, j + core_cal_num)
            real_pad_len = ((i + 1) * channel0 - (
                (i + 1) // channel1) * c0_pad_len) - (
                    (out_begin + core_cal_num) * channel0 -
                    ((out_begin + core_cal_num) // channel1) * c0_pad_len)
            with params.ib_.if_scope(real_pad_len >= pad_len):
                params.ib_.emit(tvm.call_extern(params.dtype, 'break'))
    num_cp = _ceil_div(core_out_len, params.cp_align_len)
    # GM offset excludes padding preceding this core's window
    output_offset = out_begin * channel0 - (out_begin
                                            // channel1) * c0_pad_len
    params.ib_.emit(
        tvm.call_extern(params.dtype, 'copy_ubuf_to_gm',
                        output_gm.access_ptr("rw", offset=output_offset),
                        params.output_ub.access_ptr("r", offset=0), 0, 1,
                        num_cp, 0, 0))
def bn_reduce_sum(stmt_op):
    """
    Collapse second input tensor to one repeat and use vcadd to calculate
    sum to output.

    The reduce axis is repeatedly halved with strided vadd ("dichotomy
    add") until it fits one vector repeat, then vcadd produces the final
    per-loop sums.
    """
    # Get input and output buffers
    input_size_list = [1]
    for_extents = []
    ir_builder = tvm.ir_builder.create()
    cce_util.get_init_op(stmt_op)

    def _post_order_for(_stmt):
        """Collect total element count and each For-loop extent."""
        if isinstance(_stmt, tvm.stmt.For):
            input_size_list[0] = input_size_list[0] * _stmt.extent.value
            for_extents.append(_stmt.extent.value)

    tvm.ir_pass.IRTransform(stmt_op, None, _post_order_for, ["For"])
    ins, outs = \
        cce_util.get_buffer(stmt_op, need_unique=True,
                            need_origin_adress=True)
    in_buffer = ins[1]
    out_buffer = outs[0]
    input_size = input_size_list[0]
    # Check if input can be collapsed into one repeat
    vector_inst_one_repeat_size = \
        cce_params.VECTOR_INST_BLOCK_WIDTH // \
        cce_util.get_align_factor(in_buffer.dtype)[1]
    # get reduce_axis shape
    if len(for_extents) == 1:
        input_reduce_axis_shape = for_extents[0]
        ub_loop_num = 1
    else:
        input_reduce_axis_shape = for_extents[0]
        ub_loop_num = for_extents[1]
    # number of halvings needed to reach one repeat (may be fractional)
    collapse_loop_num = \
        math.log(input_reduce_axis_shape / vector_inst_one_repeat_size, 2)
    # judge reduce_shape is remaining or not after dichotomy add
    remain_flag = False
    collapse_repeat = 0
    if not collapse_loop_num.is_integer():
        collapse_repeat = int(math.pow(2, int(collapse_loop_num)))
        out_of_collapse_repeat = \
            input_reduce_axis_shape / vector_inst_one_repeat_size - \
            collapse_repeat
        if not out_of_collapse_repeat.is_integer():
            raise RuntimeError("Input size is not aligned:",
                               input_reduce_axis_shape)
        remain_flag = True

    # Do Emit Insn
    def collapse(ir_b, buffer, current_size):
        """Emit one halving pass: add each second block into the first."""
        repeat = current_size // 2 / vector_inst_one_repeat_size
        tail_flag = False
        if not repeat.is_integer():
            tail_flag = True
        repeat = int(repeat)
        ir_b.emit(
            tvm.call_extern(buffer.dtype, "vadd",
                            buffer.access_ptr("rw", offset=0),
                            buffer.access_ptr("r", offset=0),
                            buffer.access_ptr("r", offset=8),
                            repeat, 1, 2, 2, 8, 16, 16))
        # solve tail vadd
        if tail_flag:
            tail_mask = \
                (current_size - repeat
                 * 2 * vector_inst_one_repeat_size) // 2
            te.platform.cce_intrin_md.reset_mask_insn(ir_builder,
                                                      in_buffer.dtype,
                                                      tail_mask)
            ir_b.emit(
                tvm.call_extern(
                    buffer.dtype, "vadd",
                    buffer.access_ptr(
                        "rw", offset=repeat * vector_inst_one_repeat_size),
                    buffer.access_ptr(
                        "r",
                        offset=repeat * 2 * vector_inst_one_repeat_size),
                    buffer.access_ptr(
                        "r",
                        offset=repeat * 2 * vector_inst_one_repeat_size
                        + 8),
                    1, 1, 2, 2, 0, 0, 0))
            # restore full mask after the masked tail op
            te.platform.cce_intrin_md.reset_mask_insn(ir_builder,
                                                      in_buffer.dtype)
        return current_size // 2

    # emit vadd: halve until one repeat remains
    cur_size = input_size
    for loop in range(int(collapse_loop_num)):
        cur_size = collapse(ir_builder, in_buffer, cur_size)
    if remain_flag:
        # solve remain repeat
        mask_bits = \
            input_reduce_axis_shape / collapse_repeat - \
            vector_inst_one_repeat_size
        add_repeat_stride = int(8 + mask_bits / 8)
        te.platform.cce_intrin_md.reset_mask_insn(ir_builder,
                                                  in_buffer.dtype,
                                                  mask_bits)
        ir_builder.emit(
            tvm.call_extern(
                in_buffer.dtype, "vadd",
                in_buffer.access_ptr("rw", offset=0),
                in_buffer.access_ptr("r", offset=0),
                in_buffer.access_ptr("r",
                                     offset=vector_inst_one_repeat_size),
                ub_loop_num, 1, 1, 1,
                add_repeat_stride, add_repeat_stride, add_repeat_stride))
        # emit vcadd for remain
        te.platform.cce_intrin_md.reset_mask_insn(ir_builder,
                                                  in_buffer.dtype)
        ir_builder.emit(
            tvm.call_extern(in_buffer.dtype, "vcadd",
                            out_buffer.access_ptr("rw", offset=0),
                            in_buffer.access_ptr("r", offset=0),
                            ub_loop_num, 1, 1, add_repeat_stride))
    else:
        # emit vcadd for no remain
        ir_builder.emit(
            tvm.call_extern(in_buffer.dtype, "vcadd",
                            out_buffer.access_ptr("rw", offset=0),
                            in_buffer.access_ptr("r", offset=0),
                            ub_loop_num, 1, 1, 8))
    return ir_builder.get()
def binary_reduce_output(stmt_op):
    """
    Move reduce results to two destinations.

    The second destination buffer is fetched from the emit-insn parameter
    store; both destinations receive a main aligned burst plus a
    rolled-back single-block copy for any unaligned tail.
    """
    # Get input and output buffers
    input_size_list = [1]
    ir_builder = tvm.ir_builder.create()

    def _post_order_for(_stmt):
        """Accumulate the product of all For extents (total elements)."""
        if isinstance(_stmt, tvm.stmt.For):
            input_size_list[0] = input_size_list[0] * _stmt.extent.value

    def new_alloc(tvm_ib, dtype, shape, name, scope):
        """Function to alloc mem"""
        buf_var = tvm_ib.allocate(dtype, shape, name=name, scope=scope)
        new_buffer = tvm.decl_buffer(shape, buf_var.dtype, name=name,
                                     scope=scope, data=buf_var)
        return new_buffer

    _ = tvm.ir_pass.IRTransform(stmt_op, None, _post_order_for, ["For"])
    ins, outs = cce_util.get_buffer(stmt_op)
    # Alloc second buffer for binary collection
    out_buffer_sec = \
        cce_emitinsn_params.cceEmitParamsIns.get_param("binary_reduce"
                                                       "_output_buffer")
    # NOTE: inputs are cross-wired below -- in_buffer[1] feeds
    # out_buffer[0] and in_buffer[0] feeds out_buffer[1].
    in_buffer = ins[0], ins[1]
    out_buffer = outs[0], out_buffer_sec
    input_size = input_size_list[0]
    output_size = input_size
    block_unit = cce_util.get_align_factor(in_buffer[0].dtype)[0]
    remain_buffer = new_alloc(ir_builder, out_buffer[0].dtype,
                              (block_unit, ), "copy_part_0",
                              cce_params.scope_ubuf)
    remain_buffer_sec = new_alloc(ir_builder, out_buffer[1].dtype,
                                  (block_unit, ), "copy_part_1",
                                  cce_params.scope_ubuf)
    burst_len = max(output_size // block_unit, 1)
    remains = max(output_size - burst_len * block_unit, 0)
    remains_fill = block_unit - remains
    # Main part
    # second destination is addressed relative to the first's elem_offset
    global_offset = out_buffer[0].elem_offset
    ir_builder.emit(
        tvm.call_extern(out_buffer[0].dtype, "copy_ubuf_to_gm",
                        out_buffer[0].access_ptr("rw"),
                        in_buffer[1].access_ptr("r"), 0, 1, burst_len, 0,
                        0))
    ir_builder.emit(
        tvm.call_extern(out_buffer[1].dtype, "copy_ubuf_to_gm",
                        out_buffer[1].access_ptr("rw",
                                                 offset=global_offset),
                        in_buffer[0].access_ptr("r"), 0, 1, burst_len, 0,
                        0))
    # Remain part: stage the last block_unit elements (shifted back so the
    # block ends at the true data end) and write one block per destination.
    if remains > 0:
        with ir_builder.for_range(0, block_unit,
                                  name="copy_part_fill_loop") \
                as reg_mov_loop:
            ir_builder.emit(
                tvm.call_extern(
                    remain_buffer.dtype, "reg_mov",
                    remain_buffer.access_ptr("rw", offset=reg_mov_loop),
                    in_buffer[1].access_ptr(
                        "r",
                        offset=burst_len * block_unit - remains_fill
                        + reg_mov_loop)))
            ir_builder.emit(
                tvm.call_extern(
                    remain_buffer_sec.dtype, "reg_mov",
                    remain_buffer_sec.access_ptr("rw",
                                                 offset=reg_mov_loop),
                    in_buffer[0].access_ptr(
                        "r",
                        offset=burst_len * block_unit - remains_fill
                        + reg_mov_loop)))
        ir_builder.emit(
            tvm.call_extern(
                out_buffer[0].dtype, "copy_ubuf_to_gm",
                out_buffer[0].access_ptr(
                    "rw", offset=burst_len * block_unit - remains_fill),
                remain_buffer.access_ptr("r"), 0, 1, 1, 0, 0))
        ir_builder.emit(
            tvm.call_extern(
                out_buffer[1].dtype, "copy_ubuf_to_gm",
                out_buffer[1].access_ptr(
                    "rw",
                    offset=global_offset + burst_len * block_unit
                    - remains_fill),
                remain_buffer_sec.access_ptr("r"), 0, 1, 1, 0, 0))
    return ir_builder.get()
def custom_expm1(shape, dtype, kernel_name="cce_tf_expm1", need_build=False,
                 need_print=False):
    """
    algorithm: expm1

    calculating data's expm1, y = (e ** x) - 1, dtype is float16 or float32.

    Parameters
    ----------
    shape : shape of data.

    dtype : the data type, assume src_dtype equals dst_dtype, only support
            float16, float32.

    kernel_name : cce kernel name, default value is "cce_tf_expm1".

    need_build : if need to build CCEC kernel, default value is False.

    need_print : if need to print the ir, default value is False.

    Returns
    -------
    None

    """
    # [aicpu] int32_t cc_device_exp(uint32_t blockNum, uint32_t blockIdx,
    #     int32_t dataType, const void *scale, const void *shift,
    #     const void *base, int32_t dimCnt, int32_t *shape, uint32_t padC0,
    #     const void *x, void *y);
    supported_dtypes = ["float16", "float32"]

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if dtype.lower() not in supported_dtypes:
        raise RuntimeError("tf_expm1_cce only support %s while dtype is %s"
                           % (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    # step 1. calculate y = exp ** x by aicpu api
    device_api = "DeviceExp"
    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    # renamed from padC0 for PEP 8 / consistency with custom_pow
    pad_c0 = 0
    # scale=1, shift=0, base=-1 (-1 selects the natural base e)
    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output_exp = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t", device_api, block_num, block_idx, v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output_exp", dtype=inp_dtype)

    offset = tvm.const((-1), dtype=inp_dtype)

    # step 2. calculate y = exp ** x - 1 by tvm
    output = tvm.compute(
        shape,
        lambda *indice: output_exp(*indice) + offset.astype(inp_dtype),
        name="output")

    # step 3. schedule the computation by tvm
    s = tvm.create_schedule(output.op)

    # step 4. build by tvm
    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
def _res_to_gm_split_row(args):
    """
    Emit IR moving data from data_res (UB) to dst (GM) for the split-row
    scene, handling the unaligned tail so the final GM write stays inside
    valid memory.
    """
    tvm_ib, param, data, dst, data_res, data_tail, reg, reg_addr,\
        index, res_offset, dst_offset, total_len, h_w, row_ele,\
        num_ele_unit, c_0, c_i, h_i, n_i, n_ni = args
    # Unaligned total length needs tail handling.
    with tvm_ib.if_scope(total_len % param.get("cp_align_len") > 0):
        # More than one alignment unit: copy the aligned body, then write
        # the last cp_align_len elements as one staged block.
        with tvm_ib.if_scope(total_len > param.get("cp_align_len")):
            total_len_align = total_len - param.get("cp_align_len")
            reg_addr[index] = total_len_align
            burst_len = _ceil_div(total_len_align,
                                  param.get("cp_align_len"))
            tvm_ib.emit(tvm.call_extern(
                dst.dtype, "copy_ubuf_to_gm",
                dst.access_ptr('w', offset=dst_offset),
                data_res.access_ptr("r", offset=res_offset),
                0, 1, burst_len, 0, 0))
            # stage the final alignment unit element-by-element
            with tvm_ib.for_range(0, param.get("cp_align_len"),
                                  name="num_a") as num_a:
                tvm_ib.emit(tvm.call_extern(
                    data_res.dtype, "reg_mov",
                    tvm.call_extern(reg.dtype, "reg", reg[0]),
                    data_res.access_ptr(
                        'r',
                        offset=res_offset + total_len_align + num_a)
                ))
                tvm_ib.emit(tvm.call_extern(
                    data_tail.dtype, "reg_mov",
                    data_tail.access_ptr('w', offset=num_a),
                    tvm.call_extern(reg.dtype, "reg", reg[0])
                ))
            tvm_ib.emit(
                tvm.call_extern(
                    dst.dtype, "copy_ubuf_to_gm",
                    dst.access_ptr('w',
                                   offset=dst_offset + reg_addr[index]),
                    data_tail.access_ptr("r", offset=0), 0, 1, 1, 0, 0))
        # Less than one alignment unit: pad data_res up to a full unit by
        # fetching the elements that follow in destination order back from
        # the source, so one aligned burst can be written safely.
        with tvm_ib.else_scope():
            num_ele = param.get("cp_align_len") - total_len
            with tvm_ib.for_range(0, num_ele, name="num_e") as num_e:
                reg_addr[index] = total_len + num_e
                dst_pos = dst_offset + reg_addr[index]
                args = dst_pos, h_w, row_ele, num_ele_unit,\
                    c_0, c_i, h_i, n_i, n_ni
                # map the destination position back to its source address
                data_pos = _dst_to_data_pos(args)
                tvm_ib.emit(tvm.call_extern(
                    data_tail.dtype, "copy_gm_to_ubuf",
                    data_tail.access_ptr("w", offset=0),
                    data.access_ptr('r', offset=data_pos),
                    0, 1, 1, 0, 0))
                tvm_ib.emit(tvm.call_extern(
                    data_tail.dtype, "reg_mov",
                    tvm.call_extern(reg.dtype, "reg", reg[0]),
                    data_tail.access_ptr('r', offset=0)
                ))
                tvm_ib.emit(tvm.call_extern(
                    data_res.dtype, "reg_mov",
                    data_res.access_ptr('w', offset=total_len + num_e),
                    tvm.call_extern(reg.dtype, "reg", reg[0])
                ))
            tvm_ib.emit(tvm.call_extern(
                dst.dtype, "copy_ubuf_to_gm",
                dst.access_ptr('w', offset=dst_offset),
                data_res.access_ptr("r", offset=0),
                0, 1, 1, 0, 0))
    # Aligned total length: a single burst suffices.
    with tvm_ib.else_scope():
        burst_len = total_len // param.get("cp_align_len")
        tvm_ib.emit(tvm.call_extern(
            dst.dtype, "copy_ubuf_to_gm",
            dst.access_ptr('w', offset=dst_offset),
            data_res.access_ptr("r", offset=res_offset),
            0, 1, burst_len, 0, 0))
def custom_pow(shape, shape_y, dtype, kernel_name="cce_tf_pow",
               need_build=False, need_print=False):
    """
    calculate x^y, calculating data type is float16 or float32 or int32
    when x < 0, the output is a meaningless value.

    Parameters
    ----------
    shape : shape of data

    shape_y : shape of the second input (unused here; both placeholders are
              built from `shape`)

    dtype : the data type, assume src_dtype equals dst_dtype, only support
            float16, float32, int32

    kernel_name : cce kernel name, default value is "tf_pow_cce"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    supported_dtypes = ["float16", "float32", "int32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not dtype.lower() in supported_dtypes:
        raise RuntimeError("tf_pow_cce only support %s while dtype is %s"
                           % (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_lhs = tvm.placeholder(shape, name="data_lhs", dtype=inp_dtype)
    data_rhs = tvm.placeholder(shape, name="data_rhs", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    # scale=1*x+0 identity transform; power taken from the second tensor
    p_scale = util.create_param_ptr([0], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([0], inp_dtype, "p_power")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    # delegate the whole computation to the AICPU cc_device_pow kernel
    output = tvm.extern(
        shape, [data_lhs, data_rhs, p_scale, p_shift, p_power, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t", device_api, block_num, block_idx, v_datatype,
            ins[2].access_ptr("r"),  # scale
            ins[3].access_ptr("r"),  # shift
            ins[4].access_ptr("r"),  # power
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            # NOTE(review): v_ndim is passed twice here -- presumably one is
            # the second input's dimCnt and the other an extra argument of
            # cc_device_pow; confirm against the AICPU C signature.
            v_ndim,
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[1].access_ptr("r"),  # input y
            outs[0].access_ptr("w")),
        name="output", dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(
                tvm.lower(schedule, [data_lhs, data_rhs, output],
                          simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_lhs, data_rhs, output], "cce",
                      name=kernel_name)
def _do_calcu_f32_mini(cal_offset, nbins_index):
    # Emit IR computing the count for histogram bin `nbins_index` and
    # accumulating it into the output buffers.  Closure over `params`
    # (ir_builder, UB buffers, registers) defined in the enclosing scope.
    #
    # cal_offset   : offset into the compare source data for this pass
    # nbins_index  : absolute bin index being reduced
    #
    # NOTE(review): nesting reconstructed from collapsed formatting; scope
    # pairing of if/else below should be verified against the original file.

    # First and last bin indices get a fixed fill instead of a compare
    # reduction -- presumably the open-ended edge bins; verify with caller.
    with params.ir_builder.if_scope(
            tvm.any((nbins_index == SCALAR_ZERO),
                    (nbins_index == params.nbins))):
        with params.ir_builder.if_scope(nbins_index == 0):
            # bin 0: fill the reduction buffer with zeros
            kernel_api.kernel_vector_dup_fuc(
                params.ir_builder, [params.vcadd_ub, 0], 0,
                [params.mid_vec_align_len, params.mid_vec_align_len])
            if params.compile_plat in ("Ascend310", ):
                # platform-specific fixup writing reg[3] into index_ub via
                # vadds -- intent not derivable from this block alone
                kernel_api.kernel_scalar_to_one_fuc(
                    params.ir_builder,
                    [[params.index_ub, 0], [params.index_ub, 0]],
                    [1, params.mid_vec_align_len],
                    ["vadds", params.reg[3]])
        with params.ir_builder.else_scope():
            # last bin: fill with the total data length
            kernel_api.kernel_vector_dup_fuc(
                params.ir_builder, [params.vcadd_ub, 0],
                params.histogram_data_len,
                [params.mid_vec_align_len, params.mid_vec_align_len])
    with params.ir_builder.else_scope():
        # Interior bin: compare-count the data, then tree-reduce vcadd_ub.
        # mask_paras: [full_repeat_count, has_tail(0/1), tail_mask_pair]
        mask_paras = kernel_api.get_loopnum_and_masklist(
            params.histogram_data_len, params.mid_vec_align_len)
        _do_cmp_calcu(mask_paras[0] + mask_paras[1], cal_offset, nbins_index)
        if mask_paras[0] > SCALAR_ONE:
            # fold all full vector blocks onto block 0
            # (repeat = mask_paras[0] - 1, src0 strides over blocks)
            params.ir_builder.emit(
                tvm.call_extern(
                    params.vcadd_ub.dtype, "vadd",
                    params.vcadd_ub.access_ptr("rw", offset=0),
                    params.vcadd_ub.access_ptr(
                        "r", offset=params.mid_vec_align_len),
                    params.vcadd_ub.access_ptr("r", offset=0),
                    mask_paras[0] - 1, 1, 1, 1, 0, 8, 0))
        if mask_paras[0] > SCALAR_ZERO:
            if mask_paras[1] == SCALAR_ONE:
                # add the partial tail block under its tail mask,
                # then restore the all-ones mask
                params.ir_builder.emit(
                    tvm.call_extern("uint64", 'set_vector_mask',
                                    mask_paras[2][1], mask_paras[2][0]))
                add_offset = mask_paras[0] * params.mid_vec_align_len
                params.ir_builder.emit(
                    tvm.call_extern(
                        params.vcadd_ub.dtype, "vadd",
                        params.vcadd_ub.access_ptr("rw", offset=0),
                        params.vcadd_ub.access_ptr("r", offset=0),
                        params.vcadd_ub.access_ptr("r", offset=add_offset),
                        1, 1, 1, 1, 8, 8, 8))
                params.ir_builder.emit(
                    tvm.call_extern("uint64", 'set_vector_mask',
                                    params.uint64_all_one,
                                    params.uint64_all_one))
        if mask_paras[0] == SCALAR_ZERO:
            # data shorter than one vector block: reduce under the tail mask
            params.ir_builder.emit(
                tvm.call_extern("uint64", 'set_vector_mask',
                                mask_paras[2][1], mask_paras[2][0]))
        # horizontal sum of block 0 into element 0
        params.ir_builder.emit(
            tvm.call_extern(params.vcadd_ub.dtype, "vcadd",
                            params.vcadd_ub.access_ptr("rw", offset=0),
                            params.vcadd_ub.access_ptr("r", offset=0),
                            1, 1, 1, 8))
        if mask_paras[0] == SCALAR_ZERO:
            params.ir_builder.emit(
                tvm.call_extern("uint64", 'set_vector_mask',
                                params.uint64_all_one,
                                params.uint64_all_one))
        # bypass problem :addr not 32B align
        params.ir_builder.emit(
            tvm.call_extern('int32', 'pipe_barrier', params.args_str))
        # broadcast the scalar sum back across vcadd_ub via a register
        params.reg[4] = params.vcadd_ub.vload(0, params.mid_dtype)
        kernel_api.kernel_vector_dup_fuc(
            params.ir_builder, [params.vcadd_ub, 0], params.reg[4],
            [params.mid_vec_align_len, params.mid_vec_align_len])
    # add num of index to src_output_ub
    # nbins_index_core: bin index relative to this core's output slice
    nbins_index_core = nbins_index - params.block.var * \
        params.out_num_per_core
    params.offset = \
        (nbins_index_core // params.mid_vec_align_len) * \
        params.mid_vec_align_len
    # single-lane mask selecting this bin's slot within its aligned block
    params.ir_builder.emit(
        tvm.call_extern("uint64", 'set_vector_mask', 0,
                        params.set_mask_list[nbins_index_core % 64]))
    params.ir_builder.emit(
        tvm.call_extern(
            params.src_output_ub.dtype, "vadd",
            params.src_output_ub.access_ptr("rw", offset=params.offset),
            params.vcadd_ub.access_ptr("r", offset=0),
            params.src_output_ub.access_ptr("r", offset=params.offset),
            1, 1, 1, 1, 8, 8, 8))
    # add num of index to src_output_ub_p1
    # (same value accumulated into the previous bin slot, skipped for bin 0)
    with params.ir_builder.if_scope(nbins_index_core != SCALAR_ZERO):
        with params.ir_builder.if_scope(nbins_index_core % 64 == SCALAR_ZERO):
            # previous slot is the last lane (63) of the preceding block
            params.ir_builder.emit(
                tvm.call_extern("uint64", 'set_vector_mask', 0,
                                params.set_mask_list[63]))
        with params.ir_builder.else_scope():
            params.ir_builder.emit(
                tvm.call_extern(
                    "uint64", 'set_vector_mask', 0,
                    params.set_mask_list[nbins_index_core % 64 - 1]))
        params.offset = \
            ((nbins_index_core - 1) // params.mid_vec_align_len) * \
            params.mid_vec_align_len
        params.ir_builder.emit(
            tvm.call_extern(
                params.src_output_ub_p1.dtype, "vadd",
                params.src_output_ub_p1.access_ptr("rw",
                                                   offset=params.offset),
                params.vcadd_ub.access_ptr("r", offset=0),
                params.src_output_ub_p1.access_ptr("r",
                                                   offset=params.offset),
                1, 1, 1, 1, 8, 8, 8))
    # restore the all-ones vector mask for subsequent emissions
    params.ir_builder.emit(
        tvm.call_extern("uint64", 'set_vector_mask',
                        params.uint64_all_one, params.uint64_all_one))
def _func_gm_to_ub(args):
    """
    function of moving data from data to data_ub

    Emits copy_gm_to_ubuf instructions, splitting the transfer when the
    hardware field limits (burst count <= 4095, burst length and strides
    <= 65535) would be exceeded.

    args is unpacked as:
        tvm_ib      : tvm ir_builder instance
        param       : dict providing "cp_align_len" (elements per burst unit)
        data        : source tensor in global memory
        data_ub     : destination tensor in unified buffer
        data_offset : element offset into data
        ub_offset   : element offset into data_ub
        ori_nburst  : number of bursts requested
        burst_len   : length of each burst (in cp_align_len units)
        src_stride  : gap between bursts on the GM side
        dst_stride  : gap between bursts on the UB side

    NOTE(review): the if/else scope pairing below is reconstructed from
    collapsed formatting -- the final else_scope is attached to the
    `src_stride <= 65535` scope (per-burst fallback for oversized source
    stride); verify against the original file's indentation.
    """
    tvm_ib, param, data, data_ub, data_offset, ub_offset, ori_nburst,\
    burst_len, src_stride, dst_stride = args

    with tvm_ib.if_scope(ori_nburst > 0):
        with tvm_ib.if_scope(burst_len > 0):
            with tvm_ib.if_scope(burst_len <= 65535):
                with tvm_ib.if_scope(src_stride >= 0):
                    with tvm_ib.if_scope(dst_stride >= 0):
                        with tvm_ib.if_scope(dst_stride <= 65535):
                            with tvm_ib.if_scope(src_stride <= 65535):
                                with tvm_ib.if_scope(ori_nburst <= 4095):
                                    # all fields within limits: one copy
                                    tvm_ib.emit(
                                        tvm.call_extern(
                                            data_ub.dtype,
                                            "copy_gm_to_ubuf",
                                            data_ub.access_ptr(
                                                "w", offset=ub_offset),
                                            data.access_ptr(
                                                'r', offset=data_offset),
                                            0, ori_nburst, burst_len,
                                            src_stride, dst_stride))
                                with tvm_ib.else_scope():
                                    # burst count over the 4095 field limit:
                                    # issue full 4095-burst chunks, then the
                                    # remainder
                                    n_burst = 4095
                                    c_cycle = ori_nburst // n_burst
                                    c_mod = ori_nburst % n_burst
                                    with tvm_ib.for_range(
                                            0, c_cycle, name="num_cy")\
                                            as num_cy:
                                        # advance by whole chunks; stride is
                                        # counted in cp_align_len elements
                                        data_cur = data_offset + (
                                            burst_len + src_stride) \
                                            * param.get("cp_align_len")\
                                            * n_burst * num_cy
                                        ub_cur = ub_offset + (
                                            burst_len + dst_stride) \
                                            * param.get("cp_align_len")\
                                            * n_burst * num_cy
                                        tvm_ib.emit(
                                            tvm.call_extern(
                                                data_ub.dtype,
                                                "copy_gm_to_ubuf",
                                                data_ub.access_ptr(
                                                    "w", offset=ub_cur),
                                                data.access_ptr(
                                                    'r', offset=data_cur),
                                                0, n_burst, burst_len,
                                                src_stride, dst_stride))
                                    with tvm_ib.if_scope(c_mod > 0):
                                        # tail chunk with the remaining
                                        # bursts
                                        data_cur = data_offset + (
                                            burst_len + src_stride) \
                                            * param.get("cp_align_len")\
                                            * n_burst * c_cycle
                                        ub_cur = ub_offset + (
                                            burst_len + dst_stride) \
                                            * param.get("cp_align_len")\
                                            * n_burst * c_cycle
                                        tvm_ib.emit(
                                            tvm.call_extern(
                                                data_ub.dtype,
                                                "copy_gm_to_ubuf",
                                                data_ub.access_ptr(
                                                    "w", offset=ub_cur),
                                                data.access_ptr(
                                                    'r', offset=data_cur),
                                                0, c_mod, burst_len,
                                                src_stride, dst_stride))
                            with tvm_ib.else_scope():
                                # stride too large for the field: fall back
                                # to one single-burst copy per iteration
                                # (strides 0 since each copy is one burst)
                                with tvm_ib.for_range(
                                        0, ori_nburst,
                                        name="num_nb") as num_nb:
                                    data_cur = data_offset + (
                                        burst_len + src_stride)\
                                        * param.get("cp_align_len")\
                                        * num_nb
                                    ub_cur = ub_offset + (
                                        burst_len + dst_stride)\
                                        * param.get("cp_align_len")\
                                        * num_nb
                                    tvm_ib.emit(
                                        tvm.call_extern(
                                            data_ub.dtype,
                                            "copy_gm_to_ubuf",
                                            data_ub.access_ptr(
                                                "w", offset=ub_cur),
                                            data.access_ptr(
                                                'r', offset=data_cur),
                                            0, 1, burst_len, 0, 0))
def custom_Exp(shape,
               dtype,
               gamma,
               alpha,
               beta,
               kernel_name="cce_exp",
               need_build=False,
               need_print=False):
    """
    Calculate gamma ** (alpha * data + beta),
    i.e. exp(log(gamma) * alpha * data) * (gamma ** beta).

    Parameters
    ----------
    shape : shape of data
    dtype : the data type, assume src_dtype equals dst_dtype,
        only support float16, float32
    gamma : same data type as dtype; the base in
        gamma ** (alpha * data + beta)
    alpha : same data type as dtype; the scale in
        gamma ** (alpha * data + beta)
    beta : same data type as dtype; the shift in
        gamma ** (alpha * data + beta)
    kernel_name : cce kernel name, default value is "cce_exp"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    Raises
    ------
    RuntimeError : if dtype is unsupported
    ValueError : if gamma is not positive (and not the sentinel -1)
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if dtype.lower() not in supported_dtypes:
        raise RuntimeError(
            "caffe_exp_layer_cce only support %s while dtype is %s" %
            (",".join(supported_dtypes), dtype))

    if gamma != -1 and gamma <= 0:
        # api cc_device_exp_c handles gamma == -1 as e (natural base)
        raise ValueError(
            "please ensure gamma is greater than 0, where gamma = %s" %
            str(gamma))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    # scale --> alpha, shift --> beta, base --> gamma
    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([gamma], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input, output], "cce", name=kernel_name)
def custom_exp(shape,
               dtype,
               kernel_name="cce_tf_exp",
               need_build=False,
               need_print=False):
    """
    algorithm: exp

    Calculate the element-wise exponential, y = e ** x, through the
    DeviceExp device API.

    Parameters
    ----------
    shape : shape of data
    dtype : the data type, assume src_dtype equals dst_dtype,
        only support float16, float32
    kernel_name : cce kernel name, default value is "cce_tf_exp"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if dtype.lower() not in supported_dtypes:
        raise RuntimeError("tf_exp_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    # scale=1, shift=0; base -1 is presumably treated by the device API as
    # the natural base e (custom_Exp documents -1 that way) -- TODO confirm
    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input, output], "cce", name=kernel_name)