def load_to_l1(input_x, output_x, kernel_name="load_to_l1"): """ copy data from ddr to l1 Parameters ---------- input_x : TVM tensor the input tensor output_x : dict dict of output_x, include keys(shape and dtype) kernel_name : str kernel name, default value is "load_to_l1" Returns ------- None """ input_shape = input_x.get("shape") input_dtype = input_x.get("dtype") input_tensor = tvm.placeholder(input_shape, name="input_tensor", dtype=input_dtype) res = load_to_l1_compute(input_tensor, output_x, kernel_name=kernel_name) sch = tvm.create_schedule([res.op]) sch[res].set_scope(cce.scope_cbuf_fusion) sch[res].emit_insn(res.op.axis[0], 'dma_copy') tensor_list = [input_tensor, res] with build_config: tvm.build(sch, tensor_list, "cce", name=kernel_name)
def custom_equal(shape_x, shape_y, dtype, kernel_name="cce_tf_equal", need_build=False, need_print=False): """ do element-wise equal operation between two input tensors Parameters: ---------- shape_x : shape of input x shape_y : shape of input y dtype : source data type, support float16,float32,int32,int8,uint8 kernel_name : cce kernel name, default value is "cce_tf_equal" need_buid : if need to build CCEC kernel, default value is False need_print : if need to print the ir, default value is False Returns ------- None """ util.check_kernel_name(kernel_name) util.check_shape_rule(shape_x) util.check_shape_rule(shape_y) check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"] dtype = dtype.lower() if not (dtype in check_list): raise RuntimeError( "tf_equal_cce only support %s while dtype is %s" % (",".join(check_list), dtype)) util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT) shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y) util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT) x = tvm.placeholder(shape_x, dtype=dtype, name="x") y = tvm.placeholder(shape_y, dtype=dtype, name="y") x_tmp = te.lang.cce.broadcast(x, shape_max) y_tmp = te.lang.cce.broadcast(y, shape_max) res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i), name='res') sch = tvm.create_schedule(res.op) if need_print: with build_config: print(tvm.lower(sch, [x, y, res], simple_mode=True)) if need_build: with build_config: tvm.build(sch, [x, y, res], "cce", name=kernel_name)
def assign_sub(var, value, out, kernel_name='assign_sub'): """ Update var by subtracting value from it. Parameters: ---------- var : dict dict of input_var, include shape and dtype, dtype support int8, uint8, int32, float16, float32 value : dict dict of input_value, include shape and dtype, dtype support int8, uint8, int32, float16, float32. Must have the same shape and dtype as input_var out : dict dict of out kernel_name : str cce kernel name, default value is "assign_sub" Returns ------- None """ # get the shape and dtype shape_var = var.get("shape") shape_value = value.get("shape") dtype_var = var.get("dtype") dtype_value = value.get("dtype") # kernel name check: should be unique # check whether the shape is right check_shape(shape_var, param_name="var") check_shape(shape_value, param_name="value") if not operator.eq(shape_var, shape_value): raise RuntimeError("all input shape must be the equal") # check whether dtypes are fp16, fp32, int8, uint8, int32 # and whether they are the same check_list = ("float16", "float32", "int8", "uint8", "int32") check_dtype(dtype_var, check_list, param_name="var") check_dtype(dtype_value, check_list, param_name="value") dtype_var = dtype_var.lower() dtype_value = dtype_value.lower() if dtype_var != dtype_value: raise RuntimeError("all input dtype must be same") shape, _ = refine_shape_axes(shape_var, []) data_var = tvm.placeholder(shape, dtype=dtype_var, name='data_var') data_value = tvm.placeholder(shape, dtype=dtype_value, name='data_value') sch, res = _assign_sub_compute(data_var, data_value, out, kernel_name) with set_bool_storage_config(): tvm.build(sch, [data_var, data_value, res], "cce", name=kernel_name)
def custom_logical_not(shape, dtype, kernel_name="cce_tf_logical_not", need_build=False, need_print=False): """ logical not for the input tensor Parameters ---------- shape : input shape of data dtype : the data type, support bool kernel_name : cce kernel name, default value is "cce_logical_not" need_buid : if need to build CCEC kernel, default value is False need_print : if need to print the ir, default value is False Returns ------- None """ util.check_kernel_name(kernel_name) util.check_shape_rule(shape) check_list = ["bool"] if not dtype.lower() in check_list: raise RuntimeError( "logical_not_cce ony supports %s while dtype is %s" % (",".join(check_list), dtype)) util.check_shape_size(shape, SHAPE_SIZE_LIMIT) inp_dtype = dtype.lower() data = tvm.placeholder(shape, name="data", dtype=inp_dtype) with tvm.target.cce(): result = tvm.compute( shape, lambda *i: tvm.select(data[i] is True, False, True), name="result") schedule = tvm.create_schedule(result.op) if need_print: with build_config: print(tvm.lower(schedule, [data, result], simple_mode=True)) if need_build: with build_config: tvm.build(schedule, [data, result], "cce", name=kernel_name)
def bn_training_reduce(x, sum, square_sum, kernel_name="bn_training_reduce"): """ algorithm: part of fused_batch_norm_v2 The first step of batch_norm which to calculate the sum and square sum of x. The major component of this operator is reduce operation. Parameters ---------- x: dict dict of input, A 5HD Tensor for input data. sum: dict dict of sum, A `Tensor`. Sum of x. square_sum: dict dict of square_sum, A `Tensor`. Square sum of x. kernel_name: str kernel name, default value is "bn_training_reduce" Returns ------- None """ shape_x = x.get("shape") dtype_x = x.get("dtype") check_shape(shape_x, param_name="x") check_dtype(dtype_x.lower(), ("float16", "float32"), param_name="x") data_format = x.get("format") origin_format = x.get("ori_format") _check_format(data_format, origin_format) x_input = tvm.placeholder(shape_x, name="x_input", dtype=dtype_x.lower()) res = bn_training_reduce_compute(x_input, sum, square_sum, kernel_name=kernel_name) if data_format == "NC1HWC0": with tvm.target.cce(): sch = generic.auto_schedule(res) else: sch, tensor_list = bn_training_reduce_schedule_nd(res) with build_config: tvm.build(sch, tensor_list, "cce", name=kernel_name) return tensor_list = [x_input] + list(res) config = {"name": kernel_name, "tensor_list": tensor_list} te.lang.cce.cce_build_code(sch, config)
def depthwise_weight_6d_2_4d(x, y, src_format, dst_format, kernel_name="depthwise_weight_6d_2_4d"): """Operation and Schedule for depthwise_weight_6d_2_4d. Parameters ---------- x: shape and dtype of input, the dtype support float16, float32, int32, uint16. y: the shape and dtype of outputs, the dtype same as input. src_format: the source data_format dst_format: the target data_format kernel_name : cce kernel name, default value is "depthwise_weight_6d_2_4d" Returns ------- convert C1HWNCoC0 tp HWCN """ _check_parameters(x, y, src_format, dst_format) output_shape = y.get("shape") channel_size = output_shape[2] input_shape = x.get("shape") dtype = x.get("dtype") channel_4d = channel_size op_utils.check_shape(input_shape, param_name="x") check_list = ("float16", "float32", "int32", "uint16") dtype = dtype.lower() op_utils.check_dtype(dtype, check_list, param_name="x") input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype) six2four = _Six2FourParam(input_shape, channel_4d) res = tvm.extern( [six2four.get_out_shape()], [input_data], lambda ins, outs: _intrin_factor(six2four, dtype, ins, outs), name="res", dtype=dtype) sch = tvm.create_schedule(res.op) build_list = [input_data, res] with build_config: tvm.build(sch, build_list, "cce", name=kernel_name)
def depthwise_weight_4d_2_6d(x, y, src_format, dst_format, kernel_name="depthwise_weight_4d_2_6d"): """Operation and Schedule for depthwise_weight_4d_2_6d. Parameters ---------- x: shape and dtype of input, the dtype support float16, float32, int32, uint16. y: the shape and dtype of outputs, the dtype same as input. src_format: the source data_format dst_format: the target data_format kernel_name : cce kernel name, default value is "depthwise_weight_4d_2_6d" Returns ------- convert HWCN to C1HWNCoC0 """ if src_format.lower() != "hwcn": raise RuntimeError("dst_format must be HWCN!") if dst_format.lower() != "c1hwncoc0": raise RuntimeError("src_format must be C1HWNCoC0 !") input_shape = x.get("shape") dtype = x.get("dtype") op_utils.check_shape(input_shape, param_name="x") check_list = ("float16", "float32", "int32", "uint16") dtype = dtype.lower() op_utils.check_dtype(dtype, check_list, param_name="x") input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype) four2six = _Four2SixParam(input_shape) res = tvm.extern( [four2six.get_out_shape()], [input_data], lambda ins, outs: _intrin_factor(four2six, dtype, ins, outs), name="res", dtype=dtype) sch = tvm.create_schedule(res.op) build_list = [input_data, res] with build_config: tvm.build(sch, build_list, "cce", name=kernel_name)
def zn_2_hwcn(src, dst, src_format, dst_format, kernel_name='zn_2_hwcn'): """ algorithm: zn_2_hwcn calculating: change data format from Zn to HWCN Parameters ---------- src: dict contains shape and dtype information of input tensor dst: dict contains shape and dtype information of output tensor src_format: str represents the format of input tensor, only support "Zn" dst_format: str represents the format of output tensor, only support "HWCN" kernel_name: str cce kernel name, default value is "zn_2_hwcn" Returns ------- None """ _check_parameters(src, dst, src_format, dst_format, kernel_name) dst_shape = dst.get("shape") dtype = src.get("dtype") h_i, w_i, c_i, n_i = dst_shape c_0 = 16 if dtype == "int8": c_0 = 32 c_1 = _ceil_div(c_i, c_0) n_ni = 16 n_no = _ceil_div(n_i, n_ni) shape_zn = [c_1*h_i*w_i, n_no, n_ni, c_0] branch = _get_ir_branch(shape_zn, dtype) data = tvm.placeholder(shape_zn, dtype=dtype, name="data") if branch == "more_row": res = tvm.extern(dst_shape, [data], lambda ins, outs: _more_row_ir(outs[0], ins[0], c_0), name="res", dtype=dtype) else: res = tvm.extern(dst_shape, [data], lambda ins, outs: _split_row_ir(outs[0], ins[0]), name="res", dtype=dtype) tensor_list = [data, res] sch = tvm.create_schedule(res.op) with build_config: tvm.build(sch, tensor_list, "cce", name=kernel_name)
def upsample(x, y, scale=1.0, stride_h=2, stride_w=2, kernel_name="upsample"): """ calculating data Parameters --------- x : dict include shape dtype and format stride_h : int the shape change axis h stride_w : int the shape change axis w scale : float the value of tensor change axis, default value is 1.0 y :output kernel_name : str kernel name, default value is "upsample" Returns ------- None """ upsample_check(x, stride_h, stride_w, kernel_name) dtype = x.get("dtype") op_list, ins_list, tensor_dic, feature, y \ = gen_upsample(x, dtype, scale, stride_h, stride_w) schedule = tvm.create_schedule(y.op) # skip the res buffer buffer_mapping(schedule, op_list[:-1]) tilling_spilt_axis_dic \ = tilling_spilt_axis(schedule, tensor_dic, stride_h, stride_w) cal_axis_dic, axis \ = cal_axis_spilt(x, stride_h, stride_w, tilling_spilt_axis_dic, tensor_dic, schedule) axis_list = upsample_compute(schedule, cal_axis_dic, tensor_dic) res_op = tensor_dic.get("res") ins_emit(schedule, op_list, axis_list, ins_list) if axis == 0: schedule[y].bind(cal_axis_dic.get("axis_xo"), tvm.thread_axis("blockIdx.x")) else: res_out, _ = bind_multcore(axis, x, schedule, res_op) schedule[y].bind(res_out, tvm.thread_axis("blockIdx.x")) with build_config: tvm.build(schedule, [feature, y], "cce", name=kernel_name)
def store_to_gm(input_x, output_x, kernel_name="store_to_gm"): """ copy data from l1 to ddr (l1 --> ub --> ddr) Parameters ---------- input_x : TVM tensor the input tensor output_x : dict dict of output_x, include keys(shape and dtype) kernel_name : str kernel name, default value is "store_to_gm" Returns ------- None """ input_shape = input_x.get("shape") input_dtype = input_x.get("dtype") input_tensor = tvm.placeholder(input_shape, name="input_tensor", dtype=input_dtype) res, res_ub = store_to_gm_compute(input_tensor, output_x, kernel_name=kernel_name) sch = tvm.create_schedule([res.op]) split_axis, split_factor = _tilling_axis(input_shape, input_dtype) axis_outer, axis_inner = sch[res].split(res.op.axis[split_axis], factor=split_factor) sch[res_ub].compute_at(sch[res], axis_outer) sch[input_tensor].set_scope(cce.scope_cbuf_fusion) sch[res_ub].set_scope(cce.scope_ubuf) sch[res_ub].emit_insn(res_ub.op.axis[split_axis], 'dma_copy') sch[res].emit_insn(axis_inner, 'dma_copy') tensor_list = [input_tensor, res] with build_config: tvm.build(sch, tensor_list, "cce", name=kernel_name)
def flatten(x, y, kernel_name="flatten"): """return a copy of the tensor collapsed into one dimension. Parameters ---------- x : dict shape and dtype of input. y : dict shape and dtype of output. kernel_name : str kernel name, default value is "flatten" Returns ------- None """ shape = x.get("shape") dtype = x.get("dtype") dtype_lower = dtype.lower() check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64", "float16", "float32") check_shape(shape, param_name="x") check_dtype(dtype_lower, check_list, param_name="x") size = 1 for i, _ in enumerate(shape): size = size * shape[i] shape_new = [size] data = tvm.placeholder(shape_new, name="data", dtype=dtype_lower) data_ub = tvm.compute(shape_new, lambda *i: data(*i), name='data_ub') res = tvm.compute(shape_new, lambda *i: data_ub(*i), name='res') sch = tvm.create_schedule(res.op) sch[data_ub].set_scope(tbe_platform.scope_ubuf) sch_new = _tile_axis([sch, data_ub, res], shape_new, dtype_lower) with build_config: tvm.build(sch_new, [data, res], "cce", name=kernel_name)
def build_cce(self): """ Build cce """ self._compute() tiling_cases = self._calc_tiling_case() for case in tiling_cases: tvm_vars = self.dim_info_vars.copy() right_dim_in = CompileVar("right_dim_in", self.right_range) tvm_vars.append(right_dim_in) split_factor = CompileVar("split_factor", case.get("ub_factor_bound")) tvm_vars.append(split_factor) var_list = [var.get_tvm_var() for var in tvm_vars] sch, tensor_list = self._unpack_schedule( case.get("block_tiling_axis"), right_dim_in.get_tvm_var(), case.get("ub_tiling_axis"), split_factor.get_tvm_var()) # set var bound for var in tvm_vars: sch.set_var_range(var.get_tvm_var(), *(var.get_bound())) self.sch_list.append(sch) self.arg_list.append(var_list + tensor_list) self.rules.append(case.get("key")) self.compile_vars[case.get("key")] = \ [var.get_name() for var in tvm_vars] build_config_items = {"parse_ddr_args": True, "build_fatbin": True} build_config = cce_build.build_config_update_list( cce_build.dynamic_build_config, build_config_items) with build_config: tvm.build(self.sch_list, self.arg_list, rules=self.rules, target="cce", name=self.kernel_name)
def batchnorm_fold2_grad_reduce(dout, x, dout_reduce, dout_x_reduce, kernel_name="batchnorm_fold2_grad_reduce"): """_BatchNormFold2GradReduce op""" shape = x.get("shape") x_format = x.get("format") util.check_kernel_name(kernel_name) util.check_shape_rule(shape) util.check_shape_size(shape, SHAPE_SIZE_LIMIT) check_list = ["float16", "float32"] inp_dtype = x.get("dtype").lower() if not inp_dtype in check_list: raise RuntimeError("Dtype of input only support float16, float32") dout_t = tvm.placeholder(shape, name="dout", dtype=inp_dtype) x_t = tvm.placeholder(shape, name="x", dtype=inp_dtype) res_list = batchnorm_fold2_grad_reduce_compute(dout_t, x_t, dout, kernel_name) if x_format == "NC1HWC0": with tvm.target.cce(): sch = generic.auto_schedule(res_list) tensor_list = [dout_t, x_t] + list(res_list) config = { "print_ir": False, "name": kernel_name, "tensor_list": tensor_list } te.lang.cce.cce_build_code(sch, config) return from impl.bn_training_reduce import bn_training_reduce_schedule_nd sch, tensor_list = bn_training_reduce_schedule_nd(res_list) with build_config: tvm.build(sch, tensor_list, "cce", name=kernel_name)
def space_to_batch_d(x, y, block_size, paddings, kernel_name="space_to_batch_d"): """ the main function of space_to_batch_d Parameters ---------- x: dict,shape and datatype,datatype supports float16,float32 y: dict,shape and datatype,datatype supports float16,float32 block_size: must be greater than one. It indicates the block size paddings: (tuple, list),the padding of the input with zeros across the spatial dimensions as follows: paddings = [[pad_top, pad_bottom], [pad_left, pad_right]] kernel_name: cce kernel name, default value is "space_to_batch_d" Returns ------- None """ if len(paddings) == 4: paddings = [[paddings[0], paddings[1]], [paddings[2], paddings[3]]] _check_param(x, y, paddings, block_size, kernel_name) input_shape = x.get("shape") input_dtype = x.get("dtype").lower() block_shape = [block_size, block_size] data = tvm.placeholder(input_shape, name="data", dtype=input_dtype) res = space_to_batch_nd_d_compute(data, y, block_shape, paddings, kernel_name) sch = tvm.create_schedule(res.op) with build_config: tvm.build(sch, [data, res], "cce", name=kernel_name)
def custom_round(shape, dtype, kernel_name="cce_round", need_build=False, need_print=False): """ doing round operations, calculating data type is float16 or float32 or int32 Parameters ---------- shape : shape of data dtype : the data type, assume src_dtype equals dst_dtype kernel_name : cce kernel name, default value is "cce_round" need_buid : if need to build CCEC kernel, default value is False need_print : if need to print the ir, default value is False Returns ------- None """ check_list = ["float16", "float32", "int32"] device_api_map = { "float16": "cc_device_round_float16", "float32": "cc_device_round_float", "int32": "cc_device_round_int32" } max_dim = 8 shape_len = len(shape) if shape_len > max_dim: raise RuntimeError( "round_cce only support up to %d dimensions while the shape's dimension is %d" % (max_dim, shape_len)) util.check_kernel_name(kernel_name) util.check_shape_rule(shape) util.check_shape_size(shape, SHAPE_SIZE_LIMIT) if not (dtype.lower() in check_list): raise RuntimeError("round_cce only support %s while dtype is %s" % (",".join(check_list), dtype)) inp_dtype = dtype.lower() shape = util.shape_refine(shape) data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype) device_api = device_api_map[inp_dtype] block_num = "block_num" block_idx = "block_idx" v_ndim = tvm.const(len(shape), "int32") padC0 = tvm.const(0, "int32") p_shape = util.create_param_ptr(shape, "int32", "p_shape") output = tvm.extern( shape, [data_input, p_shape], lambda ins, outs: tvm.call_extern( "int32_t", device_api, block_num, block_idx, v_ndim, ins[1].access_ptr("r"), # shape padC0, ins[0].access_ptr("r"), # input x outs[0].access_ptr("w")), name="output", dtype=inp_dtype) s = tvm.create_schedule(output.op) if need_print: with build_config: print(tvm.lower(s, [data_input, output], simple_mode=True)) if need_build: with build_config: tvm.build(s, [data_input, output], "cce", name=kernel_name)
def drop_out_do_mask(input_tensor, input_mask, input_keep_prob, output, kernel_name="dropout_do_mask"): """ algorithm: tf_dropout_do_mask scale_x = x*(1 / keep_prob) res = select(mask == 1, scale_x, 0) Parameters ---------- input_tensor : dict,shape and dtype of input_tensor,only support float16 and float32 input_mask : dict,shape and dtype of input_mask shape of mask,1D, dtype == uint8 length=(size(shape_tensor)+ELEMS_BATCH_PROCESS_FP16 -1)/ELEMS_BATCH_PROCESS_FP16*ELEMS_BATCH_PROCESS_FP16/8 eg. shape_tensor=[2,5,8] shape_mask=[16] shape_res=[2,5,8] shape_tensor=[15,17,19] shape_mask=[608] shape_res=[15,17,19] input_keep_prob : dict,shape and dtype of input_keep_prob shape of keep_prob, only 1 parament and equals to (1) prob scale (0.0,1.0] NOTICE: type same as dytpe output : dict,shape and dtype of output kernel_name : str cce kernel name, default value is "dropout_do_mask" Returns ------- None """ shape_tensor = input_tensor.get("shape") shape_mask = input_mask.get("shape") shape_keep_prob = input_keep_prob.get("shape") dtype = input_tensor.get("dtype") if shape_keep_prob == 1: shape_keep_prob = (shape_keep_prob, ) check_shape(shape_tensor, param_name="input_tensor") check_dtype(dtype.lower(), ["float16", "float32"], param_name="input_tensor") if len(shape_mask) != 1: raise RuntimeError("The length of mask shape must be 1") if shape_keep_prob not in [(1, ), [ 1, ]]: raise RuntimeError("Only support shape (1, ) or [1, ]") # functools_reduce: product of all dimension # Align to ELEMS_BATCH_PROCESS_FP16 product_mask = (functools_reduce(lambda x, y: x*y, shape_tensor[:]) + ELEMS_BATCH_PROCESS_FP16 - 1) // \ ELEMS_BATCH_PROCESS_FP16 * ELEMS_BATCH_PROCESS_FP16 // 8 if product_mask != shape_mask[0]: raise RuntimeError("The mask[0] should=%d, but now=%d" % (product_mask, shape_mask[0])) data_tensor = tvm.placeholder( (functools_reduce(lambda x, y: x * y, shape_tensor), ), dtype=dtype, name="data_tensor") data_mask = tvm.placeholder( (functools_reduce(lambda x, y: x * y, shape_mask), ), dtype='uint8', name="data_mask") keep_prob_tensor = tvm.placeholder(shape_keep_prob, dtype=dtype, name="keep_prob_tensor") const_1 = tvm.const(1.0, dtype=dtype) res = tvm.extern([shape_tensor, shape_mask, shape_keep_prob], [data_tensor, data_mask, keep_prob_tensor], lambda ins, outs: _kernel_ir(outs, ins, const_1), name="res", dtype=dtype) tensor_list = [data_tensor, data_mask, keep_prob_tensor, res] schedule = tvm.create_schedule(res.op) with build_config: tvm.build(schedule, tensor_list, "cce", name=kernel_name)
def custom_batch_matmul(shape_x, shape_y, dtype, trans_a=False, trans_b=False, kernel_name="cce_tf_batch_matmul", need_build=False, need_print=False): """ Multiplies slices of two tensors in batches(each slice can be viewed as an element of a batch), the output is of the same batch size. Each of the individual slices can optionally be transposed before multiplication by setting the trans_a or trans_b flag to True, which are by default False. The input tensors are 2-D or higher with the shape [..., r_x, c_x] and [..., r_y, c_y]. The output tensor is 2-D or higher with the shape [..., r_o, c_o], where r_o = c_x if trans_a else r_x c_o = r_y if trans_b else c_y Parameters ---------- shape_x : shape of the first tensor x with rank > 1 shape_y : shape of the second tensor y with the same type and shape with x dtype : the data type, support int8, uint8,float16,float32,int32 kernel_name : cce kernel name, default value is "cce_batch_matmul" trans_a : if True, shape_x is transposed before multiplication trans_b : if True, shape_y is transposed before multiplication need_buid : if need to build CCEC kernel, default value is False need_print : if need to print the ir, default value is False Returns ------- None """ util.check_kernel_name(kernel_name) util.check_shape_rule(shape_x) util.check_shape_rule(shape_y) util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT) data_dtype = dtype.lower() check_list = ["int8", "uint8", "float16", "float32", "int32"] if data_dtype not in check_list: raise RuntimeError( "batch_matmul_cce ony supports %s while dtype is %s" % (",".join(check_list), dtype)) def transpose_tensor(shape, size): """Transpose the shape, e.g., the shape [..., r_x, c_x] is transposed to [..., c_x, r_x]. Parameters ---------- shape : shape of a tensor size : length of the shape Returns ------- shape_ori : the transposed shape """ shape_ori = () if size == 1: shape_ori = shape_ori + shape elif size == 2: shape_ori = shape_ori + (shape[1], ) + (shape[0], ) else: shape_ori = shape_ori + (shape[:(size - 2)]) + ( shape[size - 1], ) + (shape[size - 2], ) return shape_ori def check_matmul(shape_x, shape_y): """Check whether batch_matmul is supported or not. Parameters ---------- shape_x : shape of the first tensor x shape_y : shape of the second tensor y with the same type and shape with x Returns ------- None """ len_x = len(shape_x) len_y = len(shape_y) if (len_x < 2) or (len_y < 2): raise RuntimeError("Only tensors of rank>=2 are supported!") if shape_x[len_x - 1] != shape_y[len_y - 2]: raise RuntimeError( "Invalid matrix multiplication for the inner 2 dimensions!") if (len_x == len_y) and (len_x > 2): for i in range(len_x - 2): if shape_x[i] != shape_y[i]: raise RuntimeError("Outer dimensions do not match!") return elif (len_x == len_y) and (len_x == 2): return else: raise RuntimeError("The input tensors are not with the same rank!") def _compute(output_shape, x, y, K, trans_a, trans_b, *indices): """matmul compuation in terms of the output shape and the transposes Parameters ---------- output_shape : the final output shape, e.g., shape_x = (2, 6), shape_y = (8, 2), trans_a = True, True_b = True, then, output_shape = (6, 8). x : the first input tensor according to shape_x. y : the second input tensor according to shape_y. K : the number of the axis for sum, in the above example, K = 2. trans_a : if True, x needs to be transposed. trans_b : if True, y needs to be transposed. *indices : the output shape space for tvm.compute. Returns ------- tvm.Tensor """ n_len = len(output_shape) k = tvm.reduce_axis((0, K), 'k') if trans_a is True and trans_b is False: # For example, A: (6, 7, 8), B: (6, 7, 9), so the length is n = 3 # C = A' * B : (6, 8, 9), A' means the transpose of A # indices means the space of (6, 8, 9), k = 7 # x_indices = indices[:1]+(7, )+indices[1:2] = (6, 7, 8) # y_indices = indices[:1]+(7, )+indices[2:] = (6, 7, 9) x_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 2): (n_len - 1)] y_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 1):] return tvm.sum(x(*x_indices) * y(*y_indices), axis=k) elif not trans_a and trans_b: # For example, A: (6, 7, 8), B: (6, 9, 8), C = A * B' : (6, 7, 9) # indices means the space of (6, 7, 9), n=3, k = 8 # x_indices = indices[:2]+(8, ) = (6, 7, 8) # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 9, 8) x_indices = indices[:(n_len - 1)] + (k, ) y_indices = indices[:(n_len - 2)] + indices[(n_len - 1):] + (k, ) return tvm.sum(x(*x_indices) * y(*y_indices), axis=k) elif trans_a and trans_b: # For example, A: (6, 8, 10), B: (6, 12, 8), C = A' * B' : \ # (6, 10, 12) # indices means the space of (6, 10, 12), n=3, k = 8 # x_indices = indices[:1]+(8, )+indices[1:2] = (6, 8, 10) # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 12, 8) x_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 2): (n_len - 1)] y_indices = indices[:(n_len - 2)] + indices[(n_len - 1):] + (k, ) return tvm.sum(x(*x_indices) * y(*y_indices), axis=k) else: # For example, A: (6, 15, 16), B: (6, 16, 18), C = A * B : \ # (6, 15, 18) # indices means the space of (6, 15, 18), n=3, k = 16 # x_indices = indices[:2]+(16, ) = (6, 15, 16) # y_indices = indices[:1]+(16, )+indices[2:] = (6, 16, 18) x_indices = indices[:(n_len - 1)] + (k, ) y_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 1):] return tvm.sum(x(*x_indices) * y(*y_indices), axis=k) def check_supportted_shape_size(shape_x, shape_y, limit, trans_a, trans_b): """ check shape size for operator ---------- shape: shape of data limit: limit of the product Returns ------- None """ # This function is used to check whether the shape is too large to \ # cause a timeout. # shape_x = (a,b,c,d,e,k) shape_y = (a,b,c,d,k,f) # t_1 : time consumed by each addition operation # t_2 : time consumed by each multiplication operation # t_all : time consumed by a complete calculation # t_all is approximately equal to (a*b*c*d)*(e*k*f)*(t_1+t_2) # As (t_1 + t_2) is a constant, so t_all is proportional to \ # (a * b * c * d * e * k * f) len_x = len(shape_x) len_y = len(shape_y) if (len_x < 2) or (len_y < 2): raise RuntimeError("Only tensors of rank>=2 are supported!") shape_x = list(shape_x) shape_y = list(shape_y) tmp_shape_x = shape_x[:] if trans_a: tmp_shape_x = shape_x[:-2] + [shape_x[-1], shape_x[-2]] tmp_shape_y = shape_y[:] if trans_b: tmp_shape_y = shape_y[:-2] + [shape_y[-1], shape_y[-2]] union_shape = tmp_shape_x + [tmp_shape_y[-1]] union_size = reduce(lambda i, j: i * j, union_shape) if union_size > limit: raise RuntimeError("the shape is too large to calculate") if data_dtype in ["float16", "float32", "int32"]: type_shape_map = { 'float16': SHAPE_SIZE_FP16_LIMIT, 'float32': SHAPE_SIZE_FP32_LIMIT, 'int32': SHAPE_SIZE_INT32_LIMIT } check_supportted_shape_size(shape_x, shape_y, type_shape_map[data_dtype], trans_a, trans_b) x_size = len(shape_x) y_size = len(shape_y) shape_a = shape_x shape_b = shape_y if trans_a is True: shape_x = transpose_tensor(shape_x, x_size) if trans_b is True: shape_y = transpose_tensor(shape_y, y_size) check_matmul(shape_x, shape_y) last_axis = shape_x[x_size - 1] x_temp = tvm.placeholder(shape_a, name="input_1", dtype=data_dtype) y_temp = tvm.placeholder(shape_b, name="input_2", dtype=data_dtype) # output shape output_shape = () for i in range(x_size - 1): output_shape = output_shape + (shape_x[i], ) output_shape = output_shape + (shape_y[x_size - 1], ) result = tvm.compute( output_shape, lambda *indices: _compute(output_shape, x_temp, y_temp, last_axis, trans_a, trans_b, *indices), name="result") schedule = tvm.create_schedule(result.op) if need_print: with build_config: print( tvm.lower(schedule, [x_temp, y_temp, result], simple_mode=True)) if need_build: with build_config: tvm.build(schedule, [x_temp, y_temp, result], "cce", name=kernel_name)
def custom_pow(shape, shape_y, dtype, kernel_name="cce_tf_pow", need_build=False, need_print=False): """ calculate x^y, calculating data type is float16 or float32 or int32 when x < 0 , the output is a meaningless value. Parameters ---------- shape : shape of data dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32, int32 kernel_name : cce kernel name, default value is "tf_pow_cce" need_buid : if need to build CCEC kernel, default value is False need_print : if need to print the ir, default value is False Returns ------- None """ supported_dtypes = ["float16", "float32", "int32"] device_api = "cc_device_pow" util.check_kernel_name(kernel_name) util.check_shape_rule(shape) util.check_shape_size(shape, SHAPE_SIZE_LIMIT) if not dtype.lower() in supported_dtypes: raise RuntimeError("tf_pow_cce only support %s while dtype is %s" % (",".join(supported_dtypes), dtype)) inp_dtype = dtype.lower() shape = util.shape_refine(shape) data_lhs = tvm.placeholder(shape, name="data_lhs", dtype=inp_dtype) data_rhs = tvm.placeholder(shape, name="data_rhs", dtype=inp_dtype) v_datatype = util.get_device_api_dtype(inp_dtype) v_ndim = len(shape) block_num = "block_num" block_idx = "block_idx" pad_c0 = 0 p_scale = util.create_param_ptr([0], inp_dtype, "p_scale") p_shift = util.create_param_ptr([0], inp_dtype, "p_shift") p_power = util.create_param_ptr([0], inp_dtype, "p_power") p_shape = util.create_param_ptr(shape, "int32", "p_shape") output = tvm.extern( shape, [data_lhs, data_rhs, p_scale, p_shift, p_power, p_shape], lambda ins, outs: tvm.call_extern( "int32_t", device_api, block_num, block_idx, v_datatype, ins[2].access_ptr("r"), # scale ins[3].access_ptr("r"), # shift ins[4].access_ptr("r"), # power v_ndim, ins[5].access_ptr("r"), # shape pad_c0, ins[0].access_ptr("r"), # input x v_ndim, v_ndim, ins[5].access_ptr("r"), # shape pad_c0, ins[1].access_ptr("r"), # input y outs[0].access_ptr("w")), name="output", dtype=inp_dtype) schedule = tvm.create_schedule(output.op) if need_print: with build_config: print( tvm.lower(schedule, [data_lhs, data_rhs, output], simple_mode=True)) if need_build: with build_config: tvm.build(schedule, [data_lhs, data_rhs, output], "cce", name=kernel_name)
def lars_v2_update(weight, grad, weight_s, grad_s, weight_decay, learning_rate, out, hyperparam=0.001, epsilon=1e-5, use_clip=False, kernel_name="lars_update"): """ the opreator's compute hyper_weight_norm = hyperparam * sqrt(weight_s) grad_weight_norm = sqrt(grad_s) + weight_decay*sqrt(weight_s) + epsilon grad_weight = grad + weight_decay * weight if use_clip == True: coeff = hyper_weight_norm / grad_weight_norm coeff = min(coeff / learning_rate, 1) coeff = max(coeff, 0) else: coeff = hyper_weight_norm / grad_weight_norm grad_new = coeff * grad_weight Parameters: ---------- weight: dict input tensor contains shape and dtype attributes. only support float32. grad: dict input tensor contains shape and dtype attributes. Must have the same dtype and shape as 'weight'. weight_s: dict input tensor contains shape and dtype attributes. Must have the same dtype as 'weight'. grad_s: dict input tensor contains shape and dtype attributes. Must have the same dtype as 'weight'. weight_decay: dict input tensor contains shape and dtype attributes. Must have the same dtype as 'weight'. learning_rate: dict input tensor contains shape and dtype attributes. Must have the same dtype as 'weight'. out: dict output tensor contains shape and dtype attributes. Must have the same dtype and shape as 'weight'. hyperparam: float default value is 0.001 epsilon: float default value is 1e-5 use_clip: bool default value is "False". kernel_name : str kernel name, default value is "lars_update" Returns: None """ check_list = ("float16", "float32") inputs = [weight, grad, weight_s, grad_s, weight_decay, learning_rate] weight_shape = weight.get("shape") grad_shape = grad.get("shape") weight_dtype = weight.get("dtype") grad_dtype = grad.get("dtype") if list(weight_shape) != list(grad_shape): raise RuntimeError("weight and grad must be the same shape") if grad_dtype != weight_dtype: raise RuntimeError("wight and grad must be the same dtype") vdiv_support = tbe_platform.cce_conf.api_check_support( "te.lang.cce.vdiv", "float32") if weight_dtype == "float32" and not vdiv_support: raise RuntimeError( "Input dtype is float32, but do not support on the platform") input_place_holders = [] for i, input_val in enumerate(inputs): input_dtype = input_val.get("dtype").lower() input_shape = input_val.get("shape") op_utils.check_shape(input_shape) op_utils.check_dtype(input_dtype, check_list) shape_one_dim = (functools_reduce(operator.mul, input_shape), ) input_place_holders.append( tvm.placeholder(shape_one_dim, name="input_data_%d" % i, dtype=input_dtype)) res = lars_v2_update_compute(input_place_holders, hyperparam, epsilon, use_clip, out, kernel_name) with tvm.target.cce(): schedule = generic.auto_schedule(res) data = input_place_holders data.append(res) new_config = build_config_update(build_config, "dummy_placeholder", True) with new_config: tvm.build(schedule, data, "cce", name=kernel_name)
def strided_slice_d(input_x, output_x, begin, end, strides=None, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0, kernel_name="strided_slice_d"): """ Extracts a strided slice of a tensor (generalized python array indexing). Roughly speaking, this op extracts a slice of size (end-begin)/stride from the given input_ tensor. Starting at the location specified by begin the slice continues by adding stride to the index until all dimensions are not less than end. Note that a stride can be negative, which causes a reverse slice. Parameters ---------- input_x : dict shape and dtype of input output_x : dict shape and dtype of out begin: list. represents the index of the first value to select. end: list. represents the index of the last value to select. strides: list or tuple. step length to select. begin_mask: int a bitmask where a bit i being 1 means to ignore the begin value and instead use the largest interval possible. end_mask: int analogous to `begin_mask`. ellipsis_mask: int a bitmask where bit `i` being 1 means the `i`th position is actually an ellipsis. new_axis_mask: int a bitmask where bit `i` being 1 means the `i`th specification creates a new shape 1 dimension. shrink_axis_mask: int a bitmask where bit `i` implies that the `i`th specification should shrink the dimensionality. kernel_name : str cce kernel name, default value is "strided_slice_d" Returns ------- None """ input_shape = input_x.get("shape") input_dtype = input_x.get("dtype").lower() check_list = ("float16", "float32", "int32", "uint8", "bool", "int8") check_dtype(input_dtype, check_list, param_name="input_x") check_shape(input_shape, param_name="input_x") begin = list(begin) end = list(end) if not _check_parameter(input_shape, begin, end, strides, ellipsis_mask, new_axis_mask, shrink_axis_mask): raise RuntimeError("Parameter Invalid!") if strides is None: strides = _fill_list_with_ones(len(input_shape)) else: strides = list(strides) input_tensor = tvm.placeholder(input_shape, dtype=input_dtype, name='input_tensor') [output, out_shape] = strided_slice_d_compute(input_tensor, output_x, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask, kernel_name=kernel_name) # pylint: disable=locally-disabled,unnecessary-lambda out_tensor = tvm.compute(out_shape, lambda *i: output(*i), name='out_tensor', tag='strided_slice_d|3') input_size = functools_reduce(lambda x, y: x * y, input_shape[0:]) out_size = functools_reduce(lambda x, y: x * y, out_shape[0:]) output_dtype = output_x.get("dtype").lower() output_shape = output_x.get("shape") if input_size == out_size: if output_dtype == "bool": input_x["dtype"] = "int8" output_x["dtype"] = "int8" if len(output_shape) == 0: output_x["shape"] = (1, ) copy_only(input_x, output_x, kernel_name) return output_shape_one = list(output_shape) if ellipsis_mask == 0 and shrink_axis_mask != 0: for i, _ in enumerate(list(input_shape)): if (shrink_axis_mask & 2**i) == 2**i: output_shape_one.insert(i, 1) output_shape = tuple(output_shape_one) # for RL tune getting res fusion_manager.set_op_res(out_tensor) ret, sch = rl_bank.query_rl_bank([out_tensor]) if ret and sch: with build_config: tvm.build(sch, [input_tensor, out_tensor], "cce", name=kernel_name) return sch = tvm.create_schedule(out_tensor.op) sch[output].set_scope(tbe_platform.scope_ubuf) sch_input_shape = [] for dim in output.shape: sch_input_shape.append(dim.value) check_result = _check_last_axis_situation(sch_input_shape, begin, end, strides) if check_result: _schedule_last_axis(sch, sch_input_shape, output, out_tensor, input_dtype) with build_config: tvm.build(sch, [input_tensor, out_tensor], "cce", name=kernel_name) return if _check_tik_branch(input_shape, output_shape, begin, end, strides): begin_shape = copy.deepcopy(begin) end_shape = copy.deepcopy(end) stride_shape = list(strides) stride_shape = copy.deepcopy(stride_shape) input_list = list(input_shape) # update begin_shape, end_shape begin_shape, end_shape, stride_shape = _init_parameter( input_list, begin_shape, end_shape, stride_shape, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask) head_size = 1 for i in range(0, (len(input_shape) - 1)): head_size = head_size * input_shape[i] if input_dtype == "float32" and input_shape[-1] == 2 and \ begin_shape[len(begin_shape) - 1] == 0 and end_shape[len(begin_shape) - 1] == 1 \ and head_size > 128: strided_slice_two_turn_one(input_x, output_x, kernel_name) return if input_list[-1] > 80 and output_shape[-1] == 80: res1 = strided_slice_last_dim_only(input_shape, input_dtype, output_shape, begin_shape, kernel_name) if res1: return if input_list[-1] >= 32 and input_list[-1] < 7500 and len(output_shape) > 1 and \ output_shape[-1] >= 32: res = strided_slice_last_dim_mte(input_shape, input_dtype, output_shape, begin_shape, kernel_name) if res: return res = strided_slice_last_dim(input_shape, input_dtype, output_shape, begin_shape, end_shape, stride_shape, kernel_name) if res: return else: res1 = strided_slice_last_dim_one(input_shape, input_dtype, output_shape, begin_shape, kernel_name) if res1: return split_axis, split_factor = _tilling_axis(out_shape, dtype=input_dtype) core_state = _get_multicore(out_shape, input_dtype, split_axis, split_factor) axis_outer, axis_inner = sch[out_tensor].split( out_tensor.op.axis[split_axis], factor=split_factor) if split_axis == 0: core_num = _get_target_core_num(out_shape[split_axis] // split_factor) axis_outer_outer, axis_outer_inter = sch[out_tensor].split( axis_outer, nparts=core_num) else: core_num = _get_target_core_num(out_shape[0]) axis_outer_outer, axis_outer_inter = sch[out_tensor].split( out_tensor.op.axis[0], nparts=core_num) for i in range(1, split_axis): axis_outer_inter = sch[out_tensor].fuse(axis_outer_inter, out_tensor.op.axis[i]) axis_outer_inter = sch[out_tensor].fuse(axis_outer_inter, axis_outer) sch[output].compute_at(sch[out_tensor], axis_outer_inter) sch[output].emit_insn(output.op.axis[0], insn_cmd.DMA_COPY) # gm-ub if len(out_shape) >= 2: # Convert bytes to Bytes dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len( input_dtype) // 8 # 32 means one block size(32 Bytes), divide by 32 to # get the numbers of data that # can be stored in one block. element = 32 // dtype_bytes_size align_axis = _get_align_axis(out_shape) sch[output].storage_align(output.op.axis[align_axis], element, 0) if core_state: thread_block = tvm.thread_axis("blockIdx.x") sch[out_tensor].bind(axis_outer_outer, thread_block) sch[out_tensor].emit_insn(axis_inner, insn_cmd.DMA_COPY) # ub-gm with build_config: tvm.build(sch, [input_tensor, out_tensor], "cce", name=kernel_name)
def unpack(x, y, num=None, axis=0, kernel_name="unpack"): """ unpacks the given dimension of a rank R tensor into rank (R-1) tensors. Parameters ---------- x : dict. shape, dtype and format of value to be unpacked. y: tuple or list the list of output tensor. num : int. the length of the dim axis, automatically inferred if None(default). axis: int. the axis to unpack along. kernel_name : str cce kernel name, default value is "unpack". Returns ------- None """ shape = x.get("shape") dtype = x.get("dtype").lower() format = x.get("format") _check_params(shape, num, axis, format, dtype, kernel_name) # infer the value of num real_axis = axis + len(shape) if axis < 0 else axis num = shape[real_axis] # turn the input shape into three dimensions (a, b, c), so axis = 1 beferdim = 1 for befer_dim in shape[0:real_axis]: beferdim *= befer_dim afterdim = 1 for after_dim in shape[real_axis + 1:]: afterdim *= after_dim reshape = (beferdim, shape[real_axis], afterdim) _, _, is_use_split = check_use_special_optimize(dtype, afterdim, flag=False) reshape_input = x.copy() reshape_input["shape"] = reshape real_axis = 1 # only 1 output tensor, so output equals to input if num == 1: copy_only(reshape_input, reshape_input, kernel_name) # use split elif is_use_split: split_d(reshape_input, y, split_dim=real_axis, num_split=num, kernel_name=kernel_name) else: new_dtype, afterdim, _ = check_use_special_optimize(dtype, afterdim, flag=False) new_shape = (beferdim, reshape[real_axis], afterdim) input_place = tvm.placeholder(new_shape, name="input_place", dtype=new_dtype) sch, build_list = _unpack_schedule(input_place, reshape, y, num, real_axis, dtype) with build_config: tvm.build(sch, build_list, "cce", name=kernel_name)
def custom_Upsample(shape, dtype, scale, data_format="channels_last", kernel_name="cce_darknet_upsample", need_build=False, need_print=False): """ Parameters ---------- shape: input tensor's shape dtype: input tensor's dtype, support:`float16,float32 scale: the upsampling factors data_format: "channels_last" or "channels_first" kernel_name : kernel name, default value is "MyUpsample" need_buid : if need to build CCEC kernel, default value is False need_print : if need to print the ir, default value is False Returns ------- None """ """ TODO: Please refer to the TE DSL Manual, And code here with TE DSL. """ inp_dtype = dtype.lower() check_list = ["float16", "float32", "int32", "int8", "uint8"] if inp_dtype not in check_list: raise RuntimeError("upsample only support %s while dtype is %s" % (",".join(check_list), dtype)) util.check_kernel_name(kernel_name) util.check_shape_rule(shape) util.check_shape_size(shape, SHAPE_SIZE_LIMIT) size = (scale, scale) shape_size = len(shape) if not (shape_size == 4 or shape_size == 5): raise RuntimeError( "upsample only support 4D or 5D while len(shape):%d" % len(shape)) input_tensor = tvm.placeholder(shape, name="input_tensor", dtype=inp_dtype) res = None if shape_size == 5: # shape_size == 5 D-sepecial (N, C1, H, W, C0) output_shape = (shape[0], shape[1], shape[2] * size[0], shape[3] * size[1], shape[4]) res = tvm.compute( output_shape, lambda n, c0, h, w, c: input_tensor[n, c0, h // size[ 0], w // size[1], c]) else: if data_format == "channels_last": output_shape = (shape[0], shape[1] * size[0], shape[2] * size[1], shape[3]) res = tvm.compute( output_shape, lambda n, h, w, c: input_tensor[n, h // size[0], w // size[1], c]) elif data_format == "channels_first": output_shape = (shape[0], shape[1], shape[2] * size[0], shape[3] * size[1]) res = tvm.compute( output_shape, lambda n, c, h, w: input_tensor[n, c, h // size[ 0], w // size[1]]) else: raise RuntimeError( "upsample only support channels_last|channels_first " "while input type %s" % data_format) schedule = tvm.create_schedule(res.op) if need_print: with build_config: print(tvm.lower(schedule, [input_tensor, res], simple_mode=True)) if need_build: with build_config: tvm.build(schedule, [input_tensor, res], "cce", name=kernel_name)
def avg_pool_grad_d(input_grad, mean_matrix, kernel_matrix, out_grad, orig_input_shape, ksize, strides, padding, data_format='NHWC', kernel_name="cce_avg_pool_grad_dilation"): """ computes average pooling backwards gradients. Parameters: ---------- input_grad: a dict, global model support 'NHWC' or 'NCHW' and padding valid, common model support 'NHWC' and float16 mean_matrix: a dict or nonetype, global model support 'NHWC' or 'NCHW' and padding valid, common model support 'NHWC' and float16 kernel_matrix: a dict or nonetype, global model support 'NHWC' or 'NCHW' and padding valid, common model support 'NHWC' and float16 out_grad: a dict, global model support 'NHWC' or 'NCHW' and padding valid, common model support 'NHWC' and float16 orig_input_shape: orward input shape, 4-D list, global model support 'NHWC' or 'NCHW' and padding valid, common model support 'NHWC' ksize: filter window size, int or 4-D list, support 'NHWC' strides: strides over h and w axis, int or 4-D list, support 'NHWC' or 'NCHW' padding:global model support 'NHWC' or 'NCHW' and padding valid data_format: support 'NHWC' or 'NCHW' kernel_name : cce kernel name, default value is "cce_avg_pool_grad_dilation" Returns ------- None """ input_grad_ori_format = input_grad.get('ori_format') if input_grad_ori_format == "NHWC": kernel_h = ksize[1] kernel_w = ksize[2] stride_h = strides[1] stride_w = strides[2] # transfer 4D to 5D orig_input_shape ON, OHH, OWW, OC = orig_input_shape elif input_grad_ori_format == "NCHW": kernel_h = ksize[2] kernel_w = ksize[3] stride_h = strides[2] stride_w = strides[3] # transfer 4D to 5D orig_input_shape ON, OC, OHH, OWW = orig_input_shape OC1 = _ceil(OC) // BLOCK_SIZE OC0 = BLOCK_SIZE orig_input_shape = ON, OC1, OHH, OWW, OC0 input_grad_shape = input_grad.get("shape") out_grad_shape = out_grad.get("shape") dtype = input_grad.get("dtype").lower() op_utils.check_shape(input_grad_shape, min_rank=INPUT_DIM, max_rank=INPUT_DIM) op_utils.check_shape(orig_input_shape, min_rank=INPUT_DIM, max_rank=INPUT_DIM) op_utils.check_shape(strides, min_rank=SHAPE_SIZE, max_rank=SHAPE_SIZE) op_utils.check_shape(ksize, min_rank=SHAPE_SIZE, max_rank=SHAPE_SIZE) if list(out_grad_shape) != list(orig_input_shape): raise RuntimeError("out_grad_shape must equal input_grad_shape") if stride_h < 1 or stride_w < 1: raise RuntimeError("stride should >= 1") data_dtype = dtype.lower() op_utils.check_dtype(data_dtype, ('float16', )) _, _, HH, WW, _ = orig_input_shape if (HH == kernel_h and WW == kernel_w and input_grad_shape[2] == 1 and input_grad_shape[3] == 1 and padding == 'VALID'): # for mobileV2 net, only support VALID padding. if padding != 'VALID': raise RuntimeError("gobla model ,padding only support VALID ") else: pad_top, pad_left, pad_bottom, pad_right = 0, 0, 0, 0 input_grad = tvm.placeholder(input_grad_shape, name="input_grad", dtype=data_dtype) # input_grad is overlapped result filter_num_h = (HH - kernel_h + pad_top + pad_bottom) // stride_h + 1 filter_num_w = (WW - kernel_w + pad_left + pad_right) // stride_w + 1 # global_avgpool, input FMAP size equals kernel size, kernel number=1 if not (filter_num_h == 1 and filter_num_w == 1): raise RuntimeError("global average pooling, input_grad_h" "and input_grad_w must equel 1") kernel_size_reciprocal = 1.0 / (kernel_h * kernel_w) with tvm.target.cce(): input_grad_fp32 = te.lang.cce.cast_to(input_grad, "float32") grad_tmp = te.lang.cce.vmuls(input_grad_fp32, kernel_size_reciprocal) if data_dtype == "float16": grad_tmp = te.lang.cce.cast_to(grad_tmp, "float16") res = te.lang.cce.broadcast(grad_tmp, orig_input_shape) sch = generic.auto_schedule(res) config = {"name": kernel_name, "tensor_list": [input_grad, res]} te.lang.cce.cce_build_code(sch, config) else: shape_in = orig_input_shape shape_in_n, shape_in_c1, shape_in_h, shape_in_w, shape_in_c0 = shape_in shape_k = (shape_in_c1, kernel_h, kernel_w, 1, BLOCK_SIZE, BLOCK_SIZE) shape_out = input_grad_shape shape_out_n, shape_out_c1, shape_out_h, shape_out_w, \ shape_out_c0 = shape_out # strides dim is two strides = stride_h, stride_w parameter_check(shape_in, shape_k, shape_out, dtype, strides, padding, kernel_name) shape_in = shape_in_n, shape_in_c1, 1, \ shape_in_h, shape_in_w, shape_in_c0 shape_k = (shape_out_c1, kernel_h * kernel_w, 1, BLOCK_SIZE, BLOCK_SIZE) shape_out = shape_out_n, shape_out_c1, 1, \ shape_out_h, shape_out_w, shape_out_c0 kernel_placeholder = tvm.placeholder(shape_k, dtype=dtype, name='kernel') dout_placeholder = tvm.placeholder(shape_out, dtype=dtype, name='dout') vealuemean_placeholder = tvm.placeholder(shape_out, dtype=dtype, name='dvealuemean') res = avg_pool_grad_compute(shape_in, kernel_placeholder, dout_placeholder, vealuemean_placeholder, [kernel_h, kernel_w], strides, padding) s = avg_pool_grad_schedule(res) with tbe_platform.build_config: tvm.build(s, [ dout_placeholder, vealuemean_placeholder, kernel_placeholder, res ], "cce", name=kernel_name)
def histogram_fixed_width_d(x, range, y, nbins, dtype="int32", kernel_name='histogram_fixed_width_d'): """this operation returns a rank 1 histogram counting the number of entries in `values` that fell into every bin. The bins are equal width and determined by the arguments `value_range` and `nbins`. Parameters ---------- x: dict dict info of input value, must include the keys(shape and dtype). range: dict dict info of input value_range, must include the keys(shape and dtype). the shape must be (2,) or [2] y: dict dict info of output nbins: int number of histogram bins. dtype: str data type for returned histogram. kernel_name: str cce kernel name, default value is "histogram_fixed_width" returns ------- None """ input_shape_list = [x.get("shape"), range.get("shape")] input_dtype = x.get("dtype") dtype_input = input_dtype.lower() check_shape(input_shape_list[0], param_name="x") check_shape(input_shape_list[1], param_name="range") util.compare_tensor_dict_key(x, range, "dtype") data_shape_size = util.check_tensor_shape_size(list(input_shape_list[0])) data_range_shape_size = util.check_tensor_shape_size( list(input_shape_list[1])) check_dtype(dtype_input, ("float16", "float32", "int32"), param_name="x") if data_range_shape_size != 2: raise RuntimeError("the shape of range must be (2,) or [2]") if nbins <= 0: raise RuntimeError("the nbins must be > 0") data = tvm.placeholder([data_shape_size], dtype=dtype_input, name="input_data") range_data = tvm.placeholder([data_range_shape_size], dtype=dtype_input, name="input_range_data") res = histogram_fixed_width_d_compute(data, range_data, y, nbins, kernel_name) sch = tvm.create_schedule(res.op) with build_config: tvm.build(sch, [data, range_data, res], "cce", name=kernel_name)
def cast(input_x, output_y, dst_type, kernel_name="cast"): """ cast a tensor/scaler with input shape form src data type to dst data type. restrictions of input algorithms are as follow only types' groups blow are support tensor process: float16->float32 float16->int32 float32->float16 float32->int32 int8->float32 uint8->float32 int8->float16 uint8->float16 int8->int32 uint8->int32 int32->uint8 // number out of [0,255] can get unexpected result int32->int8 // number out of [-128,127] can get unexpected result int32->float32 // For tans with fp16, only guarantees number in [-1023,1023] get correct result int32->float16 // only guarantees number in [-1023,1023] get correct result scale convert support:(means only support shape [1,]) int64->int32 int64->float32 Parameters ---------- input_x : dict shape and dtype of input, only support float16, float32 output_y: dict shape and dtype of output, should be same shape as input, and the dtype is the dst dtype need to cast kernel_name : str cce kernel name, default value is cast Returns ------- None """ shape = util.scalar2tensor_one(input_x.get("shape")) src_type = input_x.get("dtype").lower() check_shape(shape, param_name="input_x") if src_type == "bool": src_type = "int8" dst_type = _cast_dsttype_conversion(dst_type) fuseshape = [1] fuseshape[0] = reduceIns(lambda x, y: x * y, shape) data = tvm.placeholder(fuseshape, name="data", dtype=src_type) if src_type == "int64": check_dtype(dst_type, ("float32", "int32"), param_name="dst_type") res = tvm.extern( [fuseshape], [data], lambda ins, outs: _kernel_ir(outs, ins, dst_type, "int64"), name="res", dtype=dst_type) tensor_list = [data, res] schedule = tvm.create_schedule(res.op) with build_config: tvm.build(schedule, tensor_list, "cce", name=kernel_name) else: with tvm.target.cce(): res = cast_compute(data, output_y, dst_type, kernel_name) sch = generic.auto_schedule(res) config = { "print_ir": False, "name": kernel_name, "tensor_list": [data, res] } te.lang.cce.cce_build_code(sch, config)
def SpatialTransformer(input_shape, out_shape, dtype="float32", kernel_name="SpatialTransformer", need_build = True, need_print = False): """Spatial Transformer Layer Implements a spatial transformer layer as described in [1]_. Based on [2]_. Parameters ---------- input_shape : the shape of input tensor [num_batch, height, width, num_channels] out_shape: float the height and width of output tensor [out_height, out_width]. out_size: tuple of two ints The size of the output of the network (height, width) dtype: data type kernel_name : kernel name, default value is "SpatialTransformer" need_buid : if need to build CCEC kernel, default value is True need_print : if need to print the ir, default value is False Returns ------- tvm.Tensor References ---------- .. [1] Spatial Transformer Networks Max Jaderberg, Karen Simonyan, Andrew Zisserman, Koray Kavukcuoglu .. [2] https://github.com/tensorflow/models/tree/master/research/transformer """ def _meshgrid(height, width): y0 = tvm.compute((height,), lambda i: -1 + i * 2.0 / (height - 1), name = 'y0') x0 = tvm.compute((width,), lambda i: -1 + i * 2.0 / (width - 1), name = 'x0') y = tvm.compute((height * width,), lambda i: y0[i // width], name = 'y') x = tvm.compute((height * width,), lambda i: x0[i % width], name = 'x') y = topi.reshape(y, (1, height * width)) x = topi.reshape(x, (1, height * width)) ones = tvm.compute((1, height * width), lambda i,j:1, name = 'ones') grid = tvm.compute((3, height * width),lambda i,j: 0.5 * (i - 1) * (i - 2) * x[0,j] + i * (2 - i) * y[0,j] + 0.5 * i * (i-1) * ones[0,j], name = 'grid') #grid = topi.concatenate((x,y,ones),0) #can not use topi.concatenate return grid def _interpolate(im, im_shape, x, y, out_size, dtype): num_batch = im_shape[0] height = im_shape[1] width = im_shape[2] channels = im_shape[3] out_height = out_size[0] out_width = out_size[1] max_y = int(im_shape[1] - 1) max_x = int(im_shape[2] - 1) #[-1,1] -> [0, width-1] x = topi.multiply(topi.add(x, tvm.const(1, dtype=dtype)), width / tvm.const(2, dtype=dtype)) y = topi.multiply(topi.add(y, tvm.const(1, dtype=dtype)), height / tvm.const(2, dtype=dtype)) # do sampling dim3 = out_height * out_width * num_batch x0 = topi.cast(topi.floor(x), 'int32') y0 = topi.cast(topi.floor(y), 'int32') x1 = topi.add(x0,tvm.const(1, dtype="int32")) y1 = topi.add(y0,tvm.const(1, dtype="int32")) x0 = topi.clip(x0, 0, max_x) x1 = topi.clip(x1, 0, max_x) y0 = topi.clip(y0, 0, max_y) y1 = topi.clip(y1, 0, max_y) dim2 = width dim1 = width * height base = tvm.compute((dim3,),lambda i:(i // (out_height * out_width)) * width * height, name = 'base') base_y0 = topi.add(base, topi.multiply(y0, dim2)) base_y1 = topi.add(base, topi.multiply(y1, dim2)) idx_a = topi.add(base_y0, x0) idx_b = topi.add(base_y1, x0) idx_c = topi.add(base_y0, x1) idx_d = topi.add(base_y1, x1) im_flat = topi.reshape(im, (num_batch * height * width, channels)) im_flat = topi.cast(im_flat, dtype) Ia = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_a[i], j], name = 'Ia') Ib = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_b[i], j], name = 'Ib') Ic = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_c[i], j], name = 'Ic') Id = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_d[i], j], name = 'Id') x0_f = topi.cast(x0, dtype) x1_f = topi.cast(x1, dtype) y0_f = topi.cast(y0, dtype) y1_f = topi.cast(y1, dtype) wa = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x), topi.subtract(y1_f, y)), 1) wb = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x), topi.subtract(y, y0_f)), 1) wc = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f), topi.subtract(y1_f, y)), 1) wd = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f), topi.subtract(y, y0_f)), 1) output = topi.add(topi.add(topi.add(topi.multiply(wa, Ia), topi.multiply(wb, Ib)),topi.multiply(wc, Ic)), topi.multiply(wd, Id)) return output def _transform(theta, input_dim, out_size, input_shape, dtype): num_batch = input_shape[0] height = input_shape[1] width = input_shape[2] num_channels = input_shape[3] theta = topi.reshape(theta, (num_batch, 2, 3)) theta = topi.cast(theta, dtype) out_height = out_size[0] out_width = out_size[1] grid = _meshgrid(out_height, out_width) grid = topi.reshape(grid, (num_batch, 3, out_height*out_width)) grid = topi.cast(grid, dtype=dtype) k = tvm.reduce_axis((0, 3), 'k') T_g = tvm.compute((num_batch, 2, out_height*out_width),lambda b, y, x: tvm.sum(theta[b, y, k] * grid[b, k, x], axis = k), name = 'T_g') x_s = tvm.compute((num_batch, 1, out_height*out_width), lambda i,j,k:T_g[i,0,k], name = 'x_s') y_s = tvm.compute((num_batch, 1, out_height*out_width), lambda i,j,k:T_g[i,1,k], name = 'y_s') x_s_flat = topi.reshape(x_s, (num_batch*out_height*out_width,)) y_s_flat = topi.reshape(y_s, (num_batch*out_height*out_width,)) input_transformed = _interpolate(input_dim, input_shape, x_s_flat, y_s_flat, out_size, dtype) output = topi.reshape(input_transformed, [num_batch, out_height, out_width, num_channels]) return output num_batch = input_shape[0] input_height = input_shape[1] input_width = input_shape[2] channel = input_shape[3] U = tvm.placeholder((num_batch, input_height, input_width, channel), name="U", dtype=dtype) theta = tvm.placeholder((num_batch, 6, 1, 1), dtype=dtype) output = _transform(theta, U, out_shape, input_shape, dtype) s = tvm.create_schedule(output.op) if need_print: with build_config: print(tvm.lower(s, [U, theta, output], simple_mode=True)) if need_build: with build_config: tvm.build(s, [U, theta, output], "cce", name=kernel_name)
def custom_truncatemod(shape1, shape2, dtype, kernel_name="cce_tf_truncatemod", need_build=False, need_print=False): """ do element-wise truncatemod operation between two input tensors Parameters: ---------- shape1 : shape of input data1 shape2 : shape of input data2 dtype : source data type, support float16,float32,int32 kernel_name : cce kernel name, default value is "cce_tf_truncatemod" need_buid : if need to build CCEC kernel, default value is False need_print : if need to print the ir, default value is False Returns ------- None """ max_dim = 8 shape1_len = len(shape1) shape2_len = len(shape2) if shape1_len > max_dim or shape2_len > max_dim: raise RuntimeError( "mod_cce only support up to %d dimensions while the shape's \ dimensions is %d, %d" % (max_dim, shape1_len, shape2_len)) util.check_kernel_name(kernel_name) util.check_shape_rule(shape1) util.check_shape_rule(shape2) util.check_shape_size(shape1, SHAPE_SIZE_LIMIT) util.check_shape_size(shape2, SHAPE_SIZE_LIMIT) check_list = ["float16", "float32", "int32"] device_api_map = {"float16": "cc_device_truncatemod_float16", "float32": "cc_device_truncatemod_float", "int32": "cc_device_truncatemod_int32"} dtype = dtype.lower() if dtype not in check_list: raise RuntimeError( "tf_truncatemod_cce only support %s while dtype is %s" % ( ",".join(check_list), dtype)) shape1, shape2, shape_out = util.produce_shapes(shape1, shape2) util.check_shape_size(shape_out, SHAPE_SIZE_LIMIT) inp_dtype = dtype.lower() device_api = device_api_map[inp_dtype] # block block_num = "block_num" block_idx = "block_idx" # x param v_xndim_cnt = tvm.const(len(shape1), "int32") p_xshape = util.create_param_ptr(shape1, "int32", "p_xshape") xpad_c0 = tvm.const(0, "int32") data_input_x = tvm.placeholder(shape1, name="data_input_x", dtype=inp_dtype) # y param v_yndim_cnt = tvm.const(len(shape2), "int32") p_yshape = util.create_param_ptr(shape2, "int32", "p_yshape") ypad_c0 = tvm.const(0, "int32") data_input_y = tvm.placeholder(shape2, name="data_input_y", dtype=inp_dtype) # output v_out_ndim_cnt = tvm.const(len(shape_out), "int32") p_out_shape = util.create_param_ptr(shape_out, "int32", "p_yshape") out_padc0 = tvm.const(0, "int32") output = tvm.extern(shape_out, [p_xshape, data_input_x, p_yshape, data_input_y, p_out_shape], lambda ins, outs: tvm.call_extern("int32_t", device_api, block_num, block_idx, v_xndim_cnt, ins[0].access_ptr("r"), # shape x xpad_c0, ins[1].access_ptr("r"), # input x v_yndim_cnt, ins[2].access_ptr("r"), # shape y ypad_c0, ins[3].access_ptr("r"), # input y v_out_ndim_cnt, ins[4].access_ptr("r"), # shape out out_padc0, outs[0].access_ptr("w")), name="output", dtype=inp_dtype) schedule = tvm.create_schedule(output.op) # print IR if need_print: with build_config: print(tvm.lower(schedule, [data_input_x, data_input_y, output], simple_mode=True)) # Compile to generate the cce file if need_build: with build_config: tvm.build(schedule, [data_input_x, data_input_y, output], "cce", name=kernel_name)
def max_pool_grad_grad_with_argmax(x, grad, argmax, y, ksize, strides, padding="VALID", kernel_name="cce_max_pool_grad_grad" "_with_argmax"): """ Computes second-order gradients of the maxpooling function. Parameters ---------- x: dict Include info about ori_input, format, ori_format, shape, ori_shape, dtype. grad: dict Include info about grad of ori_input, format, ori_format, shape, ori_shape, dtype. argmax: dict Include info about ori_input, format, ori_format, shape, ori_shape, dtype. y: dict Include info about result of function, format, ori_format, shape, ori_shape, dtype. ksize: list or tuple The size of the window for each dimension of the input tensor. strides: list or tuple The stride of the sliding window of the input tensor. padding: str The type of padding algorithm to use. Only support "VALID" or "SAME" kernel_name: str Cce kernel name, default value is "cce_max_pool_grad_grad_with_argmax" Returns ------- None """ check_shape_and_format_vailded(x, grad, argmax, y, ksize, strides, padding, kernel_name) shape_x = x.get("shape") shape_grad = grad.get("shape") shape_argmax = argmax.get("shape") shape_argmax = (shape_argmax[0], shape_argmax[1], shape_argmax[2], shape_argmax[3] * shape_argmax[4], 1) dtype_x = x.get("dtype").lower() dtype_grad = grad.get("dtype").lower() ori_format_x = x.get("ori_format") x_tensor = tvm.placeholder(shape_x, dtype=dtype_x, name="input_x") # argmax is continuous bool, real type is uint16 _, _, _, howo, _ = shape_argmax shape_argmax_boolean = (shape_argmax[0], shape_argmax[1] * shape_argmax[2], howo // 16, 16, shape_argmax[4]) shape_argmax_boolean = list(shape_argmax_boolean[:-1]) + list( [shape_argmax_boolean[-1] * 16]) argmax_tensor = tvm.placeholder(shape_argmax_boolean, dtype="bool", name="argmax") grad_tensor = tvm.placeholder(shape_grad, dtype=dtype_grad, name="input_grad") compute_list = _max_pool_grad_grad_with_argmax_compute( [x_tensor, argmax_tensor, grad_tensor], x, argmax, grad, y, ksize, strides, padding, ori_format_x, kernel_name) res = compute_list[-1] sch = tvm.create_schedule(res.op) _max_pool_grad_grad_with_argmax_schedule(compute_list, [sch]) tensor_list = [x_tensor, grad_tensor, argmax_tensor, res] new_config = build_config_update(build_config, "dummy_placeholder", True) with new_config: tvm.build(sch, tensor_list, "cce", name=kernel_name)
def custom_Exp(shape, dtype, gamma, alpha, beta, kernel_name="cce_exp", need_build=False, need_print=False): """ calculate gamma **(alpha * data + beta), calculate exp(log(gamma) * alpha * data) * (gamma ** beta) Parameters ---------- shape : shape of data dtype : the data type, assume src_dtype equals dst_dtype, only support \ float16, float32 gamma : the data type must be same with dtype parameter args in (alpha * data + beta) ** gamma, base alpha : the data type must be same with dtype parameter args in (alpha * data + beta) ** gamma, scale beta : the data type must be same with dtype parameter args in (alpha * data + beta) ** gamma, shift kernel_name : cce kernel name, default value is "cce_exp" need_buid : if need to build CCEC kernel, default value is False need_print : if need to print the ir, default value is False Returns ------- None """ supported_dtypes = ["float16", "float32"] device_api = "DeviceExp" util.check_kernel_name(kernel_name) util.check_shape_rule(shape) util.check_shape_size(shape, SHAPE_SIZE_LIMIT) if not dtype.lower() in supported_dtypes: raise RuntimeError( "caffe_exp_layer_cce only support %s while dtype is %s" % (",".join(supported_dtypes), dtype)) if gamma != -1 and gamma <= 0: # api cc_device_exp_c handle gamma == -1 as e raise ValueError( "please ensure gamma is greater than 0, where gamma = %s" % str(gamma)) inp_dtype = dtype.lower() shape = util.shape_refine(shape) data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype) v_datatype = util.get_device_api_dtype(inp_dtype) v_ndim = len(shape) block_num = "block_num" block_idx = "block_idx" pad_c0 = 0 p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale") p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift") p_base = util.create_param_ptr([gamma], inp_dtype, "p_base") p_shape = util.create_param_ptr(shape, "int32", "p_shape") # scale --> alpha, shitf --> beta, base --> gamma output = tvm.extern( shape, [data_input, p_scale, p_shift, p_base, p_shape], lambda ins, outs: tvm.call_extern( "int32_t", device_api, block_num, block_idx, v_datatype, ins[1].access_ptr("r"), # scale ins[2].access_ptr("r"), # shift ins[3].access_ptr("r"), # base v_ndim, ins[4].access_ptr("r"), # shape pad_c0, ins[0].access_ptr("r"), # input x outs[0].access_ptr("w")), name="output", dtype=inp_dtype) schedule = tvm.create_schedule(output.op) if need_print: with build_config: print(tvm.lower(schedule, [data_input, output], simple_mode=True)) if need_build: with build_config: tvm.build(schedule, [data_input, output], "cce", name=kernel_name)
def custom_expm1(shape, dtype, kernel_name="cce_tf_expm1", need_build=False, need_print=False): """ algorithm: expm1 calculating data's expm1, y= (e ** x) - 1,dtype is float16 or float32. Parameters ---------- shape : shape of data. dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32. kernel_name : cce kernel name, default value is "cce_tf_expm1". need_buid : if need to build CCEC kernel, default value is False. need_print : if need to print the ir, default value is False. Returns ------- None """ # [aicpu] int32_t cc_device_exp(uint32_t blockNum, uint32_t blockIdx, int32_t dataType, const void *scale, const void *shift, # const void *base, int32_t dimCnt, int32_t *shape, uint32_t padC0, const void *x, void *y); supported_dtypes = ["float16", "float32"] util.check_kernel_name(kernel_name) util.check_shape_rule(shape) util.check_shape_size(shape, SHAPE_SIZE_LIMIT) if not (dtype.lower() in supported_dtypes): raise RuntimeError("tf_expm1_cce only support %s while dtype is %s" % (",".join(supported_dtypes), dtype)) inp_dtype = dtype.lower() shape = util.shape_refine(shape) data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype) # step 1. calculate y = exp ** x by aicpu api device_api = "DeviceExp" v_datatype = util.get_device_api_dtype(inp_dtype) v_ndim = len(shape) block_num = "block_num" block_idx = "block_idx" padC0 = 0 p_scale = util.create_param_ptr([1], inp_dtype, "p_scale") p_shift = util.create_param_ptr([0], inp_dtype, "p_shift") p_base = util.create_param_ptr([-1], inp_dtype, "p_base") p_shape = util.create_param_ptr(shape, "int32", "p_shape") output_exp = tvm.extern( shape, [data_input, p_scale, p_shift, p_base, p_shape], lambda ins, outs: tvm.call_extern( "int32_t", device_api, block_num, block_idx, v_datatype, ins[1].access_ptr("r"), # scale ins[2].access_ptr("r"), # shift ins[3].access_ptr("r"), # base v_ndim, ins[4].access_ptr("r"), # shape padC0, ins[0].access_ptr("r"), # input x outs[0].access_ptr("w")), name="output_exp", dtype=inp_dtype) offset = tvm.const((-1), dtype=inp_dtype) # step 2. cauculate y = exp ** x - 1 by tvm output = tvm.compute( shape, lambda *indice: output_exp(*indice) + offset.astype(inp_dtype), name="output") # step 3. schedule the computation by tvm s = tvm.create_schedule(output.op) # step 4. build by tvm if need_print: with build_config: print(tvm.lower(s, [data_input, output], simple_mode=True)) if need_build: with build_config: tvm.build(s, [data_input, output], "cce", name=kernel_name)