def zn_2_hwcn(src, dst, src_format, dst_format, kernel_name='zn_2_hwcn'):
    """
    Build the cce kernel that changes the data format from Zn to HWCN.

    Parameters
    ----------
    src: dict
        shape and dtype information of the input tensor; only "dtype" is
        read here
    dst: dict
        shape and dtype information of the output tensor; "shape" must
        unpack into four values (h, w, c, n)
    src_format: str
        format of the input tensor, forwarded to _check_parameters
    dst_format: str
        format of the output tensor, forwarded to _check_parameters
    kernel_name: str
        cce kernel name, default value is "zn_2_hwcn"

    Returns
    -------
    None
    """
    _check_parameters(src, dst, src_format, dst_format, kernel_name)

    hwcn_shape = dst.get("shape")
    dtype = src.get("dtype")
    h_i, w_i, c_i, n_i = hwcn_shape

    # c_0 is 32 for int8 and 16 for every other dtype
    c_0 = 32 if dtype == "int8" else 16
    c_1 = _ceil_div(c_i, c_0)
    n_ni = 16
    n_no = _ceil_div(n_i, n_ni)
    zn_shape = [c_1*h_i*w_i, n_no, n_ni, c_0]

    # pick the IR emitter that matches the branch chosen for this shape
    if _get_ir_branch(zn_shape, dtype) == "more_row":
        def _emit_ir(ins, outs):
            return _more_row_ir(outs[0], ins[0], c_0)
    else:
        def _emit_ir(ins, outs):
            return _split_row_ir(outs[0], ins[0])

    data = tvm.placeholder(zn_shape, dtype=dtype, name="data")
    res = tvm.extern(hwcn_shape, [data], _emit_ir, name="res", dtype=dtype)

    sch = tvm.create_schedule(res.op)
    with build_config:
        tvm.build(sch, [data, res], "cce", name=kernel_name)
def depthwise_weight_6d_2_4d(x, y, src_format, dst_format,
                             kernel_name="depthwise_weight_6d_2_4d"):
    """Operation and Schedule for depthwise_weight_6d_2_4d.

    Converts a C1HWNCoC0 weight tensor to HWCN.

    Parameters
    ----------
    x: dict
        shape and dtype of input, the dtype support float16, float32,
        int32, uint16.
    y: dict
        shape and dtype of output; element 2 of its shape is used as the
        4D channel size.
    src_format: str
        the source data_format
    dst_format: str
        the target data_format
    kernel_name: str
        cce kernel name, default value is "depthwise_weight_6d_2_4d"

    Returns
    -------
    None
    """
    _check_parameters(x, y, src_format, dst_format)

    # channel count of the 4D result comes from axis 2 of the output shape
    channel_4d = y.get("shape")[2]
    input_shape = x.get("shape")
    dtype = x.get("dtype")

    op_utils.check_shape(input_shape, param_name="x")
    dtype = dtype.lower()
    op_utils.check_dtype(dtype, ("float16", "float32", "int32", "uint16"),
                         param_name="x")

    input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype)
    six2four = _Six2FourParam(input_shape, channel_4d)

    def _emit_ir(ins, outs):
        return _intrin_factor(six2four, dtype, ins, outs)

    res = tvm.extern([six2four.get_out_shape()], [input_data], _emit_ir,
                     name="res", dtype=dtype)

    sch = tvm.create_schedule(res.op)
    with build_config:
        tvm.build(sch, [input_data, res], "cce", name=kernel_name)
def depthwise_weight_4d_2_6d(x, y, src_format, dst_format,
                             kernel_name="depthwise_weight_4d_2_6d"):
    """Operation and Schedule for depthwise_weight_4d_2_6d.

    Converts an HWCN weight tensor to C1HWNCoC0.

    Parameters
    ----------
    x: dict
        shape and dtype of input, the dtype support float16, float32,
        int32, uint16.
    y: dict
        shape and dtype of output; not read by this function.
    src_format: str
        the source data_format, must be "HWCN" (case-insensitive)
    dst_format: str
        the target data_format, must be "C1HWNCoC0" (case-insensitive)
    kernel_name: str
        cce kernel name, default value is "depthwise_weight_4d_2_6d"

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if src_format or dst_format is not the supported format.
    """
    # Each message names the parameter that actually failed validation.
    if src_format.lower() != "hwcn":
        raise RuntimeError("src_format must be HWCN!")
    if dst_format.lower() != "c1hwncoc0":
        raise RuntimeError("dst_format must be C1HWNCoC0 !")

    input_shape = x.get("shape")
    dtype = x.get("dtype")
    op_utils.check_shape(input_shape, param_name="x")

    check_list = ("float16", "float32", "int32", "uint16")
    dtype = dtype.lower()
    op_utils.check_dtype(dtype, check_list, param_name="x")

    input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype)
    four2six = _Four2SixParam(input_shape)
    res = tvm.extern(
        [four2six.get_out_shape()], [input_data],
        lambda ins, outs: _intrin_factor(four2six, dtype, ins, outs),
        name="res", dtype=dtype)

    sch = tvm.create_schedule(res.op)
    build_list = [input_data, res]
    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)
def histogram_fixed_width_d_compute(x, range, y, nbins,
                                    kernel_name="histogram_fixed_width_d"):
    """TVM calculation process, used for fusion operation
    compute for histogram_fixed_width

    Parameters
    ----------
    x: TVM tensor
        the placeholders of x
    range: TVM tensor
        the placeholders of range
    y: dict
        dict info of output, not used
    nbins: int
        number of histogram bins; also the length of the 1D result.
    kernel_name: str
        cce kernel name, not used

    Returns
    -------
    res: TVM tensor
        the result histogram_fixed_width, declared with dtype "int32"
    """
    shape_to_list = te.lang.cce.util.shape_to_list
    # shapes of both inputs, handed to the IR builder as [x shape, range shape]
    input_shapes = [shape_to_list(x.shape), shape_to_list(range.shape)]

    def _emit_ir(ins, outs):
        return _histogram_fixed_width_ir(outs, ins, nbins, input_shapes)

    return tvm.extern([nbins], [x, range], _emit_ir,
                      name="res", dtype="int32")
def custom_Exp(shape, dtype, gamma, alpha, beta, kernel_name="cce_exp",
               need_build=False, need_print=False):
    """
    calculate gamma ** (alpha * data + beta) through the "DeviceExp"
    device api.

    Parameters
    ----------
    shape : shape of data
    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32
    gamma : base of the exponentiation; must be greater than 0, or exactly
        -1 (the device api handles -1 as e)
    alpha : scale applied to data inside the exponent
    beta : shift added inside the exponent
    kernel_name : cce kernel name, default value is "cce_exp"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if dtype is not float16 or float32
    ValueError
        if gamma is not -1 and not greater than 0
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    if inp_dtype not in supported_dtypes:
        raise RuntimeError(
            "caffe_exp_layer_cce only support %s while dtype is %s"
            % (",".join(supported_dtypes), dtype))

    if gamma != -1 and gamma <= 0:
        # api cc_device_exp_c handle gamma == -1 as e
        raise ValueError(
            "please ensure gamma is greater than 0, where gamma = %s"
            % str(gamma))

    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0

    # scale --> alpha, shift --> beta, base --> gamma
    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([gamma], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    def _call_device(ins, outs):
        return tvm.call_extern(
            "int32_t", device_api,
            block_num, block_idx, v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w"))

    output = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_base, p_shape],
        _call_device, name="output", dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)
    build_args = [data_input, output]

    if need_print:
        with build_config:
            print(tvm.lower(schedule, build_args, simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, build_args, "cce", name=kernel_name)
def custom_truncatemod(shape1, shape2, dtype, kernel_name="cce_tf_truncatemod",
                       need_build=False, need_print=False):
    """
    do element-wise truncatemod operation between two input tensors

    Parameters:
    ----------
    shape1 : shape of input data1
    shape2 : shape of input data2
    dtype : source data type, support float16,float32,int32
    kernel_name : cce kernel name, default value is "cce_tf_truncatemod"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if either shape has more than 8 dimensions, or dtype is not
        supported
    """
    max_dim = 8
    shape1_len = len(shape1)
    shape2_len = len(shape2)
    if shape1_len > max_dim or shape2_len > max_dim:
        # Adjacent literals instead of a backslash continuation inside the
        # string, which embedded stray whitespace in the message.
        raise RuntimeError(
            "mod_cce only support up to %d dimensions while the shape's "
            "dimensions is %d, %d" % (max_dim, shape1_len, shape2_len))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape1)
    util.check_shape_rule(shape2)
    util.check_shape_size(shape1, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape2, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    device_api_map = {"float16": "cc_device_truncatemod_float16",
                      "float32": "cc_device_truncatemod_float",
                      "int32": "cc_device_truncatemod_int32"}

    inp_dtype = dtype.lower()
    if inp_dtype not in check_list:
        raise RuntimeError(
            "tf_truncatemod_cce only support %s while dtype is %s" % (
                ",".join(check_list), inp_dtype))

    shape1, shape2, shape_out = util.produce_shapes(shape1, shape2)
    util.check_shape_size(shape_out, SHAPE_SIZE_LIMIT)

    device_api = device_api_map[inp_dtype]

    # block
    block_num = "block_num"
    block_idx = "block_idx"

    # x param
    v_xndim_cnt = tvm.const(len(shape1), "int32")
    p_xshape = util.create_param_ptr(shape1, "int32", "p_xshape")
    xpad_c0 = tvm.const(0, "int32")
    data_input_x = tvm.placeholder(shape1, name="data_input_x",
                                   dtype=inp_dtype)

    # y param
    v_yndim_cnt = tvm.const(len(shape2), "int32")
    p_yshape = util.create_param_ptr(shape2, "int32", "p_yshape")
    ypad_c0 = tvm.const(0, "int32")
    data_input_y = tvm.placeholder(shape2, name="data_input_y",
                                   dtype=inp_dtype)

    # output
    v_out_ndim_cnt = tvm.const(len(shape_out), "int32")
    # Own name for the output-shape tensor; it previously reused "p_yshape",
    # which duplicated the y-shape tensor's name.
    p_out_shape = util.create_param_ptr(shape_out, "int32", "p_out_shape")
    out_padc0 = tvm.const(0, "int32")

    output = tvm.extern(
        shape_out,
        [p_xshape, data_input_x, p_yshape, data_input_y, p_out_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t", device_api,
            block_num, block_idx,
            v_xndim_cnt,
            ins[0].access_ptr("r"),  # shape x
            xpad_c0,
            ins[1].access_ptr("r"),  # input x
            v_yndim_cnt,
            ins[2].access_ptr("r"),  # shape y
            ypad_c0,
            ins[3].access_ptr("r"),  # input y
            v_out_ndim_cnt,
            ins[4].access_ptr("r"),  # shape out
            out_padc0,
            outs[0].access_ptr("w")),
        name="output", dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    # print IR
    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input_x, data_input_y, output],
                            simple_mode=True))
    # Compile to generate the cce file
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input_x, data_input_y, output], "cce",
                      name=kernel_name)
def cast(input_x, output_y, dst_type, kernel_name="cast"):
    """
    cast a tensor/scalar from its source data type to the destination
    data type.

    Conversions listed as supported for tensors:
        float16->float32
        float16->int32
        float32->float16
        float32->int32
        int8->float32
        uint8->float32
        int8->float16
        uint8->float16
        int8->int32
        uint8->int32
        int32->uint8   // number out of [0,255] can get unexpected result
        int32->int8    // number out of [-128,127] can get unexpected result
        int32->float32 // goes through fp16, only guarantees number in
                       // [-1023,1023] get correct result
        int32->float16 // only guarantees number in [-1023,1023] get
                       // correct result
    Scalar conversions (only shape [1,] is supported):
        int64->int32
        int64->float32

    Parameters
    ----------
    input_x : dict
        shape and dtype of input; a "bool" dtype is handled as "int8"
    output_y: dict
        shape and dtype of output, should be same shape as input, and the
        dtype is the dst dtype need to cast
    dst_type : destination data type, normalised by
        _cast_dsttype_conversion before use
    kernel_name : str
        cce kernel name, default value is cast

    Returns
    -------
    None
    """
    shape = util.scalar2tensor_one(input_x.get("shape"))
    src_type = input_x.get("dtype").lower()
    check_shape(shape, param_name="input_x")

    if src_type == "bool":
        src_type = "int8"
    dst_type = _cast_dsttype_conversion(dst_type)

    # collapse every axis into a single one
    fuseshape = [reduceIns(lambda x, y: x * y, shape)]
    data = tvm.placeholder(fuseshape, name="data", dtype=src_type)

    if src_type != "int64":
        with tvm.target.cce():
            res = cast_compute(data, output_y, dst_type, kernel_name)
            sch = generic.auto_schedule(res)
        config = {
            "print_ir": False,
            "name": kernel_name,
            "tensor_list": [data, res]
        }
        te.lang.cce.cce_build_code(sch, config)
        return

    # int64 source goes through the hand-written kernel IR
    check_dtype(dst_type, ("float32", "int32"), param_name="dst_type")

    def _emit_ir(ins, outs):
        return _kernel_ir(outs, ins, dst_type, "int64")

    res = tvm.extern([fuseshape], [data], _emit_ir,
                     name="res", dtype=dst_type)
    schedule = tvm.create_schedule(res.op)
    with build_config:
        tvm.build(schedule, [data, res], "cce", name=kernel_name)
def custom_round(shape, dtype, kernel_name="cce_round", need_build=False,
                 need_print=False):
    """
    doing round operations, calculating data type is float16 or float32
    or int32

    Parameters
    ----------
    shape : shape of data, at most 8 dimensions
    dtype : the data type, assume src_dtype equals dst_dtype
    kernel_name : cce kernel name, default value is "cce_round"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if shape has more than 8 dimensions or dtype is not supported
    """
    check_list = ["float16", "float32", "int32"]
    device_api_map = {
        "float16": "cc_device_round_float16",
        "float32": "cc_device_round_float",
        "int32": "cc_device_round_int32"
    }

    max_dim = 8
    rank = len(shape)
    if rank > max_dim:
        raise RuntimeError(
            "round_cce only support up to %d dimensions while the shape's dimension is %d"
            % (max_dim, rank))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    if inp_dtype not in check_list:
        raise RuntimeError("round_cce only support %s while dtype is %s"
                           % (",".join(check_list), dtype))

    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    device_api = device_api_map[inp_dtype]

    block_num = "block_num"
    block_idx = "block_idx"
    v_ndim = tvm.const(len(shape), "int32")
    pad_c0 = tvm.const(0, "int32")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    def _call_device(ins, outs):
        return tvm.call_extern(
            "int32_t", device_api,
            block_num, block_idx,
            v_ndim,
            ins[1].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w"))

    output = tvm.extern(shape, [data_input, p_shape], _call_device,
                        name="output", dtype=inp_dtype)

    sch = tvm.create_schedule(output.op)
    build_args = [data_input, output]

    if need_print:
        with build_config:
            print(tvm.lower(sch, build_args, simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, build_args, "cce", name=kernel_name)
def drop_out_do_mask(input_tensor, input_mask, input_keep_prob, output,
                     kernel_name="dropout_do_mask"):
    """
    algorithm: tf_dropout_do_mask
    scale_x = x*(1 / keep_prob)
    res = select(mask == 1, scale_x, 0)

    Parameters
    ----------
    input_tensor : dict, shape and dtype of input_tensor, only support
        float16 and float32
    input_mask : dict, shape and dtype of input_mask
        shape of mask is 1D, dtype == uint8, with
        length = (size(shape_tensor) + ELEMS_BATCH_PROCESS_FP16 - 1)
                 / ELEMS_BATCH_PROCESS_FP16 * ELEMS_BATCH_PROCESS_FP16 / 8
        eg. shape_tensor=[2,5,8] shape_mask=[16] shape_res=[2,5,8]
            shape_tensor=[15,17,19] shape_mask=[608] shape_res=[15,17,19]
    input_keep_prob : dict, shape and dtype of input_keep_prob
        shape must be (1, ) or [1, ] (a bare 1 is also accepted);
        prob scale (0.0,1.0]; the placeholder uses input_tensor's dtype
    output : dict, shape and dtype of output; not read by this function
    kernel_name : str
        cce kernel name, default value is "dropout_do_mask"

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if the mask is not 1D, keep_prob does not have shape (1, ), or the
        mask length does not match the aligned tensor size
    """
    shape_tensor = input_tensor.get("shape")
    shape_mask = input_mask.get("shape")
    shape_keep_prob = input_keep_prob.get("shape")
    dtype = input_tensor.get("dtype")

    # a scalar shape of 1 is normalised to the tuple form
    if shape_keep_prob == 1:
        shape_keep_prob = (shape_keep_prob, )

    check_shape(shape_tensor, param_name="input_tensor")
    check_dtype(dtype.lower(), ["float16", "float32"],
                param_name="input_tensor")

    if len(shape_mask) != 1:
        raise RuntimeError("The length of mask shape must be 1")
    if shape_keep_prob not in ((1, ), [1, ]):
        raise RuntimeError("Only support shape (1, ) or [1, ]")

    # functools_reduce: product of all dimension
    tensor_size = functools_reduce(lambda x, y: x*y, shape_tensor[:])
    # Align to ELEMS_BATCH_PROCESS_FP16
    aligned_size = (tensor_size + ELEMS_BATCH_PROCESS_FP16 - 1) // \
        ELEMS_BATCH_PROCESS_FP16 * ELEMS_BATCH_PROCESS_FP16
    product_mask = aligned_size // 8
    if product_mask != shape_mask[0]:
        raise RuntimeError("The mask[0] should=%d, but now=%d" %
                           (product_mask, shape_mask[0]))

    mask_size = functools_reduce(lambda x, y: x * y, shape_mask)

    data_tensor = tvm.placeholder((tensor_size, ), dtype=dtype,
                                  name="data_tensor")
    data_mask = tvm.placeholder((mask_size, ), dtype='uint8',
                                name="data_mask")
    keep_prob_tensor = tvm.placeholder(shape_keep_prob, dtype=dtype,
                                       name="keep_prob_tensor")
    const_1 = tvm.const(1.0, dtype=dtype)

    def _emit_ir(ins, outs):
        return _kernel_ir(outs, ins, const_1)

    res = tvm.extern([shape_tensor, shape_mask, shape_keep_prob],
                     [data_tensor, data_mask, keep_prob_tensor],
                     _emit_ir, name="res", dtype=dtype)

    schedule = tvm.create_schedule(res.op)
    with build_config:
        tvm.build(schedule,
                  [data_tensor, data_mask, keep_prob_tensor, res],
                  "cce", name=kernel_name)
def custom_pow(shape, shape_y, dtype, kernel_name="cce_tf_pow",
               need_build=False, need_print=False):
    """
    calculate x^y, calculating data type is float16 or float32 or int32
    when x < 0 , the output is a meaningless value.

    Parameters
    ----------
    shape : shape of data; used for both operands
    shape_y : not read by this function
    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32, int32
    kernel_name : cce kernel name, default value is "cce_tf_pow"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if dtype is not supported
    """
    supported_dtypes = ["float16", "float32", "int32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    if inp_dtype not in supported_dtypes:
        raise RuntimeError("tf_pow_cce only support %s while dtype is %s"
                           % (",".join(supported_dtypes), dtype))

    shape = util.shape_refine(shape)
    data_lhs = tvm.placeholder(shape, name="data_lhs", dtype=inp_dtype)
    data_rhs = tvm.placeholder(shape, name="data_rhs", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0

    # scale, shift and power are all passed to the device api as zero
    p_scale = util.create_param_ptr([0], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([0], inp_dtype, "p_power")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    def _call_device(ins, outs):
        return tvm.call_extern(
            "int32_t", device_api,
            block_num, block_idx, v_datatype,
            ins[2].access_ptr("r"),  # scale
            ins[3].access_ptr("r"),  # shift
            ins[4].access_ptr("r"),  # power
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            v_ndim,
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[1].access_ptr("r"),  # input y
            outs[0].access_ptr("w"))

    output = tvm.extern(
        shape, [data_lhs, data_rhs, p_scale, p_shift, p_power, p_shape],
        _call_device, name="output", dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)
    build_args = [data_lhs, data_rhs, output]

    if need_print:
        with build_config:
            print(tvm.lower(schedule, build_args, simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, build_args, "cce", name=kernel_name)
def custom_expm1(shape, dtype, kernel_name="cce_tf_expm1", need_build=False,
                 need_print=False):
    """
    algorithm: expm1
    calculating data's expm1, y = (e ** x) - 1, dtype is float16 or float32.

    Parameters
    ----------
    shape : shape of data.
    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32.
    kernel_name : cce kernel name, default value is "cce_tf_expm1".
    need_build : if need to build CCEC kernel, default value is False.
    need_print : if need to print the ir, default value is False.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if dtype is not supported
    """
    # [aicpu] int32_t cc_device_exp(uint32_t blockNum, uint32_t blockIdx,
    #     int32_t dataType, const void *scale, const void *shift,
    #     const void *base, int32_t dimCnt, int32_t *shape, uint32_t padC0,
    #     const void *x, void *y);
    supported_dtypes = ["float16", "float32"]

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    if inp_dtype not in supported_dtypes:
        raise RuntimeError("tf_expm1_cce only support %s while dtype is %s"
                           % (",".join(supported_dtypes), dtype))

    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    # step 1. calculate y = exp ** x by aicpu api
    device_api = "DeviceExp"
    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0

    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    # NOTE(review): base -1 presumably selects the natural base e in the
    # device api - confirm against the api documentation
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    def _call_device(ins, outs):
        return tvm.call_extern(
            "int32_t", device_api,
            block_num, block_idx, v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w"))

    output_exp = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_base, p_shape],
        _call_device, name="output_exp", dtype=inp_dtype)

    # step 2. calculate y = exp ** x - 1 by tvm
    offset = tvm.const((-1), dtype=inp_dtype)

    def _minus_one(*indice):
        return output_exp(*indice) + offset.astype(inp_dtype)

    output = tvm.compute(shape, _minus_one, name="output")

    # step 3. schedule the computation by tvm
    sch = tvm.create_schedule(output.op)
    build_args = [data_input, output]

    # step 4. build by tvm
    if need_print:
        with build_config:
            print(tvm.lower(sch, build_args, simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, build_args, "cce", name=kernel_name)
def custom_Power(shape, dtype, gamma, alpha, beta,
                 kernel_name="cce_caffe_power", need_build=False,
                 need_print=False):
    """
    calculate (alpha * data + beta) ** gamma, calculation method
    exp(gamma * log(alpha * data + beta)).
    when alpha * data + beta < 0 , the output is a meaningless value.

    Parameters
    ----------
    shape : shape of data
    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32
    gamma : power in (alpha * data + beta) ** gamma, same type as dtype
    alpha : scale in (alpha * data + beta) ** gamma, same type as dtype
    beta : shift in (alpha * data + beta) ** gamma, same type as dtype
    kernel_name : string
        kernel name in generated CCE kernel. default value is
        "cce_caffe_power"
    need_build : bool
        if need to build CCEC kernel
    need_print : bool
        if need to print Halide IR

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if dtype is not supported
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    if inp_dtype not in supported_dtypes:
        raise RuntimeError("power_cce only support %s while dtype is %s"
                           % (",".join(supported_dtypes), dtype))

    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim_x = len(shape)
    # there is no second operand: its rank, shape and data slots are filled
    # with 0 / "nullptr"
    v_ndim_y = 0
    p_shape_y = 0
    p_input_y = "nullptr"
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0

    # scale --> alpha, shift --> beta, power --> gamma
    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([gamma], inp_dtype, "p_power")
    p_shape_x = util.create_param_ptr(shape, "int32", "p_shape_x")

    def _call_device(ins, outs):
        return tvm.call_extern(
            "int32_t", device_api,
            block_num, block_idx, v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # power
            v_ndim_x,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            v_ndim_y,
            v_ndim_y,
            p_shape_y,
            pad_c0,
            p_input_y,
            outs[0].access_ptr("w"))

    output = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_power, p_shape_x],
        _call_device, name="output", dtype=inp_dtype)

    sch = tvm.create_schedule(output.op)
    build_args = [data_input, output]

    if need_print:
        with build_config:
            print(tvm.lower(sch, build_args, simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, build_args, "cce", name=kernel_name)
def custom_exp(shape, dtype, kernel_name="cce_tf_exp", need_build=False,
               need_print=False):
    """
    algorithm: exp
    calculating data's exp, y = e ** x, dtype is float16 or float32.

    Parameters
    ----------
    shape : shape of data
    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32
    kernel_name : cce kernel name, default value is "cce_tf_exp"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if dtype is not supported
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    if inp_dtype not in supported_dtypes:
        raise RuntimeError("tf_exp_cce only support %s while dtype is %s"
                           % (",".join(supported_dtypes), dtype))

    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0

    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    # NOTE(review): base -1 presumably selects the natural base e in the
    # device api - confirm against the api documentation
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    def _call_device(ins, outs):
        return tvm.call_extern(
            "int32_t", device_api,
            block_num, block_idx, v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w"))

    output = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_base, p_shape],
        _call_device, name="output", dtype=inp_dtype)

    sch = tvm.create_schedule(output.op)
    build_args = [data_input, output]

    if need_print:
        with build_config:
            print(tvm.lower(sch, build_args, simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, build_args, "cce", name=kernel_name)
def _tranpose_notchange_last_two(data, shape_5hd, dst_shape_full, dst_shape,
                                 perm, dtype, max_dim, shape_all):
    """
    permutes the dimensions and the last axis is not transposed

    Parameters
    ----------
    data: tvm.tensor
        tensor of input data
    shape_5hd: list or tuple
        shape of the permuted intermediate tensors "data_ub" and "res_5hd"
    dst_shape_full: list or tuple
        shape of the extern tensor "res" produced by the selected IR builder
    dst_shape: list or tuple
        shape of the final tensor "res_end"
    perm: list or tuple
        permutation of the dimension of tensor; converted through
        _perm_to_flag before indexing
    dtype: str
        the data type; "float32" selects the float32 IR builders, any other
        value selects the fp16 IR builders
    max_dim:
        forwarded unchanged to the selected IR builder
    shape_all:
        forwarded unchanged to the branch selector and the IR builder

    Returns
    -------
    sch: tvm.schedule
        the compute schedule
    tensor_list: list
        list of tensor, [data, res_end, res_5hd, res]
    """
    def _permute(*index):
        """
        function of permute the dimensions of data
        """
        # A generator keeps the result defined even for an empty
        # permutation, where the manual accumulation left it unbound.
        return tuple(index[item] for item in _perm_to_flag(perm))

    # c1hwnc0 to nc1hwc0
    data_ub = tvm.compute(shape_5hd,
                          lambda *index: data(*_permute(*index)),
                          name="data_ub")
    res_5hd = tvm.compute(shape_5hd,
                          lambda *index: data_ub(*index),
                          name="res_5hd")

    # nc1hwc0 to nchw: every IR builder takes the same arguments, so select
    # the builder first and emit a single extern stage.
    if dtype == "float32":
        branch = _get_ir_branch(shape_5hd, dtype, shape_all)
        if branch == "more_dim":
            ir_builder = _more_dim_ir
        elif branch == "one_dim":
            ir_builder = _one_dim_ir
        else:
            ir_builder = _split_dim_ir
    else:
        branch_fp16 = _get_ir_branch_fp16(dst_shape_full, dtype, shape_all)
        if branch_fp16 == "more_dim_fp16":
            ir_builder = _more_dim_ir_fp16
        else:
            ir_builder = _split_dim_ir_fp16

    res = tvm.extern(dst_shape_full, [res_5hd],
                     lambda ins, outs: ir_builder(
                         outs[0], ins[0], max_dim, shape_all),
                     name="res", dtype=dtype)

    res_end = tvm.extern(dst_shape, [res],
                         lambda ins, outs: _temp_ir(outs[0], ins[0]),
                         name="res_end", dtype=dtype)

    sch = tvm.create_schedule(res_end.op)
    args = [sch, data, res_5hd, data_ub, shape_5hd, dtype]
    sch, _ = _schedule_for_not_change_last(args)
    tensor_list = [data, res_end, res_5hd, res]

    return sch, tensor_list