def _tiling_axis(shape, dtype):
    """
    Calculate the tile parameters.

    Parameters
    ----------
    shape: list or tuple
        the shape of the tensor.
    dtype: str
        the dtype of the tensor.

    Returns
    -------
    split_axis: int
        the target axis that is used to tile the tensor.
    split_factor: int
        the factor used when tiling the target axis.
    """
    total_ele, ele_each_block, _ = _get_public_param(dtype)
    tiling_shape = [dim for dim in shape]
    if shape[-1] % ele_each_block != 0:
        last_ele = ((shape[-1] + ele_each_block - 1) //
                    ele_each_block) * ele_each_block
        tiling_shape[-1] = int(last_ele)

    split_axis = 0
    split_factor = 1
    for index, _ in enumerate(tiling_shape):
        ele_cnt = function_reduce(lambda x, y: x * y, tiling_shape[index:])
        if ele_cnt <= total_ele:
            split_axis = index - 1
            split_factor = total_ele // ele_cnt
            break
        elif index == len(tiling_shape) - 1:
            split_axis = index
            split_factor = total_ele
            break

    if split_axis < 0:
        split_axis = 0
        split_factor = tiling_shape[0]
    return split_axis, split_factor
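# Illustration only: the last-axis padding above is plain ceiling
# alignment, i.e. rounding shape[-1] up to the next multiple of the
# per-block element count returned by _get_public_param. A minimal
# standalone sketch of that arithmetic, assuming a block of 16 elements:
def _ceil_align_sketch(value, block=16):
    """Round value up to the next multiple of block.

    >>> _ceil_align_sketch(100)   # 100 is padded up to 7 full blocks
    112
    >>> _ceil_align_sketch(128)   # already aligned, unchanged
    128
    """
    return (value + block - 1) // block * block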
def _tilling_axis(shape, dtype):
    """
    Calculate the split parameters according to different shapes.

    Parameters
    ----------
    shape : list or tuple
        shape of the tensor
    dtype : string
        buffer data type

    Returns
    -------
    split_axis : int
        the target axis that is used to split the tensor, chosen so that
        the maximum amount of data can be stored and processed on UB
        every time.
    split_factor : int
        the factor used when splitting the target axis. For example, for
        float16 data, [1024, 1024, 256] is split to [1024, 7, 164, 256],
        and UB processes 164*256 elements every time. In this case,
        split_axis is 1 and split_factor is 164.
    """
    # number of tensors in assign_sub
    tensor_num = 2
    # ub_size_bytes is the size of UB in bytes
    ub_size_bytes = cce.CceProductParams().getParams("Unified_Buffer")
    # dtype_bytes_size is 2 for float16 and 4 for float32
    dtype_bytes_size = cce.cce_intrin.get_bit_len(dtype) // BYTES_TO_BITS
    # total_ele is the maximum number of elements that can be stored in UB
    if dtype in ("int8", "uint8"):
        dtype_bytes_size_fp16 = cce.cce_intrin.get_bit_len(
            "float16") // BYTES_TO_BITS
        total_ele = ub_size_bytes // (dtype_bytes_size +
                                      dtype_bytes_size_fp16) // tensor_num
    else:
        total_ele = ub_size_bytes // dtype_bytes_size // tensor_num

    shape_value = shape[-1]
    if dtype in ("int8", "uint8"):
        bytes_size = dtype_bytes_size + dtype_bytes_size_fp16
    else:
        bytes_size = dtype_bytes_size
    ele_num = total_ele // 16 * (shape_value * bytes_size //
                                 SHAPE_THREHOLD + 1)
    if ele_num > total_ele // 2:
        total_ele = total_ele // 2
    else:
        total_ele = total_ele // 16 * \
                    (shape_value * bytes_size // SHAPE_THREHOLD // 2 + 1)

    # initialize split_axis and split_factor
    split_axis = 0
    split_factor = 1
    # find the appropriate axis, from the first one to the last, by
    # comparing the number of elements of the split tensor with the
    # maximum number of elements that can be stored in UB
    for index, _ in enumerate(shape):
        ele_cnt = function_reduce(lambda x, y: x * y, shape[index:])
        if ele_cnt <= total_ele:
            split_axis = index - 1
            split_factor = total_ele // ele_cnt
            break

    # when even the last axis exceeds the size of UB, split the last
    # axis, and set split_factor to the maximum amount of data that can
    # be stored in UB, rounded down to a multiple of TILING_SIZE
    if shape[-1] > total_ele:
        split_axis = len(shape) - 1
        split_factor = (total_ele // TILING_SIZE) * TILING_SIZE

    # when the whole tensor fits in UB, UB can process it in one pass,
    # but the loop above has set split_axis to -1; reset split_axis to 0
    # and split_factor to shape[0]
    if split_axis < 0:
        split_axis = 0
        split_factor = shape[0]
    return split_axis, split_factor
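# Illustration only: the axis search above can be read independently of
# the CCE helpers. A self-contained sketch, assuming a fixed UB budget
# (total_ele) instead of querying CceProductParams; with a hypothetical
# budget of 42000 float16 elements, [1024, 1024, 256] reproduces the
# (split_axis=1, split_factor=164) example from the docstring, because
# 1024*256 = 262144 exceeds the budget while 256 fits, giving
# 42000 // 256 = 164.
def _find_split_sketch(shape, total_ele):
    """Pick the outermost axis whose tail product fits within total_ele.

    >>> _find_split_sketch([1024, 1024, 256], 42000)
    (1, 164)
    >>> _find_split_sketch([8, 16], 42000)  # whole tensor fits in UB
    (0, 8)
    """
    from functools import reduce
    split_axis, split_factor = 0, 1
    for index in range(len(shape)):
        ele_cnt = reduce(lambda x, y: x * y, shape[index:])
        if ele_cnt <= total_ele:
            split_axis = index - 1
            split_factor = total_ele // ele_cnt
            break
    if split_axis < 0:
        # the whole tensor fits in UB in one pass
        split_axis, split_factor = 0, shape[0]
    return split_axis, split_factor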
def ascend_requant_compute(x, req_scale, y, relu_flag=False,
                           kernel_name='ascend_requant'):
    """
    int32 -> int8

    Parameters:
    ----------
    x : the placeholder of input
    req_scale : the placeholder of the requant scale
    y : the dict of output
    relu_flag : the relu mode; when True, relu is applied to the result
    kernel_name : cce kernel name, default value is "ascend_requant"

    Returns:
    -------
    res : the result of ascend_requant
    """
    x_shape = x.shape
    x_shape_list = te.lang.cce.util.shape_to_list(x_shape)
    align_shape = x_shape_list.copy()

    # the requant scale is a scalar or a vector depending on its
    # original shape
    ori_shape_req = req_scale.op.attrs['ori_shape']
    ori_shape_req_list = te.lang.cce.util.shape_to_list(ori_shape_req)
    req_dim = function_reduce(lambda x, y: x * y, ori_shape_req_list[:])
    tensor_flag = False
    if req_dim > 1:
        tensor_flag = True

    c1_index = 1
    if _is_nz_format(x):
        c1_index = len(x_shape) - 4

    if x.op.tag == "depthwise_conv2d":
        align_shape[4] = 16
        align_shape[3] = (x_shape_list[3] + 15) // 16 * 16
        align_shape[2] = 1
        if tensor_flag:
            align_shape[1] = (x_shape_list[1] * x_shape_list[2] * 16 +
                              31) // 32 * 32 // 16
        else:
            align_shape[1] = x_shape_list[1] * x_shape_list[2]
        align_shape[0] = x_shape_list[0]
        if tensor_flag:
            res_ub = tvm.compute(
                align_shape,
                lambda i, j, a, k, l: tvm.vdeq_cast(
                    x(i, j // 2, j % 2, k, l),
                    req_scale(0, j, 0, 0, l),
                    "int8",
                    do_relu=relu_flag),
                name='s32_to_s8',
                tag="requant_vector")
        else:
            res_ub = tvm.compute(
                align_shape,
                lambda i, j, a, k, l: tvm.deq_cast(
                    x(i, j // 2, j % 2, k, l),
                    req_scale(0, 0, 0, 0, 0),
                    "int8"),
                name='s32_to_s8',
                tag="requant_scale")
    else:
        align_shape[c1_index] = (align_shape[c1_index] + 1) // 2 * 2
        align_shape[-2] = (align_shape[-2] + 15) // 16 * 16
        res_ub = _s32_to_s8_normal_compute(x, req_scale, align_shape,
                                           c1_index, tensor_flag, relu_flag)

    if _is_nz_format(x):
        res = _format_transfer_nz(align_shape, res_ub, c1_index)
        return res

    res_ub_reform = _format_transfer(align_shape, res_ub, c1_index)
    res_shape = te.lang.cce.util.shape_to_list(res_ub_reform.shape)
    res_shape[-2] = x.shape[-2]
    res = tvm.compute(res_shape,
                      lambda *indice: res_ub_reform(*indice),
                      name='requant_remove_pad',
                      tag="requant_remove_pad")
    return res
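# Illustration only: in the depthwise branch the compute lambdas read the
# input as x(i, j // 2, j % 2, k, l), treating the output's j axis as the
# flattening of two input axes whose inner extent is 2; j // 2 and j % 2
# recover the outer and inner coordinates. A minimal sketch of that index
# arithmetic:
def _split_even_index_sketch(j):
    """Map a merged index j back to (outer, inner) with inner extent 2.

    >>> [_split_even_index_sketch(j) for j in range(4)]
    [(0, 0), (0, 1), (1, 0), (1, 1)]
    """
    outer, inner = j // 2, j % 2
    assert j == outer * 2 + inner
    return outer, inner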
def ascend_dequant_s16_compute(x0, deq_scale, x1, y, relu_flag=False,
                               kernel_name='ascend_dequant_s16'):
    """
    int32 -> int16

    Parameters:
    ----------
    x0 : the placeholder of input
    deq_scale : the placeholder of the dequant scale
    x1 : the placeholder of the add input tensor
    y : the dict of output
    relu_flag : the relu mode; when True, relu is applied to the result,
        default value is False
    kernel_name : cce kernel name, default value is "ascend_dequant_s16"

    Returns:
    -------
    res : the result of ascend_dequant_s16
    """
    x0_shape = x0.shape
    x0_shape_list = te.lang.cce.util.shape_to_list(x0_shape)
    align_shape = x0_shape_list.copy()

    ori_shape_deq = deq_scale.op.attrs['ori_shape']
    ori_shape_deq_list = te.lang.cce.util.shape_to_list(ori_shape_deq)
    deq_dim = function_reduce(lambda x, y: x * y, ori_shape_deq_list[:])
    tensor_flag = False
    if deq_dim > 1:
        tensor_flag = True

    c1_index = 1
    if _is_nz_format(x0):
        c1_index = len(x0_shape) - 4

    align_shape[-2] = (align_shape[-2] + 15) // 16 * 16
    res_ub = _s32_to_s16_normal_compute(x0, deq_scale, x1, align_shape,
                                        c1_index, tensor_flag, relu_flag)

    if _is_nz_format(x0):
        res = tvm.compute(align_shape,
                          lambda *i: res_ub[i],
                          name='res',
                          tag='dequant_s16_NZ')
        return res

    res_shape = te.lang.cce.util.shape_to_list(res_ub.shape)
    res_shape[-2] = x0.shape[-2]
    res = tvm.compute(res_shape,
                      lambda *indice: res_ub(*indice),
                      name='dequant_s16_remove_pad',
                      tag="dequant_s16_remove_pad")
    return res
def reduce_2_tuple(shape):
    return (function_reduce(operator.mul, shape), )
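# Illustration only: function_reduce is presumably functools.reduce
# imported under an alias, so reduce_2_tuple collapses a shape into the
# equivalent flattened 1-D shape. Self-contained sketch:
def _reduce_2_tuple_sketch(shape):
    """
    >>> _reduce_2_tuple_sketch([4, 16, 16])
    (1024,)
    """
    import operator
    from functools import reduce
    return (reduce(operator.mul, shape), )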
def ascend_requant_s16_compute(x, req_scale, x1, y, y1, dual_output,
                               relu_flag, kernel_name='ascend_requant_s16'):
    """
    int16 -> int8

    Parameters:
    ----------
    x : the placeholder of input
    req_scale : the placeholder of req_scale
    x1 : the placeholder of x1
    y : the dict of output
    y1 : the dict of output1
    dual_output : the dual output mode; when True, two results are
        returned, default value is False
    relu_flag : the relu mode; when True, relu is applied to the result,
        default value is False
    kernel_name : cce kernel name, default value is "ascend_requant_s16"

    Returns:
    -------
    res : the result of ascend_requant_s16, as a list
    """
    x_shape = x.shape
    x_shape_list = te.lang.cce.util.shape_to_list(x_shape)
    align_shape = x_shape_list.copy()

    ori_shape_req = req_scale.op.attrs['ori_shape']
    ori_shape_req_list = te.lang.cce.util.shape_to_list(ori_shape_req)
    req_dim = function_reduce(lambda x, y: x * y, ori_shape_req_list[:])
    tensor_flag = False
    if req_dim > 1:
        tensor_flag = True

    c1_index = 1
    if _is_nz_format(x):
        c1_index = len(x_shape) - 4

    align_shape[c1_index] = (align_shape[c1_index] + 1) // 2 * 2
    res_s16, res_ub = _s16_to_s8_normal_compute(x, x1, req_scale, x_shape,
                                                align_shape, c1_index,
                                                tensor_flag, relu_flag)
    res = _format_transfer(align_shape, res_ub, c1_index)
    if _is_nz_format(x):
        res = tvm.compute(align_shape,
                          lambda *i: res[i],
                          name='res',
                          tag='requant_s16_NZ')

    if dual_output:
        return [res, res_s16]
    return [res]
def ascend_quant(x, y, scale, offset, sqrt_mode=False, round_mode="Round",
                 kernel_name="ascend_quant"):
    """
    float16/float32 -> int8

    Parameters:
    ----------
    x : the dict of input
    y : the dict of output
    scale : the data of scale
    offset : the data of offset
    sqrt_mode : the sqrt mode; when True, sqrt is applied to the result
    round_mode : the data conversion mode
    kernel_name : cce kernel name, default value is "ascend_quant"

    Returns:
    -------
    None
    """
    _check_params(x, y, scale, offset, sqrt_mode, round_mode, kernel_name)
    shape = x.get("shape")
    input_dtype = x.get("dtype").lower()
    input_format = x.get("format")
    x_l1_fusion_type, y_l1_fusion_type, attr = _check_l1_fusion(x, y)
    if input_format == "NC1HWC0":
        if x_l1_fusion_type != -1:
            input_shape = shape
            attr["l1_fusion_flag"] = x_l1_fusion_type
        else:
            # change to N, C1, H*W, C0
            input_shape = (shape[0], shape[1], shape[2] * shape[3], shape[4])
    else:
        # NZ: change to 1, C1, N1*N0, C0, equivalent to N, C1, H*W, C0
        batch = 1
        if len(shape) > 4:
            batch = function_reduce(lambda x, y: x * y, shape[:-4])
        input_shape = (batch, shape[-4], shape[-3] * shape[-2], shape[-1])
    input_x = tvm.placeholder(input_shape,
                              name="input_x",
                              dtype=input_dtype,
                              attrs=attr)
    res = ascend_quant_compute(input_x, y, scale, offset, sqrt_mode,
                               round_mode, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    config = {"name": kernel_name, "tensor_list": [input_x, res]}
    te.lang.cce.cce_build_code(sch, config)
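# Illustration only: a rough NumPy model of the quantization numerics,
# under the assumption that the op computes y = round(x * scale + offset)
# saturated to int8, and that sqrt_mode applies the scale as two
# multiplications by sqrt(scale). The real op uses the chip's vector
# round modes (round_mode) and the NC1HWC0/NZ layouts handled above.
def _quant_numerics_sketch(x, scale, offset, sqrt_mode=False):
    """
    >>> import numpy as np
    >>> _quant_numerics_sketch(np.array([0.5, -1.0, 3.2]), 100.0, 1.0).tolist()
    [51, -99, 127]
    """
    import numpy as np
    if sqrt_mode:
        # assumed sqrt_mode behavior: scale applied in two sqrt steps
        x = x * np.sqrt(scale) * np.sqrt(scale)
    else:
        x = x * scale
    y = np.rint(x + offset)  # round half to even stands in for "Round"
    return np.clip(y, -128, 127).astype(np.int8)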
def ascend_dequant(x, deq_scale, y, sqrt_mode=False, relu_mode=False,
                   kernel_name='ascend_dequant'):
    """
    int32 -> fp16

    Parameters:
    ----------
    x : the dict of input
    deq_scale : the dict of the dequant scale
    y : the dict of output
    sqrt_mode : the sqrt mode; when True, sqrt is applied to the result
    relu_mode : the relu mode; when True, relu is applied to the result
    kernel_name : cce kernel name, default value is "ascend_dequant"

    Returns:
    -------
    None
    """
    _check_params(x, deq_scale, kernel_name)
    shape_x = x.get("shape")
    shape_deq = deq_scale.get("shape")
    dtype_x = x.get("dtype")
    dtype_deq = deq_scale.get("dtype")
    x_format = x.get("format")
    ori_shape_deq = deq_scale.get("ori_shape")
    attr = {"ori_shape": ori_shape_deq}

    if dtype_deq == "uint64" and sqrt_mode:
        raise RuntimeError("ascend dequant: when the deq_scale dtype is "
                           "uint64, sqrt_mode only supports False")

    if x_format == "NC1HWC0":
        # N, C1, H*W, C0
        shape_x = [shape_x[0], shape_x[1], shape_x[2] * shape_x[3],
                   shape_x[4]]
        shape_deq = [shape_deq[0], shape_deq[1],
                     shape_deq[2] * shape_deq[3], shape_deq[4]]
    else:
        # C1,N1,N0,C0 change to 1,C1,N1*N0,C0, equivalent to N,C1,H*W,C0
        x_batch = 1
        if len(shape_x) > 4:
            x_batch = function_reduce(lambda x, y: x * y, shape_x[:-4])
        shape_x = [x_batch, shape_x[-4], shape_x[-3] * shape_x[-2],
                   shape_x[-1]]
        shape_deq = [shape_deq[0], shape_deq[1],
                     shape_deq[2] * shape_deq[3], shape_deq[4]]

    input_x = tvm.placeholder(shape_x, dtype_x, "x")
    input_deq = tvm.placeholder(shape_deq,
                                name="deq_scale",
                                dtype=dtype_deq,
                                attrs=attr)

    with tvm.target.cce():
        res = ascend_dequant_compute_v2(input_x, input_deq, y, sqrt_mode,
                                        relu_mode, kernel_name)
        sch = generic.auto_schedule(res)
    config = {"name": kernel_name, "tensor_list": [input_x, input_deq, res]}
    te.lang.cce.cce_build_code(sch, config)
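# Illustration only: the NZ branch above collapses any leading batch
# dims into one and merges N1*N0, mapping C1,N1,N0,C0 (with optional
# leading batch dims) onto the N,C1,H*W,C0 layout. A self-contained
# sketch of that shape bookkeeping:
def _nz_to_4d_sketch(shape):
    """
    >>> _nz_to_4d_sketch([2, 3, 4, 16, 16, 32])
    [6, 4, 256, 32]
    """
    from functools import reduce
    batch = 1
    if len(shape) > 4:
        batch = reduce(lambda x, y: x * y, shape[:-4])
    return [batch, shape[-4], shape[-3] * shape[-2], shape[-1]]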
def ascend_dequant_compute_v2(x, deq_scale, y, sqrt_mode=False,
                              relu_flag=False,
                              kernel_name='ascend_dequant'):
    """
    int32 -> fp16

    Parameters:
    ----------
    x : the placeholder of input
    deq_scale : the placeholder of the dequant scale
    y : the dict of output
    sqrt_mode : the sqrt mode; when True, sqrt is applied to the result
    relu_flag : the relu mode; when True, relu is applied to the result
    kernel_name : cce kernel name, default value is "ascend_dequant"

    Returns:
    -------
    res : the result of ascend_dequant
    """
    ori_shape_deq = deq_scale.op.attrs['ori_shape']
    ori_shape_deq_list = te.lang.cce.util.shape_to_list(ori_shape_deq)
    deq_dim = function_reduce(lambda x, y: x * y, ori_shape_deq_list[:])
    tensor_flag = False
    if deq_dim > 1:
        tensor_flag = True

    align_shape = te.lang.cce.util.shape_to_list(x.shape)
    align_shape[-2] = (align_shape[-2] + 15) // 16 * 16

    x_ub = tvm.compute(x.shape, lambda *i: x(*i), name='x_ub',
                       tag="dequant_x_ub")
    deq_ub = tvm.compute(deq_scale.shape, lambda *i: deq_scale(*i),
                         name='deq_ub', tag="dequant_deq_ub")
    x_l0c = tvm.compute(align_shape, lambda *i: x_ub(*i), name='x_l0c',
                        tag="dequant_x_l0c")

    if tensor_flag:
        if _is_support_v200_instruction():
            res = _dequant_v200_v2(x_l0c, deq_ub, align_shape, x.shape,
                                   relu_flag, tensor_flag)
        else:
            res = _vector_dequant_v100_v2(x_l0c, deq_ub, align_shape,
                                          x.shape, relu_flag, sqrt_mode)
    else:
        if _is_support_v200_instruction():
            res = _dequant_v200_v2(x_l0c, deq_ub, align_shape, x.shape,
                                   relu_flag, tensor_flag)
        else:
            res = _scalar_dequant_v100_v2(x_l0c, deq_ub, align_shape,
                                          x.shape, relu_flag, sqrt_mode)
    return res
def ascend_dequant_compute(x, deq_scale, y, sqrt_mode=False, relu_flag=False,
                           kernel_name='ascend_dequant'):
    """
    int32 -> fp16

    Parameters:
    ----------
    x : the placeholder of input
    deq_scale : the placeholder of the dequant scale
    y : the dict of output
    sqrt_mode : the sqrt mode; when True, sqrt is applied to the result
    relu_flag : the relu mode; when True, relu is applied to the result
    kernel_name : cce kernel name, default value is "ascend_dequant"

    Returns:
    -------
    res : the result of ascend_dequant
    """

    def shape_to_list(shape):
        """
        trans shape to list shape
        """
        tmp = []
        for i in shape:
            tmp.append(i.value)
        return tmp

    x_shape = x.shape
    deq_shape = deq_scale.shape
    x_shape_list = shape_to_list(x_shape)
    deq_shape_list = shape_to_list(deq_shape)
    ori_shape_deq = deq_scale.op.attrs['ori_shape']
    ori_shape_deq_list = te.lang.cce.util.shape_to_list(ori_shape_deq)
    deq_dim = function_reduce(lambda x, y: x * y, ori_shape_deq_list[:])
    tensor_flag = False
    if deq_dim > 1:
        tensor_flag = True

    align_shape = x_shape_list.copy()
    if x.op.tag != "depthwise_conv2d":
        align_shape[2] = (align_shape[2] + 15) // 16 * 16

    if x.op.tag == "matmul" or x.op.tag == "matmul_gemv":
        shape_matmul_origin = x.op.attrs['shape']
        c1_index = len(x_shape) - 4
        res = _matmul_compute(x, x_shape, deq_scale, sqrt_mode, relu_flag,
                              shape_matmul_origin, c1_index, tensor_flag)
        return res

    if x.op.tag == "depthwise_conv2d":
        align_shape[4] = 16
        align_shape[3] = (x_shape_list[3] + 15) // 16 * 16
        align_shape[2] = 1
        if deq_shape_list[1] == 1:
            tensor_dict = {}
            tensor_dict["mad_ubuf"] = x.op.input_tensors[0]
            if x.op.attrs['bias_flag'].value == 1:
                tensor_dict["flag_is_dequant_bias"] = True
                tensor_dict["mad_after_bias"] = tensor_dict[
                    "mad_ubuf"].op.input_tensors[0]
                tensor_dict["mad_bias"] = tensor_dict[
                    "mad_after_bias"].op.input_tensors[0]
                tensor_dict["mad"] = tensor_dict[
                    "mad_after_bias"].op.input_tensors[1]
                tensor_dict["mad_bias_ub_brc"] = tensor_dict[
                    "mad_bias"].op.input_tensors[0]
                tensor_dict["bias_gm"] = tensor_dict[
                    "mad_bias_ub_brc"].op.input_tensors[0]
            else:
                tensor_dict["mad"] = tensor_dict[
                    "mad_ubuf"].op.input_tensors[0]
            tensor_dict["im2col_fractal"] = tensor_dict[
                "mad"].op.input_tensors[0]
            tensor_dict["filter_reshape"] = tensor_dict[
                "mad"].op.input_tensors[1]
            tensor_dict["filter_buf"] = tensor_dict[
                "filter_reshape"].op.input_tensors[0]
            tensor_dict["im2col_row_major"] = tensor_dict[
                "im2col_fractal"].op.input_tensors[0]
            tensor_dict["fmap"] = tensor_dict[
                "im2col_row_major"].op.input_tensors[0]
            x_ori_shape = tensor_dict["fmap"].op.attrs["ori_shape"]
            x_ori_shape_list = te.lang.cce.util.shape_to_list(x_ori_shape)
            align_shape[1] = (x_ori_shape_list[3] + 15) // 16
        else:
            align_shape[1] = (deq_shape_list[1] * deq_shape_list[4]) // 16
        align_shape[0] = x_shape_list[0]
        if tensor_flag:
            if _is_support_v200_instruction():
                res = _vector_depthwise_fused_v200(x, x_shape, align_shape,
                                                   deq_scale, relu_flag)
            else:
                res = _vector_depthwise_fused_v100(x, x_shape, align_shape,
                                                   deq_scale, relu_flag,
                                                   sqrt_mode)
        else:
            if _is_support_v200_instruction():
                res = _scalar_depthwise_fused_v200(x, x_shape, align_shape,
                                                   deq_scale, relu_flag)
            else:
                res = _scalar_depthwise_fused_v100(x, x_shape, align_shape,
                                                   deq_scale, relu_flag,
                                                   sqrt_mode)
        return res

    if tensor_flag:
        if _is_support_v200_instruction():
            res = _vector_dequant_v200(x, x_shape, align_shape, deq_scale,
                                       relu_flag)
        else:
            res = _vector_dequant_v100(x, x_shape, align_shape, deq_scale,
                                       relu_flag, sqrt_mode)
    else:
        if _is_support_v200_instruction():
            res = _scalar_dequant_v200(x, x_shape, align_shape, deq_scale)
        else:
            res = _scalar_dequant_v100(x, x_shape, align_shape, deq_scale,
                                       relu_flag, sqrt_mode)
    return res
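# Illustration only: a rough NumPy model of the dequant numerics, under
# the assumption that the op computes y = x * deq_scale (per channel when
# deq_scale has more than one element, i.e. tensor_flag above), that
# sqrt_mode applies deq_scale a second time, and that relu_flag clamps
# negatives. The real op runs on L0C/UB buffers through the v100/v200
# instruction variants selected above.
def _dequant_numerics_sketch(x_s32, deq_scale, sqrt_mode=False,
                             relu_flag=False):
    """
    >>> import numpy as np
    >>> x = np.array([-200, 300], dtype=np.int32)
    >>> _dequant_numerics_sketch(x, 0.01, relu_flag=True).tolist()
    [0.0, 3.0]
    """
    import numpy as np
    y = x_s32.astype(np.float32) * deq_scale
    if sqrt_mode:
        # assumed sqrt_mode behavior: deq_scale applied twice
        y = y * deq_scale
    if relu_flag:
        y = np.maximum(y, 0.0)
    return y.astype(np.float16)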