def __init__(self, input_x, output_y, num, axis, kernel_name): self.input_x = input_x self.num = num self.kernel_name = kernel_name self.dtype = input_x.get("dtype").lower() self.dim_info_vars = [] self.ub_tensor_list = [] self.res_tensor_list = [] self.virtual_node = None self.sch_list = [] self.arg_list = [] self.rules = [] self.compile_vars = {} self.dim_vars = [] self.dim_bounds = [] self.output_shape = [] self.x_reshape = None self.left_range = None self.right_range = None self._normalize_shape() self._trans_input_shape(axis) self.new_axis = 1 self._input_placeholder = None self.block_idx = tvm.thread_axis('blockIdx.x') self.ub_size = cce_conf.get_soc_spec(cce_conf.UB_SIZE) self.core_num = cce_conf.get_soc_spec(cce_conf.CORE_NUM)
def _tiling(shape, dtype, wo_out, stride, kernel): ub_size_bytes = get_soc_spec(UB_SIZE) dtype_bytes_size = get_bit_len(dtype) // 8 total_ele = ub_size_bytes // dtype_bytes_size // 2 nc1h, _, c0 = shape core_num = get_soc_spec(CORE_NUM) fused_axis_block_factor, w_block_factor = nc1h, wo_out if fused_axis_block_factor >= core_num: fused_axis_block_factor = _ceil(fused_axis_block_factor, core_num) else: w_block_factor = _ceil(wo_out, _ceil(core_num, fused_axis_block_factor)) fused_axis_block_factor = 1 wo_buffer_num = 4 wi_buffer_num = 2 # for wi = (wo - 1) * stride + kernel # wo_buffer_num * N * C1 * H * Wo * C0 + wi_buffer_num * N * C1 * H * Wi * C0 <= total_ele nc1wo_limit = (total_ele // (c0) + (stride - kernel) * wi_buffer_num) // (wo_buffer_num + stride * wi_buffer_num) nc1_limit = (total_ele // (c0)) // (wo_buffer_num * wo_out + wi_buffer_num * wo_out * stride - wi_buffer_num * (stride - kernel)) if nc1_limit > 1: fused_factor, wo_factor = min(fused_axis_block_factor, nc1_limit), w_block_factor else: fused_factor, wo_factor = 1, nc1wo_limit // 8 * 8 return [fused_axis_block_factor, w_block_factor], [fused_factor, wo_factor]
def _min_l1_byte(): # Forth : L1 limitation, Mainly required by chip al1_min_byte = C0 * C0 * 2 if dedy_w % C0 == 0: bl1_min_byte = filter_h_dilation * fmap_w * C0 * 2 else: bl1_min_byte = (filter_h_dilation + stride_h) * fmap_w * C0 * 2 l1_size = get_soc_spec("L1_SIZE") # L1 size if (al1_min_byte + bl1_min_byte) > l1_size: args_dict = {'errCode': 'E60022'} raise RuntimeError(args_dict, err_mana.get_error_message(args_dict))
def conv_layer_cce_shape_calc(shape_in, shape_w, in_dtype, \ w_dtype, optim_dict): """ Parameters ---------- shape_in: shape of feature map shape_w: shape of weight in_dtype: the feature map data type w_dtype: the weight data type optim_dict: optimize feature dict Returns ------- None """ block_size_k = CUBE_MKN[in_dtype]['mac'][1] if optim_dict["c0_optim_flg"] and cce_conf.get_soc_spec("SOC_VERSION") in \ ("Ascend710", "Ascend615", "Ascend610", "Hi3796CV300CS"): block_size_k = 4 fmap_shape_nc1hwc0 = (shape_in[0], (shape_in[1] + block_size_k - 1) \ // block_size_k, shape_in[2], shape_in[3], block_size_k) out_channel, in_channel_weight, filter_h, filter_w = shape_w block_size_k = CUBE_MKN[w_dtype]['mac'][1] block_size_n = CUBE_MKN[w_dtype]['mac'][2] if optim_dict["c0_optim_flg"]: filter_shape_frac_z = ((4 * filter_h * filter_w + block_size_k - 1) \ // block_size_k, out_channel // block_size_n, block_size_n, block_size_k) else: filter_shape_frac_z = (in_channel_weight * filter_h * filter_w \ // block_size_k, out_channel // block_size_n, block_size_n, block_size_k) return fmap_shape_nc1hwc0, filter_shape_frac_z
def _get_mad_dtype(w_dtype): """ algorithm: get the dtype of mad Parameters ---------- w_dtype: the dtype of filter Returns ------- mad dtype """ mad_dtype = "float32" if w_dtype == 'int8': mad_dtype = "int32" elif get_soc_spec("SOC_VERSION") in ("Hi3796CV300ES", "Hi3796CV300CS"): mad_dtype = "float16" return mad_dtype
def conv3d_backprop_filter_cce(shape_x, shape_out_backprop, filter_sizes, strides, pads, dilations=(1, 1, 1, 1), x_dtype='float16', out_backprop_dtype='float16', res_dtype='float32', kernel_name="conv3d_backprop_filter_cce"): """ Topi interface of conv3d backprop filter Parameters: ---------- shape_x : The shape of feature map. 5-D with shape [batch, depth, channels, height, weight]. shape_out_backprop : The shape of gradients. 5-D with shape [batch, depth, channels, height, weight]. filter_sizes : The shape of filter. 5-D with shape [batch, depth, channels, height, weight]. strides : A list of ints. The stride of the sliding window. pads : "SAME"or"VALID", indicating the type of pads algorithm to use, or list. dilations : An optional list of ints. Default value is [1, 1, 1, 1]. x_dtype : The dtype of feature map data. Default value is float16. out_backprop_dtype : The dtype of gradients data. Default value is float16. res_dtype : The dtype of result(De/Dw) data. Default value is float32. kernel_name : Cce kernel name. Default value is "conv3d_backprop_filter_cce" need_build : If need to build CCEC kernel. Default value is False. Returns : None ---------- """ def _ceil(x_1, x_2): if x_2 == 0: args_dict = { 'errCode': 'E62502', 'first_operand': str(x_1), 'second_operand': str(x_2) } raise RuntimeError(args_dict, err_mana.get_error_message(args_dict)) return (x_1 + x_2 - 1) // x_2 if get_soc_spec("SOC_VERSION") in ("Hi3796CV300ES", "Hi3796CV300CS"): res_dtype = "float16" res = check_conv3dbp_filter_params(shape_x, shape_out_backprop, filter_sizes, strides, pads, dilations, x_dtype, out_backprop_dtype, res_dtype, kernel_name) shape_x, shape_out_backprop, filter_sizes, strides, pads, dilations, \ x_dtype, out_backprop_dtype, res_dtype, kernel_name = res fmap_batch, fmap_depth, fmap_channel, fmap_h, fmap_w = shape_x dedy_batch, dedy_d, dedy_channel, dedy_h, dedy_w = shape_out_backprop c0_size = cce_params.C0_SIZE # Channel axis should be align with 16 shape_dedy = (dedy_batch, dedy_d, \ _ceil(dedy_channel, c0_size), dedy_h, dedy_w, c0_size) shape_fmap = (fmap_batch, fmap_depth, \ _ceil(fmap_channel, c0_size), fmap_h, fmap_w, c0_size) dedy = tvm.placeholder(shape_dedy, name="dedy", dtype=out_backprop_dtype) fmap = tvm.placeholder(shape_fmap, name="fmap", dtype=x_dtype) dedw = te.lang.cce.conv3d_backprop_filter_compute( input_x=fmap, out_backprop=dedy, filter_sizes=filter_sizes, strides=strides, padding=pads, dilations=dilations, res_dtype=res_dtype, kernel_name=kernel_name) tensor_list_input = [fmap, dedy] with tvm.target.cce(): sch = generic.auto_schedule(dedw) real_outs = sch.cce_special["real_out_tensor"] tensor_list = tensor_list_input + real_outs config = {"name": kernel_name, "tensor_list": tensor_list} te.lang.cce.cce_build_code(sch, config)
def _select_format(params): inputs = params[0] weights = params[1] c0_optim_flg = False shape_x = inputs.get("ori_shape") shape_x = scalar2tensor_one(shape_x) format_fm = inputs.get("ori_format") if format_fm == "NCHW": shape_fm = shape_x elif format_fm == "NHWC": shape_fm = [shape_x[0], shape_x[3], shape_x[1], shape_x[2]] else: err_man.raise_err_input_format_invalid("conv2d", "inputs", \ ["NCHW", "NHWC"], format_fm) shape_w = weights.get("ori_shape") if (not isinstance(shape_w, (tuple, list))) or len(shape_w) != 4: err_man.raise_err_should_be_4d("conv2d", "weights") format_w = weights.get("ori_format") if format_w == "NCHW": shape_filter = shape_w elif format_w == "NHWC": shape_filter = [shape_w[0], shape_w[3], shape_w[1], shape_w[2]] elif format_w == "HWCN": shape_filter = [shape_w[3], shape_w[2], shape_w[0], shape_w[1]] else: err_man.raise_err_input_format_invalid("conv2d", "weights", \ ["NCHW", "NHWC", "HWCN"], format_w) if shape_fm[1] <= 4: c0_optim_flg = True if (shape_filter[2] == 1) and (shape_filter[3] == 1): c0_optim_flg = False # format NC1HWC0_C04 can only be used at first conv layer # for those soc using NC1HWC0_C04, ensure is_first_layer == 1 if inputs.get("is_first_layer") != 1 and \ cce_conf.get_soc_spec("SOC_VERSION") \ in ("Ascend710", "Ascend615", "Ascend610", "Hi3796CV300CS"): c0_optim_flg = False if c0_optim_flg: if cce_conf.get_soc_spec("SOC_VERSION") in \ ("Ascend710", "Ascend615", "Ascend610", "Hi3796CV300CS"): input0 = gen_param(classify="input0", name="x", datatype="float16,float16,int8,int8", format="NC1HWC0_C04,NC1HWC0," "NC1HWC0_C04,NC1HWC0") else: input0 = gen_param(classify="input0", name="x", datatype="float16,float16,int8,int8", format="NC1HWC0,NC1HWC0," "NC1HWC0,NC1HWC0") input1 = gen_param(classify="input1", name="filter", datatype="float16,float16,int8,int8", format="FRACTAL_Z_C04,FRACTAL_Z," "FRACTAL_Z_C04,FRACTAL_Z") input2 = gen_param(classify="input2", name="bias", datatype="float16,float16,int32,int32", format="ND,ND,ND,ND") input3 = gen_param(classify="input3", name="offset_w", datatype="int8,int8,int8,int8", format="ND,ND,ND,ND") output0 = gen_param(classify="output0", name="y", datatype="float16,float16,int32,int32", format="NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0") else: # only dynamic_hw or dynamic_batch is supported by dynamic conv2d if (shape_fm[0] == -1 and -1 not in shape_fm[1:]) or \ (shape_fm[2] == -1 and shape_fm[3] == -1 and -1 not in shape_fm[:2]): input0 = gen_param(classify="input0", name="x", datatype="float16", format="NC1HWC0", unknownshape_format="NC1HWC0") input1 = gen_param(classify="input1", name="filter", datatype="float16", format="FRACTAL_Z", unknownshape_format="FRACTAL_Z") input2 = gen_param(classify="input2", name="bias", datatype="float16", format="ND") input3 = gen_param(classify="input3", name="offset_w", datatype="int8", format="ND") output0 = gen_param(classify="output0", name="y", datatype="float16", format="NC1HWC0", unknownshape_format="NC1HWC0") else: input0 = gen_param(classify="input0", name="x", datatype="float16,int8", format="NC1HWC0,NC1HWC0") input1 = gen_param(classify="input1", name="filter", datatype="float16,int8", format="FRACTAL_Z,FRACTAL_Z") input2 = gen_param(classify="input2", name="bias", datatype="float16,int32", format="ND,ND") input3 = gen_param(classify="input3", name="offset_w", datatype="int8,int8", format="ND,ND") output0 = gen_param(classify="output0", name="y", datatype="float16,int32", format="NC1HWC0,NC1HWC0") return [input0, input1, input2, input3, output0]
def conv_layer_cce_para_check(shape_in, shape_w, padh, padw, strideh, stridew, in_dtype, w_dtype, res_dtype, offset_w_dtype, bias, kernel_name, dilateh=1, dilatew=1, optim_dict=None, fusion_para=None): """ Parameters ---------- shape_in: shape of feature map shape_w: shape of weight padh: H direction padding padw: W direction padding strideh: H direction stride stridew: W direction stride in_dtype: the feature map data type w_dtype: the weight data type res_dtype: the result data type offset_w_dtype: weight offset data type, default 'int32' bias: the tag for bias or not fusion_para: the config for L1 or L2 Fusion kernel_name: cce kernel name dilateh: H direction spacing between kernel dilatew: W direction spacing between kernel optim_dict: optimize feature dict Returns ------- None """ check_kernel_name(kernel_name) check_dtype_rule(offset_w_dtype, ['int32']) if cce_conf.get_soc_spec("SOC_VERSION") in ("Ascend310", "Hi3796CV300ES", \ "Ascend710", "Ascend615", "Ascend610", "Hi3796CV300CS"): check_dtype_rule(in_dtype, ('int8', "float16")) check_dtype_rule(w_dtype, ('int8', "float16")) check_dtype_rule(res_dtype, ('int32', "float16")) else: check_dtype_rule(in_dtype, ['float16']) check_dtype_rule(w_dtype, ['float16']) check_dtype_rule(res_dtype, ['float16']) if isinstance(padh, list): if len(padh) != PAD_SHAPE_DIM: err_man.raise_err_specific_user("conv2d", "Dimension must be "\ + str(PAD_SHAPE_DIM) + \ " when padh is a list.") pad_top = padh[0] pad_bottom = padh[1] else: pad_top = padh pad_bottom = padh if isinstance(padw, list): if len(padw) != PAD_SHAPE_DIM: err_man.raise_err_specific_user("conv2d", "Dimension must be "\ + str(PAD_SHAPE_DIM) + \ " when padw is a list.") pad_left = padw[0] pad_right = padw[1] else: pad_left = padw pad_right = padw if optim_dict is None: optim_dict = {"c0_optim_flg": False} optim_off = shape_in[1] > 4 or shape_w[1] > 4 or \ (shape_w[2] == 1 and shape_w[3] == 1) if optim_dict.get("c0_optim_flg") is True: if optim_off: err_man.raise_err_specific_user("conv2d", "Invalid "\ + "config for c0=4 optimize feature.") if fusion_para is None: fusion_para = {"input_memory_type": 0, "output_memory_type": 0, "valid_shape": (), "slice_offset": (), \ "l1_fusion_type": -1, \ "fmap_l1_addr_flag": 0, \ "fmap_l1_valid_size": -1} dilation_not_pass = (dilateh > 1 or dilatew > 1) and w_dtype == 'int8' if dilation_not_pass: err_man.raise_err_specific_user("conv2d", "Quant conv does not "\ + "support dilate > 1.") shape_in, shape_w = check_conv_shape(shape_in, shape_w, pad_top, pad_bottom, pad_left, pad_right, strideh, stridew, in_dtype, w_dtype, fusion_para, optim_dict, dilateh, dilatew) return shape_in, shape_w
def calc_para_from_tensor(inputs, weights, bias, offset_w, strides, pads, dilations, offset_x, kernel_name, data_format="NCHW"): shape_w = [] for i in weights.op.attrs['ori_shape']: shape_w.append(i.value) shape_fm = [] for i in inputs.shape: shape_fm.append(i.value) input_h = shape_fm[2] input_w = shape_fm[3] format_w = weights.op.attrs['ori_format'].value all_fmt = ["NCHW", "NHWC", "HWCN"] if format_w not in all_fmt: err_man.raise_err_input_format_invalid("conv2d", \ "weights", ["NCHW", "NHWC", "HWCN"], format_w) pos_c = format_w.find('C') pos_h = format_w.find('H') pos_w = format_w.find('W') weight_h = shape_w[pos_h] weight_w = shape_w[pos_w] shape_c = shape_w[pos_c] if len(strides) != 4: err_man.raise_err_should_be_4d("conv2d", "strides") if len(dilations) != 4: err_man.raise_err_should_be_4d("conv2d", "directions") format_x = inputs.op.attrs['ori_format'].value all_fmt = ["NCHW", "NHWC"] if format_x not in all_fmt: err_man.raise_err_input_format_invalid("conv2d", \ "input", ["NCHW", "NHWC"], format_x) pos_h = data_format.find('H') pos_w = data_format.find('W') strideh = strides[pos_h] stridew = strides[pos_w] dlt_h = dilations[pos_h] dlt_w = dilations[pos_w] if len(pads) == 4: padh = [pads[0], pads[1]] padw = [pads[2], pads[3]] else: err_man.raise_err_should_be_4d("conv2d", "pads shape") fusion_para = _conv2d_compute_fusion_para(inputs) valid_shape = fusion_para.get("valid_shape") if valid_shape and valid_shape[2] == shape_fm[2]: valid_shape = () fusion_para["valid_shape"] = () fusion_para["slice_offset"] = () if valid_shape: input_h = valid_shape[2] input_w = valid_shape[3] strideh = _trans_stride(input_h, weight_h, strideh, padh, dlt_h) stridew = _trans_stride(input_w, weight_w, stridew, padw, dlt_w) para_dict = {"pad_h": padh, "pad_w": padw, "stride_h": strideh, "stride_w": stridew, "dilate_h": dlt_h, "dilate_w": dlt_w, "offset_x": offset_x, "filter_h": weight_h, "filter_w": weight_w, "bias_tensor": bias, "offset_w_tensor": offset_w, "fusion_para": fusion_para, "kernel_name": kernel_name} if cce_conf.get_soc_spec("SOC_VERSION") in \ ("Hi3796CV300ES", "Hi3796CV300CS"): para_dict["mad_dtype"] = "float16" if weights.dtype != "float16": para_dict["mad_dtype"] = "int32" else: if cce_conf.get_soc_spec("SOC_VERSION") in ("Ascend310",) \ and weights.dtype == "int8": para_dict["mad_dtype"] = "int32" c0_optim_flg = False if shape_c <= 4 and ("format" in weights.op.attrs and weights.op.attrs['format'].value == "FRACTAL_Z_C04"): c0_optim_flg = True if (weight_h == 1) and (weight_w == 1): err_man.raise_err_specific_user("conv2d", "weight shape does "\ + "not support that H and W are both equal to 1 when C0=4.") if fusion_para["input_memory_type"] == 1: err_man.raise_err_specific_input_shape("conv2d", "c0 optim not "\ + "support fmap from L1 directly (instead of DDR)") optim_dict = {"c0_optim_flg": c0_optim_flg} return para_dict, optim_dict
def get_tilling(m_dim, k_dim, n_dim): """ get tilling parameters Parameters ---------- k_dim: int k axis length Returns ------- tilling_info: dict tilling parameters """ block_num = cce_conf.get_soc_spec(cce_conf.CORE_NUM) l1_size = cce_conf.get_soc_spec(cce_conf.L1_SIZE) l1_limit = l1_size // 4 // 2 ub_size = cce_conf.get_soc_spec(cce_conf.UB_SIZE) ub_limit = ub_size // 2 tilling_info = {} m_l0_factor = 1 block_factor = 1 if m_dim > block_num: block_factor = m_dim // block_num n_l0_factor = n_dim k_l0_factor = k_dim c_0 = 16 dtype_mad_size = 4 fracz_size = c_0 * c_0 * dtype_mad_size one_mn_size = k_l0_factor * fracz_size if k_l0_factor > 32: k_l0_factor = 32 one_mn_size = k_l0_factor * fracz_size ub_used = m_l0_factor * n_l0_factor * one_mn_size while ub_used > ub_limit: if m_l0_factor > 1: m_l0_factor -= 1 else: n_l0_factor -= 1 ub_used = m_l0_factor * n_l0_factor * one_mn_size if m_l0_factor > 1: while m_dim % m_l0_factor != 0: m_l0_factor -= 1 if n_l0_factor > 1: while n_dim % n_l0_factor != 0: n_l0_factor -= 1 if l1_limit > one_mn_size * m_l0_factor: m_l1_factor = 1 else: m_l1_factor = l1_limit // (one_mn_size * m_l0_factor) if l1_limit > one_mn_size * n_l0_factor: n_l1_factor = 1 else: n_l1_factor = l1_limit // (one_mn_size * n_l0_factor) tilling_info["block"] = block_factor tilling_info["m_l1"] = m_l1_factor tilling_info["n_l1"] = n_l1_factor tilling_info["k_l1"] = 1 tilling_info["m_l0"] = 1 tilling_info["n_l0"] = n_l0_factor tilling_info["k_l0"] = k_l0_factor return tilling_info
def _is_support_v200_instruction(): if cce_conf.get_soc_spec("SOC_VERSION") in ("Ascend710", "Ascend610", "Ascend615", "Hi3796CV300CS"): return True return False
def conv2dcompress_compute(inputs, weight_compress, compress_index, bias, offset_w, outputs, strides, pads, dilations, groups=1, data_format='NHWC', offset_x=0, kernel_name="conv2dcompress"): """ conv2dcompress compute Notice ------ only used by framework combine with IR Parameters ---------- inputs: tvm placeholder input 5hd feature map tensor weight_compress: tvm placeholder input frac_z compress weight tensor compress_index: tvm placeholder input ND compress index outputs: tvm placeholder output tensor, dtype must be assigned bias: tvm placeholder or None input 1d bias tensor offset_w: tvm placeholder or None offset_w bias tensor strides: tuple/list of 4 integers stride on H/W, format sensitive pads: tuple/list of 4 integers [pad_top, pad_bottom, pad_left, pad_right] dilations: tuple/list of 4 integers dilation on H/W, format sensitive groups: int param for group covolution data_format: string input data format offset_x: int offset for fmap Returns ------- tvm compute """ compress_index_shape = compress_index.shape[0] para_dict, optim_dict = calc_para_from_tensor(inputs, weight_compress, bias, offset_w, strides, pads, dilations, offset_x, kernel_name, data_format) if cce_conf.get_soc_spec("SOC_VERSION") in ("Hi3796CV300ES") and \ para_dict["filter_h"] * para_dict["filter_w"] > MAX_FITLER_HW: err_man.raise_err_specific("conv2d", \ "Min tiling still exceed ub buffer, " \ + "when open weight unzip") res = conv_compress(inputs, weight_compress, compress_index, \ compress_index_shape, para_dict, optim_dict) return res
def _conv_layer_compress_cce(shape_in, shape_w, shape_index, in_dtype, w_dtype, index_dtype, res_dtype, padh, padw, strideh, stridew, dilateh=1, dilatew=1, offset_x=0, offset_w_dtype='int32', offset_w=False, bias=False, optim_dict=None, fusion_para=None, kernel_name="cce_conv", need_build=False, need_print=False): """ Parameters ---------- shape_in: shape of feature map shape_w: shape of compress weight shape_index: shape of compress index in_dtype: the feature map data type w_dtype: the compress weight data type index_dtype: the index data type res_dtype: the result data type padh: H direction padding padw: W direction padding strideh: H direction stride stridew: W direction stride dilateh: H direction spacing between kernel dilatew: W direction spacing between kernel offset_x: the offset for fmap offset_w_dtype: weight offset data type, default 'int32' offset_w: the tag for offset_w or not bias: the tag for bias or not fusion_para: the config for L2 Fusion input_memory_type: feature map from L2/GM, 0 for GM, 2 for L2 output_memory_type: calculation results are outputs to L2/GM valid_shape: valid shape in L1 buffer, NC1HWC0 slice_offset: the offset of each dimension between valid shape and shape in kernel_name: cce kernel name, default value is "cce_conv" need_build: if need to build CCEC kernel, default value is False need_print: if need to print the ir, default value is False Returns ------- wrapped_tensor """ # for pylint, otherwise "Dangerous default value [] as argument" if optim_dict is None: optim_dict = {"c0_optim_flg": False} if fusion_para is None: fusion_para = { "input_memory_type": 0, "output_memory_type": 0, "valid_shape": (), "slice_offset": (), "l1_fusion_type": -1 } in_dtype = in_dtype.lower() w_dtype = w_dtype.lower() index_dtype = index_dtype.lower() res_dtype = res_dtype.lower() offset_w_dtype = offset_w_dtype.lower() mad_dtype = 'float32' if w_dtype == 'int8': mad_dtype = 'int32' shape_in = list(shape_in) shape_w = list(shape_w) shape_in, shape_w = \ conv_layer_cce_para_check(shape_in, shape_w, padh, padw, strideh, stridew, in_dtype, w_dtype, res_dtype, offset_w_dtype, bias, kernel_name, dilateh, dilatew, optim_dict, fusion_para) out_channel, in_channel_weight, filter_h, filter_w = shape_w fmap_shape_nc1hwc0, filter_shape_frac_z = conv_layer_cce_shape_calc( shape_in, shape_w, in_dtype, w_dtype, optim_dict) if cce_conf.get_soc_spec("SOC_VERSION") in ("Hi3796CV300ES") and \ filter_h * filter_w > MAX_FITLER_HW: err_man.raise_err_specific("conv2d", \ "Min tiling still exceed ub buffer, " \ + "when open weight unzip") tensor_list = [] with tvm.target.cce(): data = tvm.placeholder(fmap_shape_nc1hwc0, name='Fmap', dtype=in_dtype) tensor_list.append(data) weight = tvm.placeholder(filter_shape_frac_z, name='Filter', dtype=w_dtype) tensor_list.append(weight) compress_index_shape = tvm.var("compress_index_shape", dtype="int32") compress_index = tvm.placeholder((compress_index_shape, ), name='compress_index', dtype=index_dtype) bias_tensor = None offset_w_tensor = None if bias: bias_tensor = tvm.placeholder((out_channel, ), name='bias_tensor', dtype=res_dtype) tensor_list.append(bias_tensor) conv_res = conv_compress(data, weight, compress_index, compress_index_shape, { "bias_tensor": bias_tensor, "offset_w_tensor": offset_w_tensor, "pad_h": padh, "pad_w": padw, "stride_h": strideh, "stride_w": stridew, "dilate_h": dilateh, "dilate_w": dilatew, "filter_h": filter_h, "filter_w": filter_w, "offset_x": offset_x, "res_dtype": res_dtype, "mad_dtype": mad_dtype, "fusion_para": fusion_para, "kernel_name": kernel_name }, optim_dict=optim_dict, dsl_flag=False) sch = auto_schedule(conv_res) tensor_list.append(compress_index) tensor_list.append(conv_res) config = { "print_ir": need_print, "need_build": need_build, "name": kernel_name, "tensor_list": tensor_list } cce_build_code(sch, config)