Example #1
    def __init__(self, input_x, output_y, num, axis, kernel_name):
        self.input_x = input_x
        self.num = num
        self.kernel_name = kernel_name
        self.dtype = input_x.get("dtype").lower()

        self.dim_info_vars = []
        self.ub_tensor_list = []
        self.res_tensor_list = []
        self.virtual_node = None
        self.sch_list = []
        self.arg_list = []
        self.rules = []
        self.compile_vars = {}
        self.dim_vars = []
        self.dim_bounds = []
        self.output_shape = []
        self.x_reshape = None
        self.left_range = None
        self.right_range = None

        self._normalize_shape()
        self._trans_input_shape(axis)
        self.new_axis = 1

        self._input_placeholder = None
        self.block_idx = tvm.thread_axis('blockIdx.x')

        self.ub_size = cce_conf.get_soc_spec(cce_conf.UB_SIZE)
        self.core_num = cce_conf.get_soc_spec(cce_conf.CORE_NUM)
Example #2
    def _tiling(shape, dtype, wo_out, stride, kernel):
        ub_size_bytes = get_soc_spec(UB_SIZE)
        dtype_bytes_size = get_bit_len(dtype) // 8
        total_ele = ub_size_bytes // dtype_bytes_size // 2

        nc1h, _, c0 = shape
        core_num = get_soc_spec(CORE_NUM)
        fused_axis_block_factor, w_block_factor = nc1h, wo_out

        if fused_axis_block_factor >= core_num:
            fused_axis_block_factor = _ceil(fused_axis_block_factor, core_num)
        else:
            w_block_factor = _ceil(wo_out, _ceil(core_num, fused_axis_block_factor))
            fused_axis_block_factor = 1
        wo_buffer_num = 4
        wi_buffer_num = 2

        # for wi = (wo - 1) * stride + kernel
        # wo_buffer_num * N * C1 * H * Wo * C0 + wi_buffer_num * N * C1 * H * Wi * C0 <= total_ele
        nc1wo_limit = (total_ele // (c0) +
                       (stride - kernel) * wi_buffer_num) // (wo_buffer_num + stride * wi_buffer_num)
        nc1_limit = (total_ele // (c0)) // (wo_buffer_num * wo_out + wi_buffer_num * wo_out * stride - wi_buffer_num *
                                            (stride - kernel))

        if nc1_limit > 1:
            fused_factor, wo_factor = min(fused_axis_block_factor, nc1_limit), w_block_factor
        else:
            fused_factor, wo_factor = 1, nc1wo_limit // 8 * 8
        return [fused_axis_block_factor, w_block_factor], [fused_factor, wo_factor]
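
    # A rough derivation of the two UB limits computed in _tiling above
    # (the numbers below are purely illustrative). With
    #     wi = (wo - 1) * stride + kernel,
    # the constraint
    #     wo_buffer_num * x * wo * c0 + wi_buffer_num * x * wi * c0 <= total_ele
    # solved for wo at x == 1 gives nc1wo_limit, and solved for x at
    # wo == wo_out gives nc1_limit. For a hypothetical total_ele = 65536,
    # c0 = 16, stride = 2, kernel = 3:
    #     nc1wo_limit = (65536 // 16 + (2 - 3) * 2) // (4 + 2 * 2) = 511
    # and an (x = 1, wo = 511) tile needs 4*511*16 + 2*1023*16 = 65440
    # elements, which fits, while wo = 512 would need 65568 and overflow UB.
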
    def _min_l1_byte():
        # Fourth: L1 limitation, mainly required by the chip
        al1_min_byte = C0 * C0 * 2

        if dedy_w % C0 == 0:
            bl1_min_byte = filter_h_dilation * fmap_w * C0 * 2
        else:
            bl1_min_byte = (filter_h_dilation + stride_h) * fmap_w * C0 * 2

        l1_size = get_soc_spec("L1_SIZE")  # L1 size
        if (al1_min_byte + bl1_min_byte) > l1_size:
            args_dict = {'errCode': 'E60022'}
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
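
    # Illustrative numbers for the L1 limits above (all values hypothetical):
    # with C0 = 16 and 2-byte float16 elements, al1_min_byte = 16 * 16 * 2
    # = 512 bytes; for filter_h_dilation = 3, fmap_w = 224 and dedy_w a
    # multiple of 16, bl1_min_byte = 3 * 224 * 16 * 2 = 21504 bytes, so the
    # E60022 error only fires when the sum exceeds the reported L1_SIZE.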
Example #4
def conv_layer_cce_shape_calc(shape_in, shape_w, in_dtype, \
    w_dtype, optim_dict):
    """

    Parameters
    ----------
    shape_in: shape of feature map

    shape_w: shape of weight

    in_dtype: the feature map data type

    w_dtype: the weight data type

    optim_dict: optimize feature dict

    Returns
    -------
    fmap_shape_nc1hwc0: feature map shape in NC1HWC0 format

    filter_shape_frac_z: filter shape in FRACTAL_Z format

    """
    block_size_k = CUBE_MKN[in_dtype]['mac'][1]
    if optim_dict["c0_optim_flg"] and cce_conf.get_soc_spec("SOC_VERSION") in \
    ("Ascend710", "Ascend615", "Ascend610", "Hi3796CV300CS"):
        block_size_k = 4
    fmap_shape_nc1hwc0 = (shape_in[0], (shape_in[1] + block_size_k - 1) \
                          // block_size_k,
                          shape_in[2], shape_in[3], block_size_k)

    out_channel, in_channel_weight, filter_h, filter_w = shape_w
    block_size_k = CUBE_MKN[w_dtype]['mac'][1]
    block_size_n = CUBE_MKN[w_dtype]['mac'][2]
    if optim_dict["c0_optim_flg"]:
        filter_shape_frac_z = ((4 * filter_h * filter_w + block_size_k - 1) \
                               // block_size_k,
                               out_channel // block_size_n, block_size_n,
                               block_size_k)
    else:
        filter_shape_frac_z = (in_channel_weight * filter_h * filter_w \
                               // block_size_k,
                               out_channel // block_size_n, block_size_n,
                               block_size_k)
    return fmap_shape_nc1hwc0, filter_shape_frac_z
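
A worked example of the shape calculation above (shapes are hypothetical and
assume CUBE_MKN['float16']['mac'] gives block_size_k = block_size_n = 16):

    fmap_5hd, filter_fz = conv_layer_cce_shape_calc(
        shape_in=(1, 32, 56, 56),              # NCHW feature map
        shape_w=(64, 32, 3, 3),                # NCHW weight
        in_dtype="float16", w_dtype="float16",
        optim_dict={"c0_optim_flg": False})
    # fmap_5hd  == (1, 2, 56, 56, 16)          NC1HWC0
    # filter_fz == (18, 4, 16, 16)             FRACTAL_Z (C1*Kh*Kw, N1, N0, C0)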
Example #5
def _get_mad_dtype(w_dtype):
    """
    algorithm: get the dtype of mad

    Parameters
    ----------
    w_dtype: the dtype of filter

    Returns
    -------
    mad dtype
    """
    mad_dtype = "float32"
    if w_dtype == 'int8':
        mad_dtype = "int32"
    elif get_soc_spec("SOC_VERSION") in ("Hi3796CV300ES",
                                         "Hi3796CV300CS"):
        mad_dtype = "float16"

    return mad_dtype
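
A quick check of the mapping above (the float16 result depends on the SoC):

    assert _get_mad_dtype('int8') == 'int32'   # quantized conv accumulates in int32
    # _get_mad_dtype('float16') is 'float16' on Hi3796CV300ES/CS
    # and 'float32' on other SoC versions.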
def conv3d_backprop_filter_cce(shape_x,
                               shape_out_backprop,
                               filter_sizes,
                               strides,
                               pads,
                               dilations=(1, 1, 1, 1),
                               x_dtype='float16',
                               out_backprop_dtype='float16',
                               res_dtype='float32',
                               kernel_name="conv3d_backprop_filter_cce"):
    """
    Topi interface of conv3d backprop filter

    Parameters:
    ----------
    shape_x : The shape of feature map.
              5-D with shape [batch, depth, channels, height, width].

    shape_out_backprop : The shape of gradients.
                         5-D with shape [batch, depth, channels, height, width].

    filter_sizes : The shape of filter.
                   5-D with shape [batch, depth, channels, height, width].

    strides : A list of ints. The stride of the sliding window.

    pads : "SAME"or"VALID",
           indicating the type of pads algorithm to use, or list.

    dilations : An optional list of ints. Default value is [1, 1, 1, 1].

    x_dtype : The dtype of feature map data. Default value is float16.

    out_backprop_dtype : The dtype of gradients data.
                         Default value is float16.

    res_dtype : The dtype of result(De/Dw) data. Default value is float32.

    kernel_name : CCE kernel name.
                  Default value is "conv3d_backprop_filter_cce"

    Returns
    -------
    None
    """
    def _ceil(x_1, x_2):
        if x_2 == 0:
            args_dict = {
                'errCode': 'E62502',
                'first_operand': str(x_1),
                'second_operand': str(x_2)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
        return (x_1 + x_2 - 1) // x_2

    if get_soc_spec("SOC_VERSION") in ("Hi3796CV300ES", "Hi3796CV300CS"):
        res_dtype = "float16"

    res = check_conv3dbp_filter_params(shape_x, shape_out_backprop,
                                       filter_sizes, strides, pads, dilations,
                                       x_dtype, out_backprop_dtype, res_dtype,
                                       kernel_name)
    shape_x, shape_out_backprop, filter_sizes, strides, pads, dilations, \
    x_dtype, out_backprop_dtype, res_dtype, kernel_name = res
    fmap_batch, fmap_depth, fmap_channel, fmap_h, fmap_w = shape_x
    dedy_batch, dedy_d, dedy_channel, dedy_h, dedy_w = shape_out_backprop

    c0_size = cce_params.C0_SIZE  # Channel axis should be aligned to 16
    shape_dedy = (dedy_batch, dedy_d, \
                  _ceil(dedy_channel, c0_size), dedy_h, dedy_w, c0_size)
    shape_fmap = (fmap_batch, fmap_depth, \
                  _ceil(fmap_channel, c0_size), fmap_h, fmap_w, c0_size)
    dedy = tvm.placeholder(shape_dedy, name="dedy", dtype=out_backprop_dtype)
    fmap = tvm.placeholder(shape_fmap, name="fmap", dtype=x_dtype)

    dedw = te.lang.cce.conv3d_backprop_filter_compute(
        input_x=fmap,
        out_backprop=dedy,
        filter_sizes=filter_sizes,
        strides=strides,
        padding=pads,
        dilations=dilations,
        res_dtype=res_dtype,
        kernel_name=kernel_name)

    tensor_list_input = [fmap, dedy]
    with tvm.target.cce():
        sch = generic.auto_schedule(dedw)

    real_outs = sch.cce_special["real_out_tensor"]
    tensor_list = tensor_list_input + real_outs

    config = {"name": kernel_name, "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(sch, config)
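
A worked example of the NDC1HWC0 packing above (shapes are hypothetical;
c0_size is taken as 16 per the comment):

    # shape_x            = (4, 8, 35, 56, 56) -> shape_fmap = (4, 8, 3, 56, 56, 16)
    # shape_out_backprop = (4, 8, 64, 28, 28) -> shape_dedy = (4, 8, 4, 28, 28, 16)
    # since _ceil(35, 16) == 3 and _ceil(64, 16) == 4.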
Example #7
    def _select_format(params):
        inputs = params[0]
        weights = params[1]
        c0_optim_flg = False
        shape_x = inputs.get("ori_shape")
        shape_x = scalar2tensor_one(shape_x)
        format_fm = inputs.get("ori_format")
        if format_fm == "NCHW":
            shape_fm = shape_x
        elif format_fm == "NHWC":
            shape_fm = [shape_x[0], shape_x[3], shape_x[1], shape_x[2]]
        else:
            err_man.raise_err_input_format_invalid("conv2d", "inputs", \
                ["NCHW", "NHWC"], format_fm)

        shape_w = weights.get("ori_shape")
        if (not isinstance(shape_w, (tuple, list))) or len(shape_w) != 4:
            err_man.raise_err_should_be_4d("conv2d", "weights")
        format_w = weights.get("ori_format")
        if format_w == "NCHW":
            shape_filter = shape_w
        elif format_w == "NHWC":
            shape_filter = [shape_w[0], shape_w[3], shape_w[1], shape_w[2]]
        elif format_w == "HWCN":
            shape_filter = [shape_w[3], shape_w[2], shape_w[0], shape_w[1]]
        else:
            err_man.raise_err_input_format_invalid("conv2d", "weights", \
                ["NCHW", "NHWC", "HWCN"], format_w)
        if shape_fm[1] <= 4:
            c0_optim_flg = True
        if (shape_filter[2] == 1) and (shape_filter[3] == 1):
            c0_optim_flg = False
        # format NC1HWC0_C04 can only be used at the first conv layer
        # for SoCs that use NC1HWC0_C04, ensure is_first_layer == 1
        if inputs.get("is_first_layer") != 1 and \
            cce_conf.get_soc_spec("SOC_VERSION") \
            in ("Ascend710", "Ascend615", "Ascend610", "Hi3796CV300CS"):
            c0_optim_flg = False
        if c0_optim_flg:
            if cce_conf.get_soc_spec("SOC_VERSION") in \
            ("Ascend710", "Ascend615", "Ascend610", "Hi3796CV300CS"):
                input0 = gen_param(classify="input0", name="x",
                                   datatype="float16,float16,int8,int8",
                                   format="NC1HWC0_C04,NC1HWC0,"
                                          "NC1HWC0_C04,NC1HWC0")
            else:
                input0 = gen_param(classify="input0", name="x",
                                   datatype="float16,float16,int8,int8",
                                   format="NC1HWC0,NC1HWC0,"
                                          "NC1HWC0,NC1HWC0")
            input1 = gen_param(classify="input1", name="filter",
                               datatype="float16,float16,int8,int8",
                               format="FRACTAL_Z_C04,FRACTAL_Z,"
                                      "FRACTAL_Z_C04,FRACTAL_Z")
            input2 = gen_param(classify="input2", name="bias",
                               datatype="float16,float16,int32,int32",
                               format="ND,ND,ND,ND")
            input3 = gen_param(classify="input3", name="offset_w",
                               datatype="int8,int8,int8,int8",
                               format="ND,ND,ND,ND")
            output0 = gen_param(classify="output0", name="y",
                                datatype="float16,float16,int32,int32",
                                format="NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0")
        else:
            # only dynamic_hw or dynamic_batch is supported by dynamic conv2d
            if (shape_fm[0] == -1 and -1 not in shape_fm[1:]) or \
                (shape_fm[2] == -1 and shape_fm[3] == -1 and -1 not in shape_fm[:2]):
                input0 = gen_param(classify="input0", name="x",
                                   datatype="float16",
                                   format="NC1HWC0",
                                   unknownshape_format="NC1HWC0")
                input1 = gen_param(classify="input1", name="filter",
                                   datatype="float16",
                                   format="FRACTAL_Z",
                                   unknownshape_format="FRACTAL_Z")
                input2 = gen_param(classify="input2", name="bias",
                                   datatype="float16",
                                   format="ND")
                input3 = gen_param(classify="input3", name="offset_w",
                                   datatype="int8",
                                   format="ND")
                output0 = gen_param(classify="output0", name="y",
                                    datatype="float16",
                                    format="NC1HWC0",
                                    unknownshape_format="NC1HWC0")
            else:
                input0 = gen_param(classify="input0", name="x",
                                   datatype="float16,int8",
                                   format="NC1HWC0,NC1HWC0")
                input1 = gen_param(classify="input1", name="filter",
                                   datatype="float16,int8",
                                   format="FRACTAL_Z,FRACTAL_Z")
                input2 = gen_param(classify="input2", name="bias",
                                   datatype="float16,int32",
                                   format="ND,ND")
                input3 = gen_param(classify="input3", name="offset_w",
                                   datatype="int8,int8",
                                   format="ND,ND")
                output0 = gen_param(classify="output0", name="y",
                                    datatype="float16,int32",
                                    format="NC1HWC0,NC1HWC0")
        return [input0, input1, input2, input3, output0]
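
A short illustration of the dynamic-shape test above (shapes are hypothetical
and use more than 4 input channels so the c0 optimization stays off):

    # shape_fm = [-1, 16, 224, 224] -> dynamic batch, single float16/NC1HWC0 set
    # shape_fm = [8, 16, -1, -1]    -> dynamic H/W, same single set
    # shape_fm = [8, 16, 224, 224]  -> static, the float16/int8 pairs are offered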
Example #8
def conv_layer_cce_para_check(shape_in, shape_w, padh, padw, strideh, stridew,
                              in_dtype, w_dtype, res_dtype, offset_w_dtype,
                              bias, kernel_name, dilateh=1, dilatew=1,
                              optim_dict=None, fusion_para=None):
    """

    Parameters
    ----------
    shape_in: shape of feature map

    shape_w: shape of weight

    padh: H direction padding

    padw: W direction padding

    strideh: H direction stride

    stridew: W direction stride

    in_dtype: the feature map data type

    w_dtype: the weight data type

    res_dtype: the result data type

    offset_w_dtype: weight offset data type, default 'int32'

    bias: the tag for bias or not

    fusion_para: the config for L1 or L2 Fusion

    kernel_name: cce kernel name

    dilateh: dilation (spacing between kernel elements) in the H direction

    dilatew: dilation (spacing between kernel elements) in the W direction

    optim_dict: optimize feature dict

    Returns
    -------
    shape_in, shape_w: the checked (and possibly normalized)
                       feature map shape and weight shape

    """
    check_kernel_name(kernel_name)
    check_dtype_rule(offset_w_dtype, ['int32'])
    if cce_conf.get_soc_spec("SOC_VERSION") in ("Ascend310", "Hi3796CV300ES", \
        "Ascend710", "Ascend615", "Ascend610", "Hi3796CV300CS"):
        check_dtype_rule(in_dtype, ('int8', "float16"))
        check_dtype_rule(w_dtype, ('int8', "float16"))
        check_dtype_rule(res_dtype, ('int32', "float16"))
    else:
        check_dtype_rule(in_dtype, ['float16'])
        check_dtype_rule(w_dtype, ['float16'])
        check_dtype_rule(res_dtype, ['float16'])

    if isinstance(padh, list):
        if len(padh) != PAD_SHAPE_DIM:
            err_man.raise_err_specific_user("conv2d", "Dimension must be "\
                                            + str(PAD_SHAPE_DIM) + \
                                            " when padh is a list.")
        pad_top = padh[0]
        pad_bottom = padh[1]
    else:
        pad_top = padh
        pad_bottom = padh

    if isinstance(padw, list):
        if len(padw) != PAD_SHAPE_DIM:
            err_man.raise_err_specific_user("conv2d", "Dimension must be "\
                                            + str(PAD_SHAPE_DIM) + \
                                            " when padw is a list.")
        pad_left = padw[0]
        pad_right = padw[1]
    else:
        pad_left = padw
        pad_right = padw
    if optim_dict is None:
        optim_dict = {"c0_optim_flg": False}
    optim_off = shape_in[1] > 4 or shape_w[1] > 4 or \
                (shape_w[2] == 1 and shape_w[3] == 1)
    if optim_dict.get("c0_optim_flg") is True:
        if optim_off:
            err_man.raise_err_specific_user("conv2d", "Invalid "\
                + "config for c0=4 optimize feature.")

    if fusion_para is None:
        fusion_para = {"input_memory_type": 0, "output_memory_type": 0,
                       "valid_shape": (), "slice_offset": (), \
                       "l1_fusion_type": -1, \
                       "fmap_l1_addr_flag": 0, \
                       "fmap_l1_valid_size": -1}

    dilation_not_pass = (dilateh > 1 or dilatew > 1) and w_dtype == 'int8'
    if dilation_not_pass:
        err_man.raise_err_specific_user("conv2d", "Quant conv does not "\
            + "support dilate > 1.")

    shape_in, shape_w = check_conv_shape(shape_in, shape_w,
                                         pad_top, pad_bottom,
                                         pad_left, pad_right, strideh, stridew,
                                         in_dtype, w_dtype, fusion_para,
                                         optim_dict, dilateh, dilatew)

    return shape_in, shape_w
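
A short illustration of the pad handling above (assuming PAD_SHAPE_DIM == 2):

    # padh = 1       -> pad_top = pad_bottom = 1
    # padh = [0, 1]  -> pad_top = 0, pad_bottom = 1
    # padw behaves the same way for pad_left / pad_right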
Example #9
def calc_para_from_tensor(inputs, weights, bias, offset_w, strides, pads,
                          dilations, offset_x, kernel_name,
                          data_format="NCHW"):

    shape_w = []
    for i in weights.op.attrs['ori_shape']:
        shape_w.append(i.value)
    shape_fm = []
    for i in inputs.shape:
        shape_fm.append(i.value)

    input_h = shape_fm[2]
    input_w = shape_fm[3]

    format_w = weights.op.attrs['ori_format'].value
    all_fmt = ["NCHW", "NHWC", "HWCN"]
    if format_w not in all_fmt:
        err_man.raise_err_input_format_invalid("conv2d", \
        "weights", ["NCHW", "NHWC", "HWCN"], format_w)

    pos_c = format_w.find('C')
    pos_h = format_w.find('H')
    pos_w = format_w.find('W')
    weight_h = shape_w[pos_h]
    weight_w = shape_w[pos_w]
    shape_c = shape_w[pos_c]

    if len(strides) != 4:
        err_man.raise_err_should_be_4d("conv2d", "strides")
    if len(dilations) != 4:
        err_man.raise_err_should_be_4d("conv2d", "directions")

    format_x = inputs.op.attrs['ori_format'].value

    all_fmt = ["NCHW", "NHWC"]
    if format_x not in all_fmt:
        err_man.raise_err_input_format_invalid("conv2d", \
        "input", ["NCHW", "NHWC"], format_x)
    pos_h = data_format.find('H')
    pos_w = data_format.find('W')
    strideh = strides[pos_h]
    stridew = strides[pos_w]
    dlt_h = dilations[pos_h]
    dlt_w = dilations[pos_w]

    if len(pads) == 4:
        padh = [pads[0], pads[1]]
        padw = [pads[2], pads[3]]
    else:
        err_man.raise_err_should_be_4d("conv2d", "pads shape")

    fusion_para = _conv2d_compute_fusion_para(inputs)

    valid_shape = fusion_para.get("valid_shape")
    if valid_shape and valid_shape[2] == shape_fm[2]:
        valid_shape = ()
        fusion_para["valid_shape"] = ()
        fusion_para["slice_offset"] = ()
    if valid_shape:
        input_h = valid_shape[2]
        input_w = valid_shape[3]

    strideh = _trans_stride(input_h, weight_h, strideh, padh, dlt_h)
    stridew = _trans_stride(input_w, weight_w, stridew, padw, dlt_w)

    para_dict = {"pad_h": padh, "pad_w": padw, "stride_h": strideh,
                 "stride_w": stridew, "dilate_h": dlt_h, "dilate_w": dlt_w,
                 "offset_x": offset_x, "filter_h": weight_h,
                 "filter_w": weight_w, "bias_tensor": bias,
                 "offset_w_tensor": offset_w,
                 "fusion_para": fusion_para,
                 "kernel_name": kernel_name}

    if cce_conf.get_soc_spec("SOC_VERSION") in \
    ("Hi3796CV300ES", "Hi3796CV300CS"):
        para_dict["mad_dtype"] = "float16"
        if weights.dtype != "float16":
            para_dict["mad_dtype"] = "int32"
    else:
        if cce_conf.get_soc_spec("SOC_VERSION") in ("Ascend310",) \
        and weights.dtype == "int8":
            para_dict["mad_dtype"] = "int32"

    c0_optim_flg = False
    if shape_c <= 4 and ("format" in weights.op.attrs and
                         weights.op.attrs['format'].value == "FRACTAL_Z_C04"):
        c0_optim_flg = True
        if (weight_h == 1) and (weight_w == 1):
            err_man.raise_err_specific_user("conv2d", "weight shape does "\
                + "not support that H and W are both equal to 1 when C0=4.")

        if fusion_para["input_memory_type"] == 1:
            err_man.raise_err_specific_input_shape("conv2d", "c0 optim does "\
                + "not support fmap read from L1 directly (instead of DDR)")

    optim_dict = {"c0_optim_flg": c0_optim_flg}

    return para_dict, optim_dict
Example #10
def get_tilling(m_dim, k_dim, n_dim):
    """
    get tiling parameters
    Parameters
    ----------
    m_dim: int
        m axis length

    k_dim: int
        k axis length

    n_dim: int
        n axis length

    Returns
    -------
    tilling_info: dict
        tiling parameters
    """
    block_num = cce_conf.get_soc_spec(cce_conf.CORE_NUM)
    l1_size = cce_conf.get_soc_spec(cce_conf.L1_SIZE)
    l1_limit = l1_size // 4 // 2
    ub_size = cce_conf.get_soc_spec(cce_conf.UB_SIZE)
    ub_limit = ub_size // 2

    tilling_info = {}
    m_l0_factor = 1
    block_factor = 1
    if m_dim > block_num:
        block_factor = m_dim // block_num

    n_l0_factor = n_dim
    k_l0_factor = k_dim
    c_0 = 16
    dtype_mad_size = 4
    fracz_size = c_0 * c_0 * dtype_mad_size
    one_mn_size = k_l0_factor * fracz_size
    if k_l0_factor > 32:
        k_l0_factor = 32
        one_mn_size = k_l0_factor * fracz_size
    ub_used = m_l0_factor * n_l0_factor * one_mn_size
    while ub_used > ub_limit:
        if m_l0_factor > 1:
            m_l0_factor -= 1
        else:
            n_l0_factor -= 1
        ub_used = m_l0_factor * n_l0_factor * one_mn_size

    if m_l0_factor > 1:
        while m_dim % m_l0_factor != 0:
            m_l0_factor -= 1

    if n_l0_factor > 1:
        while n_dim % n_l0_factor != 0:
            n_l0_factor -= 1

    if l1_limit > one_mn_size * m_l0_factor:
        m_l1_factor = 1
    else:
        m_l1_factor = l1_limit // (one_mn_size * m_l0_factor)

    if l1_limit > one_mn_size * n_l0_factor:
        n_l1_factor = 1
    else:
        n_l1_factor = l1_limit // (one_mn_size * n_l0_factor)

    tilling_info["block"] = block_factor
    tilling_info["m_l1"] = m_l1_factor
    tilling_info["n_l1"] = n_l1_factor
    tilling_info["k_l1"] = 1
    tilling_info["m_l0"] = 1
    tilling_info["n_l0"] = n_l0_factor
    tilling_info["k_l0"] = k_l0_factor

    return tilling_info
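
A traced example of the tiling above (the SoC values are hypothetical:
32 cores, 1 MiB L1, 256 KiB UB):

    tilling = get_tilling(m_dim=64, k_dim=16, n_dim=8)
    # one_mn_size = 16 * 1024 = 16384 bytes and ub_used = 8 * 16384 just fits
    # the UB limit, so the result is
    # {"block": 2, "m_l1": 1, "n_l1": 1, "k_l1": 1,
    #  "m_l0": 1, "n_l0": 8, "k_l0": 16}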
Example #11
def _is_support_v200_instruction():
    if cce_conf.get_soc_spec("SOC_VERSION") in ("Ascend710", "Ascend610",
                                                "Ascend615", "Hi3796CV300CS"):
        return True
    return False
Example #12
def conv2dcompress_compute(inputs,
                           weight_compress,
                           compress_index,
                           bias,
                           offset_w,
                           outputs,
                           strides,
                           pads,
                           dilations,
                           groups=1,
                           data_format='NHWC',
                           offset_x=0,
                           kernel_name="conv2dcompress"):
    """
    conv2dcompress compute

    Notice
    ------
    only used by the framework in combination with IR

    Parameters
    ----------
    inputs: tvm placeholder
        input 5hd feature map tensor
    weight_compress: tvm placeholder
        input frac_z compress weight tensor
    compress_index: tvm placeholder
        input ND compress index
    outputs: tvm placeholder
        output tensor, dtype must be assigned
    bias: tvm placeholder or None
        input 1d bias tensor
    offset_w: tvm placeholder or None
        offset_w bias tensor
    strides: tuple/list of 4 integers
        stride on H/W, format sensitive
    pads: tuple/list of 4 integers
        [pad_top, pad_bottom, pad_left, pad_right]
    dilations: tuple/list of 4 integers
        dilation on H/W, format sensitive
    groups: int
        param for group convolution
    data_format: string
        input data format
    offset_x: int
        offset for fmap

    Returns
    -------
    tvm compute
    """
    compress_index_shape = compress_index.shape[0]

    para_dict, optim_dict = calc_para_from_tensor(inputs, weight_compress,
                                                  bias, offset_w, strides,
                                                  pads, dilations, offset_x,
                                                  kernel_name, data_format)

    if cce_conf.get_soc_spec("SOC_VERSION") in ("Hi3796CV300ES",) and \
    para_dict["filter_h"] * para_dict["filter_w"] > MAX_FITLER_HW:
        err_man.raise_err_specific("conv2d", \
            "Min tiling still exceed ub buffer, " \
            + "when open weight unzip")

    res = conv_compress(inputs, weight_compress, compress_index, \
                        compress_index_shape, para_dict, optim_dict)

    return res
Example #13
def _conv_layer_compress_cce(shape_in,
                             shape_w,
                             shape_index,
                             in_dtype,
                             w_dtype,
                             index_dtype,
                             res_dtype,
                             padh,
                             padw,
                             strideh,
                             stridew,
                             dilateh=1,
                             dilatew=1,
                             offset_x=0,
                             offset_w_dtype='int32',
                             offset_w=False,
                             bias=False,
                             optim_dict=None,
                             fusion_para=None,
                             kernel_name="cce_conv",
                             need_build=False,
                             need_print=False):
    """

    Parameters
    ----------
    shape_in: shape of feature map

    shape_w: shape of compress weight

    shape_index: shape of compress index

    in_dtype: the feature map data type

    w_dtype: the compress weight data type

    index_dtype: the index data type

    res_dtype: the result data type

    padh: H direction padding

    padw: W direction padding

    strideh: H direction stride

    stridew: W direction stride

    dilateh: dilation (spacing between kernel elements) in the H direction

    dilatew: dilation (spacing between kernel elements) in the W direction

    offset_x: the offset for fmap

    offset_w_dtype: weight offset data type, default 'int32'

    offset_w: the tag for offset_w or not

    bias: the tag for bias or not

    fusion_para: the config for L2 Fusion
                input_memory_type: feature map from L2/GM, 0 for GM, 2 for L2
                output_memory_type: calculation results are output to L2/GM
                valid_shape: valid shape in L1 buffer, NC1HWC0
                slice_offset: the offset of each dimension
                              between the valid shape and the input shape
    kernel_name: cce kernel name, default value is "cce_conv"

    need_build: if need to build CCEC kernel, default value is False

    need_print: if need to print the ir, default value is False

    Returns
    -------
    None

    """
    # for pylint, otherwise "Dangerous default value [] as argument"
    if optim_dict is None:
        optim_dict = {"c0_optim_flg": False}

    if fusion_para is None:
        fusion_para = {
            "input_memory_type": 0,
            "output_memory_type": 0,
            "valid_shape": (),
            "slice_offset": (),
            "l1_fusion_type": -1
        }
    in_dtype = in_dtype.lower()
    w_dtype = w_dtype.lower()
    index_dtype = index_dtype.lower()
    res_dtype = res_dtype.lower()
    offset_w_dtype = offset_w_dtype.lower()

    mad_dtype = 'float32'
    if w_dtype == 'int8':
        mad_dtype = 'int32'

    shape_in = list(shape_in)
    shape_w = list(shape_w)

    shape_in, shape_w = \
            conv_layer_cce_para_check(shape_in, shape_w, padh, padw,
                                      strideh, stridew, in_dtype, w_dtype,
                                      res_dtype, offset_w_dtype, bias,
                                      kernel_name, dilateh, dilatew,
                                      optim_dict, fusion_para)
    out_channel, in_channel_weight, filter_h, filter_w = shape_w

    fmap_shape_nc1hwc0, filter_shape_frac_z = conv_layer_cce_shape_calc(
        shape_in, shape_w, in_dtype, w_dtype, optim_dict)

    if cce_conf.get_soc_spec("SOC_VERSION") in ("Hi3796CV300ES",) and \
    filter_h * filter_w > MAX_FITLER_HW:
        err_man.raise_err_specific("conv2d", \
            "Min tiling still exceed ub buffer, " \
            + "when open weight unzip")

    tensor_list = []
    with tvm.target.cce():
        data = tvm.placeholder(fmap_shape_nc1hwc0, name='Fmap', dtype=in_dtype)
        tensor_list.append(data)
        weight = tvm.placeholder(filter_shape_frac_z,
                                 name='Filter',
                                 dtype=w_dtype)
        tensor_list.append(weight)

        compress_index_shape = tvm.var("compress_index_shape", dtype="int32")
        compress_index = tvm.placeholder((compress_index_shape, ),
                                         name='compress_index',
                                         dtype=index_dtype)
        bias_tensor = None
        offset_w_tensor = None
        if bias:
            bias_tensor = tvm.placeholder((out_channel, ),
                                          name='bias_tensor',
                                          dtype=res_dtype)
            tensor_list.append(bias_tensor)

        conv_res = conv_compress(data,
                                 weight,
                                 compress_index,
                                 compress_index_shape, {
                                     "bias_tensor": bias_tensor,
                                     "offset_w_tensor": offset_w_tensor,
                                     "pad_h": padh,
                                     "pad_w": padw,
                                     "stride_h": strideh,
                                     "stride_w": stridew,
                                     "dilate_h": dilateh,
                                     "dilate_w": dilatew,
                                     "filter_h": filter_h,
                                     "filter_w": filter_w,
                                     "offset_x": offset_x,
                                     "res_dtype": res_dtype,
                                     "mad_dtype": mad_dtype,
                                     "fusion_para": fusion_para,
                                     "kernel_name": kernel_name
                                 },
                                 optim_dict=optim_dict,
                                 dsl_flag=False)
        sch = auto_schedule(conv_res)
        tensor_list.append(compress_index)
        tensor_list.append(conv_res)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    cce_build_code(sch, config)
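
A minimal invocation sketch of the wrapper above (all shapes, dtypes and pad
values are hypothetical; shape_index depends on the offline compression
metadata):

    _conv_layer_compress_cce(
        shape_in=(1, 32, 56, 56), shape_w=(64, 32, 3, 3), shape_index=(1024,),
        in_dtype="int8", w_dtype="int8", index_dtype="int8", res_dtype="int32",
        padh=1, padw=1, strideh=1, stridew=1,
        bias=True, kernel_name="conv2d_compress_demo")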