Example #1
def benchmark(input_0, kernel, stride, pad):
    sh, sw = stride
    n, c1, h, w, c0 = input_0.shape
    kh, kw = kernel

    [ph_h, ph_t, pw_h, pw_t], [out_size_h, out_size_w] = \
        cal_pad_shapes_by_strategy(input_0.shape, kernel, stride, pad)
    out_size_w = get_value(out_size_w, akg.tvm.expr.IntImm)
    out_size_h = get_value(out_size_h, akg.tvm.expr.IntImm)

    out_shape = (n, c1, out_size_h, out_size_w, c0)
    mask_shape = (n, c1, kh, kw, out_size_h, out_size_w, c0)

    min_value = -65504.0 if input_0.dtype == 'float16' \
        else -340282346638528859811704183484516925440.0

    out = np.full(out_shape, min_value, dtype=input_0.dtype)
    mask = np.zeros(mask_shape)

    inputpad = np.full((n, c1, h + ph_h + ph_t, w + pw_h + pw_t, c0),
                       np.finfo(input_0.dtype).min,
                       dtype=input_0.dtype)
    inputpad[:, :, ph_h:ph_h + h, pw_h:pw_h + w, :] = input_0

    for i in range(out_size_h):
        for j in range(out_size_w):
            out[:, :, i, j, :] = \
                np.max(inputpad[:, :, i * sh:i * sh + kh,
                       j * sw:j * sw + kw, :], axis=(2, 3))

    kerneled_shape_tmp = (inputpad.shape[0], inputpad.shape[1], kh * kw,
                          inputpad.shape[4])
    maxid = np.zeros(out_shape)
    for i in range(out_size_h):
        for j in range(out_size_w):
            maxid[:, :, i, j, :] = \
                np.argmax(np.reshape(
                    inputpad[:, :, i * sh:i * sh + kh, j * sw:j * sw + kw, :],
                    kerneled_shape_tmp), axis=2)

    mask_shape_f = [n, c1, kh * kw, out_size_h, out_size_w, c0]
    mask = np.reshape(mask, tuple(mask_shape_f))

    index_shape = [n, c1, 1, out_size_h, out_size_w, c0]

    def cal_num(shape):
        return reduce(lambda i, j: i * j, shape)

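    # The index lists below enumerate every (n, c1, ho, wo, c0) coordinate in
    # row-major order; combined with the flattened argmax values they form the
    # fancy index that scatters a 1 into mask at each window's max position.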
    n_indexs = [i for i in range(n) for _ in range(cal_num(index_shape[1:]))]
    c1_indexs = [
        i for i in range(c1) for _ in range(cal_num(index_shape[2:]))
    ] * n
    ho_indexs = [i for i in range(out_size_h)
                 for _ in range(cal_num(index_shape[4:]))] * \
        cal_num(index_shape[:3])
    wo_indexs = [i for i in range(out_size_w)
                 for _ in range(cal_num(index_shape[5:]))] * \
        cal_num(index_shape[:4])
    c0_indexs = list(range(c0)) * cal_num(index_shape[:-1])

    mask[n_indexs, c1_indexs,
         maxid.flatten().astype(np.int32), ho_indexs, wo_indexs, c0_indexs] = 1
    mask = np.reshape(mask, tuple(mask_shape))

    out = out.astype(input_0.dtype)
    mask = mask.astype(input_0.dtype)
    return out, mask
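
A hedged usage sketch (not part of the source): calling the NumPy reference above on a small NC1HWC0 float16 tensor, assuming numpy (as np) and the AKG helpers (cal_pad_shapes_by_strategy, get_value) are in scope as in the snippet.

x = np.random.uniform(-1.0, 1.0, size=(1, 1, 6, 6, 16)).astype(np.float16)
out, mask = benchmark(x, kernel=(2, 2), stride=(2, 2), pad='VALID')
# out has shape (1, 1, 3, 3, 16); mask has shape (1, 1, 2, 2, 3, 3, 16)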
Example #2
def maxpool_ad(head,
               data,
               forward,
               mask,
               kernel,
               stride,
               pad,
               target=utils.CCE):
    """
    automatic differentiate of maxpool with manual schedule.

    Supported Platforms:
        'Ascend'
    """
    shape = get_shape(data)
    dtype = data.dtype

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    [ph_h, _, pw_h, _], [out_size_h, out_size_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, pad)
    batch_size, input_c1, input_h, input_w, input_c0 = shape

    # a tile size of one has proven to be the most efficient
    tile_scale_h = 1
    tile_scale_w = 1

    tile_h = stride_h * tile_scale_h

    if kernel_h == stride_h:  # non-overlapping case
        tile_h_pad_u = ph_h % stride_h
    elif kernel_h % stride_h == 0:
        tile_h_pad_u = kernel_h - stride_h - ph_h
    else:
        tile_h_pad_u = kernel_h - kernel_h % stride_h - ph_h
    tile_h_pad_l = kernel_h - stride_h + ph_h
    tile_input_h = tile_h + tile_h_pad_u + tile_h_pad_l
    tile_h_out = (input_h - 1) // tile_h + 1

    if ph_h % stride_h == 0:
        pad_output_h = ph_h // stride_h
    else:
        pad_output_h = ph_h // stride_h + 1

    if tile_h_pad_u % stride_h == 0:
        pad_output_h -= tile_h_pad_u // stride_h
    else:
        pad_output_h -= tile_h_pad_u // stride_h + 1

    tile_output_h = (tile_input_h - kernel_h) // stride_h + 1

    tile_w = stride_w * tile_scale_w
    if kernel_w == stride_w:  # non-overlapping case
        tile_w_pad_u = pw_h % stride_w
    elif kernel_w % stride_w == 0:
        tile_w_pad_u = kernel_w - stride_w - pw_h
    else:
        tile_w_pad_u = kernel_w - kernel_w % stride_w - pw_h
    tile_w_pad_l = kernel_w - stride_w + pw_h
    tile_input_w = tile_w + tile_w_pad_u + tile_w_pad_l
    tile_w_out = (input_w - 1) // tile_w + 1

    if pw_h % stride_w == 0:
        pad_output_w = pw_h // stride_w
    else:
        pad_output_w = pw_h // stride_w + 1

    if tile_w_pad_u % stride_w == 0:
        pad_output_w -= tile_w_pad_u // stride_w
    else:
        pad_output_w -= tile_w_pad_u // stride_w + 1

    tile_output_w = (tile_input_w - kernel_w) // stride_w + 1
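    # Worked example of the tiling arithmetic (illustrative numbers, not from
    # the source): with kernel_h = 3, stride_h = 2, ph_h = 1 the overlapping
    # branch gives tile_h = 2, tile_h_pad_u = 3 - 1 - 1 = 1 and
    # tile_h_pad_l = 3 - 2 + 1 = 2, so tile_input_h = 5,
    # tile_output_h = (5 - 3) // 2 + 1 = 2 and pad_output_h = 1 - 1 = 0;
    # the w direction follows the same arithmetic.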

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        head_reshaped = akg.tvm.compute(
            (batch_size, input_c1, tile_h_out, tile_w_out, tile_output_h,
             tile_output_w, input_c0),
            lambda b, c1, h_out, w_out, oh, ow, c0: akg.tvm.expr.Select(
                akg.tvm.any(
                    h_out * tile_scale_h + pad_output_h + oh < 0, h_out *
                    tile_scale_h + pad_output_h + oh > out_size_h - 1, w_out *
                    tile_scale_w + pad_output_w + ow < 0, w_out * tile_scale_w
                    + pad_output_w + ow > out_size_w - 1),
                akg.tvm.const(0.0, dtype=dtype),
                head_(b, c1, h_out * tile_scale_h + pad_output_h + oh, w_out *
                      tile_scale_w + pad_output_w + ow, c0)),
            name="head_reshaped")

        mask_reshaped = akg.tvm.compute(
            (batch_size, input_c1, tile_h_out, tile_w_out, tile_output_h,
             tile_output_w, kernel_h, kernel_w, input_c0),
            lambda b, c1, h_out, w_out, oh, ow, kh, kw, c0: akg.tvm.expr.
            Select(
                akg.tvm.any(
                    h_out * tile_scale_h + pad_output_h + oh < 0, h_out *
                    tile_scale_h + pad_output_h + oh > out_size_h - 1, w_out *
                    tile_scale_w + pad_output_w + ow < 0, w_out * tile_scale_w
                    + pad_output_w + ow > out_size_w - 1),
                akg.tvm.const(0.0, dtype=dtype),
                mask(b, c1, kh, kw, h_out * tile_scale_h + pad_output_h + oh,
                     w_out * tile_scale_w + pad_output_w + ow, c0)),
            name="mask_reshaped")

        d_data = akg.tvm.compute(
            (batch_size, input_c1, tile_h_out, tile_w_out, tile_output_h,
             tile_output_w, kernel_h, kernel_w, input_c0),
            lambda b, c1, h_out, w_out, oh, ow, kh, kw, c0: mask_reshaped(
                b, c1, h_out, w_out, oh, ow, kh, kw, c0) * head_reshaped(
                    b, c1, h_out, w_out, oh, ow, c0),
            name="d_data")

        data_reorg = akg.tvm.compute(
            (batch_size, input_c1, tile_h_out, tile_w_out, tile_output_h,
             tile_output_w, tile_h, tile_w, input_c0),
            lambda b, c1, h_out, w_out, oh, ow, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.any(h + tile_h_pad_u < oh * stride_h, h + tile_h_pad_u
                            > oh * stride_h + kernel_h - 1, w + tile_w_pad_u <
                            ow * stride_w, w + tile_w_pad_u > ow * stride_w +
                            kernel_w - 1), akg.tvm.const(0, dtype=dtype),
                d_data(b, c1, h_out, w_out, oh, ow, h + tile_h_pad_u - oh *
                       stride_h, w + tile_w_pad_u - ow * stride_w, c0)),
            name="data_reorg")

        result_tile = akg.topi.sum(data_reorg, [4, 5])

        result = akg.tvm.compute(
            shape,
            lambda b, c1, h, w, c0: result_tile(
                b, c1, h // tile_h, w // tile_w, h % tile_h, w % tile_w, c0),
            name="result")
        return [result]

    # override differentiation computation with custom function
    [dl_ddata
     ] = akg.differentiate(forward, [data],
                           head,
                           None,
                           None,
                           override={forward: ([data], custom_maxpool_fdiff)})

    # schedule for the differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    # get computations
    result = dl_ddata
    result_tile = result.op.input_tensors[0]
    data_reorg = result_tile.op.input_tensors[0]
    d_data = data_reorg.op.input_tensors[0]
    mask_reshaped = d_data.op.input_tensors[0]
    head_reshaped = d_data.op.input_tensors[1]

    def comp_func(s):

        data_ub = s.cache_read(mask, "local.UB", [mask_reshaped])
        head_ub = s.cache_read(head, "local.UB", [head_reshaped])
        result_ub = s.cache_write(result, "local.UB")

        s[d_data].set_scope("local.UB")
        s[data_reorg].set_scope("local.UB")
        s[mask_reshaped].set_scope("local.UB")
        s[head_reshaped].set_scope("local.UB")
        s[result_tile].set_scope("local.UB")

        s[result_ub].compute_inline()

        # inline inputs
        s[head_ub].compute_inline()
        s[data_ub].compute_inline()

        # result_tile dependencies
        s[data_reorg].compute_inline()
        b, c1, h_out, w_out, h, w, c0 = result_tile.op.axis
        oh, ow = result_tile.op.reduce_axis
        s[result_tile].reorder(b, c1, h_out, w_out, h, w, oh, ow, c0)

        s[d_data].compute_at(s[result_tile], w_out)
        s[mask_reshaped].compute_at(s[result_tile], w_out)
        s[head_reshaped].compute_at(s[result_tile], w_out)

        # tile result
        b, c1, h, w, c0 = result.op.axis
        h_out, h_in = s[result].split(h, tile_h)
        w_out, w_in = s[result].split(w, tile_w)
        s[result].reorder(b, c1, h_out, w_out, h_in, w_in, c0)
        s[result_tile].compute_at(s[result], w_out)

    return dl_ddata, comp_func
Example #3
def avgpool_grad(x, dy, kernel, stride, pad):
    """
    Gradient for avgpool.

    Args:
        x (tvm.tensor.Tensor): Forward input tensor of type float16.
        dy (tvm.tensor.Tensor): Gradient for forward output of type float16.
        kernel (Union[list, tuple]): Two int numbers for window size of H and W for pooling.
        stride (Union[list, tuple]): Two int numbers for stride size of H and W for pooling.
        pad (Union[str, list, tuple]): Padding strategy for pooling.

    Returns:
        Gradient of forward input tensor.
    """
    dtype = x.dtype
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    shape = get_shape(x)
    vc_util.check_shape(shape)

    if len(shape) != 5:
        raise RuntimeError("Only support 5-dim pooling!")
    if shape[-1] % 16 != 0:
        raise RuntimeError("Last shape must be divisible by 16!")
    if len(kernel) != 2:
        raise RuntimeError("Only support 2-dim kernel!")
    if len(stride) != 2:
        raise RuntimeError("Only support 2-dim stride!")
    if isinstance(pad, (list, tuple)) and len(pad) != 4:
        raise RuntimeError(
            "Only support string or list/tuple of 4 int numbers!")

    dim_info, _ = set_dim_func_(x, dy, kernel, stride, pad)
    attrs = {DIM: dim_info}

    @script
    def grad(zero, one_div_ksize, x, dy, kh, kw, sh, sw, ph_h, ph_t, pw_h,
             pw_t):
        tmpdx = allocate((x.shape[0], x.shape[1], x.shape[2] + ph_h + ph_t,
                          x.shape[3] + pw_h + pw_t, x.shape[4]), x.dtype)
        dy_tmp = allocate(dy.shape, dy.dtype)
        dx = output_tensor(x.shape, x.dtype)

        for n in range(tmpdx.shape[0]):
            for c1 in range(tmpdx.shape[1]):
                for h in range(tmpdx.shape[2]):
                    for w in range(tmpdx.shape[3]):
                        for c0 in range(tmpdx.shape[4]):
                            tmpdx[n, c1, h, w, c0] = zero

        for n in range(dy.shape[0]):
            for c1 in range(dy.shape[1]):
                for i in range(dy.shape[2]):
                    for j in range(dy.shape[3]):
                        for c0 in range(dy.shape[4]):
                            dy_tmp[n, c1, i, j,
                                   c0] = dy[n, c1, i, j, c0] * one_div_ksize
                            for ah in range(kh):
                                for aw in range(kw):
                                    if dy.shape[2] == 1 and dy.shape[3] == 1:
                                        tmpdx[n, c1, i * sh + ah, j * sw + aw,
                                              c0] = dy_tmp[n, c1, i, j, c0]
                                    else:
                                        tmpdx[n, c1, i * sh + ah, j * sw + aw, c0] = \
                                            tmpdx[n, c1, i * sh + ah, j * sw + aw, c0] + dy_tmp[n, c1, i, j, c0]

        if ph_h > 0 or ph_t > 0 or pw_h > 0 or pw_t > 0:
            for n in range(dx.shape[0]):
                for c1 in range(dx.shape[1]):
                    for h in range(dx.shape[2]):
                        for w in range(dx.shape[3]):
                            for c0 in range(dx.shape[4]):
                                dx[n, c1, h, w, c0] = tmpdx[n, c1, h + ph_h,
                                                            w + pw_h, c0]
            return dx
        else:
            return tmpdx

    kh, kw = kernel
    sh, sw = stride

    [ph_h, ph_t, pw_h,
     pw_t], _ = cal_pad_shapes_by_strategy(shape, kernel, stride, pad)

    zero = akg.tvm.const(0.0, dtype=dtype)
    one_div_ksize = akg.tvm.const(1.0 / (kh * kw), dtype=dtype)
    params = [kh, kw, sh, sw, ph_h, ph_t, pw_h, pw_t]
    output = grad(zero, one_div_ksize, x, dy,
                  *tuple(akg.tvm.convert(i) for i in params))

    attrs["loop_partition_unroll"] = 1
    return output, attrs
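
A hedged usage sketch (assumes akg is importable and the helpers above are in scope). With a 16x16 input, a 2x2 kernel, stride 2 and 'VALID' padding, the forward output, and hence dy, is 8x8:

x = akg.tvm.placeholder((1, 1, 16, 16, 16), name="x", dtype="float16")
dy = akg.tvm.placeholder((1, 1, 8, 8, 16), name="dy", dtype="float16")
dx, attrs = avgpool_grad(x, dy, kernel=(2, 2), stride=(2, 2), pad='VALID')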
Example #4
def maxpool_grad(x, y, dy, kernel, stride, pad):
    """
    Computes the gradient of max pooling on the input data.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32.
        y (tvm.tensor.Tensor): Tensor, the maxpool result.
        dy (tvm.tensor.Tensor): Tensor, the gradient to be propagated.
        kernel (Union[List, Tuple]): two int numbers for pooling window's size.
        stride (Union[List, Tuple]): two int numbers for window's stride.
        pad (Union[String, List, Tuple]): padding, should be 'VALID', 'SAME'
            or a list of four int numbers (the 'CONSTANTS' strategy).
            Supported **pad** strategies are the same as avgpool's.

    Returns:
        Tensor, the gradient of max pooling with respect to the input.
    """
    attrs = get_attrs()
    dim_info, _, attrs_info = maxpool_grad_set_dim_func(
        x, y, dy, kernel, stride, pad)
    attrs.update(attrs_info)
    attrs[DIM] = dim_info

    shape = get_shape(x)
    ori_dtype = x.dtype
    vc_util.ops_dtype_check(ori_dtype, vc_util.DtypeForDavinci.ALL_FLOAT)

    if utils.product_is_mini() and ori_dtype == 'float32':
        raise RuntimeError("Maxpool only support"
                           "\'float16\' while platform is mini_v100!")
    dtype = ori_dtype

    if len(shape) != 5:
        raise ValueError("Only support 5-dim pooling!")
    if shape[-1] % 16 != 0:
        raise ValueError("Last shape must be divisible by 16!")
    if len(kernel) != 2:
        raise ValueError("Only support 2-dim kernel!")
    if len(stride) != 2:
        raise ValueError("Only support 2-dim stride!")
    if not isinstance(pad, str) \
            and not (isinstance(pad, (list, tuple)) and len(pad) == 4):
        raise ValueError("Only support string or list/tuple of 4 int numbers!")

    vc_util.check_shape(shape)

    in_n, in_c1, in_h, in_w, in_c0 = shape
    k_h, k_w = kernel
    s_h, s_w = stride
    [ph_h, ph_t, pw_h, pw_t], [y_h, y_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, pad)
    k_h_hybrid = k_h
    k_w_hybrid = k_w

    yn = in_n
    yc1 = in_c1
    yc0 = in_c0

    @script(capture=locals())
    def max_pool_grad_hybrid(zero_, one_, min_value_, x_, y_, dy_):
        x_dummy_ = allocate(
            (in_n, in_c1, ph_h + in_h + ph_t, pw_h + in_w + pw_t, in_c0),
            x_.dtype, "local")
        x_img_ = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0),
                          x_.dtype, "local")
        y_img_ = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0),
                          x_.dtype)
        mask_ = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0),
                         x_.dtype)
        mask_new = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0),
                            dy_.dtype)
        mask_res = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0),
                            dy_.dtype)
        output_pre = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0),
                              dy_.dtype)
        output_dummy_body = allocate(
            (in_n, in_c1, ph_h + in_h + ph_t, pw_h + in_w + pw_t, in_c0),
            dy_.dtype)
        output = output_tensor((in_n, in_c1, in_h, in_w, in_c0), dy_.dtype)

        for n in range(yn):
            for c1 in range(yc1):
                for h in range(y_h):

                    for kh in range(k_h_hybrid):
                        for iw in range(pw_h + in_w + pw_t):
                            for c0 in range(yc0):
                                x_dummy_[n, c1, h * s_h + kh, iw,
                                         c0] = min_value_
                                output_dummy_body[n, c1, h * s_h + kh, iw,
                                                  c0] = zero_

                    for kh in range(k_h_hybrid):
                        for iw in range(in_w):
                            for c0 in range(yc0):
                                if (h * s_h + kh >= ph_h
                                        and h * s_h + kh < in_h + ph_h):
                                    x_dummy_[n, c1, h * s_h + kh,
                                             iw + pw_h, c0] = \
                                        x_[n, c1, h * s_h + kh - ph_h, iw, c0]

                    for kh in range(k_h_hybrid):
                        for iw in range(in_w):
                            for c0 in range(yc0):
                                if (h * s_h + kh >= ph_h
                                        and h * s_h + kh < in_h + ph_h):
                                    output_dummy_body[n, c1,
                                                      h * s_h + kh, iw + pw_h, c0] = \
                                        output[n, c1, h * s_h + kh - ph_h, iw, c0]

                    for w in range(y_w):
                        for kh in range(k_h_hybrid):
                            for kw in range(k_w_hybrid):
                                for c0 in range(yc0):
                                    x_img_[n, c1, h, w, kh, kw, c0] = \
                                        x_dummy_[n, c1, h * s_h + kh,
                                                 w * s_w + kw, c0]
                                    y_img_[n, c1, h, w, kh, kw, c0] = \
                                        y_[n, c1, h, w, c0]
                                    mask_[n, c1, h, w, kh, kw, c0] = zero_ \
                                        if x_img_[n, c1, h, w, kh, kw, c0] \
                                        < y_img_[n, c1, h, w, kh, kw, c0] \
                                        else one_
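                        # Keep only the first maximum per window: mask_new
                        # sums mask_ over all window positions up to and
                        # including (kh, kw) in row-major order; mask_res
                        # keeps mask_ only where that running sum does not
                        # exceed mask_ itself, i.e. at the first max.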
                        for kh in range(k_h_hybrid):
                            for kw in range(k_w_hybrid):
                                for c0 in range(yc0):
                                    mask_new[n, c1, h, w, kh, kw, c0] = zero_
                                for kh_0 in range(kh):
                                    for kw_0 in range(k_w_hybrid):
                                        for c0 in range(yc0):
                                            mask_new[n, c1, h, w, kh,
                                                     kw, c0] = \
                                                mask_new[n, c1, h, w,
                                                         kh, kw, c0] \
                                                + mask_[n, c1, h, w,
                                                        kh_0, kw_0, c0]
                                for kw_0 in range(kw + 1):
                                    for c0 in range(yc0):
                                        mask_new[n, c1, h, w, kh, kw, c0] = \
                                            mask_new[n, c1, h, w, kh, kw, c0] \
                                            + mask_[n, c1, h, w, kh, kw_0, c0]
                        for kh in range(k_h_hybrid):
                            for kw in range(k_w_hybrid):
                                for c0 in range(yc0):
                                    mask_res[n, c1, h, w, kh, kw, c0] = \
                                        zero_ \
                                        if mask_new[n, c1, h, w, kh, kw, c0] \
                                        > mask_[n, c1, h, w, kh, kw, c0] \
                                        else mask_[n, c1, h, w, kh, kw, c0]
                                    output_pre[n, c1, h, w, kh, kw, c0] = \
                                        mask_res[n, c1, h, w, kh, kw, c0] \
                                        * dy_[n, c1, h, w, c0]
                                    output_dummy_body[n, c1,
                                                      h * s_h + kh, w * s_w + kw, c0] += \
                                        output_pre[n, c1, h, w, kh, kw, c0]
                    for kh in range(k_h_hybrid):
                        for iw in range(in_w):
                            for c0 in range(yc0):
                                if (h * s_h + kh >= ph_h
                                        and h * s_h + kh < in_h + ph_h):
                                    output[n, c1, h * s_h + kh - ph_h,
                                           iw, c0] = \
                                        output_dummy_body[n, c1,
                                                          h * s_h + kh, iw + pw_h, c0]

        return output

    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16' else
                              -340282346638528859811704183484516925440.0,
                              dtype=dtype)
    output = max_pool_grad_hybrid(zero, one, min_value, x, y, dy)
    return output, attrs
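
A hedged usage sketch (assumes akg is importable and the helpers above are in scope). With a 16x16 input, a 2x2 kernel, stride 2 and 'VALID' padding, the forward output y and the gradient dy are both 8x8:

x = akg.tvm.placeholder((1, 1, 16, 16, 16), name="x", dtype="float16")
y = akg.tvm.placeholder((1, 1, 8, 8, 16), name="y", dtype="float16")
dy = akg.tvm.placeholder((1, 1, 8, 8, 16), name="dy", dtype="float16")
dx, attrs = maxpool_grad(x, y, dy, kernel=(2, 2), stride=(2, 2), pad='VALID')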
Example #5
def maxpool_with_argmax(data, kernel, stride, strategy):
    """
    Performs max pooling on the input data.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID', 'SAME'
            or a list of four int numbers (the 'CONSTANTS' strategy).
            Supported **Strategies** are the same as avgpool's.

    Returns:
        The max pooling result (tvm.tensor.Tensor), a mask marking the first
        maximum of each window, and an attrs dict.
    """
    attrs = get_attrs()
    dim_info = maxpool_with_argmax_set_dim_func(data, kernel, stride,
                                                strategy)[0]
    for k, v in attr_map_v2.items():
        attrs[k] = v
    if dim_info != "":
        attrs['dim'] = dim_info
    attrs["custom_tiling"] = maxpool_with_argmax_tiling_strategy(
        data, kernel, stride, strategy)
    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')

    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad = [ph_h, ph_t, pw_h, pw_t]
    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16' else
                              -340282346638528859811704183484516925440.0,
                              dtype=dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shape_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w,
                             in_c0)

    fmap_img2col_ub = img2col(data,
                              fmap_img2col_shape_ub,
                              kernel_h,
                              kernel_w,
                              pad,
                              stride,
                              min_value,
                              tag='')

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    output = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.max(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_max")

    pooling_mask = akg.tvm.compute(
        fmap_img2col_shape_ub,
        lambda n, c1, kh, kw, oh, ow, c0: akg.tvm.if_then_else(
            fmap_img2col_ub[n, c1, kh, kw, oh, ow, c0] < output[
                n, c1, oh, ow, c0], zero, one),
        name="pooling_mask")

    mask_flag = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: pooling_mask[n, c1, 0, 0, oh, ow, c0],
        name="mask_flag")

    mask_init = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: pooling_mask[n, c1, 0, 0, oh, ow, c0],
        name="mask_init")

    # spec 2
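    # The hybrid below keeps only the first set bit of pooling_mask in each
    # window: flag_ carries a running count of maxima already emitted, so
    # mask_ - flag_, clamped at zero, is nonzero only at the first maximum.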
    @script(capture=locals())
    def hybrid_first_max(mask_, flag_, flag2_, zero_, one_):
        output_ = allocate(
            (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0),
            mask_.dtype, 'local')
        for n_i in range(in_n):
            for c1_i in range(in_c1):
                for oh_i in range(out_h):
                    for ow_i in range(out_w):
                        for c0_i in range(in_c0):
                            output_[n_i, c1_i, 0, 0, oh_i, ow_i,
                                    c0_i] = flag2_[n_i, c1_i, oh_i, ow_i, c0_i]
                for kh_i in range(kernel_h):
                    for kw_i in range(kernel_w):
                        for oh_i in range(out_h):
                            for ow_i in range(out_w):
                                for c0_i in range(in_c0):
                                    output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] = \
                                        mask_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] -\
                                        flag_[n_i, c1_i, oh_i, ow_i, c0_i]
                                    output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] = \
                                        max(output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i], zero_)
                                    flag_[n_i, c1_i, oh_i, ow_i, c0_i] =\
                                        flag_[n_i, c1_i, oh_i, ow_i, c0_i] +\
                                        output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i]
        return output_

    mask_first_max = hybrid_first_max(pooling_mask, mask_flag, mask_init, zero,
                                      one)
    return output, mask_first_max, attrs
Example #6
def quantized_maxpool_tiling_strategy(data, kernel, stride, pad, quant_algo):
    """Custom tiling for quantized maxpool."""
    batch, c_1, fm_h, fm_w, c_0 = get_shape(data)
    _, [out_h, out_w] = \
        cal_pad_shapes_by_strategy(get_shape(data), kernel, stride, pad)

    strategy = list()
    if c_0 == 16:
        h_cut = out_h
        if fm_h >= 50 and fm_w >= 50:
            h_cut = 3
        dim_ind = 0
        tiling_params = list()
        if batch > 1:
            tiling_params.append([1, ct_util.TileConstraint.FACTOR, dim_ind])
            dim_ind = dim_ind + 1
        if c_1 > 1:
            tiling_params.append([1, ct_util.TileConstraint.FACTOR, dim_ind])
            dim_ind = dim_ind + 1
        tiling_params.append([h_cut, ct_util.TileConstraint.FACTOR, dim_ind])
        tiling_params.append(
            ["H", ct_util.TileConstraint.SET_AXIS_INFO, dim_ind])
        tiling_params.append(
            [out_w, ct_util.TileConstraint.FACTOR, dim_ind + 1])

        if quant_algo is not None:
            tiling_params.append(
                [kernel[0], ct_util.TileConstraint.FACTOR, dim_ind + 2])
            tiling_params.append(
                [kernel[1], ct_util.TileConstraint.FACTOR, dim_ind + 3])
            tiling_params.append(
                [16, ct_util.TileConstraint.FACTOR, dim_ind + 4])
        else:
            tiling_params.append(
                [kernel[0], ct_util.TileConstraint.FACTOR, dim_ind + 3])
            tiling_params.append(
                [kernel[1], ct_util.TileConstraint.FACTOR, dim_ind + 4])
            tiling_params.append(
                [16, ct_util.TileConstraint.FACTOR, dim_ind + 2])

        for para in tiling_params:
            strategy += ct_util.create_constraint_on_axis(values=para[0],
                                                          constraints=para[1],
                                                          axis=para[2])

    return strategy
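
A hedged usage sketch (shapes and parameters are illustrative; passing quant_algo=None exercises the non-quantized branch, since this snippet does not show quant_algo's exact format):

data = akg.tvm.placeholder((1, 4, 56, 56, 16), name="data", dtype="float16")
strategy = quantized_maxpool_tiling_strategy(data, kernel=(3, 3),
                                             stride=(2, 2), pad='SAME',
                                             quant_algo=None)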
Example #7
def maxpool_with_argmax_dynamic(data, kernel, stride, strategy):
    """
    Performs max pooling on the input data.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID', 'SAME'
            or a list of four int numbers (the 'CONSTANTS' strategy).
            Supported **Strategies** are the same as avgpool's.

    Returns:
        The max pooling result (tvm.tensor.Tensor), the first-max mask, and
        an attrs dict.
    """
    attrs = get_dynamic_attrs()
    dim_info = maxpool_with_argmax_set_dim_func(data, kernel, stride,
                                                strategy)[0]
    for k, v in attr_map_v2.items():
        attrs[k] = v
    if dim_info != "":
        attrs['dim'] = dim_info
    # attrs["custom_tiling"] = maxpool_with_argmax_custom_tiling_strategy(data)
    attrs["enable_feature_library"] = True
    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')

    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad = [ph_h, ph_t, pw_h, pw_t]
    zero = akg.tvm.const(0.0, dtype=dtype)
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16' else
                              -340282346638528859811704183484516925440.0,
                              dtype=dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shape_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w,
                             in_c0)

    fmap_img2col_ub = img2col(data,
                              fmap_img2col_shape_ub,
                              kernel_h,
                              kernel_w,
                              pad,
                              stride,
                              min_value,
                              tag='')

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    output = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.max(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_max")

    zero = akg.tvm.const(0.0, dtype=dtype)
    mask_first_max_shape = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w,
                            in_c0)
    mask_first_max = akg.tvm.compute(mask_first_max_shape,
                                     lambda *indice: zero,
                                     name="mask_first_max")

    attrs["custom_tiling"] = maxpool_with_argmax_dynamic_tensor_strategy(
        data, fmap_img2col_ub, mask_first_max)
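    # Assumption: the call below caps the dynamic extents of axes 2 and 3 of
    # `output` (out_h, out_w) at 64 so tiling can rely on bounded shapes.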
    attrs["dynamic_shape"] = ds.set_dynamic_shape_limit_for_tensor(
        output, [64, 64], [2, 3])
    return output, mask_first_max, attrs
Example #8
def maxpool(data, kernel, stride, strategy):
    """
    Performs the max pooling on the input data.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID',
            'SAME' or a list of four int numbers (the 'CONSTANTS' strategy).
            Supported **Strategies** are the same as avgpool's.

    Returns:
        tvm.tensor.Tensor, the result of max pooling.
    """
    attrs = attr_map
    attrs['dim'] = maxpool_set_dim_func(data, kernel, stride, strategy)[0]

    shape = get_shape(data)
    dtype = data.dtype
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    vc_util.check_shape(kernel, 2, "Kernel")
    vc_util.check_shape(stride, 2, "Stride")

    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    in_n, in_c1, in_h, in_w, in_c0 = shape

    [ph_h, _, pw_h, _], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)
    if attrs.get("dynamic") is True:
        # dynamic shape: out_h and out_w could be expressed via the input
        # shape, but the expressions are too complicated, so use fresh vars
        out_h = akg.tvm.var("OUT_H")
        out_w = akg.tvm.var("OUT_W")

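    # Both hybrids below peel output row 0 ("Head") away from rows
    # 1..out_h-1 ("Tail"), so the top-padding boundary condition is handled
    # once instead of inside the main loop nest.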
    @script(capture=locals())
    def dynamic_max_pool_hybrid_0(zero_, one_, min_value_, x_, in_n, in_c1,
                                  in_h, in_w, in_c0, out_h, out_w):
        output = output_tensor((in_n, in_c1, out_h, out_w, in_c0), x_.dtype)

        for n in range(in_n):
            for c1 in range(in_c1):
                # Head
                for ow in range(out_w):
                    for c0 in range(in_c0):
                        output[n, c1, 0, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for ow in range(out_w):
                            for c0 in range(in_c0):
                                if ph_h <= kh <= in_h + ph_h - 1 and 0 <= ow * stride_w + kw - pw_h <= in_w - 1:
                                    output[n, c1, 0, ow, c0] = \
                                        max(output[n, c1, 0, ow, c0],
                                            x_[n, c1, kh - ph_h, ow * stride_w + kw - pw_h, c0])
                # Tail
                for oh in range(out_h - 1):
                    for ow in range(out_w):
                        for c0 in range(in_c0):
                            output[n, c1, oh + 1, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for oh in range(out_h - 1):
                            for ow in range(out_w):
                                for c0 in range(in_c0):
                                    if ph_h <= (oh + 1) * stride_h + kh <= in_h + ph_h - 1\
                                            and pw_h <= ow * stride_w + kw <= in_w + pw_h - 1:
                                        output[n, c1, oh + 1, ow, c0] = max(
                                            output[n, c1, oh + 1, ow, c0],
                                            x_[n, c1,
                                               (oh + 1) * stride_h + kh - ph_h,
                                               ow * stride_w + kw - pw_h, c0])

        return output

    # static shape's hybrid
    @script(capture=locals())
    def static_max_pool_hybrid_0(zero_, one_, min_value_, x_):
        output = output_tensor((in_n, in_c1, out_h, out_w, in_c0), x_.dtype)

        for n in range(in_n):
            for c1 in range(in_c1):
                # Head
                for ow in range(out_w):
                    for c0 in range(in_c0):
                        output[n, c1, 0, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for ow in range(out_w):
                            for c0 in range(in_c0):
                                if ph_h <= kh <= in_h + ph_h - 1 and 0 <= ow * stride_w + kw - pw_h <= in_w - 1:
                                    output[n, c1, 0, ow, c0] = \
                                        max(output[n, c1, 0, ow, c0],
                                            x_[n, c1, kh - ph_h, ow * stride_w + kw - pw_h, c0])
                # Tail
                for oh in range(out_h - 1):
                    for ow in range(out_w):
                        for c0 in range(in_c0):
                            output[n, c1, oh + 1, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for oh in range(out_h - 1):
                            for ow in range(out_w):
                                for c0 in range(in_c0):
                                    if ph_h <= (oh + 1) * stride_h + kh <= in_h + ph_h - 1 \
                                            and pw_h <= ow * stride_w + kw <= in_w + pw_h - 1:
                                        output[n, c1, oh + 1, ow, c0] = max(
                                            output[n, c1, oh + 1, ow, c0],
                                            x_[n, c1,
                                               (oh + 1) * stride_h + kh - ph_h,
                                               ow * stride_w + kw - pw_h, c0])

        return output

    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16' else
                              -340282346638528859811704183484516925440.0,
                              dtype=dtype)
    if attrs.get("dynamic") is True:
        output = dynamic_max_pool_hybrid_0(zero, one, min_value, data, in_n,
                                           in_c1, in_h, in_w, in_c0, out_h,
                                           out_w)
    else:
        output = static_max_pool_hybrid_0(zero, one, min_value, data)

    return output, attrs
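
A hedged usage sketch of the static path (attr_map and the helpers above are assumed to be in scope):

data = akg.tvm.placeholder((1, 1, 16, 16, 16), name="data", dtype="float16")
res, attrs = maxpool(data, kernel=(2, 2), stride=(2, 2), strategy='SAME')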
Example #9
def avg_pool_5d_hybrid(a_value, kernel, stride, strategy):
    """avgpool with 5d case via hybrid"""
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    shape = get_shape(a_value)
    batch_size, c1_, in_size_h, in_size_w, c0_ = shape
    dtype = a_value.dtype
    if len(shape) != 5:
        raise ValueError("Only support 5-dim pooling!")
    if len(kernel) != 2:
        raise ValueError("Only support 2-dim kernel!")

    [pad_height_head, _, pad_width_head, _], [out_size_h, out_size_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    avg_pre = akg.tvm.const(1.0000 / (kernel_w * kernel_h), dtype=dtype)
    zero = akg.tvm.const(0.0, dtype=dtype)

    @script(capture=locals())
    def avg_pool_hybrid(inputs, zero, avg_pre):
        output = output_tensor((batch_size, c1_, out_size_h, out_size_w, c0_),
                               inputs.dtype)

        for n in range(batch_size):
            for c1 in range(c1_):
                # Head
                for ow in range(out_size_w):
                    for c0 in range(c0_):
                        output[n, c1, 0, ow, c0] = zero
                for ow in range(out_size_w):
                    for kh in range(kernel_h):
                        for kw in range(kernel_w):
                            for c0 in range(c0_):
                                if (kh >= pad_height_head) \
                                        and (ow * stride_w + kw - pad_width_head >= 0) \
                                        and (ow * stride_w + kw <= in_size_w + pad_width_head - 1):
                                    output[n, c1, 0, ow, c0] = output[n, c1, 0, ow, c0] +\
                                        inputs[n, c1, kh - pad_height_head,
                                               ow * stride_w + kw - pad_width_head, c0]
                                else:
                                    output[n, c1, 0, ow, c0] += zero
                for ow in range(out_size_w):
                    for c0 in range(c0_):
                        output[n, c1, 0, ow, c0] *= avg_pre
                # Tail
                for oh in range(out_size_h - 1):
                    for ow in range(out_size_w):
                        for c0 in range(c0_):
                            output[n, c1, oh + 1, ow, c0] = zero
                for oh in range(out_size_h - 1):
                    for ow in range(out_size_w):
                        for kh in range(kernel_h):
                            for kw in range(kernel_w):
                                for c0 in range(c0_):
                                    if ((oh + 1) * stride_h + kh <= in_size_h + pad_height_head - 1)\
                                            and (ow * stride_w + kw >= pad_width_head)\
                                            and (ow * stride_w + kw <= in_size_w + pad_width_head - 1):
                                        output[n, c1, oh + 1, ow, c0] = output[n, c1, oh + 1, ow, c0] +\
                                            inputs[n, c1, (oh + 1) * stride_h +
                                                   kh - pad_height_head, ow * stride_w +
                                                   kw - pad_width_head, c0]
                                    else:
                                        output[n, c1, oh + 1, ow, c0] += zero
                for oh in range(out_size_h - 1):
                    for ow in range(out_size_w):
                        for c0 in range(c0_):
                            output[n, c1, oh + 1, ow, c0] *= avg_pre
        return output

    res_value = avg_pool_hybrid(a_value, zero, avg_pre)

    # set dim
    info = dim.Dim()
    # first part
    info.setdim(index=0, axis=0, tilel1=out_size_w, tilel0=0)  # ow
    info.setdim(index=0, axis=1, tilel1=c0_, tilel0=0)  # c0
    info.setdim(index=0, axis=2, tilel1=kernel_h, tilel0=0)  # kh

    # second part
    info.setdim(index=1, axis=0, tilel1=out_size_h - 1, tilel0=0)  # oh-1
    info.setdim(index=1, axis=1, tilel1=out_size_w, tilel0=0)  # ow
    info.setdim(index=1, axis=2, tilel1=c0_, tilel0=0)  # c0
    info.setdim(index=1, axis=3, tilel1=kernel_h, tilel0=0)  # kh

    info = str(info)

    attrs = {DIM: info}
    return res_value, attrs
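
A hedged usage sketch: avgpool (Example #11 below) dispatches to this helper when both spatial dimensions exceed 60.

a = akg.tvm.placeholder((1, 1, 64, 64, 16), name="a", dtype="float16")
res, attrs = avg_pool_5d_hybrid(a, kernel=(2, 2), stride=(2, 2),
                                strategy='VALID')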
Example #10
def avgpool_with_img2col(data, kernel, stride, strategy):
    """
    Performs the avgpool with img2col.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID', 'SAME'
            or a list of four int numbers (the 'CONSTANTS' strategy).
            Supported **Strategies** are the same as avgpool's.

    Returns:
        tvm.tensor.Tensor, result of average pooling.
    """
    shape = get_shape(data)
    dtype = data.dtype

    utils.davinci_format_check(shape, "NC1HWC0", dim=5)
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT16)
    utils.check_shape(kernel, 2, "Kernel")
    utils.check_shape(stride, 2, "Stride")

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad = [ph_h, ph_t, pw_h, pw_t]
    pad_value = zero_const(dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shp_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w,
                           in_c0)
    fmap_img2col_ub = img2col(data,
                              fmap_img2col_shp_ub,
                              kernel_h,
                              kernel_w,
                              pad,
                              stride,
                              pad_value,
                              tag="")

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    res_sum = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.sum(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_avg")

    dividor = akg.tvm.const(kernel_h * kernel_w, dtype)
    output = akg.tvm.compute(out_shape,
                             lambda *i: res_sum(*i) / dividor,
                             name="res_value")
    return output
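
A hedged usage sketch (assumes akg and the img2col helper are in scope):

data = akg.tvm.placeholder((1, 1, 28, 28, 16), name="data", dtype="float16")
res = avgpool_with_img2col(data, kernel=(3, 3), stride=(2, 2),
                           strategy='SAME')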
Example #11
def avgpool(data, kernel, stride, strategy, target=utils.CCE):
    """
    Performs average pooling on the input data.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.
        Supported **Strategies**:

        .. hlist::
          * VALID: no padding; tail elements that do not fit a full window
                   are dropped when pooling.
                   Output shape will be  `ceil((pool_shapes[i] - (kernel[i] - 1)) / stride[i])`
            > **example**:
            > params: inputs => 11, kernel width => 5, stride => 4
            > inputs: 1  2  3  4  5  6  7  8  9  10 11
            > 1st window contains: 1 2 3 4 5
            > 2nd window contains: 5 6 7 8 9
            > dropped: 10 11
          * SAME: pads with zeros evenly on each side, adding the extra to
                  the tail if the total padding amount is odd.
                  Output shape will be  `ceil(pool_shapes[i] / stride[i])`
            > **example**:
            > params: inputs => 10, kernel width => 5, stride => 4
            > inputs: 1  2  3  4  5  6  7  8  9  10
            > padded: 0(pad1) | 1  2  3  4  5  6  7  8  9  10 | 0(pad2) 0(pad3)
            > 1st window contains: 0(pad1) 1 2 3 4
            > 2nd window contains: 4 5 6 7 8
            > 3rd window contains: 8 9 10 0(pad2) 0(pad3)
            > dropped: None
          * CONSTANTS: pads with zeros according to the given constants
                       (tail elements may still be dropped when pooling).
            > **example**:
            > params: inputs => 10, kernel width => 5, stride => 4, pad => (2, 2)
            > inputs: 1  2  3  4  5  6  7  8  9  10
            > padded: 0(pad1) 0(pad2) | 1  2  3  4  5  6  7  8  9  10 | 0(pad3) 0(pad4)
            > 1st window contains: 0(pad1) 0(pad2) 1 2 3
            > 2nd window contains: 3 4 5 6 7
            > 3rd window contains: 7 8 9 10 0(pad3)
            > dropped: 0(pad4)

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): List or tuple of two int numbers for pooling window's size.
        stride (Union[list, tuple]): List or tuple of two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding strategy, should be
            'VALID', 'SAME' or a list of four int numbers (the 'CONSTANTS'
            strategy).

    Returns:
        Tensor, the result of average pooling.

    Supported Platforms:
        'Ascend'
    """
    dim_info, _ = avgpool_set_dim_func(data, kernel, stride, strategy)
    attrs = {DIM: dim_info}
    attrs['disable_half_to_float_sum_opt'] = True
    attrs['pragma_disable_whole_component'] = False

    shape = [x.value for x in data.shape]
    dtype = data.dtype
    utils.davinci_format_check(shape, "NC1HWC0", dim=5)
    utils.check_shape(kernel, 2, 'Kernel')
    utils.check_shape(stride, 2, 'Stride')

    if shape[2] > 60 and shape[3] > 60:
        return avg_pool_5d_hybrid(data, kernel, stride, strategy)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    batch_size, c1, in_size_h, in_size_w, c0 = shape

    [pad_height_head, pad_height_tail, pad_width_head, pad_width_tail], [out_size_h, out_size_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad_shape = (batch_size, c1, in_size_h + pad_height_head + pad_height_tail,
                 in_size_w + pad_width_head + pad_width_tail, c0)

    pad2d = akg.tvm.compute(
        pad_shape,
        lambda n, c1, h, w, c0: akg.tvm.if_then_else(
            akg.tvm.
            any(h < pad_height_head, h > in_size_h + pad_height_head - 1, w <
                pad_width_head, w > in_size_w + pad_width_head - 1),
            akg.tvm.const(0.0, dtype=dtype),
            data[n, c1, h - pad_height_head, w - pad_width_head, c0],
        ),
        name="pad2d")

    axis_kernel_h = akg.tvm.reduce_axis((0, kernel_h), name="axis_kernel_h")
    axis_kernel_w = akg.tvm.reduce_axis((0, kernel_w), name="axis_kernel_w")

    out_shape = (batch_size, c1, out_size_h, out_size_w, c0)

    dividor = akg.tvm.const(kernel_h * kernel_w, dtype)
    res = akg.tvm.compute(out_shape,
                          lambda n, c1, h, w, c0: akg.tvm.sum(
                              pad2d[n, c1, h * stride_h + axis_kernel_h, w *
                                    stride_w + axis_kernel_w, c0],
                              axis=[axis_kernel_h, axis_kernel_w]),
                          name="res")
    res_value = akg.tvm.compute(
        out_shape,
        lambda n, c1, h, w, c0: res[n, c1, h, w, c0] / dividor,
        name="res_value")
    return res_value, attrs
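
A hedged usage sketch: a small input takes the pad2d + reduce path above, while inputs with H and W above 60 fall back to avg_pool_5d_hybrid (Example #9):

data = akg.tvm.placeholder((1, 1, 16, 16, 16), name="data", dtype="float16")
res, attrs = avgpool(data, kernel=(4, 4), stride=(3, 3), strategy='VALID')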